]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ted.py 
   1  from  __future__ 
import  unicode_literals
   6  from  . common 
import  InfoExtractor
  13  class  TEDIE ( InfoExtractor
):   16          (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/   18              (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist   20              ((?P<type_talk>talks)) # We have a simple talk   22              (?P<type_watch>watch)/[^/]+/[^/]+   24          (/lang/(.*?))? # The url may contain the language   25          /(?P<name>[\w-]+) # Here goes the name and then ".html"   29          'url' :  'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' ,   30          'md5' :  'fc94ac279feebbce69f21c0c6ee82810' ,   34              'title' :  'The illusion of consciousness' ,   35              'description' : ( 'Philosopher Dan Dennett makes a compelling '   36                              'argument that not only don \' t we understand our own '   37                              'consciousness, but that half the time our brains are '   38                              'actively fooling us.' ),   39              'uploader' :  'Dan Dennett' ,   44          'url' :  'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms' ,   45          'md5' :  '226f4fb9c62380d11b7995efa4c87994' ,   47              'id' :  'vishal-sikka-the-beauty-and-power-of-algorithms' ,   49              'title' :  'Vishal Sikka: The beauty and power of algorithms' ,   50              'thumbnail' :  're:^https?://.+\.jpg' ,   51              'description' :  'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.' ,   54          'url' :  'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best' ,   58              'title' :  'Be passionate. Be courageous. Be your best.' ,   59              'uploader' :  'Gabby Giffords and Mark Kelly' ,   60              'description' :  'md5:5174aed4d0f16021b704120360f72b92' ,   64          'url' :  'http://www.ted.com/playlists/who_are_the_hackers' ,   67              'title' :  'Who are the hackers?' ,   69          'playlist_mincount' :  6 ,   71          # contains a youtube video   72          'url' :  'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything' ,   73          'add_ie' : [ 'Youtube' ],   77              'title' :  'Douglas Adams: Parrots the Universe and Everything' ,   78              'description' :  'md5:01ad1e199c49ac640cb1196c0e9016af' ,   79              'uploader' :  'University of California Television (UCTV)' ,   80              'uploader_id' :  'UCtelevision' ,   81              'upload_date' :  '20080522' ,   84              'skip_download' :  True ,   88          'url' :  'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond' ,   89          'add_ie' : [ 'Youtube' ],   93              'title' :  'The hidden power of siblings: Jeff Kluger at TEDxAsheville' ,   94              'description' :  'md5:3d7a4f50d95ca5dd67104e2a20f43fe1' ,   95              'uploader' :  'TEDx Talks' ,   96              'uploader_id' :  'TEDxTalks' ,   97              'upload_date' :  '20111216' ,  100              'skip_download' :  True ,  105          'low' : { 'preference' :  1 ,  'width' :  320 ,  'height' :  180 },  106          'medium' : { 'preference' :  2 ,  'width' :  512 ,  'height' :  288 },  107          'high' : { 'preference' :  3 ,  'width' :  854 ,  'height' :  480 },  110      def  _extract_info ( self
,  webpage
):  111          info_json 
=  self
._ search
_ regex
( r
'q\("\w+.init",({.+})\)</script>' ,  112                                         webpage
,  'info json' )  113          return  json
. loads ( info_json
)  115      def  _real_extract ( self
,  url
):  116          m 
=  re
. match ( self
._ VALID
_U RL
,  url
,  re
. VERBOSE
)  117          if  m
. group ( 'type' ). startswith ( 'embed' ):  118              desktop_url 
=  m
. group ( 'proto' ) +  'www'  +  m
. group ( 'urlmain' )  119              return  self
. url_result ( desktop_url
,  'TED' )  120          name 
=  m
. group ( 'name' )  121          if  m
. group ( 'type_talk' ):  122              return  self
._ talk
_ info
( url
,  name
)  123          elif  m
. group ( 'type_watch' ):  124              return  self
._ watch
_ info
( url
,  name
)  126              return  self
._ playlist
_ videos
_ info
( url
,  name
)  128      def  _playlist_videos_info ( self
,  url
,  name
):  129          '''Returns the videos of the playlist'''  131          webpage 
=  self
._ download
_ webpage
( url
,  name
,  132                                           'Downloading playlist webpage' )  133          info 
=  self
._ extract
_ info
( webpage
)  134          playlist_info 
=  info
[ 'playlist' ]  137              self
. url_result ( 'http://www.ted.com/talks/'  +  talk
[ 'slug' ],  self
. ie_key ())  138              for  talk 
in  info
[ 'talks' ]  140          return  self
. playlist_result (  142              playlist_id
= compat_str ( playlist_info
[ 'id' ]),  143              playlist_title
= playlist_info
[ 'title' ])  145      def  _talk_info ( self
,  url
,  video_name
):  146          webpage 
=  self
._ download
_ webpage
( url
,  video_name
)  147          self
. report_extraction ( video_name
)  149          talk_info 
=  self
._ extract
_ info
( webpage
)[ 'talks' ][ 0 ]  151          external 
=  talk_info
. get ( 'external' )  153              service 
=  external
[ 'service' ]  154              self
. to_screen ( 'Found video from  %s '  %  service
)  156              if  service
. lower () ==  'youtube' :  157                  ext_url 
=  external
. get ( 'code' )  160                  'url' :  ext_url 
or  external
[ 'uri' ],  165              'format_id' :  format_id
,  167          }  for  ( format_id
,  format_url
)  in  talk_info
[ 'nativeDownloads' ]. items ()  if  format_url 
is not None ]  170                  finfo 
=  self
._ NATIVE
_ FORMATS
. get ( f
[ 'format_id' ])  176                  'format_id' :  f
[ 'name' ],  177                  'url' :  talk_info
[ 'streamer' ],  178                  'play_path' :  f
[ 'file' ],  181                  'height' :  f
[ 'height' ],  183              }  for  f 
in  talk_info
[ 'resources' ][ 'rtmp' ]]  184          self
._ sort
_ formats
( formats
)  186          video_id 
=  compat_str ( talk_info
[ 'id' ])  188          thumbnail 
=  talk_info
[ 'thumb' ]  189          if not  thumbnail
. startswith ( 'http' ):  190              thumbnail 
=  'http://'  +  thumbnail
 193              'title' :  talk_info
[ 'title' ]. strip (),  194              'uploader' :  talk_info
[ 'speaker' ],  195              'thumbnail' :  thumbnail
,  196              'description' :  self
._ og
_ search
_ description
( webpage
),  197              'subtitles' :  self
._ get
_ subtitles
( video_id
,  talk_info
),  199              'duration' :  talk_info
. get ( 'duration' ),  202      def  _get_subtitles ( self
,  video_id
,  talk_info
):  203          languages 
= [ lang
[ 'languageCode' ]  for  lang 
in  talk_info
. get ( 'languages' , [])]  209                          'url' :  'http://www.ted.com/talks/subtitles/id/ %s /lang/ %s /format/ %s '  % ( video_id
,  l
,  ext
),  212                      for  ext 
in  [ 'ted' ,  'srt' ]  218      def  _watch_info ( self
,  url
,  name
):  219          webpage 
=  self
._ download
_ webpage
( url
,  name
)  221          config_json 
=  self
._ html
_ search
_ regex
(  222              r
'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>' ,  224          config 
=  json
. loads ( config_json
)[ 'config' ]  225          video_url 
=  config
[ 'video' ][ 'url' ]  226          thumbnail 
=  config
. get ( 'image' , {}). get ( 'url' )  228          title 
=  self
._ html
_ search
_ regex
(  229              r
"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>" ,  webpage
,  'title' )  230          description 
=  self
._ html
_ search
_ regex
(  232                  r
'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>' ,  233                  r
'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>' ,  235              webpage
,  'description' ,  fatal
= False )  241              'thumbnail' :  thumbnail
,  242              'description' :  description
,