]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ted.py 
   1  from  __future__ 
import  unicode_literals
   6  from  . subtitles 
import  SubtitlesInfoExtractor
  13  class  TEDIE ( SubtitlesInfoExtractor
):   16          (?P<type>www|embed)(?P<urlmain>\.ted\.com/   18              (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist   20              ((?P<type_talk>talks)) # We have a simple talk   22              (?P<type_watch>watch)/[^/]+/[^/]+   24          (/lang/(.*?))? # The url may contain the language   25          /(?P<name>[\w-]+) # Here goes the name and then ".html"   29          'url' :  'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' ,   30          'md5' :  '4ea1dada91e4174b53dac2bb8ace429d' ,   34              'title' :  'The illusion of consciousness' ,   35              'description' : ( 'Philosopher Dan Dennett makes a compelling '   36                  'argument that not only don \' t we understand our own '   37                  'consciousness, but that half the time our brains are '   38                  'actively fooling us.' ),   39              'uploader' :  'Dan Dennett' ,   43          'url' :  'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms' ,   44          'md5' :  '226f4fb9c62380d11b7995efa4c87994' ,   46              'id' :  'vishal-sikka-the-beauty-and-power-of-algorithms' ,   48              'title' :  'Vishal Sikka: The beauty and power of algorithms' ,   49              'thumbnail' :  're:^https?://.+\.jpg' ,   50              'description' :  'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.' ,   53          'url' :  'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best' ,   54          'md5' :  '49144e345a899b8cb34d315f3b9cfeeb' ,   58              'title' :  'Be passionate. Be courageous. Be your best.' ,   59              'uploader' :  'Gabby Giffords and Mark Kelly' ,   60              'description' :  'md5:5174aed4d0f16021b704120360f72b92' ,   65          'low' : { 'preference' :  1 ,  'width' :  320 ,  'height' :  180 },   66          'medium' : { 'preference' :  2 ,  'width' :  512 ,  'height' :  288 },   67          'high' : { 'preference' :  3 ,  'width' :  854 ,  'height' :  480 },   70      def  _extract_info ( self
,  webpage
):   71          info_json 
=  self
._ search
_ regex
( r
'q\("\w+.init",({.+})\)</script>' ,   73          return  json
. loads ( info_json
)   75      def  _real_extract ( self
,  url
):   76          m 
=  re
. match ( self
._ VALID
_U RL
,  url
,  re
. VERBOSE
)   77          if  m
. group ( 'type' ) ==  'embed' :   78              desktop_url 
=  m
. group ( 'proto' ) +  'www'  +  m
. group ( 'urlmain' )   79              return  self
. url_result ( desktop_url
,  'TED' )   80          name 
=  m
. group ( 'name' )   81          if  m
. group ( 'type_talk' ):   82              return  self
._ talk
_ info
( url
,  name
)   83          elif  m
. group ( 'type_watch' ):   84              return  self
._ watch
_ info
( url
,  name
)   86              return  self
._ playlist
_ videos
_ info
( url
,  name
)   88      def  _playlist_videos_info ( self
,  url
,  name
):   89          '''Returns the videos of the playlist'''   91          webpage 
=  self
._ download
_ webpage
( url
,  name
,   92              'Downloading playlist webpage' )   93          info 
=  self
._ extract
_ info
( webpage
)   94          playlist_info 
=  info
[ 'playlist' ]   97              self
. url_result ( 'http://www.ted.com/talks/'  +  talk
[ 'slug' ],  self
. ie_key ())   98              for  talk 
in  info
[ 'talks' ]  100          return  self
. playlist_result (  102              playlist_id
= compat_str ( playlist_info
[ 'id' ]),  103              playlist_title
= playlist_info
[ 'title' ])  105      def  _talk_info ( self
,  url
,  video_name
):  106          webpage 
=  self
._ download
_ webpage
( url
,  video_name
)  107          self
. report_extraction ( video_name
)  109          talk_info 
=  self
._ extract
_ info
( webpage
)[ 'talks' ][ 0 ]  113              'format_id' :  format_id
,  115          }  for  ( format_id
,  format_url
)  in  talk_info
[ 'nativeDownloads' ]. items ()  if  format_url 
is not None ]  118                  finfo 
=  self
._ NATIVE
_ FORMATS
. get ( f
[ 'format_id' ])  124                  'format_id' :  f
[ 'name' ],  125                  'url' :  talk_info
[ 'streamer' ],  126                  'play_path' :  f
[ 'file' ],  129                  'height' :  f
[ 'height' ],  131              }  for  f 
in  talk_info
[ 'resources' ][ 'rtmp' ]]  132          self
._ sort
_ formats
( formats
)  134          video_id 
=  compat_str ( talk_info
[ 'id' ])  136          video_subtitles 
=  self
. extract_subtitles ( video_id
,  talk_info
)  137          if  self
._ downloader
. params
. get ( 'listsubtitles' ,  False ):  138              self
._l ist
_ available
_ subtitles
( video_id
,  talk_info
)  141          thumbnail 
=  talk_info
[ 'thumb' ]  142          if not  thumbnail
. startswith ( 'http' ):  143              thumbnail 
=  'http://'  +  thumbnail
 146              'title' :  talk_info
[ 'title' ],  147              'uploader' :  talk_info
[ 'speaker' ],  148              'thumbnail' :  thumbnail
,  149              'description' :  self
._ og
_ search
_ description
( webpage
),  150              'subtitles' :  video_subtitles
,  154      def  _get_available_subtitles ( self
,  video_id
,  talk_info
):  155          languages 
= [ lang
[ 'languageCode' ]  for  lang 
in  talk_info
. get ( 'languages' , [])]  159                  url 
=  'http://www.ted.com/talks/subtitles/id/ %s /lang/ %s /format/srt'  % ( video_id
,  l
)  160                  sub_lang_list
[ l
] =  url
 163              self
._ downloader
. report_warning ( 'video doesn \' t have subtitles' )  166      def  _watch_info ( self
,  url
,  name
):  167          webpage 
=  self
._ download
_ webpage
( url
,  name
)  169          config_json 
=  self
._ html
_ search
_ regex
(  170              r
"data-config='([^']+)" ,  webpage
,  'config' )  171          config 
=  json
. loads ( config_json
)  172          video_url 
=  config
[ 'video' ][ 'url' ]  173          thumbnail 
=  config
. get ( 'image' , {}). get ( 'url' )  175          title 
=  self
._ html
_ search
_ regex
(  176              r
"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>" ,  webpage
,  'title' )  177          description 
=  self
._ html
_ search
_ regex
(  179                  r
'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>' ,  180                  r
'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>' ,  182              webpage
,  'description' ,  fatal
= False )  188              'thumbnail' :  thumbnail
,  189              'description' :  description
,