]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ted.py
   1 from __future__ 
import unicode_literals
 
   6 from .common 
import InfoExtractor
 
   8 from ..compat 
import compat_str
 
  17 class TEDIE(InfoExtractor
): 
  21         (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ 
  23             (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist 
  25             ((?P<type_talk>talks)) # We have a simple talk 
  27             (?P<type_watch>watch)/[^/]+/[^/]+ 
  29         (/lang/(.*?))? # The url may contain the language 
  30         /(?P<name>[\w-]+) # Here goes the name and then ".html" 
  34         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', 
  35         'md5': 'b0ce2b05ca215042124fbc9e3886493a', 
  39             'title': 'The illusion of consciousness', 
  40             'description': ('Philosopher Dan Dennett makes a compelling ' 
  41                             'argument that not only don\'t we understand our own ' 
  42                             'consciousness, but that half the time our brains are ' 
  43                             'actively fooling us.'), 
  44             'uploader': 'Dan Dennett', 
  52             'skip_download': True, 
  55         # missing HTTP bitrates 
  56         'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms', 
  60             'title': 'The beauty and power of algorithms', 
  61             'thumbnail': r
're:^https?://.+\.jpg', 
  62             'description': 'md5:734e352710fb00d840ab87ae31aaf688', 
  63             'uploader': 'Vishal Sikka', 
  66             'skip_download': True, 
  69         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best', 
  70         'md5': 'e6b9617c01a7970ceac8bb2c92c346c0', 
  74             'title': 'Be passionate. Be courageous. Be your best.', 
  75             'uploader': 'Gabby Giffords and Mark Kelly', 
  76             'description': 'md5:5174aed4d0f16021b704120360f72b92', 
  80             'skip_download': True, 
  83         'url': 'http://www.ted.com/playlists/who_are_the_hackers', 
  86             'title': 'Who are the hackers?', 
  88         'playlist_mincount': 6, 
  90         # contains a youtube video 
  91         'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything', 
  92         'add_ie': ['Youtube'], 
  96             'title': 'Douglas Adams: Parrots the Universe and Everything', 
  97             'description': 'md5:01ad1e199c49ac640cb1196c0e9016af', 
  98             'uploader': 'University of California Television (UCTV)', 
  99             'uploader_id': 'UCtelevision', 
 100             'upload_date': '20080522', 
 103             'skip_download': True, 
 107         'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth', 
 111             'title': 'The orchestra in my mouth', 
 112             'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a', 
 113             'uploader': 'Tom Thum', 
 115             'comment_count': int, 
 119             'skip_download': True, 
 124         'low': {'width': 320, 'height': 180}, 
 125         'medium': {'width': 512, 'height': 288}, 
 126         'high': {'width': 854, 'height': 480}, 
 129     def _extract_info(self
, webpage
): 
 130         info_json 
= self
._search
_regex
( 
 131             r
'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>', 
 132             webpage
, 'info json') 
 133         return json
.loads(info_json
) 
 135     def _real_extract(self
, url
): 
 136         m 
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
) 
 137         if m
.group('type').startswith('embed'): 
 138             desktop_url 
= m
.group('proto') + 'www' + m
.group('urlmain') 
 139             return self
.url_result(desktop_url
, 'TED') 
 140         name 
= m
.group('name') 
 141         if m
.group('type_talk'): 
 142             return self
._talk
_info
(url
, name
) 
 143         elif m
.group('type_watch'): 
 144             return self
._watch
_info
(url
, name
) 
 146             return self
._playlist
_videos
_info
(url
, name
) 
 148     def _playlist_videos_info(self
, url
, name
): 
 149         '''Returns the videos of the playlist''' 
 151         webpage 
= self
._download
_webpage
(url
, name
, 
 152                                          'Downloading playlist webpage') 
 153         info 
= self
._extract
_info
(webpage
) 
 155         playlist_info 
= try_get( 
 156             info
, lambda x
: x
['__INITIAL_DATA__']['playlist'], 
 157             dict) or info
['playlist'] 
 160             self
.url_result('http://www.ted.com/talks/' + talk
['slug'], self
.ie_key()) 
 162                 info
, lambda x
: x
['__INITIAL_DATA__']['talks'], 
 163                 dict) or info
['talks'] 
 165         return self
.playlist_result( 
 167             playlist_id
=compat_str(playlist_info
['id']), 
 168             playlist_title
=playlist_info
['title']) 
 170     def _talk_info(self
, url
, video_name
): 
        # Builds the result for a single talk page: formats, subtitles and
        # metadata taken from the page's embedded JSON.
        # NOTE(review): several physical lines of this method are absent
        # from this copy (the embedded line numbers skip), so the comments
        # below describe only what is visible here.
        # Fetch the talk page and decode its embedded init JSON.
 171         webpage 
= self
._download
_webpage
(url
, video_name
) 
 173         info 
= self
._extract
_info
(webpage
) 
        # Prefer the '__INITIAL_DATA__' sub-dict when it is a dict; otherwise
        # work on the top-level JSON object.  The first entry of 'talks' is
        # the talk being extracted.
 175         data 
= try_get(info
, lambda x
: x
['__INITIAL_DATA__'], dict) or info
 
 176         talk_info 
= data
['talks'][0] 
 178         title 
= talk_info
['title'].strip() 
        # Native download URLs keyed by format id, looked up under
        # 'downloads' first and directly on the talk otherwise; entries
        # whose URL is None are dropped.
 180         native_downloads 
= try_get( 
 182             (lambda x
: x
['downloads']['nativeDownloads'], 
 183              lambda x
: x
['nativeDownloads']), 
 188             'format_id': format_id
, 
 190         } for (format_id
, format_url
) in native_downloads
.items() if format_url 
is not None] 
 193                 finfo 
= self
._NATIVE
_FORMATS
.get(f
['format_id']) 
 197         player_talk 
= talk_info
['player_talks'][0] 
        # Externally hosted talk: for the 'youtube' service the 'code' field
        # is used as the URL when set, with 'uri' as the fallback.
 199         external 
= player_talk
.get('external') 
 200         if isinstance(external
, dict): 
 201             service 
= external
.get('service') 
 202             if isinstance(service
, compat_str
): 
 204                 if service
.lower() == 'youtube': 
 205                     ext_url 
= external
.get('code') 
 208                     'url': ext_url 
or external
['uri'], 
        # Player-level resources take precedence over talk-level ones.
 211         resources_ 
= player_talk
.get('resources') or talk_info
.get('resources') 
        # NOTE(review): the guard below admits only dict-valued `resources`,
        # yet the 'h264' and 'rtmp' branches iterate `resources` and call
        # .get()/['file'] on each element -- verify against the lines that
        # are missing from this copy.
 214         for format_id
, resources 
in resources_
.items(): 
 215             if not isinstance(resources
, dict): 
 217             if format_id 
== 'h264': 
 218                 for resource 
in resources
: 
 219                     h264_url 
= resource
.get('file') 
 222                     bitrate 
= int_or_none(resource
.get('bitrate')) 
 225                         'format_id': '%s-%sk' % (format_id
, bitrate
), 
 228                     if re
.search(r
'\d+k', h264_url
): 
 230             elif format_id 
== 'rtmp': 
 231                 streamer 
= talk_info
.get('streamer') 
 234                 for resource 
in resources
: 
 236                         'format_id': '%s-%s' % (format_id
, resource
.get('name')), 
 238                         'play_path': resource
['file'], 
 240                         'width': int_or_none(resource
.get('width')), 
 241                         'height': int_or_none(resource
.get('height')), 
 242                         'tbr': int_or_none(resource
.get('bitrate')), 
 244             elif format_id 
== 'hls': 
 245                 stream_url 
= url_or_none(resources
.get('stream')) 
 248                 formats
.extend(self
._extract
_m
3u8_formats
( 
 249                     stream_url
, video_name
, 'mp4', m3u8_id
=format_id
, 
        # Select the m3u8 entries that carry video.  For each one, its
        # '<digits>k' token is substituted into http_url and the resulting
        # URL is probed with _is_valid_url; the derived entry reuses the HLS
        # format id with 'hls' replaced by 'http'.
 252         m3u8_formats 
= list(filter( 
 253             lambda f
: f
.get('protocol') == 'm3u8' and f
.get('vcodec') != 'none', 
 256             for m3u8_format 
in m3u8_formats
: 
 257                 bitrate 
= self
._search
_regex
(r
'(\d+k)', m3u8_format
['url'], 'bitrate', default
=None) 
 260                 bitrate_url 
= re
.sub(r
'\d+k', bitrate
, http_url
) 
 261                 if not self
._is
_valid
_url
( 
 262                         bitrate_url
, video_name
, '%s bitrate' % bitrate
): 
 264                 f 
= m3u8_format
.copy() 
 267                     'format_id': m3u8_format
['format_id'].replace('hls', 'http'), 
        # Optional audio download, exposed under format_id 'audio'.
 272         audio_download 
= talk_info
.get('audioDownload') 
 275                 'url': audio_download
, 
 276                 'format_id': 'audio', 
 280         self
._sort
_formats
(formats
) 
 282         video_id 
= compat_str(talk_info
['id']) 
 287             'uploader': player_talk
.get('speaker') or talk_info
.get('speaker'), 
 288             'thumbnail': player_talk
.get('thumb') or talk_info
.get('thumb'), 
 289             'description': self
._og
_search
_description
(webpage
), 
 290             'subtitles': self
._get
_subtitles
(video_id
, talk_info
), 
 292             'duration': float_or_none(talk_info
.get('duration')), 
 293             'view_count': int_or_none(data
.get('viewed_count')), 
 294             'comment_count': int_or_none( 
 295                 try_get(data
, lambda x
: x
['comments']['count'])), 
 296             'tags': try_get(talk_info
, lambda x
: x
['tags'], list), 
 299     def _get_subtitles(self
, video_id
, talk_info
): 
 301         for language 
in try_get( 
 303                 (lambda x
: x
['downloads']['languages'], 
 304                  lambda x
: x
['languages']), list): 
 305             lang_code 
= language
.get('languageCode') or language
.get('ianaCode') 
 308             sub_lang_list
[lang_code
] = [ 
 310                     'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id
, lang_code
, ext
), 
 313                 for ext 
in ['ted', 'srt'] 
 317     def _watch_info(self
, url
, name
): 
 318         webpage 
= self
._download
_webpage
(url
, name
) 
 320         config_json 
= self
._html
_search
_regex
( 
 321             r
'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>', 
 322             webpage
, 'config', default
=None) 
 324             embed_url 
= self
._search
_regex
( 
 325                 r
"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage
, 'embed url') 
 326             return self
.url_result(self
._proto
_relative
_url
(embed_url
)) 
 327         config 
= json
.loads(config_json
)['config'] 
 328         video_url 
= config
['video']['url'] 
 329         thumbnail 
= config
.get('image', {}).get('url') 
 331         title 
= self
._html
_search
_regex
( 
 332             r
"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage
, 'title') 
 333         description 
= self
._html
_search
_regex
( 
 335                 r
'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>', 
 336                 r
'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>', 
 338             webpage
, 'description', fatal
=False) 
 344             'thumbnail': thumbnail
, 
 345             'description': description
,