]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ted.py
   1 from __future__ 
import unicode_literals
 
   6 from .common 
import InfoExtractor
 
  21 class TEDIE(InfoExtractor
): 
  25         (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ 
  27             (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist 
  29             ((?P<type_talk>talks)) # We have a simple talk 
  31             (?P<type_watch>watch)/[^/]+/[^/]+ 
  33         (/lang/(.*?))? # The url may contain the language 
  34         /(?P<name>[\w-]+) # Here goes the name and then ".html" 
  38         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', 
  39         'md5': 'b0ce2b05ca215042124fbc9e3886493a', 
  43             'title': 'The illusion of consciousness', 
  44             'description': ('Philosopher Dan Dennett makes a compelling ' 
  45                             'argument that not only don\'t we understand our own ' 
  46                             'consciousness, but that half the time our brains are ' 
  47                             'actively fooling us.'), 
  48             'uploader': 'Dan Dennett', 
  56             'skip_download': True, 
  59         # missing HTTP bitrates 
  60         'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms', 
  64             'title': 'The beauty and power of algorithms', 
  65             'thumbnail': r
're:^https?://.+\.jpg', 
  66             'description': 'md5:734e352710fb00d840ab87ae31aaf688', 
  67             'uploader': 'Vishal Sikka', 
  70             'skip_download': True, 
  73         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best', 
  74         'md5': 'e6b9617c01a7970ceac8bb2c92c346c0', 
  78             'title': 'Be passionate. Be courageous. Be your best.', 
  79             'uploader': 'Gabby Giffords and Mark Kelly', 
  80             'description': 'md5:5174aed4d0f16021b704120360f72b92', 
  84             'skip_download': True, 
  87         'url': 'http://www.ted.com/playlists/who_are_the_hackers', 
  90             'title': 'Who are the hackers?', 
  91             'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a' 
  93         'playlist_mincount': 6, 
  95         # contains a youtube video 
  96         'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything', 
  97         'add_ie': ['Youtube'], 
 101             'title': 'Douglas Adams: Parrots the Universe and Everything', 
 102             'description': 'md5:01ad1e199c49ac640cb1196c0e9016af', 
 103             'uploader': 'University of California Television (UCTV)', 
 104             'uploader_id': 'UCtelevision', 
 105             'upload_date': '20080522', 
 108             'skip_download': True, 
 112         'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth', 
 116             'title': 'The orchestra in my mouth', 
 117             'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a', 
 118             'uploader': 'Tom Thum', 
 120             'comment_count': int, 
 124             'skip_download': True, 
 129         'low': {'width': 320, 'height': 180}, 
 130         'medium': {'width': 512, 'height': 288}, 
 131         'high': {'width': 854, 'height': 480}, 
 134     def _extract_info(self
, webpage
): 
 135         info_json 
= self
._search
_regex
( 
 136             r
'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>', 
 137             webpage
, 'info json') 
 138         return json
.loads(info_json
) 
 140     def _real_extract(self
, url
): 
 141         m 
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
) 
 142         if m
.group('type').startswith('embed'): 
 143             desktop_url 
= m
.group('proto') + 'www' + m
.group('urlmain') 
 144             return self
.url_result(desktop_url
, 'TED') 
 145         name 
= m
.group('name') 
 146         if m
.group('type_talk'): 
 147             return self
._talk
_info
(url
, name
) 
 148         elif m
.group('type_watch'): 
 149             return self
._watch
_info
(url
, name
) 
 151             return self
._playlist
_videos
_info
(url
, name
) 
 153     def _playlist_videos_info(self
, url
, name
): 
 154         '''Returns the videos of the playlist''' 
 156         webpage 
= self
._download
_webpage
(url
, name
, 
 157                                          'Downloading playlist webpage') 
 159         playlist_entries 
= [] 
 160         for entry 
in re
.findall(r
'(?s)<[^>]+data-ga-context=["\']playlist
["\'][^>]*>', webpage): 
 161             attrs = extract_attributes(entry) 
 162             entry_url = compat_urlparse.urljoin(url, attrs['href']) 
 163             playlist_entries.append(self.url_result(entry_url, self.ie_key())) 
 165         final_url = self._og_search_url(webpage, fatal=False) 
 167             re.match(self._VALID_URL, final_url).group('playlist_id') 
 168             if final_url else None) 
 170         return self.playlist_result( 
 171             playlist_entries, playlist_id=playlist_id, 
 172             playlist_title=self._og_search_title(webpage, fatal=False), 
 173             playlist_description=self._og_search_description(webpage)) 
 175     def _talk_info(self, url, video_name): 
 176         webpage = self._download_webpage(url, video_name) 
 178         info = self._extract_info(webpage) 
 180         data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info 
 181         talk_info = data['talks'][0] 
 183         title = talk_info['title'].strip() 
 185         downloads = talk_info.get('downloads') or {} 
 186         native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {} 
 190             'format_id': format_id, 
 191         } for (format_id, format_url) in native_downloads.items() if format_url is not None] 
 193         subtitled_downloads = downloads.get('subtitledDownloads') or {} 
 194         for lang, subtitled_download in subtitled_downloads.items(): 
 195             for q in self._NATIVE_FORMATS: 
 196                 q_url = subtitled_download.get(q) 
 201                     'format_id': '%s-%s' % (q, lang), 
 207                 finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0]) 
 211         player_talk = talk_info['player_talks'][0] 
 213         external = player_talk.get('external') 
 214         if isinstance(external, dict): 
 215             service = external.get('service') 
 216             if isinstance(service, compat_str): 
 218                 if service.lower() == 'youtube': 
 219                     ext_url = external.get('code') 
 221                 return self.url_result(ext_url or external['uri']) 
 223         resources_ = player_talk.get('resources') or talk_info.get('resources') 
 226         for format_id, resources in resources_.items(): 
 227             if format_id == 'hls': 
 228                 if not isinstance(resources, dict): 
 230                 stream_url = url_or_none(resources.get('stream')) 
 233                 formats.extend(self._extract_m3u8_formats( 
 234                     stream_url, video_name, 'mp4', m3u8_id=format_id, 
 237                 if not isinstance(resources, list): 
 239                 if format_id == 'h264': 
 240                     for resource in resources: 
 241                         h264_url = resource.get('file') 
 244                         bitrate = int_or_none(resource.get('bitrate')) 
 247                             'format_id': '%s-%sk' % (format_id, bitrate), 
 250                         if re.search(r'\d+k', h264_url): 
 252                 elif format_id == 'rtmp': 
 253                     streamer = talk_info.get('streamer') 
 256                     for resource in resources: 
 258                             'format_id': '%s-%s' % (format_id, resource.get('name')), 
 260                             'play_path': resource['file'], 
 262                             'width': int_or_none(resource.get('width')), 
 263                             'height': int_or_none(resource.get('height')), 
 264                             'tbr': int_or_none(resource.get('bitrate')), 
 267         m3u8_formats = list(filter( 
 268             lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', 
 271             for m3u8_format in m3u8_formats: 
 272                 bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) 
 275                 bitrate_url = re.sub(r'\d+k', bitrate, http_url) 
 276                 if not self._is_valid_url( 
 277                         bitrate_url, video_name, '%s bitrate' % bitrate): 
 279                 f = m3u8_format.copy() 
 282                     'format_id': m3u8_format['format_id'].replace('hls', 'http'), 
 285                 if f.get('acodec') == 'none': 
 289         audio_download = talk_info.get('audioDownload') 
 292                 'url': audio_download, 
 293                 'format_id': 'audio', 
 297         self._sort_formats(formats) 
 299         video_id = compat_str(talk_info['id']) 
 304             'uploader': player_talk.get('speaker') or talk_info.get('speaker'), 
 305             'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'), 
 306             'description': self._og_search_description(webpage), 
 307             'subtitles': self._get_subtitles(video_id, talk_info), 
 309             'duration': float_or_none(talk_info.get('duration')), 
 310             'view_count': int_or_none(data.get('viewed_count')), 
 311             'comment_count': int_or_none( 
 312                 try_get(data, lambda x: x['comments']['count'])), 
 313             'tags': try_get(talk_info, lambda x: x['tags'], list), 
 316     def _get_subtitles(self, video_id, talk_info): 
 318         for language in try_get( 
 320                 (lambda x: x['downloads']['languages'], 
 321                  lambda x: x['languages']), list): 
 322             lang_code = language.get('languageCode') or language.get('ianaCode') 
 325             sub_lang_list[lang_code] = [ 
 327                     'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext), 
 330                 for ext in ['ted', 'srt'] 
 334     def _watch_info(self, url, name): 
 335         webpage = self._download_webpage(url, name) 
 337         config_json = self._html_search_regex( 
 338             r'"pages\
.jwplayer
"\s*,\s*({.+?})\s*\)\s*</script>', 
 339             webpage, 'config', default=None) 
 341             embed_url = self._search_regex( 
 342                 r"<iframe
[^
>]+class='pages-video-embed__video__object'[^
>]+src
='([^']+)'", webpage, 'embed url
') 
 343             return self.url_result(self._proto_relative_url(embed_url)) 
 344         config = json.loads(config_json)['config
'] 
 345         video_url = config['video
']['url
'] 
 346         thumbnail = config.get('image
', {}).get('url
') 
 348         title = self._html_search_regex( 
 349             r"(?s)<h1(?:\s+class='[^
']+')?
>(.+?
)</h1
>", webpage, 'title') 
 350         description = self._html_search_regex( 
 352                 r'(?s)<h4 class="[^
"]+" id="h3--about-this-talk">.*?
</h4
>(.*?
)</div
>', 
 353                 r'(?s
)<p
><strong
>About this talk
:</strong
>\s
+(.*?
)</p
>', 
 355             webpage, 'description
', fatal=False) 
 361             'thumbnail
': thumbnail, 
 362             'description
': description,