]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ted.py
   1 from __future__ 
import unicode_literals
 
   6 from .common 
import InfoExtractor
 
   8 from ..compat 
import compat_str
 
  15 class TEDIE(InfoExtractor
): 
  19         (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ 
  21             (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist 
  23             ((?P<type_talk>talks)) # We have a simple talk 
  25             (?P<type_watch>watch)/[^/]+/[^/]+ 
  27         (/lang/(.*?))? # The url may contain the language 
  28         /(?P<name>[\w-]+) # Here goes the name and then ".html" 
  32         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', 
  33         'md5': '0de43ac406aa3e4ea74b66c9c7789b13', 
  37             'title': 'The illusion of consciousness', 
  38             'description': ('Philosopher Dan Dennett makes a compelling ' 
  39                             'argument that not only don\'t we understand our own ' 
  40                             'consciousness, but that half the time our brains are ' 
  41                             'actively fooling us.'), 
  42             'uploader': 'Dan Dennett', 
  47         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms', 
  48         'md5': 'b899ac15e345fb39534d913f7606082b', 
  52             'title': 'Vishal Sikka: The beauty and power of algorithms', 
  53             'thumbnail': r
're:^https?://.+\.jpg', 
  54             'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4', 
  55             'upload_date': '20140122', 
  56             'uploader_id': 'TEDInstitute', 
  57             'uploader': 'TED Institute', 
  59         'add_ie': ['Youtube'], 
  61         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best', 
  62         'md5': '71b3ab2f4233012dce09d515c9c39ce2', 
  66             'title': 'Be passionate. Be courageous. Be your best.', 
  67             'uploader': 'Gabby Giffords and Mark Kelly', 
  68             'description': 'md5:5174aed4d0f16021b704120360f72b92', 
  72         'url': 'http://www.ted.com/playlists/who_are_the_hackers', 
  75             'title': 'Who are the hackers?', 
  77         'playlist_mincount': 6, 
  79         # contains a youtube video 
  80         'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything', 
  81         'add_ie': ['Youtube'], 
  85             'title': 'Douglas Adams: Parrots the Universe and Everything', 
  86             'description': 'md5:01ad1e199c49ac640cb1196c0e9016af', 
  87             'uploader': 'University of California Television (UCTV)', 
  88             'uploader_id': 'UCtelevision', 
  89             'upload_date': '20080522', 
  92             'skip_download': True, 
  96         'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond', 
  97         'add_ie': ['Youtube'], 
 101             'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville', 
 102             'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1', 
 103             'uploader': 'TEDx Talks', 
 104             'uploader_id': 'TEDxTalks', 
 105             'upload_date': '20111216', 
 108             'skip_download': True, 
 113         'low': {'width': 320, 'height': 180}, 
 114         'medium': {'width': 512, 'height': 288}, 
 115         'high': {'width': 854, 'height': 480}, 
 118     def _extract_info(self
, webpage
): 
 119         info_json 
= self
._search
_regex
( 
 120             r
'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>', 
 121             webpage
, 'info json') 
 122         return json
.loads(info_json
) 
 124     def _real_extract(self
, url
): 
 125         m 
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
) 
 126         if m
.group('type').startswith('embed'): 
 127             desktop_url 
= m
.group('proto') + 'www' + m
.group('urlmain') 
 128             return self
.url_result(desktop_url
, 'TED') 
 129         name 
= m
.group('name') 
 130         if m
.group('type_talk'): 
 131             return self
._talk
_info
(url
, name
) 
 132         elif m
.group('type_watch'): 
 133             return self
._watch
_info
(url
, name
) 
 135             return self
._playlist
_videos
_info
(url
, name
) 
 137     def _playlist_videos_info(self
, url
, name
): 
 138         '''Returns the videos of the playlist''' 
 140         webpage 
= self
._download
_webpage
(url
, name
, 
 141                                          'Downloading playlist webpage') 
 142         info 
= self
._extract
_info
(webpage
) 
 144         playlist_info 
= try_get( 
 145             info
, lambda x
: x
['__INITIAL_DATA__']['playlist'], 
 146             dict) or info
['playlist'] 
 149             self
.url_result('http://www.ted.com/talks/' + talk
['slug'], self
.ie_key()) 
 151                 info
, lambda x
: x
['__INITIAL_DATA__']['talks'], 
 152                 dict) or info
['talks'] 
 154         return self
.playlist_result( 
 156             playlist_id
=compat_str(playlist_info
['id']), 
 157             playlist_title
=playlist_info
['title']) 
 159     def _talk_info(self
, url
, video_name
): 
 160         webpage 
= self
._download
_webpage
(url
, video_name
) 
 162         info 
= self
._extract
_info
(webpage
) 
 165             info
, lambda x
: x
['__INITIAL_DATA__']['talks'][0], 
 166             dict) or info
['talks'][0] 
 168         title 
= talk_info
['title'].strip() 
 170         external 
= talk_info
.get('external') 
 172             service 
= external
['service'] 
 173             self
.to_screen('Found video from %s' % service
) 
 175             if service
.lower() == 'youtube': 
 176                 ext_url 
= external
.get('code') 
 179                 'url': ext_url 
or external
['uri'], 
 182         native_downloads 
= try_get( 
 183             talk_info
, lambda x
: x
['downloads']['nativeDownloads'], 
 184             dict) or talk_info
['nativeDownloads'] 
 188             'format_id': format_id
, 
 190         } for (format_id
, format_url
) in native_downloads
.items() if format_url 
is not None] 
 193                 finfo 
= self
._NATIVE
_FORMATS
.get(f
['format_id']) 
 197         player_talk 
= talk_info
['player_talks'][0] 
 199         resources_ 
= player_talk
.get('resources') or talk_info
.get('resources') 
 202         for format_id
, resources 
in resources_
.items(): 
 203             if format_id 
== 'h264': 
 204                 for resource 
in resources
: 
 205                     h264_url 
= resource
.get('file') 
 208                     bitrate 
= int_or_none(resource
.get('bitrate')) 
 211                         'format_id': '%s-%sk' % (format_id
, bitrate
), 
 214                     if re
.search(r
'\d+k', h264_url
): 
 216             elif format_id 
== 'rtmp': 
 217                 streamer 
= talk_info
.get('streamer') 
 220                 for resource 
in resources
: 
 222                         'format_id': '%s-%s' % (format_id
, resource
.get('name')), 
 224                         'play_path': resource
['file'], 
 226                         'width': int_or_none(resource
.get('width')), 
 227                         'height': int_or_none(resource
.get('height')), 
 228                         'tbr': int_or_none(resource
.get('bitrate')), 
 230             elif format_id 
== 'hls': 
 231                 formats
.extend(self
._extract
_m
3u8_formats
( 
 232                     resources
.get('stream'), video_name
, 'mp4', m3u8_id
=format_id
, fatal
=False)) 
 234         m3u8_formats 
= list(filter( 
 235             lambda f
: f
.get('protocol') == 'm3u8' and f
.get('vcodec') != 'none', 
 238             for m3u8_format 
in m3u8_formats
: 
 239                 bitrate 
= self
._search
_regex
(r
'(\d+k)', m3u8_format
['url'], 'bitrate', default
=None) 
 242                 f 
= m3u8_format
.copy() 
 244                     'url': re
.sub(r
'\d+k', bitrate
, http_url
), 
 245                     'format_id': m3u8_format
['format_id'].replace('hls', 'http'), 
 250         audio_download 
= talk_info
.get('audioDownload') 
 253                 'url': audio_download
, 
 254                 'format_id': 'audio', 
 258         self
._sort
_formats
(formats
) 
 260         video_id 
= compat_str(talk_info
['id']) 
 265             'uploader': player_talk
.get('speaker') or talk_info
.get('speaker'), 
 266             'thumbnail': player_talk
.get('thumb') or talk_info
.get('thumb'), 
 267             'description': self
._og
_search
_description
(webpage
), 
 268             'subtitles': self
._get
_subtitles
(video_id
, talk_info
), 
 270             'duration': talk_info
.get('duration'), 
 273     def _get_subtitles(self
, video_id
, talk_info
): 
 275         for language 
in try_get( 
 277                 (lambda x
: x
['downloads']['languages'], 
 278                  lambda x
: x
['languages']), list): 
 279             lang_code 
= language
.get('languageCode') or language
.get('ianaCode') 
 282             sub_lang_list
[lang_code
] = [ 
 284                     'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id
, lang_code
, ext
), 
 287                 for ext 
in ['ted', 'srt'] 
 291     def _watch_info(self
, url
, name
): 
 292         webpage 
= self
._download
_webpage
(url
, name
) 
 294         config_json 
= self
._html
_search
_regex
( 
 295             r
'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>', 
 296             webpage
, 'config', default
=None) 
 298             embed_url 
= self
._search
_regex
( 
 299                 r
"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage
, 'embed url') 
 300             return self
.url_result(self
._proto
_relative
_url
(embed_url
)) 
 301         config 
= json
.loads(config_json
)['config'] 
 302         video_url 
= config
['video']['url'] 
 303         thumbnail 
= config
.get('image', {}).get('url') 
 305         title 
= self
._html
_search
_regex
( 
 306             r
"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage
, 'title') 
 307         description 
= self
._html
_search
_regex
( 
 309                 r
'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>', 
 310                 r
'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>', 
 312             webpage
, 'description', fatal
=False) 
 318             'thumbnail': thumbnail
, 
 319             'description': description
,