]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ted.py
   1 from __future__ 
import unicode_literals
 
   6 from .common 
import InfoExtractor
 
   8 from ..compat 
import compat_str
 
   9 from ..utils 
import int_or_none
 
  12 class TEDIE(InfoExtractor
): 
  16         (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ 
  18             (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist 
  20             ((?P<type_talk>talks)) # We have a simple talk 
  22             (?P<type_watch>watch)/[^/]+/[^/]+ 
  24         (/lang/(.*?))? # The url may contain the language 
  25         /(?P<name>[\w-]+) # Here goes the name and then ".html" 
  29         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', 
  30         'md5': '0de43ac406aa3e4ea74b66c9c7789b13', 
  34             'title': 'The illusion of consciousness', 
  35             'description': ('Philosopher Dan Dennett makes a compelling ' 
  36                             'argument that not only don\'t we understand our own ' 
  37                             'consciousness, but that half the time our brains are ' 
  38                             'actively fooling us.'), 
  39             'uploader': 'Dan Dennett', 
  44         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms', 
  45         'md5': 'b899ac15e345fb39534d913f7606082b', 
  49             'title': 'Vishal Sikka: The beauty and power of algorithms', 
  50             'thumbnail': 're:^https?://.+\.jpg', 
  51             'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4', 
  52             'upload_date': '20140122', 
  53             'uploader_id': 'TEDInstitute', 
  54             'uploader': 'TED Institute', 
  56         'add_ie': ['Youtube'], 
  58         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best', 
  59         'md5': '71b3ab2f4233012dce09d515c9c39ce2', 
  63             'title': 'Be passionate. Be courageous. Be your best.', 
  64             'uploader': 'Gabby Giffords and Mark Kelly', 
  65             'description': 'md5:5174aed4d0f16021b704120360f72b92', 
  69         'url': 'http://www.ted.com/playlists/who_are_the_hackers', 
  72             'title': 'Who are the hackers?', 
  74         'playlist_mincount': 6, 
  76         # contains a youtube video 
  77         'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything', 
  78         'add_ie': ['Youtube'], 
  82             'title': 'Douglas Adams: Parrots the Universe and Everything', 
  83             'description': 'md5:01ad1e199c49ac640cb1196c0e9016af', 
  84             'uploader': 'University of California Television (UCTV)', 
  85             'uploader_id': 'UCtelevision', 
  86             'upload_date': '20080522', 
  89             'skip_download': True, 
  93         'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond', 
  94         'add_ie': ['Youtube'], 
  98             'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville', 
  99             'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1', 
 100             'uploader': 'TEDx Talks', 
 101             'uploader_id': 'TEDxTalks', 
 102             'upload_date': '20111216', 
 105             'skip_download': True, 
 110         'low': {'width': 320, 'height': 180}, 
 111         'medium': {'width': 512, 'height': 288}, 
 112         'high': {'width': 854, 'height': 480}, 
 115     def _extract_info(self
, webpage
): 
 116         info_json 
= self
._search
_regex
(r
'q\("\w+.init",({.+})\)</script>', 
 117                                        webpage
, 'info json') 
 118         return json
.loads(info_json
) 
 120     def _real_extract(self
, url
): 
 121         m 
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
) 
 122         if m
.group('type').startswith('embed'): 
 123             desktop_url 
= m
.group('proto') + 'www' + m
.group('urlmain') 
 124             return self
.url_result(desktop_url
, 'TED') 
 125         name 
= m
.group('name') 
 126         if m
.group('type_talk'): 
 127             return self
._talk
_info
(url
, name
) 
 128         elif m
.group('type_watch'): 
 129             return self
._watch
_info
(url
, name
) 
 131             return self
._playlist
_videos
_info
(url
, name
) 
 133     def _playlist_videos_info(self
, url
, name
): 
 134         '''Returns the videos of the playlist''' 
 136         webpage 
= self
._download
_webpage
(url
, name
, 
 137                                          'Downloading playlist webpage') 
 138         info 
= self
._extract
_info
(webpage
) 
 139         playlist_info 
= info
['playlist'] 
 142             self
.url_result('http://www.ted.com/talks/' + talk
['slug'], self
.ie_key()) 
 143             for talk 
in info
['talks'] 
 145         return self
.playlist_result( 
 147             playlist_id
=compat_str(playlist_info
['id']), 
 148             playlist_title
=playlist_info
['title']) 
 150     def _talk_info(self
, url
, video_name
): 
 151         webpage 
= self
._download
_webpage
(url
, video_name
) 
 152         self
.report_extraction(video_name
) 
 154         talk_info 
= self
._extract
_info
(webpage
)['talks'][0] 
 156         external 
= talk_info
.get('external') 
 158             service 
= external
['service'] 
 159             self
.to_screen('Found video from %s' % service
) 
 161             if service
.lower() == 'youtube': 
 162                 ext_url 
= external
.get('code') 
 165                 'url': ext_url 
or external
['uri'], 
 170             'format_id': format_id
, 
 172         } for (format_id
, format_url
) in talk_info
['nativeDownloads'].items() if format_url 
is not None] 
 175                 finfo 
= self
._NATIVE
_FORMATS
.get(f
['format_id']) 
 180         for format_id
, resources 
in talk_info
['resources'].items(): 
 181             if format_id 
== 'h264': 
 182                 for resource 
in resources
: 
 183                     h264_url 
= resource
.get('file') 
 186                     bitrate 
= int_or_none(resource
.get('bitrate')) 
 189                         'format_id': '%s-%sk' % (format_id
, bitrate
), 
 192                     if re
.search('\d+k', h264_url
): 
 194             elif format_id 
== 'rtmp': 
 195                 streamer 
= talk_info
.get('streamer') 
 198                 for resource 
in resources
: 
 200                         'format_id': '%s-%s' % (format_id
, resource
.get('name')), 
 202                         'play_path': resource
['file'], 
 204                         'width': int_or_none(resource
.get('width')), 
 205                         'height': int_or_none(resource
.get('height')), 
 206                         'tbr': int_or_none(resource
.get('bitrate')), 
 208             elif format_id 
== 'hls': 
 209                 formats
.extend(self
._extract
_m
3u8_formats
( 
 210                     resources
.get('stream'), video_name
, 'mp4', m3u8_id
=format_id
, fatal
=False)) 
 212         m3u8_formats 
= list(filter( 
 213             lambda f
: f
.get('protocol') == 'm3u8' and f
.get('vcodec') != 'none' and f
.get('resolution') != 'multiple', 
 216             for m3u8_format 
in m3u8_formats
: 
 217                 bitrate 
= self
._search
_regex
(r
'(\d+k)', m3u8_format
['url'], 'bitrate', default
=None) 
 220                 f 
= m3u8_format
.copy() 
 222                     'url': re
.sub(r
'\d+k', bitrate
, http_url
), 
 223                     'format_id': m3u8_format
['format_id'].replace('hls', 'http'), 
 228         audio_download 
= talk_info
.get('audioDownload') 
 231                 'url': audio_download
, 
 232                 'format_id': 'audio', 
 236         self
._sort
_formats
(formats
) 
 238         video_id 
= compat_str(talk_info
['id']) 
 240         thumbnail 
= talk_info
['thumb'] 
 241         if not thumbnail
.startswith('http'): 
 242             thumbnail 
= 'http://' + thumbnail
 
 245             'title': talk_info
['title'].strip(), 
 246             'uploader': talk_info
['speaker'], 
 247             'thumbnail': thumbnail
, 
 248             'description': self
._og
_search
_description
(webpage
), 
 249             'subtitles': self
._get
_subtitles
(video_id
, talk_info
), 
 251             'duration': talk_info
.get('duration'), 
 254     def _get_subtitles(self
, video_id
, talk_info
): 
 255         languages 
= [lang
['languageCode'] for lang 
in talk_info
.get('languages', [])] 
 261                         'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id
, l
, ext
), 
 264                     for ext 
in ['ted', 'srt'] 
 270     def _watch_info(self
, url
, name
): 
 271         webpage 
= self
._download
_webpage
(url
, name
) 
 273         config_json 
= self
._html
_search
_regex
( 
 274             r
'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>', 
 275             webpage
, 'config', default
=None) 
 277             embed_url 
= self
._search
_regex
( 
 278                 r
"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage
, 'embed url') 
 279             return self
.url_result(self
._proto
_relative
_url
(embed_url
)) 
 280         config 
= json
.loads(config_json
)['config'] 
 281         video_url 
= config
['video']['url'] 
 282         thumbnail 
= config
.get('image', {}).get('url') 
 284         title 
= self
._html
_search
_regex
( 
 285             r
"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage
, 'title') 
 286         description 
= self
._html
_search
_regex
( 
 288                 r
'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>', 
 289                 r
'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>', 
 291             webpage
, 'description', fatal
=False) 
 297             'thumbnail': thumbnail
, 
 298             'description': description
,