]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ted.py
f9b6aa48f03d3b3a8cf49f80573eb9d24d115384
1 from __future__
import unicode_literals
6 from .common
import InfoExtractor
8 from ..compat
import compat_str
17 class TEDIE(InfoExtractor
):
21 (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
23 (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
25 ((?P<type_talk>talks)) # We have a simple talk
27 (?P<type_watch>watch)/[^/]+/[^/]+
29 (/lang/(.*?))? # The url may contain the language
30 /(?P<name>[\w-]+) # Here goes the name and then ".html"
34 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
35 'md5': 'b0ce2b05ca215042124fbc9e3886493a',
39 'title': 'The illusion of consciousness',
40 'description': ('Philosopher Dan Dennett makes a compelling '
41 'argument that not only don\'t we understand our own '
42 'consciousness, but that half the time our brains are '
43 'actively fooling us.'),
44 'uploader': 'Dan Dennett',
52 'skip_download': True,
55 # missing HTTP bitrates
56 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
60 'title': 'The beauty and power of algorithms',
61 'thumbnail': r
're:^https?://.+\.jpg',
62 'description': 'md5:734e352710fb00d840ab87ae31aaf688',
63 'uploader': 'Vishal Sikka',
66 'skip_download': True,
69 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
70 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
74 'title': 'Be passionate. Be courageous. Be your best.',
75 'uploader': 'Gabby Giffords and Mark Kelly',
76 'description': 'md5:5174aed4d0f16021b704120360f72b92',
80 'skip_download': True,
83 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
86 'title': 'Who are the hackers?',
88 'playlist_mincount': 6,
90 # contains a youtube video
91 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
92 'add_ie': ['Youtube'],
96 'title': 'Douglas Adams: Parrots the Universe and Everything',
97 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
98 'uploader': 'University of California Television (UCTV)',
99 'uploader_id': 'UCtelevision',
100 'upload_date': '20080522',
103 'skip_download': True,
107 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
111 'title': 'The orchestra in my mouth',
112 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
113 'uploader': 'Tom Thum',
115 'comment_count': int,
119 'skip_download': True,
124 'low': {'width': 320, 'height': 180},
125 'medium': {'width': 512, 'height': 288},
126 'high': {'width': 854, 'height': 480},
def _extract_info(self, webpage):
    '''Return the decoded JSON object passed to the page's q("<name>.init", {...}) call.

    webpage -- text of the downloaded page to search.

    The JSON text is located with self._search_regex (called without a
    default, so a missing match is handled by that helper) and decoded
    with json.loads, which raises ValueError on malformed JSON.
    '''
    info_json = self._search_regex(
        # (?s) lets "." cross newlines.  The object is matched lazily
        # ({.+?}) so the capture ends at the first ")</script>" after the
        # init call; a greedy match would run on to the last such sequence
        # in the page and capture the markup and scripts in between,
        # yielding text that is not valid JSON.
        r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
        webpage, 'info json')
    return json.loads(info_json)
def _real_extract(self, url):
    '''Match url against _VALID_URL and hand it to the matching handler.

    url -- the URL to process.

    Embed URLs are rewritten to the "www" host and returned as a url_result
    for the 'TED' extractor.  Otherwise the named groups of the match pick
    the handler: _talk_info, _watch_info, or (when neither group matched)
    _playlist_videos_info.
    '''
    match = re.match(self._VALID_URL, url, re.VERBOSE)

    if match.group('type').startswith('embed'):
        # Same scheme and path, but with the host prefix forced to "www".
        return self.url_result(
            match.group('proto') + 'www' + match.group('urlmain'), 'TED')

    name = match.group('name')

    if match.group('type_talk'):
        return self._talk_info(url, name)
    if match.group('type_watch'):
        return self._watch_info(url, name)
    # Neither a talk nor a watch page: treat it as a playlist.
    return self._playlist_videos_info(url, name)
148 def _playlist_videos_info(self
, url
, name
):
149 '''Returns the videos of the playlist'''
151 webpage
= self
._download
_webpage
(url
, name
,
152 'Downloading playlist webpage')
153 info
= self
._extract
_info
(webpage
)
155 playlist_info
= try_get(
156 info
, lambda x
: x
['__INITIAL_DATA__']['playlist'],
157 dict) or info
['playlist']
160 self
.url_result('http://www.ted.com/talks/' + talk
['slug'], self
.ie_key())
162 info
, lambda x
: x
['__INITIAL_DATA__']['talks'],
163 dict) or info
['talks']
165 return self
.playlist_result(
167 playlist_id
=compat_str(playlist_info
['id']),
168 playlist_title
=playlist_info
['title'])
170 def _talk_info(self
, url
, video_name
):
171 webpage
= self
._download
_webpage
(url
, video_name
)
173 info
= self
._extract
_info
(webpage
)
175 data
= try_get(info
, lambda x
: x
['__INITIAL_DATA__'], dict) or info
176 talk_info
= data
['talks'][0]
178 title
= talk_info
['title'].strip()
180 native_downloads
= try_get(
182 (lambda x
: x
['downloads']['nativeDownloads'],
183 lambda x
: x
['nativeDownloads']),
188 'format_id': format_id
,
190 } for (format_id
, format_url
) in native_downloads
.items() if format_url
is not None]
193 finfo
= self
._NATIVE
_FORMATS
.get(f
['format_id'])
197 player_talk
= talk_info
['player_talks'][0]
199 external
= player_talk
.get('external')
200 if isinstance(external
, dict):
201 service
= external
.get('service')
202 if isinstance(service
, compat_str
):
204 if service
.lower() == 'youtube':
205 ext_url
= external
.get('code')
208 'url': ext_url
or external
['uri'],
211 resources_
= player_talk
.get('resources') or talk_info
.get('resources')
214 for format_id
, resources
in resources_
.items():
215 if format_id
== 'h264':
216 for resource
in resources
:
217 h264_url
= resource
.get('file')
220 bitrate
= int_or_none(resource
.get('bitrate'))
223 'format_id': '%s-%sk' % (format_id
, bitrate
),
226 if re
.search(r
'\d+k', h264_url
):
228 elif format_id
== 'rtmp':
229 streamer
= talk_info
.get('streamer')
232 for resource
in resources
:
234 'format_id': '%s-%s' % (format_id
, resource
.get('name')),
236 'play_path': resource
['file'],
238 'width': int_or_none(resource
.get('width')),
239 'height': int_or_none(resource
.get('height')),
240 'tbr': int_or_none(resource
.get('bitrate')),
242 elif format_id
== 'hls':
243 if not isinstance(resources
, dict):
245 stream_url
= url_or_none(resources
.get('stream'))
248 formats
.extend(self
._extract
_m
3u8_formats
(
249 stream_url
, video_name
, 'mp4', m3u8_id
=format_id
,
252 m3u8_formats
= list(filter(
253 lambda f
: f
.get('protocol') == 'm3u8' and f
.get('vcodec') != 'none',
256 for m3u8_format
in m3u8_formats
:
257 bitrate
= self
._search
_regex
(r
'(\d+k)', m3u8_format
['url'], 'bitrate', default
=None)
260 bitrate_url
= re
.sub(r
'\d+k', bitrate
, http_url
)
261 if not self
._is
_valid
_url
(
262 bitrate_url
, video_name
, '%s bitrate' % bitrate
):
264 f
= m3u8_format
.copy()
267 'format_id': m3u8_format
['format_id'].replace('hls', 'http'),
272 audio_download
= talk_info
.get('audioDownload')
275 'url': audio_download
,
276 'format_id': 'audio',
280 self
._sort
_formats
(formats
)
282 video_id
= compat_str(talk_info
['id'])
287 'uploader': player_talk
.get('speaker') or talk_info
.get('speaker'),
288 'thumbnail': player_talk
.get('thumb') or talk_info
.get('thumb'),
289 'description': self
._og
_search
_description
(webpage
),
290 'subtitles': self
._get
_subtitles
(video_id
, talk_info
),
292 'duration': float_or_none(talk_info
.get('duration')),
293 'view_count': int_or_none(data
.get('viewed_count')),
294 'comment_count': int_or_none(
295 try_get(data
, lambda x
: x
['comments']['count'])),
296 'tags': try_get(talk_info
, lambda x
: x
['tags'], list),
299 def _get_subtitles(self
, video_id
, talk_info
):
301 for language
in try_get(
303 (lambda x
: x
['downloads']['languages'],
304 lambda x
: x
['languages']), list):
305 lang_code
= language
.get('languageCode') or language
.get('ianaCode')
308 sub_lang_list
[lang_code
] = [
310 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id
, lang_code
, ext
),
313 for ext
in ['ted', 'srt']
317 def _watch_info(self
, url
, name
):
318 webpage
= self
._download
_webpage
(url
, name
)
320 config_json
= self
._html
_search
_regex
(
321 r
'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
322 webpage
, 'config', default
=None)
324 embed_url
= self
._search
_regex
(
325 r
"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage
, 'embed url')
326 return self
.url_result(self
._proto
_relative
_url
(embed_url
))
327 config
= json
.loads(config_json
)['config']
328 video_url
= config
['video']['url']
329 thumbnail
= config
.get('image', {}).get('url')
331 title
= self
._html
_search
_regex
(
332 r
"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage
, 'title')
333 description
= self
._html
_search
_regex
(
335 r
'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
336 r
'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
338 webpage
, 'description', fatal
=False)
344 'thumbnail': thumbnail
,
345 'description': description
,