]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ted.py
1 from __future__
import unicode_literals
6 from .common
import InfoExtractor
8 from ..compat
import compat_str
15 class TEDIE(InfoExtractor
):
19 (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
21 (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
23 ((?P<type_talk>talks)) # We have a simple talk
25 (?P<type_watch>watch)/[^/]+/[^/]+
27 (/lang/(.*?))? # The url may contain the language
28 /(?P<name>[\w-]+) # Here goes the name and then ".html"
32 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
33 'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
37 'title': 'The illusion of consciousness',
38 'description': ('Philosopher Dan Dennett makes a compelling '
39 'argument that not only don\'t we understand our own '
40 'consciousness, but that half the time our brains are '
41 'actively fooling us.'),
42 'uploader': 'Dan Dennett',
47 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
48 'md5': 'b899ac15e345fb39534d913f7606082b',
52 'title': 'Vishal Sikka: The beauty and power of algorithms',
53 'thumbnail': r
're:^https?://.+\.jpg',
54 'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
55 'upload_date': '20140122',
56 'uploader_id': 'TEDInstitute',
57 'uploader': 'TED Institute',
59 'add_ie': ['Youtube'],
61 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
62 'md5': '71b3ab2f4233012dce09d515c9c39ce2',
66 'title': 'Be passionate. Be courageous. Be your best.',
67 'uploader': 'Gabby Giffords and Mark Kelly',
68 'description': 'md5:5174aed4d0f16021b704120360f72b92',
72 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
75 'title': 'Who are the hackers?',
77 'playlist_mincount': 6,
79 # contains a youtube video
80 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
81 'add_ie': ['Youtube'],
85 'title': 'Douglas Adams: Parrots the Universe and Everything',
86 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
87 'uploader': 'University of California Television (UCTV)',
88 'uploader_id': 'UCtelevision',
89 'upload_date': '20080522',
92 'skip_download': True,
96 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
97 'add_ie': ['Youtube'],
101 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
102 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
103 'uploader': 'TEDx Talks',
104 'uploader_id': 'TEDxTalks',
105 'upload_date': '20111216',
108 'skip_download': True,
113 'low': {'width': 320, 'height': 180},
114 'medium': {'width': 512, 'height': 288},
115 'high': {'width': 854, 'height': 480},
def _extract_info(self, webpage):
    '''Return the page's embedded init data as a parsed JSON value.

    Looks in *webpage* for a script call of the form
    ``q("<name>.init", {...})`` that is followed by ``</script>`` and
    decodes the ``{...}`` argument with ``json.loads``.

    NOTE(review): behaviour when the pattern is absent is decided by
    ``self._search_regex`` (called here without a default) -- confirm
    against the base class.
    '''
    # (?s) lets the greedy ``{.+}`` span newlines, so the capture runs up
    # to the last ``)`` that is directly followed by ``</script>``.
    init_pattern = r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>'
    info_json = self._search_regex(init_pattern, webpage, 'info json')
    return json.loads(info_json)
def _real_extract(self, url):
    '''Dispatch *url* to the handler matching its page type.

    The URL is matched against ``self._VALID_URL`` in verbose mode and
    routed on the named groups of that match:

    * ``type`` starting with ``embed``: rebuild the URL with a ``www``
      host and hand it back via ``self.url_result(..., 'TED')``.
    * ``type_talk`` set: ``self._talk_info(url, name)``.
    * ``type_watch`` set: ``self._watch_info(url, name)``.
    * otherwise: ``self._playlist_videos_info(url, name)``.

    NOTE(review): assumes *url* matches ``_VALID_URL``; ``re.match``
    would return None otherwise -- confirm the caller guarantees this.
    '''
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)

    # Embed URLs are not extracted directly; they are redirected to the
    # equivalent www URL built from the matched scheme and path.
    if mobj.group('type').startswith('embed'):
        desktop_url = mobj.group('proto') + 'www' + mobj.group('urlmain')
        return self.url_result(desktop_url, 'TED')

    name = mobj.group('name')
    if mobj.group('type_talk'):
        return self._talk_info(url, name)
    if mobj.group('type_watch'):
        return self._watch_info(url, name)
    # Neither a talk nor a watch page: treat it as a playlist.
    return self._playlist_videos_info(url, name)
137 def _playlist_videos_info(self
, url
, name
):
138 '''Returns the videos of the playlist'''
140 webpage
= self
._download
_webpage
(url
, name
,
141 'Downloading playlist webpage')
142 info
= self
._extract
_info
(webpage
)
144 playlist_info
= try_get(
145 info
, lambda x
: x
['__INITIAL_DATA__']['playlist'],
146 dict) or info
['playlist']
149 self
.url_result('http://www.ted.com/talks/' + talk
['slug'], self
.ie_key())
151 info
, lambda x
: x
['__INITIAL_DATA__']['talks'],
152 dict) or info
['talks']
154 return self
.playlist_result(
156 playlist_id
=compat_str(playlist_info
['id']),
157 playlist_title
=playlist_info
['title'])
159 def _talk_info(self
, url
, video_name
):
160 webpage
= self
._download
_webpage
(url
, video_name
)
162 info
= self
._extract
_info
(webpage
)
165 info
, lambda x
: x
['__INITIAL_DATA__']['talks'][0],
166 dict) or info
['talks'][0]
168 title
= talk_info
['title'].strip()
170 external
= talk_info
.get('external')
172 service
= external
['service']
173 self
.to_screen('Found video from %s' % service
)
175 if service
.lower() == 'youtube':
176 ext_url
= external
.get('code')
179 'url': ext_url
or external
['uri'],
182 native_downloads
= try_get(
183 talk_info
, lambda x
: x
['downloads']['nativeDownloads'],
184 dict) or talk_info
['nativeDownloads']
188 'format_id': format_id
,
190 } for (format_id
, format_url
) in native_downloads
.items() if format_url
is not None]
193 finfo
= self
._NATIVE
_FORMATS
.get(f
['format_id'])
197 player_talk
= talk_info
['player_talks'][0]
199 resources_
= player_talk
.get('resources') or talk_info
.get('resources')
202 for format_id
, resources
in resources_
.items():
203 if format_id
== 'h264':
204 for resource
in resources
:
205 h264_url
= resource
.get('file')
208 bitrate
= int_or_none(resource
.get('bitrate'))
211 'format_id': '%s-%sk' % (format_id
, bitrate
),
214 if re
.search(r
'\d+k', h264_url
):
216 elif format_id
== 'rtmp':
217 streamer
= talk_info
.get('streamer')
220 for resource
in resources
:
222 'format_id': '%s-%s' % (format_id
, resource
.get('name')),
224 'play_path': resource
['file'],
226 'width': int_or_none(resource
.get('width')),
227 'height': int_or_none(resource
.get('height')),
228 'tbr': int_or_none(resource
.get('bitrate')),
230 elif format_id
== 'hls':
231 formats
.extend(self
._extract
_m
3u8_formats
(
232 resources
.get('stream'), video_name
, 'mp4', m3u8_id
=format_id
, fatal
=False))
234 m3u8_formats
= list(filter(
235 lambda f
: f
.get('protocol') == 'm3u8' and f
.get('vcodec') != 'none',
238 for m3u8_format
in m3u8_formats
:
239 bitrate
= self
._search
_regex
(r
'(\d+k)', m3u8_format
['url'], 'bitrate', default
=None)
242 f
= m3u8_format
.copy()
244 'url': re
.sub(r
'\d+k', bitrate
, http_url
),
245 'format_id': m3u8_format
['format_id'].replace('hls', 'http'),
250 audio_download
= talk_info
.get('audioDownload')
253 'url': audio_download
,
254 'format_id': 'audio',
258 self
._sort
_formats
(formats
)
260 video_id
= compat_str(talk_info
['id'])
265 'uploader': player_talk
.get('speaker') or talk_info
.get('speaker'),
266 'thumbnail': player_talk
.get('thumb') or talk_info
.get('thumb'),
267 'description': self
._og
_search
_description
(webpage
),
268 'subtitles': self
._get
_subtitles
(video_id
, talk_info
),
270 'duration': talk_info
.get('duration'),
273 def _get_subtitles(self
, video_id
, talk_info
):
275 for language
in try_get(
277 (lambda x
: x
['downloads']['languages'],
278 lambda x
: x
['languages']), list):
279 lang_code
= language
.get('languageCode') or language
.get('ianaCode')
282 sub_lang_list
[lang_code
] = [
284 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id
, lang_code
, ext
),
287 for ext
in ['ted', 'srt']
291 def _watch_info(self
, url
, name
):
292 webpage
= self
._download
_webpage
(url
, name
)
294 config_json
= self
._html
_search
_regex
(
295 r
'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
296 webpage
, 'config', default
=None)
298 embed_url
= self
._search
_regex
(
299 r
"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage
, 'embed url')
300 return self
.url_result(self
._proto
_relative
_url
(embed_url
))
301 config
= json
.loads(config_json
)['config']
302 video_url
= config
['video']['url']
303 thumbnail
= config
.get('image', {}).get('url')
305 title
= self
._html
_search
_regex
(
306 r
"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage
, 'title')
307 description
= self
._html
_search
_regex
(
309 r
'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
310 r
'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
312 webpage
, 'description', fatal
=False)
318 'thumbnail': thumbnail
,
319 'description': description
,