Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ted.py
debian/copyright: Fix lintian's dep5-copyright-license-name-not-unique.
[youtubedl] / youtube_dl / extractor / ted.py
1 from __future__ import unicode_literals
2
3 import json
4 import re
5
6 from .common import InfoExtractor
7
8 from ..compat import compat_str
9 from ..utils import int_or_none
10
11
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks, playlists and "watch" pages.

    URLs on the ``embed`` / ``embed-ssl`` hosts are redirected to the
    equivalent ``www`` URL and handled again by this extractor.
    """

    IE_NAME = 'ted'
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: the pattern contains a backslash escape meant for
            # the regex engine, not for the string literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra format fields merged into each native download whose key
    # ('low' / 'medium' / 'high') appears in the page's nativeDownloads map.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the JSON object passed to the page's ``q("<name>.init", ...)`` call."""
        info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
                                       webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk, watch-page or playlist handler."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            # Embed URLs are rewritten to the www host and re-extracted.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk in the playlist."""
        webpage = self._download_webpage(url, name,
                                         'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Extract a single talk.

        Returns a ``url`` result when the talk is hosted externally,
        otherwise an info dict whose formats are collected from the page's
        native downloads, h264/rtmp/hls resources and audio download.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        formats = []

        # A missing or null map must not abort extraction: the remaining
        # sources below may still provide formats.
        native_downloads = talk_info.get('nativeDownloads') or {}
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            native_format = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            native_format.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(native_format)

        for format_id, resources in (talk_info.get('resources') or {}).items():
            if format_id == 'h264':
                for resource in resources:
                    h264_url = resource.get('file')
                    if not h264_url:
                        continue
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': h264_url,
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                    })
            elif format_id == 'rtmp':
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources:
                    play_path = resource.get('file')
                    if not play_path:
                        continue
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': play_path,
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                stream_url = resources.get('stream')
                if not stream_url:
                    # Nothing to download a manifest from.
                    continue
                hls_formats = self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id)
                for f in hls_formats:
                    if f.get('format_id') == 'hls-meta':
                        continue
                    # Variants without a height are tagged as having no
                    # video codec, the others as having no audio codec.
                    if not f.get('height'):
                        f['vcodec'] = 'none'
                    else:
                        f['acodec'] = 'none'
                formats.extend(hls_formats)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
                'preference': -0.5,
            })

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        # The thumbnail is optional metadata; normalise it to an absolute
        # URL only when one is present.
        thumbnail = talk_info.get('thumb')
        if thumbnail:
            if thumbnail.startswith('//'):
                thumbnail = 'http:' + thumbnail
            elif not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail

        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info.get('speaker'),
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        """Map each language code of the talk to its 'ted' and 'srt' subtitle URLs.

        Returns an empty dict when the talk lists no languages.
        """
        sub_lang_list = {}
        for lang in talk_info.get('languages') or []:
            lang_code = lang['languageCode']
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        """Extract a "watch" page from its embedded jwplayer configuration."""
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # 'image' is optional and may be absent or null.
        thumbnail = (config.get('image') or {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }