Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ted.py
Imported Upstream version 2015.02.28
[youtubedl] / youtube_dl / extractor / ted.py
1 from __future__ import unicode_literals
2
3 import json
4 import re
5
6 from .common import InfoExtractor
7
8 from ..compat import (
9 compat_str,
10 )
11
12
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talk, playlist and "watch" pages.

    ``_real_extract`` dispatches on the named groups of ``_VALID_URL``:
    embed URLs are handed back as their ``www`` equivalent, and all other
    URLs go to ``_talk_info``, ``_watch_info`` or ``_playlist_videos_info``.
    """

    # Verbose-mode pattern; whitespace and ``#`` comments inside it are
    # ignored by the regex engine.
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
        |
            ((?P<type_talk>talks)) # We have a simple talk
        |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: ``\.`` is a regex escape, not a string escape.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra fields merged into a native download format, keyed by its
    # format id (see _talk_info).
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON argument of the page's ``q("<name>.init", ...)`` call.

        webpage -- HTML source of a talk or playlist page.
        """
        info_json = self._search_regex(
            # The dot is escaped so only a literal "<name>.init" matches.
            r'q\("\w+\.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Route *url* to the handler for its page type and return the result.

        Embed URLs are not extracted here; they are returned as a URL
        result pointing at the same path on the ``www`` host.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk of the playlist.

        url  -- playlist page URL.
        name -- name taken from the URL; used as the id while downloading.
        """
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Return the info dict for a single talk page.

        When the talk data carries an ``external`` entry, a ``url`` type
        result pointing at the external video is returned instead. Otherwise
        formats are built from ``nativeDownloads``, falling back to the RTMP
        resources when no native download URL is available.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                # For YouTube the ``code`` field is preferred over ``uri``.
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        # Tolerate a missing or null ``nativeDownloads`` entry so that the
        # RTMP fallback below is reached instead of raising here.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = [{
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
        } for (format_id, format_url) in native_downloads.items() if format_url is not None]
        if formats:
            for f in formats:
                finfo = self._NATIVE_FORMATS.get(f['format_id'])
                if finfo:
                    f.update(finfo)
        else:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        thumbnail = talk_info['thumb']
        if not thumbnail.startswith('http'):
            # Value has no http(s) prefix; prepend a scheme.
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        """Map each language code of the talk to its subtitle descriptors.

        Every language listed under ``talk_info['languages']`` gets two
        entries, one per format (``ted`` and ``srt``). An empty dict is
        returned when no languages are listed.
        """
        sub_lang_list = {}
        # ``or []`` also covers an explicit null value, not just a missing key.
        for lang in talk_info.get('languages') or []:
            lang_code = lang['languageCode']
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        """Return the info dict for a ``/watch/`` page.

        The video URL and optional thumbnail come from the JSON config passed
        to ``pages.jwplayer`` in the page; title and description are scraped
        from the HTML. The URL name is used as the video id.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        # The description is optional (fatal=False); two page layouts are tried.
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }