Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ted.py
Imported Upstream version 2014.06.07
[youtubedl] / youtube_dl / extractor / ted.py
1 from __future__ import unicode_literals
2
3 import json
4 import re
5
6 from .subtitles import SubtitlesInfoExtractor
7
8 from ..utils import (
9 compat_str,
10 )
11
12
class TEDIE(SubtitlesInfoExtractor):
    '''Information extractor for ted.com talks, playlists and "watch" pages.

    The URL is matched against _VALID_URL and dispatched by its type:
    embed URLs are redirected to the equivalent www URL, talk URLs go to
    _talk_info, watch URLs to _watch_info and everything else (playlists)
    to _playlist_videos_info.
    '''

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': '4ea1dada91e4174b53dac2bb8ace429d',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                'argument that not only don\'t we understand our own '
                'consciousness, but that half the time our brains are '
                'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': '49144e345a899b8cb34d315f3b9cfeeb',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
        },
    }]

    # Extra metadata merged into a native download format whose id matches
    # one of these keys.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the decoded JSON object passed to q("<name>.init", ...).

        webpage is the HTML source of a talk or playlist page.
        '''
        # The dot is escaped so that only a literal "<name>.init" matches.
        info_json = self._search_regex(
            r'q\("\w+\.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        '''Dispatch url to the handler matching its type.'''
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            # Rebuild the URL on the www host and hand it back to this
            # extractor.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Returns the videos of the playlist'''

        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        # Each talk is returned as a URL result built from its slug.
        playlist_entries = [
            self.url_result(
                'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        '''Return the info dict of a single talk.

        Native (HTTP) downloads are preferred; when none are available the
        RTMP resources are used instead. Returns None after listing the
        subtitles when the 'listsubtitles' parameter is set.
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        # A missing or null 'nativeDownloads' must fall through to the RTMP
        # branch instead of failing on .items().
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = [{
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
        } for (format_id, format_url) in native_downloads.items()
            if format_url is not None]
        if formats:
            for f in formats:
                finfo = self._NATIVE_FORMATS.get(f['format_id'])
                if finfo:
                    f.update(finfo)
        else:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        thumbnail = talk_info['thumb']
        if not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        '''Return a dict mapping language codes to SRT subtitle URLs.

        Emits a warning and returns an empty dict when talk_info lists no
        languages.
        '''
        languages = [
            lang['languageCode'] for lang in talk_info.get('languages', [])]
        if languages:
            sub_lang_list = {}
            for lang_code in languages:
                url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, lang_code)
                sub_lang_list[lang_code] = url
            return sub_lang_list
        else:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

    def _watch_info(self, url, name):
        '''Return the info dict of a ted.com/watch/... page.

        The video URL and optional thumbnail come from the JSON stored in
        the page's data-config attribute; the title and description are
        scraped from the HTML.
        '''
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        # The description is optional (fatal=False); two page layouts are
        # tried in order.
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }