from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    parse_filesize,
    qualities,
)


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    Whether a given URL is a video or a playlist is determined from the
    WT.entryid meta tag in the page HTML rather than from the URL itself,
    since the URL alone does not always make the distinction.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
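    # <contentpath> doubles as the video/playlist id; an optional trailing
    # /RSS suffix switches extraction to the RSS feed for the same path.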

    _TESTS = [{
        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
        'info_dict': {
            'id': 'Events/TechEd/Australia/2013/KOS002',
            'ext': 'mp4',
            'title': 'Developer Kick-Off Session: Stuff We Love',
            'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
            'duration': 4576,
            'thumbnail': r're:http://.*\.jpg',
            'session_code': 'KOS002',
            'session_day': 'Day 1',
            'session_room': 'Arena 1A',
            'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
                                 'Mads Kristensen'],
        },
    }, {
        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
        'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
        'info_dict': {
            'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'ext': 'mp4',
            'title': 'Self-service BI with Power BI - nuclear testing',
            'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
            'duration': 1540,
            'thumbnail': r're:http://.*\.jpg',
            'authors': ['Mike Wilmot'],
        },
    }, {
        # low quality mp4 is best
        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
        'info_dict': {
            'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
            'ext': 'mp4',
            'title': 'Ranges for the Standard Library',
            'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
            'duration': 5646,
            'thumbnail': r're:http://.*\.jpg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
        'info_dict': {
            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
            'title': 'Channel 9',
        },
        'playlist_count': 2,
    }, {
        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
        'only_matching': True,
    }, {
        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
        'only_matching': True,
    }]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?  # File size part may be missing
        '''
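        # A minimal sketch of the download-links markup this regex expects
        # (inferred from the pattern itself, not verified against live pages):
        #   <a href="http://example.com/video.mp4">High Quality MP4</a>
        #   <span class="usage">(video download)</span>
        #   <div class="popup rounded"><h3>File size</h3>123.4 MB</div>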
        quality = qualities((
            'MP3', 'MP4',
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
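        # qualities() ranks format ids by their position in the list above,
        # so later entries ('High Quality MP4') are preferred when sorting.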
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html)]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
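        # Convert an HH:MM:SS length field to seconds, e.g. "01:16:16"
        # -> 1 * 3600 + 16 * 60 + 16 = 4576 (the first test's duration).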
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
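        # Counts are rendered with thousands separators, e.g. '1,234' -> 1234.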
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day').strip() if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

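        # Slides and Zip downloads, when present, become separate entries that
        # share the video metadata but carry a '-Slides'/'-Zip' title suffix.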
        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result = contents[0]
        result['authors'] = self._extract_authors(html)

        return result

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return self.playlist_result(contents)

    def _extract_list(self, video_id, rss_url=None):
        if not rss_url:
            rss_url = self._RSS_URL % video_id
        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
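        # The feed is assumed to follow the usual RSS 2.0 shape, e.g.:
        #   <rss><channel><title>...</title>
        #     <item><link>https://channel9.msdn.com/...</link></item>
        #   </channel></rss>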
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, video_id, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')
        rss = mobj.group('rss')

        if rss:
            return self._extract_list(content_path, url)

        webpage = self._download_webpage(
            url, content_path, 'Downloading web page')

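        # The page type is read from a meta tag whose content appears to start
        # with '<type>:' (an assumption based on the regex below), e.g.:
        #   <meta name="WT.entryid" content="Session:...">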
        page_type = self._search_regex(
            r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
            webpage, 'page type', default=None, group='pagetype')
        if page_type:
            if page_type == 'Entry':  # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
        else:  # Assuming list
            return self._extract_list(content_path)