Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py
Merge tag 'upstream/2015.11.27.1'
[youtubedl] / youtube_dl / extractor / channel9.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7 ExtractorError,
8 parse_filesize,
9 qualities,
10 )
11
12
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    the WT.entryid meta tag of the web page HTML rather than the URL itself,
    as it is not always possible to tell from the URL alone.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    # The content path is matched lazily and must be followed by an optional
    # trailing slash and then a query/fragment delimiter or the end of the
    # URL, so that none of those leak into the video id or the RSS feed URL.
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)/?(?:[?#]|$)'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': r're:http://.*\.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': r're:http://.*\.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
        {
            # low quality mp4 is best
            'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
            'info_dict': {
                'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
                'ext': 'mp4',
                'title': 'Ranges for the Standard Library',
                'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
                'duration': 5646,
                'thumbnail': r're:http://.*\.jpg',
            },
            'params': {
                'skip_download': True,
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _formats_from_html(self, html):
        '''Return the sorted list of format dicts for the download links found in html.'''
        # The inline (?x) flag has to be the very first thing in the pattern:
        # Python 3.11+ raises re.error for global flags placed anywhere else.
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?    # File size part may be missing
        '''
        # NOTE(review): qualities() presumably ranks a label by its position
        # in this tuple - confirm against ..utils.qualities.
        quality = qualities((
            'MP3', 'MP4',
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            # The filesize group is None when the optional popup is absent;
            # presumably parse_filesize tolerates None - TODO confirm.
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html)]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        '''Return the page title with the trailing " (Channel 9)" suffix removed.'''
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            # Fall back to the Open Graph title when there is no title meta tag
            title = self._og_search_title(html)
        TITLE_SUFFIX = ' (Channel 9)'
        if title is not None and title.endswith(TITLE_SUFFIX):
            title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        '''Return the entry body markup, or the description meta tag as a fallback.'''
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        '''Return the duration in seconds parsed from a "length": "HH:MM:SS" entry, or None.'''
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        if m is None:
            return None
        return (
            int(m.group('hours')) * 60 * 60 +
            int(m.group('minutes')) * 60 +
            int(m.group('seconds')))

    def _extract_slides(self, html):
        '''Return the URL of the "Slides" download link, or None.'''
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        '''Return the URL of the "Zip" download link, or None.'''
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        '''Return the average rating as a float, or 0 when the page shows none.'''
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        '''Return the number of ratings as an int, or 0 when the page shows none.'''
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        '''Return the number of views as an int, or 0 when the page shows none.'''
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        '''Return the number of comments as an int, or 0 when the page shows none.'''
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
        '''Convert a count such as "1,234" to an int; None is passed through.'''
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        '''Return the author names linked inside the first author list item, or None.'''
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        '''Return the text of the "code" list item, or None.'''
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        '''Return the stripped link text of the "day" list item, or None.'''
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day').strip() if m is not None else None

    def _extract_session_room(self, html):
        '''Return the text of the "room" list item, or None.'''
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        '''Return the link texts of all /Events/Speakers/ links (possibly empty).'''
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        '''
        Collect everything downloadable from a content page.

        Returns a list with one info dict per available item (slides, zip,
        recording - in that order), or None after emitting a warning when the
        page offers none of them.
        '''
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        # Fields shared by every item; each result works on its own copy
        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        '''
        Extract a single 'Entry' page.

        Returns the one info dict with an added 'authors' field, or None when
        the page has nothing to download. Raises ExtractorError when the page
        yields more than one downloadable item.
        '''
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result = contents[0]
        result['authors'] = self._extract_authors(html)

        return result

    def _extract_session(self, html, content_path):
        '''
        Extract a 'Session' page as a playlist of its downloadable items.

        Every item is annotated with the session code, day, room and
        speakers. Returns None when the page has nothing to download.
        '''
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return self.playlist_result(contents)

    def _extract_list(self, content_path):
        '''Return a playlist built from the item links of the RSS feed for content_path.'''
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        # find() returns None for a feed without a <title>; do not crash on it
        title_el = rss.find('./channel/title')
        title_text = title_el.text if title_el is not None else None
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        '''
        Download the page and dispatch on the page type in its WT.entryid meta tag.

        'Entry' and 'Session' pages are scanned for downloadable content,
        'Event' pages and pages without the meta tag are treated as lists.
        Raises ExtractorError for any other page type.
        '''
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        # The page type is the part of the content attribute before the first colon
        page_type_m = re.search(r'<meta name="WT\.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is None:  # Assuming list
            return self._extract_list(content_path)

        page_type = page_type_m.group('pagetype')
        if page_type == 'Entry':  # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':  # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        elif page_type == 'Event':
            return self._extract_list(content_path)
        else:
            raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)