from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    Whether a given URL is a single video or a playlist is determined from
    the WT.entryid meta tag of the page HTML rather than from the URL itself,
    since the URL alone does not always make that distinction possible.
    '''
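    # The page-type marker has roughly this shape (an assumption, inferred
    # from the regex in _real_extract below, not from the original source):
    #   <meta name="WT.entryid" content="Entry:..."/>
    # The token before the first colon ('Entry', 'Session' or 'Event')
    # selects the extraction strategy.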
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality, worst first; the list index doubles as the format preference
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

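    # Illustrative example (not in the original source): a displayed size of
    # '23.9 MB' yields int(23.9 * 1024 ** 2) == 25060966 bytes; unparseable
    # input falls back to 0.
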
    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?  # File size part may be missing
        '''
        # Extract known formats
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

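    # Illustrative only -- markup shaped to match FORMAT_REGEX above (the URL
    # and usage note are made up for the example):
    #   <a href="http://media.ch9.ms/KOS002.mp4">High Quality MP4</a>
    #   <span class="usage">(iPad, PC)</span>
    # would produce format_id 'High Quality MP4' with format_note 'iPad, PC';
    # the optional "File size" popup, when present, fills in 'filesize'.
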
    def _extract_title(self, html):
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

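    # Worked example (not in the original source): a page carrying
    # "length":"01:16:16" gives 1 * 3600 + 16 * 60 + 16 == 4576 seconds,
    # the duration expected by the first test case above.
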
    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
        return int(str(count).replace(',', '')) if count is not None else None

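    # Illustrative example (not in the original source): _fix_count('10,934')
    # drops the thousands separator and returns the integer 10934.
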
    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        # Build up to three entries sharing the common metadata: the slides
        # link, the zip download and the actual recording with its formats
        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_list(self, content_path):
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

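    # Illustrative feed shape assumed by _extract_list above (the element
    # paths come from its findall/find calls; the link URL is taken from the
    # first test case, the title is made up):
    #   <rss><channel>
    #     <title>TechEd Australia 2013</title>
    #     <item><link>http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002</link></item>
    #   </channel></rss>
    # Each <link> is handed back to this extractor as a playlist entry.
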
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is not None:
            page_type = page_type_m.group('pagetype')
            if page_type == 'Entry':  # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
        else:  # Assume a list page
            return self._extract_list(content_path)
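
# Minimal usage sketch (not part of the original module): extractors are
# normally driven through YoutubeDL rather than instantiated directly, e.g.:
#
#   import youtube_dl
#   with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
#       info = ydl.extract_info(
#           'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002')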