]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py
Imported Upstream version 2013.12.23
[youtubedl] / youtube_dl / extractor / channel9.py
1 # encoding: utf-8
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import ExtractorError
7
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = u'Channel 9'
    IE_NAME = u'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                u'duration': 4576,
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen'],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                u'duration': 1540,
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [u'Mike Wilmot'],
            },
        },
    ]

    # RSS feed used to enumerate the items of a 'List' page
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Formats we know how to download, sorted by ascending quality
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
54
55 def _restore_bytes(self, formatted_size):
56 if not formatted_size:
57 return 0
58 m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
59 if not m:
60 return 0
61 units = m.group('units')
62 try:
63 exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
64 except ValueError:
65 return 0
66 size = float(m.group('size'))
67 return int(size * (1024 ** exponent))
68
69 def _formats_from_html(self, html):
70 FORMAT_REGEX = r'''
71 (?x)
72 <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
73 <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
74 (?:<div\s+class="popup\s+rounded">\s*
75 <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
76 </div>)? # File size part may be missing
77 '''
78 # Extract known formats
79 formats = [{'url': x.group('url'),
80 'format_id': x.group('quality'),
81 'format_note': x.group('note'),
82 'format': '%s (%s)' % (x.group('quality'), x.group('note')),
83 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
84 } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
85 # Sort according to known formats list
86 formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
87 return formats
88
89 def _extract_title(self, html):
90 title = self._html_search_meta(u'title', html, u'title')
91 if title is None:
92 title = self._og_search_title(html)
93 TITLE_SUFFIX = u' (Channel 9)'
94 if title is not None and title.endswith(TITLE_SUFFIX):
95 title = title[:-len(TITLE_SUFFIX)]
96 return title
97
98 def _extract_description(self, html):
99 DESCRIPTION_REGEX = r'''(?sx)
100 <div\s+class="entry-content">\s*
101 <div\s+id="entry-body">\s*
102 (?P<description>.+?)\s*
103 </div>\s*
104 </div>
105 '''
106 m = re.search(DESCRIPTION_REGEX, html)
107 if m is not None:
108 return m.group('description')
109 return self._html_search_meta(u'description', html, u'description')
110
111 def _extract_duration(self, html):
112 m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
113 return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
114
115 def _extract_slides(self, html):
116 m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
117 return m.group('slidesurl') if m is not None else None
118
119 def _extract_zip(self, html):
120 m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
121 return m.group('zipurl') if m is not None else None
122
123 def _extract_avg_rating(self, html):
124 m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
125 return float(m.group('avgrating')) if m is not None else 0
126
127 def _extract_rating_count(self, html):
128 m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
129 return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
130
131 def _extract_view_count(self, html):
132 m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
133 return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
134
135 def _extract_comment_count(self, html):
136 m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
137 return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
138
139 def _fix_count(self, count):
140 return int(str(count).replace(',', '')) if count is not None else None
141
142 def _extract_authors(self, html):
143 m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
144 if m is None:
145 return None
146 return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
147
148 def _extract_session_code(self, html):
149 m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
150 return m.group('code') if m is not None else None
151
152 def _extract_session_day(self, html):
153 m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
154 return m.group('day') if m is not None else None
155
156 def _extract_session_room(self, html):
157 m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
158 return m.group('room') if m is not None else None
159
160 def _extract_session_speakers(self, html):
161 return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
162
163 def _extract_content(self, html, content_path):
164 # Look for downloadable content
165 formats = self._formats_from_html(html)
166 slides = self._extract_slides(html)
167 zip_ = self._extract_zip(html)
168
169 # Nothing to download
170 if len(formats) == 0 and slides is None and zip_ is None:
171 self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
172 return
173
174 # Extract meta
175 title = self._extract_title(html)
176 description = self._extract_description(html)
177 thumbnail = self._og_search_thumbnail(html)
178 duration = self._extract_duration(html)
179 avg_rating = self._extract_avg_rating(html)
180 rating_count = self._extract_rating_count(html)
181 view_count = self._extract_view_count(html)
182 comment_count = self._extract_comment_count(html)
183
184 common = {'_type': 'video',
185 'id': content_path,
186 'description': description,
187 'thumbnail': thumbnail,
188 'duration': duration,
189 'avg_rating': avg_rating,
190 'rating_count': rating_count,
191 'view_count': view_count,
192 'comment_count': comment_count,
193 }
194
195 result = []
196
197 if slides is not None:
198 d = common.copy()
199 d.update({ 'title': title + '-Slides', 'url': slides })
200 result.append(d)
201
202 if zip_ is not None:
203 d = common.copy()
204 d.update({ 'title': title + '-Zip', 'url': zip_ })
205 result.append(d)
206
207 if len(formats) > 0:
208 d = common.copy()
209 d.update({ 'title': title, 'formats': formats })
210 result.append(d)
211
212 return result
213
214 def _extract_entry_item(self, html, content_path):
215 contents = self._extract_content(html, content_path)
216 if contents is None:
217 return contents
218
219 authors = self._extract_authors(html)
220
221 for content in contents:
222 content['authors'] = authors
223
224 return contents
225
226 def _extract_session(self, html, content_path):
227 contents = self._extract_content(html, content_path)
228 if contents is None:
229 return contents
230
231 session_meta = {'session_code': self._extract_session_code(html),
232 'session_day': self._extract_session_day(html),
233 'session_room': self._extract_session_room(html),
234 'session_speakers': self._extract_session_speakers(html),
235 }
236
237 for content in contents:
238 content.update(session_meta)
239
240 return contents
241
242 def _extract_list(self, content_path):
243 rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
244 entries = [self.url_result(session_url.text, 'Channel9')
245 for session_url in rss.findall('./channel/item/link')]
246 title_text = rss.find('./channel/title').text
247 return self.playlist_result(entries, content_path, title_text)
248
249 def _real_extract(self, url):
250 mobj = re.match(self._VALID_URL, url)
251 content_path = mobj.group('contentpath')
252
253 webpage = self._download_webpage(url, content_path, u'Downloading web page')
254
255 page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
256 if page_type_m is None:
257 raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)
258
259 page_type = page_type_m.group('pagetype')
260 if page_type == 'List': # List page, may contain list of 'item'-like objects
261 return self._extract_list(content_path)
262 elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
263 return self._extract_entry_item(webpage, content_path)
264 elif page_type == 'Session': # Event session page, may contain downloadable content
265 return self._extract_session(webpage, content_path)
266 else:
267 raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)