Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py
Merge tag 'upstream/2014.01.17.2'
[youtubedl] / youtube_dl / extractor / channel9.py
1 # encoding: utf-8
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import ExtractorError
7
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of the provided URL (video or playlist) is determined from the
    Search.PageType meta tag found in the downloaded web page rather than from
    the URL itself, because the URL alone does not always reveal it.
    '''
    IE_DESC = u'Channel 9'
    IE_NAME = u'channel9'
    # The content path is matched lazily and the pattern is anchored at the
    # end of the URL so that an optional trailing slash is stripped instead of
    # being swallowed into the "contentpath" group (which is later used as the
    # video id and interpolated into _RSS_URL).
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)/?$'

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                u'duration': 4576,
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen'],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                u'duration': 1540,
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [u'Mike Wilmot'],
            },
        }
    ]

    # Template of the RSS feed URL for a list page; filled with the content path
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality, lowest first; the position of a format label in this
    # list is used as the 'preference' value of the extracted format
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
54
55 def _restore_bytes(self, formatted_size):
56 if not formatted_size:
57 return 0
58 m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
59 if not m:
60 return 0
61 units = m.group('units')
62 try:
63 exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
64 except ValueError:
65 return 0
66 size = float(m.group('size'))
67 return int(size * (1024 ** exponent))
68
69 def _formats_from_html(self, html):
70 FORMAT_REGEX = r'''
71 (?x)
72 <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
73 <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
74 (?:<div\s+class="popup\s+rounded">\s*
75 <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
76 </div>)? # File size part may be missing
77 '''
78 # Extract known formats
79 formats = [{
80 'url': x.group('url'),
81 'format_id': x.group('quality'),
82 'format_note': x.group('note'),
83 'format': u'%s (%s)' % (x.group('quality'), x.group('note')),
84 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
85 'preference': self._known_formats.index(x.group('quality')),
86 'vcodec': 'none' if x.group('note') == 'Audio only' else None,
87 } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
88
89 self._sort_formats(formats)
90
91 return formats
92
93 def _extract_title(self, html):
94 title = self._html_search_meta(u'title', html, u'title')
95 if title is None:
96 title = self._og_search_title(html)
97 TITLE_SUFFIX = u' (Channel 9)'
98 if title is not None and title.endswith(TITLE_SUFFIX):
99 title = title[:-len(TITLE_SUFFIX)]
100 return title
101
102 def _extract_description(self, html):
103 DESCRIPTION_REGEX = r'''(?sx)
104 <div\s+class="entry-content">\s*
105 <div\s+id="entry-body">\s*
106 (?P<description>.+?)\s*
107 </div>\s*
108 </div>
109 '''
110 m = re.search(DESCRIPTION_REGEX, html)
111 if m is not None:
112 return m.group('description')
113 return self._html_search_meta(u'description', html, u'description')
114
115 def _extract_duration(self, html):
116 m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
117 return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
118
119 def _extract_slides(self, html):
120 m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
121 return m.group('slidesurl') if m is not None else None
122
123 def _extract_zip(self, html):
124 m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
125 return m.group('zipurl') if m is not None else None
126
127 def _extract_avg_rating(self, html):
128 m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
129 return float(m.group('avgrating')) if m is not None else 0
130
131 def _extract_rating_count(self, html):
132 m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
133 return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
134
135 def _extract_view_count(self, html):
136 m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
137 return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
138
139 def _extract_comment_count(self, html):
140 m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
141 return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
142
143 def _fix_count(self, count):
144 return int(str(count).replace(',', '')) if count is not None else None
145
146 def _extract_authors(self, html):
147 m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
148 if m is None:
149 return None
150 return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
151
152 def _extract_session_code(self, html):
153 m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
154 return m.group('code') if m is not None else None
155
156 def _extract_session_day(self, html):
157 m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
158 return m.group('day') if m is not None else None
159
160 def _extract_session_room(self, html):
161 m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
162 return m.group('room') if m is not None else None
163
164 def _extract_session_speakers(self, html):
165 return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
166
167 def _extract_content(self, html, content_path):
168 # Look for downloadable content
169 formats = self._formats_from_html(html)
170 slides = self._extract_slides(html)
171 zip_ = self._extract_zip(html)
172
173 # Nothing to download
174 if len(formats) == 0 and slides is None and zip_ is None:
175 self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
176 return
177
178 # Extract meta
179 title = self._extract_title(html)
180 description = self._extract_description(html)
181 thumbnail = self._og_search_thumbnail(html)
182 duration = self._extract_duration(html)
183 avg_rating = self._extract_avg_rating(html)
184 rating_count = self._extract_rating_count(html)
185 view_count = self._extract_view_count(html)
186 comment_count = self._extract_comment_count(html)
187
188 common = {'_type': 'video',
189 'id': content_path,
190 'description': description,
191 'thumbnail': thumbnail,
192 'duration': duration,
193 'avg_rating': avg_rating,
194 'rating_count': rating_count,
195 'view_count': view_count,
196 'comment_count': comment_count,
197 }
198
199 result = []
200
201 if slides is not None:
202 d = common.copy()
203 d.update({ 'title': title + '-Slides', 'url': slides })
204 result.append(d)
205
206 if zip_ is not None:
207 d = common.copy()
208 d.update({ 'title': title + '-Zip', 'url': zip_ })
209 result.append(d)
210
211 if len(formats) > 0:
212 d = common.copy()
213 d.update({ 'title': title, 'formats': formats })
214 result.append(d)
215
216 return result
217
218 def _extract_entry_item(self, html, content_path):
219 contents = self._extract_content(html, content_path)
220 if contents is None:
221 return contents
222
223 authors = self._extract_authors(html)
224
225 for content in contents:
226 content['authors'] = authors
227
228 return contents
229
230 def _extract_session(self, html, content_path):
231 contents = self._extract_content(html, content_path)
232 if contents is None:
233 return contents
234
235 session_meta = {'session_code': self._extract_session_code(html),
236 'session_day': self._extract_session_day(html),
237 'session_room': self._extract_session_room(html),
238 'session_speakers': self._extract_session_speakers(html),
239 }
240
241 for content in contents:
242 content.update(session_meta)
243
244 return contents
245
246 def _extract_list(self, content_path):
247 rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
248 entries = [self.url_result(session_url.text, 'Channel9')
249 for session_url in rss.findall('./channel/item/link')]
250 title_text = rss.find('./channel/title').text
251 return self.playlist_result(entries, content_path, title_text)
252
253 def _real_extract(self, url):
254 mobj = re.match(self._VALID_URL, url)
255 content_path = mobj.group('contentpath')
256
257 webpage = self._download_webpage(url, content_path, u'Downloading web page')
258
259 page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
260 if page_type_m is None:
261 raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)
262
263 page_type = page_type_m.group('pagetype')
264 if page_type == 'List': # List page, may contain list of 'item'-like objects
265 return self._extract_list(content_path)
266 elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
267 return self._extract_entry_item(webpage, content_path)
268 elif page_type == 'Session': # Event session page, may contain downloadable content
269 return self._extract_session(webpage, content_path)
270 else:
271 raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)