# youtube_dl/extractor/channel9.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    parse_filesize,
    qualities,
)


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of the provided URL (video or playlist) is determined from the
    WT.entryid meta tag in the web page HTML rather than from the URL itself,
    as that is not always possible.
    '''

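    # Page types observed in the WT.entryid meta tag and handled in
    # _real_extract below: 'Entry' (a single post), 'Session' (an event
    # session) and 'Event' (expanded as a playlist via its RSS feed); any
    # other value raises ExtractorError, and a missing tag is treated as a
    # list page.
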
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'

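    # For example, the test URL
    # 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS'
    # yields contentpath = 'Events/DEVintersection/DEVintersection-2016' and
    # rss = '/RSS', which sends the URL down the playlist (_extract_list) path.
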
    _TESTS = [{
        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
        'info_dict': {
            'id': 'Events/TechEd/Australia/2013/KOS002',
            'title': 'Developer Kick-Off Session: Stuff We Love',
            'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
            'thumbnail': r're:http://.*\.jpg',
            'session_code': 'KOS002',
            'session_day': 'Day 1',
            'session_room': 'Arena 1A',
            'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
                                 'Mads Kristensen'],
        },
    }, {
        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
        'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
        'info_dict': {
            'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'title': 'Self-service BI with Power BI - nuclear testing',
            'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
            'thumbnail': r're:http://.*\.jpg',
            'authors': ['Mike Wilmot'],
        },
    }, {
        # low quality mp4 is best
        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
        'info_dict': {
            'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
            'title': 'Ranges for the Standard Library',
            'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
            'thumbnail': r're:http://.*\.jpg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
        'info_dict': {
            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
        },
    }, {
        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
        'only_matching': True,
    }, {
        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
        'only_matching': True,
    }]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                # File size part may be missing
        '''
        quality = qualities((
            'MP3', 'MP4',
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html)]

        self._sort_formats(formats)

        return formats

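    # Note: ``qualities`` (from ..utils) builds a ranking callable: it returns
    # the index of the given format name in the preference tuple above (or -1
    # for an unknown name), so e.g. quality('High Quality MP4') == 7 outranks
    # quality('Low Quality WMV') == 2 when the formats are sorted.
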
    def _extract_title(self, html):
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            # Fall back to the Open Graph title and strip the site suffix
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

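    # e.g. a '"length": "01:16:16"' fragment yields
    # 1 * 3600 + 16 * 60 + 16 = 4576 seconds.
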
    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
        return int(str(count).replace(',', '')) if count is not None else None

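    # e.g. _fix_count('3,154') -> 3154 (drops the thousands separator).
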
    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day').strip() if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result = contents[0]
        result['authors'] = self._extract_authors(html)

        return result

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return self.playlist_result(contents)

    def _extract_list(self, video_id, rss_url=None):
        if not rss_url:
            rss_url = self._RSS_URL % video_id
        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, video_id, title_text)

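    # Rough shape of the feed the XPath expressions above consume
    # (illustrative sketch, not a verbatim Channel 9 response):
    #   <rss><channel>
    #     <title>Playlist title</title>
    #     <item><link>https://channel9.msdn.com/...</link></item>
    #   </channel></rss>
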
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')
        rss = mobj.group('rss')

        if rss:
            return self._extract_list(content_path, url)

        webpage = self._download_webpage(
            url, content_path, 'Downloading web page')

        page_type = self._search_regex(
            r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
            webpage, 'page type', default=None, group='pagetype')

        if page_type:
            if page_type == 'Entry':  # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
        else:  # Assuming list
            return self._extract_list(content_path)
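

# Minimal usage sketch (assumes a youtube-dl install; the URL is one of the
# _TESTS above, and the 'skip_download' option avoids fetching media files):
#
#   from youtube_dl import YoutubeDL
#
#   with YoutubeDL({'skip_download': True}) as ydl:
#       info = ydl.extract_info(
#           'https://channel9.msdn.com/Events/CPP/CppCon-2015/'
#           'Ranges-for-the-Standard-Library')
#       print(info.get('title'), info.get('duration'))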