]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py
1 from __future__
import unicode_literals
5 from .common
import InfoExtractor
13 class Channel9IE(InfoExtractor
):
15 Common extractor for channel9.msdn.com.
17 The type of provided URL (video or playlist) is determined according to
18 meta Search.PageType from web page HTML rather than URL itself, as it is
19 not always possible to do.
23 _VALID_URL
= r
'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
27 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
28 'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
30 'id': 'Events/TechEd/Australia/2013/KOS002',
32 'title': 'Developer Kick-Off Session: Stuff We Love',
33 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
35 'thumbnail': 're:http://.*\.jpg',
36 'session_code': 'KOS002',
37 'session_day': 'Day 1',
38 'session_room': 'Arena 1A',
39 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
43 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
44 'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
46 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
48 'title': 'Self-service BI with Power BI - nuclear testing',
49 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
51 'thumbnail': 're:http://.*\.jpg',
52 'authors': ['Mike Wilmot'],
56 # low quality mp4 is best
57 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
59 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
61 'title': 'Ranges for the Standard Library',
62 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
64 'thumbnail': 're:http://.*\.jpg',
67 'skip_download': True,
72 _RSS_URL
= 'http://channel9.msdn.com/%s/RSS'
74 def _formats_from_html(self
, html
):
77 <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
78 <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
79 (?:<div\s+class="popup\s+rounded">\s*
80 <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
81 </div>)? # File size part may be missing
85 'Low Quality WMV', 'Low Quality MP4',
86 'Mid Quality WMV', 'Mid Quality MP4',
87 'High Quality WMV', 'High Quality MP4'))
89 'url': x
.group('url'),
90 'format_id': x
.group('quality'),
91 'format_note': x
.group('note'),
92 'format': '%s (%s)' % (x
.group('quality'), x
.group('note')),
93 'filesize_approx': parse_filesize(x
.group('filesize')),
94 'quality': quality(x
.group('quality')),
95 'vcodec': 'none' if x
.group('note') == 'Audio only' else None,
96 } for x
in list(re
.finditer(FORMAT_REGEX
, html
))]
98 self
._sort
_formats
(formats
)
102 def _extract_title(self
, html
):
103 title
= self
._html
_search
_meta
('title', html
, 'title')
105 title
= self
._og
_search
_title
(html
)
106 TITLE_SUFFIX
= ' (Channel 9)'
107 if title
is not None and title
.endswith(TITLE_SUFFIX
):
108 title
= title
[:-len(TITLE_SUFFIX
)]
111 def _extract_description(self
, html
):
112 DESCRIPTION_REGEX
= r
'''(?sx)
113 <div\s+class="entry-content">\s*
114 <div\s+id="entry-body">\s*
115 (?P<description>.+?)\s*
119 m
= re
.search(DESCRIPTION_REGEX
, html
)
121 return m
.group('description')
122 return self
._html
_search
_meta
('description', html
, 'description')
124 def _extract_duration(self
, html
):
125 m
= re
.search(r
'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html
)
126 return ((int(m
.group('hours')) * 60 * 60) + (int(m
.group('minutes')) * 60) + int(m
.group('seconds'))) if m
else None
128 def _extract_slides(self
, html
):
129 m
= re
.search(r
'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html
)
130 return m
.group('slidesurl') if m
is not None else None
132 def _extract_zip(self
, html
):
133 m
= re
.search(r
'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html
)
134 return m
.group('zipurl') if m
is not None else None
136 def _extract_avg_rating(self
, html
):
137 m
= re
.search(r
'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html
)
138 return float(m
.group('avgrating')) if m
is not None else 0
140 def _extract_rating_count(self
, html
):
141 m
= re
.search(r
'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html
)
142 return int(self
._fix
_count
(m
.group('ratingcount'))) if m
is not None else 0
144 def _extract_view_count(self
, html
):
145 m
= re
.search(r
'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html
)
146 return int(self
._fix
_count
(m
.group('viewcount'))) if m
is not None else 0
148 def _extract_comment_count(self
, html
):
149 m
= re
.search(r
'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html
)
150 return int(self
._fix
_count
(m
.group('commentcount'))) if m
is not None else 0
152 def _fix_count(self
, count
):
153 return int(str(count
).replace(',', '')) if count
is not None else None
155 def _extract_authors(self
, html
):
156 m
= re
.search(r
'(?s)<li class="author">(.*?)</li>', html
)
159 return re
.findall(r
'<a href="/Niners/[^"]+">([^<]+)</a>', m
.group(1))
161 def _extract_session_code(self
, html
):
162 m
= re
.search(r
'<li class="code">\s*(?P<code>.+?)\s*</li>', html
)
163 return m
.group('code') if m
is not None else None
165 def _extract_session_day(self
, html
):
166 m
= re
.search(r
'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html
)
167 return m
.group('day').strip() if m
is not None else None
169 def _extract_session_room(self
, html
):
170 m
= re
.search(r
'<li class="room">\s*(?P<room>.+?)\s*</li>', html
)
171 return m
.group('room') if m
is not None else None
173 def _extract_session_speakers(self
, html
):
174 return re
.findall(r
'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html
)
176 def _extract_content(self
, html
, content_path
):
177 # Look for downloadable content
178 formats
= self
._formats
_from
_html
(html
)
179 slides
= self
._extract
_slides
(html
)
180 zip_
= self
._extract
_zip
(html
)
182 # Nothing to download
183 if len(formats
) == 0 and slides
is None and zip_
is None:
184 self
._downloader
.report_warning('None of recording, slides or zip are available for %s' % content_path
)
188 title
= self
._extract
_title
(html
)
189 description
= self
._extract
_description
(html
)
190 thumbnail
= self
._og
_search
_thumbnail
(html
)
191 duration
= self
._extract
_duration
(html
)
192 avg_rating
= self
._extract
_avg
_rating
(html
)
193 rating_count
= self
._extract
_rating
_count
(html
)
194 view_count
= self
._extract
_view
_count
(html
)
195 comment_count
= self
._extract
_comment
_count
(html
)
200 'description': description
,
201 'thumbnail': thumbnail
,
202 'duration': duration
,
203 'avg_rating': avg_rating
,
204 'rating_count': rating_count
,
205 'view_count': view_count
,
206 'comment_count': comment_count
,
211 if slides
is not None:
213 d
.update({'title': title
+ '-Slides', 'url': slides
})
218 d
.update({'title': title
+ '-Zip', 'url': zip_
})
223 d
.update({'title': title
, 'formats': formats
})
228 def _extract_entry_item(self
, html
, content_path
):
229 contents
= self
._extract
_content
(html
, content_path
)
233 if len(contents
) > 1:
234 raise ExtractorError('Got more than one entry')
236 result
['authors'] = self
._extract
_authors
(html
)
240 def _extract_session(self
, html
, content_path
):
241 contents
= self
._extract
_content
(html
, content_path
)
246 'session_code': self
._extract
_session
_code
(html
),
247 'session_day': self
._extract
_session
_day
(html
),
248 'session_room': self
._extract
_session
_room
(html
),
249 'session_speakers': self
._extract
_session
_speakers
(html
),
252 for content
in contents
:
253 content
.update(session_meta
)
255 return self
.playlist_result(contents
)
def _extract_list(self, content_path):
    """Build a playlist result from the RSS feed of ``content_path``.

    Downloads ``self._RSS_URL % content_path`` as XML, wraps the text of
    every ``./channel/item/link`` element in a 'Channel9' URL result, and
    returns a playlist whose id is ``content_path`` and whose title is the
    text of ``./channel/title``.
    """
    feed_url = self._RSS_URL % content_path
    feed = self._download_xml(feed_url, content_path, 'Downloading RSS')
    entries = []
    for link in feed.findall('./channel/item/link'):
        entries.append(self.url_result(link.text, 'Channel9'))
    title_text = feed.find('./channel/title').text
    return self.playlist_result(entries, content_path, title_text)
def _real_extract(self, url):
    """Download the page at ``url`` and dispatch on its WT.entryid type.

    The content path is taken from the ``contentpath`` group of
    ``self._VALID_URL``. The page type is the part of the
    ``WT.entryid`` meta content before the first colon:

    * 'Entry'   -> ``self._extract_entry_item``
    * 'Session' -> ``self._extract_session``
    * 'Event'   -> ``self._extract_list``

    Any other type raises ExtractorError (marked as expected). When the
    meta tag is absent the page is handled as a list.
    """
    content_path = re.match(self._VALID_URL, url).group('contentpath')

    webpage = self._download_webpage(url, content_path, 'Downloading web page')

    entry_id = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
    if entry_id is None:
        # Assuming list
        return self._extract_list(content_path)

    page_type = entry_id.group('pagetype')
    if page_type == 'Entry':
        # Any 'item'-like page, may contain downloadable content
        return self._extract_entry_item(webpage, content_path)
    if page_type == 'Session':
        # Event session page, may contain downloadable content
        return self._extract_session(webpage, content_path)
    if page_type == 'Event':
        return self._extract_list(content_path)
    raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)