from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
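    # The dispatching tag looks like this in the page HTML (content is one of
    # 'List', 'Entry.Item' or 'Session'; see _real_extract below):
    #   <meta name="Search.PageType" content="Session"/>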
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
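    # NB: the list order doubles as a quality ranking: _formats_from_html()
    # below uses each format's index here as its download 'preference'.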

    def _restore_bytes(self, formatted_size):
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            # Unknown unit: treat the size as unavailable
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))
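
    # Example (computed, not taken from a live page):
    #   _restore_bytes('38.1 MB') == int(38.1 * 1024 ** 2) == 39950745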

    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?  # File size part may be missing
        '''

        # Extract known formats
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats
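
    # The markup being matched looks roughly like this (sketch reconstructed
    # from FORMAT_REGEX above, not captured from a live page):
    #   <a href="...">High Quality MP4</a>
    #   <span class="usage">(Audio and video)</span>
    #   <div class="popup rounded"><h3>File size</h3>38.1 MB</div>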

    def _extract_title(self, html):
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
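
    # Example: data-video_duration="01:16:16" yields
    # 1 * 3600 + 16 * 60 + 16 == 4576 seconds.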

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
        return int(str(count).replace(',', '')) if count is not None else None
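
    # Example: _fix_count('2,396') == 2396 (strips thousands separators).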

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result
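
    # Note: a single page can thus yield up to three entries (slides, zip and
    # the recording itself), all sharing the metadata collected in 'common'.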

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_list(self, content_path):
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)
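
    # The RSS feed is assumed to have the usual shape (sketch, not verified):
    #   <rss><channel><title>...</title>
    #     <item><link>http://channel9.msdn.com/...</link></item>
    #   </channel></rss>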

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')

        if page_type == 'List':  # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item':  # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':  # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)
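

# A minimal usage sketch (assumes a youtube-dl install; the URL is the first
# test URL above):
#
#   from youtube_dl import YoutubeDL
#   with YoutubeDL() as ydl:
#       ydl.download(['http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002'])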