# Source: Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    the WT.entryid meta tag found in the web page HTML rather than the URL
    itself, as it is not always possible to tell from the URL alone.
    '''
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    # NOTE(review): the listing this class was recovered from skipped a few
    # lines inside each test case; 'ext' is inferred from the preferred
    # (MP4) format and any further expected fields should be re-checked.
    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
    ]

    # Template of the RSS feed URL; filled in with the content path.
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # The position in this list is used as the format preference, so the
    # entries go from least to most preferred.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
        '''
        Convert a human readable size such as "12.5 MB" into a byte count.

        Returns 0 when the size is missing, does not look like
        "<number> <units>" or uses an unknown unit.
        '''
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if m is None:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            # Unknown unit: report the size as unknown rather than crash
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        '''
        Build the sorted list of format dicts for the download links found
        in html. Links whose quality label is not in _known_formats are
        skipped.
        '''
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        '''
        Return the title from the "title" meta tag, falling back to the
        Open Graph title with the " (Channel 9)" suffix removed.
        '''
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        '''
        Return the markup of the entry body, falling back to the
        "description" meta tag when the entry body is not found.
        '''
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        '''Return the duration in seconds parsed from "length": "HH:MM:SS", or None.'''
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        if m is None:
            return None
        hours = int(m.group('hours'))
        minutes = int(m.group('minutes'))
        seconds = int(m.group('seconds'))
        return hours * 60 * 60 + minutes * 60 + seconds

    def _extract_slides(self, html):
        '''Return the URL of the "Slides" link, or None when there is none.'''
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        '''Return the URL of the "Zip" link, or None when there is none.'''
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        '''Return the average rating as a float, or 0 when it is not shown.'''
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        '''Return the number of ratings, or 0 when it is not shown.'''
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        '''Return the number of views, or 0 when it is not shown.'''
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        '''Return the number of comments, or 0 when it is not shown.'''
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
        '''Convert a count such as "1,234" to an int; None is passed through.'''
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        '''
        Return the names linked from the first "author" list item, or None
        when the page has no such item.
        '''
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        '''Return the session code, or None when it is not shown.'''
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        '''Return the session day label, or None when it is not shown.'''
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        '''Return the session room, or None when it is not shown.'''
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        '''Return the list of speaker names linked from the page (may be empty).'''
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        '''
        Return a list of info dicts, one per downloadable item (slides, zip
        and/or recording), all sharing the same metadata.

        Returns None, after reporting a warning, when the page offers
        nothing to download.
        '''
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return None

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        # Metadata shared by every item extracted from this page
        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        '''
        Extract the contents of an 'Entry' page and tag each of them with
        the page authors. Returns None when there is nothing to download.
        '''
        contents = self._extract_content(html, content_path)
        if contents is None:
            return None

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        '''
        Extract the contents of a 'Session' page and add the session code,
        day, room and speakers to each of them. Returns None when there is
        nothing to download.
        '''
        contents = self._extract_content(html, content_path)
        if contents is None:
            return None

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_list(self, content_path):
        '''
        Download the RSS feed of content_path and return a playlist result
        with one URL entry per feed item, titled after the feed channel.
        '''
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        '''
        Download the page behind url and dispatch on the page type found in
        its WT.entryid meta tag.

        Raises ExtractorError when the page type is not one of 'Entry',
        'Session' or 'Event'.
        '''
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is None:  # Assuming list
            return self._extract_list(content_path)

        page_type = page_type_m.group('pagetype')
        if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        if page_type == 'Session':    # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        if page_type == 'Event':
            return self._extract_list(content_path)
        raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)