]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py
ae70ea22967a8d880ba15fa1bb64f32904139094
import re

from .common import InfoExtractor
from ..utils import ExtractorError
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = u'Channel 9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen'],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [u'Mike Wilmot'],
            },
        },
    ]

    # RSS feed that lists the items of a 'List' page
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Quality labels we know how to rank, worst first; used for filtering
    # and sorting in _formats_from_html()
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
55 def _restore_bytes(self
, formatted_size
):
56 if not formatted_size
:
58 m
= re
.match(r
'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size
)
61 units
= m
.group('units')
63 exponent
= [u
'B', u
'KB', u
'MB', u
'GB', u
'TB', u
'PB', u
'EB', u
'ZB', u
'YB'].index(units
.upper())
66 size
= float(m
.group('size'))
67 return int(size
* (1024 ** exponent
))
69 def _formats_from_html(self
, html
):
72 <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
73 <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
74 (?:<div\s+class="popup\s+rounded">\s*
75 <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
76 </div>)? # File size part may be missing
78 # Extract known formats
79 formats
= [{'url': x
.group('url'),
80 'format_id': x
.group('quality'),
81 'format_note': x
.group('note'),
82 'format': '%s (%s)' % (x
.group('quality'), x
.group('note')),
83 'filesize': self
._restore
_bytes
(x
.group('filesize')), # File size is approximate
84 } for x
in list(re
.finditer(FORMAT_REGEX
, html
)) if x
.group('quality') in self
._known
_formats
]
85 # Sort according to known formats list
86 formats
.sort(key
=lambda fmt
: self
._known
_formats
.index(fmt
['format_id']))
89 def _extract_title(self
, html
):
90 title
= self
._html
_search
_meta
(u
'title', html
, u
'title')
92 title
= self
._og
_search
_title
(html
)
93 TITLE_SUFFIX
= u
' (Channel 9)'
94 if title
is not None and title
.endswith(TITLE_SUFFIX
):
95 title
= title
[:-len(TITLE_SUFFIX
)]
98 def _extract_description(self
, html
):
99 DESCRIPTION_REGEX
= r
'''(?sx)
100 <div\s+class="entry-content">\s*
101 <div\s+id="entry-body">\s*
102 (?P<description>.+?)\s*
106 m
= re
.search(DESCRIPTION_REGEX
, html
)
108 return m
.group('description')
109 return self
._html
_search
_meta
(u
'description', html
, u
'description')
111 def _extract_duration(self
, html
):
112 m
= re
.search(r
'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html
)
113 return ((int(m
.group('hours')) * 60 * 60) + (int(m
.group('minutes')) * 60) + int(m
.group('seconds'))) if m
else None
115 def _extract_slides(self
, html
):
116 m
= re
.search(r
'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html
)
117 return m
.group('slidesurl') if m
is not None else None
119 def _extract_zip(self
, html
):
120 m
= re
.search(r
'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html
)
121 return m
.group('zipurl') if m
is not None else None
123 def _extract_avg_rating(self
, html
):
124 m
= re
.search(r
'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html
)
125 return float(m
.group('avgrating')) if m
is not None else 0
127 def _extract_rating_count(self
, html
):
128 m
= re
.search(r
'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html
)
129 return int(self
._fix
_count
(m
.group('ratingcount'))) if m
is not None else 0
131 def _extract_view_count(self
, html
):
132 m
= re
.search(r
'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html
)
133 return int(self
._fix
_count
(m
.group('viewcount'))) if m
is not None else 0
135 def _extract_comment_count(self
, html
):
136 m
= re
.search(r
'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html
)
137 return int(self
._fix
_count
(m
.group('commentcount'))) if m
is not None else 0
139 def _fix_count(self
, count
):
140 return int(str(count
).replace(',', '')) if count
is not None else None
142 def _extract_authors(self
, html
):
143 m
= re
.search(r
'(?s)<li class="author">(.*?)</li>', html
)
146 return re
.findall(r
'<a href="/Niners/[^"]+">([^<]+)</a>', m
.group(1))
148 def _extract_session_code(self
, html
):
149 m
= re
.search(r
'<li class="code">\s*(?P<code>.+?)\s*</li>', html
)
150 return m
.group('code') if m
is not None else None
152 def _extract_session_day(self
, html
):
153 m
= re
.search(r
'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html
)
154 return m
.group('day') if m
is not None else None
156 def _extract_session_room(self
, html
):
157 m
= re
.search(r
'<li class="room">\s*(?P<room>.+?)\s*</li>', html
)
158 return m
.group('room') if m
is not None else None
160 def _extract_session_speakers(self
, html
):
161 return re
.findall(r
'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html
)
163 def _extract_content(self
, html
, content_path
):
164 # Look for downloadable content
165 formats
= self
._formats
_from
_html
(html
)
166 slides
= self
._extract
_slides
(html
)
167 zip_
= self
._extract
_zip
(html
)
169 # Nothing to download
170 if len(formats
) == 0 and slides
is None and zip_
is None:
171 self
._downloader
.report_warning(u
'None of recording, slides or zip are available for %s' % content_path
)
175 title
= self
._extract
_title
(html
)
176 description
= self
._extract
_description
(html
)
177 thumbnail
= self
._og
_search
_thumbnail
(html
)
178 duration
= self
._extract
_duration
(html
)
179 avg_rating
= self
._extract
_avg
_rating
(html
)
180 rating_count
= self
._extract
_rating
_count
(html
)
181 view_count
= self
._extract
_view
_count
(html
)
182 comment_count
= self
._extract
_comment
_count
(html
)
184 common
= {'_type': 'video',
186 'description': description
,
187 'thumbnail': thumbnail
,
188 'duration': duration
,
189 'avg_rating': avg_rating
,
190 'rating_count': rating_count
,
191 'view_count': view_count
,
192 'comment_count': comment_count
,
197 if slides
is not None:
199 d
.update({ 'title': title
+ '-Slides', 'url': slides
})
204 d
.update({ 'title': title
+ '-Zip', 'url': zip_
})
209 d
.update({ 'title': title
, 'formats': formats
})
214 def _extract_entry_item(self
, html
, content_path
):
215 contents
= self
._extract
_content
(html
, content_path
)
219 authors
= self
._extract
_authors
(html
)
221 for content
in contents
:
222 content
['authors'] = authors
226 def _extract_session(self
, html
, content_path
):
227 contents
= self
._extract
_content
(html
, content_path
)
231 session_meta
= {'session_code': self
._extract
_session
_code
(html
),
232 'session_day': self
._extract
_session
_day
(html
),
233 'session_room': self
._extract
_session
_room
(html
),
234 'session_speakers': self
._extract
_session
_speakers
(html
),
237 for content
in contents
:
238 content
.update(session_meta
)
242 def _extract_list(self
, content_path
):
243 rss
= self
._download
_xml
(self
._RSS
_URL
% content_path
, content_path
, u
'Downloading RSS')
244 entries
= [self
.url_result(session_url
.text
, 'Channel9')
245 for session_url
in rss
.findall('./channel/item/link')]
246 title_text
= rss
.find('./channel/title').text
247 return self
.playlist_result(entries
, content_path
, title_text
)
249 def _real_extract(self
, url
):
250 mobj
= re
.match(self
._VALID
_URL
, url
)
251 content_path
= mobj
.group('contentpath')
253 webpage
= self
._download
_webpage
(url
, content_path
, u
'Downloading web page')
255 page_type_m
= re
.search(r
'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage
)
256 if page_type_m
is None:
257 raise ExtractorError(u
'Search.PageType not found, don\'t know how to process this page', expected
=True)
259 page_type
= page_type_m
.group('pagetype')
260 if page_type
== 'List': # List page, may contain list of 'item'-like objects
261 return self
._extract
_list
(content_path
)
262 elif page_type
== 'Entry.Item': # Any 'item'-like page, may contain downloadable content
263 return self
._extract
_entry
_item
(webpage
, content_path
)
264 elif page_type
== 'Session': # Event session page, may contain downloadable content
265 return self
._extract
_session
(webpage
, content_path
)
267 raise ExtractorError(u
'Unexpected Search.PageType %s' % page_type
, expected
=True)