# coding: utf-8
# youtube_dl/extractor/channel9.py
# Recovered from a web code-browser view ("Raphaël G. Git Repositories - youtube-dl",
# blob at commit 574881b70de67b9521b5e813f0cafa6da59d1068).
import re

from .common import InfoExtractor
from ..utils import ExtractorError
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = u'Channel 9'
    IE_NAME = u'channel9'  # NOTE(review): reconstructed — this line was lost in extraction
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                # NOTE(review): one field (original line 28, likely u'duration') was
                # lost in extraction — TODO confirm against upstream history.
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen'],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                # NOTE(review): one field (original line 43, likely u'duration') was
                # lost in extraction — TODO confirm against upstream history.
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [u'Mike Wilmot'],
            },
        },
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Format names as they appear on the download links; index order is used
    # as the 'preference' value in _formats_from_html.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
55 def _restore_bytes(self
, formatted_size
):
56 if not formatted_size
:
58 m
= re
.match(r
'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size
)
61 units
= m
.group('units')
63 exponent
= [u
'B', u
'KB', u
'MB', u
'GB', u
'TB', u
'PB', u
'EB', u
'ZB', u
'YB'].index(units
.upper())
66 size
= float(m
.group('size'))
67 return int(size
* (1024 ** exponent
))
69 def _formats_from_html(self
, html
):
72 <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
73 <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
74 (?:<div\s+class="popup\s+rounded">\s*
75 <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
76 </div>)? # File size part may be missing
78 # Extract known formats
80 'url': x
.group('url'),
81 'format_id': x
.group('quality'),
82 'format_note': x
.group('note'),
83 'format': u
'%s (%s)' % (x
.group('quality'), x
.group('note')),
84 'filesize': self
._restore
_bytes
(x
.group('filesize')), # File size is approximate
85 'preference': self
._known
_formats
.index(x
.group('quality')),
86 'vcodec': 'none' if x
.group('note') == 'Audio only' else None,
87 } for x
in list(re
.finditer(FORMAT_REGEX
, html
)) if x
.group('quality') in self
._known
_formats
]
89 self
._sort
_formats
(formats
)
93 def _extract_title(self
, html
):
94 title
= self
._html
_search
_meta
(u
'title', html
, u
'title')
96 title
= self
._og
_search
_title
(html
)
97 TITLE_SUFFIX
= u
' (Channel 9)'
98 if title
is not None and title
.endswith(TITLE_SUFFIX
):
99 title
= title
[:-len(TITLE_SUFFIX
)]
102 def _extract_description(self
, html
):
103 DESCRIPTION_REGEX
= r
'''(?sx)
104 <div\s+class="entry-content">\s*
105 <div\s+id="entry-body">\s*
106 (?P<description>.+?)\s*
110 m
= re
.search(DESCRIPTION_REGEX
, html
)
112 return m
.group('description')
113 return self
._html
_search
_meta
(u
'description', html
, u
'description')
115 def _extract_duration(self
, html
):
116 m
= re
.search(r
'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html
)
117 return ((int(m
.group('hours')) * 60 * 60) + (int(m
.group('minutes')) * 60) + int(m
.group('seconds'))) if m
else None
119 def _extract_slides(self
, html
):
120 m
= re
.search(r
'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html
)
121 return m
.group('slidesurl') if m
is not None else None
123 def _extract_zip(self
, html
):
124 m
= re
.search(r
'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html
)
125 return m
.group('zipurl') if m
is not None else None
127 def _extract_avg_rating(self
, html
):
128 m
= re
.search(r
'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html
)
129 return float(m
.group('avgrating')) if m
is not None else 0
131 def _extract_rating_count(self
, html
):
132 m
= re
.search(r
'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html
)
133 return int(self
._fix
_count
(m
.group('ratingcount'))) if m
is not None else 0
135 def _extract_view_count(self
, html
):
136 m
= re
.search(r
'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html
)
137 return int(self
._fix
_count
(m
.group('viewcount'))) if m
is not None else 0
139 def _extract_comment_count(self
, html
):
140 m
= re
.search(r
'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html
)
141 return int(self
._fix
_count
(m
.group('commentcount'))) if m
is not None else 0
143 def _fix_count(self
, count
):
144 return int(str(count
).replace(',', '')) if count
is not None else None
146 def _extract_authors(self
, html
):
147 m
= re
.search(r
'(?s)<li class="author">(.*?)</li>', html
)
150 return re
.findall(r
'<a href="/Niners/[^"]+">([^<]+)</a>', m
.group(1))
152 def _extract_session_code(self
, html
):
153 m
= re
.search(r
'<li class="code">\s*(?P<code>.+?)\s*</li>', html
)
154 return m
.group('code') if m
is not None else None
156 def _extract_session_day(self
, html
):
157 m
= re
.search(r
'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html
)
158 return m
.group('day') if m
is not None else None
160 def _extract_session_room(self
, html
):
161 m
= re
.search(r
'<li class="room">\s*(?P<room>.+?)\s*</li>', html
)
162 return m
.group('room') if m
is not None else None
164 def _extract_session_speakers(self
, html
):
165 return re
.findall(r
'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html
)
167 def _extract_content(self
, html
, content_path
):
168 # Look for downloadable content
169 formats
= self
._formats
_from
_html
(html
)
170 slides
= self
._extract
_slides
(html
)
171 zip_
= self
._extract
_zip
(html
)
173 # Nothing to download
174 if len(formats
) == 0 and slides
is None and zip_
is None:
175 self
._downloader
.report_warning(u
'None of recording, slides or zip are available for %s' % content_path
)
179 title
= self
._extract
_title
(html
)
180 description
= self
._extract
_description
(html
)
181 thumbnail
= self
._og
_search
_thumbnail
(html
)
182 duration
= self
._extract
_duration
(html
)
183 avg_rating
= self
._extract
_avg
_rating
(html
)
184 rating_count
= self
._extract
_rating
_count
(html
)
185 view_count
= self
._extract
_view
_count
(html
)
186 comment_count
= self
._extract
_comment
_count
(html
)
188 common
= {'_type': 'video',
190 'description': description
,
191 'thumbnail': thumbnail
,
192 'duration': duration
,
193 'avg_rating': avg_rating
,
194 'rating_count': rating_count
,
195 'view_count': view_count
,
196 'comment_count': comment_count
,
201 if slides
is not None:
203 d
.update({ 'title': title
+ '-Slides', 'url': slides
})
208 d
.update({ 'title': title
+ '-Zip', 'url': zip_
})
213 d
.update({ 'title': title
, 'formats': formats
})
218 def _extract_entry_item(self
, html
, content_path
):
219 contents
= self
._extract
_content
(html
, content_path
)
223 authors
= self
._extract
_authors
(html
)
225 for content
in contents
:
226 content
['authors'] = authors
230 def _extract_session(self
, html
, content_path
):
231 contents
= self
._extract
_content
(html
, content_path
)
235 session_meta
= {'session_code': self
._extract
_session
_code
(html
),
236 'session_day': self
._extract
_session
_day
(html
),
237 'session_room': self
._extract
_session
_room
(html
),
238 'session_speakers': self
._extract
_session
_speakers
(html
),
241 for content
in contents
:
242 content
.update(session_meta
)
246 def _extract_list(self
, content_path
):
247 rss
= self
._download
_xml
(self
._RSS
_URL
% content_path
, content_path
, u
'Downloading RSS')
248 entries
= [self
.url_result(session_url
.text
, 'Channel9')
249 for session_url
in rss
.findall('./channel/item/link')]
250 title_text
= rss
.find('./channel/title').text
251 return self
.playlist_result(entries
, content_path
, title_text
)
253 def _real_extract(self
, url
):
254 mobj
= re
.match(self
._VALID
_URL
, url
)
255 content_path
= mobj
.group('contentpath')
257 webpage
= self
._download
_webpage
(url
, content_path
, u
'Downloading web page')
259 page_type_m
= re
.search(r
'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage
)
260 if page_type_m
is None:
261 raise ExtractorError(u
'Search.PageType not found, don\'t know how to process this page', expected
=True)
263 page_type
= page_type_m
.group('pagetype')
264 if page_type
== 'List': # List page, may contain list of 'item'-like objects
265 return self
._extract
_list
(content_path
)
266 elif page_type
== 'Entry.Item': # Any 'item'-like page, may contain downloadable content
267 return self
._extract
_entry
_item
(webpage
, content_path
)
268 elif page_type
== 'Session': # Event session page, may contain downloadable content
269 return self
._extract
_session
(webpage
, content_path
)
271 raise ExtractorError(u
'Unexpected Search.PageType %s' % page_type
, expected
=True)