]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    parse_filesize,
    qualities,
)
  13 class Channel9IE(InfoExtractor
): 
  15     Common extractor for channel9.msdn.com. 
  17     The type of provided URL (video or playlist) is determined according to 
  18     meta Search.PageType from web page HTML rather than URL itself, as it is 
  19     not always possible to do. 
  23     _VALID_URL 
= r
'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' 
  26         'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', 
  27         'md5': 'bbd75296ba47916b754e73c3a4bbdf10', 
  29             'id': 'Events/TechEd/Australia/2013/KOS002', 
  31             'title': 'Developer Kick-Off Session: Stuff We Love', 
  32             'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 
  34             'thumbnail': 're:http://.*\.jpg', 
  35             'session_code': 'KOS002', 
  36             'session_day': 'Day 1', 
  37             'session_room': 'Arena 1A', 
  38             'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 
  42         'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', 
  43         'md5': 'b43ee4529d111bc37ba7ee4f34813e68', 
  45             'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', 
  47             'title': 'Self-service BI with Power BI - nuclear testing', 
  48             'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 
  50             'thumbnail': 're:http://.*\.jpg', 
  51             'authors': ['Mike Wilmot'], 
  54         # low quality mp4 is best 
  55         'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', 
  57             'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', 
  59             'title': 'Ranges for the Standard Library', 
  60             'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', 
  62             'thumbnail': 're:http://.*\.jpg', 
  65             'skip_download': True, 
  68         'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', 
  70             'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b', 
  75         'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', 
  76         'only_matching': True, 
  78         'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', 
  79         'only_matching': True, 
  82     _RSS_URL 
= 'http://channel9.msdn.com/%s/RSS' 
  84     def _formats_from_html(self
, html
): 
  87             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s* 
  88             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s* 
  89             (?:<div\s+class="popup\s+rounded">\s* 
  90             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s* 
  91             </div>)?                                                # File size part may be missing 
  95             'Low Quality WMV', 'Low Quality MP4', 
  96             'Mid Quality WMV', 'Mid Quality MP4', 
  97             'High Quality WMV', 'High Quality MP4')) 
  99             'url': x
.group('url'), 
 100             'format_id': x
.group('quality'), 
 101             'format_note': x
.group('note'), 
 102             'format': '%s (%s)' % (x
.group('quality'), x
.group('note')), 
 103             'filesize_approx': parse_filesize(x
.group('filesize')), 
 104             'quality': quality(x
.group('quality')), 
 105             'vcodec': 'none' if x
.group('note') == 'Audio only' else None, 
 106         } for x 
in list(re
.finditer(FORMAT_REGEX
, html
))] 
 108         self
._sort
_formats
(formats
) 
 112     def _extract_title(self
, html
): 
 113         title 
= self
._html
_search
_meta
('title', html
, 'title') 
 115             title 
= self
._og
_search
_title
(html
) 
 116             TITLE_SUFFIX 
= ' (Channel 9)' 
 117             if title 
is not None and title
.endswith(TITLE_SUFFIX
): 
 118                 title 
= title
[:-len(TITLE_SUFFIX
)] 
 121     def _extract_description(self
, html
): 
 122         DESCRIPTION_REGEX 
= r
'''(?sx) 
 123             <div\s+class="entry-content">\s* 
 124             <div\s+id="entry-body">\s* 
 125             (?P<description>.+?)\s* 
 129         m 
= re
.search(DESCRIPTION_REGEX
, html
) 
 131             return m
.group('description') 
 132         return self
._html
_search
_meta
('description', html
, 'description') 
 134     def _extract_duration(self
, html
): 
 135         m 
= re
.search(r
'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html
) 
 136         return ((int(m
.group('hours')) * 60 * 60) + (int(m
.group('minutes')) * 60) + int(m
.group('seconds'))) if m 
else None 
 138     def _extract_slides(self
, html
): 
 139         m 
= re
.search(r
'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html
) 
 140         return m
.group('slidesurl') if m 
is not None else None 
 142     def _extract_zip(self
, html
): 
 143         m 
= re
.search(r
'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html
) 
 144         return m
.group('zipurl') if m 
is not None else None 
 146     def _extract_avg_rating(self
, html
): 
 147         m 
= re
.search(r
'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html
) 
 148         return float(m
.group('avgrating')) if m 
is not None else 0 
 150     def _extract_rating_count(self
, html
): 
 151         m 
= re
.search(r
'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html
) 
 152         return int(self
._fix
_count
(m
.group('ratingcount'))) if m 
is not None else 0 
 154     def _extract_view_count(self
, html
): 
 155         m 
= re
.search(r
'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html
) 
 156         return int(self
._fix
_count
(m
.group('viewcount'))) if m 
is not None else 0 
 158     def _extract_comment_count(self
, html
): 
 159         m 
= re
.search(r
'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html
) 
 160         return int(self
._fix
_count
(m
.group('commentcount'))) if m 
is not None else 0 
 162     def _fix_count(self
, count
): 
 163         return int(str(count
).replace(',', '')) if count 
is not None else None 
 165     def _extract_authors(self
, html
): 
 166         m 
= re
.search(r
'(?s)<li class="author">(.*?)</li>', html
) 
 169         return re
.findall(r
'<a href="/Niners/[^"]+">([^<]+)</a>', m
.group(1)) 
 171     def _extract_session_code(self
, html
): 
 172         m 
= re
.search(r
'<li class="code">\s*(?P<code>.+?)\s*</li>', html
) 
 173         return m
.group('code') if m 
is not None else None 
 175     def _extract_session_day(self
, html
): 
 176         m 
= re
.search(r
'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html
) 
 177         return m
.group('day').strip() if m 
is not None else None 
 179     def _extract_session_room(self
, html
): 
 180         m 
= re
.search(r
'<li class="room">\s*(?P<room>.+?)\s*</li>', html
) 
 181         return m
.group('room') if m 
is not None else None 
 183     def _extract_session_speakers(self
, html
): 
 184         return re
.findall(r
'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html
) 
 186     def _extract_content(self
, html
, content_path
): 
 187         # Look for downloadable content 
 188         formats 
= self
._formats
_from
_html
(html
) 
 189         slides 
= self
._extract
_slides
(html
) 
 190         zip_ 
= self
._extract
_zip
(html
) 
 192         # Nothing to download 
 193         if len(formats
) == 0 and slides 
is None and zip_ 
is None: 
 194             self
._downloader
.report_warning('None of recording, slides or zip are available for %s' % content_path
) 
 198         title 
= self
._extract
_title
(html
) 
 199         description 
= self
._extract
_description
(html
) 
 200         thumbnail 
= self
._og
_search
_thumbnail
(html
) 
 201         duration 
= self
._extract
_duration
(html
) 
 202         avg_rating 
= self
._extract
_avg
_rating
(html
) 
 203         rating_count 
= self
._extract
_rating
_count
(html
) 
 204         view_count 
= self
._extract
_view
_count
(html
) 
 205         comment_count 
= self
._extract
_comment
_count
(html
) 
 210             'description': description
, 
 211             'thumbnail': thumbnail
, 
 212             'duration': duration
, 
 213             'avg_rating': avg_rating
, 
 214             'rating_count': rating_count
, 
 215             'view_count': view_count
, 
 216             'comment_count': comment_count
, 
 221         if slides 
is not None: 
 223             d
.update({'title': title 
+ '-Slides', 'url': slides
}) 
 228             d
.update({'title': title 
+ '-Zip', 'url': zip_
}) 
 233             d
.update({'title': title
, 'formats': formats
}) 
 238     def _extract_entry_item(self
, html
, content_path
): 
 239         contents 
= self
._extract
_content
(html
, content_path
) 
 243         if len(contents
) > 1: 
 244             raise ExtractorError('Got more than one entry') 
 246         result
['authors'] = self
._extract
_authors
(html
) 
 250     def _extract_session(self
, html
, content_path
): 
 251         contents 
= self
._extract
_content
(html
, content_path
) 
 256             'session_code': self
._extract
_session
_code
(html
), 
 257             'session_day': self
._extract
_session
_day
(html
), 
 258             'session_room': self
._extract
_session
_room
(html
), 
 259             'session_speakers': self
._extract
_session
_speakers
(html
), 
 262         for content 
in contents
: 
 263             content
.update(session_meta
) 
 265         return self
.playlist_result(contents
) 
 267     def _extract_list(self
, video_id
, rss_url
=None): 
 269             rss_url 
= self
._RSS
_URL 
% video_id
 
 270         rss 
= self
._download
_xml
(rss_url
, video_id
, 'Downloading RSS') 
 271         entries 
= [self
.url_result(session_url
.text
, 'Channel9') 
 272                    for session_url 
in rss
.findall('./channel/item/link')] 
 273         title_text 
= rss
.find('./channel/title').text
 
 274         return self
.playlist_result(entries
, video_id
, title_text
) 
 276     def _real_extract(self
, url
): 
 277         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 278         content_path 
= mobj
.group('contentpath') 
 279         rss 
= mobj
.group('rss') 
 282             return self
._extract
_list
(content_path
, url
) 
 284         webpage 
= self
._download
_webpage
( 
 285             url
, content_path
, 'Downloading web page') 
 287         page_type 
= self
._search
_regex
( 
 288             r
'<meta[^>]+name=(["\'])WT\
.entryid\
1[^
>]+content
=(["\'])(?P<pagetype>[^:]+).+?\2', 
 289             webpage, 'page type', default=None, group='pagetype') 
 291             if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content 
 292                 return self._extract_entry_item(webpage, content_path) 
 293             elif page_type == 'Session':  # Event session page, may contain downloadable content 
 294                 return self._extract_session(webpage, content_path) 
 295             elif page_type == 'Event': 
 296                 return self._extract_list(content_path) 
 298                 raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True) 
 299         else:  # Assuming list 
 300             return self._extract_list(content_path)