]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
# NOTE(review): these utils names are used by the visible code below
# (parse_filesize, qualities, ExtractorError); the scrape dropped the
# original import lines — confirm the exact list against upstream.
from ..utils import (
    ExtractorError,
    parse_filesize,
    qualities,
)
# NOTE(review): The span below is a garbled web scrape of the class header,
# class docstring, the _VALID_URL attribute and the _TESTS fixture list.
# Interior source lines are missing (the embedded original line numbers skip:
# 14, 16, 20-22, 24-26, 29, 31, 34, 40-42, 45, 47, 50, 53-55, 58, 60, 63,
# 65-71), so this fragment is not valid Python as-is. Recover the full text
# from upstream youtube-dl history before editing; do not hand-patch it.
  13 class Channel9IE(InfoExtractor
): 
  15     Common extractor for channel9.msdn.com. 
  17     The type of provided URL (video or playlist) is determined according to 
  18     meta Search.PageType from web page HTML rather than URL itself, as it is 
  19     not always possible to do. 
  23     _VALID_URL 
= r
'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?' 
# NOTE(review): fixture 1 — TechEd Australia 2013 session; surrounding
# braces and the 'info_dict'/'ext' keys are missing from the scrape.
  27             'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', 
  28             'md5': 'bbd75296ba47916b754e73c3a4bbdf10', 
  30                 'id': 'Events/TechEd/Australia/2013/KOS002', 
  32                 'title': 'Developer Kick-Off Session: Stuff We Love', 
  33                 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 
  35                 'thumbnail': 're:http://.*\.jpg', 
  36                 'session_code': 'KOS002', 
  37                 'session_day': 'Day 1', 
  38                 'session_room': 'Arena 1A', 
  39                 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'], 
# NOTE(review): fixture 2 — a 'posts/...' entry page with an authors field.
  43             'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', 
  44             'md5': 'b43ee4529d111bc37ba7ee4f34813e68', 
  46                 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', 
  48                 'title': 'Self-service BI with Power BI - nuclear testing', 
  49                 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 
  51                 'thumbnail': 're:http://.*\.jpg', 
  52                 'authors': ['Mike Wilmot'], 
# NOTE(review): fixture 3 — CppCon 2015 session with skip_download params.
  56             # low quality mp4 is best 
  57             'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', 
  59                 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', 
  61                 'title': 'Ranges for the Standard Library', 
  62                 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', 
  64                 'thumbnail': 're:http://.*\.jpg', 
  67                 'skip_download': True, 
  72     _RSS_URL 
= 'http://channel9.msdn.com/%s/RSS' 
  74     def _formats_from_html(self
, html
): 
  77             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s* 
  78             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s* 
  79             (?:<div\s+class="popup\s+rounded">\s* 
  80             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s* 
  81             </div>)?                                                # File size part may be missing 
  85             'Low Quality WMV', 'Low Quality MP4', 
  86             'Mid Quality WMV', 'Mid Quality MP4', 
  87             'High Quality WMV', 'High Quality MP4')) 
  89             'url': x
.group('url'), 
  90             'format_id': x
.group('quality'), 
  91             'format_note': x
.group('note'), 
  92             'format': '%s (%s)' % (x
.group('quality'), x
.group('note')), 
  93             'filesize_approx': parse_filesize(x
.group('filesize')), 
  94             'quality': quality(x
.group('quality')), 
  95             'vcodec': 'none' if x
.group('note') == 'Audio only' else None, 
  96         } for x 
in list(re
.finditer(FORMAT_REGEX
, html
))] 
  98         self
._sort
_formats
(formats
) 
 102     def _extract_title(self
, html
): 
 103         title 
= self
._html
_search
_meta
('title', html
, 'title') 
 105             title 
= self
._og
_search
_title
(html
) 
 106             TITLE_SUFFIX 
= ' (Channel 9)' 
 107             if title 
is not None and title
.endswith(TITLE_SUFFIX
): 
 108                 title 
= title
[:-len(TITLE_SUFFIX
)] 
 111     def _extract_description(self
, html
): 
 112         DESCRIPTION_REGEX 
= r
'''(?sx) 
 113             <div\s+class="entry-content">\s* 
 114             <div\s+id="entry-body">\s* 
 115             (?P<description>.+?)\s* 
 119         m 
= re
.search(DESCRIPTION_REGEX
, html
) 
 121             return m
.group('description') 
 122         return self
._html
_search
_meta
('description', html
, 'description') 
 124     def _extract_duration(self
, html
): 
 125         m 
= re
.search(r
'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html
) 
 126         return ((int(m
.group('hours')) * 60 * 60) + (int(m
.group('minutes')) * 60) + int(m
.group('seconds'))) if m 
else None 
 128     def _extract_slides(self
, html
): 
 129         m 
= re
.search(r
'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html
) 
 130         return m
.group('slidesurl') if m 
is not None else None 
 132     def _extract_zip(self
, html
): 
 133         m 
= re
.search(r
'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html
) 
 134         return m
.group('zipurl') if m 
is not None else None 
 136     def _extract_avg_rating(self
, html
): 
 137         m 
= re
.search(r
'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html
) 
 138         return float(m
.group('avgrating')) if m 
is not None else 0 
 140     def _extract_rating_count(self
, html
): 
 141         m 
= re
.search(r
'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html
) 
 142         return int(self
._fix
_count
(m
.group('ratingcount'))) if m 
is not None else 0 
 144     def _extract_view_count(self
, html
): 
 145         m 
= re
.search(r
'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html
) 
 146         return int(self
._fix
_count
(m
.group('viewcount'))) if m 
is not None else 0 
 148     def _extract_comment_count(self
, html
): 
 149         m 
= re
.search(r
'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html
) 
 150         return int(self
._fix
_count
(m
.group('commentcount'))) if m 
is not None else 0 
 152     def _fix_count(self
, count
): 
 153         return int(str(count
).replace(',', '')) if count 
is not None else None 
 155     def _extract_authors(self
, html
): 
 156         m 
= re
.search(r
'(?s)<li class="author">(.*?)</li>', html
) 
 159         return re
.findall(r
'<a href="/Niners/[^"]+">([^<]+)</a>', m
.group(1)) 
 161     def _extract_session_code(self
, html
): 
 162         m 
= re
.search(r
'<li class="code">\s*(?P<code>.+?)\s*</li>', html
) 
 163         return m
.group('code') if m 
is not None else None 
 165     def _extract_session_day(self
, html
): 
 166         m 
= re
.search(r
'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html
) 
 167         return m
.group('day').strip() if m 
is not None else None 
 169     def _extract_session_room(self
, html
): 
 170         m 
= re
.search(r
'<li class="room">\s*(?P<room>.+?)\s*</li>', html
) 
 171         return m
.group('room') if m 
is not None else None 
 173     def _extract_session_speakers(self
, html
): 
 174         return re
.findall(r
'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html
) 
 176     def _extract_content(self
, html
, content_path
): 
 177         # Look for downloadable content 
 178         formats 
= self
._formats
_from
_html
(html
) 
 179         slides 
= self
._extract
_slides
(html
) 
 180         zip_ 
= self
._extract
_zip
(html
) 
 182         # Nothing to download 
 183         if len(formats
) == 0 and slides 
is None and zip_ 
is None: 
 184             self
._downloader
.report_warning('None of recording, slides or zip are available for %s' % content_path
) 
 188         title 
= self
._extract
_title
(html
) 
 189         description 
= self
._extract
_description
(html
) 
 190         thumbnail 
= self
._og
_search
_thumbnail
(html
) 
 191         duration 
= self
._extract
_duration
(html
) 
 192         avg_rating 
= self
._extract
_avg
_rating
(html
) 
 193         rating_count 
= self
._extract
_rating
_count
(html
) 
 194         view_count 
= self
._extract
_view
_count
(html
) 
 195         comment_count 
= self
._extract
_comment
_count
(html
) 
 200             'description': description
, 
 201             'thumbnail': thumbnail
, 
 202             'duration': duration
, 
 203             'avg_rating': avg_rating
, 
 204             'rating_count': rating_count
, 
 205             'view_count': view_count
, 
 206             'comment_count': comment_count
, 
 211         if slides 
is not None: 
 213             d
.update({'title': title 
+ '-Slides', 'url': slides
}) 
 218             d
.update({'title': title 
+ '-Zip', 'url': zip_
}) 
 223             d
.update({'title': title
, 'formats': formats
}) 
 228     def _extract_entry_item(self
, html
, content_path
): 
 229         contents 
= self
._extract
_content
(html
, content_path
) 
 233         if len(contents
) > 1: 
 234             raise ExtractorError('Got more than one entry') 
 236         result
['authors'] = self
._extract
_authors
(html
) 
 240     def _extract_session(self
, html
, content_path
): 
 241         contents 
= self
._extract
_content
(html
, content_path
) 
 246             'session_code': self
._extract
_session
_code
(html
), 
 247             'session_day': self
._extract
_session
_day
(html
), 
 248             'session_room': self
._extract
_session
_room
(html
), 
 249             'session_speakers': self
._extract
_session
_speakers
(html
), 
 252         for content 
in contents
: 
 253             content
.update(session_meta
) 
 255         return self
.playlist_result(contents
) 
 257     def _extract_list(self
, content_path
): 
 258         rss 
= self
._download
_xml
(self
._RSS
_URL 
% content_path
, content_path
, 'Downloading RSS') 
 259         entries 
= [self
.url_result(session_url
.text
, 'Channel9') 
 260                    for session_url 
in rss
.findall('./channel/item/link')] 
 261         title_text 
= rss
.find('./channel/title').text
 
 262         return self
.playlist_result(entries
, content_path
, title_text
) 
 264     def _real_extract(self
, url
): 
 265         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 266         content_path 
= mobj
.group('contentpath') 
 268         webpage 
= self
._download
_webpage
(url
, content_path
, 'Downloading web page') 
 270         page_type_m 
= re
.search(r
'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage
) 
 271         if page_type_m 
is not None: 
 272             page_type 
= page_type_m
.group('pagetype') 
 273             if page_type 
== 'Entry':      # Any 'item'-like page, may contain downloadable content 
 274                 return self
._extract
_entry
_item
(webpage
, content_path
) 
 275             elif page_type 
== 'Session':  # Event session page, may contain downloadable content 
 276                 return self
._extract
_session
(webpage
, content_path
) 
 277             elif page_type 
== 'Event': 
 278                 return self
._extract
_list
(content_path
) 
 280                 raise ExtractorError('Unexpected WT.entryid %s' % page_type
, expected
=True) 
 282         else:  # Assuming list 
 283             return self
._extract
_list
(content_path
)