]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError
   8 class Channel9IE(InfoExtractor
): 
  10     Common extractor for channel9.msdn.com. 
  12     The type of provided URL (video or playlist) is determined according to 
  13     meta Search.PageType from web page HTML rather than URL itself, as it is 
  14     not always possible to do. 
  18     _VALID_URL 
= r
'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?' 
  22             'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', 
  23             'md5': 'bbd75296ba47916b754e73c3a4bbdf10', 
  25                 'id': 'Events/TechEd/Australia/2013/KOS002', 
  27                 'title': 'Developer Kick-Off Session: Stuff We Love', 
  28                 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 
  30                 'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', 
  31                 'session_code': 'KOS002', 
  32                 'session_day': 'Day 1', 
  33                 'session_room': 'Arena 1A', 
  34                 'session_speakers': [ 'Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen' ], 
  38             'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', 
  39             'md5': 'b43ee4529d111bc37ba7ee4f34813e68', 
  41                 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', 
  43                 'title': 'Self-service BI with Power BI - nuclear testing', 
  44                 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 
  46                 'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', 
  47                 'authors': [ 'Mike Wilmot' ], 
  52     _RSS_URL 
= 'http://channel9.msdn.com/%s/RSS' 
  55     _known_formats 
= ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] 
  57     def _restore_bytes(self
, formatted_size
): 
  58         if not formatted_size
: 
  60         m 
= re
.match(r
'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size
) 
  63         units 
= m
.group('units') 
  65             exponent 
= ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units
.upper()) 
  68         size 
= float(m
.group('size')) 
  69         return int(size 
* (1024 ** exponent
)) 
  71     def _formats_from_html(self
, html
): 
  74             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s* 
  75             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s* 
  76             (?:<div\s+class="popup\s+rounded">\s* 
  77             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s* 
  78             </div>)?                                                # File size part may be missing 
  80         # Extract known formats 
  82             'url': x
.group('url'), 
  83             'format_id': x
.group('quality'), 
  84             'format_note': x
.group('note'), 
  85             'format': '%s (%s)' % (x
.group('quality'), x
.group('note')), 
  86             'filesize': self
._restore
_bytes
(x
.group('filesize')), # File size is approximate 
  87             'preference': self
._known
_formats
.index(x
.group('quality')), 
  88             'vcodec': 'none' if x
.group('note') == 'Audio only' else None, 
  89         } for x 
in list(re
.finditer(FORMAT_REGEX
, html
)) if x
.group('quality') in self
._known
_formats
] 
  91         self
._sort
_formats
(formats
) 
  95     def _extract_title(self
, html
): 
  96         title 
= self
._html
_search
_meta
('title', html
, 'title') 
  98             title 
= self
._og
_search
_title
(html
) 
  99             TITLE_SUFFIX 
= ' (Channel 9)' 
 100             if title 
is not None and title
.endswith(TITLE_SUFFIX
): 
 101                 title 
= title
[:-len(TITLE_SUFFIX
)] 
 104     def _extract_description(self
, html
): 
 105         DESCRIPTION_REGEX 
= r
'''(?sx) 
 106             <div\s+class="entry-content">\s* 
 107             <div\s+id="entry-body">\s* 
 108             (?P<description>.+?)\s* 
 112         m 
= re
.search(DESCRIPTION_REGEX
, html
) 
 114             return m
.group('description') 
 115         return self
._html
_search
_meta
('description', html
, 'description') 
 117     def _extract_duration(self
, html
): 
 118         m 
= re
.search(r
'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html
) 
 119         return ((int(m
.group('hours')) * 60 * 60) + (int(m
.group('minutes')) * 60) + int(m
.group('seconds'))) if m 
else None 
 121     def _extract_slides(self
, html
): 
 122         m 
= re
.search(r
'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html
) 
 123         return m
.group('slidesurl') if m 
is not None else None 
 125     def _extract_zip(self
, html
): 
 126         m 
= re
.search(r
'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html
) 
 127         return m
.group('zipurl') if m 
is not None else None 
 129     def _extract_avg_rating(self
, html
): 
 130         m 
= re
.search(r
'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html
) 
 131         return float(m
.group('avgrating')) if m 
is not None else 0 
 133     def _extract_rating_count(self
, html
): 
 134         m 
= re
.search(r
'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html
) 
 135         return int(self
._fix
_count
(m
.group('ratingcount'))) if m 
is not None else 0 
 137     def _extract_view_count(self
, html
): 
 138         m 
= re
.search(r
'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html
) 
 139         return int(self
._fix
_count
(m
.group('viewcount'))) if m 
is not None else 0 
 141     def _extract_comment_count(self
, html
): 
 142         m 
= re
.search(r
'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html
) 
 143         return int(self
._fix
_count
(m
.group('commentcount'))) if m 
is not None else 0 
 145     def _fix_count(self
, count
): 
 146         return int(str(count
).replace(',', '')) if count 
is not None else None 
 148     def _extract_authors(self
, html
): 
 149         m 
= re
.search(r
'(?s)<li class="author">(.*?)</li>', html
) 
 152         return re
.findall(r
'<a href="/Niners/[^"]+">([^<]+)</a>', m
.group(1)) 
 154     def _extract_session_code(self
, html
): 
 155         m 
= re
.search(r
'<li class="code">\s*(?P<code>.+?)\s*</li>', html
) 
 156         return m
.group('code') if m 
is not None else None 
 158     def _extract_session_day(self
, html
): 
 159         m 
= re
.search(r
'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html
) 
 160         return m
.group('day') if m 
is not None else None 
 162     def _extract_session_room(self
, html
): 
 163         m 
= re
.search(r
'<li class="room">\s*(?P<room>.+?)\s*</li>', html
) 
 164         return m
.group('room') if m 
is not None else None 
 166     def _extract_session_speakers(self
, html
): 
 167         return re
.findall(r
'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html
) 
 169     def _extract_content(self
, html
, content_path
): 
 170         # Look for downloadable content         
 171         formats 
= self
._formats
_from
_html
(html
) 
 172         slides 
= self
._extract
_slides
(html
) 
 173         zip_ 
= self
._extract
_zip
(html
) 
 175         # Nothing to download 
 176         if len(formats
) == 0 and slides 
is None and zip_ 
is None: 
 177             self
._downloader
.report_warning('None of recording, slides or zip are available for %s' % content_path
) 
 181         title 
= self
._extract
_title
(html
) 
 182         description 
= self
._extract
_description
(html
) 
 183         thumbnail 
= self
._og
_search
_thumbnail
(html
) 
 184         duration 
= self
._extract
_duration
(html
) 
 185         avg_rating 
= self
._extract
_avg
_rating
(html
) 
 186         rating_count 
= self
._extract
_rating
_count
(html
) 
 187         view_count 
= self
._extract
_view
_count
(html
) 
 188         comment_count 
= self
._extract
_comment
_count
(html
) 
 190         common 
= {'_type': 'video', 
 192                   'description': description
, 
 193                   'thumbnail': thumbnail
, 
 194                   'duration': duration
, 
 195                   'avg_rating': avg_rating
, 
 196                   'rating_count': rating_count
, 
 197                   'view_count': view_count
, 
 198                   'comment_count': comment_count
, 
 203         if slides 
is not None: 
 205             d
.update({ 'title': title 
+ '-Slides', 'url': slides 
}) 
 210             d
.update({ 'title': title 
+ '-Zip', 'url': zip_ 
}) 
 215             d
.update({ 'title': title
, 'formats': formats 
}) 
 220     def _extract_entry_item(self
, html
, content_path
): 
 221         contents 
= self
._extract
_content
(html
, content_path
) 
 225         authors 
= self
._extract
_authors
(html
) 
 227         for content 
in contents
: 
 228             content
['authors'] = authors
 
 232     def _extract_session(self
, html
, content_path
): 
 233         contents 
= self
._extract
_content
(html
, content_path
) 
 237         session_meta 
= {'session_code': self
._extract
_session
_code
(html
), 
 238                         'session_day': self
._extract
_session
_day
(html
), 
 239                         'session_room': self
._extract
_session
_room
(html
), 
 240                         'session_speakers': self
._extract
_session
_speakers
(html
), 
 243         for content 
in contents
: 
 244             content
.update(session_meta
) 
 248     def _extract_list(self
, content_path
): 
 249         rss 
= self
._download
_xml
(self
._RSS
_URL 
% content_path
, content_path
, 'Downloading RSS') 
 250         entries 
= [self
.url_result(session_url
.text
, 'Channel9') 
 251                    for session_url 
in rss
.findall('./channel/item/link')] 
 252         title_text 
= rss
.find('./channel/title').text
 
 253         return self
.playlist_result(entries
, content_path
, title_text
) 
 255     def _real_extract(self
, url
): 
 256         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 257         content_path 
= mobj
.group('contentpath') 
 259         webpage 
= self
._download
_webpage
(url
, content_path
, 'Downloading web page') 
 261         page_type_m 
= re
.search(r
'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage
) 
 262         if page_type_m 
is None: 
 263             raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected
=True) 
 265         page_type 
= page_type_m
.group('pagetype') 
 266         if page_type 
== 'List':         # List page, may contain list of 'item'-like objects 
 267             return self
._extract
_list
(content_path
) 
 268         elif page_type 
== 'Entry.Item': # Any 'item'-like page, may contain downloadable content 
 269             return self
._extract
_entry
_item
(webpage
, content_path
) 
 270         elif page_type 
== 'Session':    # Event session page, may contain downloadable content 
 271             return self
._extract
_session
(webpage
, content_path
) 
 273             raise ExtractorError('Unexpected Search.PageType %s' % page_type
, expected
=True)