]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError
 
   9 class Channel9IE(InfoExtractor
): 
  11     Common extractor for channel9.msdn.com. 
  13     The type of provided URL (video or playlist) is determined according to 
  14     meta Search.PageType from web page HTML rather than URL itself, as it is 
  15     not always possible to do. 
  19     _VALID_URL 
= r
'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?' 
  23             'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', 
  24             'md5': 'bbd75296ba47916b754e73c3a4bbdf10', 
  26                 'id': 'Events/TechEd/Australia/2013/KOS002', 
  28                 'title': 'Developer Kick-Off Session: Stuff We Love', 
  29                 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 
  31                 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', 
  32                 'session_code': 'KOS002', 
  33                 'session_day': 'Day 1', 
  34                 'session_room': 'Arena 1A', 
  35                 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'], 
  39             'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', 
  40             'md5': 'b43ee4529d111bc37ba7ee4f34813e68', 
  42                 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', 
  44                 'title': 'Self-service BI with Power BI - nuclear testing', 
  45                 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 
  47                 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', 
  48                 'authors': ['Mike Wilmot'], 
  53     _RSS_URL 
= 'http://channel9.msdn.com/%s/RSS' 
  56     _known_formats 
= ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] 
  58     def _restore_bytes(self
, formatted_size
): 
  59         if not formatted_size
: 
  61         m 
= re
.match(r
'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size
) 
  64         units 
= m
.group('units') 
  66             exponent 
= ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units
.upper()) 
  69         size 
= float(m
.group('size')) 
  70         return int(size 
* (1024 ** exponent
)) 
  72     def _formats_from_html(self
, html
): 
  75             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s* 
  76             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s* 
  77             (?:<div\s+class="popup\s+rounded">\s* 
  78             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s* 
  79             </div>)?                                                # File size part may be missing 
  81         # Extract known formats 
  83             'url': x
.group('url'), 
  84             'format_id': x
.group('quality'), 
  85             'format_note': x
.group('note'), 
  86             'format': '%s (%s)' % (x
.group('quality'), x
.group('note')), 
  87             'filesize': self
._restore
_bytes
(x
.group('filesize')),  # File size is approximate 
  88             'preference': self
._known
_formats
.index(x
.group('quality')), 
  89             'vcodec': 'none' if x
.group('note') == 'Audio only' else None, 
  90         } for x 
in list(re
.finditer(FORMAT_REGEX
, html
)) if x
.group('quality') in self
._known
_formats
] 
  92         self
._sort
_formats
(formats
) 
  96     def _extract_title(self
, html
): 
  97         title 
= self
._html
_search
_meta
('title', html
, 'title') 
  99             title 
= self
._og
_search
_title
(html
) 
 100             TITLE_SUFFIX 
= ' (Channel 9)' 
 101             if title 
is not None and title
.endswith(TITLE_SUFFIX
): 
 102                 title 
= title
[:-len(TITLE_SUFFIX
)] 
 105     def _extract_description(self
, html
): 
 106         DESCRIPTION_REGEX 
= r
'''(?sx) 
 107             <div\s+class="entry-content">\s* 
 108             <div\s+id="entry-body">\s* 
 109             (?P<description>.+?)\s* 
 113         m 
= re
.search(DESCRIPTION_REGEX
, html
) 
 115             return m
.group('description') 
 116         return self
._html
_search
_meta
('description', html
, 'description') 
 118     def _extract_duration(self
, html
): 
 119         m 
= re
.search(r
'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html
) 
 120         return ((int(m
.group('hours')) * 60 * 60) + (int(m
.group('minutes')) * 60) + int(m
.group('seconds'))) if m 
else None 
 122     def _extract_slides(self
, html
): 
 123         m 
= re
.search(r
'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html
) 
 124         return m
.group('slidesurl') if m 
is not None else None 
 126     def _extract_zip(self
, html
): 
 127         m 
= re
.search(r
'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html
) 
 128         return m
.group('zipurl') if m 
is not None else None 
 130     def _extract_avg_rating(self
, html
): 
 131         m 
= re
.search(r
'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html
) 
 132         return float(m
.group('avgrating')) if m 
is not None else 0 
 134     def _extract_rating_count(self
, html
): 
 135         m 
= re
.search(r
'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html
) 
 136         return int(self
._fix
_count
(m
.group('ratingcount'))) if m 
is not None else 0 
 138     def _extract_view_count(self
, html
): 
 139         m 
= re
.search(r
'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html
) 
 140         return int(self
._fix
_count
(m
.group('viewcount'))) if m 
is not None else 0 
 142     def _extract_comment_count(self
, html
): 
 143         m 
= re
.search(r
'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html
) 
 144         return int(self
._fix
_count
(m
.group('commentcount'))) if m 
is not None else 0 
 146     def _fix_count(self
, count
): 
 147         return int(str(count
).replace(',', '')) if count 
is not None else None 
 149     def _extract_authors(self
, html
): 
 150         m 
= re
.search(r
'(?s)<li class="author">(.*?)</li>', html
) 
 153         return re
.findall(r
'<a href="/Niners/[^"]+">([^<]+)</a>', m
.group(1)) 
 155     def _extract_session_code(self
, html
): 
 156         m 
= re
.search(r
'<li class="code">\s*(?P<code>.+?)\s*</li>', html
) 
 157         return m
.group('code') if m 
is not None else None 
 159     def _extract_session_day(self
, html
): 
 160         m 
= re
.search(r
'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html
) 
 161         return m
.group('day') if m 
is not None else None 
 163     def _extract_session_room(self
, html
): 
 164         m 
= re
.search(r
'<li class="room">\s*(?P<room>.+?)\s*</li>', html
) 
 165         return m
.group('room') if m 
is not None else None 
 167     def _extract_session_speakers(self
, html
): 
 168         return re
.findall(r
'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html
) 
 170     def _extract_content(self
, html
, content_path
): 
 171         # Look for downloadable content 
 172         formats 
= self
._formats
_from
_html
(html
) 
 173         slides 
= self
._extract
_slides
(html
) 
 174         zip_ 
= self
._extract
_zip
(html
) 
 176         # Nothing to download 
 177         if len(formats
) == 0 and slides 
is None and zip_ 
is None: 
 178             self
._downloader
.report_warning('None of recording, slides or zip are available for %s' % content_path
) 
 182         title 
= self
._extract
_title
(html
) 
 183         description 
= self
._extract
_description
(html
) 
 184         thumbnail 
= self
._og
_search
_thumbnail
(html
) 
 185         duration 
= self
._extract
_duration
(html
) 
 186         avg_rating 
= self
._extract
_avg
_rating
(html
) 
 187         rating_count 
= self
._extract
_rating
_count
(html
) 
 188         view_count 
= self
._extract
_view
_count
(html
) 
 189         comment_count 
= self
._extract
_comment
_count
(html
) 
 194             'description': description
, 
 195             'thumbnail': thumbnail
, 
 196             'duration': duration
, 
 197             'avg_rating': avg_rating
, 
 198             'rating_count': rating_count
, 
 199             'view_count': view_count
, 
 200             'comment_count': comment_count
, 
 205         if slides 
is not None: 
 207             d
.update({'title': title 
+ '-Slides', 'url': slides
}) 
 212             d
.update({'title': title 
+ '-Zip', 'url': zip_
}) 
 217             d
.update({'title': title
, 'formats': formats
}) 
 222     def _extract_entry_item(self
, html
, content_path
): 
 223         contents 
= self
._extract
_content
(html
, content_path
) 
 227         authors 
= self
._extract
_authors
(html
) 
 229         for content 
in contents
: 
 230             content
['authors'] = authors
 
 234     def _extract_session(self
, html
, content_path
): 
 235         contents 
= self
._extract
_content
(html
, content_path
) 
 239         session_meta 
= {'session_code': self
._extract
_session
_code
(html
), 
 240                         'session_day': self
._extract
_session
_day
(html
), 
 241                         'session_room': self
._extract
_session
_room
(html
), 
 242                         'session_speakers': self
._extract
_session
_speakers
(html
), 
 245         for content 
in contents
: 
 246             content
.update(session_meta
) 
 250     def _extract_list(self
, content_path
): 
 251         rss 
= self
._download
_xml
(self
._RSS
_URL 
% content_path
, content_path
, 'Downloading RSS') 
 252         entries 
= [self
.url_result(session_url
.text
, 'Channel9') 
 253                    for session_url 
in rss
.findall('./channel/item/link')] 
 254         title_text 
= rss
.find('./channel/title').text
 
 255         return self
.playlist_result(entries
, content_path
, title_text
) 
 257     def _real_extract(self
, url
): 
 258         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 259         content_path 
= mobj
.group('contentpath') 
 261         webpage 
= self
._download
_webpage
(url
, content_path
, 'Downloading web page') 
 263         page_type_m 
= re
.search(r
'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage
) 
 264         if page_type_m 
is not None: 
 265             page_type 
= page_type_m
.group('pagetype') 
 266             if page_type 
== 'Entry':      # Any 'item'-like page, may contain downloadable content 
 267                 return self
._extract
_entry
_item
(webpage
, content_path
) 
 268             elif page_type 
== 'Session':  # Event session page, may contain downloadable content 
 269                 return self
._extract
_session
(webpage
, content_path
) 
 270             elif page_type 
== 'Event': 
 271                 return self
._extract
_list
(content_path
) 
 273                 raise ExtractorError('Unexpected WT.entryid %s' % page_type
, expected
=True) 
 275         else:  # Assuming list 
 276             return self
._extract
_list
(content_path
)