# youtube_dl/extractor/channel9.py (recovered from a gitweb blob scrape)
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError
 
   9 class Channel9IE(InfoExtractor
): 
  11     Common extractor for channel9.msdn.com. 
  13     The type of provided URL (video or playlist) is determined according to 
  14     meta Search.PageType from web page HTML rather than URL itself, as it is 
  15     not always possible to do. 
  19     _VALID_URL 
= r
'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?' 
  23             'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', 
  24             'md5': 'bbd75296ba47916b754e73c3a4bbdf10', 
  26                 'id': 'Events/TechEd/Australia/2013/KOS002', 
  28                 'title': 'Developer Kick-Off Session: Stuff We Love', 
  29                 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 
  31                 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', 
  32                 'session_code': 'KOS002', 
  33                 'session_day': 'Day 1', 
  34                 'session_room': 'Arena 1A', 
  35                 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'], 
  39             'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', 
  40             'md5': 'b43ee4529d111bc37ba7ee4f34813e68', 
  42                 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', 
  44                 'title': 'Self-service BI with Power BI - nuclear testing', 
  45                 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 
  47                 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', 
  48                 'authors': ['Mike Wilmot'], 
  53     _RSS_URL 
= 'http://channel9.msdn.com/%s/RSS' 
  56     _known_formats 
= ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] 
  58     def _restore_bytes(self
, formatted_size
): 
  59         if not formatted_size
: 
  61         m 
= re
.match(r
'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size
) 
  64         units 
= m
.group('units') 
  66             exponent 
= ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units
.upper()) 
  69         size 
= float(m
.group('size')) 
  70         return int(size 
* (1024 ** exponent
)) 
  72     def _formats_from_html(self
, html
): 
  75             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s* 
  76             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s* 
  77             (?:<div\s+class="popup\s+rounded">\s* 
  78             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s* 
  79             </div>)?                                                # File size part may be missing 
  81         # Extract known formats 
  83             'url': x
.group('url'), 
  84             'format_id': x
.group('quality'), 
  85             'format_note': x
.group('note'), 
  86             'format': '%s (%s)' % (x
.group('quality'), x
.group('note')), 
  87             'filesize': self
._restore
_bytes
(x
.group('filesize')),  # File size is approximate 
  88             'preference': self
._known
_formats
.index(x
.group('quality')), 
  89             'vcodec': 'none' if x
.group('note') == 'Audio only' else None, 
  90         } for x 
in list(re
.finditer(FORMAT_REGEX
, html
)) if x
.group('quality') in self
._known
_formats
] 
  92         self
._sort
_formats
(formats
) 
  96     def _extract_title(self
, html
): 
  97         title 
= self
._html
_search
_meta
('title', html
, 'title') 
  99             title 
= self
._og
_search
_title
(html
) 
 100             TITLE_SUFFIX 
= ' (Channel 9)' 
 101             if title 
is not None and title
.endswith(TITLE_SUFFIX
): 
 102                 title 
= title
[:-len(TITLE_SUFFIX
)] 
 105     def _extract_description(self
, html
): 
 106         DESCRIPTION_REGEX 
= r
'''(?sx) 
 107             <div\s+class="entry-content">\s* 
 108             <div\s+id="entry-body">\s* 
 109             (?P<description>.+?)\s* 
 113         m 
= re
.search(DESCRIPTION_REGEX
, html
) 
 115             return m
.group('description') 
 116         return self
._html
_search
_meta
('description', html
, 'description') 
 118     def _extract_duration(self
, html
): 
 119         m 
= re
.search(r
'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html
) 
 120         return ((int(m
.group('hours')) * 60 * 60) + (int(m
.group('minutes')) * 60) + int(m
.group('seconds'))) if m 
else None 
 122     def _extract_slides(self
, html
): 
 123         m 
= re
.search(r
'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html
) 
 124         return m
.group('slidesurl') if m 
is not None else None 
 126     def _extract_zip(self
, html
): 
 127         m 
= re
.search(r
'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html
) 
 128         return m
.group('zipurl') if m 
is not None else None 
 130     def _extract_avg_rating(self
, html
): 
 131         m 
= re
.search(r
'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html
) 
 132         return float(m
.group('avgrating')) if m 
is not None else 0 
 134     def _extract_rating_count(self
, html
): 
 135         m 
= re
.search(r
'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html
) 
 136         return int(self
._fix
_count
(m
.group('ratingcount'))) if m 
is not None else 0 
 138     def _extract_view_count(self
, html
): 
 139         m 
= re
.search(r
'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html
) 
 140         return int(self
._fix
_count
(m
.group('viewcount'))) if m 
is not None else 0 
 142     def _extract_comment_count(self
, html
): 
 143         m 
= re
.search(r
'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html
) 
 144         return int(self
._fix
_count
(m
.group('commentcount'))) if m 
is not None else 0 
 146     def _fix_count(self
, count
): 
 147         return int(str(count
).replace(',', '')) if count 
is not None else None 
 149     def _extract_authors(self
, html
): 
 150         m 
= re
.search(r
'(?s)<li class="author">(.*?)</li>', html
) 
 153         return re
.findall(r
'<a href="/Niners/[^"]+">([^<]+)</a>', m
.group(1)) 
 155     def _extract_session_code(self
, html
): 
 156         m 
= re
.search(r
'<li class="code">\s*(?P<code>.+?)\s*</li>', html
) 
 157         return m
.group('code') if m 
is not None else None 
 159     def _extract_session_day(self
, html
): 
 160         m 
= re
.search(r
'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html
) 
 161         return m
.group('day') if m 
is not None else None 
 163     def _extract_session_room(self
, html
): 
 164         m 
= re
.search(r
'<li class="room">\s*(?P<room>.+?)\s*</li>', html
) 
 165         return m
.group('room') if m 
is not None else None 
 167     def _extract_session_speakers(self
, html
): 
 168         return re
.findall(r
'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html
) 
 170     def _extract_content(self
, html
, content_path
): 
 171         # Look for downloadable content 
 172         formats 
= self
._formats
_from
_html
(html
) 
 173         slides 
= self
._extract
_slides
(html
) 
 174         zip_ 
= self
._extract
_zip
(html
) 
 176         # Nothing to download 
 177         if len(formats
) == 0 and slides 
is None and zip_ 
is None: 
 178             self
._downloader
.report_warning('None of recording, slides or zip are available for %s' % content_path
) 
 182         title 
= self
._extract
_title
(html
) 
 183         description 
= self
._extract
_description
(html
) 
 184         thumbnail 
= self
._og
_search
_thumbnail
(html
) 
 185         duration 
= self
._extract
_duration
(html
) 
 186         avg_rating 
= self
._extract
_avg
_rating
(html
) 
 187         rating_count 
= self
._extract
_rating
_count
(html
) 
 188         view_count 
= self
._extract
_view
_count
(html
) 
 189         comment_count 
= self
._extract
_comment
_count
(html
) 
 194             'description': description
, 
 195             'thumbnail': thumbnail
, 
 196             'duration': duration
, 
 197             'avg_rating': avg_rating
, 
 198             'rating_count': rating_count
, 
 199             'view_count': view_count
, 
 200             'comment_count': comment_count
, 
 205         if slides 
is not None: 
 207             d
.update({'title': title 
+ '-Slides', 'url': slides
}) 
 212             d
.update({'title': title 
+ '-Zip', 'url': zip_
}) 
 217             d
.update({'title': title
, 'formats': formats
}) 
 222     def _extract_entry_item(self
, html
, content_path
): 
 223         contents 
= self
._extract
_content
(html
, content_path
) 
 227         authors 
= self
._extract
_authors
(html
) 
 229         for content 
in contents
: 
 230             content
['authors'] = authors
 
 234     def _extract_session(self
, html
, content_path
): 
 235         contents 
= self
._extract
_content
(html
, content_path
) 
 240             'session_code': self
._extract
_session
_code
(html
), 
 241             'session_day': self
._extract
_session
_day
(html
), 
 242             'session_room': self
._extract
_session
_room
(html
), 
 243             'session_speakers': self
._extract
_session
_speakers
(html
), 
 246         for content 
in contents
: 
 247             content
.update(session_meta
) 
 249         return self
.playlist_result(contents
) 
 251     def _extract_list(self
, content_path
): 
 252         rss 
= self
._download
_xml
(self
._RSS
_URL 
% content_path
, content_path
, 'Downloading RSS') 
 253         entries 
= [self
.url_result(session_url
.text
, 'Channel9') 
 254                    for session_url 
in rss
.findall('./channel/item/link')] 
 255         title_text 
= rss
.find('./channel/title').text
 
 256         return self
.playlist_result(entries
, content_path
, title_text
) 
 258     def _real_extract(self
, url
): 
 259         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 260         content_path 
= mobj
.group('contentpath') 
 262         webpage 
= self
._download
_webpage
(url
, content_path
, 'Downloading web page') 
 264         page_type_m 
= re
.search(r
'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage
) 
 265         if page_type_m 
is not None: 
 266             page_type 
= page_type_m
.group('pagetype') 
 267             if page_type 
== 'Entry':      # Any 'item'-like page, may contain downloadable content 
 268                 return self
._extract
_entry
_item
(webpage
, content_path
) 
 269             elif page_type 
== 'Session':  # Event session page, may contain downloadable content 
 270                 return self
._extract
_session
(webpage
, content_path
) 
 271             elif page_type 
== 'Event': 
 272                 return self
._extract
_list
(content_path
) 
 274                 raise ExtractorError('Unexpected WT.entryid %s' % page_type
, expected
=True) 
 276         else:  # Assuming list 
 277             return self
._extract
_list
(content_path
)