]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py
574881b70de67b9521b5e813f0cafa6da59d1068
   5 from .common 
import InfoExtractor
 
   6 from ..utils 
import ExtractorError
 
   8 class Channel9IE(InfoExtractor
): 
  10     Common extractor for channel9.msdn.com. 
  12     The type of the provided URL (video or playlist) is determined from the 
  13     Search.PageType meta tag of the web page HTML rather than from the URL 
  14     itself, because the URL alone does not always make the type clear.     
  16     IE_DESC 
= u
'Channel 9' 
  18     _VALID_URL 
= r
'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?' 
  22             u
'url': u
'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', 
  23             u
'file': u
'Events_TechEd_Australia_2013_KOS002.mp4', 
  24             u
'md5': u
'bbd75296ba47916b754e73c3a4bbdf10', 
  26                 u
'title': u
'Developer Kick-Off Session: Stuff We Love', 
  27                 u
'description': u
'md5:c08d72240b7c87fcecafe2692f80e35f', 
  29                 u
'thumbnail': u
'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', 
  30                 u
'session_code': u
'KOS002', 
  31                 u
'session_day': u
'Day 1', 
  32                 u
'session_room': u
'Arena 1A', 
  33                 u
'session_speakers': [ u
'Ed Blankenship', u
'Andrew Coates', u
'Brady Gaster', u
'Patrick Klug', u
'Mads Kristensen' ], 
  37             u
'url': u
'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', 
  38             u
'file': u
'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4', 
  39             u
'md5': u
'b43ee4529d111bc37ba7ee4f34813e68', 
  41                 u
'title': u
'Self-service BI with Power BI - nuclear testing', 
  42                 u
'description': u
'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 
  44                 u
'thumbnail': u
'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', 
  45                 u
'authors': [ u
'Mike Wilmot' ], 
  50     _RSS_URL 
= 'http://channel9.msdn.com/%s/RSS' 
  53     _known_formats 
= ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] 
  55     def _restore_bytes(self
, formatted_size
): 
  56         if not formatted_size
: 
  58         m 
= re
.match(r
'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size
) 
  61         units 
= m
.group('units') 
  63             exponent 
= [u
'B', u
'KB', u
'MB', u
'GB', u
'TB', u
'PB', u
'EB', u
'ZB', u
'YB'].index(units
.upper()) 
  66         size 
= float(m
.group('size')) 
  67         return int(size 
* (1024 ** exponent
)) 
  69     def _formats_from_html(self
, html
): 
  72             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s* 
  73             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s* 
  74             (?:<div\s+class="popup\s+rounded">\s* 
  75             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s* 
  76             </div>)?                                                # File size part may be missing 
  78         # Extract known formats 
  80             'url': x
.group('url'), 
  81             'format_id': x
.group('quality'), 
  82             'format_note': x
.group('note'), 
  83             'format': u
'%s (%s)' % (x
.group('quality'), x
.group('note')), 
  84             'filesize': self
._restore
_bytes
(x
.group('filesize')), # File size is approximate 
  85             'preference': self
._known
_formats
.index(x
.group('quality')), 
  86             'vcodec': 'none' if x
.group('note') == 'Audio only' else None, 
  87         } for x 
in list(re
.finditer(FORMAT_REGEX
, html
)) if x
.group('quality') in self
._known
_formats
] 
  89         self
._sort
_formats
(formats
) 
  93     def _extract_title(self
, html
): 
  94         title 
= self
._html
_search
_meta
(u
'title', html
, u
'title') 
  96             title 
= self
._og
_search
_title
(html
) 
  97             TITLE_SUFFIX 
= u
' (Channel 9)' 
  98             if title 
is not None and title
.endswith(TITLE_SUFFIX
): 
  99                 title 
= title
[:-len(TITLE_SUFFIX
)] 
 102     def _extract_description(self
, html
): 
 103         DESCRIPTION_REGEX 
= r
'''(?sx) 
 104             <div\s+class="entry-content">\s* 
 105             <div\s+id="entry-body">\s* 
 106             (?P<description>.+?)\s* 
 110         m 
= re
.search(DESCRIPTION_REGEX
, html
) 
 112             return m
.group('description') 
 113         return self
._html
_search
_meta
(u
'description', html
, u
'description') 
 115     def _extract_duration(self
, html
): 
 116         m 
= re
.search(r
'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html
) 
 117         return ((int(m
.group('hours')) * 60 * 60) + (int(m
.group('minutes')) * 60) + int(m
.group('seconds'))) if m 
else None 
 119     def _extract_slides(self
, html
): 
 120         m 
= re
.search(r
'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html
) 
 121         return m
.group('slidesurl') if m 
is not None else None 
 123     def _extract_zip(self
, html
): 
 124         m 
= re
.search(r
'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html
) 
 125         return m
.group('zipurl') if m 
is not None else None 
 127     def _extract_avg_rating(self
, html
): 
 128         m 
= re
.search(r
'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html
) 
 129         return float(m
.group('avgrating')) if m 
is not None else 0 
 131     def _extract_rating_count(self
, html
): 
 132         m 
= re
.search(r
'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html
) 
 133         return int(self
._fix
_count
(m
.group('ratingcount'))) if m 
is not None else 0 
 135     def _extract_view_count(self
, html
): 
 136         m 
= re
.search(r
'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html
) 
 137         return int(self
._fix
_count
(m
.group('viewcount'))) if m 
is not None else 0 
 139     def _extract_comment_count(self
, html
): 
 140         m 
= re
.search(r
'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html
) 
 141         return int(self
._fix
_count
(m
.group('commentcount'))) if m 
is not None else 0 
 143     def _fix_count(self
, count
): 
 144         return int(str(count
).replace(',', '')) if count 
is not None else None 
 146     def _extract_authors(self
, html
): 
 147         m 
= re
.search(r
'(?s)<li class="author">(.*?)</li>', html
) 
 150         return re
.findall(r
'<a href="/Niners/[^"]+">([^<]+)</a>', m
.group(1)) 
 152     def _extract_session_code(self
, html
): 
 153         m 
= re
.search(r
'<li class="code">\s*(?P<code>.+?)\s*</li>', html
) 
 154         return m
.group('code') if m 
is not None else None 
 156     def _extract_session_day(self
, html
): 
 157         m 
= re
.search(r
'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html
) 
 158         return m
.group('day') if m 
is not None else None 
 160     def _extract_session_room(self
, html
): 
 161         m 
= re
.search(r
'<li class="room">\s*(?P<room>.+?)\s*</li>', html
) 
 162         return m
.group('room') if m 
is not None else None 
 164     def _extract_session_speakers(self
, html
): 
 165         return re
.findall(r
'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html
) 
 167     def _extract_content(self
, html
, content_path
): 
 168         # Look for downloadable content         
 169         formats 
= self
._formats
_from
_html
(html
) 
 170         slides 
= self
._extract
_slides
(html
) 
 171         zip_ 
= self
._extract
_zip
(html
) 
 173         # Nothing to download 
 174         if len(formats
) == 0 and slides 
is None and zip_ 
is None: 
 175             self
._downloader
.report_warning(u
'None of recording, slides or zip are available for %s' % content_path
) 
 179         title 
= self
._extract
_title
(html
) 
 180         description 
= self
._extract
_description
(html
) 
 181         thumbnail 
= self
._og
_search
_thumbnail
(html
) 
 182         duration 
= self
._extract
_duration
(html
) 
 183         avg_rating 
= self
._extract
_avg
_rating
(html
) 
 184         rating_count 
= self
._extract
_rating
_count
(html
) 
 185         view_count 
= self
._extract
_view
_count
(html
) 
 186         comment_count 
= self
._extract
_comment
_count
(html
) 
 188         common 
= {'_type': 'video', 
 190                   'description': description
, 
 191                   'thumbnail': thumbnail
, 
 192                   'duration': duration
, 
 193                   'avg_rating': avg_rating
, 
 194                   'rating_count': rating_count
, 
 195                   'view_count': view_count
, 
 196                   'comment_count': comment_count
, 
 201         if slides 
is not None: 
 203             d
.update({ 'title': title 
+ '-Slides', 'url': slides 
}) 
 208             d
.update({ 'title': title 
+ '-Zip', 'url': zip_ 
}) 
 213             d
.update({ 'title': title
, 'formats': formats 
}) 
 218     def _extract_entry_item(self
, html
, content_path
): 
 219         contents 
= self
._extract
_content
(html
, content_path
) 
 223         authors 
= self
._extract
_authors
(html
) 
 225         for content 
in contents
: 
 226             content
['authors'] = authors
 
 230     def _extract_session(self
, html
, content_path
): 
 231         contents 
= self
._extract
_content
(html
, content_path
) 
 235         session_meta 
= {'session_code': self
._extract
_session
_code
(html
), 
 236                         'session_day': self
._extract
_session
_day
(html
), 
 237                         'session_room': self
._extract
_session
_room
(html
), 
 238                         'session_speakers': self
._extract
_session
_speakers
(html
), 
 241         for content 
in contents
: 
 242             content
.update(session_meta
) 
 246     def _extract_list(self
, content_path
): 
 247         rss 
= self
._download
_xml
(self
._RSS
_URL 
% content_path
, content_path
, u
'Downloading RSS') 
 248         entries 
= [self
.url_result(session_url
.text
, 'Channel9') 
 249                    for session_url 
in rss
.findall('./channel/item/link')] 
 250         title_text 
= rss
.find('./channel/title').text
 
 251         return self
.playlist_result(entries
, content_path
, title_text
) 
 253     def _real_extract(self
, url
): 
 254         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 255         content_path 
= mobj
.group('contentpath') 
 257         webpage 
= self
._download
_webpage
(url
, content_path
, u
'Downloading web page') 
 259         page_type_m 
= re
.search(r
'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage
) 
 260         if page_type_m 
is None: 
 261             raise ExtractorError(u
'Search.PageType not found, don\'t know how to process this page', expected
=True) 
 263         page_type 
= page_type_m
.group('pagetype') 
 264         if page_type 
== 'List':         # List page, may contain list of 'item'-like objects 
 265             return self
._extract
_list
(content_path
) 
 266         elif page_type 
== 'Entry.Item': # Any 'item'-like page, may contain downloadable content 
 267             return self
._extract
_entry
_item
(webpage
, content_path
) 
 268         elif page_type 
== 'Session':    # Event session page, may contain downloadable content 
 269             return self
._extract
_session
(webpage
, content_path
) 
 271             raise ExtractorError(u
'Unexpected Search.PageType %s' % page_type
, expected
=True)