Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py

   1 # encoding: utf-8
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import ExtractorError
   7
   8 class Channel9IE(InfoExtractor):
   9     '''
  10     Common extractor for channel9.msdn.com.
  11
  12     The type of provided URL (video or playlist) is determined according to
  13     meta Search.PageType from web page HTML rather than URL itself, as it is
  14     not always possible to do.
  15     '''
  16     IE_DESC = u'Channel 9'
  17     IE_NAME = u'channel9'
  18     _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
  19
  20     _TESTS = [
  21         {
  22             u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
  23             u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
  24             u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
  25             u'info_dict': {
  26                 u'title': u'Developer Kick-Off Session: Stuff We Love',
  27                 u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
  28                 u'duration': 4576,
  29                 u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
  30                 u'session_code': u'KOS002',
  31                 u'session_day': u'Day 1',
  32                 u'session_room': u'Arena 1A',
  33                 u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
  34             },
  35         },
  36         {
  37             u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
  38             u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
  39             u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
  40             u'info_dict': {
  41                 u'title': u'Self-service BI with Power BI - nuclear testing',
  42                 u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
  43                 u'duration': 1540,
  44                 u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
  45                 u'authors': [ u'Mike Wilmot' ],
  46             },
  47         }
  48     ]
  49
  50     _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
  51
  52     # Sorted by quality
  53     _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
  54
  55     def _restore_bytes(self, formatted_size):
  56         if not formatted_size:
  57             return 0
  58         m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
  59         if not m:
  60             return 0
  61         units = m.group('units')
  62         try:
  63             exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
  64         except ValueError:
  65             return 0
  66         size = float(m.group('size'))
  67         return int(size * (1024 ** exponent))
  68
  69     def _formats_from_html(self, html):
  70         FORMAT_REGEX = r'''
  71             (?x)
  72             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
  73             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
  74             (?:<div\s+class="popup\s+rounded">\s*
  75             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
  76             </div>)?                                                # File size part may be missing
  77         '''
  78         # Extract known formats
  79         formats = [{
  80             'url': x.group('url'),
  81             'format_id': x.group('quality'),
  82             'format_note': x.group('note'),
  83             'format': u'%s (%s)' % (x.group('quality'), x.group('note')),
  84             'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
  85             'preference': self._known_formats.index(x.group('quality')),
  86             'vcodec': 'none' if x.group('note') == 'Audio only' else None,
  87         } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
  88
  89         self._sort_formats(formats)
  90
  91         return formats
  92
  93     def _extract_title(self, html):
  94         title = self._html_search_meta(u'title', html, u'title')
  95         if title is None:
  96             title = self._og_search_title(html)
  97             TITLE_SUFFIX = u' (Channel 9)'
  98             if title is not None and title.endswith(TITLE_SUFFIX):
  99                 title = title[:-len(TITLE_SUFFIX)]
 100         return title
 101
 102     def _extract_description(self, html):
 103         DESCRIPTION_REGEX = r'''(?sx)
 104             <div\s+class="entry-content">\s*
 105             <div\s+id="entry-body">\s*
 106             (?P<description>.+?)\s*
 107             </div>\s*
 108             </div>
 109         '''
 110         m = re.search(DESCRIPTION_REGEX, html)
 111         if m is not None:
 112             return m.group('description')
 113         return self._html_search_meta(u'description', html, u'description')
 114
 115     def _extract_duration(self, html):
 116         m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
 117         return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
 118
 119     def _extract_slides(self, html):
 120         m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
 121         return m.group('slidesurl') if m is not None else None
 122
 123     def _extract_zip(self, html):
 124         m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
 125         return m.group('zipurl') if m is not None else None
 126
 127     def _extract_avg_rating(self, html):
 128         m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
 129         return float(m.group('avgrating')) if m is not None else 0
 130
 131     def _extract_rating_count(self, html):
 132         m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
 133         return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
 134
 135     def _extract_view_count(self, html):
 136         m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
 137         return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
 138
 139     def _extract_comment_count(self, html):
 140         m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
 141         return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
 142
 143     def _fix_count(self, count):
 144         return int(str(count).replace(',', '')) if count is not None else None
 145
 146     def _extract_authors(self, html):
 147         m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
 148         if m is None:
 149             return None
 150         return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
 151
 152     def _extract_session_code(self, html):
 153         m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
 154         return m.group('code') if m is not None else None
 155
 156     def _extract_session_day(self, html):
 157         m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
 158         return m.group('day') if m is not None else None
 159
 160     def _extract_session_room(self, html):
 161         m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
 162         return m.group('room') if m is not None else None
 163
 164     def _extract_session_speakers(self, html):
 165         return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
 166
 167     def _extract_content(self, html, content_path):
 168         # Look for downloadable content
 169         formats = self._formats_from_html(html)
 170         slides = self._extract_slides(html)
 171         zip_ = self._extract_zip(html)
 172
 173         # Nothing to download
 174         if len(formats) == 0 and slides is None and zip_ is None:
 175             self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
 176             return
 177
 178         # Extract meta
 179         title = self._extract_title(html)
 180         description = self._extract_description(html)
 181         thumbnail = self._og_search_thumbnail(html)
 182         duration = self._extract_duration(html)
 183         avg_rating = self._extract_avg_rating(html)
 184         rating_count = self._extract_rating_count(html)
 185         view_count = self._extract_view_count(html)
 186         comment_count = self._extract_comment_count(html)
 187
 188         common = {'_type': 'video',
 189                   'id': content_path,
 190                   'description': description,
 191                   'thumbnail': thumbnail,
 192                   'duration': duration,
 193                   'avg_rating': avg_rating,
 194                   'rating_count': rating_count,
 195                   'view_count': view_count,
 196                   'comment_count': comment_count,
 197                 }
 198
 199         result = []
 200
 201         if slides is not None:
 202             d = common.copy()
 203             d.update({ 'title': title + '-Slides', 'url': slides })
 204             result.append(d)
 205
 206         if zip_ is not None:
 207             d = common.copy()
 208             d.update({ 'title': title + '-Zip', 'url': zip_ })
 209             result.append(d)
 210
 211         if len(formats) > 0:
 212             d = common.copy()
 213             d.update({ 'title': title, 'formats': formats })
 214             result.append(d)
 215
 216         return result
 217
 218     def _extract_entry_item(self, html, content_path):
 219         contents = self._extract_content(html, content_path)
 220         if contents is None:
 221             return contents
 222
 223         authors = self._extract_authors(html)
 224
 225         for content in contents:
 226             content['authors'] = authors
 227
 228         return contents
 229
 230     def _extract_session(self, html, content_path):
 231         contents = self._extract_content(html, content_path)
 232         if contents is None:
 233             return contents
 234
 235         session_meta = {'session_code': self._extract_session_code(html),
 236                         'session_day': self._extract_session_day(html),
 237                         'session_room': self._extract_session_room(html),
 238                         'session_speakers': self._extract_session_speakers(html),
 239                         }
 240
 241         for content in contents:
 242             content.update(session_meta)
 243
 244         return contents
 245
 246     def _extract_list(self, content_path):
 247         rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
 248         entries = [self.url_result(session_url.text, 'Channel9')
 249                    for session_url in rss.findall('./channel/item/link')]
 250         title_text = rss.find('./channel/title').text
 251         return self.playlist_result(entries, content_path, title_text)
 252
 253     def _real_extract(self, url):
 254         mobj = re.match(self._VALID_URL, url)
 255         content_path = mobj.group('contentpath')
 256
 257         webpage = self._download_webpage(url, content_path, u'Downloading web page')
 258
 259         page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
 260         if page_type_m is None:
 261             raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)
 262
 263         page_type = page_type_m.group('pagetype')
 264         if page_type == 'List':         # List page, may contain list of 'item'-like objects
 265             return self._extract_list(content_path)
 266         elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
 267             return self._extract_entry_item(webpage, content_path)
 268         elif page_type == 'Session':    # Event session page, may contain downloadable content
 269             return self._extract_session(webpage, content_path)
 270         else:
 271             raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)