Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import ExtractorError
   7
   8
   9 class Channel9IE(InfoExtractor):
  10     '''
  11     Common extractor for channel9.msdn.com.
  12
  13     The type of provided URL (video or playlist) is determined according to
  14     meta Search.PageType from web page HTML rather than URL itself, as it is
  15     not always possible to do.
  16     '''
  17     IE_DESC = 'Channel 9'
  18     IE_NAME = 'channel9'
  19     _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
  20
  21     _TESTS = [
  22         {
  23             'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
  24             'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
  25             'info_dict': {
  26                 'id': 'Events/TechEd/Australia/2013/KOS002',
  27                 'ext': 'mp4',
  28                 'title': 'Developer Kick-Off Session: Stuff We Love',
  29                 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
  30                 'duration': 4576,
  31                 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
  32                 'session_code': 'KOS002',
  33                 'session_day': 'Day 1',
  34                 'session_room': 'Arena 1A',
  35                 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
  36             },
  37         },
  38         {
  39             'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
  40             'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
  41             'info_dict': {
  42                 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
  43                 'ext': 'mp4',
  44                 'title': 'Self-service BI with Power BI - nuclear testing',
  45                 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
  46                 'duration': 1540,
  47                 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
  48                 'authors': ['Mike Wilmot'],
  49             },
  50         }
  51     ]
  52
  53     _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
  54
  55     # Sorted by quality
  56     _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
  57
  58     def _restore_bytes(self, formatted_size):
  59         if not formatted_size:
  60             return 0
  61         m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
  62         if not m:
  63             return 0
  64         units = m.group('units')
  65         try:
  66             exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
  67         except ValueError:
  68             return 0
  69         size = float(m.group('size'))
  70         return int(size * (1024 ** exponent))
  71
  72     def _formats_from_html(self, html):
  73         FORMAT_REGEX = r'''
  74             (?x)
  75             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
  76             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
  77             (?:<div\s+class="popup\s+rounded">\s*
  78             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
  79             </div>)?                                                # File size part may be missing
  80         '''
  81         # Extract known formats
  82         formats = [{
  83             'url': x.group('url'),
  84             'format_id': x.group('quality'),
  85             'format_note': x.group('note'),
  86             'format': '%s (%s)' % (x.group('quality'), x.group('note')),
  87             'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
  88             'preference': self._known_formats.index(x.group('quality')),
  89             'vcodec': 'none' if x.group('note') == 'Audio only' else None,
  90         } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
  91
  92         self._sort_formats(formats)
  93
  94         return formats
  95
  96     def _extract_title(self, html):
  97         title = self._html_search_meta('title', html, 'title')
  98         if title is None:
  99             title = self._og_search_title(html)
 100             TITLE_SUFFIX = ' (Channel 9)'
 101             if title is not None and title.endswith(TITLE_SUFFIX):
 102                 title = title[:-len(TITLE_SUFFIX)]
 103         return title
 104
 105     def _extract_description(self, html):
 106         DESCRIPTION_REGEX = r'''(?sx)
 107             <div\s+class="entry-content">\s*
 108             <div\s+id="entry-body">\s*
 109             (?P<description>.+?)\s*
 110             </div>\s*
 111             </div>
 112         '''
 113         m = re.search(DESCRIPTION_REGEX, html)
 114         if m is not None:
 115             return m.group('description')
 116         return self._html_search_meta('description', html, 'description')
 117
 118     def _extract_duration(self, html):
 119         m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
 120         return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
 121
 122     def _extract_slides(self, html):
 123         m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
 124         return m.group('slidesurl') if m is not None else None
 125
 126     def _extract_zip(self, html):
 127         m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
 128         return m.group('zipurl') if m is not None else None
 129
 130     def _extract_avg_rating(self, html):
 131         m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
 132         return float(m.group('avgrating')) if m is not None else 0
 133
 134     def _extract_rating_count(self, html):
 135         m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
 136         return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
 137
 138     def _extract_view_count(self, html):
 139         m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
 140         return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
 141
 142     def _extract_comment_count(self, html):
 143         m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
 144         return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
 145
 146     def _fix_count(self, count):
 147         return int(str(count).replace(',', '')) if count is not None else None
 148
 149     def _extract_authors(self, html):
 150         m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
 151         if m is None:
 152             return None
 153         return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
 154
 155     def _extract_session_code(self, html):
 156         m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
 157         return m.group('code') if m is not None else None
 158
 159     def _extract_session_day(self, html):
 160         m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
 161         return m.group('day') if m is not None else None
 162
 163     def _extract_session_room(self, html):
 164         m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
 165         return m.group('room') if m is not None else None
 166
 167     def _extract_session_speakers(self, html):
 168         return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
 169
 170     def _extract_content(self, html, content_path):
 171         # Look for downloadable content
 172         formats = self._formats_from_html(html)
 173         slides = self._extract_slides(html)
 174         zip_ = self._extract_zip(html)
 175
 176         # Nothing to download
 177         if len(formats) == 0 and slides is None and zip_ is None:
 178             self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
 179             return
 180
 181         # Extract meta
 182         title = self._extract_title(html)
 183         description = self._extract_description(html)
 184         thumbnail = self._og_search_thumbnail(html)
 185         duration = self._extract_duration(html)
 186         avg_rating = self._extract_avg_rating(html)
 187         rating_count = self._extract_rating_count(html)
 188         view_count = self._extract_view_count(html)
 189         comment_count = self._extract_comment_count(html)
 190
 191         common = {
 192             '_type': 'video',
 193             'id': content_path,
 194             'description': description,
 195             'thumbnail': thumbnail,
 196             'duration': duration,
 197             'avg_rating': avg_rating,
 198             'rating_count': rating_count,
 199             'view_count': view_count,
 200             'comment_count': comment_count,
 201         }
 202
 203         result = []
 204
 205         if slides is not None:
 206             d = common.copy()
 207             d.update({'title': title + '-Slides', 'url': slides})
 208             result.append(d)
 209
 210         if zip_ is not None:
 211             d = common.copy()
 212             d.update({'title': title + '-Zip', 'url': zip_})
 213             result.append(d)
 214
 215         if len(formats) > 0:
 216             d = common.copy()
 217             d.update({'title': title, 'formats': formats})
 218             result.append(d)
 219
 220         return result
 221
 222     def _extract_entry_item(self, html, content_path):
 223         contents = self._extract_content(html, content_path)
 224         if contents is None:
 225             return contents
 226
 227         authors = self._extract_authors(html)
 228
 229         for content in contents:
 230             content['authors'] = authors
 231
 232         return contents
 233
 234     def _extract_session(self, html, content_path):
 235         contents = self._extract_content(html, content_path)
 236         if contents is None:
 237             return contents
 238
 239         session_meta = {
 240             'session_code': self._extract_session_code(html),
 241             'session_day': self._extract_session_day(html),
 242             'session_room': self._extract_session_room(html),
 243             'session_speakers': self._extract_session_speakers(html),
 244         }
 245
 246         for content in contents:
 247             content.update(session_meta)
 248
 249         return self.playlist_result(contents)
 250
 251     def _extract_list(self, content_path):
 252         rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
 253         entries = [self.url_result(session_url.text, 'Channel9')
 254                    for session_url in rss.findall('./channel/item/link')]
 255         title_text = rss.find('./channel/title').text
 256         return self.playlist_result(entries, content_path, title_text)
 257
 258     def _real_extract(self, url):
 259         mobj = re.match(self._VALID_URL, url)
 260         content_path = mobj.group('contentpath')
 261
 262         webpage = self._download_webpage(url, content_path, 'Downloading web page')
 263
 264         page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
 265         if page_type_m is not None:
 266             page_type = page_type_m.group('pagetype')
 267             if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
 268                 return self._extract_entry_item(webpage, content_path)
 269             elif page_type == 'Session':  # Event session page, may contain downloadable content
 270                 return self._extract_session(webpage, content_path)
 271             elif page_type == 'Event':
 272                 return self._extract_list(content_path)
 273             else:
 274                 raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
 275
 276         else:  # Assuming list
 277             return self._extract_list(content_path)