Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/channel9.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8     parse_filesize,
   9     qualities,
  10 )
  11
  12
  13 class Channel9IE(InfoExtractor):
  14     '''
  15     Common extractor for channel9.msdn.com.
  16
  17     The type of provided URL (video or playlist) is determined according to
  18     meta Search.PageType from web page HTML rather than URL itself, as it is
  19     not always possible to do.
  20     '''
  21     IE_DESC = 'Channel 9'
  22     IE_NAME = 'channel9'
  23     _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
  24
  25     _TESTS = [
  26         {
  27             'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
  28             'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
  29             'info_dict': {
  30                 'id': 'Events/TechEd/Australia/2013/KOS002',
  31                 'ext': 'mp4',
  32                 'title': 'Developer Kick-Off Session: Stuff We Love',
  33                 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
  34                 'duration': 4576,
  35                 'thumbnail': 're:http://.*\.jpg',
  36                 'session_code': 'KOS002',
  37                 'session_day': 'Day 1',
  38                 'session_room': 'Arena 1A',
  39                 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
  40             },
  41         },
  42         {
  43             'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
  44             'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
  45             'info_dict': {
  46                 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
  47                 'ext': 'mp4',
  48                 'title': 'Self-service BI with Power BI - nuclear testing',
  49                 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
  50                 'duration': 1540,
  51                 'thumbnail': 're:http://.*\.jpg',
  52                 'authors': ['Mike Wilmot'],
  53             },
  54         },
  55         {
  56             # low quality mp4 is best
  57             'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
  58             'info_dict': {
  59                 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
  60                 'ext': 'mp4',
  61                 'title': 'Ranges for the Standard Library',
  62                 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
  63                 'duration': 5646,
  64                 'thumbnail': 're:http://.*\.jpg',
  65             },
  66             'params': {
  67                 'skip_download': True,
  68             },
  69         }
  70     ]
  71
  72     _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
  73
  74     def _formats_from_html(self, html):
  75         FORMAT_REGEX = r'''
  76             (?x)
  77             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
  78             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
  79             (?:<div\s+class="popup\s+rounded">\s*
  80             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
  81             </div>)?                                                # File size part may be missing
  82         '''
  83         quality = qualities((
  84             'MP3', 'MP4',
  85             'Low Quality WMV', 'Low Quality MP4',
  86             'Mid Quality WMV', 'Mid Quality MP4',
  87             'High Quality WMV', 'High Quality MP4'))
  88         formats = [{
  89             'url': x.group('url'),
  90             'format_id': x.group('quality'),
  91             'format_note': x.group('note'),
  92             'format': '%s (%s)' % (x.group('quality'), x.group('note')),
  93             'filesize_approx': parse_filesize(x.group('filesize')),
  94             'quality': quality(x.group('quality')),
  95             'vcodec': 'none' if x.group('note') == 'Audio only' else None,
  96         } for x in list(re.finditer(FORMAT_REGEX, html))]
  97
  98         self._sort_formats(formats)
  99
 100         return formats
 101
 102     def _extract_title(self, html):
 103         title = self._html_search_meta('title', html, 'title')
 104         if title is None:
 105             title = self._og_search_title(html)
 106             TITLE_SUFFIX = ' (Channel 9)'
 107             if title is not None and title.endswith(TITLE_SUFFIX):
 108                 title = title[:-len(TITLE_SUFFIX)]
 109         return title
 110
 111     def _extract_description(self, html):
 112         DESCRIPTION_REGEX = r'''(?sx)
 113             <div\s+class="entry-content">\s*
 114             <div\s+id="entry-body">\s*
 115             (?P<description>.+?)\s*
 116             </div>\s*
 117             </div>
 118         '''
 119         m = re.search(DESCRIPTION_REGEX, html)
 120         if m is not None:
 121             return m.group('description')
 122         return self._html_search_meta('description', html, 'description')
 123
 124     def _extract_duration(self, html):
 125         m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
 126         return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
 127
 128     def _extract_slides(self, html):
 129         m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
 130         return m.group('slidesurl') if m is not None else None
 131
 132     def _extract_zip(self, html):
 133         m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
 134         return m.group('zipurl') if m is not None else None
 135
 136     def _extract_avg_rating(self, html):
 137         m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
 138         return float(m.group('avgrating')) if m is not None else 0
 139
 140     def _extract_rating_count(self, html):
 141         m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
 142         return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
 143
 144     def _extract_view_count(self, html):
 145         m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
 146         return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
 147
 148     def _extract_comment_count(self, html):
 149         m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
 150         return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
 151
 152     def _fix_count(self, count):
 153         return int(str(count).replace(',', '')) if count is not None else None
 154
 155     def _extract_authors(self, html):
 156         m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
 157         if m is None:
 158             return None
 159         return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
 160
 161     def _extract_session_code(self, html):
 162         m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
 163         return m.group('code') if m is not None else None
 164
 165     def _extract_session_day(self, html):
 166         m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
 167         return m.group('day').strip() if m is not None else None
 168
 169     def _extract_session_room(self, html):
 170         m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
 171         return m.group('room') if m is not None else None
 172
 173     def _extract_session_speakers(self, html):
 174         return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
 175
 176     def _extract_content(self, html, content_path):
 177         # Look for downloadable content
 178         formats = self._formats_from_html(html)
 179         slides = self._extract_slides(html)
 180         zip_ = self._extract_zip(html)
 181
 182         # Nothing to download
 183         if len(formats) == 0 and slides is None and zip_ is None:
 184             self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
 185             return
 186
 187         # Extract meta
 188         title = self._extract_title(html)
 189         description = self._extract_description(html)
 190         thumbnail = self._og_search_thumbnail(html)
 191         duration = self._extract_duration(html)
 192         avg_rating = self._extract_avg_rating(html)
 193         rating_count = self._extract_rating_count(html)
 194         view_count = self._extract_view_count(html)
 195         comment_count = self._extract_comment_count(html)
 196
 197         common = {
 198             '_type': 'video',
 199             'id': content_path,
 200             'description': description,
 201             'thumbnail': thumbnail,
 202             'duration': duration,
 203             'avg_rating': avg_rating,
 204             'rating_count': rating_count,
 205             'view_count': view_count,
 206             'comment_count': comment_count,
 207         }
 208
 209         result = []
 210
 211         if slides is not None:
 212             d = common.copy()
 213             d.update({'title': title + '-Slides', 'url': slides})
 214             result.append(d)
 215
 216         if zip_ is not None:
 217             d = common.copy()
 218             d.update({'title': title + '-Zip', 'url': zip_})
 219             result.append(d)
 220
 221         if len(formats) > 0:
 222             d = common.copy()
 223             d.update({'title': title, 'formats': formats})
 224             result.append(d)
 225
 226         return result
 227
 228     def _extract_entry_item(self, html, content_path):
 229         contents = self._extract_content(html, content_path)
 230         if contents is None:
 231             return contents
 232
 233         if len(contents) > 1:
 234             raise ExtractorError('Got more than one entry')
 235         result = contents[0]
 236         result['authors'] = self._extract_authors(html)
 237
 238         return result
 239
 240     def _extract_session(self, html, content_path):
 241         contents = self._extract_content(html, content_path)
 242         if contents is None:
 243             return contents
 244
 245         session_meta = {
 246             'session_code': self._extract_session_code(html),
 247             'session_day': self._extract_session_day(html),
 248             'session_room': self._extract_session_room(html),
 249             'session_speakers': self._extract_session_speakers(html),
 250         }
 251
 252         for content in contents:
 253             content.update(session_meta)
 254
 255         return self.playlist_result(contents)
 256
 257     def _extract_list(self, content_path):
 258         rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
 259         entries = [self.url_result(session_url.text, 'Channel9')
 260                    for session_url in rss.findall('./channel/item/link')]
 261         title_text = rss.find('./channel/title').text
 262         return self.playlist_result(entries, content_path, title_text)
 263
 264     def _real_extract(self, url):
 265         mobj = re.match(self._VALID_URL, url)
 266         content_path = mobj.group('contentpath')
 267
 268         webpage = self._download_webpage(url, content_path, 'Downloading web page')
 269
 270         page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
 271         if page_type_m is not None:
 272             page_type = page_type_m.group('pagetype')
 273             if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
 274                 return self._extract_entry_item(webpage, content_path)
 275             elif page_type == 'Session':  # Event session page, may contain downloadable content
 276                 return self._extract_session(webpage, content_path)
 277             elif page_type == 'Event':
 278                 return self._extract_list(content_path)
 279             else:
 280                 raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
 281
 282         else:  # Assuming list
 283             return self._extract_list(content_path)