Raphaël G. Git Repositories - youtubedl/blobdiff - youtube_dl/extractor/channel9.py
Merge tag 'upstream/2015.06.04.1'
[youtubedl] / youtube_dl / extractor / channel9.py
index ae70ea22967a8d880ba15fa1bb64f32904139094..3dfc24f5ba447ea92858e89868ad3684caf3a6d2 100644 (file)
@@ -1,48 +1,51 @@
-# encoding: utf-8
+from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
 from ..utils import ExtractorError
 
 
 import re
 
 from .common import InfoExtractor
 from ..utils import ExtractorError
 
+
 class Channel9IE(InfoExtractor):
     '''
     Common extractor for channel9.msdn.com.
 
     The type of provided URL (video or playlist) is determined according to
     meta Search.PageType from web page HTML rather than URL itself, as it is
 class Channel9IE(InfoExtractor):
     '''
     Common extractor for channel9.msdn.com.
 
     The type of provided URL (video or playlist) is determined according to
     meta Search.PageType from web page HTML rather than URL itself, as it is
-    not always possible to do.    
+    not always possible to do.
     '''
     '''
-    IE_DESC = u'Channel 9'
-    IE_NAME = u'channel9'
-    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
+    IE_DESC = 'Channel 9'
+    IE_NAME = 'channel9'
+    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
 
     _TESTS = [
         {
 
     _TESTS = [
         {
-            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
-            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
-            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
-            u'info_dict': {
-                u'title': u'Developer Kick-Off Session: Stuff We Love',
-                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
-                u'duration': 4576,
-                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
-                u'session_code': u'KOS002',
-                u'session_day': u'Day 1',
-                u'session_room': u'Arena 1A',
-                u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
+            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
+            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
+            'info_dict': {
+                'id': 'Events/TechEd/Australia/2013/KOS002',
+                'ext': 'mp4',
+                'title': 'Developer Kick-Off Session: Stuff We Love',
+                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
+                'duration': 4576,
+                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
+                'session_code': 'KOS002',
+                'session_day': 'Day 1',
+                'session_room': 'Arena 1A',
+                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
             },
         },
         {
             },
         },
         {
-            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
-            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
-            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
-            u'info_dict': {
-                u'title': u'Self-service BI with Power BI - nuclear testing',
-                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
-                u'duration': 1540,
-                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
-                u'authors': [ u'Mike Wilmot' ],
+            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
+            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
+            'info_dict': {
+                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
+                'ext': 'mp4',
+                'title': 'Self-service BI with Power BI - nuclear testing',
+                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
+                'duration': 1540,
+                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
+                'authors': ['Mike Wilmot'],
             },
         }
     ]
             },
         }
     ]
@@ -60,7 +63,7 @@ class Channel9IE(InfoExtractor):
             return 0
         units = m.group('units')
         try:
             return 0
         units = m.group('units')
         try:
-            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
+            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
         except ValueError:
             return 0
         size = float(m.group('size'))
         except ValueError:
             return 0
         size = float(m.group('size'))
@@ -76,21 +79,25 @@ class Channel9IE(InfoExtractor):
             </div>)?                                                # File size part may be missing
         '''
         # Extract known formats
             </div>)?                                                # File size part may be missing
         '''
         # Extract known formats
-        formats = [{'url': x.group('url'),
-                 'format_id': x.group('quality'),
-                 'format_note': x.group('note'),
-                 'format': '%s (%s)' % (x.group('quality'), x.group('note')), 
-                 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
-                 } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
-        # Sort according to known formats list
-        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
+        formats = [{
+            'url': x.group('url'),
+            'format_id': x.group('quality'),
+            'format_note': x.group('note'),
+            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
+            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
+            'preference': self._known_formats.index(x.group('quality')),
+            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
+        } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
+
+        self._sort_formats(formats)
+
         return formats
 
     def _extract_title(self, html):
         return formats
 
     def _extract_title(self, html):
-        title = self._html_search_meta(u'title', html, u'title')
-        if title is None:           
+        title = self._html_search_meta('title', html, 'title')
+        if title is None:
             title = self._og_search_title(html)
             title = self._og_search_title(html)
-            TITLE_SUFFIX = u' (Channel 9)'
+            TITLE_SUFFIX = ' (Channel 9)'
             if title is not None and title.endswith(TITLE_SUFFIX):
                 title = title[:-len(TITLE_SUFFIX)]
         return title
             if title is not None and title.endswith(TITLE_SUFFIX):
                 title = title[:-len(TITLE_SUFFIX)]
         return title
@@ -106,10 +113,10 @@ class Channel9IE(InfoExtractor):
         m = re.search(DESCRIPTION_REGEX, html)
         if m is not None:
             return m.group('description')
         m = re.search(DESCRIPTION_REGEX, html)
         if m is not None:
             return m.group('description')
-        return self._html_search_meta(u'description', html, u'description')
+        return self._html_search_meta('description', html, 'description')
 
     def _extract_duration(self, html):
 
     def _extract_duration(self, html):
-        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
+        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
         return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
 
     def _extract_slides(self, html):
         return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
 
     def _extract_slides(self, html):
@@ -161,14 +168,14 @@ class Channel9IE(InfoExtractor):
         return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
 
     def _extract_content(self, html, content_path):
         return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
 
     def _extract_content(self, html, content_path):
-        # Look for downloadable content        
+        # Look for downloadable content
         formats = self._formats_from_html(html)
         slides = self._extract_slides(html)
         zip_ = self._extract_zip(html)
 
         # Nothing to download
         if len(formats) == 0 and slides is None and zip_ is None:
         formats = self._formats_from_html(html)
         slides = self._extract_slides(html)
         zip_ = self._extract_zip(html)
 
         # Nothing to download
         if len(formats) == 0 and slides is None and zip_ is None:
-            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
+            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
             return
 
         # Extract meta
             return
 
         # Extract meta
@@ -181,32 +188,33 @@ class Channel9IE(InfoExtractor):
         view_count = self._extract_view_count(html)
         comment_count = self._extract_comment_count(html)
 
         view_count = self._extract_view_count(html)
         comment_count = self._extract_comment_count(html)
 
-        common = {'_type': 'video',
-                  'id': content_path,
-                  'description': description,
-                  'thumbnail': thumbnail,
-                  'duration': duration,
-                  'avg_rating': avg_rating,
-                  'rating_count': rating_count,
-                  'view_count': view_count,
-                  'comment_count': comment_count,
-                }
+        common = {
+            '_type': 'video',
+            'id': content_path,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'avg_rating': avg_rating,
+            'rating_count': rating_count,
+            'view_count': view_count,
+            'comment_count': comment_count,
+        }
 
         result = []
 
         if slides is not None:
             d = common.copy()
 
         result = []
 
         if slides is not None:
             d = common.copy()
-            d.update({ 'title': title + '-Slides', 'url': slides })
+            d.update({'title': title + '-Slides', 'url': slides})
             result.append(d)
 
         if zip_ is not None:
             d = common.copy()
             result.append(d)
 
         if zip_ is not None:
             d = common.copy()
-            d.update({ 'title': title + '-Zip', 'url': zip_ })
+            d.update({'title': title + '-Zip', 'url': zip_})
             result.append(d)
 
         if len(formats) > 0:
             d = common.copy()
             result.append(d)
 
         if len(formats) > 0:
             d = common.copy()
-            d.update({ 'title': title, 'formats': formats })
+            d.update({'title': title, 'formats': formats})
             result.append(d)
 
         return result
             result.append(d)
 
         return result
@@ -228,19 +236,20 @@ class Channel9IE(InfoExtractor):
         if contents is None:
             return contents
 
         if contents is None:
             return contents
 
-        session_meta = {'session_code': self._extract_session_code(html),
-                        'session_day': self._extract_session_day(html),
-                        'session_room': self._extract_session_room(html),
-                        'session_speakers': self._extract_session_speakers(html),
-                        }
+        session_meta = {
+            'session_code': self._extract_session_code(html),
+            'session_day': self._extract_session_day(html),
+            'session_room': self._extract_session_room(html),
+            'session_speakers': self._extract_session_speakers(html),
+        }
 
         for content in contents:
             content.update(session_meta)
 
 
         for content in contents:
             content.update(session_meta)
 
-        return contents
+        return self.playlist_result(contents)
 
     def _extract_list(self, content_path):
 
     def _extract_list(self, content_path):
-        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
+        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
         entries = [self.url_result(session_url.text, 'Channel9')
                    for session_url in rss.findall('./channel/item/link')]
         title_text = rss.find('./channel/title').text
         entries = [self.url_result(session_url.text, 'Channel9')
                    for session_url in rss.findall('./channel/item/link')]
         title_text = rss.find('./channel/title').text
@@ -250,18 +259,19 @@ class Channel9IE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         content_path = mobj.group('contentpath')
 
         mobj = re.match(self._VALID_URL, url)
         content_path = mobj.group('contentpath')
 
-        webpage = self._download_webpage(url, content_path, u'Downloading web page')
-
-        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
-        if page_type_m is None:
-            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)
-
-        page_type = page_type_m.group('pagetype')
-        if page_type == 'List':         # List page, may contain list of 'item'-like objects
+        webpage = self._download_webpage(url, content_path, 'Downloading web page')
+
+        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
+        if page_type_m is not None:
+            page_type = page_type_m.group('pagetype')
+            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
+                return self._extract_entry_item(webpage, content_path)
+            elif page_type == 'Session':  # Event session page, may contain downloadable content
+                return self._extract_session(webpage, content_path)
+            elif page_type == 'Event':
+                return self._extract_list(content_path)
+            else:
+                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
+
+        else:  # Assuming list
             return self._extract_list(content_path)
             return self._extract_list(content_path)
-        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
-            return self._extract_entry_item(webpage, content_path)
-        elif page_type == 'Session':    # Event session page, may contain downloadable content
-            return self._extract_session(webpage, content_path)
-        else:
-            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)
\ No newline at end of file