d/control: Add build-dependency on dh-python as per warning from dh_python2.

[youtubedl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index e58184adcf4b329c85ddb8bde2f352e383404ed8..aacb999ce9044ba07af7597d03b34e1aa37ebddd 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -49,6 +49,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
              # YouTube sets the expire time to about two months
              expire_time=time.time() + 2 * 30 * 24 * 3600)
  
+    def _ids_to_results(self, ids):
+        return [
+            self.url_result(vid_id, 'Youtube', video_id=vid_id)
+            for vid_id in ids]
+
      def _login(self):
          """
          Attempt to log in to YouTube.
@@ -1121,12 +1126,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      self.report_warning(
                          'Skipping DASH manifest: %r' % e, video_id)
                  else:
-                    # Hide the formats we found through non-DASH
+                    # Remove the formats we found through non-DASH, they
+                    # contain less info and it can be wrong, because we use
+                    # fixed values (for example the resolution). See
+                    # https://github.com/rg3/youtube-dl/issues/5774 for an
+                    # example.
                      dash_keys = set(df['format_id'] for df in dash_formats)
-                    for f in formats:
-                        if f['format_id'] in dash_keys:
-                            f['format_id'] = 'nondash-%s' % f['format_id']
-                            f['preference'] = f.get('preference', 0) - 10000
+                    formats = [f for f in formats if f['format_id'] not in dash_keys]
                      formats.extend(dash_formats)
  
          # Check for malformed aspect ratio
@@ -1261,11 +1267,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
      def _real_initialize(self):
          self._login()
  
-    def _ids_to_results(self, ids):
-        return [
-            self.url_result(vid_id, 'Youtube', video_id=vid_id)
-            for vid_id in ids]
-
      def _extract_mix(self, playlist_id):
          # The mixes are generated from a single video
          # the id of the playlist is just 'RD' + video_id
@@ -1398,6 +1399,22 @@ class YoutubeChannelIE(InfoExtractor):
          channel_id = self._match_id(url)
  
          url = self._TEMPLATE_URL % channel_id
+
+        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
+        # otherwise fallback on channel by page extraction
+        channel_page = self._download_webpage(
+            url + '?view=57', channel_id,
+            'Downloading channel page', fatal=False)
+        channel_playlist_id = self._search_regex(
+            [r'<meta itemprop="channelId" content="([^"]+)">',
+             r'data-channel-external-id="([^"]+)"'],
+            channel_page, 'channel id', default=None)
+        if channel_playlist_id and channel_playlist_id.startswith('UC'):
+            playlist_id = 'UU' + channel_playlist_id[2:]
+            return self.url_result(
+                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
          channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
          autogenerated = re.search(r'''(?x)
                  class="[^"]*?(?:
@@ -1601,20 +1618,10 @@ class YoutubeShowIE(InfoExtractor):
  
  class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
      """
-    Base class for extractors that fetch info from
-    http://www.youtube.com/feed_ajax
+    Base class for feed extractors
      Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
      """
      _LOGIN_REQUIRED = True
-    # use action_load_personal_feed instead of action_load_system_feed
-    _PERSONAL_FEED = False
-
-    @property
-    def _FEED_TEMPLATE(self):
-        action = 'action_load_system_feed'
-        if self._PERSONAL_FEED:
-            action = 'action_load_personal_feed'
-        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
  
      @property
      def IE_NAME(self):
@@ -1624,67 +1631,23 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
          self._login()
  
      def _real_extract(self, url):
-        feed_entries = []
-        paging = 0
-        for i in itertools.count(1):
-            info = self._download_json(
-                self._FEED_TEMPLATE % paging,
-                '%s feed' % self._FEED_NAME,
-                'Downloading page %s' % i,
-                transform_source=uppercase_escape)
-            feed_html = info.get('feed_html') or info.get('content_html')
-            load_more_widget_html = info.get('load_more_widget_html') or feed_html
-            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
-            ids = orderedSet(m.group(1) for m in m_ids)
-            feed_entries.extend(
-                self.url_result(video_id, 'Youtube', video_id=video_id)
-                for video_id in ids)
-            mobj = re.search(
-                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
-                load_more_widget_html)
-            if mobj is None:
-                break
-            paging = mobj.group('paging')
-        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
-
-
-class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
-    IE_NAME = 'youtube:recommended'
-    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
-    _FEED_NAME = 'recommended'
-    _PLAYLIST_TITLE = 'Youtube Recommended videos'
-
-
-class YoutubeWatchLaterIE(YoutubePlaylistIE):
-    IE_NAME = 'youtube:watchlater'
-    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
-
-    _TESTS = []  # override PlaylistIE tests
-
-    def _real_extract(self, url):
-        return self._extract_playlist('WL')
-
-
-class YoutubeHistoryIE(YoutubePlaylistIE):
-    IE_NAME = 'youtube:history'
-    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
-    _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
-    _TESTS = []
-
-    def _real_extract(self, url):
-        title = 'Youtube History'
-        page = self._download_webpage('https://www.youtube.com/feed/history', title)
+        page = self._download_webpage(
+            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
  
          # The extraction process is the same as for playlists, but the regex
          # for the video ids doesn't contain an index
          ids = []
          more_widget_html = content_html = page
-
          for page_num in itertools.count(1):
              matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-            new_ids = orderedSet(matches)
+
+            # 'recommended' feed has infinite 'load more' and each new portion spins
+            # the same videos in (sometimes) slightly different order, so we'll check
+            # for unicity and break when portion has no new videos
+            new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+            if not new_ids:
+                break
+
              ids.extend(new_ids)
  
              mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
@@ -1692,17 +1655,25 @@ class YoutubeHistoryIE(YoutubePlaylistIE):
                  break
  
              more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), title,
+                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                  'Downloading page #%s' % page_num,
                  transform_source=uppercase_escape)
              content_html = more['content_html']
              more_widget_html = more['load_more_widget_html']
  
-        return {
-            '_type': 'playlist',
-            'title': title,
-            'entries': self._ids_to_results(ids),
-        }
+        return self.playlist_result(
+            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
+
+
+class YoutubeWatchLaterIE(YoutubePlaylistIE):
+    IE_NAME = 'youtube:watchlater'
+    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
+
+    _TESTS = []  # override PlaylistIE tests
+
+    def _real_extract(self, url):
+        return self._extract_playlist('WL')
  
  
  class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
@@ -1717,42 +1688,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
          return self.url_result(playlist_id, 'YoutubePlaylist')
  
  
-class YoutubeSubscriptionsIE(YoutubePlaylistIE):
-    IE_NAME = 'youtube:subscriptions'
-    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
-    _TESTS = []
-
-    def _real_extract(self, url):
-        title = 'Youtube Subscriptions'
-        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
-
-        # The extraction process is the same as for playlists, but the regex
-        # for the video ids doesn't contain an index
-        ids = []
-        more_widget_html = content_html = page
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+    _FEED_NAME = 'recommended'
+    _PLAYLIST_TITLE = 'Youtube Recommended videos'
  
-        for page_num in itertools.count(1):
-            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-            new_ids = orderedSet(matches)
-            ids.extend(new_ids)
  
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    _FEED_NAME = 'subscriptions'
+    _PLAYLIST_TITLE = 'Youtube Subscriptions'
  
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), title,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
-            content_html = more['content_html']
-            more_widget_html = more['load_more_widget_html']
  
-        return {
-            '_type': 'playlist',
-            'title': title,
-            'entries': self._ids_to_results(ids),
-        }
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
+    _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
+    _FEED_NAME = 'history'
+    _PLAYLIST_TITLE = 'Youtube History'
  
  
  class YoutubeTruncatedURLIE(InfoExtractor):