X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/540fd68c40df72763aee5d75598675c45cfa9aba..67afe88251ff8b1fcb43246c9a4bb1a0ea0185a2:/youtube_dl/extractor/youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index e58184a..aacb999 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -49,6 +49,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# YouTube sets the expire time to about two months
expire_time=time.time() + 2 * 30 * 24 * 3600)
+ def _ids_to_results(self, ids):
+ return [
+ self.url_result(vid_id, 'Youtube', video_id=vid_id)
+ for vid_id in ids]
+
def _login(self):
"""
Attempt to log in to YouTube.
@@ -1121,12 +1126,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self.report_warning(
'Skipping DASH manifest: %r' % e, video_id)
else:
- # Hide the formats we found through non-DASH
+ # Remove the formats we found through non-DASH, they
+ # contain less info and it can be wrong, because we use
+ # fixed values (for example the resolution). See
+ # https://github.com/rg3/youtube-dl/issues/5774 for an
+ # example.
dash_keys = set(df['format_id'] for df in dash_formats)
- for f in formats:
- if f['format_id'] in dash_keys:
- f['format_id'] = 'nondash-%s' % f['format_id']
- f['preference'] = f.get('preference', 0) - 10000
+ formats = [f for f in formats if f['format_id'] not in dash_keys]
formats.extend(dash_formats)
# Check for malformed aspect ratio
@@ -1261,11 +1267,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
def _real_initialize(self):
self._login()
- def _ids_to_results(self, ids):
- return [
- self.url_result(vid_id, 'Youtube', video_id=vid_id)
- for vid_id in ids]
-
def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
@@ -1398,6 +1399,22 @@ class YoutubeChannelIE(InfoExtractor):
channel_id = self._match_id(url)
url = self._TEMPLATE_URL % channel_id
+
+ # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+ # Workaround by extracting as a playlist if managed to obtain channel playlist URL
+ # otherwise fallback on channel by page extraction
+ channel_page = self._download_webpage(
+ url + '?view=57', channel_id,
+ 'Downloading channel page', fatal=False)
+ channel_playlist_id = self._search_regex(
+ [r'',
+ r'data-channel-external-id="([^"]+)"'],
+ channel_page, 'channel id', default=None)
+ if channel_playlist_id and channel_playlist_id.startswith('UC'):
+ playlist_id = 'UU' + channel_playlist_id[2:]
+ return self.url_result(
+ compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
autogenerated = re.search(r'''(?x)
class="[^"]*?(?:
@@ -1601,20 +1618,10 @@ class YoutubeShowIE(InfoExtractor):
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
"""
- Base class for extractors that fetch info from
- http://www.youtube.com/feed_ajax
+ Base class for feed extractors
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
"""
_LOGIN_REQUIRED = True
- # use action_load_personal_feed instead of action_load_system_feed
- _PERSONAL_FEED = False
-
- @property
- def _FEED_TEMPLATE(self):
- action = 'action_load_system_feed'
- if self._PERSONAL_FEED:
- action = 'action_load_personal_feed'
- return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
@property
def IE_NAME(self):
@@ -1624,67 +1631,23 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
self._login()
def _real_extract(self, url):
- feed_entries = []
- paging = 0
- for i in itertools.count(1):
- info = self._download_json(
- self._FEED_TEMPLATE % paging,
- '%s feed' % self._FEED_NAME,
- 'Downloading page %s' % i,
- transform_source=uppercase_escape)
- feed_html = info.get('feed_html') or info.get('content_html')
- load_more_widget_html = info.get('load_more_widget_html') or feed_html
- m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
- ids = orderedSet(m.group(1) for m in m_ids)
- feed_entries.extend(
- self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in ids)
- mobj = re.search(
- r'data-uix-load-more-href="/?[^"]+paging=(?P\d+)',
- load_more_widget_html)
- if mobj is None:
- break
- paging = mobj.group('paging')
- return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
-
-
-class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
- IE_NAME = 'youtube:recommended'
- IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
- _FEED_NAME = 'recommended'
- _PLAYLIST_TITLE = 'Youtube Recommended videos'
-
-
-class YoutubeWatchLaterIE(YoutubePlaylistIE):
- IE_NAME = 'youtube:watchlater'
- IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
-
- _TESTS = [] # override PlaylistIE tests
-
- def _real_extract(self, url):
- return self._extract_playlist('WL')
-
-
-class YoutubeHistoryIE(YoutubePlaylistIE):
- IE_NAME = 'youtube:history'
- IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
- _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
- _TESTS = []
-
- def _real_extract(self, url):
- title = 'Youtube History'
- page = self._download_webpage('https://www.youtube.com/feed/history', title)
+ page = self._download_webpage(
+ 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
# The extraction process is the same as for playlists, but the regex
# for the video ids doesn't contain an index
ids = []
more_widget_html = content_html = page
-
for page_num in itertools.count(1):
matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
- new_ids = orderedSet(matches)
+
+ # 'recommended' feed has infinite 'load more' and each new portion spins
+ # the same videos in (sometimes) slightly different order, so we'll check
+ # for unicity and break when portion has no new videos
+ new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+ if not new_ids:
+ break
+
ids.extend(new_ids)
mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html)
@@ -1692,17 +1655,25 @@ class YoutubeHistoryIE(YoutubePlaylistIE):
break
more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), title,
+ 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
'Downloading page #%s' % page_num,
transform_source=uppercase_escape)
content_html = more['content_html']
more_widget_html = more['load_more_widget_html']
- return {
- '_type': 'playlist',
- 'title': title,
- 'entries': self._ids_to_results(ids),
- }
+ return self.playlist_result(
+ self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
+
+
+class YoutubeWatchLaterIE(YoutubePlaylistIE):
+ IE_NAME = 'youtube:watchlater'
+ IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
+ _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
+
+ _TESTS = [] # override PlaylistIE tests
+
+ def _real_extract(self, url):
+ return self._extract_playlist('WL')
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
@@ -1717,42 +1688,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
return self.url_result(playlist_id, 'YoutubePlaylist')
-class YoutubeSubscriptionsIE(YoutubePlaylistIE):
- IE_NAME = 'youtube:subscriptions'
- IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
- _TESTS = []
-
- def _real_extract(self, url):
- title = 'Youtube Subscriptions'
- page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
-
- # The extraction process is the same as for playlists, but the regex
- # for the video ids doesn't contain an index
- ids = []
- more_widget_html = content_html = page
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+ _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+ _FEED_NAME = 'recommended'
+ _PLAYLIST_TITLE = 'Youtube Recommended videos'
- for page_num in itertools.count(1):
- matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
- new_ids = orderedSet(matches)
- ids.extend(new_ids)
- mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html)
- if not mobj:
- break
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+ _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+ _FEED_NAME = 'subscriptions'
+ _PLAYLIST_TITLE = 'Youtube Subscriptions'
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), title,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
- content_html = more['content_html']
- more_widget_html = more['load_more_widget_html']
- return {
- '_type': 'playlist',
- 'title': title,
- 'entries': self._ids_to_results(ids),
- }
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
+ _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
+ _FEED_NAME = 'history'
+ _PLAYLIST_TITLE = 'Youtube History'
class YoutubeTruncatedURLIE(InfoExtractor):