X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/0626de37fa3331c654d4affae363132a0d766ffb..20205e79eb4216762c923790b27dbe38f945293a:/youtube_dl/extractor/youtube.py diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ce2212a..02f3ab6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -77,7 +77,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _set_language(self): self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&hl=en', + '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en', # YouTube sets the expire time to about two months expire_time=time.time() + 2 * 30 * 24 * 3600) @@ -303,7 +303,7 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), transform_source=uppercase_escape, @@ -1264,7 +1264,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, - } + }, + { + # empty description results in an empty string + 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k', + 'info_dict': { + 'id': 'x41yOUIvK2k', + 'ext': 'mp4', + 'title': 'IMG 3456', + 'description': '', + 'upload_date': '20170613', + 'uploader_id': 'ElevageOrVert', + 'uploader': 'ElevageOrVert', + }, + 'params': { + 'skip_download': True, + }, + }, ] def __init__(self, *args, **kwargs): @@ -1384,7 +1400,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', - r'\b(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', @@ -1825,7 +1841,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Get video info video_info = {} embed_webpage = None - if re.search(r'player-age-gate-content">', video_webpage) is not None: + if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+' + or re.search(r'player-age-gate-content">', video_webpage) is not None): age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube @@ -1930,7 +1947,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ''', replace_url, video_description) video_description = clean_html(video_description) else: - video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription') + video_description = video_details.get('shortDescription') + if video_description is None: + video_description = self._html_search_meta('description', video_webpage) if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): @@ -2356,17 +2375,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): m_cat_container = self._search_regex( r'(?s)]*>\s*Category\s*\s*]*>(.*?)', video_webpage, 'categories', default=None) + category = None if m_cat_container: category = self._html_search_regex( r'(?s)(.*?)', m_cat_container, 'category', default=None) - video_categories = None if category is None else [category] - else: - video_categories = None + if not category: + category = try_get( + microformat, lambda x: x['category'], compat_str) + video_categories = None if category is None else [category] video_tags = [ unescapeHTML(m.group('content')) for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + if not video_tags: + video_tags = try_get(video_details, lambda x: x['keywords'], list) def _extract_count(count_name): return str_to_int(self._search_regex( @@ -2772,7 +2795,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): ids = [] last_id = playlist_id[-11:] for n in itertools.count(1): - url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) + url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) webpage = self._download_webpage( url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) new_ids = orderedSet(re.findall( @@ -3004,7 +3027,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?Puser|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P[A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?Puser|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P[A-Za-z0-9_%-]+)' _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' IE_NAME = 'youtube:user' @@ -3034,6 +3057,9 @@ class YoutubeUserIE(YoutubeChannelIE): }, { 'url': 'https://www.youtube.com/c/gametrailers', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak', + 'only_matching': True, }, { 'url': 'https://www.youtube.com/gametrailers', 'only_matching': True, @@ -3112,7 +3138,7 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor): class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P[^/]+)/playlists' + _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P[^/]+)/playlists' IE_NAME = 'youtube:playlists' _TESTS = [{ @@ -3138,6 +3164,9 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'title': 'Chem Player', }, 'skip': 'Blocked', + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, }] @@ -3282,7 +3311,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): break more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape, headers=self._YOUTUBE_CLIENT_HEADERS)