X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/41fbda5a2400e16fa4926340df283c9c1d6b39a4..refs/heads/master:/youtube_dl/extractor/youtube.py?ds=inline diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 908defe..02f3ab6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -70,9 +70,14 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' + _YOUTUBE_CLIENT_HEADERS = { + 'x-youtube-client-name': '1', + 'x-youtube-client-version': '1.20200609.04.02', + } + def _set_language(self): self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&hl=en', + '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en', # YouTube sets the expire time to about two months expire_time=time.time() + 2 * 30 * 24 * 3600) @@ -298,10 +303,11 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), - transform_source=uppercase_escape) + transform_source=uppercase_escape, + headers=self._YOUTUBE_CLIENT_HEADERS) break except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): @@ -388,8 +394,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?invidious\.drycat\.fr/| (?:www\.)?tube\.poal\.co/| (?:www\.)?vid\.wxzm\.sx/| + (?:www\.)?yewtu\.be/| (?:www\.)?yt\.elukerio\.org/| (?:www\.)?yt\.lelux\.fi/| + (?:www\.)?invidious\.ggc-project\.de/| + (?:www\.)?yt\.maisputain\.ovh/| + (?:www\.)?invidious\.13ad\.de/| + (?:www\.)?invidious\.toot\.koeln/| + (?:www\.)?invidious\.fdn\.fr/| + (?:www\.)?watch\.nettohikari\.com/| (?:www\.)?kgg2m7yk5aybusll\.onion/| (?:www\.)?qklhadlycap4cnod\.onion/| (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| @@ -397,6 +410,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/| + (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -426,6 +440,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?(1).+)? # if we found the ID, everything can follow $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' + _PLAYER_INFO_RE = ( + r'/(?P[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P[a-z]+)$', + r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.(?P[a-z]+)$', + ) _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1227,6 +1245,42 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', 'only_matching': True, }, + { + # invalid -> valid video id redirection + 'url': 'DJztXj2GPfl', + 'info_dict': { + 'id': 'DJztXj2GPfk', + 'ext': 'mp4', + 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)', + 'description': 'md5:bf577a41da97918e94fa9798d9228825', + 'upload_date': '20090125', + 'uploader': 'Prochorowka', + 'uploader_id': 'Prochorowka', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka', + 'artist': 'Panjabi MC', + 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix', + 'album': 'Beware of the Boys (Mundian To Bach Ke)', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # empty description results in an empty string + 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k', + 'info_dict': { + 'id': 'x41yOUIvK2k', + 'ext': 'mp4', + 'title': 'IMG 3456', + 'description': '', + 'upload_date': '20170613', + 'uploader_id': 'ElevageOrVert', + 'uploader': 'ElevageOrVert', + }, + 'params': { + 'skip_download': True, + }, + }, ] def __init__(self, *args, **kwargs): @@ -1253,14 +1307,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): """ Return a string representation of a signature """ return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) - def _extract_signature_function(self, video_id, player_url, example_sig): - id_m = re.match( - r'.*?[-.](?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', - player_url) - if not id_m: + @classmethod + def _extract_player_info(cls, player_url): + for player_re in cls._PLAYER_INFO_RE: + id_m = re.search(player_re, player_url) + if id_m: + break + else: raise ExtractorError('Cannot identify player %r' % player_url) - player_type = id_m.group('ext') - player_id = id_m.group('id') + return id_m.group('ext'), id_m.group('id') + + def _extract_signature_function(self, video_id, player_url, example_sig): + player_type, player_id = self._extract_player_info(player_url) # Read from filesystem cache func_id = '%s_%s_%s' % ( @@ -1342,7 +1400,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', - r'\b(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', @@ -1616,8 +1674,63 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = mobj.group(2) return video_id + def _extract_chapters_from_json(self, webpage, video_id, duration): + if not webpage: + return + player = self._parse_json( + self._search_regex( + r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage, + 'player args', default='{}'), + video_id, fatal=False) + if not player or not isinstance(player, dict): + return + watch_next_response = player.get('watch_next_response') + if not isinstance(watch_next_response, compat_str): + return + response = self._parse_json(watch_next_response, video_id, fatal=False) + if not response or not isinstance(response, dict): + return + chapters_list = try_get( + response, + lambda x: x['playerOverlays'] + ['playerOverlayRenderer'] + ['decoratedPlayerBarRenderer'] + ['decoratedPlayerBarRenderer'] + ['playerBar'] + ['chapteredPlayerBarRenderer'] + ['chapters'], + list) + if not chapters_list: + return + + def chapter_time(chapter): + return float_or_none( + try_get( + chapter, + lambda x: x['chapterRenderer']['timeRangeStartMillis'], + int), + scale=1000) + chapters = [] + for next_num, chapter in enumerate(chapters_list, start=1): + start_time = chapter_time(chapter) + if start_time is None: + continue + end_time = (chapter_time(chapters_list[next_num]) + if next_num < len(chapters_list) else duration) + if end_time is None: + continue + title = try_get( + chapter, lambda x: x['chapterRenderer']['title']['simpleText'], + compat_str) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': title, + }) + return chapters + @staticmethod - def _extract_chapters(description, duration): + def _extract_chapters_from_description(description, duration): if not description: return None chapter_lines = re.findall( @@ -1651,6 +1764,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }) return chapters + def _extract_chapters(self, webpage, description, video_id, duration): + return (self._extract_chapters_from_json(webpage, video_id, duration) + or self._extract_chapters_from_description(description, duration)) + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -1678,7 +1795,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Get video webpage url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id - video_webpage = self._download_webpage(url, video_id) + video_webpage, urlh = self._download_webpage_handle(url, video_id) + + qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query) + video_id = qs.get('v', [None])[0] or video_id # Attempt to extract SWF player URL mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) @@ -1721,7 +1841,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Get video info video_info = {} embed_webpage = None - if re.search(r'player-age-gate-content">', video_webpage) is not None: + if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+' + or re.search(r'player-age-gate-content">', video_webpage) is not None): age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube @@ -1794,6 +1915,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_details = try_get( player_response, lambda x: x['videoDetails'], dict) or {} + microformat = try_get( + player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {} + video_title = video_info.get('title', [None])[0] or video_details.get('title') if not video_title: self._downloader.report_warning('Unable to extract video title') @@ -1823,7 +1947,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ''', replace_url, video_description) video_description = clean_html(video_description) else: - video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription') + video_description = video_details.get('shortDescription') + if video_description is None: + video_description = self._html_search_meta('description', video_webpage) if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): @@ -1840,15 +1966,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + + def feed_entry(name): + return try_get(feed_data, lambda x: x[name][0], compat_str) + + feed_id = feed_entry('id') + if not feed_id: + continue + feed_title = feed_entry('title') + title = video_title + if feed_title: + title += ' (%s)' % feed_title entries.append({ '_type': 'url_transparent', 'ie_key': 'Youtube', 'url': smuggle_url( '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), + 'title': title, }) - feed_ids.append(feed_data['id'][0]) + feed_ids.append(feed_id) self.to_screen( 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' % (', '.join(feed_ids), video_id)) @@ -1860,6 +1997,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): view_count = extract_view_count(video_info) if view_count is None and video_details: view_count = int_or_none(video_details.get('viewCount')) + if view_count is None and microformat: + view_count = int_or_none(microformat.get('viewCount')) if is_live is None: is_live = bool_or_none(video_details.get('isLive')) @@ -1919,12 +2058,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } for fmt in streaming_formats: - if fmt.get('drm_families'): + if fmt.get('drmFamilies') or fmt.get('drm_families'): continue url = url_or_none(fmt.get('url')) if not url: - cipher = fmt.get('cipher') + cipher = fmt.get('cipher') or fmt.get('signatureCipher') if not cipher: continue url_data = compat_parse_qs(cipher) @@ -1975,22 +2114,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('verbose'): if player_url is None: - player_version = 'unknown' player_desc = 'unknown' else: - if player_url.endswith('swf'): - player_version = self._search_regex( - r'-(.+?)(?:/watch_as3)?\.swf$', player_url, - 'flash player', fatal=False) - player_desc = 'flash player %s' % player_version - else: - player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)[-.]([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], - player_url, - 'html5 player', fatal=False) - player_desc = 'html5 player %s' % player_version - + player_type, player_version = self._extract_player_info(player_url) + player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version) parts_sizes = self._signature_cache_id(encrypted_sig) self.to_screen('{%s} signature length %s, %s' % (format_id, parts_sizes, player_desc)) @@ -2123,7 +2250,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_uploader_id = mobj.group('uploader_id') video_uploader_url = mobj.group('uploader_url') else: - self._downloader.report_warning('unable to extract uploader nickname') + owner_profile_url = url_or_none(microformat.get('ownerProfileUrl')) + if owner_profile_url: + video_uploader_id = self._search_regex( + r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id', + default=None) + video_uploader_url = owner_profile_url channel_id = ( str_or_none(video_details.get('channelId')) @@ -2134,17 +2266,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_webpage, 'channel id', default=None, group='id')) channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') + thumbnails = [] + thumbnails_list = try_get( + video_details, lambda x: x['thumbnail']['thumbnails'], list) or [] + for t in thumbnails_list: + if not isinstance(t, dict): + continue + thumbnail_url = url_or_none(t.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(t.get('width')), + 'height': int_or_none(t.get('height')), + }) + + if not thumbnails: video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) + # We try first to get a high quality image: + m_thumb = re.search(r'', + video_webpage, re.DOTALL) + if m_thumb is not None: + video_thumbnail = m_thumb.group(1) + thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str) + if thumbnail_url: + video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url) + if video_thumbnail: + thumbnails.append({'url': video_thumbnail}) # upload date upload_date = self._html_search_meta( @@ -2154,6 +2302,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): [r'(?s)id="eow-date.*?>(.*?)', r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], video_webpage, 'upload date', default=None) + if not upload_date: + upload_date = microformat.get('publishDate') or microformat.get('uploadDate') upload_date = unified_strdate(upload_date) video_license = self._html_search_regex( @@ -2225,17 +2375,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): m_cat_container = self._search_regex( r'(?s)]*>\s*Category\s*\s*]*>(.*?)', video_webpage, 'categories', default=None) + category = None if m_cat_container: category = self._html_search_regex( r'(?s)(.*?)', m_cat_container, 'category', default=None) - video_categories = None if category is None else [category] - else: - video_categories = None + if not category: + category = try_get( + microformat, lambda x: x['category'], compat_str) + video_categories = None if category is None else [category] video_tags = [ unescapeHTML(m.group('content')) for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + if not video_tags: + video_tags = try_get(video_details, lambda x: x['keywords'], list) def _extract_count(count_name): return str_to_int(self._search_regex( @@ -2286,7 +2440,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): errnote='Unable to download video annotations', fatal=False, data=urlencode_postdata({xsrf_field_name: xsrf_token})) - chapters = self._extract_chapters(description_original, video_duration) + chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration) # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): @@ -2377,7 +2531,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'creator': video_creator or artist, 'title': video_title, 'alt_title': video_alt_title or track, - 'thumbnail': video_thumbnail, + 'thumbnails': thumbnails, 'description': video_description, 'categories': video_categories, 'tags': video_tags, @@ -2641,7 +2795,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): ids = [] last_id = playlist_id[-11:] for n in itertools.count(1): - url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) + url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) webpage = self._download_webpage( url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) new_ids = orderedSet(re.findall( @@ -2873,7 +3027,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?Puser|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P[A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?Puser|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P[A-Za-z0-9_%-]+)' _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' IE_NAME = 'youtube:user' @@ -2903,6 +3057,9 @@ class YoutubeUserIE(YoutubeChannelIE): }, { 'url': 'https://www.youtube.com/c/gametrailers', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak', + 'only_matching': True, }, { 'url': 'https://www.youtube.com/gametrailers', 'only_matching': True, @@ -2981,7 +3138,7 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor): class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P[^/]+)/playlists' + _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P[^/]+)/playlists' IE_NAME = 'youtube:playlists' _TESTS = [{ @@ -3007,6 +3164,9 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'title': 'Chem Player', }, 'skip': 'Blocked', + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, }] @@ -3151,9 +3311,10 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): break more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) + transform_source=uppercase_escape, + headers=self._YOUTUBE_CLIENT_HEADERS) content_html = more['content_html'] more_widget_html = more['load_more_widget_html']