X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/b8d8e13c1f9e4d3cdd7d41c5c9d711a36dd5f9c3..39393b81acfaf4045fb7f20454a0226f0dc9142e:/youtube_dl/extractor/youtube.py?ds=inline diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7671093..1bc79e0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -16,6 +16,8 @@ from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, + compat_HTTPError, + compat_kwargs, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -25,8 +27,10 @@ from ..compat import ( compat_str, ) from ..utils import ( + bool_or_none, clean_html, error_to_compat_str, + extract_attributes, ExtractorError, float_or_none, get_element_by_attribute, @@ -34,19 +38,20 @@ from ..utils import ( int_or_none, mimetype2ext, orderedSet, + parse_codecs, parse_duration, remove_quotes, remove_start, - sanitized_Request, smuggle_url, + str_or_none, str_to_int, try_get, unescapeHTML, unified_strdate, unsmuggle_url, uppercase_escape, + url_or_none, urlencode_postdata, - ISO3166Utils, ) @@ -54,14 +59,25 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password' + + _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' + _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' + _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' + _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False + _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' + + _YOUTUBE_CLIENT_HEADERS = { + 'x-youtube-client-name': '1', + 'x-youtube-client-version': '1.20200609.04.02', + } + def _set_language(self): self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&hl=en', + '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en', # YouTube sets the expire time to about two months expire_time=time.time() + 2 * 30 * 24 * 3600) @@ -78,10 +94,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. """ - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if username is None: - if self._LOGIN_REQUIRED: + if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) return True @@ -94,74 +110,172 @@ class YoutubeBaseInfoExtractor(InfoExtractor): login_form = self._hidden_inputs(login_page) - login_form.update({ - 'checkConnection': 'youtube', - 'Email': username, - 'Passwd': password, - }) + def req(url, f_req, note, errnote): + data = login_form.copy() + data.update({ + 'pstMsg': 1, + 'checkConnection': 'youtube', + 'checkedDomains': 'youtube', + 'hl': 'en', + 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', + 'f.req': json.dumps(f_req), + 'flowName': 'GlifWebSignIn', + 'flowEntry': 'ServiceLogin', + # TODO: reverse actual botguard identifier generation algo + 'bgRequest': '["identifier",""]', + }) + return self._download_json( + url, None, note=note, errnote=errnote, + transform_source=lambda s: re.sub(r'^[^[]*', '', s), + fatal=False, + data=urlencode_postdata(data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', + 'Google-Accounts-XSRF': 1, + }) - login_results = self._download_webpage( - self._PASSWORD_CHALLENGE_URL, None, - note='Logging in', errnote='unable to log in', fatal=False, - data=urlencode_postdata(login_form)) - if login_results is False: - return False + def warn(message): + self._downloader.report_warning(message) + + lookup_req = [ + username, + None, [], None, 'US', None, None, 2, False, True, + [ + None, None, + [2, 1, None, 1, + 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', + None, [], 4], + 1, [None, None, []], None, None, None, True + ], + username, + ] - error_msg = self._html_search_regex( - r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<', - login_results, 'error message', default=None) - if error_msg: - raise ExtractorError('Unable to login: %s' % error_msg, expected=True) + lookup_results = req( + self._LOOKUP_URL, lookup_req, + 'Looking up account info', 'Unable to look up account info') - if re.search(r'id="errormsg_0_Passwd"', login_results) is not None: - raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True) + if lookup_results is False: + return False - # Two-Factor - # TODO add SMS and phone call support - these require making a request and then prompting the user + user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) + if not user_hash: + warn('Unable to extract user hash') + return False - if re.search(r'(?i)]+id="challenge"', login_results) is not None: - tfa_code = self._get_tfa_info('2-step verification code') + challenge_req = [ + user_hash, + None, 1, None, [1, None, None, None, [password, None, True]], + [ + None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], + 1, [None, None, []], None, None, None, True + ]] - if not tfa_code: - self._downloader.report_warning( - 'Two-factor authentication required. Provide it either interactively or with --twofactor ' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False + challenge_results = req( + self._CHALLENGE_URL, challenge_req, + 'Logging in', 'Unable to log in') - tfa_code = remove_start(tfa_code, 'G-') + if challenge_results is False: + return - tfa_form_strs = self._form_hidden_inputs('challenge', login_results) + login_res = try_get(challenge_results, lambda x: x[0][5], list) + if login_res: + login_msg = try_get(login_res, lambda x: x[5], compat_str) + warn( + 'Unable to login: %s' % 'Invalid password' + if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) + return False - tfa_form_strs.update({ - 'Pin': tfa_code, - 'TrustDevice': 'on', - }) + res = try_get(challenge_results, lambda x: x[0][-1], list) + if not res: + warn('Unable to extract result entry') + return False - tfa_data = urlencode_postdata(tfa_form_strs) + login_challenge = try_get(res, lambda x: x[0][0], list) + if login_challenge: + challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) + if challenge_str == 'TWO_STEP_VERIFICATION': + # SEND_SUCCESS - TFA code has been successfully sent to phone + # QUOTA_EXCEEDED - reached the limit of TFA codes + status = try_get(login_challenge, lambda x: x[5], compat_str) + if status == 'QUOTA_EXCEEDED': + warn('Exceeded the limit of TFA codes, try later') + return False + + tl = try_get(challenge_results, lambda x: x[1][2], compat_str) + if not tl: + warn('Unable to extract TL') + return False + + tfa_code = self._get_tfa_info('2-step verification code') + + if not tfa_code: + warn( + 'Two-factor authentication required. Provide it either interactively or with --twofactor ' + '(Note that only TOTP (Google Authenticator App) codes work at this time.)') + return False + + tfa_code = remove_start(tfa_code, 'G-') + + tfa_req = [ + user_hash, None, 2, None, + [ + 9, None, None, None, None, None, None, None, + [None, tfa_code, True, 2] + ]] + + tfa_results = req( + self._TFA_URL.format(tl), tfa_req, + 'Submitting TFA code', 'Unable to submit TFA code') + + if tfa_results is False: + return False + + tfa_res = try_get(tfa_results, lambda x: x[0][5], list) + if tfa_res: + tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) + warn( + 'Unable to finish TFA: %s' % 'Invalid TFA code' + if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) + return False + + check_cookie_url = try_get( + tfa_results, lambda x: x[0][-1][2], compat_str) + else: + CHALLENGES = { + 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", + 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', + 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", + } + challenge = CHALLENGES.get( + challenge_str, + '%s returned error %s.' % (self.IE_NAME, challenge_str)) + warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) + return False + else: + check_cookie_url = try_get(res, lambda x: x[2], compat_str) - tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data) - tfa_results = self._download_webpage( - tfa_req, None, - note='Submitting TFA code', errnote='unable to submit tfa', fatal=False) + if not check_cookie_url: + warn('Unable to extract CheckCookie URL') + return False - if tfa_results is False: - return False + check_cookie_results = self._download_webpage( + check_cookie_url, None, 'Checking cookie', fatal=False) - if re.search(r'(?i)]+id="challenge"', tfa_results) is not None: - self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.') - return False - if re.search(r'(?i)]+id="gaia_loginform"', tfa_results) is not None: - self._downloader.report_warning('unable to log in - did the page structure change?') - return False - if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: - self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.') - return False + if check_cookie_results is False: + return False - if re.search(r'(?i)]+id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning('unable to log in: bad username or password') + if 'https://myaccount.google.com/' not in check_cookie_results: + warn('Unable to log in') return False + return True + def _download_webpage_handle(self, *args, **kwargs): + query = kwargs.get('query', {}).copy() + query['disable_polymer'] = 'true' + kwargs['query'] = query + return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( + *args, **compat_kwargs(kwargs)) + def _real_initialize(self): if self._downloader is None: return @@ -182,10 +296,26 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): if not mobj: break - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) + count = 0 + retries = 3 + while count <= retries: + try: + # Downloading page may result in intermittent 5xx HTTP error + # that is usually worked around with a retry + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s%s' + % (page_num, ' (retry #%d)' % count if count else ''), + transform_source=uppercase_escape, + headers=self._YOUTUBE_CLIENT_HEADERS) + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): + count += 1 + if count <= retries: + continue + raise + content_html = more['content_html'] if not content_html.strip(): # Some webpages show a "Load more" button but they don't @@ -199,17 +329,18 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): for video_id, video_title in self.extract_videos_from_page(content): yield self.url_result(video_id, 'Youtube', video_id, video_title) - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - for mobj in re.finditer(self._VIDEO_RE, page): + def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): + for mobj in re.finditer(video_re, page): # The link with index 0 is not the first video of the playlist (not sure if still actual) if 'index' in mobj.groupdict() and mobj.group('id') == '0': continue video_id = mobj.group('id') - video_title = unescapeHTML(mobj.group('title')) + video_title = unescapeHTML( + mobj.group('title')) if 'title' in mobj.groupdict() else None if video_title: video_title = video_title.strip() + if video_title == '► Play all': + video_title = None try: idx = ids_in_page.index(video_id) if video_title and not titles_in_page[idx]: @@ -217,6 +348,12 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): except ValueError: ids_in_page.append(video_id) titles_in_page.append(video_title) + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + self.extract_videos_from_page_impl( + self._VIDEO_RE, page, ids_in_page, titles_in_page) return zip(ids_in_page, titles_in_page) @@ -240,11 +377,40 @@ class YoutubeIE(YoutubeBaseInfoExtractor): _VALID_URL = r"""(?x)^ ( (?:https?://|//) # http(s):// or protocol-independent URL - (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| + (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/| (?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?pwnyoutube\.com/| + (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| + # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances + (?:(?:www|dev)\.)?invidio\.us/| + (?:(?:www|no)\.)?invidiou\.sh/| + (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| + (?:www\.)?invidious\.kabi\.tk/| + (?:www\.)?invidious\.13ad\.de/| + (?:www\.)?invidious\.mastodon\.host/| + (?:www\.)?invidious\.nixnet\.xyz/| + (?:www\.)?invidious\.drycat\.fr/| + (?:www\.)?tube\.poal\.co/| + (?:www\.)?vid\.wxzm\.sx/| + (?:www\.)?yewtu\.be/| + (?:www\.)?yt\.elukerio\.org/| + (?:www\.)?yt\.lelux\.fi/| + (?:www\.)?invidious\.ggc-project\.de/| + (?:www\.)?yt\.maisputain\.ovh/| + (?:www\.)?invidious\.13ad\.de/| + (?:www\.)?invidious\.toot\.koeln/| + (?:www\.)?invidious\.fdn\.fr/| + (?:www\.)?watch\.nettohikari\.com/| + (?:www\.)?kgg2m7yk5aybusll\.onion/| + (?:www\.)?qklhadlycap4cnod\.onion/| + (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| + (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/| + (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| + (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| + (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/| + (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -265,10 +431,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist=) # combined list/video URLs are handled by the playlist IE + (?!.*?\blist= + (?: + %(playlist_id)s| # combined list/video URLs are handled by the playlist IE + WL # WL are handled by the watch later IE + ) + ) (?(1).+)? # if we found the ID, everything can follow - $""" + $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' + _PLAYER_INFO_RE = ( + r'/(?P[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P[a-z]+)$', + r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.(?P[a-z]+)$', + ) _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -310,65 +485,73 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, - '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'preference': -50, 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'preference': -50, 'container': 'm4a_dash'}, + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50}, + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, } - _SUBTITLE_FORMATS = ('ttml', 'vtt') + _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') + + _GEO_BYPASS = False IE_NAME = 'youtube' _TESTS = [ @@ -381,12 +564,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', - 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -402,7 +587,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', + 'description': 'md5:19a2f98d9032b9311e686ed039564f63', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], @@ -410,8 +595,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', - 'license': 'Standard YouTube License', 'creator': 'Icona Pop', + 'track': 'I Love It (feat. Charli XCX)', + 'artist': 'Icona Pop', } }, { @@ -421,15 +607,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': '07FYdnEawAQ', 'ext': 'mp4', 'upload_date': '20130703', - 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', + 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', 'alt_title': 'Tunnel Vision', - 'description': 'md5:64249768eec3bc4276236606ea996373', + 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', 'duration': 419, 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', - 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', + 'track': 'Tunnel Vision', + 'artist': 'Justin Timberlake', 'age_limit': 18, } }, @@ -445,7 +632,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'SET India', 'uploader_id': 'setindia', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', - 'license': 'Standard YouTube License', 'age_limit': 18, } }, @@ -460,11 +646,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'upload_date': '20121002', - 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, @@ -483,7 +669,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', 'uploader': '8KVIDEO', - 'license': 'Standard YouTube License', 'title': 'UHDTV TEST 8K VIDEO.mp4' }, 'params': { @@ -498,13 +683,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'IB3lcPjvWLA', 'ext': 'm4a', - 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', - 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', + 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', + 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', - 'license': 'Standard YouTube License', }, 'params': { 'youtube_include_dash_manifest': True, @@ -518,14 +702,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', - 'alt_title': 'Shake It Off', - 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', + 'description': 'md5:307195cd21ff7fa352270fe884570ef0', 'duration': 242, 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', - 'license': 'Standard YouTube License', - 'creator': 'Taylor Swift', }, 'params': { 'youtube_include_dash_manifest': True, @@ -540,10 +721,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 219, 'upload_date': '20100909', - 'uploader': 'The Amazing Atheist', + 'uploader': 'Amazing Atheist', 'uploader_id': 'TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', - 'license': 'Standard YouTube License', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } @@ -561,7 +741,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'WitcherGame', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', - 'license': 'Standard YouTube License', 'age_limit': 18, }, }, @@ -573,16 +752,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 247, + 'duration': 246, 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', 'upload_date': '20110629', - 'license': 'Standard YouTube License', 'age_limit': 18, }, }, - # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) + # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) + # YouTube Red ad is not captured for creator { 'url': '__2ABJjxzNo', 'info_dict': { @@ -592,18 +771,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20100430', 'uploader_id': 'deadmau5', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'deadmau5', + 'creator': 'Dada Life, deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', - 'license': 'Standard YouTube License', 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'Some Chords', + 'alt_title': 'This Machine Kills Some Chords', }, 'expected_warnings': [ 'DASH manifest missing', ] }, - # Olympics (https://github.com/rg3/youtube-dl/issues/4431) + # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) { 'url': 'lqQg6PlCWgI', 'info_dict': { @@ -613,7 +791,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150827', 'uploader_id': 'olympic', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'license': 'Standard YouTube License', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': 'Olympic', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', @@ -634,8 +811,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'AllenMeow', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫艾倫', - 'license': 'Standard YouTube License', + 'uploader': '孫ᄋᄅ', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, }, @@ -656,12 +832,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'skip': 'This live event has ended.', }, - # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097) + # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) { 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', 'info_dict': { 'id': 'FIl7x6_3R5Y', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'duration': 220, @@ -669,9 +845,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'dorappi2000', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', - 'license': 'Standard YouTube License', - 'formats': 'mincount:32', + 'formats': 'mincount:31', }, + 'skip': 'not actual anymore', }, # DASH manifest with segment_list { @@ -684,7 +860,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Airtek', 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', - 'license': 'Standard YouTube License', 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', }, 'params': { @@ -757,9 +932,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is not available.', }, { - # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536) + # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', 'info_dict': { 'id': 'gVfLd0zydlo', @@ -777,31 +953,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'only_matching': True, }, { - # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468) + # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468) # Also tests cut-off URL expansion in video description (see - # https://github.com/rg3/youtube-dl/issues/1892, - # https://github.com/rg3/youtube-dl/issues/8164) + # https://github.com/ytdl-org/youtube-dl/issues/1892, + # https://github.com/ytdl-org/youtube-dl/issues/8164) 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', 'info_dict': { 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk', + 'alt_title': 'Dark Walk - Position Music', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'duration': 133, 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', - 'license': 'Standard YouTube License', - 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', + 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'track': 'Dark Walk - Position Music', + 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', }, 'params': { 'skip_download': True, }, }, { - # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468) + # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468) 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', 'only_matching': True, }, @@ -851,7 +1029,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:dda0d780d5a6e120758d1711d062a867', 'duration': 4060, 'upload_date': '20151119', - 'uploader': 'Bernie 2016', + 'uploader': 'Bernie Sanders', 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', 'license': 'Creative Commons Attribution license (reuse allowed)', @@ -865,7 +1043,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'only_matching': True, }, { - # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059) + # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059) 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', 'only_matching': True, }, @@ -886,6 +1064,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is not available.', }, { # YouTube Red video with episode data @@ -894,13 +1073,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'iqKdEhx-dD4', 'ext': 'mp4', 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:8013b7ddea787342608f63a13ddc9492', + 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f', 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', 'uploader_id': 'Vsauce', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', - 'license': 'Standard YouTube License', 'series': 'Mind Field', 'season_number': 1, 'episode_number': 1, @@ -912,10 +1090,180 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Skipping DASH manifest', ], }, + { + # The following content has been identified by the YouTube community + # as inappropriate or offensive to some audiences. + 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', + 'info_dict': { + 'id': '6SJNVb0GnPI', + 'ext': 'mp4', + 'title': 'Race Differences in Intelligence', + 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', + 'duration': 965, + 'upload_date': '20140124', + 'uploader': 'New Century Foundation', + 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', + }, + 'params': { + 'skip_download': True, + }, + }, { # itag 212 'url': '1t24XAntNCY', 'only_matching': True, + }, + { + # geo restricted to JP + 'url': 'sJL6WA-aGkQ', + 'only_matching': True, + }, + { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, + }, + { + 'url': 'https://invidio.us/watch?v=BaW_jenozKc', + 'only_matching': True, + }, + { + # DRM protected + 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', + 'only_matching': True, + }, + { + # Video with unsupported adaptive stream type formats + 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U', + 'info_dict': { + 'id': 'Z4Vy8R84T1U', + 'ext': 'mp4', + 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'duration': 433, + 'upload_date': '20130923', + 'uploader': 'Amelia Putri Harwita', + 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q', + 'formats': 'maxcount:10', + }, + 'params': { + 'skip_download': True, + 'youtube_include_dash_manifest': False, + }, + 'skip': 'not actual anymore', + }, + { + # Youtube Music Auto-generated description + 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', + 'info_dict': { + 'id': 'MgNrAu2pzNs', + 'ext': 'mp4', + 'title': 'Voyeur Girl', + 'description': 'md5:7ae382a65843d6df2685993e90a8628f', + 'upload_date': '20190312', + 'uploader': 'Stephen - Topic', + 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', + 'artist': 'Stephen', + 'track': 'Voyeur Girl', + 'album': 'it\'s too much love to know my dear', + 'release_date': '20190313', + 'release_year': 2019, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Youtube Music Auto-generated description + # Retrieve 'artist' field from 'Artist:' in video description + # when it is present on youtube music video + 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', + 'info_dict': { + 'id': 'k0jLE7tTwjY', + 'ext': 'mp4', + 'title': 'Latch Feat. Sam Smith', + 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', + 'upload_date': '20150110', + 'uploader': 'Various Artists - Topic', + 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', + 'artist': 'Disclosure', + 'track': 'Latch Feat. Sam Smith', + 'album': 'Latch Featuring Sam Smith', + 'release_date': '20121008', + 'release_year': 2012, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Youtube Music Auto-generated description + # handle multiple artists on youtube music video + 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', + 'info_dict': { + 'id': '74qn0eJSjpA', + 'ext': 'mp4', + 'title': 'Eastside', + 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', + 'upload_date': '20180710', + 'uploader': 'Benny Blanco - Topic', + 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', + 'artist': 'benny blanco, Halsey, Khalid', + 'track': 'Eastside', + 'album': 'Eastside', + 'release_date': '20180713', + 'release_year': 2018, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Youtube Music Auto-generated description + # handle youtube music video with release_year and no release_date + 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', + 'info_dict': { + 'id': '-hcAI0g-f5M', + 'ext': 'mp4', + 'title': 'Put It On Me', + 'description': 'md5:f6422397c07c4c907c6638e1fee380a5', + 'upload_date': '20180426', + 'uploader': 'Matt Maeson - Topic', + 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', + 'artist': 'Matt Maeson', + 'track': 'Put It On Me', + 'album': 'The Hearse', + 'release_date': None, + 'release_year': 2018, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', + 'only_matching': True, + }, + { + # invalid -> valid video id redirection + 'url': 'DJztXj2GPfl', + 'info_dict': { + 'id': 'DJztXj2GPfk', + 'ext': 'mp4', + 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)', + 'description': 'md5:bf577a41da97918e94fa9798d9228825', + 'upload_date': '20090125', + 'uploader': 'Prochorowka', + 'uploader_id': 'Prochorowka', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka', + 'artist': 'Panjabi MC', + 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix', + 'album': 'Beware of the Boys (Mundian To Bach Ke)', + }, + 'params': { + 'skip_download': True, + }, } ] @@ -943,14 +1291,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): """ Return a string representation of a signature """ return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) - def _extract_signature_function(self, video_id, player_url, example_sig): - id_m = re.match( - r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P[a-z]+)$', - player_url) - if not id_m: + @classmethod + def _extract_player_info(cls, player_url): + for player_re in cls._PLAYER_INFO_RE: + id_m = re.search(player_re, player_url) + if id_m: + break + else: raise ExtractorError('Cannot identify player %r' % player_url) - player_type = id_m.group('ext') - player_id = id_m.group('id') + return id_m.group('ext'), id_m.group('id') + + def _extract_signature_function(self, video_id, player_url, example_sig): + player_type, player_id = self._extract_player_info(player_url) # Read from filesystem cache func_id = '%s_%s_%s' % ( @@ -1030,8 +1382,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( - (r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', - r'\.sig\|\|(?P[a-zA-Z0-9$]+)\('), + (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\b(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + # Obsolete patterns + r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\.sig\|\|(?P[a-zA-Z0-9$]+)\(', + r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(', + r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) @@ -1111,8 +1474,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # regex won't capture the whole JSON. Yet working around by trying more # concrete regex first keeping in mind proper quoted string handling # to be implemented in future that will replace this workaround (see - # https://github.com/rg3/youtube-dl/issues/7468, - # https://github.com/rg3/youtube-dl/pull/7599) + # https://github.com/ytdl-org/youtube-dl/issues/7468, + # https://github.com/ytdl-org/youtube-dl/pull/7599) r';ytplayer\.config\s*=\s*({.+?});ytplayer', r';ytplayer\.config\s*=\s*({.+?});', ) @@ -1170,42 +1533,65 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sub_lang_list[sub_lang] = sub_formats return sub_lang_list + def make_captions(sub_url, sub_langs): + parsed_sub_url = compat_urllib_parse_urlparse(sub_url) + caption_qs = compat_parse_qs(parsed_sub_url.query) + captions = {} + for sub_lang in sub_langs: + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + caption_qs.update({ + 'tlang': [sub_lang], + 'fmt': [ext], + }) + sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( + query=compat_urllib_parse_urlencode(caption_qs, True))) + sub_formats.append({ + 'url': sub_url, + 'ext': ext, + }) + captions[sub_lang] = sub_formats + return captions + + # New captions format as of 22.06.2017 + player_response = args.get('player_response') + if player_response and isinstance(player_response, compat_str): + player_response = self._parse_json( + player_response, video_id, fatal=False) + if player_response: + renderer = player_response['captions']['playerCaptionsTracklistRenderer'] + base_url = renderer['captionTracks'][0]['baseUrl'] + sub_lang_list = [] + for lang in renderer['translationLanguages']: + lang_code = lang.get('languageCode') + if lang_code: + sub_lang_list.append(lang_code) + return make_captions(base_url, sub_lang_list) + # Some videos don't provide ttsurl but rather caption_tracks and # caption_translation_languages (e.g. 20LmZk1hakA) + # Does not used anymore as of 22.06.2017 caption_tracks = args['caption_tracks'] caption_translation_languages = args['caption_translation_languages'] caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - parsed_caption_url = compat_urllib_parse_urlparse(caption_url) - caption_qs = compat_parse_qs(parsed_caption_url.query) - - sub_lang_list = {} + sub_lang_list = [] for lang in caption_translation_languages.split(','): lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) sub_lang = lang_qs.get('lc', [None])[0] - if not sub_lang: - continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - caption_qs.update({ - 'tlang': [sub_lang], - 'fmt': [ext], - }) - sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace( - query=compat_urllib_parse_urlencode(caption_qs, True))) - sub_formats.append({ - 'url': sub_url, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list + if sub_lang: + sub_lang_list.append(sub_lang) + return make_captions(caption_url, sub_lang_list) # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles - except (KeyError, ExtractorError): + except (KeyError, IndexError, ExtractorError): self._downloader.report_warning(err_msg) return {} - def _mark_watched(self, video_id, video_info): - playback_url = video_info.get('videostats_playback_base_url', [None])[0] + def _mark_watched(self, video_id, video_info, player_response): + playback_url = url_or_none(try_get( + player_response, + lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( + video_info, lambda x: x['videostats_playback_base_url'][0])) if not playback_url: return parsed_playback_url = compat_urlparse.urlparse(playback_url) @@ -1227,6 +1613,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor): playback_url, video_id, 'Marking watched', 'Unable to mark watched', fatal=False) + @staticmethod + def _extract_urls(webpage): + # Embedded YouTube player + entries = [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer(r'''(?x) + (?: + ]+?src=| + data-video-url=| + ]+?src=| + embedSWF\(?:\s*| + ]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) + \1''', webpage)] + + # lazyYT YouTube embed + entries.extend(list(map( + unescapeHTML, + re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + + # Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)]+ + class=(?P[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P[\'"])([^\'"]+)(?P=q2)''', webpage) + entries.extend(m[-1] for m in matches) + + return entries + + @staticmethod + def _extract_url(webpage): + urls = YoutubeIE._extract_urls(webpage) + return urls[0] if urls else None + @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) @@ -1235,24 +1658,99 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = mobj.group(2) return video_id - def _extract_from_m3u8(self, manifest_url, video_id): - url_map = {} - - def _get_urls(_manifest): - lines = _manifest.split('\n') - urls = filter(lambda l: l and not l.startswith('#'), - lines) - return urls - manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest') - formats_urls = _get_urls(manifest) - for format_url in formats_urls: - itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag') - url_map[itag] = format_url - return url_map - - def _extract_annotations(self, video_id): - url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id - return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') + def _extract_chapters_from_json(self, webpage, video_id, duration): + if not webpage: + return + player = self._parse_json( + self._search_regex( + r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage, + 'player args', default='{}'), + video_id, fatal=False) + if not player or not isinstance(player, dict): + return + watch_next_response = player.get('watch_next_response') + if not isinstance(watch_next_response, compat_str): + return + response = self._parse_json(watch_next_response, video_id, fatal=False) + if not response or not isinstance(response, dict): + return + chapters_list = try_get( + response, + lambda x: x['playerOverlays'] + ['playerOverlayRenderer'] + ['decoratedPlayerBarRenderer'] + ['decoratedPlayerBarRenderer'] + ['playerBar'] + ['chapteredPlayerBarRenderer'] + ['chapters'], + list) + if not chapters_list: + return + + def chapter_time(chapter): + return float_or_none( + try_get( + chapter, + lambda x: x['chapterRenderer']['timeRangeStartMillis'], + int), + scale=1000) + chapters = [] + for next_num, chapter in enumerate(chapters_list, start=1): + start_time = chapter_time(chapter) + if start_time is None: + continue + end_time = (chapter_time(chapters_list[next_num]) + if next_num < len(chapters_list) else duration) + if end_time is None: + continue + title = try_get( + chapter, lambda x: x['chapterRenderer']['title']['simpleText'], + compat_str) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': title, + }) + return chapters + + @staticmethod + def _extract_chapters_from_description(description, duration): + if not description: + return None + chapter_lines = re.findall( + r'(?:^|)([^<]*]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)[^>]*)(?=$|)', + description) + if not chapter_lines: + return None + chapters = [] + for next_num, (chapter_line, time_point) in enumerate( + chapter_lines, start=1): + start_time = parse_duration(time_point) + if start_time is None: + continue + if start_time > duration: + break + end_time = (duration if next_num == len(chapter_lines) + else parse_duration(chapter_lines[next_num][1])) + if end_time is None: + continue + if end_time > duration: + end_time = duration + if start_time > end_time: + break + chapter_title = re.sub( + r']+>[^<]+', '', chapter_line).strip(' \t-') + chapter_title = re.sub(r'\s+', ' ', chapter_title) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': chapter_title, + }) + return chapters + + def _extract_chapters(self, webpage, description, video_id, duration): + return (self._extract_chapters_from_json(webpage, video_id, duration) + or self._extract_chapters_from_description(description, duration)) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -1281,7 +1779,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Get video webpage url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id - video_webpage = self._download_webpage(url, video_id) + video_webpage, urlh = self._download_webpage_handle(url, video_id) + + qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query) + video_id = qs.get('v', [None])[0] or video_id # Attempt to extract SWF player URL mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) @@ -1297,9 +1798,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd[0] not in dash_mpds: dash_mpds.append(dash_mpd[0]) + def add_dash_mpd_pr(pl_response): + dash_mpd = url_or_none(try_get( + pl_response, lambda x: x['streamingData']['dashManifestUrl'], + compat_str)) + if dash_mpd and dash_mpd not in dash_mpds: + dash_mpds.append(dash_mpd) + + is_live = None + view_count = None + + def extract_view_count(v_info): + return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + + def extract_player_response(player_response, video_id): + pl_response = str_or_none(player_response) + if not pl_response: + return + pl_response = self._parse_json(pl_response, video_id, fatal=False) + if isinstance(pl_response, dict): + add_dash_mpd_pr(pl_response) + return pl_response + + player_response = {} + # Get video info + video_info = {} embed_webpage = None - is_live = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} @@ -1313,92 +1838,88 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - add_dash_mpd(video_info) + try: + video_info_webpage = self._download_webpage( + video_info_url, video_id, + note='Refetching age-gated info webpage', + errnote='unable to download video info webpage') + except ExtractorError: + video_info_webpage = None + if video_info_webpage: + video_info = compat_parse_qs(video_info_webpage) + pl_response = video_info.get('player_response', [None])[0] + player_response = extract_player_response(pl_response, video_id) + add_dash_mpd(video_info) + view_count = extract_view_count(video_info) else: age_gate = False - video_info = None # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map'): + if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) # Rental video is not rented but preview is available (e.g. # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/rg3/youtube-dl/issues/10532) + # https://github.com/ytdl-org/youtube-dl/issues/10532) if not video_info and args.get('ypc_vid'): return self.url_result( args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True + if not player_response: + player_response = extract_player_response(args.get('player_response'), video_id) if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): - # We also try looking in get_video_info since it may contain different dashmpd - # URL that points to a DASH manifest with possibly different itag set (some itags - # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH - # manifest pointed by get_video_info's dashmpd). - # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) - self.report_video_info_webpage_download(video_id) - for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ( - '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (proto, video_id, el_type)) - video_info_webpage = self._download_webpage( - video_info_url, - video_id, note=False, - errnote='unable to download video info webpage') - get_video_info = compat_parse_qs(video_info_webpage) - if get_video_info.get('use_cipher_signature') != ['True']: - add_dash_mpd(get_video_info) - if not video_info: - video_info = get_video_info - if 'token' in get_video_info: - # Different get_video_info requests may report different results, e.g. - # some may report video unavailability, but some may serve it without - # any complaint (see https://github.com/rg3/youtube-dl/issues/7362, - # the original webpage as well as el=info and el=embedded get_video_info - # requests report video unavailability due to geo restriction while - # el=detailpage succeeds and returns valid data). This is probably - # due to YouTube measures against IP ranges of hosting providers. - # Working around by preferring the first succeeded video_info containing - # the token if no such video_info yet was found. - if 'token' not in video_info: - video_info = get_video_info - break - if 'token' not in video_info: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) - if regions_allowed: - raise ExtractorError('YouTube said: This video is available in %s only' % ( - ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), - expected=True) - raise ExtractorError( - 'YouTube said: %s' % video_info['reason'][0], - expected=True, video_id=video_id) - else: - raise ExtractorError( - '"token" parameter not in video info for unknown reason', - video_id=video_id) + add_dash_mpd_pr(player_response) + + def extract_unavailable_message(): + messages = [] + for tag, kind in (('h1', 'message'), ('div', 'submessage')): + msg = self._html_search_regex( + r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)'.format(tag=tag, kind=kind), + video_webpage, 'unavailable %s' % kind, default=None) + if msg: + messages.append(msg) + if messages: + return '\n'.join(messages) + + if not video_info and not player_response: + unavailable_message = extract_unavailable_message() + if not unavailable_message: + unavailable_message = 'Unable to extract video data' + raise ExtractorError( + 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - # title - if 'title' in video_info: - video_title = video_info['title'][0] - else: + if not isinstance(video_info, dict): + video_info = {} + + video_details = try_get( + player_response, lambda x: x['videoDetails'], dict) or {} + + microformat = try_get( + player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {} + + video_title = video_info.get('title', [None])[0] or video_details.get('title') + if not video_title: self._downloader.report_warning('Unable to extract video title') video_title = '_' - # description - video_description = get_element_by_id("eow-description", video_webpage) + description_original = video_description = get_element_by_id("eow-description", video_webpage) if video_description: - video_description = re.sub(r'''(?x) + + def replace_url(m): + redir_url = compat_urlparse.urljoin(url, m.group(1)) + parsed_redir_url = compat_urllib_parse_urlparse(redir_url) + if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': + qs = compat_parse_qs(parsed_redir_url.query) + q = qs.get('q') + if q and q[0]: + return q[0] + return redir_url + + description_original = video_description = re.sub(r'''(?x) ]*> [^<]+\.{3}\s* - ''', r'\1', video_description) + ''', replace_url, video_description) video_description = clean_html(video_description) else: - fd_mobj = re.search(r'= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): + encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] + if 'rtmpe%3Dyes' in encoded_url_map: + raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) + formats = [] + formats_spec = {} + fmt_list = video_info.get('fmt_list', [''])[0] + if fmt_list: + for fmt in fmt_list.split(','): + spec = fmt.split('/') + if len(spec) > 1: + width_height = spec[1].split('x') + if len(width_height) == 2: + formats_spec[spec[0]] = { + 'resolution': spec[1], + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + } + for fmt in streaming_formats: + itag = str_or_none(fmt.get('itag')) + if not itag: + continue + quality = fmt.get('quality') + quality_label = fmt.get('qualityLabel') or quality + formats_spec[itag] = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_note': quality_label, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + # bitrate for itag 43 is always 2147483647 + 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, + 'width': int_or_none(fmt.get('width')), + } + + for fmt in streaming_formats: + if fmt.get('drmFamilies') or fmt.get('drm_families'): + continue + url = url_or_none(fmt.get('url')) + + if not url: + cipher = fmt.get('cipher') or fmt.get('signatureCipher') + if not cipher: + continue + url_data = compat_parse_qs(cipher) + url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str)) + if not url: + continue + else: + cipher = None + url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + + stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) + # Unsupported FORMAT_STREAM_TYPE_OTF + if stream_type == 3: + continue + + format_id = fmt.get('itag') or url_data['itag'][0] + if not format_id: + continue + format_id = compat_str(format_id) + + if cipher: + if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): + ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' + jsplayer_url_json = self._search_regex( + ASSETS_RE, + embed_webpage if age_gate else video_webpage, + 'JS player URL (1)', default=None) + if not jsplayer_url_json and not age_gate: + # We need the embed website after all + if embed_webpage is None: + embed_url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed webpage') + jsplayer_url_json = self._search_regex( + ASSETS_RE, embed_webpage, 'JS player URL') + + player_url = json.loads(jsplayer_url_json) + if player_url is None: + player_url_json = self._search_regex( + r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', + video_webpage, 'age gate player URL') + player_url = json.loads(player_url_json) + + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + elif 's' in url_data: + encrypted_sig = url_data['s'][0] + + if self._downloader.params.get('verbose'): + if player_url is None: + player_desc = 'unknown' + else: + player_type, player_version = self._extract_player_info(player_url) + player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version) + parts_sizes = self._signature_cache_id(encrypted_sig) + self.to_screen('{%s} signature length %s, %s' % + (format_id, parts_sizes, player_desc)) + + signature = self._decrypt_signature( + encrypted_sig, video_id, player_url, age_gate) + sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' + url += '&%s=%s' % (sp, signature) + if 'ratebypass' not in url: + url += '&ratebypass=yes' + + dct = { + 'format_id': format_id, + 'url': url, + 'player_url': player_url, + } + if format_id in self._formats: + dct.update(self._formats[format_id]) + if format_id in formats_spec: + dct.update(formats_spec[format_id]) + + # Some itags are not included in DASH manifest thus corresponding formats will + # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). + # Trying to extract metadata from url_encoded_fmt_stream_map entry. + mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) + width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) + + if width is None: + width = int_or_none(fmt.get('width')) + if height is None: + height = int_or_none(fmt.get('height')) + + filesize = int_or_none(url_data.get( + 'clen', [None])[0]) or _extract_filesize(url) + + quality = url_data.get('quality', [None])[0] or fmt.get('quality') + quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') + + tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) + or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None + fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) + + more_fields = { + 'filesize': filesize, + 'tbr': tbr, + 'width': width, + 'height': height, + 'fps': fps, + 'format_note': quality_label or quality, + } + for key, value in more_fields.items(): + if value: + dct[key] = value + type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') + if type_: + type_split = type_.split(';') + kind_ext = type_split[0].split('/') + if len(kind_ext) == 2: + kind, _ = kind_ext + dct['ext'] = mimetype2ext(type_split[0]) + if kind in ('audio', 'video'): + codecs = None + for mobj in re.finditer( + r'(?P[a-zA-Z_-]+)=(?P["\']?)(?P.+?)(?P=quote)(?:;|$)', type_): + if mobj.group('key') == 'codecs': + codecs = mobj.group('val') + break + if codecs: + dct.update(parse_codecs(codecs)) + if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': + dct['downloader_options'] = { + # Youtube throttles chunks >~10M + 'http_chunk_size': 10485760, + } + formats.append(dct) + else: + manifest_url = ( + url_or_none(try_get( + player_response, + lambda x: x['streamingData']['hlsManifestUrl'], + compat_str)) + or url_or_none(try_get( + video_info, lambda x: x['hlsvp'][0], compat_str))) + if manifest_url: + formats = [] + m3u8_formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', fatal=False) + for a_format in m3u8_formats: + itag = self._search_regex( + r'/itag/(\d+)/', a_format['url'], 'itag', default=None) + if itag: + a_format['format_id'] = itag + if itag in self._formats: + dct = self._formats[itag].copy() + dct.update(a_format) + a_format = dct + a_format['player_url'] = player_url + # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' + formats.append(a_format) + else: + error_message = extract_unavailable_message() + if not error_message: + error_message = clean_html(try_get( + player_response, lambda x: x['playabilityStatus']['reason'], + compat_str)) + if not error_message: + error_message = clean_html( + try_get(video_info, lambda x: x['reason'][0], compat_str)) + if error_message: + raise ExtractorError(error_message, expected=True) + raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') # uploader - if 'author' not in video_info: - raise ExtractorError('Unable to extract uploader name') - video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0]) + video_uploader = try_get( + video_info, lambda x: x['author'][0], + compat_str) or str_or_none(video_details.get('author')) + if video_uploader: + video_uploader = compat_urllib_parse_unquote_plus(video_uploader) + else: + self._downloader.report_warning('unable to extract uploader name') # uploader_id video_uploader_id = None video_uploader_url = None mobj = re.search( - r'', + r'', video_webpage) if mobj is not None: video_uploader_id = mobj.group('uploader_id') video_uploader_url = mobj.group('uploader_url') else: - self._downloader.report_warning('unable to extract uploader nickname') - - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') + owner_profile_url = url_or_none(microformat.get('ownerProfileUrl')) + if owner_profile_url: + video_uploader_id = self._search_regex( + r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id', + default=None) + video_uploader_url = owner_profile_url + + channel_id = ( + str_or_none(video_details.get('channelId')) + or self._html_search_meta( + 'channelId', video_webpage, 'channel id', default=None) + or self._search_regex( + r'data-channel-external-id=(["\'])(?P(?:(?!\1).)+)\1', + video_webpage, 'channel id', default=None, group='id')) + channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + + thumbnails = [] + thumbnails_list = try_get( + video_details, lambda x: x['thumbnail']['thumbnails'], list) or [] + for t in thumbnails_list: + if not isinstance(t, dict): + continue + thumbnail_url = url_or_none(t.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(t.get('width')), + 'height': int_or_none(t.get('height')), + }) + + if not thumbnails: video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) + # We try first to get a high quality image: + m_thumb = re.search(r'', + video_webpage, re.DOTALL) + if m_thumb is not None: + video_thumbnail = m_thumb.group(1) + thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str) + if thumbnail_url: + video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url) + if video_thumbnail: + thumbnails.append({'url': video_thumbnail}) # upload date upload_date = self._html_search_meta( @@ -1487,10 +2281,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not upload_date: upload_date = self._search_regex( [r'(?s)id="eow-date.*?>(.*?)', - r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)'], + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], video_webpage, 'upload date', default=None) - if upload_date: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) + if not upload_date: + upload_date = microformat.get('publishDate') or microformat.get('uploadDate') upload_date = unified_strdate(upload_date) video_license = self._html_search_regex( @@ -1498,7 +2292,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_webpage, 'license', default=None) m_music = re.search( - r']+class="title"[^>]*>\s*Music\s*\s*]*>\s*
  • (?P.+?) by (?P<creator>.+?)(?:\(.+?\))?</li', + r'''(?x) + <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* + <ul[^>]*>\s* + <li>(?P<title>.+?) + by (?P<creator>.+?) + (?: + \(.+?\)| + <a[^>]* + (?: + \bhref=["\']/red[^>]*>| # drop possible + >\s*Listen ad-free with YouTube Red # YouTube Red ad + ) + .*? + )?</li + ''', video_webpage) if m_music: video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) @@ -1506,11 +2314,40 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_alt_title = video_creator = None + def extract_meta(field): + return self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, + video_webpage, field, default=None) + + track = extract_meta('Song') + artist = extract_meta('Artist') + album = extract_meta('Album') + + # Youtube Music Auto-generated description + release_date = release_year = None + if video_description: + mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) + if mobj: + if not track: + track = mobj.group('track').strip() + if not artist: + artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) + if not album: + album = mobj.group('album'.strip()) + release_year = mobj.group('release_year') + release_date = mobj.group('release_date') + if release_date: + release_date = release_date.replace('-', '') + if not release_year: + release_year = int(release_date[:4]) + if release_year: + release_year = int(release_year) + m_episode = re.search( r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', video_webpage) if m_episode: - series = m_episode.group('series') + series = unescapeHTML(m_episode.group('series')) season_number = int(m_episode.group('season')) episode_number = int(m_episode.group('episode')) else: @@ -1519,17 +2356,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): m_cat_container = self._search_regex( r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', video_webpage, 'categories', default=None) + category = None if m_cat_container: category = self._html_search_regex( r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', default=None) - video_categories = None if category is None else [category] - else: - video_categories = None + if not category: + category = try_get( + microformat, lambda x: x['category'], compat_str) + video_categories = None if category is None else [category] video_tags = [ unescapeHTML(m.group('content')) for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + if not video_tags: + video_tags = try_get(video_details, lambda x: x['keywords'], list) def _extract_count(count_name): return str_to_int(self._search_regex( @@ -1540,12 +2381,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): like_count = _extract_count('like') dislike_count = _extract_count('dislike') + if view_count is None: + view_count = str_to_int(self._search_regex( + r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, + 'view count', default=None)) + + average_rating = ( + float_or_none(video_details.get('averageRating')) + or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) + # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage) video_duration = try_get( video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = int_or_none(video_details.get('lengthSeconds')) if not video_duration: video_duration = parse_duration(self._html_search_meta( 'duration', video_webpage, 'video duration')) @@ -1553,173 +2405,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # annotations video_annotations = None if self._downloader.params.get('writeannotations', False): - video_annotations = self._extract_annotations(video_id) - - def _map_to_format_list(urlmap): - formats = [] - for itag, video_real_url in urlmap.items(): - dct = { - 'format_id': itag, - 'url': video_real_url, - 'player_url': player_url, - } - if itag in self._formats: - dct.update(self._formats[itag]) - formats.append(dct) - return formats - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - formats = [{ - 'format_id': '_rtmp', - 'protocol': 'rtmp', - 'url': video_info['conn'][0], - 'player_url': player_url, - }] - elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1: - encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] - if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) - formats_spec = {} - fmt_list = video_info.get('fmt_list', [''])[0] - if fmt_list: - for fmt in fmt_list.split(','): - spec = fmt.split('/') - if len(spec) > 1: - width_height = spec[1].split('x') - if len(width_height) == 2: - formats_spec[spec[0]] = { - 'resolution': spec[1], - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - } - formats = [] - for url_data_str in encoded_url_map.split(','): - url_data = compat_parse_qs(url_data_str) - if 'itag' not in url_data or 'url' not in url_data: - continue - format_id = url_data['itag'][0] - url = url_data['url'][0] - - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' - - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') - - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if self._downloader.params.get('verbose'): - if player_url is None: - player_version = 'unknown' - player_desc = 'unknown' - else: - if player_url.endswith('swf'): - player_version = self._search_regex( - r'-(.+?)(?:/watch_as3)?\.swf$', player_url, - 'flash player', fatal=False) - player_desc = 'flash player %s' % player_version - else: - player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'], - player_url, - 'html5 player', fatal=False) - player_desc = 'html5 player %s' % player_version - - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) - - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - url += '&signature=' + signature - if 'ratebypass' not in url: - url += '&ratebypass=yes' - - dct = { - 'format_id': format_id, - 'url': url, - 'player_url': player_url, - } - if format_id in self._formats: - dct.update(self._formats[format_id]) - if format_id in formats_spec: - dct.update(formats_spec[format_id]) - - # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993). - # Trying to extract metadata from url_encoded_fmt_stream_map entry. - mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) - width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - - more_fields = { - 'filesize': int_or_none(url_data.get('clen', [None])[0]), - 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), - 'width': width, - 'height': height, - 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0], - } - for key, value in more_fields.items(): - if value: - dct[key] = value - type_ = url_data.get('type', [None])[0] - if type_: - type_split = type_.split(';') - kind_ext = type_split[0].split('/') - if len(kind_ext) == 2: - kind, _ = kind_ext - dct['ext'] = mimetype2ext(type_split[0]) - if kind in ('audio', 'video'): - codecs = None - for mobj in re.finditer( - r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_): - if mobj.group('key') == 'codecs': - codecs = mobj.group('val') - break - if codecs: - codecs = codecs.split(',') - if len(codecs) == 2: - acodec, vcodec = codecs[1], codecs[0] - else: - acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0]) - dct.update({ - 'acodec': acodec, - 'vcodec': vcodec, - }) - formats.append(dct) - elif video_info.get('hlsvp'): - manifest_url = video_info['hlsvp'][0] - url_map = self._extract_from_m3u8(manifest_url, video_id) - formats = _map_to_format_list(url_map) - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - for a_format in formats: - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - else: - unavailable_message = self._html_search_regex( - r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>', - video_webpage, 'unavailable message', default=None) - if unavailable_message: - raise ExtractorError(unavailable_message, expected=True) - raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + xsrf_token = self._search_regex( + r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2', + video_webpage, 'xsrf token', group='xsrf_token', fatal=False) + invideo_url = try_get( + player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) + if xsrf_token and invideo_url: + xsrf_field_name = self._search_regex( + r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2', + video_webpage, 'xsrf field name', + group='xsrf_field_name', default='session_token') + video_annotations = self._download_webpage( + self._proto_relative_url(invideo_url), + video_id, note='Downloading annotations', + errnote='Unable to download video annotations', fatal=False, + data=urlencode_postdata({xsrf_field_name: xsrf_token})) + + chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration) # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): @@ -1737,6 +2439,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for df in self._extract_mpd_formats( mpd_url, video_id, fatal=dash_mpd_fatal, formats_dict=self._formats): + if not df.get('filesize'): + df['filesize'] = _extract_filesize(df['url']) # Do not overwrite DASH format found in some previous DASH manifest if df['format_id'] not in dash_formats: dash_formats[df['format_id']] = df @@ -1753,7 +2457,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Remove the formats we found through non-DASH, they # contain less info and it can be wrong, because we use # fixed values (for example the resolution). See - # https://github.com/rg3/youtube-dl/issues/5774 for an + # https://github.com/ytdl-org/youtube-dl/issues/5774 for an # example. formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] formats.extend(dash_formats.values()) @@ -1773,21 +2477,42 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if f.get('vcodec') != 'none': f['stretched_ratio'] = ratio + if not formats: + if 'reason' in video_info: + if 'The uploader has not made this video available in your country.' in video_info['reason']: + regions_allowed = self._html_search_meta( + 'regionsAllowed', video_webpage, default=None) + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + msg=video_info['reason'][0], countries=countries) + reason = video_info['reason'][0] + if 'Invalid parameters' in reason: + unavailable_message = extract_unavailable_message() + if unavailable_message: + reason = unavailable_message + raise ExtractorError( + 'YouTube said: %s' % reason, + expected=True, video_id=video_id) + if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']): + raise ExtractorError('This video is DRM protected.', expected=True) + self._sort_formats(formats) - self.mark_watched(video_id, video_info) + self.mark_watched(video_id, video_info, player_response) return { 'id': video_id, 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'uploader_url': video_uploader_url, + 'channel_id': channel_id, + 'channel_url': channel_url, 'upload_date': upload_date, 'license': video_license, - 'creator': video_creator, + 'creator': video_creator or artist, 'title': video_title, - 'alt_title': video_alt_title, - 'thumbnail': video_thumbnail, + 'alt_title': video_alt_title or track, + 'thumbnails': thumbnails, 'description': video_description, 'categories': video_categories, 'tags': video_tags, @@ -1796,11 +2521,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, + 'chapters': chapters, 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, - 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]), + 'average_rating': average_rating, 'formats': formats, 'is_live': is_live, 'start_time': start_time, @@ -1808,97 +2534,81 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'series': series, 'season_number': season_number, 'episode_number': episode_number, + 'track': track, + 'artist': artist, + 'album': album, + 'release_date': release_date, + 'release_year': release_year, } -class YoutubeSharedVideoIE(InfoExtractor): - _VALID_URL = r'(?:https?:)?//(?:www\.)?youtube\.com/shared\?.*\bci=(?P<id>[0-9A-Za-z_-]{11})' - IE_NAME = 'youtube:shared' - - _TEST = { - 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', - 'info_dict': { - 'id': 'uPDB5I9wfp8', - 'ext': 'webm', - 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', - 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', - 'upload_date': '20160219', - 'uploader': 'Pocoyo - Português (BR)', - 'uploader_id': 'PocoyoBrazil', - }, - 'add_ie': ['Youtube'], - 'params': { - # There are already too many Youtube downloads - 'skip_download': True, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - real_video_id = self._html_search_meta( - 'videoId', webpage, 'YouTube video id', fatal=True) - - return self.url_result(real_video_id, YoutubeIE.ie_key()) - - class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: (?:https?://)? (?:\w+\.)? (?: - youtube\.com/ (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) + youtube(?:kids)?\.com| + invidio\.us + ) + / + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) \? (?:.*?[&;])*? (?:p|a|list)= | p/ )| youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}) - )""" - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' + (%(playlist_id)s) + )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' + _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?' + _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { - 'title': 'ytdl test PL', - 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'title': 'youtube-dl public playlist', }, - 'playlist_count': 3, + 'playlist_count': 1, }, { - 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { - 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'title': 'YDL_Empty_List', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', }, 'playlist_count': 0, - 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'info_dict': { 'title': '29C3: Not my department', 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'uploader': 'Christiaan008', + 'uploader_id': 'ChRiStIaAn008', }, - 'playlist_count': 95, + 'playlist_count': 96, }, { 'note': 'issue #673', 'url': 'PLBB231211A4F62143', 'info_dict': { 'title': '[OLD]Team Fortress 2 (Class-based LP)', 'id': 'PLBB231211A4F62143', + 'uploader': 'Wickydoo', + 'uploader_id': 'Wickydoo', }, 'playlist_mincount': 26, }, { @@ -1907,6 +2617,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'title': 'Uploads from Cauchemar', 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', + 'uploader': 'Cauchemar', + 'uploader_id': 'Cauchemar89', }, 'playlist_mincount': 799, }, { @@ -1924,6 +2636,17 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'title': 'JODA15', 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'uploader': 'milan', + 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', + } + }, { + 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'playlist_mincount': 485, + 'info_dict': { + 'title': '2018 Chinese New Singles (11/6 updated)', + 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'uploader': 'LBK', + 'uploader_id': 'sdragonfang', } }, { 'note': 'Embedded SWF player', @@ -1932,13 +2655,16 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'title': 'JODA7', 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', - } + }, + 'skip': 'This playlist does not exist', }, { 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', 'info_dict': { 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', + 'uploader': 'Interstellar Movie', + 'uploader_id': 'InterstellarMovie1', }, 'playlist_mincount': 21, }, { @@ -1956,12 +2682,14 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', 'categories': ['People & Blogs'], 'tags': list, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, 'params': { 'skip_download': True, }, + 'skip': 'This video is not available.', 'add_ie': [YoutubeIE.ie_key()], }, { 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', @@ -1973,7 +2701,6 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'uploader_id': 'backuspagemuseum', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', 'upload_date': '20161008', - 'license': 'Standard YouTube License', 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', 'categories': ['Nonprofits & Activism'], 'tags': list, @@ -1984,17 +2711,65 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'noplaylist': True, 'skip_download': True, }, + }, { + # https://github.com/ytdl-org/youtube-dl/issues/21844 + 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'info_dict': { + 'title': 'Data Analysis with Dr Mike Pound', + 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'uploader_id': 'Computerphile', + 'uploader': 'Computerphile', + }, + 'playlist_mincount': 11, }, { 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', 'only_matching': True, }, { 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', 'only_matching': True, + }, { + # music album playlist + 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', + 'only_matching': True, + }, { + 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, }] def _real_initialize(self): self._login() + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + + for item in re.findall( + r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): + attrs = extract_attributes(item) + video_id = attrs['data-video-id'] + video_title = unescapeHTML(attrs.get('data-title')) + if video_title: + video_title = video_title.strip() + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + # Fallback with old _VIDEO_RE + self.extract_videos_from_page_impl( + self._VIDEO_RE, page, ids_in_page, titles_in_page) + + # Relaxed fallbacks + self.extract_videos_from_page_impl( + r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, + ids_in_page, titles_in_page) + self.extract_videos_from_page_impl( + r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, + ids_in_page, titles_in_page) + + return zip(ids_in_page, titles_in_page) + def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id @@ -2020,9 +2795,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) title_span = ( - search_title('playlist-title') or - search_title('title long-title') or - search_title('title')) + search_title('playlist-title') + or search_title('title long-title') + or search_title('title')) title = clean_html(title_span) return self.playlist_result(url_results, playlist_id, title) @@ -2031,7 +2806,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) - # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604) + # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): match = match.strip() # Check if the playlist exists or is private @@ -2056,6 +2831,19 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', page, 'title', default=None) + _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' + uploader = self._html_search_regex( + r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, + page, 'uploader', default=None) + mobj = re.search( + r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, + page) + if mobj: + uploader_id = mobj.group('uploader_id') + uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) + else: + uploader_id = uploader_url = None + has_videos = True if not playlist_title: @@ -2066,14 +2854,21 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): except StopIteration: has_videos = False - return has_videos, self.playlist_result( + playlist = self.playlist_result( self._entries(page, playlist_id), playlist_id, playlist_title) + playlist.update({ + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + }) + + return has_videos, playlist def _check_download_just_video(self, url, playlist_id): # Check if it's a video-specific URL query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) video_id = query_dict.get('v', [None])[0] or self._search_regex( - r'(?:^|//)youtu\.be/([0-9A-Za-z_-]{11})', url, + r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, 'video id', default=None) if video_id: if self._downloader.params.get('noplaylist'): @@ -2104,7 +2899,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): return playlist # Some playlist URLs don't actually serve a playlist (see - # https://github.com/rg3/youtube-dl/issues/10537). + # https://github.com/ytdl-org/youtube-dl/issues/10537). # Fallback to plain video extraction if there is a video id # along with playlist id. return self.url_result(video_id, 'Youtube', video_id=video_id) @@ -2112,7 +2907,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' + _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' @@ -2123,6 +2918,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', 'title': 'Uploads from lex will', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', } }, { 'note': 'Age restricted channel', @@ -2132,7 +2929,15 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'id': 'UUs0ifCMCm1icqRbqhUINa0w', 'title': 'Uploads from Deus Ex', + 'uploader': 'Deus Ex', + 'uploader_id': 'DeusExOfficial', }, + }, { + 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', + 'only_matching': True, }] @classmethod @@ -2203,7 +3008,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' IE_NAME = 'youtube:user' @@ -2213,6 +3018,8 @@ class YoutubeUserIE(YoutubeChannelIE): 'info_dict': { 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', 'title': 'Uploads from The Linux Foundation', + 'uploader': 'The Linux Foundation', + 'uploader_id': 'TheLinuxFoundation', } }, { # Only available via https://www.youtube.com/c/12minuteathlete/videos @@ -2222,6 +3029,8 @@ class YoutubeUserIE(YoutubeChannelIE): 'info_dict': { 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', 'title': 'Uploads from 12 Minute Athlete', + 'uploader': '12 Minute Athlete', + 'uploader_id': 'the12minuteathlete', } }, { 'url': 'ytuser:phihag', @@ -2233,7 +3042,7 @@ class YoutubeUserIE(YoutubeChannelIE): 'url': 'https://www.youtube.com/gametrailers', 'only_matching': True, }, { - # This channel is not available. + # This channel is not available, geo restricted to JP 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', 'only_matching': True, }] @@ -2296,10 +3105,11 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor): webpage = self._download_webpage(url, channel_id, fatal=False) if webpage: page_type = self._og_search_property( - 'type', webpage, 'page type', default=None) + 'type', webpage, 'page type', default='') video_id = self._html_search_meta( 'videoId', webpage, 'video id', default=None) - if page_type == 'video' and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id): + if page_type.startswith('video') and video_id and re.match( + r'^[0-9A-Za-z_-]{11}$', video_id): return self.url_result(video_id, YoutubeIE.ie_key()) return self.url_result(base_url) @@ -2314,7 +3124,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'playlist_mincount': 4, 'info_dict': { 'id': 'ThirstForScience', - 'title': 'Thirst for Science', + 'title': 'ThirstForScience', }, }, { # with "Load more" button @@ -2331,10 +3141,15 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', 'title': 'Chem Player', }, + 'skip': 'Blocked', }] -class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): +class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' + + +class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for # 'python' you get more than 8.000.000 results @@ -2368,8 +3183,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): raise ExtractorError( '[youtube] No video results', expected=True) - new_videos = self._ids_to_results(orderedSet(re.findall( - r'href="/watch\?v=(.{11})', html_content))) + new_videos = list(self._process_page(html_content)) videos += new_videos if not new_videos or len(videos) > limit: break @@ -2392,11 +3206,10 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} -class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): +class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -2448,10 +3261,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _real_extract(self, url): - page = self._download_webpage( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) - + def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index ids = [] @@ -2462,12 +3272,15 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): # 'recommended' feed has infinite 'load more' and each new portion spins # the same videos in (sometimes) slightly different order, so we'll check # for unicity and break when portion has no new videos - new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches)) + new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) if not new_ids: break ids.extend(new_ids) + for entry in self._ids_to_results(new_ids): + yield entry + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) if not mobj: break @@ -2475,12 +3288,17 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): more = self._download_json( 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) + transform_source=uppercase_escape, + headers=self._YOUTUBE_CLIENT_HEADERS) content_html = more['content_html'] more_widget_html = more['load_more_widget_html'] + def _real_extract(self, url): + page = self._download_webpage( + 'https://www.youtube.com/feed/%s' % self._FEED_NAME, + self._PLAYLIST_TITLE) return self.playlist_result( - self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE) + self._entries(page), playlist_title=self._PLAYLIST_TITLE) class YoutubeWatchLaterIE(YoutubePlaylistIE):