X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/46113edab215c2211a604c06245c16d5d4e57dcf..9bb07a5ee663304e100edc65967d5fc4a521bcd0:/youtube_dl/extractor/youtube.py diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6071d72..25d056b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -6,48 +6,82 @@ from __future__ import unicode_literals import itertools import json import os.path +import random import re +import time import traceback from .common import InfoExtractor, SearchInfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter -from ..utils import ( +from ..compat import ( compat_chr, + compat_HTTPError, + compat_kwargs, compat_parse_qs, - compat_urllib_parse, - compat_urllib_request, + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, + compat_urllib_parse_urlencode, + compat_urllib_parse_urlparse, compat_urlparse, compat_str, - +) +from ..utils import ( + bool_or_none, clean_html, - get_element_by_id, - get_element_by_attribute, + dict_get, + error_to_compat_str, + extract_attributes, ExtractorError, + float_or_none, + get_element_by_attribute, + get_element_by_id, int_or_none, - OnDemandPagedList, + mimetype2ext, + orderedSet, + parse_codecs, + parse_duration, + qualities, + remove_quotes, + remove_start, + smuggle_url, + str_or_none, + str_to_int, + try_get, unescapeHTML, unified_strdate, - orderedSet, + unsmuggle_url, uppercase_escape, + url_or_none, + urlencode_postdata, ) + class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor' - _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' + _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' + + _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' + _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' + _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' + _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False + _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' + def _set_language(self): - return bool(self._download_webpage( - self._LANG_URL, None, - note='Setting language', errnote='unable to set language', - fatal=False)) + self._set_cookie( + '.youtube.com', 'PREF', 'f1=50000000&hl=en', + # YouTube sets the expire time to about two months + expire_time=time.time() + 2 * 30 * 24 * 3600) + + def _ids_to_results(self, ids): + return [ + self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] def _login(self): """ @@ -57,10 +91,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. """ - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if username is None: - if self._LOGIN_REQUIRED: + if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) return True @@ -71,135 +105,270 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if login_page is False: return - galx = self._search_regex(r'(?s)]* id="gaia_secondfactorform"', login_results) is not None: - tfa_code = self._get_tfa_info() + challenge_results = req( + self._CHALLENGE_URL, challenge_req, + 'Logging in', 'Unable to log in') - if tfa_code is None: - self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor ') - self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False + if challenge_results is False: + return - # Unlike the first login form, secTok and timeStmp are both required for the TFA form - - match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U) - if match is None: - self._downloader.report_warning('Failed to get secTok - did the page structure change?') - secTok = match.group(1) - match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U) - if match is None: - self._downloader.report_warning('Failed to get timeStmp - did the page structure change?') - timeStmp = match.group(1) - - tfa_form_strs = { - 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1', - 'smsToken': '', - 'smsUserPin': tfa_code, - 'smsVerifyPin': 'Verify', - - 'PersistentCookie': 'yes', - 'checkConnection': '', - 'checkedDomains': 'youtube', - 'pstMsg': '1', - 'secTok': secTok, - 'timeStmp': timeStmp, - 'service': 'youtube', - 'hl': 'en_US', - } - tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in tfa_form_strs.items()) - tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii') + login_res = try_get(challenge_results, lambda x: x[0][5], list) + if login_res: + login_msg = try_get(login_res, lambda x: x[5], compat_str) + warn( + 'Unable to login: %s' % 'Invalid password' + if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) + return False - tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data) - tfa_results = self._download_webpage( - tfa_req, None, - note='Submitting TFA code', errnote='unable to submit tfa', fatal=False) + res = try_get(challenge_results, lambda x: x[0][-1], list) + if not res: + warn('Unable to extract result entry') + return False - if tfa_results is False: + login_challenge = try_get(res, lambda x: x[0][0], list) + if login_challenge: + challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) + if challenge_str == 'TWO_STEP_VERIFICATION': + # SEND_SUCCESS - TFA code has been successfully sent to phone + # QUOTA_EXCEEDED - reached the limit of TFA codes + status = try_get(login_challenge, lambda x: x[5], compat_str) + if status == 'QUOTA_EXCEEDED': + warn('Exceeded the limit of TFA codes, try later') + return False + + tl = try_get(challenge_results, lambda x: x[1][2], compat_str) + if not tl: + warn('Unable to extract TL') + return False + + tfa_code = self._get_tfa_info('2-step verification code') + + if not tfa_code: + warn( + 'Two-factor authentication required. Provide it either interactively or with --twofactor ' + '(Note that only TOTP (Google Authenticator App) codes work at this time.)') + return False + + tfa_code = remove_start(tfa_code, 'G-') + + tfa_req = [ + user_hash, None, 2, None, + [ + 9, None, None, None, None, None, None, None, + [None, tfa_code, True, 2] + ]] + + tfa_results = req( + self._TFA_URL.format(tl), tfa_req, + 'Submitting TFA code', 'Unable to submit TFA code') + + if tfa_results is False: + return False + + tfa_res = try_get(tfa_results, lambda x: x[0][5], list) + if tfa_res: + tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) + warn( + 'Unable to finish TFA: %s' % 'Invalid TFA code' + if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) + return False + + check_cookie_url = try_get( + tfa_results, lambda x: x[0][-1][2], compat_str) + else: + CHALLENGES = { + 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", + 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', + 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", + } + challenge = CHALLENGES.get( + challenge_str, + '%s returned error %s.' % (self.IE_NAME, challenge_str)) + warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) return False + else: + check_cookie_url = try_get(res, lambda x: x[2], compat_str) - if re.search(r'(?i)]* id="gaia_secondfactorform"', tfa_results) is not None: - self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.') - return False - if re.search(r'(?i)]* id="gaia_loginform"', tfa_results) is not None: - self._downloader.report_warning('unable to log in - did the page structure change?') - return False - if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: - self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.') - return False + if not check_cookie_url: + warn('Unable to extract CheckCookie URL') + return False + + check_cookie_results = self._download_webpage( + check_cookie_url, None, 'Checking cookie', fatal=False) - if re.search(r'(?i)]* id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning('unable to log in: bad username or password') + if check_cookie_results is False: return False - return True - def _confirm_age(self): - age_form = { - 'next_url': '/', - 'action_confirm': 'Confirm', - } - req = compat_urllib_request.Request(self._AGE_URL, - compat_urllib_parse.urlencode(age_form).encode('ascii')) + if 'https://myaccount.google.com/' not in check_cookie_results: + warn('Unable to log in') + return False - self._download_webpage( - req, None, - note='Confirming age', errnote='Unable to confirm age', - fatal=False) + return True + + def _download_webpage_handle(self, *args, **kwargs): + query = kwargs.get('query', {}).copy() + query['disable_polymer'] = 'true' + kwargs['query'] = query + return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( + *args, **compat_kwargs(kwargs)) def _real_initialize(self): if self._downloader is None: return - if self._get_login_info()[0] is not None: - if not self._set_language(): - return + self._set_language() if not self._login(): return - self._confirm_age() -class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): +class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): + # Extract entries from page with "Load more" button + def _entries(self, page, playlist_id): + more_widget_html = content_html = page + for page_num in itertools.count(1): + for entry in self._process_page(content_html): + yield entry + + mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) + if not mobj: + break + + count = 0 + retries = 3 + while count <= retries: + try: + # Downloading page may result in intermittent 5xx HTTP error + # that is usually worked around with a retry + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s%s' + % (page_num, ' (retry #%d)' % count if count else ''), + transform_source=uppercase_escape) + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): + count += 1 + if count <= retries: + continue + raise + + content_html = more['content_html'] + if not content_html.strip(): + # Some webpages show a "Load more" button but they don't + # have more videos + break + more_widget_html = more['load_more_widget_html'] + + +class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): + def _process_page(self, content): + for video_id, video_title in self.extract_videos_from_page(content): + yield self.url_result(video_id, 'Youtube', video_id, video_title) + + def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): + for mobj in re.finditer(video_re, page): + # The link with index 0 is not the first video of the playlist (not sure if still actual) + if 'index' in mobj.groupdict() and mobj.group('id') == '0': + continue + video_id = mobj.group('id') + video_title = unescapeHTML( + mobj.group('title')) if 'title' in mobj.groupdict() else None + if video_title: + video_title = video_title.strip() + if video_title == '► Play all': + video_title = None + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + self.extract_videos_from_page_impl( + self._VIDEO_RE, page, ids_in_page, titles_in_page) + return zip(ids_in_page, titles_in_page) + + +class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): + def _process_page(self, content): + for playlist_id in orderedSet(re.findall( + r']+class="[^"]*yt-lockup-title[^"]*"[^>]*>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', + content)): + yield self.url_result( + 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + title = self._og_search_title(webpage, fatal=False) + return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title) + + +class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ ( @@ -207,8 +376,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| (?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?pwnyoutube\.com/| + (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| + # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances + (?:(?:www|dev)\.)?invidio\.us/| + (?:(?:www|no)\.)?invidiou\.sh/| + (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| + (?:www\.)?invidious\.kabi\.tk/| + (?:www\.)?invidious\.enkirton\.net/| + (?:www\.)?invidious\.13ad\.de/| + (?:www\.)?invidious\.mastodon\.host/| + (?:www\.)?invidious\.nixnet\.xyz/| + (?:www\.)?tube\.poal\.co/| + (?:www\.)?vid\.wxzm\.sx/| + (?:www\.)?yt\.elukerio\.org/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -216,131 +398,182 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) + (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) v= ) )) - |youtu\.be/ # just youtu.be/xxxx + |(?: + youtu\.be| # just youtu.be/xxxx + vid\.plus| # or vid.plus/xxxx + zwearz\.com/watch| # or zwearz.com/watch/xxxx + )/ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?&list=) # combined list/video URLs are handled by the playlist IE + (?!.*?\blist= + (?: + %(playlist_id)s| # combined list/video URLs are handled by the playlist IE + WL # WL are handled by the watch later IE + ) + ) (?(1).+)? # if we found the ID, everything can follow - $""" + $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240}, - '6': {'ext': 'flv', 'width': 450, 'height': 270}, - '13': {'ext': '3gp'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720}, - '34': {'ext': 'flv', 'width': 640, 'height': 360}, - '35': {'ext': 'flv', 'width': 854, 'height': 480}, - '36': {'ext': '3gp', 'width': 320, 'height': 240}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072}, - '43': {'ext': 'webm', 'width': 640, 'height': 360}, - '44': {'ext': 'webm', 'width': 854, 'height': 480}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080}, - - - # 3d videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20}, + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + + + # 3D videos + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, # Apple HTTP Live Streaming - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10}, + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50}, + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, # Dash webm audio - '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, - '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, - # Dash mov - '298': {'ext': 'mov', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'}, - '299': {'ext': 'mov', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'}, - '266': {'ext': 'mov', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'}, + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, } + _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') + + _GEO_BYPASS = False IE_NAME = 'youtube' _TESTS = [ { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc', + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], + 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, + 'start_time': 1, + 'end_time': 9, } }, { - 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY', + 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY', 'note': 'Test generic use_cipher_signature video (#897)', 'info_dict': { 'id': 'UxxajLWwzqY', 'ext': 'mp4', 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', - 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f', + 'alt_title': 'I Love It (feat. Charli XCX)', + 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', + 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', + 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', + 'iconic ep', 'iconic', 'love', 'it'], + 'duration': 180, 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', + 'creator': 'Icona Pop', + 'track': 'I Love It (feat. Charli XCX)', + 'artist': 'Icona Pop', } }, { @@ -350,10 +583,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'id': '07FYdnEawAQ', 'ext': 'mp4', 'upload_date': '20130703', - 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', - 'description': 'md5:64249768eec3bc4276236606ea996373', + 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', + 'alt_title': 'Tunnel Vision', + 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', + 'duration': 419, 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', + 'creator': 'Justin Timberlake', + 'track': 'Tunnel Vision', + 'artist': 'Justin Timberlake', + 'age_limit': 18, } }, { @@ -366,17 +606,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', 'uploader': 'SET India', - 'uploader_id': 'setindia' + 'uploader_id': 'setindia', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', + 'age_limit': 18, } }, { - 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I', + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', + 'note': 'Use the first video ID in the URL', + 'info_dict': { + 'id': 'BaW_jenozKc', + 'ext': 'mp4', + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'uploader': 'Philipp Hagemeister', + 'uploader_id': 'phihag', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'upload_date': '20121002', + 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', + 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], + 'duration': 10, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', 'note': '256k DASH audio (format 141) via DASH manifest', 'info_dict': { 'id': 'a9LDPn-MO4I', 'ext': 'm4a', 'upload_date': '20121002', 'uploader_id': '8KVIDEO', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', 'uploader': '8KVIDEO', 'title': 'UHDTV TEST 8K VIDEO.mp4' @@ -385,6 +651,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'youtube_include_dash_manifest': True, 'format': '141', }, + 'skip': 'format 141 not served anymore', }, # DASH manifest with encrypted signature { @@ -392,239 +659,944 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'info_dict': { 'id': 'IB3lcPjvWLA', 'ext': 'm4a', - 'title': 'Afrojack - The Spark ft. Spree Wilson', - 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8', + 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', + 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', + 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', }, 'params': { 'youtube_include_dash_manifest': True, - 'format': '141', + 'format': '141/bestaudio[ext=m4a]', }, }, - ] - - def __init__(self, *args, **kwargs): - super(YoutubeIE, self).__init__(*args, **kwargs) - self._player_cache = {} - - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen('%s: Downloading video info webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen('%s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self.to_screen('%s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self.to_screen('RTMP download detected') - - def _signature_cache_id(self, example_sig): - """ Return a string representation of a signature """ - return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) - - def _extract_signature_function(self, video_id, player_url, example_sig): - id_m = re.match( - r'.*-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P[a-z]+)$', - player_url) - if not id_m: - raise ExtractorError('Cannot identify player %r' % player_url) - player_type = id_m.group('ext') - player_id = id_m.group('id') - - # Read from filesystem cache - func_id = '%s_%s_%s' % ( - player_type, player_id, self._signature_cache_id(example_sig)) - assert os.path.basename(func_id) == func_id - - cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) - if cache_spec is not None: - return lambda s: ''.join(s[i] for i in cache_spec) - - if player_type == 'js': - code = self._download_webpage( - player_url, video_id, - note='Downloading %s player %s' % (player_type, player_id), - errnote='Download of %s failed' % player_url) - res = self._parse_sig_js(code) - elif player_type == 'swf': - urlh = self._request_webpage( - player_url, video_id, - note='Downloading %s player %s' % (player_type, player_id), - errnote='Download of %s failed' % player_url) - code = urlh.read() - res = self._parse_sig_swf(code) - else: - assert False, 'Invalid player type %r' % player_type - - if cache_spec is None: - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] - - self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) - return res - - def _print_sig_code(self, func, example_sig): - def gen_sig_code(idxs): - def _genslice(start, end, step): - starts = '' if start == 0 else str(start) - ends = (':%d' % (end+step)) if end + step >= 0 else ':' - steps = '' if step == 1 else (':%d' % step) - return 's[%s%s%s]' % (starts, ends, steps) - - step = None - start = '(Never used)' # Quelch pyflakes warnings - start will be - # set as soon as step is set - for i, prev in zip(idxs[1:], idxs[:-1]): - if step is not None: - if i - prev == step: - continue - yield _genslice(start, prev, step) - step = None - continue - if i - prev in [-1, 1]: - step = i - prev - start = prev - continue - else: - yield 's[%d]' % prev - if step is None: - yield 's[%d]' % i - else: - yield _genslice(start, i, step) - - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = func(test_string) - cache_spec = [ord(c) for c in cache_res] - expr_code = ' + '.join(gen_sig_code(cache_spec)) - signature_id_tuple = '(%s)' % ( - ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) - code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' - ' return %s\n') % (signature_id_tuple, expr_code) - self.to_screen('Extracted signature function:\n' + code) - - def _parse_sig_js(self, jscode): - funcname = self._search_regex( - r'signature=([$a-zA-Z]+)', jscode, - 'Initial JS player signature function name') - - jsi = JSInterpreter(jscode) - initial_function = jsi.extract_function(funcname) - return lambda s: initial_function([s]) - - def _parse_sig_swf(self, file_contents): - swfi = SWFInterpreter(file_contents) - TARGET_CLASSNAME = 'SignatureDecipher' - searched_class = swfi.extract_class(TARGET_CLASSNAME) - initial_function = swfi.extract_function(searched_class, 'decipher') - return lambda s: initial_function([s]) - - def _decrypt_signature(self, s, video_id, player_url, age_gate=False): - """Turn the encrypted s field into a working signature""" - - if player_url is None: - raise ExtractorError('Cannot decrypt signature without player_url') - - if player_url.startswith('//'): - player_url = 'https:' + player_url - try: - player_id = (player_url, self._signature_cache_id(s)) - if player_id not in self._player_cache: - func = self._extract_signature_function( - video_id, player_url, s - ) - self._player_cache[player_id] = func - func = self._player_cache[player_id] - if self._downloader.params.get('youtube_print_sig_code'): - self._print_sig_code(func, s) - return func(s) - except Exception as e: - tb = traceback.format_exc() - raise ExtractorError( - 'Signature extraction failed: ' + tb, cause=e) - - def _get_available_subtitles(self, video_id, webpage): - try: - sub_list = self._download_webpage( - 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, - video_id, note=False) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) - return {} - lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) - - sub_lang_list = {} - for l in lang_list: - lang = l[1] - if lang in sub_lang_list: - continue - params = compat_urllib_parse.urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), - 'name': unescapeHTML(l[0]).encode('utf-8'), - }) - url = 'https://www.youtube.com/api/timedtext?' + params - sub_lang_list[lang] = url - if not sub_lang_list: - self._downloader.report_warning('video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _get_available_automatic_caption(self, video_id, webpage): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - sub_format = self._downloader.params.get('subtitlesformat', 'srt') - self.to_screen('%s: Looking for automatic captions' % video_id) - mobj = re.search(r';ytplayer.config = ({.*?});', webpage) - err_msg = 'Couldn\'t find automatic captions for %s' % video_id - if mobj is None: - self._downloader.report_warning(err_msg) - return {} - player_config = json.loads(mobj.group(1)) - try: - args = player_config[u'args'] - caption_url = args[u'ttsurl'] - timestamp = args[u'timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse.urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' : - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - - sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] - params = compat_urllib_parse.urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': 'asr', - }) - sub_lang_list[sub_lang] = caption_url + '&' + params - return sub_lang_list - # An extractor error can be raise by the download process if there are + # JS player signature function name containing $ + { + 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM', + 'info_dict': { + 'id': 'nfWlot6h_JM', + 'ext': 'm4a', + 'title': 'Taylor Swift - Shake It Off', + 'description': 'md5:bec2185232c05479482cb5a9b82719bf', + 'duration': 242, + 'uploader': 'TaylorSwiftVEVO', + 'uploader_id': 'TaylorSwiftVEVO', + 'upload_date': '20140818', + 'creator': 'Taylor Swift', + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141/bestaudio[ext=m4a]', + }, + }, + # Controversy video + { + 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', + 'info_dict': { + 'id': 'T4XJQO3qol8', + 'ext': 'mp4', + 'duration': 219, + 'upload_date': '20100909', + 'uploader': 'Amazing Atheist', + 'uploader_id': 'TheAmazingAtheist', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', + 'title': 'Burning Everyone\'s Koran', + 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', + } + }, + # Normal age-gate video (No vevo, embed allowed) + { + 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', + 'info_dict': { + 'id': 'HtVdAasjOgU', + 'ext': 'mp4', + 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', + 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', + 'duration': 142, + 'uploader': 'The Witcher', + 'uploader_id': 'WitcherGame', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', + 'upload_date': '20140605', + 'age_limit': 18, + }, + }, + # Age-gate video with encrypted signature + { + 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', + 'info_dict': { + 'id': '6kLq3WMV1nU', + 'ext': 'mp4', + 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', + 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', + 'duration': 246, + 'uploader': 'LloydVEVO', + 'uploader_id': 'LloydVEVO', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', + 'upload_date': '20110629', + 'age_limit': 18, + }, + }, + # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) + # YouTube Red ad is not captured for creator + { + 'url': '__2ABJjxzNo', + 'info_dict': { + 'id': '__2ABJjxzNo', + 'ext': 'mp4', + 'duration': 266, + 'upload_date': '20100430', + 'uploader_id': 'deadmau5', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', + 'creator': 'deadmau5', + 'description': 'md5:12c56784b8032162bb936a5f76d55360', + 'uploader': 'deadmau5', + 'title': 'Deadmau5 - Some Chords (HD)', + 'alt_title': 'Some Chords', + }, + 'expected_warnings': [ + 'DASH manifest missing', + ] + }, + # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) + { + 'url': 'lqQg6PlCWgI', + 'info_dict': { + 'id': 'lqQg6PlCWgI', + 'ext': 'mp4', + 'duration': 6085, + 'upload_date': '20150827', + 'uploader_id': 'olympic', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', + 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', + 'uploader': 'Olympic', + 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', + }, + 'params': { + 'skip_download': 'requires avconv', + } + }, + # Non-square pixels + { + 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', + 'info_dict': { + 'id': '_b-2C3KPAM0', + 'ext': 'mp4', + 'stretched_ratio': 16 / 9., + 'duration': 85, + 'upload_date': '20110310', + 'uploader_id': 'AllenMeow', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', + 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', + 'uploader': '孫ᄋᄅ', + 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', + }, + }, + # url_encoded_fmt_stream_map is empty string + { + 'url': 'qEJwOuvDf7I', + 'info_dict': { + 'id': 'qEJwOuvDf7I', + 'ext': 'webm', + 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', + 'description': '', + 'upload_date': '20150404', + 'uploader_id': 'spbelect', + 'uploader': 'Наблюдатели Петербурга', + }, + 'params': { + 'skip_download': 'requires avconv', + }, + 'skip': 'This live event has ended.', + }, + # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) + { + 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', + 'info_dict': { + 'id': 'FIl7x6_3R5Y', + 'ext': 'webm', + 'title': 'md5:7b81415841e02ecd4313668cde88737a', + 'description': 'md5:116377fd2963b81ec4ce64b542173306', + 'duration': 220, + 'upload_date': '20150625', + 'uploader_id': 'dorappi2000', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', + 'uploader': 'dorappi2000', + 'formats': 'mincount:31', + }, + 'skip': 'not actual anymore', + }, + # DASH manifest with segment_list + { + 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', + 'md5': '8ce563a1d667b599d21064e982ab9e31', + 'info_dict': { + 'id': 'CsmdDsKjzN8', + 'ext': 'mp4', + 'upload_date': '20150501', # According to '[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', + player_url) + if not id_m: + raise ExtractorError('Cannot identify player %r' % player_url) + player_type = id_m.group('ext') + player_id = id_m.group('id') + + # Read from filesystem cache + func_id = '%s_%s_%s' % ( + player_type, player_id, self._signature_cache_id(example_sig)) + assert os.path.basename(func_id) == func_id + + cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) + if cache_spec is not None: + return lambda s: ''.join(s[i] for i in cache_spec) + + download_note = ( + 'Downloading player %s' % player_url + if self._downloader.params.get('verbose') else + 'Downloading %s player %s' % (player_type, player_id) + ) + if player_type == 'js': + code = self._download_webpage( + player_url, video_id, + note=download_note, + errnote='Download of %s failed' % player_url) + res = self._parse_sig_js(code) + elif player_type == 'swf': + urlh = self._request_webpage( + player_url, video_id, + note=download_note, + errnote='Download of %s failed' % player_url) + code = urlh.read() + res = self._parse_sig_swf(code) + else: + assert False, 'Invalid player type %r' % player_type + + test_string = ''.join(map(compat_chr, range(len(example_sig)))) + cache_res = res(test_string) + cache_spec = [ord(c) for c in cache_res] + + self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) + return res + + def _print_sig_code(self, func, example_sig): + def gen_sig_code(idxs): + def _genslice(start, end, step): + starts = '' if start == 0 else str(start) + ends = (':%d' % (end + step)) if end + step >= 0 else ':' + steps = '' if step == 1 else (':%d' % step) + return 's[%s%s%s]' % (starts, ends, steps) + + step = None + # Quelch pyflakes warnings - start will be set when step is set + start = '(Never used)' + for i, prev in zip(idxs[1:], idxs[:-1]): + if step is not None: + if i - prev == step: + continue + yield _genslice(start, prev, step) + step = None + continue + if i - prev in [-1, 1]: + step = i - prev + start = prev + continue + else: + yield 's[%d]' % prev + if step is None: + yield 's[%d]' % i + else: + yield _genslice(start, i, step) + + test_string = ''.join(map(compat_chr, range(len(example_sig)))) + cache_res = func(test_string) + cache_spec = [ord(c) for c in cache_res] + expr_code = ' + '.join(gen_sig_code(cache_spec)) + signature_id_tuple = '(%s)' % ( + ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) + code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' + ' return %s\n') % (signature_id_tuple, expr_code) + self.to_screen('Extracted signature function:\n' + code) + + def _parse_sig_js(self, jscode): + funcname = self._search_regex( + (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + # Obsolete patterns + r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\.sig\|\|(?P[a-zA-Z0-9$]+)\(', + r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(', + r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), + jscode, 'Initial JS player signature function name', group='sig') + + jsi = JSInterpreter(jscode) + initial_function = jsi.extract_function(funcname) + return lambda s: initial_function([s]) + + def _parse_sig_swf(self, file_contents): + swfi = SWFInterpreter(file_contents) + TARGET_CLASSNAME = 'SignatureDecipher' + searched_class = swfi.extract_class(TARGET_CLASSNAME) + initial_function = swfi.extract_function(searched_class, 'decipher') + return lambda s: initial_function([s]) + + def _decrypt_signature(self, s, video_id, player_url, age_gate=False): + """Turn the encrypted s field into a working signature""" + + if player_url is None: + raise ExtractorError('Cannot decrypt signature without player_url') + + if player_url.startswith('//'): + player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urlparse.urljoin( + 'https://www.youtube.com', player_url) + try: + player_id = (player_url, self._signature_cache_id(s)) + if player_id not in self._player_cache: + func = self._extract_signature_function( + video_id, player_url, s + ) + self._player_cache[player_id] = func + func = self._player_cache[player_id] + if self._downloader.params.get('youtube_print_sig_code'): + self._print_sig_code(func, s) + return func(s) + except Exception as e: + tb = traceback.format_exc() + raise ExtractorError( + 'Signature extraction failed: ' + tb, cause=e) + + def _get_subtitles(self, video_id, webpage): + try: + subs_doc = self._download_xml( + 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, + video_id, note=False) + except ExtractorError as err: + self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) + return {} + + sub_lang_list = {} + for track in subs_doc.findall('track'): + lang = track.attrib['lang_code'] + if lang in sub_lang_list: + continue + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + params = compat_urllib_parse_urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': ext, + 'name': track.attrib['name'].encode('utf-8'), + }) + sub_formats.append({ + 'url': 'https://www.youtube.com/api/timedtext?' + params, + 'ext': ext, + }) + sub_lang_list[lang] = sub_formats + if not sub_lang_list: + self._downloader.report_warning('video doesn\'t have subtitles') + return {} + return sub_lang_list + + def _get_ytplayer_config(self, video_id, webpage): + patterns = ( + # User data may contain arbitrary character sequences that may affect + # JSON extraction with regex, e.g. when '};' is contained the second + # regex won't capture the whole JSON. Yet working around by trying more + # concrete regex first keeping in mind proper quoted string handling + # to be implemented in future that will replace this workaround (see + # https://github.com/ytdl-org/youtube-dl/issues/7468, + # https://github.com/ytdl-org/youtube-dl/pull/7599) + r';ytplayer\.config\s*=\s*({.+?});ytplayer', + r';ytplayer\.config\s*=\s*({.+?});', + ) + config = self._search_regex( + patterns, webpage, 'ytplayer.config', default=None) + if config: + return self._parse_json( + uppercase_escape(config), video_id, fatal=False) + + def _get_automatic_captions(self, video_id, webpage): + """We need the webpage for getting the captions url, pass it as an + argument to speed up the process.""" + self.to_screen('%s: Looking for automatic captions' % video_id) + player_config = self._get_ytplayer_config(video_id, webpage) + err_msg = 'Couldn\'t find automatic captions for %s' % video_id + if not player_config: + self._downloader.report_warning(err_msg) + return {} + try: + args = player_config['args'] + caption_url = args.get('ttsurl') + if caption_url: + timestamp = args['timestamp'] + # We get the available subtitles + list_params = compat_urllib_parse_urlencode({ + 'type': 'list', + 'tlangs': 1, + 'asrs': 1, + }) + list_url = caption_url + '&' + list_params + caption_list = self._download_xml(list_url, video_id) + original_lang_node = caption_list.find('track') + if original_lang_node is None: + self._downloader.report_warning('Video doesn\'t have automatic captions') + return {} + original_lang = original_lang_node.attrib['lang_code'] + caption_kind = original_lang_node.attrib.get('kind', '') + + sub_lang_list = {} + for lang_node in caption_list.findall('target'): + sub_lang = lang_node.attrib['lang_code'] + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + params = compat_urllib_parse_urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': ext, + 'ts': timestamp, + 'kind': caption_kind, + }) + sub_formats.append({ + 'url': caption_url + '&' + params, + 'ext': ext, + }) + sub_lang_list[sub_lang] = sub_formats + return sub_lang_list + + def make_captions(sub_url, sub_langs): + parsed_sub_url = compat_urllib_parse_urlparse(sub_url) + caption_qs = compat_parse_qs(parsed_sub_url.query) + captions = {} + for sub_lang in sub_langs: + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + caption_qs.update({ + 'tlang': [sub_lang], + 'fmt': [ext], + }) + sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( + query=compat_urllib_parse_urlencode(caption_qs, True))) + sub_formats.append({ + 'url': sub_url, + 'ext': ext, + }) + captions[sub_lang] = sub_formats + return captions + + # New captions format as of 22.06.2017 + player_response = args.get('player_response') + if player_response and isinstance(player_response, compat_str): + player_response = self._parse_json( + player_response, video_id, fatal=False) + if player_response: + renderer = player_response['captions']['playerCaptionsTracklistRenderer'] + base_url = renderer['captionTracks'][0]['baseUrl'] + sub_lang_list = [] + for lang in renderer['translationLanguages']: + lang_code = lang.get('languageCode') + if lang_code: + sub_lang_list.append(lang_code) + return make_captions(base_url, sub_lang_list) + + # Some videos don't provide ttsurl but rather caption_tracks and + # caption_translation_languages (e.g. 20LmZk1hakA) + # Does not used anymore as of 22.06.2017 + caption_tracks = args['caption_tracks'] + caption_translation_languages = args['caption_translation_languages'] + caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] + sub_lang_list = [] + for lang in caption_translation_languages.split(','): + lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) + sub_lang = lang_qs.get('lc', [None])[0] + if sub_lang: + sub_lang_list.append(sub_lang) + return make_captions(caption_url, sub_lang_list) + # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles - except (KeyError, ExtractorError): + except (KeyError, IndexError, ExtractorError): self._downloader.report_warning(err_msg) return {} + def _mark_watched(self, video_id, video_info, player_response): + playback_url = url_or_none(try_get( + player_response, + lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( + video_info, lambda x: x['videostats_playback_base_url'][0])) + if not playback_url: + return + parsed_playback_url = compat_urlparse.urlparse(playback_url) + qs = compat_urlparse.parse_qs(parsed_playback_url.query) + + # cpn generation algorithm is reverse engineered from base.js. + # In fact it works even with dummy cpn. + CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' + cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) + + qs.update({ + 'ver': ['2'], + 'cpn': [cpn], + }) + playback_url = compat_urlparse.urlunparse( + parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + + self._download_webpage( + playback_url, video_id, 'Marking watched', + 'Unable to mark watched', fatal=False) + + @staticmethod + def _extract_urls(webpage): + # Embedded YouTube player + entries = [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer(r'''(?x) + (?: + ]+?src=| + data-video-url=| + ]+?src=| + embedSWF\(?:\s*| + ]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) + \1''', webpage)] + + # lazyYT YouTube embed + entries.extend(list(map( + unescapeHTML, + re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + + # Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)]+ + class=(?P[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P[\'"])([^\'"]+)(?P=q2)''', webpage) + entries.extend(m[-1] for m in matches) + + return entries + + @staticmethod + def _extract_url(webpage): + urls = YoutubeIE._extract_urls(webpage) + return urls[0] if urls else None + @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) @@ -633,47 +1605,68 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_id = mobj.group(2) return video_id - def _extract_from_m3u8(self, manifest_url, video_id): - url_map = {} - def _get_urls(_manifest): - lines = _manifest.split('\n') - urls = filter(lambda l: l and not l.startswith('#'), - lines) - return urls - manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest') - formats_urls = _get_urls(manifest) - for format_url in formats_urls: - itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag') - url_map[itag] = format_url - return url_map - - def _extract_annotations(self, video_id): - url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id - return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') + @staticmethod + def _extract_chapters(description, duration): + if not description: + return None + chapter_lines = re.findall( + r'(?:^|)([^<]*]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)[^>]*)(?=$|)', + description) + if not chapter_lines: + return None + chapters = [] + for next_num, (chapter_line, time_point) in enumerate( + chapter_lines, start=1): + start_time = parse_duration(time_point) + if start_time is None: + continue + if start_time > duration: + break + end_time = (duration if next_num == len(chapter_lines) + else parse_duration(chapter_lines[next_num][1])) + if end_time is None: + continue + if end_time > duration: + end_time = duration + if start_time > end_time: + break + chapter_title = re.sub( + r']+>[^<]+', '', chapter_line).strip(' \t-') + chapter_title = re.sub(r'\s+', ' ', chapter_title) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': chapter_title, + }) + return chapters def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + proto = ( 'http' if self._downloader.params.get('prefer_insecure', False) else 'https') + start_time = None + end_time = None + parsed_url = compat_urllib_parse_urlparse(url) + for component in [parsed_url.fragment, parsed_url.query]: + query = compat_parse_qs(component) + if start_time is None and 't' in query: + start_time = parse_duration(query['t'][0]) + if start_time is None and 'start' in query: + start_time = parse_duration(query['start'][0]) + if end_time is None and 'end' in query: + end_time = parse_duration(query['end'][0]) + # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: - url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') + url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') video_id = self.extract_id(url) # Get video webpage - url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id - pref_cookies = [ - c for c in self._downloader.cookiejar - if c.domain == '.youtube.com' and c.name == 'PREF'] - for pc in pref_cookies: - if 'hl=' in pc.value: - pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value) - else: - if pc.value: - pc.value += '&' - pc.value += 'hl=en' + url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL @@ -683,197 +1676,238 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: player_url = None + dash_mpds = [] + + def add_dash_mpd(video_info): + dash_mpd = video_info.get('dashmpd') + if dash_mpd and dash_mpd[0] not in dash_mpds: + dash_mpds.append(dash_mpd[0]) + + def add_dash_mpd_pr(pl_response): + dash_mpd = url_or_none(try_get( + pl_response, lambda x: x['streamingData']['dashManifestUrl'], + compat_str)) + if dash_mpd and dash_mpd not in dash_mpds: + dash_mpds.append(dash_mpd) + + is_live = None + view_count = None + + def extract_view_count(v_info): + return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + + def extract_token(v_info): + return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token')) + + def extract_player_response(player_response, video_id): + pl_response = str_or_none(player_response) + if not pl_response: + return + pl_response = self._parse_json(pl_response, video_id, fatal=False) + if isinstance(pl_response, dict): + add_dash_mpd_pr(pl_response) + return pl_response + + player_response = {} + # Get video info - self.report_video_info_webpage_download(video_id) + embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: - self.report_age_confirmation() age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube - data = compat_urllib_parse.urlencode({ + url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') + data = compat_urllib_parse_urlencode({ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'), + r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage(video_info_url, video_id, - note=False, - errnote='unable to download video info webpage') + video_info_webpage = self._download_webpage( + video_info_url, video_id, + note='Refetching age-gated info webpage', + errnote='unable to download video info webpage') video_info = compat_parse_qs(video_info_webpage) + pl_response = video_info.get('player_response', [None])[0] + player_response = extract_player_response(pl_response, video_id) + add_dash_mpd(video_info) + view_count = extract_view_count(video_info) else: age_gate = False - for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (video_id, el_type)) - video_info_webpage = self._download_webpage(video_info_url, video_id, - note=False, - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - if 'token' in video_info: - break - if 'token' not in video_info: - if 'reason' in video_info: - raise ExtractorError( - 'YouTube said: %s' % video_info['reason'][0], - expected=True, video_id=video_id) - else: - raise ExtractorError( - '"token" parameter not in video info for unknown reason', - video_id=video_id) - - if 'view_count' in video_info: - view_count = int(video_info['view_count'][0]) - else: - view_count = None - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported') - - # Start extracting information - self.report_information_extraction(video_id) - - # uploader - if 'author' not in video_info: - raise ExtractorError('Unable to extract uploader name') - video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) + video_info = None + sts = None + # Try looking directly into the video webpage + ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) + if ytplayer_config: + args = ytplayer_config['args'] + if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): + # Convert to the same format returned by compat_parse_qs + video_info = dict((k, [v]) for k, v in args.items()) + add_dash_mpd(video_info) + # Rental video is not rented but preview is available (e.g. + # https://www.youtube.com/watch?v=yYr8q0y5Jfg, + # https://github.com/ytdl-org/youtube-dl/issues/10532) + if not video_info and args.get('ypc_vid'): + return self.url_result( + args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) + if args.get('livestream') == '1' or args.get('live_playback') == 1: + is_live = True + sts = ytplayer_config.get('sts') + if not player_response: + player_response = extract_player_response(args.get('player_response'), video_id) + if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): + add_dash_mpd_pr(player_response) + # We also try looking in get_video_info since it may contain different dashmpd + # URL that points to a DASH manifest with possibly different itag set (some itags + # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH + # manifest pointed by get_video_info's dashmpd). + # The general idea is to take a union of itags of both DASH manifests (for example + # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093) + self.report_video_info_webpage_download(video_id) + for el in ('embedded', 'detailpage', 'vevo', ''): + query = { + 'video_id': video_id, + 'ps': 'default', + 'eurl': '', + 'gl': 'US', + 'hl': 'en', + } + if el: + query['el'] = el + if sts: + query['sts'] = sts + video_info_webpage = self._download_webpage( + '%s://www.youtube.com/get_video_info' % proto, + video_id, note=False, + errnote='unable to download video info webpage', + fatal=False, query=query) + if not video_info_webpage: + continue + get_video_info = compat_parse_qs(video_info_webpage) + if not player_response: + pl_response = get_video_info.get('player_response', [None])[0] + player_response = extract_player_response(pl_response, video_id) + add_dash_mpd(get_video_info) + if view_count is None: + view_count = extract_view_count(get_video_info) + if not video_info: + video_info = get_video_info + get_token = extract_token(get_video_info) + if get_token: + # Different get_video_info requests may report different results, e.g. + # some may report video unavailability, but some may serve it without + # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362, + # the original webpage as well as el=info and el=embedded get_video_info + # requests report video unavailability due to geo restriction while + # el=detailpage succeeds and returns valid data). This is probably + # due to YouTube measures against IP ranges of hosting providers. + # Working around by preferring the first succeeded video_info containing + # the token if no such video_info yet was found. + token = extract_token(video_info) + if not token: + video_info = get_video_info + break + + def extract_unavailable_message(): + messages = [] + for tag, kind in (('h1', 'message'), ('div', 'submessage')): + msg = self._html_search_regex( + r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)'.format(tag=tag, kind=kind), + video_webpage, 'unavailable %s' % kind, default=None) + if msg: + messages.append(msg) + if messages: + return '\n'.join(messages) + + if not video_info: + unavailable_message = extract_unavailable_message() + if not unavailable_message: + unavailable_message = 'Unable to extract video data' + raise ExtractorError( + 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - # uploader_id - video_uploader_id = None - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_uploader_id = mobj.group(1) - else: - self._downloader.report_warning('unable to extract uploader nickname') + video_details = try_get( + player_response, lambda x: x['videoDetails'], dict) or {} - # title - if 'title' in video_info: - video_title = video_info['title'][0] - else: + video_title = video_info.get('title', [None])[0] or video_details.get('title') + if not video_title: self._downloader.report_warning('Unable to extract video title') video_title = '_' - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') - video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = None - mobj = re.search(r'(?s)id="eow-date.*?>(.*?)', video_webpage) - if mobj is None: - mobj = re.search( - r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)', - video_webpage) - if mobj is not None: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) - upload_date = unified_strdate(upload_date) - - m_cat_container = self._search_regex( - r'(?s)]*>\s*Category\s*\s*]*>(.*?)', - video_webpage, 'categories', fatal=False) - if m_cat_container: - category = self._html_search_regex( - r'(?s)(.*?)', m_cat_container, 'category', - default=None) - video_categories = None if category is None else [category] - else: - video_categories = None - - # description - video_description = get_element_by_id("eow-description", video_webpage) + description_original = video_description = get_element_by_id("eow-description", video_webpage) if video_description: - video_description = re.sub(r'''(?x) + + def replace_url(m): + redir_url = compat_urlparse.urljoin(url, m.group(1)) + parsed_redir_url = compat_urllib_parse_urlparse(redir_url) + if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': + qs = compat_parse_qs(parsed_redir_url.query) + q = qs.get('q') + if q and q[0]: + return q[0] + return redir_url + + description_original = video_description = re.sub(r'''(?x) - [^<]+ + (?:[a-zA-Z-]+="[^"]*"\s+)*? + (?:title|href)="([^"]+)"\s+ + (?:[a-zA-Z-]+="[^"]*"\s+)*? + class="[^"]*"[^>]*> + [^<]+\.{3}\s* - ''', r'\1', video_description) + ''', replace_url, video_description) video_description = clean_html(video_description) else: - fd_mobj = re.search(r']*>.*?([\d,]+)\s*' % re.escape(count_name), - video_webpage, count_name, default=None) - if count is not None: - return int(count.replace(',', '')) - return None - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, video_webpage) - return + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - if 'length_seconds' not in video_info: - self._downloader.report_warning('unable to extract video duration') - video_duration = None - else: - video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])) + if view_count is None: + view_count = extract_view_count(video_info) + if view_count is None and video_details: + view_count = int_or_none(video_details.get('viewCount')) - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - video_annotations = self._extract_annotations(video_id) + if is_live is None: + is_live = bool_or_none(video_details.get('isLive')) - # Decide which formats to download - try: - mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) - if not mobj: - raise ValueError('Could not find vevo ID') - json_code = uppercase_escape(mobj.group(1)) - ytplayer_config = json.loads(json_code) - args = ytplayer_config['args'] - # Easy way to know if the 's' value is in url_encoded_fmt_stream_map - # this signatures are encrypted - if 'url_encoded_fmt_stream_map' not in args: - raise ValueError('No stream_map present') # caught below - re_signature = re.compile(r'[&,]s=') - m_s = re_signature.search(args['url_encoded_fmt_stream_map']) - if m_s is not None: - self.to_screen('%s: Encrypted signatures detected.' % video_id) - video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] - m_s = re_signature.search(args.get('adaptive_fmts', '')) - if m_s is not None: - if 'adaptive_fmts' in video_info: - video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts'] - else: - video_info['adaptive_fmts'] = [args['adaptive_fmts']] - except ValueError: - pass + # Check for "rental" videos + if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: + raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) - def _map_to_format_list(urlmap): - formats = [] - for itag, video_real_url in urlmap.items(): - dct = { - 'format_id': itag, - 'url': video_real_url, - 'player_url': player_url, - } - if itag in self._formats: - dct.update(self._formats[itag]) - formats.append(dct) - return formats + def _extract_filesize(media_url): + return int_or_none(self._search_regex( + r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() @@ -883,34 +1917,82 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'url': video_info['conn'][0], 'player_url': player_url, }] - elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1: - encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0] + elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): + encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) - url_map = {} + raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) + formats_spec = {} + fmt_list = video_info.get('fmt_list', [''])[0] + if fmt_list: + for fmt in fmt_list.split(','): + spec = fmt.split('/') + if len(spec) > 1: + width_height = spec[1].split('x') + if len(width_height) == 2: + formats_spec[spec[0]] = { + 'resolution': spec[1], + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + } + q = qualities(['small', 'medium', 'hd720']) + streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) + if streaming_formats: + for fmt in streaming_formats: + itag = str_or_none(fmt.get('itag')) + if not itag: + continue + quality = fmt.get('quality') + quality_label = fmt.get('qualityLabel') or quality + formats_spec[itag] = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_note': quality_label, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + 'quality': q(quality), + # bitrate for itag 43 is always 2147483647 + 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, + 'width': int_or_none(fmt.get('width')), + } + formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) - if 'itag' not in url_data or 'url' not in url_data: + if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'): + continue + stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) + # Unsupported FORMAT_STREAM_TYPE_OTF + if stream_type == 3: continue format_id = url_data['itag'][0] url = url_data['url'][0] - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if not age_gate: + if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): + ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' + jsplayer_url_json = self._search_regex( + ASSETS_RE, + embed_webpage if age_gate else video_webpage, + 'JS player URL (1)', default=None) + if not jsplayer_url_json and not age_gate: + # We need the embed website after all + if embed_webpage is None: + embed_url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed webpage') jsplayer_url_json = self._search_regex( - r'"assets":.+?"js":\s*("[^"]+")', - video_webpage, 'JS player URL') - player_url = json.loads(jsplayer_url_json) + ASSETS_RE, embed_webpage, 'JS player URL') + + player_url = json.loads(jsplayer_url_json) if player_url is None: player_url_json = self._search_regex( r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', video_webpage, 'age gate player URL') player_url = json.loads(player_url_json) + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + elif 's' in url_data: + encrypted_sig = url_data['s'][0] + if self._downloader.params.get('verbose'): if player_url is None: player_version = 'unknown' @@ -923,141 +2005,480 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( - r'html5player-([^/]+?)(?:/html5player)?\.js', + [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version parts_sizes = self._signature_cache_id(encrypted_sig) self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) + (format_id, parts_sizes, player_desc)) signature = self._decrypt_signature( encrypted_sig, video_id, player_url, age_gate) - url += '&signature=' + signature + sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' + url += '&%s=%s' % (sp, signature) if 'ratebypass' not in url: url += '&ratebypass=yes' - url_map[format_id] = url - formats = _map_to_format_list(url_map) - elif video_info.get('hlsvp'): - manifest_url = video_info['hlsvp'][0] - url_map = self._extract_from_m3u8(manifest_url, video_id) - formats = _map_to_format_list(url_map) + + dct = { + 'format_id': format_id, + 'url': url, + 'player_url': player_url, + } + if format_id in self._formats: + dct.update(self._formats[format_id]) + if format_id in formats_spec: + dct.update(formats_spec[format_id]) + + # Some itags are not included in DASH manifest thus corresponding formats will + # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). + # Trying to extract metadata from url_encoded_fmt_stream_map entry. + mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) + width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) + + filesize = int_or_none(url_data.get( + 'clen', [None])[0]) or _extract_filesize(url) + + quality = url_data.get('quality', [None])[0] + + more_fields = { + 'filesize': filesize, + 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), + 'width': width, + 'height': height, + 'fps': int_or_none(url_data.get('fps', [None])[0]), + 'format_note': url_data.get('quality_label', [None])[0] or quality, + 'quality': q(quality), + } + for key, value in more_fields.items(): + if value: + dct[key] = value + type_ = url_data.get('type', [None])[0] + if type_: + type_split = type_.split(';') + kind_ext = type_split[0].split('/') + if len(kind_ext) == 2: + kind, _ = kind_ext + dct['ext'] = mimetype2ext(type_split[0]) + if kind in ('audio', 'video'): + codecs = None + for mobj in re.finditer( + r'(?P[a-zA-Z_-]+)=(?P["\']?)(?P.+?)(?P=quote)(?:;|$)', type_): + if mobj.group('key') == 'codecs': + codecs = mobj.group('val') + break + if codecs: + dct.update(parse_codecs(codecs)) + if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': + dct['downloader_options'] = { + # Youtube throttles chunks >~10M + 'http_chunk_size': 10485760, + } + formats.append(dct) + else: + manifest_url = ( + url_or_none(try_get( + player_response, + lambda x: x['streamingData']['hlsManifestUrl'], + compat_str)) + or url_or_none(try_get( + video_info, lambda x: x['hlsvp'][0], compat_str))) + if manifest_url: + formats = [] + m3u8_formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', fatal=False) + for a_format in m3u8_formats: + itag = self._search_regex( + r'/itag/(\d+)/', a_format['url'], 'itag', default=None) + if itag: + a_format['format_id'] = itag + if itag in self._formats: + dct = self._formats[itag].copy() + dct.update(a_format) + a_format = dct + a_format['player_url'] = player_url + # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' + formats.append(a_format) + else: + error_message = extract_unavailable_message() + if not error_message: + error_message = clean_html(try_get( + player_response, lambda x: x['playabilityStatus']['reason'], + compat_str)) + if not error_message: + error_message = clean_html( + try_get(video_info, lambda x: x['reason'][0], compat_str)) + if error_message: + raise ExtractorError(error_message, expected=True) + raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') + + # uploader + video_uploader = try_get( + video_info, lambda x: x['author'][0], + compat_str) or str_or_none(video_details.get('author')) + if video_uploader: + video_uploader = compat_urllib_parse_unquote_plus(video_uploader) + else: + self._downloader.report_warning('unable to extract uploader name') + + # uploader_id + video_uploader_id = None + video_uploader_url = None + mobj = re.search( + r'', + video_webpage) + if mobj is not None: + video_uploader_id = mobj.group('uploader_id') + video_uploader_url = mobj.group('uploader_url') + else: + self._downloader.report_warning('unable to extract uploader nickname') + + channel_id = ( + str_or_none(video_details.get('channelId')) + or self._html_search_meta( + 'channelId', video_webpage, 'channel id', default=None) + or self._search_regex( + r'data-channel-external-id=(["\'])(?P(?:(?!\1).)+)\1', + video_webpage, 'channel id', default=None, group='id')) + channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + + # thumbnail image + # We try first to get a high quality image: + m_thumb = re.search(r'', + video_webpage, re.DOTALL) + if m_thumb is not None: + video_thumbnail = m_thumb.group(1) + elif 'thumbnail_url' not in video_info: + self._downloader.report_warning('unable to extract video thumbnail') + video_thumbnail = None + else: # don't panic if we can't find it + video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) + + # upload date + upload_date = self._html_search_meta( + 'datePublished', video_webpage, 'upload date', default=None) + if not upload_date: + upload_date = self._search_regex( + [r'(?s)id="eow-date.*?>(.*?)', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], + video_webpage, 'upload date', default=None) + upload_date = unified_strdate(upload_date) + + video_license = self._html_search_regex( + r']+class="title"[^>]*>\s*License\s*\s*]*>\s*
  • (.+?)]+class="title"[^>]*>\s*Music\s*\s* + ]*>\s* +
  • (?P.+?) + by (?P<creator>.+?) + (?: + \(.+?\)| + <a[^>]* + (?: + \bhref=["\']/red[^>]*>| # drop possible + >\s*Listen ad-free with YouTube Red # YouTube Red ad + ) + .*? + )?</li + ''', + video_webpage) + if m_music: + video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) + video_creator = clean_html(m_music.group('creator')) + else: + video_alt_title = video_creator = None + + def extract_meta(field): + return self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, + video_webpage, field, default=None) + + track = extract_meta('Song') + artist = extract_meta('Artist') + album = extract_meta('Album') + + # Youtube Music Auto-generated description + release_date = release_year = None + if video_description: + mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) + if mobj: + if not track: + track = mobj.group('track').strip() + if not artist: + artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) + if not album: + album = mobj.group('album'.strip()) + release_year = mobj.group('release_year') + release_date = mobj.group('release_date') + if release_date: + release_date = release_date.replace('-', '') + if not release_year: + release_year = int(release_date[:4]) + if release_year: + release_year = int(release_year) + + m_episode = re.search( + r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', + video_webpage) + if m_episode: + series = unescapeHTML(m_episode.group('series')) + season_number = int(m_episode.group('season')) + episode_number = int(m_episode.group('episode')) + else: + series = season_number = episode_number = None + + m_cat_container = self._search_regex( + r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', + video_webpage, 'categories', default=None) + if m_cat_container: + category = self._html_search_regex( + r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', + default=None) + video_categories = None if category is None else [category] else: - raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + video_categories = None + + video_tags = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + + def _extract_count(count_name): + return str_to_int(self._search_regex( + r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' + % re.escape(count_name), + video_webpage, count_name, default=None)) + + like_count = _extract_count('like') + dislike_count = _extract_count('dislike') + + if view_count is None: + view_count = str_to_int(self._search_regex( + r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, + 'view count', default=None)) + + average_rating = ( + float_or_none(video_details.get('averageRating')) + or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) + + # subtitles + video_subtitles = self.extract_subtitles(video_id, video_webpage) + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) + + video_duration = try_get( + video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = int_or_none(video_details.get('lengthSeconds')) + if not video_duration: + video_duration = parse_duration(self._html_search_meta( + 'duration', video_webpage, 'video duration')) + + # annotations + video_annotations = None + if self._downloader.params.get('writeannotations', False): + xsrf_token = self._search_regex( + r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2', + video_webpage, 'xsrf token', group='xsrf_token', fatal=False) + invideo_url = try_get( + player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) + if xsrf_token and invideo_url: + xsrf_field_name = self._search_regex( + r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2', + video_webpage, 'xsrf field name', + group='xsrf_field_name', default='session_token') + video_annotations = self._download_webpage( + self._proto_relative_url(invideo_url), + video_id, note='Downloading annotations', + errnote='Unable to download video annotations', fatal=False, + data=urlencode_postdata({xsrf_field_name: xsrf_token})) + + chapters = self._extract_chapters(description_original, video_duration) # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): - try: - # The DASH manifest used needs to be the one from the original video_webpage. - # The one found in get_video_info seems to be using different signatures. - # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage. - # Luckily, it seems, this case uses some kind of default signature (len == 86), so the - # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here. - if age_gate: - dash_manifest_url = video_info.get('dashmpd')[0] + dash_mpd_fatal = True + for mpd_url in dash_mpds: + dash_formats = {} + try: + def decrypt_sig(mobj): + s = mobj.group(1) + dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) + return '/signature/%s' % dec_s + + mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url) + + for df in self._extract_mpd_formats( + mpd_url, video_id, fatal=dash_mpd_fatal, + formats_dict=self._formats): + if not df.get('filesize'): + df['filesize'] = _extract_filesize(df['url']) + # Do not overwrite DASH format found in some previous DASH manifest + if df['format_id'] not in dash_formats: + dash_formats[df['format_id']] = df + # Additional DASH manifests may end up in HTTP Error 403 therefore + # allow them to fail without bug report message if we already have + # some DASH manifest succeeded. This is temporary workaround to reduce + # burst of bug reports until we figure out the reason and whether it + # can be fixed at all. + dash_mpd_fatal = False + except (ExtractorError, KeyError) as e: + self.report_warning( + 'Skipping DASH manifest: %r' % e, video_id) + if dash_formats: + # Remove the formats we found through non-DASH, they + # contain less info and it can be wrong, because we use + # fixed values (for example the resolution). See + # https://github.com/ytdl-org/youtube-dl/issues/5774 for an + # example. + formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] + formats.extend(dash_formats.values()) + + # Check for malformed aspect ratio + stretched_m = re.search( + r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', + video_webpage) + if stretched_m: + w = float(stretched_m.group('w')) + h = float(stretched_m.group('h')) + # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). + # We will only process correct ratios. + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio + + if not formats: + token = extract_token(video_info) + if not token: + if 'reason' in video_info: + if 'The uploader has not made this video available in your country.' in video_info['reason']: + regions_allowed = self._html_search_meta( + 'regionsAllowed', video_webpage, default=None) + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + msg=video_info['reason'][0], countries=countries) + reason = video_info['reason'][0] + if 'Invalid parameters' in reason: + unavailable_message = extract_unavailable_message() + if unavailable_message: + reason = unavailable_message + raise ExtractorError( + 'YouTube said: %s' % reason, + expected=True, video_id=video_id) else: - dash_manifest_url = ytplayer_config['args']['dashmpd'] - def decrypt_sig(mobj): - s = mobj.group(1) - dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) - return '/signature/%s' % dec_s - dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url) - dash_doc = self._download_xml( - dash_manifest_url, video_id, - note='Downloading DASH manifest', - errnote='Could not download DASH manifest') - for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): - url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') - if url_el is None: - continue - format_id = r.attrib['id'] - video_url = url_el.text - filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) - f = { - 'format_id': format_id, - 'url': video_url, - 'width': int_or_none(r.attrib.get('width')), - 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), - 'asr': int_or_none(r.attrib.get('audioSamplingRate')), - 'filesize': filesize, - } - try: - existing_format = next( - fo for fo in formats - if fo['format_id'] == format_id) - except StopIteration: - f.update(self._formats.get(format_id, {})) - formats.append(f) - else: - existing_format.update(f) - - except (ExtractorError, KeyError) as e: - self.report_warning('Skipping DASH manifest: %s' % e, video_id) + raise ExtractorError( + '"token" parameter not in video info for unknown reason', + video_id=video_id) + + if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) + self.mark_watched(video_id, video_info, player_response) + return { - 'id': video_id, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': upload_date, - 'title': video_title, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'categories': video_categories, - 'subtitles': video_subtitles, - 'duration': video_duration, - 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations, + 'id': video_id, + 'uploader': video_uploader, + 'uploader_id': video_uploader_id, + 'uploader_url': video_uploader_url, + 'channel_id': channel_id, + 'channel_url': channel_url, + 'upload_date': upload_date, + 'license': video_license, + 'creator': video_creator or artist, + 'title': video_title, + 'alt_title': video_alt_title or track, + 'thumbnail': video_thumbnail, + 'description': video_description, + 'categories': video_categories, + 'tags': video_tags, + 'subtitles': video_subtitles, + 'automatic_captions': automatic_captions, + 'duration': video_duration, + 'age_limit': 18 if age_gate else 0, + 'annotations': video_annotations, + 'chapters': chapters, 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, - 'view_count': view_count, + 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, - 'formats': formats, + 'average_rating': average_rating, + 'formats': formats, + 'is_live': is_live, + 'start_time': start_time, + 'end_time': end_time, + 'series': series, + 'season_number': season_number, + 'episode_number': episode_number, + 'track': track, + 'artist': artist, + 'album': album, + 'release_date': release_date, + 'release_year': release_year, } -class YoutubePlaylistIE(YoutubeBaseInfoExtractor): + +class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: (?:https?://)? (?:\w+\.)? - youtube\.com/ (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) - \? (?:.*?&)*? (?:p|a|list)= - | p/ + (?: + youtube\.com| + invidio\.us + ) + / + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) + \? (?:.*?[&;])*? (?:p|a|list)= + | p/ + )| + youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,} - # Top tracks, they can also include dots + (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} + # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) - )""" + (%(playlist_id)s) + )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _MORE_PAGES_INDICATOR = r'data-link-type="next"' - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' + _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?' + _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', 'info_dict': { 'title': 'ytdl test PL', + 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', }, 'playlist_count': 3, }, { 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', 'info_dict': { + 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', 'title': 'YDL_Empty_List', }, 'playlist_count': 0, + 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'info_dict': { 'title': '29C3: Not my department', + 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'uploader': 'Christiaan008', + 'uploader_id': 'ChRiStIaAn008', }, 'playlist_count': 95, }, { @@ -1065,6 +2486,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'url': 'PLBB231211A4F62143', 'info_dict': { 'title': '[OLD]Team Fortress 2 (Class-based LP)', + 'id': 'PLBB231211A4F62143', + 'uploader': 'Wickydoo', + 'uploader_id': 'Wickydoo', }, 'playlist_mincount': 26, }, { @@ -1072,195 +2496,362 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', 'info_dict': { 'title': 'Uploads from Cauchemar', + 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', + 'uploader': 'Cauchemar', + 'uploader_id': 'Cauchemar89', }, 'playlist_mincount': 799, }, { 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', 'info_dict': { 'title': 'YDL_safe_search', + 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', }, 'playlist_count': 2, + 'skip': 'This playlist is private', }, { 'note': 'embedded', - 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', 'playlist_count': 4, 'info_dict': { 'title': 'JODA15', + 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'uploader': 'milan', + 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', + } + }, { + 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'playlist_mincount': 485, + 'info_dict': { + 'title': '2018 Chinese New Singles (11/6 updated)', + 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'uploader': 'LBK', + 'uploader_id': 'sdragonfang', } }, { - 'note': 'Embedded SWF player', - 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', - 'playlist_count': 4, + 'note': 'Embedded SWF player', + 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', + 'playlist_count': 4, + 'info_dict': { + 'title': 'JODA7', + 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', + }, + 'skip': 'This playlist does not exist', + }, { + 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', + 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', + 'info_dict': { + 'title': 'Uploads from Interstellar Movie', + 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', + 'uploader': 'Interstellar Movie', + 'uploader_id': 'InterstellarMovie1', + }, + 'playlist_mincount': 21, + }, { + # Playlist URL that does not actually serve a playlist + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is not available.', + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', + 'info_dict': { + 'id': 'yeWKywCrFtk', + 'ext': 'mp4', + 'title': 'Small Scale Baler and Braiding Rugs', + 'uploader': 'Backus-Page House Museum', + 'uploader_id': 'backuspagemuseum', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', + 'upload_date': '20161008', + 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', + 'categories': ['Nonprofits & Activism'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, + }, { + # https://github.com/ytdl-org/youtube-dl/issues/21844 + 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', 'info_dict': { - 'title': 'JODA7', - } + 'title': 'Data Analysis with Dr Mike Pound', + 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'uploader_id': 'Computerphile', + 'uploader': 'Computerphile', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', + 'only_matching': True, + }, { + 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', + 'only_matching': True, + }, { + # music album playlist + 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', + 'only_matching': True, + }, { + 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'only_matching': True, }] def _real_initialize(self): self._login() - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + + for item in re.findall( + r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): + attrs = extract_attributes(item) + video_id = attrs['data-video-id'] + video_title = unescapeHTML(attrs.get('data-title')) + if video_title: + video_title = video_title.strip() + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + # Fallback with old _VIDEO_RE + self.extract_videos_from_page_impl( + self._VIDEO_RE, page, ids_in_page, titles_in_page) + + # Relaxed fallbacks + self.extract_videos_from_page_impl( + r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, + ids_in_page, titles_in_page) + self.extract_videos_from_page_impl( + r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, + ids_in_page, titles_in_page) + + return zip(ids_in_page, titles_in_page) def _extract_mix(self, playlist_id): - # The mixes are generated from a a single video + # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id - url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading Youtube mix') + ids = [] + last_id = playlist_id[-11:] + for n in itertools.count(1): + url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) + webpage = self._download_webpage( + url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) + new_ids = orderedSet(re.findall( + r'''(?xs)data-video-username=".*?".*? + href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), + webpage)) + # Fetch new pages until all the videos are repeated, it seems that + # there are always 51 unique videos. + new_ids = [_id for _id in new_ids if _id not in ids] + if not new_ids: + break + ids.extend(new_ids) + last_id = ids[-1] + + url_results = self._ids_to_results(ids) + search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) title_span = ( - search_title('playlist-title') or - search_title('title long-title') or - search_title('title')) + search_title('playlist-title') + or search_title('title long-title') + or search_title('title')) title = clean_html(title_span) - ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) + def _extract_playlist(self, playlist_id): + url = self._TEMPLATE_URL % playlist_id + page = self._download_webpage(url, playlist_id) + + # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) + for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): + match = match.strip() + # Check if the playlist exists or is private + mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) + if mobj: + reason = mobj.group('reason') + message = 'This playlist %s' % reason + if 'private' in reason: + message += ', use --username or --netrc to access it' + message += '.' + raise ExtractorError(message, expected=True) + elif re.match(r'[^<]*Invalid parameters[^<]*', match): + raise ExtractorError( + 'Invalid parameters. Maybe URL is incorrect.', + expected=True) + elif re.match(r'[^<]*Choose your language[^<]*', match): + continue + else: + self.report_warning('Youtube gives an alert message: ' + match) + + playlist_title = self._html_search_regex( + r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', + page, 'title', default=None) + + _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' + uploader = self._search_regex( + r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, + page, 'uploader', default=None) + mobj = re.search( + r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, + page) + if mobj: + uploader_id = mobj.group('uploader_id') + uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) + else: + uploader_id = uploader_url = None + + has_videos = True + if not playlist_title: + try: + # Some playlist URLs don't actually serve a playlist (e.g. + # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) + next(self._entries(page, playlist_id)) + except StopIteration: + has_videos = False + + playlist = self.playlist_result( + self._entries(page, playlist_id), playlist_id, playlist_title) + playlist.update({ + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + }) + + return has_videos, playlist + + def _check_download_just_video(self, url, playlist_id): # Check if it's a video-specific URL query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - if 'v' in query_dict: - video_id = query_dict['v'][0] + video_id = query_dict.get('v', [None])[0] or self._search_regex( + r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, + 'video id', default=None) + if video_id: if self._downloader.params.get('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(video_id, 'Youtube', video_id=video_id) + return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) else: self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + return video_id, None + return None, None - if playlist_id.startswith('RD'): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - if playlist_id.startswith('TL'): - raise ExtractorError('For downloading YouTube.com top lists, use ' - 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) - - url = self._TEMPLATE_URL % playlist_id - page = self._download_webpage(url, playlist_id) - more_widget_html = content_html = page - - # Check if the playlist exists or is private - if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None: - raise ExtractorError( - 'The playlist doesn\'t exist or is private, use --username or ' - '--netrc to access it.', - expected=True) - - # Extract the video ids from the playlist pages - ids = [] - - for page_num in itertools.count(1): - matches = re.finditer(self._VIDEO_RE, content_html) - # We remove the duplicates and the link with index 0 - # (it's not the first video of the playlist) - new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') - ids.extend(new_ids) - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - - playlist_title = self._html_search_regex( - r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', - page, 'title') + def _real_extract(self, url): + # Extract playlist id + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError('Invalid URL: %s' % url) + playlist_id = mobj.group(1) or mobj.group(2) - url_results = self._ids_to_results(ids) - return self.playlist_result(url_results, playlist_id, playlist_title) + video_id, video = self._check_download_just_video(url, playlist_id) + if video: + return video + if playlist_id.startswith(('RD', 'UL', 'PU')): + # Mixes require a custom extraction process + return self._extract_mix(playlist_id) -class YoutubeTopListIE(YoutubePlaylistIE): - IE_NAME = 'youtube:toplist' - IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"' - ' (Example: "yttoplist:music:Top Tracks")') - _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$' - _TESTS = [{ - 'url': 'yttoplist:music:Trending', - 'playlist_mincount': 5, - 'skip': 'Only works for logged-in users', - }] + has_videos, playlist = self._extract_playlist(playlist_id) + if has_videos or not video_id: + return playlist - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel = mobj.group('chann') - title = mobj.group('title') - query = compat_urllib_parse.urlencode({'title': title}) - channel_page = self._download_webpage( - 'https://www.youtube.com/%s' % channel, title) - link = self._html_search_regex( - r'''(?x) - <a\s+href="([^"]+)".*?>\s* - <span\s+class="branded-page-module-title-text">\s* - <span[^>]*>.*?%s.*?</span>''' % re.escape(query), - channel_page, 'list') - url = compat_urlparse.urljoin('https://www.youtube.com/', link) - - video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"' - ids = [] - # sometimes the webpage doesn't contain the videos - # retry until we get them - for i in itertools.count(0): - msg = 'Downloading Youtube mix' - if i > 0: - msg += ', retry #%d' % i - - webpage = self._download_webpage(url, title, msg) - ids = orderedSet(re.findall(video_re, webpage)) - if ids: - break - url_results = self._ids_to_results(ids) - return self.playlist_result(url_results, playlist_title=title) + # Some playlist URLs don't actually serve a playlist (see + # https://github.com/ytdl-org/youtube-dl/issues/10537). + # Fallback to plain video extraction if there is a video id + # along with playlist id. + return self.url_result(video_id, 'Youtube', video_id=video_id) -class YoutubeChannelIE(InfoExtractor): +class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' - _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" - _MORE_PAGES_INDICATOR = 'yt-uix-load-more' - _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' + _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' + _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' + _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' _TESTS = [{ 'note': 'paginated channel', 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'playlist_mincount': 91, + 'info_dict': { + 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'Uploads from lex will', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + } + }, { + 'note': 'Age restricted channel', + # from https://www.youtube.com/user/DeusExOfficial + 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', + 'playlist_mincount': 64, + 'info_dict': { + 'id': 'UUs0ifCMCm1icqRbqhUINa0w', + 'title': 'Uploads from Deus Ex', + 'uploader': 'Deus Ex', + 'uploader_id': 'DeusExOfficial', + }, + }, { + 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'only_matching': True, }] - def extract_videos_from_page(self, page): - ids_in_page = [] - for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - return ids_in_page + @classmethod + def suitable(cls, url): + return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) + else super(YoutubeChannelIE, cls).suitable(url)) + + def _build_template_url(self, url, channel_id): + return self._TEMPLATE_URL % channel_id def _real_extract(self, url): - # Extract channel id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) + channel_id = self._match_id(url) - # Download channel page - channel_id = mobj.group(1) - video_ids = [] - url = 'https://www.youtube.com/channel/%s/videos' % channel_id - channel_page = self._download_webpage(url, channel_id) + url = self._build_template_url(url, channel_id) + + # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) + # Workaround by extracting as a playlist if managed to obtain channel playlist URL + # otherwise fallback on channel by page extraction + channel_page = self._download_webpage( + url + '?view=57', channel_id, + 'Downloading channel page', fatal=False) + if channel_page is False: + channel_playlist_id = False + else: + channel_playlist_id = self._html_search_meta( + 'channelId', channel_page, 'channel id', default=None) + if not channel_playlist_id: + channel_url = self._html_search_meta( + ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), + channel_page, 'channel url', default=None) + if channel_url: + channel_playlist_id = self._search_regex( + r'vnd\.youtube://user/([0-9A-Za-z_-]+)', + channel_url, 'channel id', default=None) + if channel_playlist_id and channel_playlist_id.startswith('UC'): + playlist_id = 'UU' + channel_playlist_id[2:] + return self.url_result( + compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') + + channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') autogenerated = re.search(r'''(?x) class="[^"]*?(?: channel-header-autogenerated-label| @@ -1270,201 +2861,254 @@ class YoutubeChannelIE(InfoExtractor): if autogenerated: # The videos are contained in a single page # the ajax pages can't be used, they are empty - video_ids = self.extract_videos_from_page(channel_page) - else: - # Download all channel pages using the json-based channel_ajax query - for pagenum in itertools.count(1): - url = self._MORE_PAGES_URL % (pagenum, channel_id) - page = self._download_json( - url, channel_id, note='Downloading page #%s' % pagenum, - transform_source=uppercase_escape) - - ids_in_page = self.extract_videos_from_page(page['content_html']) - video_ids.extend(ids_in_page) - - if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: - break + entries = [ + self.url_result( + video_id, 'Youtube', video_id=video_id, + video_title=video_title) + for video_id, video_title in self.extract_videos_from_page(channel_page)] + return self.playlist_result(entries, channel_id) - self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) + try: + next(self._entries(channel_page, channel_id)) + except StopIteration: + alert_message = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', + channel_page, 'alert', default=None, group='alert') + if alert_message: + raise ExtractorError('Youtube said: %s' % alert_message, expected=True) - url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in video_ids] - return self.playlist_result(url_entries, channel_id) + return self.playlist_result(self._entries(channel_page, channel_id), channel_id) -class YoutubeUserIE(InfoExtractor): +class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s' - _GDATA_PAGE_SIZE = 50 - _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' + _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' IE_NAME = 'youtube:user' _TESTS = [{ 'url': 'https://www.youtube.com/user/TheLinuxFoundation', 'playlist_mincount': 320, 'info_dict': { - 'title': 'TheLinuxFoundation', + 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', + 'title': 'Uploads from The Linux Foundation', + 'uploader': 'The Linux Foundation', + 'uploader_id': 'TheLinuxFoundation', + } + }, { + # Only available via https://www.youtube.com/c/12minuteathlete/videos + # but not https://www.youtube.com/user/12minuteathlete/videos + 'url': 'https://www.youtube.com/c/12minuteathlete/videos', + 'playlist_mincount': 249, + 'info_dict': { + 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', + 'title': 'Uploads from 12 Minute Athlete', + 'uploader': '12 Minute Athlete', + 'uploader_id': 'the12minuteathlete', } }, { 'url': 'ytuser:phihag', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/gametrailers', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/gametrailers', + 'only_matching': True, + }, { + # This channel is not available, geo restricted to JP + 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', + 'only_matching': True, }] @classmethod def suitable(cls, url): # Don't return True if the url can be extracted with other youtube # extractor, the regex would is too permissive and it would match. - other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_ies): return False - else: return super(YoutubeUserIE, cls).suitable(url) + other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) + if any(ie.suitable(url) for ie in other_yt_ies): + return False + else: + return super(YoutubeUserIE, cls).suitable(url) - def _real_extract(self, url): - # Extract username + def _build_template_url(self, url, channel_id): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) + return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) - username = mobj.group(1) - # Download video ids using YouTube Data API. Result size per - # query is limited (currently to 50 videos) so we need to query - # page by page until there are no video ids - it means we got - # all of them. +class YoutubeLiveIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube.com live streams' + _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live' + IE_NAME = 'youtube:live' - def download_page(pagenum): - start_index = pagenum * self._GDATA_PAGE_SIZE + 1 + _TESTS = [{ + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/TheYoungTurks/live', + 'only_matching': True, + }] - gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) - page = self._download_webpage( - gdata_url, username, - 'Downloading video ids from %d to %d' % ( - start_index, start_index + self._GDATA_PAGE_SIZE)) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + base_url = mobj.group('base_url') + webpage = self._download_webpage(url, channel_id, fatal=False) + if webpage: + page_type = self._og_search_property( + 'type', webpage, 'page type', default='') + video_id = self._html_search_meta( + 'videoId', webpage, 'video id', default=None) + if page_type.startswith('video') and video_id and re.match( + r'^[0-9A-Za-z_-]{11}$', video_id): + return self.url_result(video_id, YoutubeIE.ie_key()) + return self.url_result(base_url) + + +class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): + IE_DESC = 'YouTube.com user/channel playlists' + _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' + IE_NAME = 'youtube:playlists' - try: - response = json.loads(page) - except ValueError as err: - raise ExtractorError('Invalid JSON in API response: ' + compat_str(err)) - if 'entry' not in response['feed']: - return + _TESTS = [{ + 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'ThirstForScience', + 'title': 'ThirstForScience', + }, + }, { + # with "Load more" button + 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', + 'playlist_mincount': 70, + 'info_dict': { + 'id': 'igorkle1', + 'title': 'Игорь Клейнер', + }, + }, { + 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', + 'playlist_mincount': 17, + 'info_dict': { + 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', + 'title': 'Chem Player', + }, + 'skip': 'Blocked', + }] - # Extract video identifiers - entries = response['feed']['entry'] - for entry in entries: - title = entry['title']['$t'] - video_id = entry['id']['$t'].split('/')[-1] - yield { - '_type': 'url', - 'url': video_id, - 'ie_key': 'Youtube', - 'id': video_id, - 'title': title, - } - url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE) - return self.playlist_result(url_results, playlist_title=username) +class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' -class YoutubeSearchIE(SearchInfoExtractor): +class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com searches' - _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' - _MAX_RESULTS = 1000 + # there doesn't appear to be a real limit, for example if you search for + # 'python' you get more than 8.000.000 results + _MAX_RESULTS = float('inf') IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' + _EXTRA_QUERY_ARGS = {} + _TESTS = [] def _get_n_results(self, query, n): """Get a specified number of results for a query""" - video_ids = [] - pagenum = 0 + videos = [] limit = n - PAGE_SIZE = 50 - while (PAGE_SIZE * pagenum) < limit: - result_url = self._API_URL % ( - compat_urllib_parse.quote_plus(query.encode('utf-8')), - (PAGE_SIZE * pagenum) + 1) - data_json = self._download_webpage( + url_query = { + 'search_query': query.encode('utf-8'), + } + url_query.update(self._EXTRA_QUERY_ARGS) + result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) + + for pagenum in itertools.count(1): + data = self._download_json( result_url, video_id='query "%s"' % query, - note='Downloading page %s' % (pagenum + 1), - errnote='Unable to download API page') - data = json.loads(data_json) - api_response = data['data'] + note='Downloading page %s' % pagenum, + errnote='Unable to download API page', + query={'spf': 'navigate'}) + html_content = data[1]['body']['content'] - if 'items' not in api_response: + if 'class="search-message' in html_content: raise ExtractorError( '[youtube] No video results', expected=True) - new_ids = list(video['id'] for video in api_response['items']) - video_ids += new_ids - - limit = min(n, api_response['totalItems']) - pagenum += 1 + new_videos = list(self._process_page(html_content)) + videos += new_videos + if not new_videos or len(videos) > limit: + break + next_link = self._html_search_regex( + r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', + html_content, 'next link', default=None) + if next_link is None: + break + result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link) - if len(video_ids) > n: - video_ids = video_ids[:n] - videos = [self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in video_ids] + if len(videos) > n: + videos = videos[:n] return self.playlist_result(videos, query) class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' - _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' _SEARCH_KEY = 'ytsearchdate' IE_DESC = 'YouTube.com searches, newest videos first' + _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} -class YoutubeSearchURLIE(InfoExtractor): +class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, 'info_dict': { 'title': 'youtube-dl test video', } + }, { + 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse.unquote_plus(mobj.group('query')) - + query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) - result_code = self._search_regex( - r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML') - - part_codes = re.findall( - r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code) - entries = [] - for part_code in part_codes: - part_title = self._html_search_regex( - [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False) - part_url_snippet = self._html_search_regex( - r'(?s)href="([^"]+)"', part_code, 'item URL') - part_url = compat_urlparse.urljoin( - 'https://www.youtube.com/', part_url_snippet) - entries.append({ - '_type': 'url', - 'url': part_url, - 'title': part_title, - }) + return self.playlist_result(self._process_page(webpage), playlist_title=query) - return { - '_type': 'playlist', - 'entries': entries, - 'title': query, - } - -class YoutubeShowIE(InfoExtractor): +class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' IE_NAME = 'youtube:show' _TESTS = [{ - 'url': 'http://www.youtube.com/show/airdisasters', - 'playlist_mincount': 3, + 'url': 'https://www.youtube.com/show/airdisasters', + 'playlist_mincount': 5, 'info_dict': { 'id': 'airdisasters', 'title': 'Air Disasters', @@ -1472,44 +3116,17 @@ class YoutubeShowIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - webpage = self._download_webpage( - url, playlist_id, 'Downloading show webpage') - # There's one playlist for each season of the show - m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage)) - self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons))) - entries = [ - self.url_result( - 'https://www.youtube.com' + season.group(1), 'YoutubePlaylist') - for season in m_seasons - ] - title = self._og_search_title(webpage, fatal=False) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': title, - 'entries': entries, - } + playlist_id = self._match_id(url) + return super(YoutubeShowIE, self)._real_extract( + 'https://www.youtube.com/show/%s/playlists' % playlist_id) class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ - Base class for extractors that fetch info from - http://www.youtube.com/feed_ajax + Base class for feed extractors Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True - # use action_load_personal_feed instead of action_load_system_feed - _PERSONAL_FEED = False - - @property - def _FEED_TEMPLATE(self): - action = 'action_load_system_feed' - if self._PERSONAL_FEED: - action = 'action_load_personal_feed' - return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME) @property def IE_NAME(self): @@ -1518,52 +3135,70 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _real_extract(self, url): - feed_entries = [] - paging = 0 - for i in itertools.count(1): - info = self._download_json(self._FEED_TEMPLATE % paging, - '%s feed' % self._FEED_NAME, - 'Downloading page %s' % i) - feed_html = info.get('feed_html') or info.get('content_html') - load_more_widget_html = info.get('load_more_widget_html') or feed_html - m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) - ids = orderedSet(m.group(1) for m in m_ids) - feed_entries.extend( - self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in ids) - mobj = re.search( - r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)', - load_more_widget_html) - if mobj is None: + def _entries(self, page): + # The extraction process is the same as for playlists, but the regex + # for the video ids doesn't contain an index + ids = [] + more_widget_html = content_html = page + for page_num in itertools.count(1): + matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + + # 'recommended' feed has infinite 'load more' and each new portion spins + # the same videos in (sometimes) slightly different order, so we'll check + # for unicity and break when portion has no new videos + new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) + if not new_ids: break - paging = mobj.group('paging') - return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) -class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' - _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' + ids.extend(new_ids) -class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' - _FEED_NAME = 'watch_later' - _PLAYLIST_TITLE = 'Youtube Watch Later' - _PERSONAL_FEED = True + for entry in self._ids_to_results(new_ids): + yield entry + + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) + if not mobj: + break + + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) + content_html = more['content_html'] + more_widget_html = more['load_more_widget_html'] + + def _real_extract(self, url): + page = self._download_webpage( + 'https://www.youtube.com/feed/%s' % self._FEED_NAME, + self._PLAYLIST_TITLE) + return self.playlist_result( + self._entries(page), playlist_title=self._PLAYLIST_TITLE) + + +class YoutubeWatchLaterIE(YoutubePlaylistIE): + IE_NAME = 'youtube:watchlater' + IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' + + _TESTS = [{ + 'url': 'https://www.youtube.com/playlist?list=WL', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', + 'only_matching': True, + }] + + def _real_extract(self, url): + _, video = self._check_download_just_video(url, 'WL') + if video: + return video + _, playlist = self._extract_playlist('WL') + return playlist -class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)' - _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' - _FEED_NAME = 'history' - _PERSONAL_FEED = True - _PLAYLIST_TITLE = 'Youtube Watch History' class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?' + IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' _LOGIN_REQUIRED = True def _real_extract(self, url): @@ -1572,60 +3207,63 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): return self.url_result(playlist_id, 'YoutubePlaylist') -class YoutubeSubscriptionsIE(YoutubePlaylistIE): - IE_NAME = 'youtube:subscriptions' - IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' - _TESTS = [] - - def _real_extract(self, url): - title = 'Youtube Subscriptions' - page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title) - - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page +class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _FEED_NAME = 'recommended' + _PLAYLIST_TITLE = 'Youtube Recommended videos' - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - new_ids = orderedSet(matches) - ids.extend(new_ids) - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break +class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + _FEED_NAME = 'subscriptions' + _PLAYLIST_TITLE = 'Youtube Subscriptions' - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), title, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - return { - '_type': 'playlist', - 'title': title, - 'entries': self._ids_to_results(ids), - } +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' + _FEED_NAME = 'history' + _PLAYLIST_TITLE = 'Youtube History' class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list _VALID_URL = r'''(?x) - (?:https?://)?[^/]+/watch\?(?: + (?:https?://)? + (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ + (?:watch\?(?: feature=[a-z_]+| - annotation_id=annotation_[^&]+ - )?$| - (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$ + annotation_id=annotation_[^&]+| + x-yt-cl=[0-9]+| + hl=[^&]*| + t=[0-9]+ + )? + | + attribution_link\?a=[^&]+ + ) + $ ''' _TESTS = [{ - 'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041', + 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?feature=foo', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?hl=en-GB', 'only_matching': True, }, { - 'url': 'http://www.youtube.com/watch?', + 'url': 'https://www.youtube.com/watch?t=2372', 'only_matching': True, }] @@ -1634,6 +3272,23 @@ class YoutubeTruncatedURLIE(InfoExtractor): 'Did you forget to quote the URL? Remember that & is a meta ' 'character in most shells, so you want to put the URL in quotes, ' 'like youtube-dl ' - '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' + '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' ' or simply youtube-dl BaW_jenozKc .', expected=True) + + +class YoutubeTruncatedIDIE(InfoExtractor): + IE_NAME = 'youtube:truncated_id' + IE_DESC = False # Do not list + _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' + + _TESTS = [{ + 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + raise ExtractorError( + 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), + expected=True)