# coding: utf-8
-import collections
-import errno
-import io
+from __future__ import unicode_literals
+
+
import itertools
import json
import os.path
+import random
import re
-import socket
-import string
-import struct
+import time
import traceback
-import zlib
from .common import InfoExtractor, SearchInfoExtractor
-from .subtitles import SubtitlesInfoExtractor
-from ..utils import (
+from ..jsinterp import JSInterpreter
+from ..swfinterp import SWFInterpreter
+from ..compat import (
compat_chr,
- compat_http_client,
+ compat_kwargs,
compat_parse_qs,
- compat_urllib_error,
- compat_urllib_parse,
- compat_urllib_request,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_unquote_plus,
+ compat_urllib_parse_urlencode,
+ compat_urllib_parse_urlparse,
compat_urlparse,
compat_str,
-
+)
+from ..utils import (
clean_html,
- get_cachedir,
- get_element_by_id,
- get_element_by_attribute,
+ error_to_compat_str,
ExtractorError,
+ float_or_none,
+ get_element_by_attribute,
+ get_element_by_id,
+ int_or_none,
+ mimetype2ext,
+ orderedSet,
+ parse_codecs,
+ parse_duration,
+ remove_quotes,
+ remove_start,
+ smuggle_url,
+ str_to_int,
+ try_get,
unescapeHTML,
unified_strdate,
- orderedSet,
- write_json_file,
+ unsmuggle_url,
+ uppercase_escape,
+ urlencode_postdata,
)
+
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
- _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
- _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
+ _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
+
+ _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
+ _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
+ _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
+
_NETRC_MACHINE = 'youtube'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
- def report_lang(self):
- """Report attempt to set language."""
- self.to_screen(u'Setting language')
+ _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}'
def _set_language(self):
- request = compat_urllib_request.Request(self._LANG_URL)
- try:
- self.report_lang()
- compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
- return False
- return True
+ self._set_cookie(
+ '.youtube.com', 'PREF', 'f1=50000000&hl=en',
+ # YouTube sets the expire time to about two months
+ expire_time=time.time() + 2 * 30 * 24 * 3600)
+
+ def _ids_to_results(self, ids):
+ return [
+ self.url_result(vid_id, 'Youtube', video_id=vid_id)
+ for vid_id in ids]
def _login(self):
+ """
+ Attempt to log in to YouTube.
+ True is returned if successful or skipped.
+ False is returned if login failed.
+
+ If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
+ """
(username, password) = self._get_login_info()
# No authentication to be performed
if username is None:
- if self._LOGIN_REQUIRED:
- raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+ if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
+ raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+ return True
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None,
+ note='Downloading login page',
+ errnote='unable to fetch login page', fatal=False)
+ if login_page is False:
+ return
+
+ login_form = self._hidden_inputs(login_page)
+
+ def req(url, f_req, note, errnote):
+ data = login_form.copy()
+ data.update({
+ 'pstMsg': 1,
+ 'checkConnection': 'youtube',
+ 'checkedDomains': 'youtube',
+ 'hl': 'en',
+ 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
+ 'f.req': json.dumps(f_req),
+ 'flowName': 'GlifWebSignIn',
+ 'flowEntry': 'ServiceLogin',
+ })
+ return self._download_json(
+ url, None, note=note, errnote=errnote,
+ transform_source=lambda s: re.sub(r'^[^[]*', '', s),
+ fatal=False,
+ data=urlencode_postdata(data), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
+ 'Google-Accounts-XSRF': 1,
+ })
+
+ def warn(message):
+ self._downloader.report_warning(message)
+
+ lookup_req = [
+ username,
+ None, [], None, 'US', None, None, 2, False, True,
+ [
+ None, None,
+ [2, 1, None, 1,
+ 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
+ None, [], 4],
+ 1, [None, None, []], None, None, None, True
+ ],
+ username,
+ ]
+
+ lookup_results = req(
+ self._LOOKUP_URL, lookup_req,
+ 'Looking up account info', 'Unable to look up account info')
+
+ if lookup_results is False:
return False
- request = compat_urllib_request.Request(self._LOGIN_URL)
- try:
- login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
+ user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
+ if not user_hash:
+ warn('Unable to extract user hash')
return False
- galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
- login_page, u'Login GALX parameter')
-
- # Log in
- login_form_strs = {
- u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
- u'Email': username,
- u'GALX': galx,
- u'Passwd': password,
- u'PersistentCookie': u'yes',
- u'_utf8': u'霱',
- u'bgresponse': u'js_disabled',
- u'checkConnection': u'',
- u'checkedDomains': u'youtube',
- u'dnConn': u'',
- u'pstMsg': u'0',
- u'rmShown': u'1',
- u'secTok': u'',
- u'signIn': u'Sign in',
- u'timeStmp': u'',
- u'service': u'youtube',
- u'uilel': u'3',
- u'hl': u'en_US',
- }
- # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
- # chokes on unicode
- login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
- login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
- request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
- try:
- self.report_login()
- login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
- if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
- self._downloader.report_warning(u'unable to log in: bad username or password')
- return False
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+ challenge_req = [
+ user_hash,
+ None, 1, None, [1, None, None, None, [password, None, True]],
+ [
+ None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
+ 1, [None, None, []], None, None, None, True
+ ]]
+
+ challenge_results = req(
+ self._CHALLENGE_URL, challenge_req,
+ 'Logging in', 'Unable to log in')
+
+ if challenge_results is False:
+ return
+
+ login_res = try_get(challenge_results, lambda x: x[0][5], list)
+ if login_res:
+ login_msg = try_get(login_res, lambda x: x[5], compat_str)
+ warn(
+ 'Unable to login: %s' % 'Invalid password'
+ if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
+ return False
+
+ res = try_get(challenge_results, lambda x: x[0][-1], list)
+ if not res:
+ warn('Unable to extract result entry')
+ return False
+
+ tfa = try_get(res, lambda x: x[0][0], list)
+ if tfa:
+ tfa_str = try_get(tfa, lambda x: x[2], compat_str)
+ if tfa_str == 'TWO_STEP_VERIFICATION':
+ # SEND_SUCCESS - TFA code has been successfully sent to phone
+ # QUOTA_EXCEEDED - reached the limit of TFA codes
+ status = try_get(tfa, lambda x: x[5], compat_str)
+ if status == 'QUOTA_EXCEEDED':
+ warn('Exceeded the limit of TFA codes, try later')
+ return False
+
+ tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
+ if not tl:
+ warn('Unable to extract TL')
+ return False
+
+ tfa_code = self._get_tfa_info('2-step verification code')
+
+ if not tfa_code:
+ warn(
+ 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
+ '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+ return False
+
+ tfa_code = remove_start(tfa_code, 'G-')
+
+ tfa_req = [
+ user_hash, None, 2, None,
+ [
+ 9, None, None, None, None, None, None, None,
+ [None, tfa_code, True, 2]
+ ]]
+
+ tfa_results = req(
+ self._TFA_URL.format(tl), tfa_req,
+ 'Submitting TFA code', 'Unable to submit TFA code')
+
+ if tfa_results is False:
+ return False
+
+ tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
+ if tfa_res:
+ tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
+ warn(
+ 'Unable to finish TFA: %s' % 'Invalid TFA code'
+ if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
+ return False
+
+ check_cookie_url = try_get(
+ tfa_results, lambda x: x[0][-1][2], compat_str)
+ else:
+ check_cookie_url = try_get(res, lambda x: x[2], compat_str)
+
+ if not check_cookie_url:
+ warn('Unable to extract CheckCookie URL')
+ return False
+
+ check_cookie_results = self._download_webpage(
+ check_cookie_url, None, 'Checking cookie', fatal=False)
+
+ if check_cookie_results is False:
+ return False
+
+ if 'https://myaccount.google.com/' not in check_cookie_results:
+ warn('Unable to log in')
return False
- return True
- def _confirm_age(self):
- age_form = {
- 'next_url': '/',
- 'action_confirm': 'Confirm',
- }
- request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
- try:
- self.report_age_confirmation()
- compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
return True
+ def _download_webpage(self, *args, **kwargs):
+ kwargs.setdefault('query', {})['disable_polymer'] = 'true'
+ return super(YoutubeBaseInfoExtractor, self)._download_webpage(
+ *args, **compat_kwargs(kwargs))
+
def _real_initialize(self):
if self._downloader is None:
return
- if not self._set_language():
- return
+ self._set_language()
if not self._login():
return
- self._confirm_age()
-class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
- IE_DESC = u'YouTube.com'
+class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
+ # Extract entries from page with "Load more" button
+ def _entries(self, page, playlist_id):
+ more_widget_html = content_html = page
+ for page_num in itertools.count(1):
+ for entry in self._process_page(content_html):
+ yield entry
+
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
+ break
+
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ if not content_html.strip():
+ # Some webpages show a "Load more" button but they don't
+ # have more videos
+ break
+ more_widget_html = more['load_more_widget_html']
+
+
+class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+ def _process_page(self, content):
+ for video_id, video_title in self.extract_videos_from_page(content):
+ yield self.url_result(video_id, 'Youtube', video_id, video_title)
+
+ def extract_videos_from_page(self, page):
+ ids_in_page = []
+ titles_in_page = []
+ for mobj in re.finditer(self._VIDEO_RE, page):
+ # The link with index 0 is not the first video of the playlist (not sure if still actual)
+ if 'index' in mobj.groupdict() and mobj.group('id') == '0':
+ continue
+ video_id = mobj.group('id')
+ video_title = unescapeHTML(mobj.group('title'))
+ if video_title:
+ video_title = video_title.strip()
+ try:
+ idx = ids_in_page.index(video_id)
+ if video_title and not titles_in_page[idx]:
+ titles_in_page[idx] = video_title
+ except ValueError:
+ ids_in_page.append(video_id)
+ titles_in_page.append(video_title)
+ return zip(ids_in_page, titles_in_page)
+
+
+class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+ def _process_page(self, content):
+ for playlist_id in orderedSet(re.findall(
+ r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
+ content)):
+ yield self.url_result(
+ 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ webpage = self._download_webpage(url, playlist_id)
+ title = self._og_search_title(webpage, fatal=False)
+ return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
+
+
+class YoutubeIE(YoutubeBaseInfoExtractor):
+ IE_DESC = 'YouTube.com'
_VALID_URL = r"""(?x)^
(
- (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
+ (?:https?://|//) # http(s):// or protocol-independent URL
(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
+ (?:www\.)?deturl\.com/www\.youtube\.com/|
+ (?:www\.)?pwnyoutube\.com/|
+ (?:www\.)?hooktube\.com/|
+ (?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
- (?:(?:v|embed|e)/) # v/ or embed/ or e/
+ (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
|(?: # or the v= param in all its forms
- (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+ (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
- (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
+ (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
v=
)
))
- |youtu\.be/ # just youtu.be/xxxx
+ |(?:
+ youtu\.be| # just youtu.be/xxxx
+ vid\.plus| # or vid.plus/xxxx
+ zwearz\.com/watch| # or zwearz.com/watch/xxxx
+ )/
+ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)? # all until now is optional -> you can pass the naked ID
([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
+ (?!.*?\blist=
+ (?:
+ %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
+ WL # WL are handled by the watch later IE
+ )
+ )
(?(1).+)? # if we found the ID, everything can follow
- $"""
+ $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
- # Listed in order of quality
- _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
- # Apple HTTP Live Streaming
- '96', '95', '94', '93', '92', '132', '151',
- # 3D
- '85', '84', '102', '83', '101', '82', '100',
- # Dash video
- '138', '137', '248', '136', '247', '135', '246',
- '245', '244', '134', '243', '133', '242', '160',
- # Dash audio
- '141', '172', '140', '171', '139',
- ]
- _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
- # Apple HTTP Live Streaming
- '96', '95', '94', '93', '92', '132', '151',
- # 3D
- '85', '102', '84', '101', '83', '100', '82',
- # Dash video
- '138', '248', '137', '247', '136', '246', '245',
- '244', '135', '243', '134', '242', '133', '160',
- # Dash audio
- '172', '141', '171', '140', '139',
- ]
- _video_formats_map = {
- 'flv': ['35', '34', '6', '5'],
- '3gp': ['36', '17', '13'],
- 'mp4': ['38', '37', '22', '18'],
- 'webm': ['46', '45', '44', '43'],
- }
- _video_extensions = {
- '13': '3gp',
- '17': '3gp',
- '18': 'mp4',
- '22': 'mp4',
- '36': '3gp',
- '37': 'mp4',
- '38': 'mp4',
- '43': 'webm',
- '44': 'webm',
- '45': 'webm',
- '46': 'webm',
-
- # 3d videos
- '82': 'mp4',
- '83': 'mp4',
- '84': 'mp4',
- '85': 'mp4',
- '100': 'webm',
- '101': 'webm',
- '102': 'webm',
+ _formats = {
+ '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+ '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+ '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
+ '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
+ '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
+ '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
+ '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
+ '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+ '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+ '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+ '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+ '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+
+
+ # 3D videos
+ '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
+ '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
+ '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
+ '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
+ '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
+ '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
+ '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
# Apple HTTP Live Streaming
- '92': 'mp4',
- '93': 'mp4',
- '94': 'mp4',
- '95': 'mp4',
- '96': 'mp4',
- '132': 'mp4',
- '151': 'mp4',
-
- # Dash mp4
- '133': 'mp4',
- '134': 'mp4',
- '135': 'mp4',
- '136': 'mp4',
- '137': 'mp4',
- '138': 'mp4',
- '160': 'mp4',
+ '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+ '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+ '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
+ '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
+ '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
+ '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
+ '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+ '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
+
+ # DASH mp4 video
+ '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
+ '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
# Dash mp4 audio
- '139': 'm4a',
- '140': 'm4a',
- '141': 'm4a',
+ '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
+ '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
+ '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
+ '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
+ '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
# Dash webm
- '171': 'webm',
- '172': 'webm',
- '242': 'webm',
- '243': 'webm',
- '244': 'webm',
- '245': 'webm',
- '246': 'webm',
- '247': 'webm',
- '248': 'webm',
- }
- _video_dimensions = {
- '5': '400x240',
- '6': '???',
- '13': '???',
- '17': '176x144',
- '18': '640x360',
- '22': '1280x720',
- '34': '640x360',
- '35': '854x480',
- '36': '320x240',
- '37': '1920x1080',
- '38': '4096x3072',
- '43': '640x360',
- '44': '854x480',
- '45': '1280x720',
- '46': '1920x1080',
- '82': '360p',
- '83': '480p',
- '84': '720p',
- '85': '1080p',
- '92': '240p',
- '93': '360p',
- '94': '480p',
- '95': '720p',
- '96': '1080p',
- '100': '360p',
- '101': '480p',
- '102': '720p',
- '132': '240p',
- '151': '72p',
- '133': '240p',
- '134': '360p',
- '135': '480p',
- '136': '720p',
- '137': '1080p',
- '138': '>1080p',
- '139': '48k',
- '140': '128k',
- '141': '256k',
- '160': '192p',
- '171': '128k',
- '172': '256k',
- '242': '240p',
- '243': '360p',
- '244': '480p',
- '245': '480p',
- '246': '480p',
- '247': '720p',
- '248': '1080p',
- }
- _special_itags = {
- '82': '3D',
- '83': '3D',
- '84': '3D',
- '85': '3D',
- '100': '3D',
- '101': '3D',
- '102': '3D',
- '133': 'DASH Video',
- '134': 'DASH Video',
- '135': 'DASH Video',
- '136': 'DASH Video',
- '137': 'DASH Video',
- '138': 'DASH Video',
- '139': 'DASH Audio',
- '140': 'DASH Audio',
- '141': 'DASH Audio',
- '160': 'DASH Video',
- '171': 'DASH Audio',
- '172': 'DASH Audio',
- '242': 'DASH Video',
- '243': 'DASH Video',
- '244': 'DASH Video',
- '245': 'DASH Video',
- '246': 'DASH Video',
- '247': 'DASH Video',
- '248': 'DASH Video',
+ '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
+ '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
+ '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+
+ # Dash webm audio
+ '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
+ '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
+
+ # Dash webm audio with opus inside
+ '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
+ '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
+ '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
+
+ # RTMP (unnamed)
+ '_rtmp': {'protocol': 'rtmp'},
}
+ _SUBTITLE_FORMATS = ('ttml', 'vtt')
+
+ _GEO_BYPASS = False
- IE_NAME = u'youtube'
+ IE_NAME = 'youtube'
_TESTS = [
{
- u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
- u"file": u"BaW_jenozKc.mp4",
- u"info_dict": {
- u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
- u"uploader": u"Philipp Hagemeister",
- u"uploader_id": u"phihag",
- u"upload_date": u"20121002",
- u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
+ 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
+ 'info_dict': {
+ 'id': 'BaW_jenozKc',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+ 'uploader': 'Philipp Hagemeister',
+ 'uploader_id': 'phihag',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+ 'upload_date': '20121002',
+ 'license': 'Standard YouTube License',
+ 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+ 'categories': ['Science & Technology'],
+ 'tags': ['youtube-dl'],
+ 'duration': 10,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'start_time': 1,
+ 'end_time': 9,
}
},
{
- u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
- u"file": u"UxxajLWwzqY.mp4",
- u"note": u"Test generic use_cipher_signature video (#897)",
- u"info_dict": {
- u"upload_date": u"20120506",
- u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
- u"description": u"md5:5b292926389560516e384ac437c0ec07",
- u"uploader": u"Icona Pop",
- u"uploader_id": u"IconaPop"
+ 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
+ 'note': 'Test generic use_cipher_signature video (#897)',
+ 'info_dict': {
+ 'id': 'UxxajLWwzqY',
+ 'ext': 'mp4',
+ 'upload_date': '20120506',
+ 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
+ 'alt_title': 'I Love It (feat. Charli XCX)',
+ 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
+ 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
+ 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
+ 'iconic ep', 'iconic', 'love', 'it'],
+ 'duration': 180,
+ 'uploader': 'Icona Pop',
+ 'uploader_id': 'IconaPop',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
+ 'license': 'Standard YouTube License',
+ 'creator': 'Icona Pop',
}
},
{
- u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
- u"file": u"07FYdnEawAQ.mp4",
- u"note": u"Test VEVO video with age protection (#956)",
- u"info_dict": {
- u"upload_date": u"20130703",
- u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
- u"description": u"md5:64249768eec3bc4276236606ea996373",
- u"uploader": u"justintimberlakeVEVO",
- u"uploader_id": u"justintimberlakeVEVO"
+ 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
+ 'note': 'Test VEVO video with age protection (#956)',
+ 'info_dict': {
+ 'id': '07FYdnEawAQ',
+ 'ext': 'mp4',
+ 'upload_date': '20130703',
+ 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
+ 'alt_title': 'Tunnel Vision',
+ 'description': 'md5:64249768eec3bc4276236606ea996373',
+ 'duration': 419,
+ 'uploader': 'justintimberlakeVEVO',
+ 'uploader_id': 'justintimberlakeVEVO',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
+ 'license': 'Standard YouTube License',
+ 'creator': 'Justin Timberlake',
+ 'age_limit': 18,
}
},
{
- u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
- u"file": u"yZIXLfi8CZQ.mp4",
- u"note": u"Embed-only video (#1746)",
- u"info_dict": {
- u"upload_date": u"20120608",
- u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
- u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
- u"uploader": u"SET India",
- u"uploader_id": u"setindia"
+ 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
+ 'note': 'Embed-only video (#1746)',
+ 'info_dict': {
+ 'id': 'yZIXLfi8CZQ',
+ 'ext': 'mp4',
+ 'upload_date': '20120608',
+ 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
+ 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
+ 'uploader': 'SET India',
+ 'uploader_id': 'setindia',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
+ 'license': 'Standard YouTube License',
+ 'age_limit': 18,
}
},
+ {
+ 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
+ 'note': 'Use the first video ID in the URL',
+ 'info_dict': {
+ 'id': 'BaW_jenozKc',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+ 'uploader': 'Philipp Hagemeister',
+ 'uploader_id': 'phihag',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+ 'upload_date': '20121002',
+ 'license': 'Standard YouTube License',
+ 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+ 'categories': ['Science & Technology'],
+ 'tags': ['youtube-dl'],
+ 'duration': 10,
+ 'like_count': int,
+ 'dislike_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
+ 'note': '256k DASH audio (format 141) via DASH manifest',
+ 'info_dict': {
+ 'id': 'a9LDPn-MO4I',
+ 'ext': 'm4a',
+ 'upload_date': '20121002',
+ 'uploader_id': '8KVIDEO',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
+ 'description': '',
+ 'uploader': '8KVIDEO',
+ 'license': 'Standard YouTube License',
+ 'title': 'UHDTV TEST 8K VIDEO.mp4'
+ },
+ 'params': {
+ 'youtube_include_dash_manifest': True,
+ 'format': '141',
+ },
+ 'skip': 'format 141 not served anymore',
+ },
+ # DASH manifest with encrypted signature
+ {
+ 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
+ 'info_dict': {
+ 'id': 'IB3lcPjvWLA',
+ 'ext': 'm4a',
+ 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
+ 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
+ 'duration': 244,
+ 'uploader': 'AfrojackVEVO',
+ 'uploader_id': 'AfrojackVEVO',
+ 'upload_date': '20131011',
+ 'license': 'Standard YouTube License',
+ },
+ 'params': {
+ 'youtube_include_dash_manifest': True,
+ 'format': '141/bestaudio[ext=m4a]',
+ },
+ },
+ # JS player signature function name containing $
+ {
+ 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
+ 'info_dict': {
+ 'id': 'nfWlot6h_JM',
+ 'ext': 'm4a',
+ 'title': 'Taylor Swift - Shake It Off',
+ 'alt_title': 'Shake It Off',
+ 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
+ 'duration': 242,
+ 'uploader': 'TaylorSwiftVEVO',
+ 'uploader_id': 'TaylorSwiftVEVO',
+ 'upload_date': '20140818',
+ 'license': 'Standard YouTube License',
+ 'creator': 'Taylor Swift',
+ },
+ 'params': {
+ 'youtube_include_dash_manifest': True,
+ 'format': '141/bestaudio[ext=m4a]',
+ },
+ },
+ # Controversy video
+ {
+ 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
+ 'info_dict': {
+ 'id': 'T4XJQO3qol8',
+ 'ext': 'mp4',
+ 'duration': 219,
+ 'upload_date': '20100909',
+ 'uploader': 'The Amazing Atheist',
+ 'uploader_id': 'TheAmazingAtheist',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
+ 'license': 'Standard YouTube License',
+ 'title': 'Burning Everyone\'s Koran',
+ 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
+ }
+ },
+ # Normal age-gate video (No vevo, embed allowed)
+ {
+ 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
+ 'info_dict': {
+ 'id': 'HtVdAasjOgU',
+ 'ext': 'mp4',
+ 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
+ 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
+ 'duration': 142,
+ 'uploader': 'The Witcher',
+ 'uploader_id': 'WitcherGame',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
+ 'upload_date': '20140605',
+ 'license': 'Standard YouTube License',
+ 'age_limit': 18,
+ },
+ },
+ # Age-gate video with encrypted signature
+ {
+ 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
+ 'info_dict': {
+ 'id': '6kLq3WMV1nU',
+ 'ext': 'mp4',
+ 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
+ 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
+ 'duration': 247,
+ 'uploader': 'LloydVEVO',
+ 'uploader_id': 'LloydVEVO',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
+ 'upload_date': '20110629',
+ 'license': 'Standard YouTube License',
+ 'age_limit': 18,
+ },
+ },
+ # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
+ # YouTube Red ad is not captured for creator
+ {
+ 'url': '__2ABJjxzNo',
+ 'info_dict': {
+ 'id': '__2ABJjxzNo',
+ 'ext': 'mp4',
+ 'duration': 266,
+ 'upload_date': '20100430',
+ 'uploader_id': 'deadmau5',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
+ 'creator': 'deadmau5',
+ 'description': 'md5:12c56784b8032162bb936a5f76d55360',
+ 'uploader': 'deadmau5',
+ 'license': 'Standard YouTube License',
+ 'title': 'Deadmau5 - Some Chords (HD)',
+ 'alt_title': 'Some Chords',
+ },
+ 'expected_warnings': [
+ 'DASH manifest missing',
+ ]
+ },
+ # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
+ {
+ 'url': 'lqQg6PlCWgI',
+ 'info_dict': {
+ 'id': 'lqQg6PlCWgI',
+ 'ext': 'mp4',
+ 'duration': 6085,
+ 'upload_date': '20150827',
+ 'uploader_id': 'olympic',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
+ 'license': 'Standard YouTube License',
+ 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
+ 'uploader': 'Olympic',
+ 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
+ },
+ 'params': {
+ 'skip_download': 'requires avconv',
+ }
+ },
+ # Non-square pixels
+ {
+ 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
+ 'info_dict': {
+ 'id': '_b-2C3KPAM0',
+ 'ext': 'mp4',
+ 'stretched_ratio': 16 / 9.,
+ 'duration': 85,
+ 'upload_date': '20110310',
+ 'uploader_id': 'AllenMeow',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
+ 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
+ 'uploader': '孫艾倫',
+ 'license': 'Standard YouTube License',
+ 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
+ },
+ },
+ # url_encoded_fmt_stream_map is empty string
+ {
+ 'url': 'qEJwOuvDf7I',
+ 'info_dict': {
+ 'id': 'qEJwOuvDf7I',
+ 'ext': 'webm',
+ 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
+ 'description': '',
+ 'upload_date': '20150404',
+ 'uploader_id': 'spbelect',
+ 'uploader': 'Наблюдатели Петербурга',
+ },
+ 'params': {
+ 'skip_download': 'requires avconv',
+ },
+ 'skip': 'This live event has ended.',
+ },
+ # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
+ {
+ 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
+ 'info_dict': {
+ 'id': 'FIl7x6_3R5Y',
+ 'ext': 'mp4',
+ 'title': 'md5:7b81415841e02ecd4313668cde88737a',
+ 'description': 'md5:116377fd2963b81ec4ce64b542173306',
+ 'duration': 220,
+ 'upload_date': '20150625',
+ 'uploader_id': 'dorappi2000',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
+ 'uploader': 'dorappi2000',
+ 'license': 'Standard YouTube License',
+ 'formats': 'mincount:32',
+ },
+ },
+ # DASH manifest with segment_list
+ {
+ 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
+ 'md5': '8ce563a1d667b599d21064e982ab9e31',
+ 'info_dict': {
+ 'id': 'CsmdDsKjzN8',
+ 'ext': 'mp4',
+ 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
+ 'uploader': 'Airtek',
+ 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
+ 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
+ 'license': 'Standard YouTube License',
+ 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
+ },
+ 'params': {
+ 'youtube_include_dash_manifest': True,
+ 'format': '135', # bestvideo
+ },
+ 'skip': 'This live event has ended.',
+ },
+ {
+ # Multifeed videos (multiple cameras), URL is for Main Camera
+ 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
+ 'info_dict': {
+ 'id': 'jqWvoWXjCVs',
+ 'title': 'teamPGP: Rocket League Noob Stream',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'jqWvoWXjCVs',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'duration': 7335,
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
+ },
+ }, {
+ 'info_dict': {
+ 'id': '6h8e8xoXJzg',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'duration': 7337,
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
+ },
+ }, {
+ 'info_dict': {
+ 'id': 'PUOgX5z9xZw',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'duration': 7337,
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
+ },
+ }, {
+ 'info_dict': {
+ 'id': 'teuwxikvS5k',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (zim)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'duration': 7334,
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
+ },
+ }],
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
+ 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
+ 'info_dict': {
+ 'id': 'gVfLd0zydlo',
+ 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
+ },
+ 'playlist_count': 2,
+ 'skip': 'Not multifeed anymore',
+ },
+ {
+ 'url': 'https://vid.plus/FlRa-iH7PGw',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
+ 'only_matching': True,
+ },
+ {
+ # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
+ # Also tests cut-off URL expansion in video description (see
+ # https://github.com/rg3/youtube-dl/issues/1892,
+ # https://github.com/rg3/youtube-dl/issues/8164)
+ 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
+ 'info_dict': {
+ 'id': 'lsguqyKfVQg',
+ 'ext': 'mp4',
+ 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
+ 'alt_title': 'Dark Walk',
+ 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
+ 'duration': 133,
+ 'upload_date': '20151119',
+ 'uploader_id': 'IronSoulElf',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
+ 'uploader': 'IronSoulElf',
+ 'license': 'Standard YouTube License',
+ 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
+ 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
+ 'only_matching': True,
+ },
+ {
+ # Video with yt:stretch=17:0
+ 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
+ 'info_dict': {
+ 'id': 'Q39EVAstoRM',
+ 'ext': 'mp4',
+ 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
+ 'description': 'md5:ee18a25c350637c8faff806845bddee9',
+ 'upload_date': '20151107',
+ 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
+ 'uploader': 'CH GAMER DROID',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'This video does not exist.',
+ },
+ {
+ # Video licensed under Creative Commons
+ 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
+ 'info_dict': {
+ 'id': 'M4gD1WSo5mA',
+ 'ext': 'mp4',
+ 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
+ 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
+ 'duration': 721,
+ 'upload_date': '20150127',
+ 'uploader_id': 'BerkmanCenter',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
+ 'uploader': 'The Berkman Klein Center for Internet & Society',
+ 'license': 'Creative Commons Attribution license (reuse allowed)',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # Channel-like uploader_url
+ 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
+ 'info_dict': {
+ 'id': 'eQcmzGIKrzg',
+ 'ext': 'mp4',
+ 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
+ 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
+ 'duration': 4060,
+ 'upload_date': '20151119',
+ 'uploader': 'Bernie 2016',
+ 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
+ 'license': 'Creative Commons Attribution license (reuse allowed)',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',
+ 'only_matching': True,
+ },
+ {
+ # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059)
+ 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
+ 'only_matching': True,
+ },
+ {
+ # Rental video preview
+ 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
+ 'info_dict': {
+ 'id': 'uGpuVWrhIzE',
+ 'ext': 'mp4',
+ 'title': 'Piku - Trailer',
+ 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
+ 'upload_date': '20150811',
+ 'uploader': 'FlixMatrix',
+ 'uploader_id': 'FlixMatrixKaravan',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
+ 'license': 'Standard YouTube License',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # YouTube Red video with episode data
+ 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
+ 'info_dict': {
+ 'id': 'iqKdEhx-dD4',
+ 'ext': 'mp4',
+ 'title': 'Isolation - Mind Field (Ep 1)',
+ 'description': 'md5:8013b7ddea787342608f63a13ddc9492',
+ 'duration': 2085,
+ 'upload_date': '20170118',
+ 'uploader': 'Vsauce',
+ 'uploader_id': 'Vsauce',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
+ 'license': 'Standard YouTube License',
+ 'series': 'Mind Field',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': [
+ 'Skipping DASH manifest',
+ ],
+ },
+ {
+ # The following content has been identified by the YouTube community
+ # as inappropriate or offensive to some audiences.
+ 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
+ 'info_dict': {
+ 'id': '6SJNVb0GnPI',
+ 'ext': 'mp4',
+ 'title': 'Race Differences in Intelligence',
+ 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
+ 'duration': 965,
+ 'upload_date': '20140124',
+ 'uploader': 'New Century Foundation',
+ 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
+ 'license': 'Standard YouTube License',
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # itag 212
+ 'url': '1t24XAntNCY',
+ 'only_matching': True,
+ },
+ {
+ # geo restricted to JP
+ 'url': 'sJL6WA-aGkQ',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
+ 'only_matching': True,
+ },
]
-
- @classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
- if YoutubePlaylistIE.suitable(url): return False
- return re.match(cls._VALID_URL, url) is not None
-
def __init__(self, *args, **kwargs):
super(YoutubeIE, self).__init__(*args, **kwargs)
self._player_cache = {}
- def report_video_webpage_download(self, video_id):
- """Report attempt to download video webpage."""
- self.to_screen(u'%s: Downloading video webpage' % video_id)
-
def report_video_info_webpage_download(self, video_id):
"""Report attempt to download video info webpage."""
- self.to_screen(u'%s: Downloading video info webpage' % video_id)
+ self.to_screen('%s: Downloading video info webpage' % video_id)
def report_information_extraction(self, video_id):
"""Report attempt to extract video information."""
- self.to_screen(u'%s: Extracting video information' % video_id)
+ self.to_screen('%s: Extracting video information' % video_id)
def report_unavailable_format(self, video_id, format):
"""Report extracted video URL."""
- self.to_screen(u'%s: Format %s not available' % (video_id, format))
+ self.to_screen('%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
"""Indicate the download will use the RTMP protocol."""
- self.to_screen(u'RTMP download detected')
-
- def _extract_signature_function(self, video_id, player_url, slen):
- id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
- player_url)
+ self.to_screen('RTMP download detected')
+
+ def _signature_cache_id(self, example_sig):
+ """ Return a string representation of a signature """
+ return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
+
+ def _extract_signature_function(self, video_id, player_url, example_sig):
+ id_m = re.match(
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
+ player_url)
+ if not id_m:
+ raise ExtractorError('Cannot identify player %r' % player_url)
player_type = id_m.group('ext')
player_id = id_m.group('id')
# Read from filesystem cache
- func_id = '%s_%s_%d' % (player_type, player_id, slen)
+ func_id = '%s_%s_%s' % (
+ player_type, player_id, self._signature_cache_id(example_sig))
assert os.path.basename(func_id) == func_id
- cache_dir = get_cachedir(self._downloader.params)
- cache_enabled = cache_dir is not None
- if cache_enabled:
- cache_fn = os.path.join(os.path.expanduser(cache_dir),
- u'youtube-sigfuncs',
- func_id + '.json')
- try:
- with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
- cache_spec = json.load(cachef)
- return lambda s: u''.join(s[i] for i in cache_spec)
- except IOError:
- pass # No cache available
+ cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
+ if cache_spec is not None:
+ return lambda s: ''.join(s[i] for i in cache_spec)
+ download_note = (
+ 'Downloading player %s' % player_url
+ if self._downloader.params.get('verbose') else
+ 'Downloading %s player %s' % (player_type, player_id)
+ )
if player_type == 'js':
code = self._download_webpage(
player_url, video_id,
- note=u'Downloading %s player %s' % (player_type, player_id),
- errnote=u'Download of %s failed' % player_url)
+ note=download_note,
+ errnote='Download of %s failed' % player_url)
res = self._parse_sig_js(code)
elif player_type == 'swf':
urlh = self._request_webpage(
player_url, video_id,
- note=u'Downloading %s player %s' % (player_type, player_id),
- errnote=u'Download of %s failed' % player_url)
+ note=download_note,
+ errnote='Download of %s failed' % player_url)
code = urlh.read()
res = self._parse_sig_swf(code)
else:
assert False, 'Invalid player type %r' % player_type
- if cache_enabled:
- try:
- test_string = u''.join(map(compat_chr, range(slen)))
- cache_res = res(test_string)
- cache_spec = [ord(c) for c in cache_res]
- try:
- os.makedirs(os.path.dirname(cache_fn))
- except OSError as ose:
- if ose.errno != errno.EEXIST:
- raise
- write_json_file(cache_spec, cache_fn)
- except Exception:
- tb = traceback.format_exc()
- self._downloader.report_warning(
- u'Writing cache to %r failed: %s' % (cache_fn, tb))
+ test_string = ''.join(map(compat_chr, range(len(example_sig))))
+ cache_res = res(test_string)
+ cache_spec = [ord(c) for c in cache_res]
+ self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
return res
- def _print_sig_code(self, func, slen):
+ def _print_sig_code(self, func, example_sig):
def gen_sig_code(idxs):
def _genslice(start, end, step):
- starts = u'' if start == 0 else str(start)
- ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
- steps = u'' if step == 1 else (u':%d' % step)
- return u's[%s%s%s]' % (starts, ends, steps)
+ starts = '' if start == 0 else str(start)
+ ends = (':%d' % (end + step)) if end + step >= 0 else ':'
+ steps = '' if step == 1 else (':%d' % step)
+ return 's[%s%s%s]' % (starts, ends, steps)
step = None
- start = '(Never used)' # Quelch pyflakes warnings - start will be
- # set as soon as step is set
+ # Quelch pyflakes warnings - start will be set when step is set
+ start = '(Never used)'
for i, prev in zip(idxs[1:], idxs[:-1]):
if step is not None:
if i - prev == step:
start = prev
continue
else:
- yield u's[%d]' % prev
+ yield 's[%d]' % prev
if step is None:
- yield u's[%d]' % i
+ yield 's[%d]' % i
else:
yield _genslice(start, i, step)
- test_string = u''.join(map(compat_chr, range(slen)))
+ test_string = ''.join(map(compat_chr, range(len(example_sig))))
cache_res = func(test_string)
cache_spec = [ord(c) for c in cache_res]
- expr_code = u' + '.join(gen_sig_code(cache_spec))
- code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
- self.to_screen(u'Extracted signature function:\n' + code)
+ expr_code = ' + '.join(gen_sig_code(cache_spec))
+ signature_id_tuple = '(%s)' % (
+ ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
+ code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
+ ' return %s\n') % (signature_id_tuple, expr_code)
+ self.to_screen('Extracted signature function:\n' + code)
def _parse_sig_js(self, jscode):
funcname = self._search_regex(
- r'signature=([a-zA-Z]+)', jscode,
- u'Initial JS player signature function name')
-
- functions = {}
-
- def argidx(varname):
- return string.lowercase.index(varname)
-
- def interpret_statement(stmt, local_vars, allow_recursion=20):
- if allow_recursion < 0:
- raise ExtractorError(u'Recursion limit reached')
-
- if stmt.startswith(u'var '):
- stmt = stmt[len(u'var '):]
- ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
- r'=(?P<expr>.*)$', stmt)
- if ass_m:
- if ass_m.groupdict().get('index'):
- def assign(val):
- lvar = local_vars[ass_m.group('out')]
- idx = interpret_expression(ass_m.group('index'),
- local_vars, allow_recursion)
- assert isinstance(idx, int)
- lvar[idx] = val
- return val
- expr = ass_m.group('expr')
- else:
- def assign(val):
- local_vars[ass_m.group('out')] = val
- return val
- expr = ass_m.group('expr')
- elif stmt.startswith(u'return '):
- assign = lambda v: v
- expr = stmt[len(u'return '):]
- else:
- raise ExtractorError(
- u'Cannot determine left side of statement in %r' % stmt)
-
- v = interpret_expression(expr, local_vars, allow_recursion)
- return assign(v)
-
- def interpret_expression(expr, local_vars, allow_recursion):
- if expr.isdigit():
- return int(expr)
-
- if expr.isalpha():
- return local_vars[expr]
-
- m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
- if m:
- member = m.group('member')
- val = local_vars[m.group('in')]
- if member == 'split("")':
- return list(val)
- if member == 'join("")':
- return u''.join(val)
- if member == 'length':
- return len(val)
- if member == 'reverse()':
- return val[::-1]
- slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
- if slice_m:
- idx = interpret_expression(
- slice_m.group('idx'), local_vars, allow_recursion-1)
- return val[idx:]
-
- m = re.match(
- r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
- if m:
- val = local_vars[m.group('in')]
- idx = interpret_expression(m.group('idx'), local_vars,
- allow_recursion-1)
- return val[idx]
-
- m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
- if m:
- a = interpret_expression(m.group('a'),
- local_vars, allow_recursion)
- b = interpret_expression(m.group('b'),
- local_vars, allow_recursion)
- return a % b
-
- m = re.match(
- r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
- if m:
- fname = m.group('func')
- if fname not in functions:
- functions[fname] = extract_function(fname)
- argvals = [int(v) if v.isdigit() else local_vars[v]
- for v in m.group('args').split(',')]
- return functions[fname](argvals)
- raise ExtractorError(u'Unsupported JS expression %r' % expr)
-
- def extract_function(funcname):
- func_m = re.search(
- r'function ' + re.escape(funcname) +
- r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
- jscode)
- argnames = func_m.group('args').split(',')
-
- def resf(args):
- local_vars = dict(zip(argnames, args))
- for stmt in func_m.group('code').split(';'):
- res = interpret_statement(stmt, local_vars)
- return res
- return resf
-
- initial_function = extract_function(funcname)
+ (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+ r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\('),
+ jscode, 'Initial JS player signature function name', group='sig')
+
+ jsi = JSInterpreter(jscode)
+ initial_function = jsi.extract_function(funcname)
return lambda s: initial_function([s])
def _parse_sig_swf(self, file_contents):
- if file_contents[1:3] != b'WS':
- raise ExtractorError(
- u'Not an SWF file; header is %r' % file_contents[:3])
- if file_contents[:1] == b'C':
- content = zlib.decompress(file_contents[8:])
- else:
- raise NotImplementedError(u'Unsupported compression format %r' %
- file_contents[:1])
-
- def extract_tags(content):
- pos = 0
- while pos < len(content):
- header16 = struct.unpack('<H', content[pos:pos+2])[0]
- pos += 2
- tag_code = header16 >> 6
- tag_len = header16 & 0x3f
- if tag_len == 0x3f:
- tag_len = struct.unpack('<I', content[pos:pos+4])[0]
- pos += 4
- assert pos+tag_len <= len(content)
- yield (tag_code, content[pos:pos+tag_len])
- pos += tag_len
-
- code_tag = next(tag
- for tag_code, tag in extract_tags(content)
- if tag_code == 82)
- p = code_tag.index(b'\0', 4) + 1
- code_reader = io.BytesIO(code_tag[p:])
-
- # Parse ABC (AVM2 ByteCode)
- def read_int(reader=None):
- if reader is None:
- reader = code_reader
- res = 0
- shift = 0
- for _ in range(5):
- buf = reader.read(1)
- assert len(buf) == 1
- b = struct.unpack('<B', buf)[0]
- res = res | ((b & 0x7f) << shift)
- if b & 0x80 == 0:
- break
- shift += 7
- return res
-
- def u30(reader=None):
- res = read_int(reader)
- assert res & 0xf0000000 == 0
- return res
- u32 = read_int
-
- def s32(reader=None):
- v = read_int(reader)
- if v & 0x80000000 != 0:
- v = - ((v ^ 0xffffffff) + 1)
- return v
-
- def read_string(reader=None):
- if reader is None:
- reader = code_reader
- slen = u30(reader)
- resb = reader.read(slen)
- assert len(resb) == slen
- return resb.decode('utf-8')
-
- def read_bytes(count, reader=None):
- if reader is None:
- reader = code_reader
- resb = reader.read(count)
- assert len(resb) == count
- return resb
-
- def read_byte(reader=None):
- resb = read_bytes(1, reader=reader)
- res = struct.unpack('<B', resb)[0]
- return res
-
- # minor_version + major_version
- read_bytes(2 + 2)
-
- # Constant pool
- int_count = u30()
- for _c in range(1, int_count):
- s32()
- uint_count = u30()
- for _c in range(1, uint_count):
- u32()
- double_count = u30()
- read_bytes((double_count-1) * 8)
- string_count = u30()
- constant_strings = [u'']
- for _c in range(1, string_count):
- s = read_string()
- constant_strings.append(s)
- namespace_count = u30()
- for _c in range(1, namespace_count):
- read_bytes(1) # kind
- u30() # name
- ns_set_count = u30()
- for _c in range(1, ns_set_count):
- count = u30()
- for _c2 in range(count):
- u30()
- multiname_count = u30()
- MULTINAME_SIZES = {
- 0x07: 2, # QName
- 0x0d: 2, # QNameA
- 0x0f: 1, # RTQName
- 0x10: 1, # RTQNameA
- 0x11: 0, # RTQNameL
- 0x12: 0, # RTQNameLA
- 0x09: 2, # Multiname
- 0x0e: 2, # MultinameA
- 0x1b: 1, # MultinameL
- 0x1c: 1, # MultinameLA
- }
- multinames = [u'']
- for _c in range(1, multiname_count):
- kind = u30()
- assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
- if kind == 0x07:
- u30() # namespace_idx
- name_idx = u30()
- multinames.append(constant_strings[name_idx])
- else:
- multinames.append('[MULTINAME kind: %d]' % kind)
- for _c2 in range(MULTINAME_SIZES[kind]):
- u30()
-
- # Methods
- method_count = u30()
- MethodInfo = collections.namedtuple(
- 'MethodInfo',
- ['NEED_ARGUMENTS', 'NEED_REST'])
- method_infos = []
- for method_id in range(method_count):
- param_count = u30()
- u30() # return type
- for _ in range(param_count):
- u30() # param type
- u30() # name index (always 0 for youtube)
- flags = read_byte()
- if flags & 0x08 != 0:
- # Options present
- option_count = u30()
- for c in range(option_count):
- u30() # val
- read_bytes(1) # kind
- if flags & 0x80 != 0:
- # Param names present
- for _ in range(param_count):
- u30() # param name
- mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
- method_infos.append(mi)
-
- # Metadata
- metadata_count = u30()
- for _c in range(metadata_count):
- u30() # name
- item_count = u30()
- for _c2 in range(item_count):
- u30() # key
- u30() # value
-
- def parse_traits_info():
- trait_name_idx = u30()
- kind_full = read_byte()
- kind = kind_full & 0x0f
- attrs = kind_full >> 4
- methods = {}
- if kind in [0x00, 0x06]: # Slot or Const
- u30() # Slot id
- u30() # type_name_idx
- vindex = u30()
- if vindex != 0:
- read_byte() # vkind
- elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
- u30() # disp_id
- method_idx = u30()
- methods[multinames[trait_name_idx]] = method_idx
- elif kind == 0x04: # Class
- u30() # slot_id
- u30() # classi
- elif kind == 0x05: # Function
- u30() # slot_id
- function_idx = u30()
- methods[function_idx] = multinames[trait_name_idx]
- else:
- raise ExtractorError(u'Unsupported trait kind %d' % kind)
-
- if attrs & 0x4 != 0: # Metadata present
- metadata_count = u30()
- for _c3 in range(metadata_count):
- u30() # metadata index
-
- return methods
-
- # Classes
- TARGET_CLASSNAME = u'SignatureDecipher'
- searched_idx = multinames.index(TARGET_CLASSNAME)
- searched_class_id = None
- class_count = u30()
- for class_id in range(class_count):
- name_idx = u30()
- if name_idx == searched_idx:
- # We found the class we're looking for!
- searched_class_id = class_id
- u30() # super_name idx
- flags = read_byte()
- if flags & 0x08 != 0: # Protected namespace is present
- u30() # protected_ns_idx
- intrf_count = u30()
- for _c2 in range(intrf_count):
- u30()
- u30() # iinit
- trait_count = u30()
- for _c2 in range(trait_count):
- parse_traits_info()
-
- if searched_class_id is None:
- raise ExtractorError(u'Target class %r not found' %
- TARGET_CLASSNAME)
-
- method_names = {}
- method_idxs = {}
- for class_id in range(class_count):
- u30() # cinit
- trait_count = u30()
- for _c2 in range(trait_count):
- trait_methods = parse_traits_info()
- if class_id == searched_class_id:
- method_names.update(trait_methods.items())
- method_idxs.update(dict(
- (idx, name)
- for name, idx in trait_methods.items()))
-
- # Scripts
- script_count = u30()
- for _c in range(script_count):
- u30() # init
- trait_count = u30()
- for _c2 in range(trait_count):
- parse_traits_info()
-
- # Method bodies
- method_body_count = u30()
- Method = collections.namedtuple('Method', ['code', 'local_count'])
- methods = {}
- for _c in range(method_body_count):
- method_idx = u30()
- u30() # max_stack
- local_count = u30()
- u30() # init_scope_depth
- u30() # max_scope_depth
- code_length = u30()
- code = read_bytes(code_length)
- if method_idx in method_idxs:
- m = Method(code, local_count)
- methods[method_idxs[method_idx]] = m
- exception_count = u30()
- for _c2 in range(exception_count):
- u30() # from
- u30() # to
- u30() # target
- u30() # exc_type
- u30() # var_name
- trait_count = u30()
- for _c2 in range(trait_count):
- parse_traits_info()
-
- assert p + code_reader.tell() == len(code_tag)
- assert len(methods) == len(method_idxs)
-
- method_pyfunctions = {}
-
- def extract_function(func_name):
- if func_name in method_pyfunctions:
- return method_pyfunctions[func_name]
- if func_name not in methods:
- raise ExtractorError(u'Cannot find function %r' % func_name)
- m = methods[func_name]
-
- def resfunc(args):
- registers = ['(this)'] + list(args) + [None] * m.local_count
- stack = []
- coder = io.BytesIO(m.code)
- while True:
- opcode = struct.unpack('!B', coder.read(1))[0]
- if opcode == 36: # pushbyte
- v = struct.unpack('!B', coder.read(1))[0]
- stack.append(v)
- elif opcode == 44: # pushstring
- idx = u30(coder)
- stack.append(constant_strings[idx])
- elif opcode == 48: # pushscope
- # We don't implement the scope register, so we'll just
- # ignore the popped value
- stack.pop()
- elif opcode == 70: # callproperty
- index = u30(coder)
- mname = multinames[index]
- arg_count = u30(coder)
- args = list(reversed(
- [stack.pop() for _ in range(arg_count)]))
- obj = stack.pop()
- if mname == u'split':
- assert len(args) == 1
- assert isinstance(args[0], compat_str)
- assert isinstance(obj, compat_str)
- if args[0] == u'':
- res = list(obj)
- else:
- res = obj.split(args[0])
- stack.append(res)
- elif mname == u'slice':
- assert len(args) == 1
- assert isinstance(args[0], int)
- assert isinstance(obj, list)
- res = obj[args[0]:]
- stack.append(res)
- elif mname == u'join':
- assert len(args) == 1
- assert isinstance(args[0], compat_str)
- assert isinstance(obj, list)
- res = args[0].join(obj)
- stack.append(res)
- elif mname in method_pyfunctions:
- stack.append(method_pyfunctions[mname](args))
- else:
- raise NotImplementedError(
- u'Unsupported property %r on %r'
- % (mname, obj))
- elif opcode == 72: # returnvalue
- res = stack.pop()
- return res
- elif opcode == 79: # callpropvoid
- index = u30(coder)
- mname = multinames[index]
- arg_count = u30(coder)
- args = list(reversed(
- [stack.pop() for _ in range(arg_count)]))
- obj = stack.pop()
- if mname == u'reverse':
- assert isinstance(obj, list)
- obj.reverse()
- else:
- raise NotImplementedError(
- u'Unsupported (void) property %r on %r'
- % (mname, obj))
- elif opcode == 93: # findpropstrict
- index = u30(coder)
- mname = multinames[index]
- res = extract_function(mname)
- stack.append(res)
- elif opcode == 97: # setproperty
- index = u30(coder)
- value = stack.pop()
- idx = stack.pop()
- obj = stack.pop()
- assert isinstance(obj, list)
- assert isinstance(idx, int)
- obj[idx] = value
- elif opcode == 98: # getlocal
- index = u30(coder)
- stack.append(registers[index])
- elif opcode == 99: # setlocal
- index = u30(coder)
- value = stack.pop()
- registers[index] = value
- elif opcode == 102: # getproperty
- index = u30(coder)
- pname = multinames[index]
- if pname == u'length':
- obj = stack.pop()
- assert isinstance(obj, list)
- stack.append(len(obj))
- else: # Assume attribute access
- idx = stack.pop()
- assert isinstance(idx, int)
- obj = stack.pop()
- assert isinstance(obj, list)
- stack.append(obj[idx])
- elif opcode == 128: # coerce
- u30(coder)
- elif opcode == 133: # coerce_s
- assert isinstance(stack[-1], (type(None), compat_str))
- elif opcode == 164: # modulo
- value2 = stack.pop()
- value1 = stack.pop()
- res = value1 % value2
- stack.append(res)
- elif opcode == 208: # getlocal_0
- stack.append(registers[0])
- elif opcode == 209: # getlocal_1
- stack.append(registers[1])
- elif opcode == 210: # getlocal_2
- stack.append(registers[2])
- elif opcode == 211: # getlocal_3
- stack.append(registers[3])
- elif opcode == 214: # setlocal_2
- registers[2] = stack.pop()
- elif opcode == 215: # setlocal_3
- registers[3] = stack.pop()
- else:
- raise NotImplementedError(
- u'Unsupported opcode %d' % opcode)
-
- method_pyfunctions[func_name] = resfunc
- return resfunc
-
- initial_function = extract_function(u'decipher')
+ swfi = SWFInterpreter(file_contents)
+ TARGET_CLASSNAME = 'SignatureDecipher'
+ searched_class = swfi.extract_class(TARGET_CLASSNAME)
+ initial_function = swfi.extract_function(searched_class, 'decipher')
return lambda s: initial_function([s])
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
"""Turn the encrypted s field into a working signature"""
- if player_url is not None:
- if player_url.startswith(u'//'):
- player_url = u'https:' + player_url
- try:
- player_id = (player_url, len(s))
- if player_id not in self._player_cache:
- func = self._extract_signature_function(
- video_id, player_url, len(s)
- )
- self._player_cache[player_id] = func
- func = self._player_cache[player_id]
- if self._downloader.params.get('youtube_print_sig_code'):
- self._print_sig_code(func, len(s))
- return func(s)
- except Exception:
- tb = traceback.format_exc()
- self._downloader.report_warning(
- u'Automatic signature extraction failed: ' + tb)
-
- self._downloader.report_warning(
- u'Warning: Falling back to static signature algorithm')
-
- return self._static_decrypt_signature(
- s, video_id, player_url, age_gate)
-
- def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
- if age_gate:
- # The videos with age protection use another player, so the
- # algorithms can be different.
- if len(s) == 86:
- return s[2:63] + s[82] + s[64:82] + s[63]
-
- if len(s) == 93:
- return s[86:29:-1] + s[88] + s[28:5:-1]
- elif len(s) == 92:
- return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
- elif len(s) == 91:
- return s[84:27:-1] + s[86] + s[26:5:-1]
- elif len(s) == 90:
- return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
- elif len(s) == 89:
- return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
- elif len(s) == 88:
- return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
- elif len(s) == 87:
- return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
- elif len(s) == 86:
- return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
- elif len(s) == 85:
- return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
- elif len(s) == 84:
- return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
- elif len(s) == 83:
- return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
- elif len(s) == 82:
- return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
- elif len(s) == 81:
- return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
- elif len(s) == 80:
- return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
- elif len(s) == 79:
- return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
+ if player_url is None:
+ raise ExtractorError('Cannot decrypt signature without player_url')
- else:
- raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
+ if player_url.startswith('//'):
+ player_url = 'https:' + player_url
+ elif not re.match(r'https?://', player_url):
+ player_url = compat_urlparse.urljoin(
+ 'https://www.youtube.com', player_url)
+ try:
+ player_id = (player_url, self._signature_cache_id(s))
+ if player_id not in self._player_cache:
+ func = self._extract_signature_function(
+ video_id, player_url, s
+ )
+ self._player_cache[player_id] = func
+ func = self._player_cache[player_id]
+ if self._downloader.params.get('youtube_print_sig_code'):
+ self._print_sig_code(func, s)
+ return func(s)
+ except Exception as e:
+ tb = traceback.format_exc()
+ raise ExtractorError(
+ 'Signature extraction failed: ' + tb, cause=e)
- def _get_available_subtitles(self, video_id, webpage):
+ def _get_subtitles(self, video_id, webpage):
try:
- sub_list = self._download_webpage(
- 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
+ subs_doc = self._download_xml(
+ 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
video_id, note=False)
except ExtractorError as err:
- self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
+ self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
return {}
- lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
sub_lang_list = {}
- for l in lang_list:
- lang = l[1]
- params = compat_urllib_parse.urlencode({
- 'lang': lang,
- 'v': video_id,
- 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
- 'name': l[0].encode('utf-8'),
- })
- url = u'http://www.youtube.com/api/timedtext?' + params
- sub_lang_list[lang] = url
+ for track in subs_doc.findall('track'):
+ lang = track.attrib['lang_code']
+ if lang in sub_lang_list:
+ continue
+ sub_formats = []
+ for ext in self._SUBTITLE_FORMATS:
+ params = compat_urllib_parse_urlencode({
+ 'lang': lang,
+ 'v': video_id,
+ 'fmt': ext,
+ 'name': track.attrib['name'].encode('utf-8'),
+ })
+ sub_formats.append({
+ 'url': 'https://www.youtube.com/api/timedtext?' + params,
+ 'ext': ext,
+ })
+ sub_lang_list[lang] = sub_formats
if not sub_lang_list:
- self._downloader.report_warning(u'video doesn\'t have subtitles')
+ self._downloader.report_warning('video doesn\'t have subtitles')
return {}
return sub_lang_list
- def _get_available_automatic_caption(self, video_id, webpage):
+ def _get_ytplayer_config(self, video_id, webpage):
+ patterns = (
+ # User data may contain arbitrary character sequences that may affect
+ # JSON extraction with regex, e.g. when '};' is contained the second
+ # regex won't capture the whole JSON. Yet working around by trying more
+ # concrete regex first keeping in mind proper quoted string handling
+ # to be implemented in future that will replace this workaround (see
+ # https://github.com/rg3/youtube-dl/issues/7468,
+ # https://github.com/rg3/youtube-dl/pull/7599)
+ r';ytplayer\.config\s*=\s*({.+?});ytplayer',
+ r';ytplayer\.config\s*=\s*({.+?});',
+ )
+ config = self._search_regex(
+ patterns, webpage, 'ytplayer.config', default=None)
+ if config:
+ return self._parse_json(
+ uppercase_escape(config), video_id, fatal=False)
+
+ def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
- sub_format = self._downloader.params.get('subtitlesformat', 'srt')
- self.to_screen(u'%s: Looking for automatic captions' % video_id)
- mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
- err_msg = u'Couldn\'t find automatic captions for %s' % video_id
- if mobj is None:
+ self.to_screen('%s: Looking for automatic captions' % video_id)
+ player_config = self._get_ytplayer_config(video_id, webpage)
+ err_msg = 'Couldn\'t find automatic captions for %s' % video_id
+ if not player_config:
self._downloader.report_warning(err_msg)
return {}
- player_config = json.loads(mobj.group(1))
try:
- args = player_config[u'args']
- caption_url = args[u'ttsurl']
- timestamp = args[u'timestamp']
- # We get the available subtitles
- list_params = compat_urllib_parse.urlencode({
- 'type': 'list',
- 'tlangs': 1,
- 'asrs': 1,
- })
- list_url = caption_url + '&' + list_params
- caption_list = self._download_xml(list_url, video_id)
- original_lang_node = caption_list.find('track')
- if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
- self._downloader.report_warning(u'Video doesn\'t have automatic captions')
- return {}
- original_lang = original_lang_node.attrib['lang_code']
-
- sub_lang_list = {}
- for lang_node in caption_list.findall('target'):
- sub_lang = lang_node.attrib['lang_code']
- params = compat_urllib_parse.urlencode({
- 'lang': original_lang,
- 'tlang': sub_lang,
- 'fmt': sub_format,
- 'ts': timestamp,
- 'kind': 'asr',
+ args = player_config['args']
+ caption_url = args.get('ttsurl')
+ if caption_url:
+ timestamp = args['timestamp']
+ # We get the available subtitles
+ list_params = compat_urllib_parse_urlencode({
+ 'type': 'list',
+ 'tlangs': 1,
+ 'asrs': 1,
})
- sub_lang_list[sub_lang] = caption_url + '&' + params
- return sub_lang_list
+ list_url = caption_url + '&' + list_params
+ caption_list = self._download_xml(list_url, video_id)
+ original_lang_node = caption_list.find('track')
+ if original_lang_node is None:
+ self._downloader.report_warning('Video doesn\'t have automatic captions')
+ return {}
+ original_lang = original_lang_node.attrib['lang_code']
+ caption_kind = original_lang_node.attrib.get('kind', '')
+
+ sub_lang_list = {}
+ for lang_node in caption_list.findall('target'):
+ sub_lang = lang_node.attrib['lang_code']
+ sub_formats = []
+ for ext in self._SUBTITLE_FORMATS:
+ params = compat_urllib_parse_urlencode({
+ 'lang': original_lang,
+ 'tlang': sub_lang,
+ 'fmt': ext,
+ 'ts': timestamp,
+ 'kind': caption_kind,
+ })
+ sub_formats.append({
+ 'url': caption_url + '&' + params,
+ 'ext': ext,
+ })
+ sub_lang_list[sub_lang] = sub_formats
+ return sub_lang_list
+
+ def make_captions(sub_url, sub_langs):
+ parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
+ caption_qs = compat_parse_qs(parsed_sub_url.query)
+ captions = {}
+ for sub_lang in sub_langs:
+ sub_formats = []
+ for ext in self._SUBTITLE_FORMATS:
+ caption_qs.update({
+ 'tlang': [sub_lang],
+ 'fmt': [ext],
+ })
+ sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
+ query=compat_urllib_parse_urlencode(caption_qs, True)))
+ sub_formats.append({
+ 'url': sub_url,
+ 'ext': ext,
+ })
+ captions[sub_lang] = sub_formats
+ return captions
+
+ # New captions format as of 22.06.2017
+ player_response = args.get('player_response')
+ if player_response and isinstance(player_response, compat_str):
+ player_response = self._parse_json(
+ player_response, video_id, fatal=False)
+ if player_response:
+ renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+ base_url = renderer['captionTracks'][0]['baseUrl']
+ sub_lang_list = []
+ for lang in renderer['translationLanguages']:
+ lang_code = lang.get('languageCode')
+ if lang_code:
+ sub_lang_list.append(lang_code)
+ return make_captions(base_url, sub_lang_list)
+
+ # Some videos don't provide ttsurl but rather caption_tracks and
+ # caption_translation_languages (e.g. 20LmZk1hakA)
+ # Does not used anymore as of 22.06.2017
+ caption_tracks = args['caption_tracks']
+ caption_translation_languages = args['caption_translation_languages']
+ caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
+ sub_lang_list = []
+ for lang in caption_translation_languages.split(','):
+ lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
+ sub_lang = lang_qs.get('lc', [None])[0]
+ if sub_lang:
+ sub_lang_list.append(sub_lang)
+ return make_captions(caption_url, sub_lang_list)
# An extractor error can be raise by the download process if there are
# no automatic captions but there are subtitles
- except (KeyError, ExtractorError):
+ except (KeyError, IndexError, ExtractorError):
self._downloader.report_warning(err_msg)
return {}
- def _print_formats(self, formats):
- print('Available formats:')
- for x in formats:
- print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
- self._video_dimensions.get(x, '???'),
- ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
+ def _mark_watched(self, video_id, video_info):
+ playback_url = video_info.get('videostats_playback_base_url', [None])[0]
+ if not playback_url:
+ return
+ parsed_playback_url = compat_urlparse.urlparse(playback_url)
+ qs = compat_urlparse.parse_qs(parsed_playback_url.query)
+
+ # cpn generation algorithm is reverse engineered from base.js.
+ # In fact it works even with dummy cpn.
+ CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
+ cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
+
+ qs.update({
+ 'ver': ['2'],
+ 'cpn': [cpn],
+ })
+ playback_url = compat_urlparse.urlunparse(
+ parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
+
+ self._download_webpage(
+ playback_url, video_id, 'Marking watched',
+ 'Unable to mark watched', fatal=False)
+
+ @staticmethod
+ def _extract_urls(webpage):
+ # Embedded YouTube player
+ entries = [
+ unescapeHTML(mobj.group('url'))
+ for mobj in re.finditer(r'''(?x)
+ (?:
+ <iframe[^>]+?src=|
+ data-video-url=|
+ <embed[^>]+?src=|
+ embedSWF\(?:\s*|
+ <object[^>]+data=|
+ new\s+SWFObject\(
+ )
+ (["\'])
+ (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
+ (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
+ \1''', webpage)]
+
+ # lazyYT YouTube embed
+ entries.extend(list(map(
+ unescapeHTML,
+ re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
+
+ # Wordpress "YouTube Video Importer" plugin
+ matches = re.findall(r'''(?x)<div[^>]+
+ class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
+ data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
+ entries.extend(m[-1] for m in matches)
+
+ return entries
+
+ @staticmethod
+ def _extract_url(webpage):
+ urls = YoutubeIE._extract_urls(webpage)
+ return urls[0] if urls else None
- def _extract_id(self, url):
- mobj = re.match(self._VALID_URL, url, re.VERBOSE)
+ @classmethod
+ def extract_id(cls, url):
+ mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
video_id = mobj.group(2)
return video_id
- def _get_video_url_list(self, url_map):
- """
- Transform a dictionary in the format {itag:url} to a list of (itag, url)
- with the requested formats.
- """
- req_format = self._downloader.params.get('format', None)
- format_limit = self._downloader.params.get('format_limit', None)
- available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
- if format_limit is not None and format_limit in available_formats:
- format_list = available_formats[available_formats.index(format_limit):]
- else:
- format_list = available_formats
- existing_formats = [x for x in format_list if x in url_map]
- if len(existing_formats) == 0:
- raise ExtractorError(u'no known formats available for video')
- if self._downloader.params.get('listformats', None):
- self._print_formats(existing_formats)
- return
- if req_format is None or req_format == 'best':
- video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
- elif req_format == 'worst':
- video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
- elif req_format in ('-1', 'all'):
- video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
- else:
- # Specific formats. We pick the first in a slash-delimeted sequence.
- # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
- # available in the specified format. For example,
- # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
- # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
- # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
- req_formats = req_format.split('/')
- video_url_list = None
- for rf in req_formats:
- if rf in url_map:
- video_url_list = [(rf, url_map[rf])]
- break
- if rf in self._video_formats_map:
- for srf in self._video_formats_map[rf]:
- if srf in url_map:
- video_url_list = [(srf, url_map[srf])]
- break
- else:
- continue
- break
- if video_url_list is None:
- raise ExtractorError(u'requested format not available')
- return video_url_list
-
- def _extract_from_m3u8(self, manifest_url, video_id):
- url_map = {}
- def _get_urls(_manifest):
- lines = _manifest.split('\n')
- urls = filter(lambda l: l and not l.startswith('#'),
- lines)
- return urls
- manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
- formats_urls = _get_urls(manifest)
- for format_url in formats_urls:
- itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
- url_map[itag] = format_url
- return url_map
-
def _extract_annotations(self, video_id):
url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
- return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
+ return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
+
+ @staticmethod
+ def _extract_chapters(description, duration):
+ if not description:
+ return None
+ chapter_lines = re.findall(
+ r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
+ description)
+ if not chapter_lines:
+ return None
+ chapters = []
+ for next_num, (chapter_line, time_point) in enumerate(
+ chapter_lines, start=1):
+ start_time = parse_duration(time_point)
+ if start_time is None:
+ continue
+ if start_time > duration:
+ break
+ end_time = (duration if next_num == len(chapter_lines)
+ else parse_duration(chapter_lines[next_num][1]))
+ if end_time is None:
+ continue
+ if end_time > duration:
+ end_time = duration
+ if start_time > end_time:
+ break
+ chapter_title = re.sub(
+ r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
+ chapter_title = re.sub(r'\s+', ' ', chapter_title)
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ 'title': chapter_title,
+ })
+ return chapters
def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+
+ proto = (
+ 'http' if self._downloader.params.get('prefer_insecure', False)
+ else 'https')
+
+ start_time = None
+ end_time = None
+ parsed_url = compat_urllib_parse_urlparse(url)
+ for component in [parsed_url.fragment, parsed_url.query]:
+ query = compat_parse_qs(component)
+ if start_time is None and 't' in query:
+ start_time = parse_duration(query['t'][0])
+ if start_time is None and 'start' in query:
+ start_time = parse_duration(query['start'][0])
+ if end_time is None and 'end' in query:
+ end_time = parse_duration(query['end'][0])
+
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
if mobj:
- url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
- video_id = self._extract_id(url)
+ url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
+ video_id = self.extract_id(url)
# Get video webpage
- self.report_video_webpage_download(video_id)
- url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
- request = compat_urllib_request.Request(url)
- try:
- video_webpage_bytes = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
-
- video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
+ url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
+ video_webpage = self._download_webpage(url, video_id)
# Attempt to extract SWF player URL
mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
else:
player_url = None
+ dash_mpds = []
+
+ def add_dash_mpd(video_info):
+ dash_mpd = video_info.get('dashmpd')
+ if dash_mpd and dash_mpd[0] not in dash_mpds:
+ dash_mpds.append(dash_mpd[0])
+
+ is_live = None
+ view_count = None
+
+ def extract_view_count(v_info):
+ return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
+
# Get video info
- self.report_video_info_webpage_download(video_id)
+ embed_webpage = None
if re.search(r'player-age-gate-content">', video_webpage) is not None:
- self.report_age_confirmation()
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
- data = compat_urllib_parse.urlencode({'video_id': video_id,
- 'el': 'player_embedded',
- 'gl': 'US',
- 'hl': 'en',
- 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
- 'asv': 3,
- 'sts':'1588',
- })
- video_info_url = 'https://www.youtube.com/get_video_info?' + data
- video_info_webpage = self._download_webpage(video_info_url, video_id,
- note=False,
- errnote='unable to download video info webpage')
+ url = proto + '://www.youtube.com/embed/%s' % video_id
+ embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
+ data = compat_urllib_parse_urlencode({
+ 'video_id': video_id,
+ 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+ 'sts': self._search_regex(
+ r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
+ })
+ video_info_url = proto + '://www.youtube.com/get_video_info?' + data
+ video_info_webpage = self._download_webpage(
+ video_info_url, video_id,
+ note='Refetching age-gated info webpage',
+ errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
+ add_dash_mpd(video_info)
else:
age_gate = False
- for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
- video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
- % (video_id, el_type))
- video_info_webpage = self._download_webpage(video_info_url, video_id,
- note=False,
- errnote='unable to download video info webpage')
- video_info = compat_parse_qs(video_info_webpage)
- if 'token' in video_info:
- break
+ video_info = None
+ sts = None
+ # Try looking directly into the video webpage
+ ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
+ if ytplayer_config:
+ args = ytplayer_config['args']
+ if args.get('url_encoded_fmt_stream_map'):
+ # Convert to the same format returned by compat_parse_qs
+ video_info = dict((k, [v]) for k, v in args.items())
+ add_dash_mpd(video_info)
+ # Rental video is not rented but preview is available (e.g.
+ # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
+ # https://github.com/rg3/youtube-dl/issues/10532)
+ if not video_info and args.get('ypc_vid'):
+ return self.url_result(
+ args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
+ if args.get('livestream') == '1' or args.get('live_playback') == 1:
+ is_live = True
+ sts = ytplayer_config.get('sts')
+ if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+ # We also try looking in get_video_info since it may contain different dashmpd
+ # URL that points to a DASH manifest with possibly different itag set (some itags
+ # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
+ # manifest pointed by get_video_info's dashmpd).
+ # The general idea is to take a union of itags of both DASH manifests (for example
+ # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
+ self.report_video_info_webpage_download(video_id)
+ for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
+ query = {
+ 'video_id': video_id,
+ 'ps': 'default',
+ 'eurl': '',
+ 'gl': 'US',
+ 'hl': 'en',
+ }
+ if el:
+ query['el'] = el
+ if sts:
+ query['sts'] = sts
+ video_info_webpage = self._download_webpage(
+ '%s://www.youtube.com/get_video_info' % proto,
+ video_id, note=False,
+ errnote='unable to download video info webpage',
+ fatal=False, query=query)
+ if not video_info_webpage:
+ continue
+ get_video_info = compat_parse_qs(video_info_webpage)
+ add_dash_mpd(get_video_info)
+ if view_count is None:
+ view_count = extract_view_count(get_video_info)
+ if not video_info:
+ video_info = get_video_info
+ if 'token' in get_video_info:
+ # Different get_video_info requests may report different results, e.g.
+ # some may report video unavailability, but some may serve it without
+ # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
+ # the original webpage as well as el=info and el=embedded get_video_info
+ # requests report video unavailability due to geo restriction while
+ # el=detailpage succeeds and returns valid data). This is probably
+ # due to YouTube measures against IP ranges of hosting providers.
+ # Working around by preferring the first succeeded video_info containing
+ # the token if no such video_info yet was found.
+ if 'token' not in video_info:
+ video_info = get_video_info
+ break
+
+ def extract_unavailable_message():
+ return self._html_search_regex(
+ r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
+ video_webpage, 'unavailable message', default=None)
+
if 'token' not in video_info:
if 'reason' in video_info:
- raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
+ if 'The uploader has not made this video available in your country.' in video_info['reason']:
+ regions_allowed = self._html_search_meta(
+ 'regionsAllowed', video_webpage, default=None)
+ countries = regions_allowed.split(',') if regions_allowed else None
+ self.raise_geo_restricted(
+ msg=video_info['reason'][0], countries=countries)
+ reason = video_info['reason'][0]
+ if 'Invalid parameters' in reason:
+ unavailable_message = extract_unavailable_message()
+ if unavailable_message:
+ reason = unavailable_message
+ raise ExtractorError(
+ 'YouTube said: %s' % reason,
+ expected=True, video_id=video_id)
else:
- raise ExtractorError(u'"token" parameter not in video info for unknown reason')
+ raise ExtractorError(
+ '"token" parameter not in video info for unknown reason',
+ video_id=video_id)
- if 'view_count' in video_info:
- view_count = int(video_info['view_count'][0])
+ # title
+ if 'title' in video_info:
+ video_title = video_info['title'][0]
else:
- view_count = None
+ self._downloader.report_warning('Unable to extract video title')
+ video_title = '_'
+
+ # description
+ description_original = video_description = get_element_by_id("eow-description", video_webpage)
+ if video_description:
+
+ def replace_url(m):
+ redir_url = compat_urlparse.urljoin(url, m.group(1))
+ parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
+ if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
+ qs = compat_parse_qs(parsed_redir_url.query)
+ q = qs.get('q')
+ if q and q[0]:
+ return q[0]
+ return redir_url
+
+ description_original = video_description = re.sub(r'''(?x)
+ <a\s+
+ (?:[a-zA-Z-]+="[^"]*"\s+)*?
+ (?:title|href)="([^"]+)"\s+
+ (?:[a-zA-Z-]+="[^"]*"\s+)*?
+ class="[^"]*"[^>]*>
+ [^<]+\.{3}\s*
+ </a>
+ ''', replace_url, video_description)
+ video_description = clean_html(video_description)
+ else:
+ fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
+ if fd_mobj:
+ video_description = unescapeHTML(fd_mobj.group(1))
+ else:
+ video_description = ''
+
+ if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
+ if not self._downloader.params.get('noplaylist'):
+ entries = []
+ feed_ids = []
+ multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
+ for feed in multifeed_metadata_list.split(','):
+ # Unquote should take place before split on comma (,) since textual
+ # fields may contain comma as well (see
+ # https://github.com/rg3/youtube-dl/issues/8536)
+ feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
+ entries.append({
+ '_type': 'url_transparent',
+ 'ie_key': 'Youtube',
+ 'url': smuggle_url(
+ '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
+ {'force_singlefeed': True}),
+ 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
+ })
+ feed_ids.append(feed_data['id'][0])
+ self.to_screen(
+ 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
+ % (', '.join(feed_ids), video_id))
+ return self.playlist_result(entries, video_id, video_title, video_description)
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
+ if view_count is None:
+ view_count = extract_view_count(video_info)
# Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
- raise ExtractorError(u'"rental" videos not supported')
+ raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
# Start extracting information
self.report_information_extraction(video_id)
# uploader
if 'author' not in video_info:
- raise ExtractorError(u'Unable to extract uploader name')
- video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
+ raise ExtractorError('Unable to extract uploader name')
+ video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
# uploader_id
video_uploader_id = None
- mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
+ video_uploader_url = None
+ mobj = re.search(
+ r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
+ video_webpage)
if mobj is not None:
- video_uploader_id = mobj.group(1)
- else:
- self._downloader.report_warning(u'unable to extract uploader nickname')
-
- # title
- if 'title' in video_info:
- video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
+ video_uploader_id = mobj.group('uploader_id')
+ video_uploader_url = mobj.group('uploader_url')
else:
- self._downloader.report_warning(u'Unable to extract video title')
- video_title = u'_'
+ self._downloader.report_warning('unable to extract uploader nickname')
# thumbnail image
# We try first to get a high quality image:
if m_thumb is not None:
video_thumbnail = m_thumb.group(1)
elif 'thumbnail_url' not in video_info:
- self._downloader.report_warning(u'unable to extract video thumbnail')
+ self._downloader.report_warning('unable to extract video thumbnail')
video_thumbnail = None
else: # don't panic if we can't find it
- video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
+ video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
# upload date
- upload_date = None
- mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
- if mobj is not None:
- upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
- upload_date = unified_strdate(upload_date)
-
- # description
- video_description = get_element_by_id("eow-description", video_webpage)
- if video_description:
- video_description = re.sub(r'''(?x)
- <a\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
- title="([^"]+)"\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
- class="yt-uix-redirect-link"\s*>
- [^<]+
- </a>
- ''', r'\1', video_description)
- video_description = clean_html(video_description)
+ upload_date = self._html_search_meta(
+ 'datePublished', video_webpage, 'upload date', default=None)
+ if not upload_date:
+ upload_date = self._search_regex(
+ [r'(?s)id="eow-date.*?>(.*?)</span>',
+ r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
+ video_webpage, 'upload date', default=None)
+ upload_date = unified_strdate(upload_date)
+
+ video_license = self._html_search_regex(
+ r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
+ video_webpage, 'license', default=None)
+
+ m_music = re.search(
+ r'''(?x)
+ <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
+ <ul[^>]*>\s*
+ <li>(?P<title>.+?)
+ by (?P<creator>.+?)
+ (?:
+ \(.+?\)|
+ <a[^>]*
+ (?:
+ \bhref=["\']/red[^>]*>| # drop possible
+ >\s*Listen ad-free with YouTube Red # YouTube Red ad
+ )
+ .*?
+ )?</li
+ ''',
+ video_webpage)
+ if m_music:
+ video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
+ video_creator = clean_html(m_music.group('creator'))
else:
- fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
- if fd_mobj:
- video_description = unescapeHTML(fd_mobj.group(1))
- else:
- video_description = u''
+ video_alt_title = video_creator = None
+
+ m_episode = re.search(
+ r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
+ video_webpage)
+ if m_episode:
+ series = m_episode.group('series')
+ season_number = int(m_episode.group('season'))
+ episode_number = int(m_episode.group('episode'))
+ else:
+ series = season_number = episode_number = None
+
+ m_cat_container = self._search_regex(
+ r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
+ video_webpage, 'categories', default=None)
+ if m_cat_container:
+ category = self._html_search_regex(
+ r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
+ default=None)
+ video_categories = None if category is None else [category]
+ else:
+ video_categories = None
+
+ video_tags = [
+ unescapeHTML(m.group('content'))
+ for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
+
+ def _extract_count(count_name):
+ return str_to_int(self._search_regex(
+ r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
+ % re.escape(count_name),
+ video_webpage, count_name, default=None))
+
+ like_count = _extract_count('like')
+ dislike_count = _extract_count('dislike')
# subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage)
+ automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, video_webpage)
- return
-
- if 'length_seconds' not in video_info:
- self._downloader.report_warning(u'unable to extract video duration')
- video_duration = ''
- else:
- video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
+ video_duration = try_get(
+ video_info, lambda x: int_or_none(x['length_seconds'][0]))
+ if not video_duration:
+ video_duration = parse_duration(self._html_search_meta(
+ 'duration', video_webpage, 'video duration'))
# annotations
video_annotations = None
if self._downloader.params.get('writeannotations', False):
- video_annotations = self._extract_annotations(video_id)
+ video_annotations = self._extract_annotations(video_id)
- # Decide which formats to download
-
- try:
- mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
- if not mobj:
- raise ValueError('Could not find vevo ID')
- info = json.loads(mobj.group(1))
- args = info['args']
- # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
- # this signatures are encrypted
- if 'url_encoded_fmt_stream_map' not in args:
- raise ValueError(u'No stream_map present') # caught below
- re_signature = re.compile(r'[&,]s=')
- m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
- if m_s is not None:
- self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
- video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
- m_s = re_signature.search(args.get('adaptive_fmts', u''))
- if m_s is not None:
- if 'adaptive_fmts' in video_info:
- video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
- else:
- video_info['adaptive_fmts'] = [args['adaptive_fmts']]
- except ValueError:
- pass
+ chapters = self._extract_chapters(description_original, video_duration)
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download()
- video_url_list = [(None, video_info['conn'][0])]
- elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
- encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
+ formats = [{
+ 'format_id': '_rtmp',
+ 'protocol': 'rtmp',
+ 'url': video_info['conn'][0],
+ 'player_url': player_url,
+ }]
+ elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
+ encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
- url_map = {}
+ formats_spec = {}
+ fmt_list = video_info.get('fmt_list', [''])[0]
+ if fmt_list:
+ for fmt in fmt_list.split(','):
+ spec = fmt.split('/')
+ if len(spec) > 1:
+ width_height = spec[1].split('x')
+ if len(width_height) == 2:
+ formats_spec[spec[0]] = {
+ 'resolution': spec[1],
+ 'width': int_or_none(width_height[0]),
+ 'height': int_or_none(width_height[1]),
+ }
+ formats = []
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
- if 'itag' in url_data and 'url' in url_data:
- url = url_data['url'][0]
- if 'sig' in url_data:
- url += '&signature=' + url_data['sig'][0]
- elif 's' in url_data:
- encrypted_sig = url_data['s'][0]
- if self._downloader.params.get('verbose'):
- if age_gate:
- if player_url is None:
- player_version = 'unknown'
- else:
- player_version = self._search_regex(
- r'-(.+)\.swf$', player_url,
- u'flash player', fatal=False)
+ if 'itag' not in url_data or 'url' not in url_data:
+ continue
+ format_id = url_data['itag'][0]
+ url = url_data['url'][0]
+
+ if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
+ ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
+ jsplayer_url_json = self._search_regex(
+ ASSETS_RE,
+ embed_webpage if age_gate else video_webpage,
+ 'JS player URL (1)', default=None)
+ if not jsplayer_url_json and not age_gate:
+ # We need the embed website after all
+ if embed_webpage is None:
+ embed_url = proto + '://www.youtube.com/embed/%s' % video_id
+ embed_webpage = self._download_webpage(
+ embed_url, video_id, 'Downloading embed webpage')
+ jsplayer_url_json = self._search_regex(
+ ASSETS_RE, embed_webpage, 'JS player URL')
+
+ player_url = json.loads(jsplayer_url_json)
+ if player_url is None:
+ player_url_json = self._search_regex(
+ r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
+ video_webpage, 'age gate player URL')
+ player_url = json.loads(player_url_json)
+
+ if 'sig' in url_data:
+ url += '&signature=' + url_data['sig'][0]
+ elif 's' in url_data:
+ encrypted_sig = url_data['s'][0]
+
+ if self._downloader.params.get('verbose'):
+ if player_url is None:
+ player_version = 'unknown'
+ player_desc = 'unknown'
+ else:
+ if player_url.endswith('swf'):
+ player_version = self._search_regex(
+ r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
+ 'flash player', fatal=False)
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
- r'html5player-(.+?)\.js', video_webpage,
+ [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
+ r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
+ player_url,
'html5 player', fatal=False)
- player_desc = u'html5 player %s' % player_version
-
- parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
- self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
- (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
-
- if not age_gate:
- jsplayer_url_json = self._search_regex(
- r'"assets":.+?"js":\s*("[^"]+")',
- video_webpage, u'JS player URL')
- player_url = json.loads(jsplayer_url_json)
-
- signature = self._decrypt_signature(
- encrypted_sig, video_id, player_url, age_gate)
- url += '&signature=' + signature
- if 'ratebypass' not in url:
- url += '&ratebypass=yes'
- url_map[url_data['itag'][0]] = url
- video_url_list = self._get_video_url_list(url_map)
- if not video_url_list:
- return
+ player_desc = 'html5 player %s' % player_version
+
+ parts_sizes = self._signature_cache_id(encrypted_sig)
+ self.to_screen('{%s} signature length %s, %s' %
+ (format_id, parts_sizes, player_desc))
+
+ signature = self._decrypt_signature(
+ encrypted_sig, video_id, player_url, age_gate)
+ url += '&signature=' + signature
+ if 'ratebypass' not in url:
+ url += '&ratebypass=yes'
+
+ dct = {
+ 'format_id': format_id,
+ 'url': url,
+ 'player_url': player_url,
+ }
+ if format_id in self._formats:
+ dct.update(self._formats[format_id])
+ if format_id in formats_spec:
+ dct.update(formats_spec[format_id])
+
+ # Some itags are not included in DASH manifest thus corresponding formats will
+ # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
+ # Trying to extract metadata from url_encoded_fmt_stream_map entry.
+ mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
+ width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
+
+ more_fields = {
+ 'filesize': int_or_none(url_data.get('clen', [None])[0]),
+ 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
+ 'width': width,
+ 'height': height,
+ 'fps': int_or_none(url_data.get('fps', [None])[0]),
+ 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
+ }
+ for key, value in more_fields.items():
+ if value:
+ dct[key] = value
+ type_ = url_data.get('type', [None])[0]
+ if type_:
+ type_split = type_.split(';')
+ kind_ext = type_split[0].split('/')
+ if len(kind_ext) == 2:
+ kind, _ = kind_ext
+ dct['ext'] = mimetype2ext(type_split[0])
+ if kind in ('audio', 'video'):
+ codecs = None
+ for mobj in re.finditer(
+ r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
+ if mobj.group('key') == 'codecs':
+ codecs = mobj.group('val')
+ break
+ if codecs:
+ dct.update(parse_codecs(codecs))
+ if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
+ dct['downloader_options'] = {
+ # Youtube throttles chunks >~10M
+ 'http_chunk_size': 10485760,
+ }
+ formats.append(dct)
elif video_info.get('hlsvp'):
manifest_url = video_info['hlsvp'][0]
- url_map = self._extract_from_m3u8(manifest_url, video_id)
- video_url_list = self._get_video_url_list(url_map)
- if not video_url_list:
- return
-
+ formats = []
+ m3u8_formats = self._extract_m3u8_formats(
+ manifest_url, video_id, 'mp4', fatal=False)
+ for a_format in m3u8_formats:
+ itag = self._search_regex(
+ r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
+ if itag:
+ a_format['format_id'] = itag
+ if itag in self._formats:
+ dct = self._formats[itag].copy()
+ dct.update(a_format)
+ a_format = dct
+ a_format['player_url'] = player_url
+ # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
+ a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
+ formats.append(a_format)
else:
- raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
-
- results = []
- for itag, video_real_url in video_url_list:
- # Extension
- video_extension = self._video_extensions.get(itag, 'flv')
-
- video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
- self._video_dimensions.get(itag, '???'),
- ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
-
- results.append({
- 'id': video_id,
- 'url': video_real_url,
- 'uploader': video_uploader,
- 'uploader_id': video_uploader_id,
- 'upload_date': upload_date,
- 'title': video_title,
- 'ext': video_extension,
- 'format': video_format,
- 'format_id': itag,
- 'thumbnail': video_thumbnail,
- 'description': video_description,
- 'player_url': player_url,
- 'subtitles': video_subtitles,
- 'duration': video_duration,
- 'age_limit': 18 if age_gate else 0,
- 'annotations': video_annotations,
- 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
- 'view_count': view_count,
- })
- return results
+ unavailable_message = extract_unavailable_message()
+ if unavailable_message:
+ raise ExtractorError(unavailable_message, expected=True)
+ raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
+
+ # Look for the DASH manifest
+ if self._downloader.params.get('youtube_include_dash_manifest', True):
+ dash_mpd_fatal = True
+ for mpd_url in dash_mpds:
+ dash_formats = {}
+ try:
+ def decrypt_sig(mobj):
+ s = mobj.group(1)
+ dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
+ return '/signature/%s' % dec_s
+
+ mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
+
+ for df in self._extract_mpd_formats(
+ mpd_url, video_id, fatal=dash_mpd_fatal,
+ formats_dict=self._formats):
+ # Do not overwrite DASH format found in some previous DASH manifest
+ if df['format_id'] not in dash_formats:
+ dash_formats[df['format_id']] = df
+ # Additional DASH manifests may end up in HTTP Error 403 therefore
+ # allow them to fail without bug report message if we already have
+ # some DASH manifest succeeded. This is temporary workaround to reduce
+ # burst of bug reports until we figure out the reason and whether it
+ # can be fixed at all.
+ dash_mpd_fatal = False
+ except (ExtractorError, KeyError) as e:
+ self.report_warning(
+ 'Skipping DASH manifest: %r' % e, video_id)
+ if dash_formats:
+ # Remove the formats we found through non-DASH, they
+ # contain less info and it can be wrong, because we use
+ # fixed values (for example the resolution). See
+ # https://github.com/rg3/youtube-dl/issues/5774 for an
+ # example.
+ formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
+ formats.extend(dash_formats.values())
+
+ # Check for malformed aspect ratio
+ stretched_m = re.search(
+ r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
+ video_webpage)
+ if stretched_m:
+ w = float(stretched_m.group('w'))
+ h = float(stretched_m.group('h'))
+ # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
+ # We will only process correct ratios.
+ if w > 0 and h > 0:
+ ratio = w / h
+ for f in formats:
+ if f.get('vcodec') != 'none':
+ f['stretched_ratio'] = ratio
+
+ self._sort_formats(formats)
+
+ self.mark_watched(video_id, video_info)
+
+ return {
+ 'id': video_id,
+ 'uploader': video_uploader,
+ 'uploader_id': video_uploader_id,
+ 'uploader_url': video_uploader_url,
+ 'upload_date': upload_date,
+ 'license': video_license,
+ 'creator': video_creator,
+ 'title': video_title,
+ 'alt_title': video_alt_title,
+ 'thumbnail': video_thumbnail,
+ 'description': video_description,
+ 'categories': video_categories,
+ 'tags': video_tags,
+ 'subtitles': video_subtitles,
+ 'automatic_captions': automatic_captions,
+ 'duration': video_duration,
+ 'age_limit': 18 if age_gate else 0,
+ 'annotations': video_annotations,
+ 'chapters': chapters,
+ 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
+ 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
+ 'formats': formats,
+ 'is_live': is_live,
+ 'start_time': start_time,
+ 'end_time': end_time,
+ 'series': series,
+ 'season_number': season_number,
+ 'episode_number': episode_number,
+ }
+
-class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
- IE_DESC = u'YouTube.com playlists'
- _VALID_URL = r"""(?:
+class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
+ IE_DESC = 'YouTube.com playlists'
+ _VALID_URL = r"""(?x)(?:
(?:https?://)?
(?:\w+\.)?
- youtube\.com/
(?:
- (?:course|view_play_list|my_playlists|artist|playlist|watch)
- \? (?:.*?&)*? (?:p|a|list)=
- | p/
+ youtube\.com/
+ (?:
+ (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
+ \? (?:.*?[&;])*? (?:p|a|list)=
+ | p/
+ )|
+ youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
+ )
+ (
+ (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,}
+ # Top tracks, they can also include dots
+ |(?:MC)[\w\.]*
)
- ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
.*
|
- ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
- )"""
- _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
- _MORE_PAGES_INDICATOR = r'data-link-type="next"'
- _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
- IE_NAME = u'youtube:playlist'
-
- @classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
- return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+ (%(playlist_id)s)
+ )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
+ _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
+ _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
+ IE_NAME = 'youtube:playlist'
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
+ 'info_dict': {
+ 'title': 'ytdl test PL',
+ 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
+ },
+ 'playlist_count': 3,
+ }, {
+ 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
+ 'info_dict': {
+ 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
+ 'title': 'YDL_Empty_List',
+ },
+ 'playlist_count': 0,
+ 'skip': 'This playlist is private',
+ }, {
+ 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
+ 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+ 'info_dict': {
+ 'title': '29C3: Not my department',
+ 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+ },
+ 'playlist_count': 95,
+ }, {
+ 'note': 'issue #673',
+ 'url': 'PLBB231211A4F62143',
+ 'info_dict': {
+ 'title': '[OLD]Team Fortress 2 (Class-based LP)',
+ 'id': 'PLBB231211A4F62143',
+ },
+ 'playlist_mincount': 26,
+ }, {
+ 'note': 'Large playlist',
+ 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
+ 'info_dict': {
+ 'title': 'Uploads from Cauchemar',
+ 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
+ },
+ 'playlist_mincount': 799,
+ }, {
+ 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+ 'info_dict': {
+ 'title': 'YDL_safe_search',
+ 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+ },
+ 'playlist_count': 2,
+ 'skip': 'This playlist is private',
+ }, {
+ 'note': 'embedded',
+ 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+ 'playlist_count': 4,
+ 'info_dict': {
+ 'title': 'JODA15',
+ 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+ }
+ }, {
+ 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
+ 'playlist_mincount': 485,
+ 'info_dict': {
+ 'title': '2017 華語最新單曲 (2/24更新)',
+ 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
+ }
+ }, {
+ 'note': 'Embedded SWF player',
+ 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
+ 'playlist_count': 4,
+ 'info_dict': {
+ 'title': 'JODA7',
+ 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
+ }
+ }, {
+ 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
+ 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
+ 'info_dict': {
+ 'title': 'Uploads from Interstellar Movie',
+ 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
+ },
+ 'playlist_mincount': 21,
+ }, {
+ # Playlist URL that does not actually serve a playlist
+ 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
+ 'info_dict': {
+ 'id': 'FqZTN594JQw',
+ 'ext': 'webm',
+ 'title': "Smiley's People 01 detective, Adventure Series, Action",
+ 'uploader': 'STREEM',
+ 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
+ 'upload_date': '20150526',
+ 'license': 'Standard YouTube License',
+ 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
+ 'categories': ['People & Blogs'],
+ 'tags': list,
+ 'like_count': int,
+ 'dislike_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [YoutubeIE.ie_key()],
+ }, {
+ 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
+ 'info_dict': {
+ 'id': 'yeWKywCrFtk',
+ 'ext': 'mp4',
+ 'title': 'Small Scale Baler and Braiding Rugs',
+ 'uploader': 'Backus-Page House Museum',
+ 'uploader_id': 'backuspagemuseum',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
+ 'upload_date': '20161008',
+ 'license': 'Standard YouTube License',
+ 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
+ 'categories': ['Nonprofits & Activism'],
+ 'tags': list,
+ 'like_count': int,
+ 'dislike_count': int,
+ },
+ 'params': {
+ 'noplaylist': True,
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
+ 'only_matching': True,
+ }, {
+ 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
+ 'only_matching': True,
+ }]
def _real_initialize(self):
self._login()
- def _ids_to_results(self, ids):
- return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
- for vid_id in ids]
-
def _extract_mix(self, playlist_id):
- # The mixes are generated from a a single video
+ # The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
- url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id)
- webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
- title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
- get_element_by_attribute('class', 'title ', webpage))
- title = clean_html(title_span)
- video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)
- ids = orderedSet(re.findall(video_re, webpage))
+ ids = []
+ last_id = playlist_id[-11:]
+ for n in itertools.count(1):
+ url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
+ webpage = self._download_webpage(
+ url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
+ new_ids = orderedSet(re.findall(
+ r'''(?xs)data-video-username=".*?".*?
+ href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
+ webpage))
+ # Fetch new pages until all the videos are repeated, it seems that
+ # there are always 51 unique videos.
+ new_ids = [_id for _id in new_ids if _id not in ids]
+ if not new_ids:
+ break
+ ids.extend(new_ids)
+ last_id = ids[-1]
+
url_results = self._ids_to_results(ids)
+ search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
+ title_span = (
+ search_title('playlist-title') or
+ search_title('title long-title') or
+ search_title('title'))
+ title = clean_html(title_span)
+
return self.playlist_result(url_results, playlist_id, title)
- def _real_extract(self, url):
- # Extract playlist id
- mobj = re.match(self._VALID_URL, url, re.VERBOSE)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
- playlist_id = mobj.group(1) or mobj.group(2)
+ def _extract_playlist(self, playlist_id):
+ url = self._TEMPLATE_URL % playlist_id
+ page = self._download_webpage(url, playlist_id)
+
+ # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604)
+ for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
+ match = match.strip()
+ # Check if the playlist exists or is private
+ mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
+ if mobj:
+ reason = mobj.group('reason')
+ message = 'This playlist %s' % reason
+ if 'private' in reason:
+ message += ', use --username or --netrc to access it'
+ message += '.'
+ raise ExtractorError(message, expected=True)
+ elif re.match(r'[^<]*Invalid parameters[^<]*', match):
+ raise ExtractorError(
+ 'Invalid parameters. Maybe URL is incorrect.',
+ expected=True)
+ elif re.match(r'[^<]*Choose your language[^<]*', match):
+ continue
+ else:
+ self.report_warning('Youtube gives an alert message: ' + match)
+
+ playlist_title = self._html_search_regex(
+ r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
+ page, 'title', default=None)
+
+ _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
+ uploader = self._search_regex(
+ r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
+ page, 'uploader', default=None)
+ mobj = re.search(
+ r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
+ page)
+ if mobj:
+ uploader_id = mobj.group('uploader_id')
+ uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
+ else:
+ uploader_id = uploader_url = None
+
+ has_videos = True
+ if not playlist_title:
+ try:
+ # Some playlist URLs don't actually serve a playlist (e.g.
+ # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
+ next(self._entries(page, playlist_id))
+ except StopIteration:
+ has_videos = False
+
+ playlist = self.playlist_result(
+ self._entries(page, playlist_id), playlist_id, playlist_title)
+ playlist.update({
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'uploader_url': uploader_url,
+ })
+
+ return has_videos, playlist
+
+ def _check_download_just_video(self, url, playlist_id):
# Check if it's a video-specific URL
query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
- if 'v' in query_dict:
- video_id = query_dict['v'][0]
+ video_id = query_dict.get('v', [None])[0] or self._search_regex(
+ r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
+ 'video id', default=None)
+ if video_id:
if self._downloader.params.get('noplaylist'):
- self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
- return self.url_result(video_id, 'Youtube', video_id=video_id)
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+ return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
else:
- self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
-
- if len(playlist_id) == 13: # 'RD' + 11 characters for the video id
- # Mixes require a custom extraction process
- return self._extract_mix(playlist_id)
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+ return video_id, None
+ return None, None
- # Extract the video ids from the playlist pages
- ids = []
-
- for page_num in itertools.count(1):
- url = self._TEMPLATE_URL % (playlist_id, page_num)
- page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
- matches = re.finditer(self._VIDEO_RE, page)
- # We remove the duplicates and the link with index 0
- # (it's not the first video of the playlist)
- new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
- ids.extend(new_ids)
-
- if re.search(self._MORE_PAGES_INDICATOR, page) is None:
- break
+ def _real_extract(self, url):
+ # Extract playlist id
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ raise ExtractorError('Invalid URL: %s' % url)
+ playlist_id = mobj.group(1) or mobj.group(2)
- playlist_title = self._og_search_title(page)
+ video_id, video = self._check_download_just_video(url, playlist_id)
+ if video:
+ return video
- url_results = self._ids_to_results(ids)
- return self.playlist_result(url_results, playlist_id, playlist_title)
+ if playlist_id.startswith(('RD', 'UL', 'PU')):
+ # Mixes require a custom extraction process
+ return self._extract_mix(playlist_id)
+ has_videos, playlist = self._extract_playlist(playlist_id)
+ if has_videos or not video_id:
+ return playlist
+
+ # Some playlist URLs don't actually serve a playlist (see
+ # https://github.com/rg3/youtube-dl/issues/10537).
+ # Fallback to plain video extraction if there is a video id
+ # along with playlist id.
+ return self.url_result(video_id, 'Youtube', video_id=video_id)
+
+
+class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
+ IE_DESC = 'YouTube.com channels'
+ _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
+ _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
+ _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
+ IE_NAME = 'youtube:channel'
+ _TESTS = [{
+ 'note': 'paginated channel',
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'playlist_mincount': 91,
+ 'info_dict': {
+ 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'Uploads from lex will',
+ }
+ }, {
+ 'note': 'Age restricted channel',
+ # from https://www.youtube.com/user/DeusExOfficial
+ 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
+ 'playlist_mincount': 64,
+ 'info_dict': {
+ 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
+ 'title': 'Uploads from Deus Ex',
+ },
+ }]
-class YoutubeChannelIE(InfoExtractor):
- IE_DESC = u'YouTube.com channels'
- _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
- _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
- _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
- IE_NAME = u'youtube:channel'
+ @classmethod
+ def suitable(cls, url):
+ return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
+ else super(YoutubeChannelIE, cls).suitable(url))
- def extract_videos_from_page(self, page):
- ids_in_page = []
- for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
- if mobj.group(1) not in ids_in_page:
- ids_in_page.append(mobj.group(1))
- return ids_in_page
+ def _build_template_url(self, url, channel_id):
+ return self._TEMPLATE_URL % channel_id
def _real_extract(self, url):
- # Extract channel id
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- # Download channel page
- channel_id = mobj.group(1)
- video_ids = []
- url = 'https://www.youtube.com/channel/%s/videos' % channel_id
- channel_page = self._download_webpage(url, channel_id)
- if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
- autogenerated = True
+ channel_id = self._match_id(url)
+
+ url = self._build_template_url(url, channel_id)
+
+ # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+ # Workaround by extracting as a playlist if managed to obtain channel playlist URL
+ # otherwise fallback on channel by page extraction
+ channel_page = self._download_webpage(
+ url + '?view=57', channel_id,
+ 'Downloading channel page', fatal=False)
+ if channel_page is False:
+ channel_playlist_id = False
else:
- autogenerated = False
+ channel_playlist_id = self._html_search_meta(
+ 'channelId', channel_page, 'channel id', default=None)
+ if not channel_playlist_id:
+ channel_url = self._html_search_meta(
+ ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
+ channel_page, 'channel url', default=None)
+ if channel_url:
+ channel_playlist_id = self._search_regex(
+ r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
+ channel_url, 'channel id', default=None)
+ if channel_playlist_id and channel_playlist_id.startswith('UC'):
+ playlist_id = 'UU' + channel_playlist_id[2:]
+ return self.url_result(
+ compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
+ channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
+ autogenerated = re.search(r'''(?x)
+ class="[^"]*?(?:
+ channel-header-autogenerated-label|
+ yt-channel-title-autogenerated
+ )[^"]*"''', channel_page) is not None
if autogenerated:
# The videos are contained in a single page
# the ajax pages can't be used, they are empty
- video_ids = self.extract_videos_from_page(channel_page)
- else:
- # Download all channel pages using the json-based channel_ajax query
- for pagenum in itertools.count(1):
- url = self._MORE_PAGES_URL % (pagenum, channel_id)
- page = self._download_webpage(url, channel_id,
- u'Downloading page #%s' % pagenum)
-
- page = json.loads(page)
-
- ids_in_page = self.extract_videos_from_page(page['content_html'])
- video_ids.extend(ids_in_page)
-
- if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
- break
-
- self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
-
- url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in video_ids]
- return self.playlist_result(url_entries, channel_id)
-
-
-class YoutubeUserIE(InfoExtractor):
- IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
- _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
- _GDATA_PAGE_SIZE = 50
- _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
- IE_NAME = u'youtube:user'
+ entries = [
+ self.url_result(
+ video_id, 'Youtube', video_id=video_id,
+ video_title=video_title)
+ for video_id, video_title in self.extract_videos_from_page(channel_page)]
+ return self.playlist_result(entries, channel_id)
+
+ try:
+ next(self._entries(channel_page, channel_id))
+ except StopIteration:
+ alert_message = self._html_search_regex(
+ r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
+ channel_page, 'alert', default=None, group='alert')
+ if alert_message:
+ raise ExtractorError('Youtube said: %s' % alert_message, expected=True)
+
+ return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
+
+
+class YoutubeUserIE(YoutubeChannelIE):
+ IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
+ _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
+ _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
+ IE_NAME = 'youtube:user'
+
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/user/TheLinuxFoundation',
+ 'playlist_mincount': 320,
+ 'info_dict': {
+ 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
+ 'title': 'Uploads from The Linux Foundation',
+ }
+ }, {
+ # Only available via https://www.youtube.com/c/12minuteathlete/videos
+ # but not https://www.youtube.com/user/12minuteathlete/videos
+ 'url': 'https://www.youtube.com/c/12minuteathlete/videos',
+ 'playlist_mincount': 249,
+ 'info_dict': {
+ 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
+ 'title': 'Uploads from 12 Minute Athlete',
+ }
+ }, {
+ 'url': 'ytuser:phihag',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/c/gametrailers',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/gametrailers',
+ 'only_matching': True,
+ }, {
+ # This channel is not available, geo restricted to JP
+ 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
+ 'only_matching': True,
+ }]
@classmethod
def suitable(cls, url):
# Don't return True if the url can be extracted with other youtube
# extractor, the regex would is too permissive and it would match.
- other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
- if any(ie.suitable(url) for ie in other_ies): return False
- else: return super(YoutubeUserIE, cls).suitable(url)
+ other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
+ if any(ie.suitable(url) for ie in other_yt_ies):
+ return False
+ else:
+ return super(YoutubeUserIE, cls).suitable(url)
- def _real_extract(self, url):
- # Extract username
+ def _build_template_url(self, url, channel_id):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- username = mobj.group(1)
-
- # Download video ids using YouTube Data API. Result size per
- # query is limited (currently to 50 videos) so we need to query
- # page by page until there are no video ids - it means we got
- # all of them.
-
- video_ids = []
-
- for pagenum in itertools.count(0):
- start_index = pagenum * self._GDATA_PAGE_SIZE + 1
-
- gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
- page = self._download_webpage(gdata_url, username,
- u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
-
- try:
- response = json.loads(page)
- except ValueError as err:
- raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
- if 'entry' not in response['feed']:
- # Number of videos is a multiple of self._MAX_RESULTS
- break
-
- # Extract video identifiers
- ids_in_page = []
- for entry in response['feed']['entry']:
- ids_in_page.append(entry['id']['$t'].split('/')[-1])
- video_ids.extend(ids_in_page)
+ return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
+
+
+class YoutubeLiveIE(YoutubeBaseInfoExtractor):
+ IE_DESC = 'YouTube.com live streams'
+ _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
+ IE_NAME = 'youtube:live'
+
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
+ 'info_dict': {
+ 'id': 'a48o2S1cPoo',
+ 'ext': 'mp4',
+ 'title': 'The Young Turks - Live Main Show',
+ 'uploader': 'The Young Turks',
+ 'uploader_id': 'TheYoungTurks',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
+ 'upload_date': '20150715',
+ 'license': 'Standard YouTube License',
+ 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
+ 'categories': ['News & Politics'],
+ 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
+ 'like_count': int,
+ 'dislike_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/TheYoungTurks/live',
+ 'only_matching': True,
+ }]
- # A little optimization - if current page is not
- # "full", ie. does not contain PAGE_SIZE video ids then
- # we can assume that this page is the last one - there
- # are no more ids on further pages - no need to query
- # again.
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ channel_id = mobj.group('id')
+ base_url = mobj.group('base_url')
+ webpage = self._download_webpage(url, channel_id, fatal=False)
+ if webpage:
+ page_type = self._og_search_property(
+ 'type', webpage, 'page type', default='')
+ video_id = self._html_search_meta(
+ 'videoId', webpage, 'video id', default=None)
+ if page_type.startswith('video') and video_id and re.match(
+ r'^[0-9A-Za-z_-]{11}$', video_id):
+ return self.url_result(video_id, YoutubeIE.ie_key())
+ return self.url_result(base_url)
+
+
+class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
+ IE_DESC = 'YouTube.com user/channel playlists'
+ _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
+ IE_NAME = 'youtube:playlists'
+
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
+ 'playlist_mincount': 4,
+ 'info_dict': {
+ 'id': 'ThirstForScience',
+ 'title': 'Thirst for Science',
+ },
+ }, {
+ # with "Load more" button
+ 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
+ 'playlist_mincount': 70,
+ 'info_dict': {
+ 'id': 'igorkle1',
+ 'title': 'Игорь Клейнер',
+ },
+ }, {
+ 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
+ 'playlist_mincount': 17,
+ 'info_dict': {
+ 'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
+ 'title': 'Chem Player',
+ },
+ }]
- if len(ids_in_page) < self._GDATA_PAGE_SIZE:
- break
- url_results = [
- self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in video_ids]
- return self.playlist_result(url_results, playlist_title=username)
+class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
+ _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
-class YoutubeSearchIE(SearchInfoExtractor):
- IE_DESC = u'YouTube.com searches'
- _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
- _MAX_RESULTS = 1000
- IE_NAME = u'youtube:search'
+class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
+ IE_DESC = 'YouTube.com searches'
+ # there doesn't appear to be a real limit, for example if you search for
+ # 'python' you get more than 8.000.000 results
+ _MAX_RESULTS = float('inf')
+ IE_NAME = 'youtube:search'
_SEARCH_KEY = 'ytsearch'
-
- def report_download_page(self, query, pagenum):
- """Report attempt to download search page with given number."""
- self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
+ _EXTRA_QUERY_ARGS = {}
+ _TESTS = []
def _get_n_results(self, query, n):
"""Get a specified number of results for a query"""
- video_ids = []
- pagenum = 0
+ videos = []
limit = n
- while (50 * pagenum) < limit:
- self.report_download_page(query, pagenum+1)
- result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
- request = compat_urllib_request.Request(result_url)
- try:
- data = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
- api_response = json.loads(data)['data']
-
- if not 'items' in api_response:
- raise ExtractorError(u'[youtube] No video results')
-
- new_ids = list(video['id'] for video in api_response['items'])
- video_ids += new_ids
+ url_query = {
+ 'search_query': query.encode('utf-8'),
+ }
+ url_query.update(self._EXTRA_QUERY_ARGS)
+ result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)
+
+ for pagenum in itertools.count(1):
+ data = self._download_json(
+ result_url, video_id='query "%s"' % query,
+ note='Downloading page %s' % pagenum,
+ errnote='Unable to download API page',
+ query={'spf': 'navigate'})
+ html_content = data[1]['body']['content']
+
+ if 'class="search-message' in html_content:
+ raise ExtractorError(
+ '[youtube] No video results', expected=True)
- limit = min(n, api_response['totalItems'])
- pagenum += 1
+ new_videos = list(self._process_page(html_content))
+ videos += new_videos
+ if not new_videos or len(videos) > limit:
+ break
+ next_link = self._html_search_regex(
+ r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
+ html_content, 'next link', default=None)
+ if next_link is None:
+ break
+ result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)
- if len(video_ids) > n:
- video_ids = video_ids[:n]
- videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in video_ids]
+ if len(videos) > n:
+ videos = videos[:n]
return self.playlist_result(videos, query)
+
class YoutubeSearchDateIE(YoutubeSearchIE):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
- _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
_SEARCH_KEY = 'ytsearchdate'
- IE_DESC = u'YouTube.com searches, newest videos first'
-
-class YoutubeShowIE(InfoExtractor):
- IE_DESC = u'YouTube.com (multi-season) shows'
- _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
- IE_NAME = u'youtube:show'
+ IE_DESC = 'YouTube.com searches, newest videos first'
+ _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
+
+
+class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
+ IE_DESC = 'YouTube.com search URLs'
+ IE_NAME = 'youtube:search_url'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'title': 'youtube-dl test video',
+ }
+ }, {
+ 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- show_name = mobj.group(1)
- webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
- # There's one playlist for each season of the show
- m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
- self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
- return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
+ query = compat_urllib_parse_unquote_plus(mobj.group('query'))
+ webpage = self._download_webpage(url, query)
+ return self.playlist_result(self._process_page(webpage), playlist_title=query)
+
+
+class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
+ IE_DESC = 'YouTube.com (multi-season) shows'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
+ IE_NAME = 'youtube:show'
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/show/airdisasters',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'airdisasters',
+ 'title': 'Air Disasters',
+ }
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ return super(YoutubeShowIE, self)._real_extract(
+ 'https://www.youtube.com/show/%s/playlists' % playlist_id)
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
"""
- Base class for extractors that fetch info from
- http://www.youtube.com/feed_ajax
+ Base class for feed extractors
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
"""
_LOGIN_REQUIRED = True
- # use action_load_personal_feed instead of action_load_system_feed
- _PERSONAL_FEED = False
-
- @property
- def _FEED_TEMPLATE(self):
- action = 'action_load_system_feed'
- if self._PERSONAL_FEED:
- action = 'action_load_personal_feed'
- return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
@property
def IE_NAME(self):
- return u'youtube:%s' % self._FEED_NAME
+ return 'youtube:%s' % self._FEED_NAME
def _real_initialize(self):
self._login()
- def _real_extract(self, url):
- feed_entries = []
- paging = 0
- for i in itertools.count(1):
- info = self._download_webpage(self._FEED_TEMPLATE % paging,
- u'%s feed' % self._FEED_NAME,
- u'Downloading page %s' % i)
- info = json.loads(info)
- feed_html = info['feed_html']
- m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
- ids = orderedSet(m.group(1) for m in m_ids)
- feed_entries.extend(
- self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in ids)
- if info['paging'] is None:
+ def _entries(self, page):
+ # The extraction process is the same as for playlists, but the regex
+ # for the video ids doesn't contain an index
+ ids = []
+ more_widget_html = content_html = page
+ for page_num in itertools.count(1):
+ matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
+
+ # 'recommended' feed has infinite 'load more' and each new portion spins
+ # the same videos in (sometimes) slightly different order, so we'll check
+ # for unicity and break when portion has no new videos
+ new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
+ if not new_ids:
break
- paging = info['paging']
- return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
-class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
- IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
- _FEED_NAME = 'subscriptions'
- _PLAYLIST_TITLE = u'Youtube Subscriptions'
+ ids.extend(new_ids)
-class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
- IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
- _FEED_NAME = 'recommended'
- _PLAYLIST_TITLE = u'Youtube Recommended videos'
+ for entry in self._ids_to_results(new_ids):
+ yield entry
+
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
+ break
-class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
- IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
- _FEED_NAME = 'watch_later'
- _PLAYLIST_TITLE = u'Youtube Watch Later'
- _PERSONAL_FEED = True
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ more_widget_html = more['load_more_widget_html']
+
+ def _real_extract(self, url):
+ page = self._download_webpage(
+ 'https://www.youtube.com/feed/%s' % self._FEED_NAME,
+ self._PLAYLIST_TITLE)
+ return self.playlist_result(
+ self._entries(page), playlist_title=self._PLAYLIST_TITLE)
+
+
+class YoutubeWatchLaterIE(YoutubePlaylistIE):
+ IE_NAME = 'youtube:watchlater'
+ IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'
+
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/playlist?list=WL',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ _, video = self._check_download_just_video(url, 'WL')
+ if video:
+ return video
+ _, playlist = self._extract_playlist('WL')
+ return playlist
-class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
- IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
- _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory'
- _FEED_NAME = 'history'
- _PERSONAL_FEED = True
- _PLAYLIST_TITLE = u'Youtube Watch History'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
- IE_NAME = u'youtube:favorites'
- IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
+ IE_NAME = 'youtube:favorites'
+ IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
_LOGIN_REQUIRED = True
def _real_extract(self, url):
webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
- playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
+ playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
return self.url_result(playlist_id, 'YoutubePlaylist')
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+ _FEED_NAME = 'recommended'
+ _PLAYLIST_TITLE = 'Youtube Recommended videos'
+
+
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+ _FEED_NAME = 'subscriptions'
+ _PLAYLIST_TITLE = 'Youtube Subscriptions'
+
+
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
+ _FEED_NAME = 'history'
+ _PLAYLIST_TITLE = 'Youtube History'
+
+
class YoutubeTruncatedURLIE(InfoExtractor):
IE_NAME = 'youtube:truncated_url'
IE_DESC = False # Do not list
- _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
+ _VALID_URL = r'''(?x)
+ (?:https?://)?
+ (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
+ (?:watch\?(?:
+ feature=[a-z_]+|
+ annotation_id=annotation_[^&]+|
+ x-yt-cl=[0-9]+|
+ hl=[^&]*|
+ t=[0-9]+
+ )?
+ |
+ attribution_link\?a=[^&]+
+ )
+ $
+ '''
+
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?feature=foo',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?hl=en-GB',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?t=2372',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ raise ExtractorError(
+ 'Did you forget to quote the URL? Remember that & is a meta '
+ 'character in most shells, so you want to put the URL in quotes, '
+ 'like youtube-dl '
+ '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
+ ' or simply youtube-dl BaW_jenozKc .',
+ expected=True)
+
+
+class YoutubeTruncatedIDIE(InfoExtractor):
+ IE_NAME = 'youtube:truncated_id'
+ IE_DESC = False # Do not list
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
+
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
+ video_id = self._match_id(url)
raise ExtractorError(
- u'Did you forget to quote the URL? Remember that & is a meta '
- u'character in most shells, so you want to put the URL in quotes, '
- u'like youtube-dl '
- u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
- u' (or simply youtube-dl BaW_jenozKc ).',
+ 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
expected=True)