import json
import os.path
import re
+import time
import traceback
from .common import InfoExtractor, SearchInfoExtractor
uppercase_escape,
)
+
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
- _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
- _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NETRC_MACHINE = 'youtube'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
def _set_language(self):
    """Store a ``PREF`` cookie carrying ``hl=en`` for ``.youtube.com``.

    The cookie is registered through ``self._set_cookie``; no page is
    downloaded and nothing is returned.
    """
    # YouTube sets the expire time to about two months
    expire_time = time.time() + 60 * 24 * 3600
    self._set_cookie(
        '.youtube.com', 'PREF', 'f1=50000000&hl=en',
        expire_time=expire_time)
def _login(self):
"""
# Log in
login_form_strs = {
- 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
- 'Email': username,
- 'GALX': galx,
- 'Passwd': password,
-
- 'PersistentCookie': 'yes',
- '_utf8': '霱',
- 'bgresponse': 'js_disabled',
- 'checkConnection': '',
- 'checkedDomains': 'youtube',
- 'dnConn': '',
- 'pstMsg': '0',
- 'rmShown': '1',
- 'secTok': '',
- 'signIn': 'Sign in',
- 'timeStmp': '',
- 'service': 'youtube',
- 'uilel': '3',
- 'hl': 'en_US',
+ 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
+ 'Email': username,
+ 'GALX': galx,
+ 'Passwd': password,
+
+ 'PersistentCookie': 'yes',
+ '_utf8': '霱',
+ 'bgresponse': 'js_disabled',
+ 'checkConnection': '',
+ 'checkedDomains': 'youtube',
+ 'dnConn': '',
+ 'pstMsg': '0',
+ 'rmShown': '1',
+ 'secTok': '',
+ 'signIn': 'Sign in',
+ 'timeStmp': '',
+ 'service': 'youtube',
+ 'uilel': '3',
+ 'hl': 'en_US',
}
# Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
# chokes on unicode
- login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+ login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
'service': 'youtube',
'hl': 'en_US',
}
- tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in tfa_form_strs.items())
+ tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
return False
return True
- def _confirm_age(self):
- age_form = {
- 'next_url': '/',
- 'action_confirm': 'Confirm',
- }
- req = compat_urllib_request.Request(self._AGE_URL,
- compat_urllib_parse.urlencode(age_form).encode('ascii'))
-
- self._download_webpage(
- req, None,
- note='Confirming age', errnote='Unable to confirm age',
- fatal=False)
-
def _real_initialize(self):
    """Prepare the extractor: set the language, then attempt to log in.

    Does nothing when no downloader is attached. Always returns None.
    """
    if self._downloader is None:
        return
    # The language is set unconditionally, before any login attempt.
    self._set_language()
    if not self._login():
        return
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
+ '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
+ '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
# Dash mp4 audio
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
+ '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
# Dash webm audio
'171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
'172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
- # Dash mov
- '298': {'ext': 'mov', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
- '299': {'ext': 'mov', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
- '266': {'ext': 'mov', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
+ # Dash webm audio with opus inside
+ '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
+ '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
+ '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
# RTMP (unnamed)
'_rtmp': {'protocol': 'rtmp'},
'info_dict': {
'id': 'IB3lcPjvWLA',
'ext': 'm4a',
- 'title': 'Afrojack - The Spark ft. Spree Wilson',
- 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
+ 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
+ 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
'uploader': 'AfrojackVEVO',
'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
'format': '141',
},
},
+ # Controversy video
+ {
+ 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
+ 'info_dict': {
+ 'id': 'T4XJQO3qol8',
+ 'ext': 'mp4',
+ 'upload_date': '20100909',
+ 'uploader': 'The Amazing Atheist',
+ 'uploader_id': 'TheAmazingAtheist',
+ 'title': 'Burning Everyone\'s Koran',
+ 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
+ }
+ },
+ # Normal age-gate video (No vevo, embed allowed)
+ {
+ 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
+ 'info_dict': {
+ 'id': 'HtVdAasjOgU',
+ 'ext': 'mp4',
+ 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
+ 'description': 'md5:eca57043abae25130f58f655ad9a7771',
+ 'uploader': 'The Witcher',
+ 'uploader_id': 'WitcherGame',
+ 'upload_date': '20140605',
+ },
+ },
]
def __init__(self, *args, **kwargs):
def gen_sig_code(idxs):
def _genslice(start, end, step):
    """Return Python source text for a slice of ``s``.

    The slice runs from index ``start`` through index ``end`` (inclusive)
    in increments of ``step``. Parts that equal the slice defaults are
    left out: a start of 0, a step of 1, and a stop bound that would be
    negative (written as a bare ``:``).
    """
    starts = '' if start == 0 else str(start)
    # The stop bound is exclusive, so go one step past ``end``.
    stop = end + step
    ends = (':%d' % stop) if stop >= 0 else ':'
    steps = '' if step == 1 else (':%d' % step)
    return 's[%s%s%s]' % (starts, ends, steps)
def _parse_sig_js(self, jscode):
funcname = self._search_regex(
- r'signature=([$a-zA-Z]+)', jscode,
- 'Initial JS player signature function name')
+ r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
+ 'Initial JS player signature function name')
jsi = JSInterpreter(jscode)
initial_function = jsi.extract_function(funcname)
return {}
player_config = json.loads(mobj.group(1))
try:
- args = player_config[u'args']
- caption_url = args[u'ttsurl']
- timestamp = args[u'timestamp']
+ args = player_config['args']
+ caption_url = args['ttsurl']
+ timestamp = args['timestamp']
# We get the available subtitles
list_params = compat_urllib_parse.urlencode({
'type': 'list',
list_url = caption_url + '&' + list_params
caption_list = self._download_xml(list_url, video_id)
original_lang_node = caption_list.find('track')
- if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
+ if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
self._downloader.report_warning('Video doesn\'t have automatic captions')
return {}
original_lang = original_lang_node.attrib['lang_code']
def _extract_from_m3u8(self, manifest_url, video_id):
url_map = {}
+
def _get_urls(_manifest):
    """Return the non-empty, non-comment lines of a manifest as a list.

    Each line is stripped first so that manifests with CRLF line endings
    do not yield entries ending in a carriage return. Lines starting
    with ``#`` are skipped.
    """
    stripped = (line.strip() for line in _manifest.split('\n'))
    # A real list (not a lazy filter object) behaves the same on
    # Python 2 and Python 3 and can be iterated more than once.
    return [line for line in stripped if line and not line.startswith('#')]
manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
formats_urls = _get_urls(manifest)
video_id = self.extract_id(url)
# Get video webpage
- url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
- pref_cookies = [
- c for c in self._downloader.cookiejar
- if c.domain == '.youtube.com' and c.name == 'PREF']
- for pc in pref_cookies:
- if 'hl=' in pc.value:
- pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
- else:
- if pc.value:
- pc.value += '&'
- pc.value += 'hl=en'
+ url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
video_webpage = self._download_webpage(url, video_id)
# Attempt to extract SWF player URL
player_url = None
# Get video info
- self.report_video_info_webpage_download(video_id)
if re.search(r'player-age-gate-content">', video_webpage) is not None:
- self.report_age_confirmation()
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
'video_id': video_id,
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
'sts': self._search_regex(
- r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
+ r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
})
video_info_url = proto + '://www.youtube.com/get_video_info?' + data
- video_info_webpage = self._download_webpage(video_info_url, video_id,
- note=False,
- errnote='unable to download video info webpage')
+ video_info_webpage = self._download_webpage(
+ video_info_url, video_id,
+ note='Refetching age-gated info webpage',
+ errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
else:
age_gate = False
- for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
- video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+ try:
+ # Try looking directly into the video webpage
+ mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
+ if not mobj:
+ raise ValueError('Could not find ytplayer.config') # caught below
+ json_code = uppercase_escape(mobj.group(1))
+ ytplayer_config = json.loads(json_code)
+ args = ytplayer_config['args']
+ # Convert to the same format returned by compat_parse_qs
+ video_info = dict((k, [v]) for k, v in args.items())
+ if 'url_encoded_fmt_stream_map' not in args:
+ raise ValueError('No stream_map present') # caught below
+ except ValueError:
+ # We fallback to the get_video_info pages (used by the embed page)
+ self.report_video_info_webpage_download(video_id)
+ for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+ video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (video_id, el_type))
- video_info_webpage = self._download_webpage(video_info_url, video_id,
- note=False,
- errnote='unable to download video info webpage')
- video_info = compat_parse_qs(video_info_webpage)
- if 'token' in video_info:
- break
+ video_info_webpage = self._download_webpage(video_info_url,
+ video_id, note=False,
+ errnote='unable to download video info webpage')
+ video_info = compat_parse_qs(video_info_webpage)
+ if 'token' in video_info:
+ break
if 'token' not in video_info:
if 'reason' in video_info:
raise ExtractorError(
# annotations
video_annotations = None
if self._downloader.params.get('writeannotations', False):
- video_annotations = self._extract_annotations(video_id)
-
- # Decide which formats to download
- try:
- mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
- if not mobj:
- raise ValueError('Could not find vevo ID')
- json_code = uppercase_escape(mobj.group(1))
- ytplayer_config = json.loads(json_code)
- args = ytplayer_config['args']
- # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
- # this signatures are encrypted
- if 'url_encoded_fmt_stream_map' not in args:
- raise ValueError('No stream_map present') # caught below
- re_signature = re.compile(r'[&,]s=')
- m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
- if m_s is not None:
- self.to_screen('%s: Encrypted signatures detected.' % video_id)
- video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
- m_s = re_signature.search(args.get('adaptive_fmts', ''))
- if m_s is not None:
- if 'adaptive_fmts' in video_info:
- video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
- else:
- video_info['adaptive_fmts'] = [args['adaptive_fmts']]
- except ValueError:
- pass
+ video_annotations = self._extract_annotations(video_id)
def _map_to_format_list(urlmap):
formats = []
'player_url': player_url,
}]
elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
- encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
+ encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
url_map = {}
parts_sizes = self._signature_cache_id(encrypted_sig)
self.to_screen('{%s} signature length %s, %s' %
- (format_id, parts_sizes, player_desc))
+ (format_id, parts_sizes, player_desc))
signature = self._decrypt_signature(
encrypted_sig, video_id, player_url, age_gate)
# However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
# Luckily, it seems, this case uses some kind of default signature (len == 86), so the
# combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
- if age_gate:
- dash_manifest_url = video_info.get('dashmpd')[0]
- else:
- dash_manifest_url = ytplayer_config['args']['dashmpd']
+ dash_manifest_url = video_info.get('dashmpd')[0]
+
def decrypt_sig(mobj):
s = mobj.group(1)
dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
existing_format.update(f)
except (ExtractorError, KeyError) as e:
- self.report_warning('Skipping DASH manifest: %s' % e, video_id)
+ self.report_warning('Skipping DASH manifest: %r' % e, video_id)
self._sort_formats(formats)
return {
- 'id': video_id,
- 'uploader': video_uploader,
- 'uploader_id': video_uploader_id,
- 'upload_date': upload_date,
- 'title': video_title,
- 'thumbnail': video_thumbnail,
- 'description': video_description,
- 'categories': video_categories,
- 'subtitles': video_subtitles,
- 'duration': video_duration,
- 'age_limit': 18 if age_gate else 0,
- 'annotations': video_annotations,
+ 'id': video_id,
+ 'uploader': video_uploader,
+ 'uploader_id': video_uploader_id,
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'thumbnail': video_thumbnail,
+ 'description': video_description,
+ 'categories': video_categories,
+ 'subtitles': video_subtitles,
+ 'duration': video_duration,
+ 'age_limit': 18 if age_gate else 0,
+ 'annotations': video_annotations,
'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
- 'view_count': view_count,
+ 'view_count': view_count,
'like_count': like_count,
'dislike_count': dislike_count,
- 'formats': formats,
+ 'formats': formats,
}
+
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com playlists'
_VALID_URL = r"""(?x)(?:
)
(
(?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
- # Top tracks, they can also include dots
+ # Top tracks, they can also include dots
|(?:MC)[\w\.]*
)
.*
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
'info_dict': {
'title': 'ytdl test PL',
+ 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
},
'playlist_count': 3,
}, {
return self._extract_mix(playlist_id)
if playlist_id.startswith('TL'):
raise ExtractorError('For downloading YouTube.com top lists, use '
- 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
+ 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
url = self._TEMPLATE_URL % playlist_id
page = self._download_webpage(url, playlist_id)
class YoutubeTopListIE(YoutubePlaylistIE):
IE_NAME = 'youtube:toplist'
IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
- ' (Example: "yttoplist:music:Top Tracks")')
+ ' (Example: "yttoplist:music:Top Tracks")')
_VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
_TESTS = [{
'url': 'yttoplist:music:Trending',
<span[^>]*>.*?%s.*?</span>''' % re.escape(query),
channel_page, 'list')
url = compat_urlparse.urljoin('https://www.youtube.com/', link)
-
+
video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
ids = []
# sometimes the webpage doesn't contain the videos
ids_in_page = self.extract_videos_from_page(page['content_html'])
video_ids.extend(ids_in_page)
-
+
if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
break
# Don't return True if the url can be extracted with other youtube
# extractor, the regex would is too permissive and it would match.
other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
- if any(ie.suitable(url) for ie in other_ies): return False
- else: return super(YoutubeUserIE, cls).suitable(url)
+ if any(ie.suitable(url) for ie in other_ies):
+ return False
+ else:
+ return super(YoutubeUserIE, cls).suitable(url)
def _real_extract(self, url):
# Extract username
paging = 0
for i in itertools.count(1):
info = self._download_json(self._FEED_TEMPLATE % paging,
- '%s feed' % self._FEED_NAME,
- 'Downloading page %s' % i)
+ '%s feed' % self._FEED_NAME,
+ 'Downloading page %s' % i)
feed_html = info.get('feed_html') or info.get('content_html')
load_more_widget_html = info.get('load_more_widget_html') or feed_html
m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
paging = mobj.group('paging')
return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
+
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the "recommended" feed; the URL pattern also
    # accepts the ":ytrec" / ":ytrecommended" shorthand.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
+
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the "watch_later" feed; the URL pattern also
    # accepts the ":ytwatchlater" shorthand.
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _PERSONAL_FEED = True
+
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the "history" feed; the URL pattern also accepts
    # the ":ythistory" shorthand.
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: "\." is not a valid escape in a normal string literal.
    # The resulting pattern text is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
+
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = 'youtube:favorites'
- IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
+ IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
_LOGIN_REQUIRED = True