Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youtube.py
Update upstream source from tag 'upstream/2020.05.08'
[youtubedl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28 )
29 from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 extract_attributes,
34 ExtractorError,
35 float_or_none,
36 get_element_by_attribute,
37 get_element_by_id,
38 int_or_none,
39 mimetype2ext,
40 orderedSet,
41 parse_codecs,
42 parse_duration,
43 remove_quotes,
44 remove_start,
45 smuggle_url,
46 str_or_none,
47 str_to_int,
48 try_get,
49 unescapeHTML,
50 unified_strdate,
51 unsmuggle_url,
52 uppercase_escape,
53 url_or_none,
54 urlencode_postdata,
55 )
56
57
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints used, in this order, by the multi-step sign-in in _login()
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the TL value extracted from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        """Set the PREF cookie on .youtube.com with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video ID in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.

        The flow is: fetch the login page for its hidden form inputs, look up
        the account to obtain a user hash, submit the password challenge,
        optionally submit a two-factor code, and finally fetch the
        CheckCookie URL to confirm the session.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) to honour the documented contract
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST f_req (JSON-encoded) with the login form fields; return parsed JSON or False."""
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything before the first '[' so the rest parses as JSON
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Sent verbatim inside both the lookup and the challenge requests
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesised: '%' binds tighter than the conditional expression,
            # so without this the prefix is lost for any other message.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Codes may be entered with a leading 'G-'; strip it before submitting
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Same precedence fix as for the password challenge above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with disable_polymer=true added to the query."""
        # Copy so the caller's dict is never mutated; tolerate an explicit query=None
        query = (kwargs.get('query') or {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, unless there is no downloader."""
        if self._downloader is None:
            return
        self._set_language()
        # A failed login is not fatal here: the return value is ignored
        self._login()
280
281
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    """Shared pagination logic for pages that expose a "Load more" button.

    _entries() calls self._process_page(html), which this class does not
    define, to obtain the entries of each HTML fragment.
    """

    def _entries(self, page, playlist_id):
        """Yield entries from page and from every follow-up "Load more" page.

        page is the initial HTML; playlist_id is passed to _download_json
        for each continuation request. Iteration stops when no load-more
        link is found or when a continuation returns blank content.
        """
        max_retries = 3
        # The first page serves both as content and as the place to look
        # for the load-more link
        fragment_html = page
        widget_html = page

        for page_num in itertools.count(1):
            for item in self._process_page(fragment_html):
                yield item

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                return

            for attempt in range(max_retries + 1):
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    more = self._download_json(
                        'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                        'Downloading page #%s%s'
                        % (page_num, ' (retry #%d)' % attempt if attempt else ''),
                        transform_source=uppercase_escape)
                except ExtractorError as err:
                    transient = (
                        isinstance(err.cause, compat_HTTPError)
                        and err.cause.code in (500, 503))
                    if transient and attempt < max_retries:
                        continue
                    # Not a transient error, or the retry budget is spent
                    raise
                else:
                    break

            fragment_html = more['content_html']
            if not fragment_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = more['load_more_widget_html']
319
320
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor base whose entries are individual videos.

    extract_videos_from_page() reads self._VIDEO_RE, which is not defined
    in this class.
    """

    def _process_page(self, content):
        """Yield a 'Youtube' url_result for every (id, title) pair found in content."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Collect video IDs and titles matched by video_re in page.

        ids_in_page and titles_in_page are parallel lists, extended in place.
        An ID already present is not appended again; its title is filled in
        only if it was still empty.
        """
        for mobj in re.finditer(video_re, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): this compares the 'id' group, not 'index', against '0' - confirm intent
            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(
                mobj.group('title')) if 'title' in mobj.groupdict() else None
            if video_title:
                video_title = video_title.strip()
            # U+25BA (black right-pointing pointer) written as an escape so the
            # comparison survives any re-encoding of this source file
            if video_title == '\u25ba Play all':
                video_title = None
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                # First occurrence of this ID
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return zipped (video_id, title) pairs matched by self._VIDEO_RE in page."""
        ids_in_page = []
        titles_in_page = []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)
352
353
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor base whose entries are playlists."""

    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result per distinct playlist link in content."""
        lockup_link_re = r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"'
        found_ids = re.findall(lockup_link_re, content)
        # orderedSet is applied so each playlist ID is yielded once
        for list_id in orderedSet(found_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % list_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download url and return a playlist result built from its paginated entries."""
        list_id = self._match_id(url)
        html = self._download_webpage(url, list_id)
        # A missing og:title is tolerated (fatal=False)
        page_title = self._og_search_title(html, fatal=False)
        entries = self._entries(html, list_id)
        return self.playlist_result(entries, list_id, page_title)
367
368
369 class YoutubeIE(YoutubeBaseInfoExtractor):
370 IE_DESC = 'YouTube.com'
371 _VALID_URL = r"""(?x)^
372 (
373 (?:https?://|//) # http(s):// or protocol-independent URL
374 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
375 (?:www\.)?deturl\.com/www\.youtube\.com/|
376 (?:www\.)?pwnyoutube\.com/|
377 (?:www\.)?hooktube\.com/|
378 (?:www\.)?yourepeat\.com/|
379 tube\.majestyc\.net/|
380 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
381 (?:(?:www|dev)\.)?invidio\.us/|
382 (?:(?:www|no)\.)?invidiou\.sh/|
383 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
384 (?:www\.)?invidious\.kabi\.tk/|
385 (?:www\.)?invidious\.13ad\.de/|
386 (?:www\.)?invidious\.mastodon\.host/|
387 (?:www\.)?invidious\.nixnet\.xyz/|
388 (?:www\.)?invidious\.drycat\.fr/|
389 (?:www\.)?tube\.poal\.co/|
390 (?:www\.)?vid\.wxzm\.sx/|
391 (?:www\.)?yt\.elukerio\.org/|
392 (?:www\.)?yt\.lelux\.fi/|
393 (?:www\.)?kgg2m7yk5aybusll\.onion/|
394 (?:www\.)?qklhadlycap4cnod\.onion/|
395 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
396 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
397 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
398 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
399 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
400 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
401 (?:.*?\#/)? # handle anchor (#/) redirect urls
402 (?: # the various things that can precede the ID:
403 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
404 |(?: # or the v= param in all its forms
405 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
406 (?:\?|\#!?) # the params delimiter ? or # or #!
407 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
408 v=
409 )
410 ))
411 |(?:
412 youtu\.be| # just youtu.be/xxxx
413 vid\.plus| # or vid.plus/xxxx
414 zwearz\.com/watch| # or zwearz.com/watch/xxxx
415 )/
416 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
417 )
418 )? # all until now is optional -> you can pass the naked ID
419 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
420 (?!.*?\blist=
421 (?:
422 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
423 WL # WL are handled by the watch later IE
424 )
425 )
426 (?(1).+)? # if we found the ID, everything can follow
427 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
428 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
429 _PLAYER_INFO_RE = (
430 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
431 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
432 )
433 _formats = {
434 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
435 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
436 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
437 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
438 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
439 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
440 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
441 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
442 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
443 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
444 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
445 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
446 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
447 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
448 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
449 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
450 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
451 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
452
453
454 # 3D videos
455 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
456 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
457 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
458 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
459 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
460 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
461 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
462
463 # Apple HTTP Live Streaming
464 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
465 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
466 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
467 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
468 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
469 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
470 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
471 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
472
473 # DASH mp4 video
474 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
475 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
476 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
477 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
478 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
479 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
480 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
481 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
482 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
483 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
484 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
485 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
486
487 # Dash mp4 audio
488 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
489 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
490 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
491 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
492 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
493 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
494 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
495
496 # Dash webm
497 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
498 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
499 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
500 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
501 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
502 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
503 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
504 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
505 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
506 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
507 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
508 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
509 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
510 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
511 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
512 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
513 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
514 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
515 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
516 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
517 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
518 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
519
520 # Dash webm audio
521 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
522 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
523
524 # Dash webm audio with opus inside
525 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
526 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
527 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
528
529 # RTMP (unnamed)
530 '_rtmp': {'protocol': 'rtmp'},
531
532 # av01 video only formats sometimes served with "unknown" codecs
533 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
534 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
535 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
536 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
537 }
538 _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
539
540 _GEO_BYPASS = False
541
542 IE_NAME = 'youtube'
543 _TESTS = [
544 {
545 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
546 'info_dict': {
547 'id': 'BaW_jenozKc',
548 'ext': 'mp4',
549 'title': 'youtube-dl test video "\'/\\Ƥā†­š•',
550 'uploader': 'Philipp Hagemeister',
551 'uploader_id': 'phihag',
552 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
553 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
554 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
555 'upload_date': '20121002',
556 'description': 'test chars: "\'/\\Ƥā†­š•\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
557 'categories': ['Science & Technology'],
558 'tags': ['youtube-dl'],
559 'duration': 10,
560 'view_count': int,
561 'like_count': int,
562 'dislike_count': int,
563 'start_time': 1,
564 'end_time': 9,
565 }
566 },
567 {
568 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
569 'note': 'Test generic use_cipher_signature video (#897)',
570 'info_dict': {
571 'id': 'UxxajLWwzqY',
572 'ext': 'mp4',
573 'upload_date': '20120506',
574 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
575 'alt_title': 'I Love It (feat. Charli XCX)',
576 'description': 'md5:19a2f98d9032b9311e686ed039564f63',
577 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
578 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
579 'iconic ep', 'iconic', 'love', 'it'],
580 'duration': 180,
581 'uploader': 'Icona Pop',
582 'uploader_id': 'IconaPop',
583 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
584 'creator': 'Icona Pop',
585 'track': 'I Love It (feat. Charli XCX)',
586 'artist': 'Icona Pop',
587 }
588 },
589 {
590 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
591 'note': 'Test VEVO video with age protection (#956)',
592 'info_dict': {
593 'id': '07FYdnEawAQ',
594 'ext': 'mp4',
595 'upload_date': '20130703',
596 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',
597 'alt_title': 'Tunnel Vision',
598 'description': 'md5:07dab3356cde4199048e4c7cd93471e1',
599 'duration': 419,
600 'uploader': 'justintimberlakeVEVO',
601 'uploader_id': 'justintimberlakeVEVO',
602 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
603 'creator': 'Justin Timberlake',
604 'track': 'Tunnel Vision',
605 'artist': 'Justin Timberlake',
606 'age_limit': 18,
607 }
608 },
609 {
610 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
611 'note': 'Embed-only video (#1746)',
612 'info_dict': {
613 'id': 'yZIXLfi8CZQ',
614 'ext': 'mp4',
615 'upload_date': '20120608',
616 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
617 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
618 'uploader': 'SET India',
619 'uploader_id': 'setindia',
620 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
621 'age_limit': 18,
622 }
623 },
624 {
625 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
626 'note': 'Use the first video ID in the URL',
627 'info_dict': {
628 'id': 'BaW_jenozKc',
629 'ext': 'mp4',
630 'title': 'youtube-dl test video "\'/\\Ƥā†­š•',
631 'uploader': 'Philipp Hagemeister',
632 'uploader_id': 'phihag',
633 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
634 'upload_date': '20121002',
635 'description': 'test chars: "\'/\\Ƥā†­š•\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
636 'categories': ['Science & Technology'],
637 'tags': ['youtube-dl'],
638 'duration': 10,
639 'view_count': int,
640 'like_count': int,
641 'dislike_count': int,
642 },
643 'params': {
644 'skip_download': True,
645 },
646 },
647 {
648 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
649 'note': '256k DASH audio (format 141) via DASH manifest',
650 'info_dict': {
651 'id': 'a9LDPn-MO4I',
652 'ext': 'm4a',
653 'upload_date': '20121002',
654 'uploader_id': '8KVIDEO',
655 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
656 'description': '',
657 'uploader': '8KVIDEO',
658 'title': 'UHDTV TEST 8K VIDEO.mp4'
659 },
660 'params': {
661 'youtube_include_dash_manifest': True,
662 'format': '141',
663 },
664 'skip': 'format 141 not served anymore',
665 },
666 # DASH manifest with encrypted signature
667 {
668 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
669 'info_dict': {
670 'id': 'IB3lcPjvWLA',
671 'ext': 'm4a',
672 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
673 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
674 'duration': 244,
675 'uploader': 'AfrojackVEVO',
676 'uploader_id': 'AfrojackVEVO',
677 'upload_date': '20131011',
678 },
679 'params': {
680 'youtube_include_dash_manifest': True,
681 'format': '141/bestaudio[ext=m4a]',
682 },
683 },
684 # JS player signature function name containing $
685 {
686 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
687 'info_dict': {
688 'id': 'nfWlot6h_JM',
689 'ext': 'm4a',
690 'title': 'Taylor Swift - Shake It Off',
691 'description': 'md5:307195cd21ff7fa352270fe884570ef0',
692 'duration': 242,
693 'uploader': 'TaylorSwiftVEVO',
694 'uploader_id': 'TaylorSwiftVEVO',
695 'upload_date': '20140818',
696 },
697 'params': {
698 'youtube_include_dash_manifest': True,
699 'format': '141/bestaudio[ext=m4a]',
700 },
701 },
702 # Controversy video
703 {
704 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
705 'info_dict': {
706 'id': 'T4XJQO3qol8',
707 'ext': 'mp4',
708 'duration': 219,
709 'upload_date': '20100909',
710 'uploader': 'Amazing Atheist',
711 'uploader_id': 'TheAmazingAtheist',
712 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
713 'title': 'Burning Everyone\'s Koran',
714 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
715 }
716 },
717 # Normal age-gate video (No vevo, embed allowed)
718 {
719 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
720 'info_dict': {
721 'id': 'HtVdAasjOgU',
722 'ext': 'mp4',
723 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
724 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
725 'duration': 142,
726 'uploader': 'The Witcher',
727 'uploader_id': 'WitcherGame',
728 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
729 'upload_date': '20140605',
730 'age_limit': 18,
731 },
732 },
733 # Age-gate video with encrypted signature
734 {
735 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
736 'info_dict': {
737 'id': '6kLq3WMV1nU',
738 'ext': 'mp4',
739 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
740 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
741 'duration': 246,
742 'uploader': 'LloydVEVO',
743 'uploader_id': 'LloydVEVO',
744 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
745 'upload_date': '20110629',
746 'age_limit': 18,
747 },
748 },
749 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
750 # YouTube Red ad is not captured for creator
751 {
752 'url': '__2ABJjxzNo',
753 'info_dict': {
754 'id': '__2ABJjxzNo',
755 'ext': 'mp4',
756 'duration': 266,
757 'upload_date': '20100430',
758 'uploader_id': 'deadmau5',
759 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
760 'creator': 'Dada Life, deadmau5',
761 'description': 'md5:12c56784b8032162bb936a5f76d55360',
762 'uploader': 'deadmau5',
763 'title': 'Deadmau5 - Some Chords (HD)',
764 'alt_title': 'This Machine Kills Some Chords',
765 },
766 'expected_warnings': [
767 'DASH manifest missing',
768 ]
769 },
770 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
771 {
772 'url': 'lqQg6PlCWgI',
773 'info_dict': {
774 'id': 'lqQg6PlCWgI',
775 'ext': 'mp4',
776 'duration': 6085,
777 'upload_date': '20150827',
778 'uploader_id': 'olympic',
779 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
780 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
781 'uploader': 'Olympic',
782 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
783 },
784 'params': {
785 'skip_download': 'requires avconv',
786 }
787 },
788 # Non-square pixels
789 {
790 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
791 'info_dict': {
792 'id': '_b-2C3KPAM0',
793 'ext': 'mp4',
794 'stretched_ratio': 16 / 9.,
795 'duration': 85,
796 'upload_date': '20110310',
797 'uploader_id': 'AllenMeow',
798 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
799 'description': 'made by Wacom from Korea | 字幕&åŠ ę²¹ę·»é†‹ by TY\'s Allen | ę„Ÿč¬heylisa00cavey1001同å­øē†±ęƒ…ęä¾›ę¢—åŠēæ»č­Æ',
800 'uploader': 'å­«į„‹į„…',
801 'title': '[A-made] č®Šę…‹å¦å­—å¹•ē‰ˆ å¤Ŗ妍 ęˆ‘å°±ę˜Æ這ęØ£ēš„äŗŗ',
802 },
803 },
804 # url_encoded_fmt_stream_map is empty string
805 {
806 'url': 'qEJwOuvDf7I',
807 'info_dict': {
808 'id': 'qEJwOuvDf7I',
809 'ext': 'webm',
810 'title': 'ŠžŠ±ŃŃƒŠ¶Š“ŠµŠ½ŠøŠµ суŠ“ŠµŠ±Š½Š¾Š¹ ŠæрŠ°ŠŗтŠøŠŗŠø ŠæŠ¾ Š²Ń‹Š±Š¾Ń€Š°Š¼ 14 сŠµŠ½Ń‚яŠ±Ń€Ń 2014 Š³Š¾Š“Š° Š² Š”Š°Š½Šŗт-ŠŸŠµŃ‚ŠµŃ€Š±ŃƒŃ€Š³Šµ',
811 'description': '',
812 'upload_date': '20150404',
813 'uploader_id': 'spbelect',
814 'uploader': 'ŠŠ°Š±Š»ŃŽŠ“Š°Ń‚ŠµŠ»Šø ŠŸŠµŃ‚ŠµŃ€Š±ŃƒŃ€Š³Š°',
815 },
816 'params': {
817 'skip_download': 'requires avconv',
818 },
819 'skip': 'This live event has ended.',
820 },
821 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
822 {
823 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
824 'info_dict': {
825 'id': 'FIl7x6_3R5Y',
826 'ext': 'webm',
827 'title': 'md5:7b81415841e02ecd4313668cde88737a',
828 'description': 'md5:116377fd2963b81ec4ce64b542173306',
829 'duration': 220,
830 'upload_date': '20150625',
831 'uploader_id': 'dorappi2000',
832 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
833 'uploader': 'dorappi2000',
834 'formats': 'mincount:31',
835 },
836 'skip': 'not actual anymore',
837 },
838 # DASH manifest with segment_list
839 {
840 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
841 'md5': '8ce563a1d667b599d21064e982ab9e31',
842 'info_dict': {
843 'id': 'CsmdDsKjzN8',
844 'ext': 'mp4',
845 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
846 'uploader': 'Airtek',
847 'description': 'RetransmisiĆ³n en directo de la XVIII media maratĆ³n de Zaragoza.',
848 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
849 'title': 'RetransmisiĆ³n XVIII Media maratĆ³n Zaragoza 2015',
850 },
851 'params': {
852 'youtube_include_dash_manifest': True,
853 'format': '135', # bestvideo
854 },
855 'skip': 'This live event has ended.',
856 },
857 {
858 # Multifeed videos (multiple cameras), URL is for Main Camera
859 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
860 'info_dict': {
861 'id': 'jqWvoWXjCVs',
862 'title': 'teamPGP: Rocket League Noob Stream',
863 'description': 'md5:dc7872fb300e143831327f1bae3af010',
864 },
865 'playlist': [{
866 'info_dict': {
867 'id': 'jqWvoWXjCVs',
868 'ext': 'mp4',
869 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
870 'description': 'md5:dc7872fb300e143831327f1bae3af010',
871 'duration': 7335,
872 'upload_date': '20150721',
873 'uploader': 'Beer Games Beer',
874 'uploader_id': 'beergamesbeer',
875 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
876 'license': 'Standard YouTube License',
877 },
878 }, {
879 'info_dict': {
880 'id': '6h8e8xoXJzg',
881 'ext': 'mp4',
882 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
883 'description': 'md5:dc7872fb300e143831327f1bae3af010',
884 'duration': 7337,
885 'upload_date': '20150721',
886 'uploader': 'Beer Games Beer',
887 'uploader_id': 'beergamesbeer',
888 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
889 'license': 'Standard YouTube License',
890 },
891 }, {
892 'info_dict': {
893 'id': 'PUOgX5z9xZw',
894 'ext': 'mp4',
895 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
896 'description': 'md5:dc7872fb300e143831327f1bae3af010',
897 'duration': 7337,
898 'upload_date': '20150721',
899 'uploader': 'Beer Games Beer',
900 'uploader_id': 'beergamesbeer',
901 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
902 'license': 'Standard YouTube License',
903 },
904 }, {
905 'info_dict': {
906 'id': 'teuwxikvS5k',
907 'ext': 'mp4',
908 'title': 'teamPGP: Rocket League Noob Stream (zim)',
909 'description': 'md5:dc7872fb300e143831327f1bae3af010',
910 'duration': 7334,
911 'upload_date': '20150721',
912 'uploader': 'Beer Games Beer',
913 'uploader_id': 'beergamesbeer',
914 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
915 'license': 'Standard YouTube License',
916 },
917 }],
918 'params': {
919 'skip_download': True,
920 },
921 'skip': 'This video is not available.',
922 },
923 {
924 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
925 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
926 'info_dict': {
927 'id': 'gVfLd0zydlo',
928 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
929 },
930 'playlist_count': 2,
931 'skip': 'Not multifeed anymore',
932 },
933 {
934 'url': 'https://vid.plus/FlRa-iH7PGw',
935 'only_matching': True,
936 },
937 {
938 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
939 'only_matching': True,
940 },
941 {
942 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
943 # Also tests cut-off URL expansion in video description (see
944 # https://github.com/ytdl-org/youtube-dl/issues/1892,
945 # https://github.com/ytdl-org/youtube-dl/issues/8164)
946 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
947 'info_dict': {
948 'id': 'lsguqyKfVQg',
949 'ext': 'mp4',
950 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
951 'alt_title': 'Dark Walk - Position Music',
952 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
953 'duration': 133,
954 'upload_date': '20151119',
955 'uploader_id': 'IronSoulElf',
956 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
957 'uploader': 'IronSoulElf',
958 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
959 'track': 'Dark Walk - Position Music',
960 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
961 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
962 },
963 'params': {
964 'skip_download': True,
965 },
966 },
967 {
968 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
969 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
970 'only_matching': True,
971 },
972 {
973 # Video with yt:stretch=17:0
974 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
975 'info_dict': {
976 'id': 'Q39EVAstoRM',
977 'ext': 'mp4',
978 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
979 'description': 'md5:ee18a25c350637c8faff806845bddee9',
980 'upload_date': '20151107',
981 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
982 'uploader': 'CH GAMER DROID',
983 },
984 'params': {
985 'skip_download': True,
986 },
987 'skip': 'This video does not exist.',
988 },
989 {
990 # Video licensed under Creative Commons
991 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
992 'info_dict': {
993 'id': 'M4gD1WSo5mA',
994 'ext': 'mp4',
995 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
996 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
997 'duration': 721,
998 'upload_date': '20150127',
999 'uploader_id': 'BerkmanCenter',
1000 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1001 'uploader': 'The Berkman Klein Center for Internet & Society',
1002 'license': 'Creative Commons Attribution license (reuse allowed)',
1003 },
1004 'params': {
1005 'skip_download': True,
1006 },
1007 },
1008 {
1009 # Channel-like uploader_url
1010 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1011 'info_dict': {
1012 'id': 'eQcmzGIKrzg',
1013 'ext': 'mp4',
1014 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1015 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
1016 'duration': 4060,
1017 'upload_date': '20151119',
1018 'uploader': 'Bernie Sanders',
1019 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1020 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1021 'license': 'Creative Commons Attribution license (reuse allowed)',
1022 },
1023 'params': {
1024 'skip_download': True,
1025 },
1026 },
1027 {
1028 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1029 'only_matching': True,
1030 },
1031 {
1032 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1033 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1034 'only_matching': True,
1035 },
1036 {
1037 # Rental video preview
1038 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1039 'info_dict': {
1040 'id': 'uGpuVWrhIzE',
1041 'ext': 'mp4',
1042 'title': 'Piku - Trailer',
1043 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1044 'upload_date': '20150811',
1045 'uploader': 'FlixMatrix',
1046 'uploader_id': 'FlixMatrixKaravan',
1047 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1048 'license': 'Standard YouTube License',
1049 },
1050 'params': {
1051 'skip_download': True,
1052 },
1053 'skip': 'This video is not available.',
1054 },
1055 {
1056 # YouTube Red video with episode data
1057 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1058 'info_dict': {
1059 'id': 'iqKdEhx-dD4',
1060 'ext': 'mp4',
1061 'title': 'Isolation - Mind Field (Ep 1)',
1062 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
1063 'duration': 2085,
1064 'upload_date': '20170118',
1065 'uploader': 'Vsauce',
1066 'uploader_id': 'Vsauce',
1067 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1068 'series': 'Mind Field',
1069 'season_number': 1,
1070 'episode_number': 1,
1071 },
1072 'params': {
1073 'skip_download': True,
1074 },
1075 'expected_warnings': [
1076 'Skipping DASH manifest',
1077 ],
1078 },
1079 {
1080 # The following content has been identified by the YouTube community
1081 # as inappropriate or offensive to some audiences.
1082 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1083 'info_dict': {
1084 'id': '6SJNVb0GnPI',
1085 'ext': 'mp4',
1086 'title': 'Race Differences in Intelligence',
1087 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1088 'duration': 965,
1089 'upload_date': '20140124',
1090 'uploader': 'New Century Foundation',
1091 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1092 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1093 },
1094 'params': {
1095 'skip_download': True,
1096 },
1097 },
1098 {
1099 # itag 212
1100 'url': '1t24XAntNCY',
1101 'only_matching': True,
1102 },
1103 {
1104 # geo restricted to JP
1105 'url': 'sJL6WA-aGkQ',
1106 'only_matching': True,
1107 },
1108 {
1109 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1110 'only_matching': True,
1111 },
1112 {
1113 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1114 'only_matching': True,
1115 },
1116 {
1117 # DRM protected
1118 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1119 'only_matching': True,
1120 },
1121 {
1122 # Video with unsupported adaptive stream type formats
1123 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1124 'info_dict': {
1125 'id': 'Z4Vy8R84T1U',
1126 'ext': 'mp4',
1127 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1128 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1129 'duration': 433,
1130 'upload_date': '20130923',
1131 'uploader': 'Amelia Putri Harwita',
1132 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1133 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1134 'formats': 'maxcount:10',
1135 },
1136 'params': {
1137 'skip_download': True,
1138 'youtube_include_dash_manifest': False,
1139 },
1140 'skip': 'not actual anymore',
1141 },
1142 {
1143 # Youtube Music Auto-generated description
1144 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1145 'info_dict': {
1146 'id': 'MgNrAu2pzNs',
1147 'ext': 'mp4',
1148 'title': 'Voyeur Girl',
1149 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1150 'upload_date': '20190312',
1151 'uploader': 'Stephen - Topic',
1152 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1153 'artist': 'Stephen',
1154 'track': 'Voyeur Girl',
1155 'album': 'it\'s too much love to know my dear',
1156 'release_date': '20190313',
1157 'release_year': 2019,
1158 },
1159 'params': {
1160 'skip_download': True,
1161 },
1162 },
1163 {
1164 # Youtube Music Auto-generated description
1165 # Retrieve 'artist' field from 'Artist:' in video description
1166 # when it is present on youtube music video
1167 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1168 'info_dict': {
1169 'id': 'k0jLE7tTwjY',
1170 'ext': 'mp4',
1171 'title': 'Latch Feat. Sam Smith',
1172 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1173 'upload_date': '20150110',
1174 'uploader': 'Various Artists - Topic',
1175 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1176 'artist': 'Disclosure',
1177 'track': 'Latch Feat. Sam Smith',
1178 'album': 'Latch Featuring Sam Smith',
1179 'release_date': '20121008',
1180 'release_year': 2012,
1181 },
1182 'params': {
1183 'skip_download': True,
1184 },
1185 },
1186 {
1187 # Youtube Music Auto-generated description
1188 # handle multiple artists on youtube music video
1189 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1190 'info_dict': {
1191 'id': '74qn0eJSjpA',
1192 'ext': 'mp4',
1193 'title': 'Eastside',
1194 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1195 'upload_date': '20180710',
1196 'uploader': 'Benny Blanco - Topic',
1197 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1198 'artist': 'benny blanco, Halsey, Khalid',
1199 'track': 'Eastside',
1200 'album': 'Eastside',
1201 'release_date': '20180713',
1202 'release_year': 2018,
1203 },
1204 'params': {
1205 'skip_download': True,
1206 },
1207 },
1208 {
1209 # Youtube Music Auto-generated description
1210 # handle youtube music video with release_year and no release_date
1211 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1212 'info_dict': {
1213 'id': '-hcAI0g-f5M',
1214 'ext': 'mp4',
1215 'title': 'Put It On Me',
1216 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
1217 'upload_date': '20180426',
1218 'uploader': 'Matt Maeson - Topic',
1219 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1220 'artist': 'Matt Maeson',
1221 'track': 'Put It On Me',
1222 'album': 'The Hearse',
1223 'release_date': None,
1224 'release_year': 2018,
1225 },
1226 'params': {
1227 'skip_download': True,
1228 },
1229 },
1230 {
1231 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1232 'only_matching': True,
1233 },
1234 {
1235 # invalid -> valid video id redirection
1236 'url': 'DJztXj2GPfl',
1237 'info_dict': {
1238 'id': 'DJztXj2GPfk',
1239 'ext': 'mp4',
1240 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1241 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1242 'upload_date': '20090125',
1243 'uploader': 'Prochorowka',
1244 'uploader_id': 'Prochorowka',
1245 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1246 'artist': 'Panjabi MC',
1247 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1248 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1249 },
1250 'params': {
1251 'skip_download': True,
1252 },
1253 }
1254 ]
1255
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature; keyed by
    # (player_url, signature cache id), values are signature functions.
    self._player_cache = dict()
1259
def report_video_info_webpage_download(self, video_id):
    """Print a note that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
1263
def report_information_extraction(self, video_id):
    """Print a note that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
1267
def report_unavailable_format(self, video_id, format):
    """Print a note that the requested format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
1271
def report_rtmp_download(self):
    """Print a note that the download will go over RTMP."""
    message = 'RTMP download detected'
    self.to_screen(message)
1275
def _signature_cache_id(self, example_sig):
    """Return the shape of a signature as a string.

    The signature is split on '.' and the lengths of the parts are joined
    with '.' again, so signatures with the same layout share one id.
    """
    part_lengths = [compat_str(len(part)) for part in example_sig.split('.')]
    return '.'.join(part_lengths)
1279
@classmethod
def _extract_player_info(cls, player_url):
    """Return the (ext, id) pair identifying the player behind player_url.

    The patterns in cls._PLAYER_INFO_RE are tried in order and the first
    one that matches supplies the 'ext' and 'id' groups.

    Raises ExtractorError when none of the patterns matches.
    """
    attempts = (
        re.search(player_re, player_url) for player_re in cls._PLAYER_INFO_RE)
    # The generator is lazy, so searching stops at the first hit
    id_m = next((m for m in attempts if m), None)
    if id_m is None:
        raise ExtractorError('Cannot identify player %r' % player_url)
    return id_m.group('ext'), id_m.group('id')
1289
1290 def _extract_signature_function(self, video_id, player_url, example_sig):
1291 player_type, player_id = self._extract_player_info(player_url)
1292
1293 # Read from filesystem cache
1294 func_id = '%s_%s_%s' % (
1295 player_type, player_id, self._signature_cache_id(example_sig))
1296 assert os.path.basename(func_id) == func_id
1297
1298 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
1299 if cache_spec is not None:
1300 return lambda s: ''.join(s[i] for i in cache_spec)
1301
1302 download_note = (
1303 'Downloading player %s' % player_url
1304 if self._downloader.params.get('verbose') else
1305 'Downloading %s player %s' % (player_type, player_id)
1306 )
1307 if player_type == 'js':
1308 code = self._download_webpage(
1309 player_url, video_id,
1310 note=download_note,
1311 errnote='Download of %s failed' % player_url)
1312 res = self._parse_sig_js(code)
1313 elif player_type == 'swf':
1314 urlh = self._request_webpage(
1315 player_url, video_id,
1316 note=download_note,
1317 errnote='Download of %s failed' % player_url)
1318 code = urlh.read()
1319 res = self._parse_sig_swf(code)
1320 else:
1321 assert False, 'Invalid player type %r' % player_type
1322
1323 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1324 cache_res = res(test_string)
1325 cache_spec = [ord(c) for c in cache_res]
1326
1327 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
1328 return res
1329
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function `func`.

    `func` is applied to a probe string whose characters are their own
    indices, the resulting permutation is compressed into index and slice
    expressions on `s`, and the generated snippet is written with
    self.to_screen().
    """
    def gen_sig_code(idxs):
        # Yields 's[...]' expressions that, concatenated, reproduce idxs.
        # Runs of consecutive indices (step +1 or -1) become one slice.
        # NOTE(review): with fewer than two indices the loop body never
        # runs and `i` is unbound at the final yield - confirm callers
        # never pass such a short signature.
        def _genslice(start, end, step):
            # Render the inclusive run start..end as a Python slice
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        step = None
        # Squelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    # Still inside the current run
                    continue
                # Run ended at prev: emit it and start afresh
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new run begins at prev
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        # Flush whatever is pending after the last pair
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    # Probe string: character k has code point k
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
1368
def _parse_sig_js(self, jscode):
    """Return the signature function found in the JavaScript player code.

    The name of the function is located with a list of regular expressions
    (tried in order), the function is extracted with JSInterpreter, and a
    wrapper taking the signature string as its only argument is returned.
    """
    signature_name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        # (same pattern as the previous entry, kept as-is)
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        signature_name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    decipher = JSInterpreter(jscode).extract_function(funcname)
    # The interpreter expects the call arguments as a list
    return lambda s: decipher([s])
1389
def _parse_sig_swf(self, file_contents):
    """Return the signature function found in a Flash (SWF) player.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    with SWFInterpreter and wrapped so that it takes the signature string
    as its only argument.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')
    # The interpreter expects the call arguments as a list
    return lambda s: decipher([s])
1396
1397 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1398 """Turn the encrypted s field into a working signature"""
1399
1400 if player_url is None:
1401 raise ExtractorError('Cannot decrypt signature without player_url')
1402
1403 if player_url.startswith('//'):
1404 player_url = 'https:' + player_url
1405 elif not re.match(r'https?://', player_url):
1406 player_url = compat_urlparse.urljoin(
1407 'https://www.youtube.com', player_url)
1408 try:
1409 player_id = (player_url, self._signature_cache_id(s))
1410 if player_id not in self._player_cache:
1411 func = self._extract_signature_function(
1412 video_id, player_url, s
1413 )
1414 self._player_cache[player_id] = func
1415 func = self._player_cache[player_id]
1416 if self._downloader.params.get('youtube_print_sig_code'):
1417 self._print_sig_code(func, s)
1418 return func(s)
1419 except Exception as e:
1420 tb = traceback.format_exc()
1421 raise ExtractorError(
1422 'Signature extraction failed: ' + tb, cause=e)
1423
1424 def _get_subtitles(self, video_id, webpage):
1425 try:
1426 subs_doc = self._download_xml(
1427 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1428 video_id, note=False)
1429 except ExtractorError as err:
1430 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1431 return {}
1432
1433 sub_lang_list = {}
1434 for track in subs_doc.findall('track'):
1435 lang = track.attrib['lang_code']
1436 if lang in sub_lang_list:
1437 continue
1438 sub_formats = []
1439 for ext in self._SUBTITLE_FORMATS:
1440 params = compat_urllib_parse_urlencode({
1441 'lang': lang,
1442 'v': video_id,
1443 'fmt': ext,
1444 'name': track.attrib['name'].encode('utf-8'),
1445 })
1446 sub_formats.append({
1447 'url': 'https://www.youtube.com/api/timedtext?' + params,
1448 'ext': ext,
1449 })
1450 sub_lang_list[lang] = sub_formats
1451 if not sub_lang_list:
1452 self._downloader.report_warning('video doesn\'t have subtitles')
1453 return {}
1454 return sub_lang_list
1455
1456 def _get_ytplayer_config(self, video_id, webpage):
1457 patterns = (
1458 # User data may contain arbitrary character sequences that may affect
1459 # JSON extraction with regex, e.g. when '};' is contained the second
1460 # regex won't capture the whole JSON. Yet working around by trying more
1461 # concrete regex first keeping in mind proper quoted string handling
1462 # to be implemented in future that will replace this workaround (see
1463 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1464 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1465 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1466 r';ytplayer\.config\s*=\s*({.+?});',
1467 )
1468 config = self._search_regex(
1469 patterns, webpage, 'ytplayer.config', default=None)
1470 if config:
1471 return self._parse_json(
1472 uppercase_escape(config), video_id, fatal=False)
1473
def _get_automatic_captions(self, video_id, webpage):
    """Build the automatic (translated) caption listing for a video.

    The already-downloaded watch page is passed in so that the player
    config can be read from it.

    Three sources are tried in order: the 'ttsurl' player argument, the
    'player_response' JSON, and the legacy 'caption_tracks' /
    'caption_translation_languages' arguments.

    Returns a dict mapping a language code to a list of {'url', 'ext'}
    dicts, one per entry of self._SUBTITLE_FORMATS. An empty dict is
    returned, after a warning, when no captions can be determined.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first <track> element supplies the source language
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            sub_lang_list = {}
            # Every <target> element is a language to translate into
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    params = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list

        def make_captions(sub_url, sub_langs):
            # Derive one URL per (language, format) from a base caption
            # URL by overriding its 'tlang' and 'fmt' query parameters.
            parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
            caption_qs = compat_parse_qs(parsed_sub_url.query)
            captions = {}
            for sub_lang in sub_langs:
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    caption_qs.update({
                        'tlang': [sub_lang],
                        'fmt': [ext],
                    })
                    # Rebinding sub_url is harmless here: the base URL
                    # was already parsed above.
                    sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                        query=compat_urllib_parse_urlencode(caption_qs, True)))
                    sub_formats.append({
                        'url': sub_url,
                        'ext': ext,
                    })
                captions[sub_lang] = sub_formats
            return captions

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                # Only the first caption track is used as the base URL
                base_url = renderer['captionTracks'][0]['baseUrl']
                sub_lang_list = []
                for lang in renderer['translationLanguages']:
                    lang_code = lang.get('languageCode')
                    if lang_code:
                        sub_lang_list.append(lang_code)
                return make_captions(base_url, sub_lang_list)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        sub_lang_list = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            sub_lang = lang_qs.get('lc', [None])[0]
            if sub_lang:
                sub_lang_list.append(sub_lang)
        return make_captions(caption_url, sub_lang_list)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1575
def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback-tracking URL so the video counts as watched.

    The URL is taken from player_response (playbackTracking ->
    videostatsPlaybackUrl -> baseUrl) or, failing that, from
    video_info['videostats_playback_base_url'][0]. Nothing happens when
    neither yields a valid URL. The request itself is non-fatal.
    """
    playback_url = url_or_none(
        try_get(
            player_response,
            lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
        or try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0]))
    if not playback_url:
        return
    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    # Pick each of the 16 characters uniformly from the 64-character
    # alphabet. The previous `random.randint(0, 256) & 63` drew from 257
    # values (randint includes its upper bound), slightly favouring 'a'.
    cpn = ''.join(random.choice(CPN_ALPHABET) for _ in range(16))

    qs.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1601
@staticmethod
def _extract_urls(webpage):
    """Return all YouTube embeds found in an arbitrary web page.

    Three kinds of embeds are recognised: player URLs referenced from
    iframe/embed/object tags, data-video-url attributes and SWFObject
    calls; lazyYT containers; and the Wordpress "YouTube Video Importer"
    plugin. The first kind yields URLs, the other two yield the raw
    attribute values. The result is a list in that order and may be empty.
    """
    # Embedded YouTube player.
    # The embedSWF alternative used to read `embedSWF\(?:\s*`, which
    # demands a literal ':' before the quoted URL and therefore never
    # matched a call such as swfobject.embedSWF("//www.youtube.com/v/...").
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(youtube_id)
        for youtube_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Each match is a (q1, q2, video id) tuple; only the id is wanted
    entries.extend(m[-1] for m in matches)

    return entries
1633
@staticmethod
def _extract_url(webpage):
    """Return the first YouTube embed found in webpage, or None."""
    for first_url in YoutubeIE._extract_urls(webpage):
        return first_url
    return None
1638
@classmethod
def extract_id(cls, url):
    """Return the video id (second group of cls._VALID_URL) found in url.

    Raises ExtractorError when url does not match cls._VALID_URL.
    """
    mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
    if mobj is not None:
        return mobj.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1646
1647 @staticmethod
1648 def _extract_chapters(description, duration):
1649 if not description:
1650 return None
1651 chapter_lines = re.findall(
1652 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1653 description)
1654 if not chapter_lines:
1655 return None
1656 chapters = []
1657 for next_num, (chapter_line, time_point) in enumerate(
1658 chapter_lines, start=1):
1659 start_time = parse_duration(time_point)
1660 if start_time is None:
1661 continue
1662 if start_time > duration:
1663 break
1664 end_time = (duration if next_num == len(chapter_lines)
1665 else parse_duration(chapter_lines[next_num][1]))
1666 if end_time is None:
1667 continue
1668 if end_time > duration:
1669 end_time = duration
1670 if start_time > end_time:
1671 break
1672 chapter_title = re.sub(
1673 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1674 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1675 chapters.append({
1676 'start_time': start_time,
1677 'end_time': end_time,
1678 'title': chapter_title,
1679 })
1680 return chapters
1681
def _real_extract(self, url):
    """Extract metadata and downloadable formats for one YouTube video.

    Downloads the watch page (and, for age-gated videos, the embed page
    and ``get_video_info``), gathers formats from the player response,
    legacy stream maps, HLS and DASH manifests, and scrapes the remaining
    metadata from the watch page HTML.

    Returns the info dict built at the end of the method.  Two early
    returns exist: a ``url_result`` for an unrented rental video with a
    preview (``ypc_vid``), and a ``playlist_result`` for multifeed videos
    unless ``noplaylist`` or the smuggled ``force_singlefeed`` flag is set.

    Raises ExtractorError when no video data or no usable format
    information can be found, or for rental, rtmpe and DRM protected
    videos.
    """
    url, smuggled_data = unsmuggle_url(url, {})

    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Start/end offsets may be given in either the fragment or the query.
    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage, urlh = self._download_webpage_handle(url, video_id)

    # Prefer the video id from the final URL of the response when present.
    qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
    video_id = qs.get('v', [None])[0] or video_id

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # DASH manifest URLs collected from every source, without duplicates.
    dash_mpds = []

    def add_dash_mpd(video_info):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    def add_dash_mpd_pr(pl_response):
        dash_mpd = url_or_none(try_get(
            pl_response, lambda x: x['streamingData']['dashManifestUrl'],
            compat_str))
        if dash_mpd and dash_mpd not in dash_mpds:
            dash_mpds.append(dash_mpd)

    is_live = None
    view_count = None

    def extract_view_count(v_info):
        return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))

    def extract_player_response(player_response, video_id):
        # Returns the parsed player response dict, or None when it is
        # missing or does not parse to a dict.
        pl_response = str_or_none(player_response)
        if not pl_response:
            return
        pl_response = self._parse_json(pl_response, video_id, fatal=False)
        if isinstance(pl_response, dict):
            add_dash_mpd_pr(pl_response)
            return pl_response

    player_response = {}

    # Get video info
    video_info = {}
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse_urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        try:
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note='Refetching age-gated info webpage',
                errnote='unable to download video info webpage')
        except ExtractorError:
            video_info_webpage = None
        if video_info_webpage:
            video_info = compat_parse_qs(video_info_webpage)
            pl_response = video_info.get('player_response', [None])[0]
            player_response = extract_player_response(pl_response, video_id)
            add_dash_mpd(video_info)
            view_count = extract_view_count(video_info)
    else:
        age_gate = False
        # Try looking directly into the video webpage
        ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
        if ytplayer_config:
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            # Rental video is not rented but preview is available (e.g.
            # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
            # https://github.com/ytdl-org/youtube-dl/issues/10532)
            if not video_info and args.get('ypc_vid'):
                return self.url_result(
                    args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
            if not player_response:
                player_response = extract_player_response(args.get('player_response'), video_id)
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            add_dash_mpd_pr(player_response)

    def extract_unavailable_message():
        # Joins the "unavailable" message and submessage found in the watch
        # page; implicitly returns None when neither is present.
        messages = []
        for tag, kind in (('h1', 'message'), ('div', 'submessage')):
            msg = self._html_search_regex(
                r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
                video_webpage, 'unavailable %s' % kind, default=None)
            if msg:
                messages.append(msg)
        if messages:
            return '\n'.join(messages)

    if not video_info and not player_response:
        unavailable_message = extract_unavailable_message()
        if not unavailable_message:
            unavailable_message = 'Unable to extract video data'
        raise ExtractorError(
            'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)

    if not isinstance(video_info, dict):
        video_info = {}

    video_details = try_get(
        player_response, lambda x: x['videoDetails'], dict) or {}

    video_title = video_info.get('title', [None])[0] or video_details.get('title')
    if not video_title:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    description_original = video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:

        def replace_url(m):
            # Replace a truncated link with its full target, unwrapping
            # YouTube /redirect URLs to the address in their "q" parameter.
            redir_url = compat_urlparse.urljoin(url, m.group(1))
            parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
            if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
                qs = compat_parse_qs(parsed_redir_url.query)
                q = qs.get('q')
                if q and q[0]:
                    return q[0]
            return redir_url

        description_original = video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]*"\s+)*?
                (?:title|href)="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]*"\s+)*?
                class="[^"]*"[^>]*>
            [^<]+\.{3}\s*
            </a>
        ''', replace_url, video_description)
        video_description = clean_html(video_description)
    else:
        video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription')

    if not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            multifeed_metadata_list = try_get(
                player_response,
                lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
                compat_str) or try_get(
                video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
            if multifeed_metadata_list:
                entries = []
                feed_ids = []
                for feed in multifeed_metadata_list.split(','):
                    # Unquote should take place before split on comma (,) since textual
                    # fields may contain comma as well (see
                    # https://github.com/ytdl-org/youtube-dl/issues/8536)
                    feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))

                    def feed_entry(name):
                        return try_get(feed_data, lambda x: x[name][0], compat_str)

                    feed_id = feed_entry('id')
                    if not feed_id:
                        continue
                    feed_title = feed_entry('title')
                    title = video_title
                    if feed_title:
                        title += ' (%s)' % feed_title
                    entries.append({
                        '_type': 'url_transparent',
                        'ie_key': 'Youtube',
                        'url': smuggle_url(
                            '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                            {'force_singlefeed': True}),
                        'title': title,
                    })
                    feed_ids.append(feed_id)
                self.to_screen(
                    'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                    % (', '.join(feed_ids), video_id))
                return self.playlist_result(entries, video_id, video_title, video_description)
        else:
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    if view_count is None:
        view_count = extract_view_count(video_info)
    if view_count is None and video_details:
        view_count = int_or_none(video_details.get('viewCount'))

    if is_live is None:
        is_live = bool_or_none(video_details.get('isLive'))

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)

    def _extract_filesize(media_url):
        return int_or_none(self._search_regex(
            r'\bclen[=/](\d+)', media_url, 'filesize', default=None))

    streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
    streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
        formats = []
        # Per-itag metadata, first from the legacy fmt_list and then
        # overridden by the player response streaming formats.
        formats_spec = {}
        fmt_list = video_info.get('fmt_list', [''])[0]
        if fmt_list:
            for fmt in fmt_list.split(','):
                spec = fmt.split('/')
                if len(spec) > 1:
                    width_height = spec[1].split('x')
                    if len(width_height) == 2:
                        formats_spec[spec[0]] = {
                            'resolution': spec[1],
                            'width': int_or_none(width_height[0]),
                            'height': int_or_none(width_height[1]),
                        }
        for fmt in streaming_formats:
            itag = str_or_none(fmt.get('itag'))
            if not itag:
                continue
            quality = fmt.get('quality')
            quality_label = fmt.get('qualityLabel') or quality
            formats_spec[itag] = {
                'asr': int_or_none(fmt.get('audioSampleRate')),
                'filesize': int_or_none(fmt.get('contentLength')),
                'format_note': quality_label,
                'fps': int_or_none(fmt.get('fps')),
                'height': int_or_none(fmt.get('height')),
                # bitrate for itag 43 is always 2147483647
                'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
                'width': int_or_none(fmt.get('width')),
            }

        for fmt in streaming_formats:
            if fmt.get('drmFamilies') or fmt.get('drm_families'):
                continue
            url = url_or_none(fmt.get('url'))

            if not url:
                # No plain URL: the URL and its signature data are packed
                # into the cipher query string.
                cipher = fmt.get('cipher') or fmt.get('signatureCipher')
                if not cipher:
                    continue
                url_data = compat_parse_qs(cipher)
                url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
                if not url:
                    continue
            else:
                cipher = None
                url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)

            stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
            # Unsupported FORMAT_STREAM_TYPE_OTF
            if stream_type == 3:
                continue

            format_id = fmt.get('itag') or url_data['itag'][0]
            if not format_id:
                continue
            format_id = compat_str(format_id)

            if cipher:
                if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
                    ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE,
                        embed_webpage if age_gate else video_webpage,
                        'JS player URL (1)', default=None)
                    if not jsplayer_url_json and not age_gate:
                        # We need the embed website after all
                        if embed_webpage is None:
                            embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                            embed_webpage = self._download_webpage(
                                embed_url, video_id, 'Downloading embed webpage')
                        jsplayer_url_json = self._search_regex(
                            ASSETS_RE, embed_webpage, 'JS player URL')

                    player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)

                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]

                    if self._downloader.params.get('verbose'):
                        if player_url is None:
                            player_desc = 'unknown'
                        else:
                            player_type, player_version = self._extract_player_info(player_url)
                            player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
                        parts_sizes = self._signature_cache_id(encrypted_sig)
                        self.to_screen('{%s} signature length %s, %s' %
                                       (format_id, parts_sizes, player_desc))

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
                    url += '&%s=%s' % (sp, signature)
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'

            dct = {
                'format_id': format_id,
                'url': url,
                'player_url': player_url,
            }
            if format_id in self._formats:
                dct.update(self._formats[format_id])
            if format_id in formats_spec:
                dct.update(formats_spec[format_id])

            # Some itags are not included in DASH manifest thus corresponding formats will
            # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
            # Trying to extract metadata from url_encoded_fmt_stream_map entry.
            mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
            width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

            if width is None:
                width = int_or_none(fmt.get('width'))
            if height is None:
                height = int_or_none(fmt.get('height'))

            filesize = int_or_none(url_data.get(
                'clen', [None])[0]) or _extract_filesize(url)

            quality = url_data.get('quality', [None])[0] or fmt.get('quality')
            quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')

            tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
                   or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
            fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))

            # Only truthy values override what is already in dct.
            more_fields = {
                'filesize': filesize,
                'tbr': tbr,
                'width': width,
                'height': height,
                'fps': fps,
                'format_note': quality_label or quality,
            }
            for key, value in more_fields.items():
                if value:
                    dct[key] = value
            type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
            if type_:
                type_split = type_.split(';')
                kind_ext = type_split[0].split('/')
                if len(kind_ext) == 2:
                    kind, _ = kind_ext
                    dct['ext'] = mimetype2ext(type_split[0])
                    if kind in ('audio', 'video'):
                        codecs = None
                        for mobj in re.finditer(
                                r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
                            if mobj.group('key') == 'codecs':
                                codecs = mobj.group('val')
                                break
                        if codecs:
                            dct.update(parse_codecs(codecs))
            if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
                dct['downloader_options'] = {
                    # Youtube throttles chunks >~10M
                    'http_chunk_size': 10485760,
                }
            formats.append(dct)
    else:
        manifest_url = (
            url_or_none(try_get(
                player_response,
                lambda x: x['streamingData']['hlsManifestUrl'],
                compat_str))
            or url_or_none(try_get(
                video_info, lambda x: x['hlsvp'][0], compat_str)))
        if manifest_url:
            formats = []
            m3u8_formats = self._extract_m3u8_formats(
                manifest_url, video_id, 'mp4', fatal=False)
            for a_format in m3u8_formats:
                itag = self._search_regex(
                    r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
                if itag:
                    a_format['format_id'] = itag
                    if itag in self._formats:
                        dct = self._formats[itag].copy()
                        dct.update(a_format)
                        a_format = dct
                a_format['player_url'] = player_url
                # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
                a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
                formats.append(a_format)
        else:
            error_message = extract_unavailable_message()
            if not error_message:
                error_message = clean_html(try_get(
                    player_response, lambda x: x['playabilityStatus']['reason'],
                    compat_str))
            if not error_message:
                error_message = clean_html(
                    try_get(video_info, lambda x: x['reason'][0], compat_str))
            if error_message:
                raise ExtractorError(error_message, expected=True)
            raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')

    # uploader
    video_uploader = try_get(
        video_info, lambda x: x['author'][0],
        compat_str) or str_or_none(video_details.get('author'))
    if video_uploader:
        video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
    else:
        self._downloader.report_warning('unable to extract uploader name')

    # uploader_id
    video_uploader_id = None
    video_uploader_url = None
    mobj = re.search(
        r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
        video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group('uploader_id')
        video_uploader_url = mobj.group('uploader_url')
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    channel_id = (
        str_or_none(video_details.get('channelId'))
        or self._html_search_meta(
            'channelId', video_webpage, 'channel id', default=None)
        or self._search_regex(
            r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
            video_webpage, 'channel id', default=None, group='id'))
    channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
            video_webpage, 'upload date', default=None)
    upload_date = unified_strdate(upload_date)

    video_license = self._html_search_regex(
        r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
        video_webpage, 'license', default=None)

    m_music = re.search(
        r'''(?x)
            <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
            <ul[^>]*>\s*
            <li>(?P<title>.+?)
            by (?P<creator>.+?)
            (?:
                \(.+?\)|
                <a[^>]*
                    (?:
                        \bhref=["\']/red[^>]*>|             # drop possible
                        >\s*Listen ad-free with YouTube Red # YouTube Red ad
                    )
                .*?
            )?</li
        ''',
        video_webpage)
    if m_music:
        video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
        video_creator = clean_html(m_music.group('creator'))
    else:
        video_alt_title = video_creator = None

    def extract_meta(field):
        return self._html_search_regex(
            r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
            video_webpage, field, default=None)

    track = extract_meta('Song')
    artist = extract_meta('Artist')
    album = extract_meta('Album')

    # Youtube Music Auto-generated description
    release_date = release_year = None
    if video_description:
        mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
        if mobj:
            if not track:
                track = mobj.group('track').strip()
            if not artist:
                artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
            if not album:
                # Strip the captured album text, not the group name.
                album = mobj.group('album').strip()
            release_year = mobj.group('release_year')
            release_date = mobj.group('release_date')
            if release_date:
                release_date = release_date.replace('-', '')
                if not release_year:
                    release_year = int(release_date[:4])
            if release_year:
                release_year = int(release_year)

    m_episode = re.search(
        r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
        video_webpage)
    if m_episode:
        series = unescapeHTML(m_episode.group('series'))
        season_number = int(m_episode.group('season'))
        episode_number = int(m_episode.group('episode'))
    else:
        series = season_number = episode_number = None

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    video_tags = [
        unescapeHTML(m.group('content'))
        for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    if view_count is None:
        view_count = str_to_int(self._search_regex(
            r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
            'view count', default=None))

    average_rating = (
        float_or_none(video_details.get('averageRating'))
        or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    video_duration = try_get(
        video_info, lambda x: int_or_none(x['length_seconds'][0]))
    if not video_duration:
        video_duration = int_or_none(video_details.get('lengthSeconds'))
    if not video_duration:
        video_duration = parse_duration(self._html_search_meta(
            'duration', video_webpage, 'video duration'))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        xsrf_token = self._search_regex(
            r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
            video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
        invideo_url = try_get(
            player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
        if xsrf_token and invideo_url:
            xsrf_field_name = self._search_regex(
                r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
                video_webpage, 'xsrf field name',
                group='xsrf_field_name', default='session_token')
            video_annotations = self._download_webpage(
                self._proto_relative_url(invideo_url),
                video_id, note='Downloading annotations',
                errnote='Unable to download video annotations', fatal=False,
                data=urlencode_postdata({xsrf_field_name: xsrf_token}))

    chapters = self._extract_chapters(description_original, video_duration)

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for mpd_url in dash_mpds:
            dash_formats = {}
            try:
                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s

                mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

                for df in self._extract_mpd_formats(
                        mpd_url, video_id, fatal=dash_mpd_fatal,
                        formats_dict=self._formats):
                    if not df.get('filesize'):
                        df['filesize'] = _extract_filesize(df['url'])
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                    # Additional DASH manifests may end up in HTTP Error 403 therefore
                    # allow them to fail without bug report message if we already have
                    # some DASH manifest succeeded. This is temporary workaround to reduce
                    # burst of bug reports until we figure out the reason and whether it
                    # can be fixed at all.
                    dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        w = float(stretched_m.group('w'))
        h = float(stretched_m.group('h'))
        # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
        # We will only process correct ratios.
        if w > 0 and h > 0:
            ratio = w / h
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

    if not formats:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta(
                    'regionsAllowed', video_webpage, default=None)
                countries = regions_allowed.split(',') if regions_allowed else None
                self.raise_geo_restricted(
                    msg=video_info['reason'][0], countries=countries)
            reason = video_info['reason'][0]
            if 'Invalid parameters' in reason:
                unavailable_message = extract_unavailable_message()
                if unavailable_message:
                    reason = unavailable_message
            raise ExtractorError(
                'YouTube said: %s' % reason,
                expected=True, video_id=video_id)
        if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
            raise ExtractorError('This video is DRM protected.', expected=True)

    self._sort_formats(formats)

    self.mark_watched(video_id, video_info, player_response)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'uploader_url': video_uploader_url,
        'channel_id': channel_id,
        'channel_url': channel_url,
        'upload_date': upload_date,
        'license': video_license,
        'creator': video_creator or artist,
        'title': video_title,
        'alt_title': video_alt_title or track,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'tags': video_tags,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'chapters': chapters,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': average_rating,
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
        'series': series,
        'season_number': season_number,
        'episode_number': episode_number,
        'track': track,
        'artist': artist,
        'album': album,
        'release_date': release_date,
        'release_year': release_year,
    }
2438
2439
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extract the entries of a YouTube playlist.

    ``_VALID_URL`` accepts playlist-bearing URLs on youtube.com,
    youtubekids.com, invidio.us and youtu.be, as well as a bare playlist id.
    Playlist ids starting with ``RD``, ``UL`` or ``PU`` are handled by
    ``_extract_mix``; everything else goes through ``_extract_playlist``.
    """

    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                                (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                                \? (?:.*?[&;])*? (?:p|a|list)=
                            | p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
            'title': 'youtube-dl public playlist',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
            'title': 'youtube-dl empty playlist',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 96,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def extract_videos_from_page(self, page):
        """Collect (video id, title) pairs found in the HTML of *page*.

        Tags carrying a ``data-video-id`` attribute are read first; after
        that three fallback patterns are handed, in order, to
        ``extract_videos_from_page_impl`` together with the lists built so
        far (that helper lives outside this class; presumably it appends
        the ids/titles it finds to those lists -- confirm against the base
        class).

        Returns the result of zipping the id list with the title list.
        """
        found_ids = []
        found_titles = []

        tag_pattern = r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)'
        for tag in re.findall(tag_pattern, page):
            tag_attrs = extract_attributes(tag)
            found_id = tag_attrs['data-video-id']
            found_title = unescapeHTML(tag_attrs.get('data-title'))
            # Only strip real titles; a missing/empty title is kept as is
            found_ids.append(found_id)
            found_titles.append(found_title.strip() if found_title else found_title)

        fallback_patterns = (
            # Fallback with old _VIDEO_RE
            self._VIDEO_RE,
            # Relaxed fallbacks
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
        )
        for pattern in fallback_patterns:
            self.extract_videos_from_page_impl(
                pattern, page, found_ids, found_titles)

        return zip(found_ids, found_titles)

    def _extract_mix(self, playlist_id):
        """Extract a mix by walking watch pages until no new video shows up.

        The mixes are generated from a single video; the id of the playlist
        is just 'RD' + video_id, so the last 11 characters of *playlist_id*
        seed the first request. Each following request uses the last video
        id collected so far.
        """
        collected_ids = []
        current_id = playlist_id[-11:]
        entry_pattern = r'''(?xs)data-video-username=".*?".*?
                   href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)

        for page_number in itertools.count(1):
            webpage = self._download_webpage(
                'https://youtube.com/watch?v=%s&list=%s' % (current_id, playlist_id),
                playlist_id,
                'Downloading page {0} of Youtube mix'.format(page_number))
            # Fetch new pages until all the videos are repeated, it seems
            # that there are always 51 unique videos.
            unseen_ids = [
                candidate
                for candidate in orderedSet(re.findall(entry_pattern, webpage))
                if candidate not in collected_ids]
            if not unseen_ids:
                break
            collected_ids.extend(unseen_ids)
            current_id = collected_ids[-1]

        entries = self._ids_to_results(collected_ids)

        # The title is looked up in the last downloaded page; the first
        # class name that yields a truthy element wins
        title_element = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_element = get_element_by_attribute('class', class_name, webpage)
            if title_element:
                break

        return self.playlist_result(entries, playlist_id, clean_html(title_element))

    def _extract_playlist(self, playlist_id):
        """Download the playlist page and build the playlist result.

        Returns a ``(has_videos, playlist)`` tuple. ``has_videos`` is False
        only when the page has no playlist title and ``_entries`` yields
        nothing.

        Raises ExtractorError (expected) when an alert on the page says the
        playlist does not exist, is private, or the parameters are invalid.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            alert = alert.strip()
            # Check if the playlist exists or is private
            unavailable = re.match(
                r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
            if unavailable:
                reason = unavailable.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            if re.match(r'[^<]*Invalid parameters[^<]*', alert):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            if re.match(r'[^<]*Choose your language[^<]*', alert):
                continue
            self.report_warning('Youtube gives an alert message: ' + alert)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)

        uploader_id = uploader_url = None
        uploader_link = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if uploader_link:
            uploader_id = uploader_link.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, uploader_link.group('path'))

        has_videos = True
        if not playlist_title:
            # Some playlist URLs don't actually serve a playlist (e.g.
            # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
            nothing = object()
            has_videos = next(self._entries(page, playlist_id), nothing) is not nothing

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist['uploader'] = uploader
        playlist['uploader_id'] = uploader_id
        playlist['uploader_url'] = uploader_url

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Decide whether *url* should yield a single video.

        Returns ``(video_id, result)``: ``(None, None)`` when *url* names no
        video, ``(video_id, url_result)`` when the ``noplaylist`` param is
        set, and ``(video_id, None)`` otherwise.
        """
        # Check if it's a video-specific URL
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query.get('v', [None])[0]
        if not video_id:
            video_id = self._search_regex(
                r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
                'video id', default=None)
        if not video_id:
            return None, None

        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    def _real_extract(self, url):
        """Resolve *url* to a playlist, a mix, or a single video result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if video_id and not has_videos:
            # Some playlist URLs don't actually serve a playlist (see
            # https://github.com/ytdl-org/youtube-dl/issues/10537).
            # Fallback to plain video extraction if there is a video id
            # along with playlist id.
            return self.url_result(video_id, 'Youtube', video_id=video_id)

        return playlist
2801
2802
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extract the videos of a YouTube channel addressed by ``/channel/<id>``."""

    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that the playlists or live extractors accept."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the channel's videos page URL built from ``_TEMPLATE_URL``."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract a channel, preferring its uploads playlist when available.

        Raises ExtractorError (expected) when the channel page lists no
        entries and carries an alert message.
        """
        channel_id = self._match_id(url)
        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        probe_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)

        channel_playlist_id = False
        if probe_page is not False:
            channel_playlist_id = self._html_search_meta(
                'channelId', probe_page, 'channel id', default=None)
            if not channel_playlist_id:
                app_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    probe_page, 'channel url', default=None)
                if app_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        app_url, 'channel id', default=None)

        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the 'UC' prefix for 'UU' to address the uploads playlist
            uploads_url = compat_urlparse.urljoin(
                url, '/playlist?list=%s' % ('UU' + channel_playlist_id[2:]))
            return self.url_result(uploads_url, 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = []
            for video_id, video_title in self.extract_videos_from_page(channel_page):
                entries.append(self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title))
            return self.playlist_result(entries, channel_id)

        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            # No entry at all: surface the alert shown on the page, if any
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
2902
2903
class YoutubeUserIE(YoutubeChannelIE):
    """Extract a user's videos from ``/user/``, ``/c/`` or bare-name URLs,
    or from the ``ytuser:`` keyword."""

    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only if no other Youtube*IE in this module does.

        The regex of this extractor is too permissive, so any URL that
        another module-level class named ``Youtube...IE`` reports as
        suitable is rejected here.
        """
        for name, klass in globals().items():
            if klass is cls:
                continue
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Build the videos page URL from the path kind and id in *url*.

        The path kind is the ``user`` group of ``_VALID_URL`` and falls
        back to 'user' when that group did not match.
        """
        mobj = re.match(self._VALID_URL, url)
        path_kind = mobj.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, mobj.group('id'))
2958
2959
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Resolve a ``.../live`` URL to the video it currently points at."""

    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a result for the live video, or for the base URL.

        The base URL (the part of *url* before ``/live``) is used whenever
        the page cannot be downloaded or does not expose a video page type
        together with an 11-character ``videoId`` meta value.
        """
        channel_id, base_url = re.match(self._VALID_URL, url).group('id', 'base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)

        points_at_video = (
            page_type.startswith('video')
            and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if points_at_video:
            return self.url_result(video_id, YoutubeIE.ie_key())

        return self.url_result(base_url)
3010
3011
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Match the ``/playlists`` page of a user or channel.

    Only matching data and tests are declared here; no extraction method
    is defined in this class.
    """

    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            # Cyrillic title; the file declares utf-8 encoding
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }]
3041
3042
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base of the search extractors; only overrides ``_VIDEO_RE``."""

    # Captures the 11-character video id and, when a title attribute
    # follows in the same tag, the title as well
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
3045
3046
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Run a YouTube search for the ``ytsearch`` keyword."""

    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded one after another, following the
        "Next" link of each page, until *n* results are collected, a page
        yields no result, or no next link is found.

        Returns a playlist result holding at most *n* entries.

        Raises ExtractorError (expected) when a page carries a search
        message instead of results.
        """
        videos = []

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as n results are available: comparing with '>'
            # here would download one more page when exactly n results
            # have been collected, only for it to be thrown away below
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # Guarded on purpose: n may be float('inf') (see _MAX_RESULTS),
        # which cannot be used as a slice bound
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
3095
3096
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search variant for the ``ytsearchdate`` keyword.

    Differs from the parent only by the extra ``search_sort`` query
    argument and its name/description.
    """

    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
3102
3103
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extract the results listed on a YouTube search results URL."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download *url* and return its entries, titled with the query.

        The query is the URL-decoded ``search_query``/``q`` parameter; it
        also serves as the id passed to the page download.
        """
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)
3124
3125
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extract a show by delegating to its ``/playlists`` page."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Run the parent extraction on the show's playlists page URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
3143
3144
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass's ``_FEED_NAME``."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _entries(self, page):
        """Yield results for every new video id found in the feed.

        Starts with *page* and keeps following the "load more" link until
        a portion brings no new id or no further link is found.
        """
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        known_ids = []
        load_more_html = portion_html = page
        for page_num in itertools.count(1):
            portion_ids = orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', portion_html))

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            fresh_ids = [
                video_id for video_id in portion_ids
                if video_id not in known_ids]
            if not fresh_ids:
                break

            known_ids.extend(fresh_ids)

            for entry in self._ids_to_results(fresh_ids):
                yield entry

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', load_more_html)
            if not more_link:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            portion_html = more['content_html']
            load_more_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page and return its entries as a playlist."""
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        page = self._download_webpage(feed_url, self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
3196
3197
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extract the watch later list, always using the playlist id 'WL'."""

    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single video when one is selected, else the playlist."""
        single_video = self._check_download_just_video(url, 'WL')[1]
        if single_video:
            return single_video
        return self._extract_playlist('WL')[1]
3217
3218
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its playlist."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and delegate to
        the 'YoutubePlaylist' extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
3229
3230
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
3236
3237
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""

    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
3243
3244
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""

    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
3250
3251
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that carry no video id and report
    them as probably unquoted on the command line."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining how to quote
        the URL."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(quoting_hint, expected=True)
3299
3300
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose video id is 1 to 10 characters long and
    report them as truncated."""

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)
3315 expected=True)