]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youtube.py
debian/control: Update list of extractors in long description.
[youtubedl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_kwargs,
20 compat_parse_qs,
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
23 compat_urllib_parse_urlencode,
24 compat_urllib_parse_urlparse,
25 compat_urlparse,
26 compat_str,
27 )
28 from ..utils import (
29 clean_html,
30 error_to_compat_str,
31 ExtractorError,
32 float_or_none,
33 get_element_by_attribute,
34 get_element_by_id,
35 int_or_none,
36 mimetype2ext,
37 orderedSet,
38 parse_codecs,
39 parse_duration,
40 qualities,
41 remove_quotes,
42 remove_start,
43 smuggle_url,
44 str_to_int,
45 try_get,
46 unescapeHTML,
47 unified_strdate,
48 unsmuggle_url,
49 uppercase_escape,
50 urlencode_postdata,
51 )
52
53
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints of Google's JSON sign-in flow ('GlifWebSignIn')
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        # Force the English interface so HTML scraping regexes keep matching.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Turn a list of video IDs into url_result dicts for the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST a request in the undocumented 'f.req' format used by the
            # Google accounts sign-in flow; the response is JSON prefixed by
            # an anti-XSSI preamble which transform_source strips off.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Positional JSON structure expected by the lookup endpoint; the
        # meaning of most slots is unknown (reverse engineered).
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The conditional must be parenthesized: % binds tighter than
            # if/else, so without parentheses the 'Unable to login:' prefix
            # was dropped for any message other than INCORRECT_ANSWER_ENTERED.
            warn(
                'Unable to login: %s' % ('Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesized for the same precedence reason as the
                    # 'Unable to login' warning above.
                    warn(
                        'Unable to finish TFA: %s' % ('Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # A successful login redirects through a page referencing myaccount
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Request the legacy (non-Polymer) UI so the HTML-scraping regexes
        # in the extractors keep working.
        kwargs.setdefault('query', {})['disable_polymer'] = 'true'
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
272
273
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from a page that paginates via a "Load more" button
    def _entries(self, page, playlist_id):
        """Yield entries from the initial page, following "Load more" links."""
        widget_html = page
        current_html = page
        page_num = 0
        while True:
            page_num += 1
            for entry in self._process_page(current_html):
                yield entry

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                break

            payload = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            current_html = payload['content_html']
            if not current_html.strip():
                # A "Load more" button may be present even when there are
                # no further videos to fetch
                break
            widget_html = payload['load_more_widget_html']
296
297
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Shared logic for extractors that scrape video entries off a playlist page."""

    def _process_page(self, content):
        # Wrap each (id, title) pair into a url_result handled by YoutubeIE
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Return (video_id, title) pairs matched by the subclass' _VIDEO_RE.

        Duplicate IDs are collapsed; a later match may fill in a title that
        an earlier duplicate was missing.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # Fixed: compare the 'index' group, not the video 'id' (an
            # 11-char video ID can never equal '0', so the guard never fired).
            if 'index' in mobj.groupdict() and mobj.group('index') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)
322
323
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Shared logic for extractors of pages that list multiple playlists."""

    def _process_page(self, content):
        # Collect unique playlist IDs from lockup-title links, keeping
        # their order of appearance on the page
        playlist_ids = orderedSet(re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content))
        for playlist_id in playlist_ids:
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        page = self._download_webpage(url, playlist_id)
        page_title = self._og_search_title(page, fatal=False)
        entries = self._entries(page, playlist_id)
        return self.playlist_result(entries, playlist_id, page_title)
337
338
339 class YoutubeIE(YoutubeBaseInfoExtractor):
340 IE_DESC = 'YouTube.com'
341 _VALID_URL = r"""(?x)^
342 (
343 (?:https?://|//) # http(s):// or protocol-independent URL
344 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
345 (?:www\.)?deturl\.com/www\.youtube\.com/|
346 (?:www\.)?pwnyoutube\.com/|
347 (?:www\.)?hooktube\.com/|
348 (?:www\.)?yourepeat\.com/|
349 tube\.majestyc\.net/|
350 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
351 (?:.*?\#/)? # handle anchor (#/) redirect urls
352 (?: # the various things that can precede the ID:
353 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
354 |(?: # or the v= param in all its forms
355 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
356 (?:\?|\#!?) # the params delimiter ? or # or #!
357 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
358 v=
359 )
360 ))
361 |(?:
362 youtu\.be| # just youtu.be/xxxx
363 vid\.plus| # or vid.plus/xxxx
364 zwearz\.com/watch| # or zwearz.com/watch/xxxx
365 )/
366 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
367 )
368 )? # all until now is optional -> you can pass the naked ID
369 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
370 (?!.*?\blist=
371 (?:
372 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
373 WL # WL are handled by the watch later IE
374 )
375 )
376 (?(1).+)? # if we found the ID, everything can follow
377 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
378 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
379 _formats = {
380 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
381 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
382 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
383 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
384 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
385 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
386 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
387 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
388 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
389 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
390 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
391 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
392 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
393 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
394 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
395 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
396 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
397 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
398
399
400 # 3D videos
401 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
402 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
403 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
404 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
405 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
406 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
407 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
408
409 # Apple HTTP Live Streaming
410 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
411 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
412 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
413 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
414 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
415 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
416 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
417 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
418
419 # DASH mp4 video
420 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
421 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
422 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
423 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
424 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
425 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
426 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
427 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
428 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
429 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
430 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
431 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
432
433 # Dash mp4 audio
434 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
435 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
436 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
437 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
438 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
439 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
440 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
441
442 # Dash webm
443 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
444 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
445 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
446 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
447 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
448 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
449 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
450 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
451 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
452 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
453 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
454 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
455 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
456 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
457 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
458 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
459 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
460 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
461 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
462 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
463 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
464 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
465
466 # Dash webm audio
467 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
468 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
469
470 # Dash webm audio with opus inside
471 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
472 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
473 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
474
475 # RTMP (unnamed)
476 '_rtmp': {'protocol': 'rtmp'},
477 }
478 _SUBTITLE_FORMATS = ('ttml', 'vtt')
479
480 _GEO_BYPASS = False
481
482 IE_NAME = 'youtube'
483 _TESTS = [
484 {
485 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
486 'info_dict': {
487 'id': 'BaW_jenozKc',
488 'ext': 'mp4',
489 'title': 'youtube-dl test video "\'/\\Ƥā†­š•',
490 'uploader': 'Philipp Hagemeister',
491 'uploader_id': 'phihag',
492 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
493 'upload_date': '20121002',
494 'license': 'Standard YouTube License',
495 'description': 'test chars: "\'/\\Ƥā†­š•\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
496 'categories': ['Science & Technology'],
497 'tags': ['youtube-dl'],
498 'duration': 10,
499 'like_count': int,
500 'dislike_count': int,
501 'start_time': 1,
502 'end_time': 9,
503 }
504 },
505 {
506 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
507 'note': 'Test generic use_cipher_signature video (#897)',
508 'info_dict': {
509 'id': 'UxxajLWwzqY',
510 'ext': 'mp4',
511 'upload_date': '20120506',
512 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
513 'alt_title': 'I Love It (feat. Charli XCX)',
514 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
515 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
516 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
517 'iconic ep', 'iconic', 'love', 'it'],
518 'duration': 180,
519 'uploader': 'Icona Pop',
520 'uploader_id': 'IconaPop',
521 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
522 'license': 'Standard YouTube License',
523 'creator': 'Icona Pop',
524 'track': 'I Love It (feat. Charli XCX)',
525 'artist': 'Icona Pop',
526 }
527 },
528 {
529 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
530 'note': 'Test VEVO video with age protection (#956)',
531 'info_dict': {
532 'id': '07FYdnEawAQ',
533 'ext': 'mp4',
534 'upload_date': '20130703',
535 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
536 'alt_title': 'Tunnel Vision',
537 'description': 'md5:64249768eec3bc4276236606ea996373',
538 'duration': 419,
539 'uploader': 'justintimberlakeVEVO',
540 'uploader_id': 'justintimberlakeVEVO',
541 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
542 'license': 'Standard YouTube License',
543 'creator': 'Justin Timberlake',
544 'track': 'Tunnel Vision',
545 'artist': 'Justin Timberlake',
546 'age_limit': 18,
547 }
548 },
549 {
550 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
551 'note': 'Embed-only video (#1746)',
552 'info_dict': {
553 'id': 'yZIXLfi8CZQ',
554 'ext': 'mp4',
555 'upload_date': '20120608',
556 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
557 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
558 'uploader': 'SET India',
559 'uploader_id': 'setindia',
560 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
561 'license': 'Standard YouTube License',
562 'age_limit': 18,
563 }
564 },
565 {
566 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
567 'note': 'Use the first video ID in the URL',
568 'info_dict': {
569 'id': 'BaW_jenozKc',
570 'ext': 'mp4',
571 'title': 'youtube-dl test video "\'/\\Ƥā†­š•',
572 'uploader': 'Philipp Hagemeister',
573 'uploader_id': 'phihag',
574 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
575 'upload_date': '20121002',
576 'license': 'Standard YouTube License',
577 'description': 'test chars: "\'/\\Ƥā†­š•\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
578 'categories': ['Science & Technology'],
579 'tags': ['youtube-dl'],
580 'duration': 10,
581 'like_count': int,
582 'dislike_count': int,
583 },
584 'params': {
585 'skip_download': True,
586 },
587 },
588 {
589 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
590 'note': '256k DASH audio (format 141) via DASH manifest',
591 'info_dict': {
592 'id': 'a9LDPn-MO4I',
593 'ext': 'm4a',
594 'upload_date': '20121002',
595 'uploader_id': '8KVIDEO',
596 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
597 'description': '',
598 'uploader': '8KVIDEO',
599 'license': 'Standard YouTube License',
600 'title': 'UHDTV TEST 8K VIDEO.mp4'
601 },
602 'params': {
603 'youtube_include_dash_manifest': True,
604 'format': '141',
605 },
606 'skip': 'format 141 not served anymore',
607 },
608 # DASH manifest with encrypted signature
609 {
610 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
611 'info_dict': {
612 'id': 'IB3lcPjvWLA',
613 'ext': 'm4a',
614 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
615 'description': 'md5:1900ed86ee514927b9e00fbead6969a5',
616 'duration': 244,
617 'uploader': 'AfrojackVEVO',
618 'uploader_id': 'AfrojackVEVO',
619 'upload_date': '20131011',
620 'license': 'Standard YouTube License',
621 },
622 'params': {
623 'youtube_include_dash_manifest': True,
624 'format': '141/bestaudio[ext=m4a]',
625 },
626 },
627 # JS player signature function name containing $
628 {
629 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
630 'info_dict': {
631 'id': 'nfWlot6h_JM',
632 'ext': 'm4a',
633 'title': 'Taylor Swift - Shake It Off',
634 'alt_title': 'Shake It Off',
635 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
636 'duration': 242,
637 'uploader': 'TaylorSwiftVEVO',
638 'uploader_id': 'TaylorSwiftVEVO',
639 'upload_date': '20140818',
640 'license': 'Standard YouTube License',
641 'creator': 'Taylor Swift',
642 },
643 'params': {
644 'youtube_include_dash_manifest': True,
645 'format': '141/bestaudio[ext=m4a]',
646 },
647 },
648 # Controversy video
649 {
650 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
651 'info_dict': {
652 'id': 'T4XJQO3qol8',
653 'ext': 'mp4',
654 'duration': 219,
655 'upload_date': '20100909',
656 'uploader': 'TJ Kirk',
657 'uploader_id': 'TheAmazingAtheist',
658 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
659 'license': 'Standard YouTube License',
660 'title': 'Burning Everyone\'s Koran',
661 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
662 }
663 },
664 # Normal age-gate video (No vevo, embed allowed)
665 {
666 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
667 'info_dict': {
668 'id': 'HtVdAasjOgU',
669 'ext': 'mp4',
670 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
671 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
672 'duration': 142,
673 'uploader': 'The Witcher',
674 'uploader_id': 'WitcherGame',
675 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
676 'upload_date': '20140605',
677 'license': 'Standard YouTube License',
678 'age_limit': 18,
679 },
680 },
681 # Age-gate video with encrypted signature
682 {
683 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
684 'info_dict': {
685 'id': '6kLq3WMV1nU',
686 'ext': 'webm',
687 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
688 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
689 'duration': 246,
690 'uploader': 'LloydVEVO',
691 'uploader_id': 'LloydVEVO',
692 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
693 'upload_date': '20110629',
694 'license': 'Standard YouTube License',
695 'age_limit': 18,
696 },
697 },
698 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
699 # YouTube Red ad is not captured for creator
700 {
701 'url': '__2ABJjxzNo',
702 'info_dict': {
703 'id': '__2ABJjxzNo',
704 'ext': 'mp4',
705 'duration': 266,
706 'upload_date': '20100430',
707 'uploader_id': 'deadmau5',
708 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
709 'creator': 'deadmau5',
710 'description': 'md5:12c56784b8032162bb936a5f76d55360',
711 'uploader': 'deadmau5',
712 'license': 'Standard YouTube License',
713 'title': 'Deadmau5 - Some Chords (HD)',
714 'alt_title': 'Some Chords',
715 },
716 'expected_warnings': [
717 'DASH manifest missing',
718 ]
719 },
720 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
721 {
722 'url': 'lqQg6PlCWgI',
723 'info_dict': {
724 'id': 'lqQg6PlCWgI',
725 'ext': 'mp4',
726 'duration': 6085,
727 'upload_date': '20150827',
728 'uploader_id': 'olympic',
729 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
730 'license': 'Standard YouTube License',
731 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
732 'uploader': 'Olympic',
733 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
734 },
735 'params': {
736 'skip_download': 'requires avconv',
737 }
738 },
739 # Non-square pixels
740 {
741 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
742 'info_dict': {
743 'id': '_b-2C3KPAM0',
744 'ext': 'mp4',
745 'stretched_ratio': 16 / 9.,
746 'duration': 85,
747 'upload_date': '20110310',
748 'uploader_id': 'AllenMeow',
749 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
750 'description': 'made by Wacom from Korea | 字幕&åŠ ę²¹ę·»é†‹ by TY\'s Allen | ę„Ÿč¬heylisa00cavey1001同å­øē†±ęƒ…ęä¾›ę¢—åŠēæ»č­Æ',
751 'uploader': 'å­«į„‹į„…',
752 'license': 'Standard YouTube License',
753 'title': '[A-made] č®Šę…‹å¦å­—å¹•ē‰ˆ å¤Ŗ妍 ęˆ‘å°±ę˜Æ這ęØ£ēš„äŗŗ',
754 },
755 },
756 # url_encoded_fmt_stream_map is empty string
757 {
758 'url': 'qEJwOuvDf7I',
759 'info_dict': {
760 'id': 'qEJwOuvDf7I',
761 'ext': 'webm',
762 'title': 'ŠžŠ±ŃŃƒŠ¶Š“ŠµŠ½ŠøŠµ суŠ“ŠµŠ±Š½Š¾Š¹ ŠæрŠ°ŠŗтŠøŠŗŠø ŠæŠ¾ Š²Ń‹Š±Š¾Ń€Š°Š¼ 14 сŠµŠ½Ń‚яŠ±Ń€Ń 2014 Š³Š¾Š“Š° Š² Š”Š°Š½Šŗт-ŠŸŠµŃ‚ŠµŃ€Š±ŃƒŃ€Š³Šµ',
763 'description': '',
764 'upload_date': '20150404',
765 'uploader_id': 'spbelect',
766 'uploader': 'ŠŠ°Š±Š»ŃŽŠ“Š°Ń‚ŠµŠ»Šø ŠŸŠµŃ‚ŠµŃ€Š±ŃƒŃ€Š³Š°',
767 },
768 'params': {
769 'skip_download': 'requires avconv',
770 },
771 'skip': 'This live event has ended.',
772 },
773 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
774 {
775 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
776 'info_dict': {
777 'id': 'FIl7x6_3R5Y',
778 'ext': 'webm',
779 'title': 'md5:7b81415841e02ecd4313668cde88737a',
780 'description': 'md5:116377fd2963b81ec4ce64b542173306',
781 'duration': 220,
782 'upload_date': '20150625',
783 'uploader_id': 'dorappi2000',
784 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
785 'uploader': 'dorappi2000',
786 'license': 'Standard YouTube License',
787 'formats': 'mincount:31',
788 },
789 'skip': 'not actual anymore',
790 },
791 # DASH manifest with segment_list
792 {
793 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
794 'md5': '8ce563a1d667b599d21064e982ab9e31',
795 'info_dict': {
796 'id': 'CsmdDsKjzN8',
797 'ext': 'mp4',
798 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
799 'uploader': 'Airtek',
800 'description': 'RetransmisiĆ³n en directo de la XVIII media maratĆ³n de Zaragoza.',
801 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
802 'license': 'Standard YouTube License',
803 'title': 'RetransmisiĆ³n XVIII Media maratĆ³n Zaragoza 2015',
804 },
805 'params': {
806 'youtube_include_dash_manifest': True,
807 'format': '135', # bestvideo
808 },
809 'skip': 'This live event has ended.',
810 },
811 {
812 # Multifeed videos (multiple cameras), URL is for Main Camera
813 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
814 'info_dict': {
815 'id': 'jqWvoWXjCVs',
816 'title': 'teamPGP: Rocket League Noob Stream',
817 'description': 'md5:dc7872fb300e143831327f1bae3af010',
818 },
819 'playlist': [{
820 'info_dict': {
821 'id': 'jqWvoWXjCVs',
822 'ext': 'mp4',
823 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
824 'description': 'md5:dc7872fb300e143831327f1bae3af010',
825 'duration': 7335,
826 'upload_date': '20150721',
827 'uploader': 'Beer Games Beer',
828 'uploader_id': 'beergamesbeer',
829 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
830 'license': 'Standard YouTube License',
831 },
832 }, {
833 'info_dict': {
834 'id': '6h8e8xoXJzg',
835 'ext': 'mp4',
836 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
837 'description': 'md5:dc7872fb300e143831327f1bae3af010',
838 'duration': 7337,
839 'upload_date': '20150721',
840 'uploader': 'Beer Games Beer',
841 'uploader_id': 'beergamesbeer',
842 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
843 'license': 'Standard YouTube License',
844 },
845 }, {
846 'info_dict': {
847 'id': 'PUOgX5z9xZw',
848 'ext': 'mp4',
849 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
850 'description': 'md5:dc7872fb300e143831327f1bae3af010',
851 'duration': 7337,
852 'upload_date': '20150721',
853 'uploader': 'Beer Games Beer',
854 'uploader_id': 'beergamesbeer',
855 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
856 'license': 'Standard YouTube License',
857 },
858 }, {
859 'info_dict': {
860 'id': 'teuwxikvS5k',
861 'ext': 'mp4',
862 'title': 'teamPGP: Rocket League Noob Stream (zim)',
863 'description': 'md5:dc7872fb300e143831327f1bae3af010',
864 'duration': 7334,
865 'upload_date': '20150721',
866 'uploader': 'Beer Games Beer',
867 'uploader_id': 'beergamesbeer',
868 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
869 'license': 'Standard YouTube License',
870 },
871 }],
872 'params': {
873 'skip_download': True,
874 },
875 },
876 {
877 # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
878 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
879 'info_dict': {
880 'id': 'gVfLd0zydlo',
881 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
882 },
883 'playlist_count': 2,
884 'skip': 'Not multifeed anymore',
885 },
886 {
887 'url': 'https://vid.plus/FlRa-iH7PGw',
888 'only_matching': True,
889 },
890 {
891 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
892 'only_matching': True,
893 },
894 {
895 # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
896 # Also tests cut-off URL expansion in video description (see
897 # https://github.com/rg3/youtube-dl/issues/1892,
898 # https://github.com/rg3/youtube-dl/issues/8164)
899 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
900 'info_dict': {
901 'id': 'lsguqyKfVQg',
902 'ext': 'mp4',
903 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
904 'alt_title': 'Dark Walk - Position Music',
905 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
906 'duration': 133,
907 'upload_date': '20151119',
908 'uploader_id': 'IronSoulElf',
909 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
910 'uploader': 'IronSoulElf',
911 'license': 'Standard YouTube License',
912 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
913 'track': 'Dark Walk - Position Music',
914 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
915 },
916 'params': {
917 'skip_download': True,
918 },
919 },
920 {
921 # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
922 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
923 'only_matching': True,
924 },
925 {
926 # Video with yt:stretch=17:0
927 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
928 'info_dict': {
929 'id': 'Q39EVAstoRM',
930 'ext': 'mp4',
931 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
932 'description': 'md5:ee18a25c350637c8faff806845bddee9',
933 'upload_date': '20151107',
934 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
935 'uploader': 'CH GAMER DROID',
936 },
937 'params': {
938 'skip_download': True,
939 },
940 'skip': 'This video does not exist.',
941 },
942 {
943 # Video licensed under Creative Commons
944 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
945 'info_dict': {
946 'id': 'M4gD1WSo5mA',
947 'ext': 'mp4',
948 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
949 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
950 'duration': 721,
951 'upload_date': '20150127',
952 'uploader_id': 'BerkmanCenter',
953 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
954 'uploader': 'The Berkman Klein Center for Internet & Society',
955 'license': 'Creative Commons Attribution license (reuse allowed)',
956 },
957 'params': {
958 'skip_download': True,
959 },
960 },
961 {
962 # Channel-like uploader_url
963 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
964 'info_dict': {
965 'id': 'eQcmzGIKrzg',
966 'ext': 'mp4',
967 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
968 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
969 'duration': 4060,
970 'upload_date': '20151119',
971 'uploader': 'Bernie Sanders',
972 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
973 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
974 'license': 'Creative Commons Attribution license (reuse allowed)',
975 },
976 'params': {
977 'skip_download': True,
978 },
979 },
980 {
981 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
982 'only_matching': True,
983 },
984 {
985 # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059)
986 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
987 'only_matching': True,
988 },
989 {
990 # Rental video preview
991 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
992 'info_dict': {
993 'id': 'uGpuVWrhIzE',
994 'ext': 'mp4',
995 'title': 'Piku - Trailer',
996 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
997 'upload_date': '20150811',
998 'uploader': 'FlixMatrix',
999 'uploader_id': 'FlixMatrixKaravan',
1000 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1001 'license': 'Standard YouTube License',
1002 },
1003 'params': {
1004 'skip_download': True,
1005 },
1006 'skip': 'This video is not available.',
1007 },
1008 {
1009 # YouTube Red video with episode data
1010 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1011 'info_dict': {
1012 'id': 'iqKdEhx-dD4',
1013 'ext': 'mp4',
1014 'title': 'Isolation - Mind Field (Ep 1)',
1015 'description': 'md5:25b78d2f64ae81719f5c96319889b736',
1016 'duration': 2085,
1017 'upload_date': '20170118',
1018 'uploader': 'Vsauce',
1019 'uploader_id': 'Vsauce',
1020 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1021 'license': 'Standard YouTube License',
1022 'series': 'Mind Field',
1023 'season_number': 1,
1024 'episode_number': 1,
1025 },
1026 'params': {
1027 'skip_download': True,
1028 },
1029 'expected_warnings': [
1030 'Skipping DASH manifest',
1031 ],
1032 },
1033 {
1034 # The following content has been identified by the YouTube community
1035 # as inappropriate or offensive to some audiences.
1036 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1037 'info_dict': {
1038 'id': '6SJNVb0GnPI',
1039 'ext': 'mp4',
1040 'title': 'Race Differences in Intelligence',
1041 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1042 'duration': 965,
1043 'upload_date': '20140124',
1044 'uploader': 'New Century Foundation',
1045 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1046 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1047 'license': 'Standard YouTube License',
1048 },
1049 'params': {
1050 'skip_download': True,
1051 },
1052 },
1053 {
1054 # itag 212
1055 'url': '1t24XAntNCY',
1056 'only_matching': True,
1057 },
1058 {
1059 # geo restricted to JP
1060 'url': 'sJL6WA-aGkQ',
1061 'only_matching': True,
1062 },
1063 {
1064 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1065 'only_matching': True,
1066 },
1067 ]
1068
    def __init__(self, *args, **kwargs):
        """Initialise the extractor and create an empty signature-function cache."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps (player_url, signature cache id) -> decryption function.
        # Filled lazily by _decrypt_signature() so a player is only
        # downloaded and interpreted once per process.
        self._player_cache = {}
1072
1073 def report_video_info_webpage_download(self, video_id):
1074 """Report attempt to download video info webpage."""
1075 self.to_screen('%s: Downloading video info webpage' % video_id)
1076
1077 def report_information_extraction(self, video_id):
1078 """Report attempt to extract video information."""
1079 self.to_screen('%s: Extracting video information' % video_id)
1080
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for this video."""
        # NOTE: 'format' shadows the builtin name; kept for interface
        # compatibility with existing callers.
        self.to_screen('%s: Format %s not available' % (video_id, format))
1084
    def report_rtmp_download(self):
        """Inform the user that the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')
1088
1089 def _signature_cache_id(self, example_sig):
1090 """ Return a string representation of a signature """
1091 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1092
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Download the player code and build a signature-decryption function.

        Returns a callable mapping a scrambled signature string to its clear
        form.  The derived character-index mapping is memoised in the
        filesystem cache, keyed by player type/id and the signature layout.
        """
        # Identify the player flavour (js/swf) and version id from its URL.
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id doubles as a cache filename; ensure it has no path parts.
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source indices: applying the signature
            # function amounts to picking input characters by index.
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the extracted function on a probe string whose characters encode
        # their own position, so the output reveals which source index lands
        # at each output slot; persist that index list for the cache.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1138
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Runs *func* on a probe string, recovers the character-index mapping it
        applies, and renders it as compact ``s[...]`` slice expressions (used
        for the --youtube-print-sig-code option).
        """
        def gen_sig_code(idxs):
            # Compress a list of source indices into expressions, merging
            # consecutive runs with step +1/-1 into a single slice.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it while the step matches,
                    # otherwise flush the accumulated slice.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices open a new sliceable run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Emit whatever the loop left pending (single index or open run).
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1177
1178 def _parse_sig_js(self, jscode):
1179 funcname = self._search_regex(
1180 (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1181 r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
1182 r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1183 r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\('),
1184 jscode, 'Initial JS player signature function name', group='sig')
1185
1186 jsi = JSInterpreter(jscode)
1187 initial_function = jsi.extract_function(funcname)
1188 return lambda s: initial_function([s])
1189
1190 def _parse_sig_swf(self, file_contents):
1191 swfi = SWFInterpreter(file_contents)
1192 TARGET_CLASSNAME = 'SignatureDecipher'
1193 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1194 initial_function = swfi.extract_function(searched_class, 'decipher')
1195 return lambda s: initial_function([s])
1196
1197 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1198 """Turn the encrypted s field into a working signature"""
1199
1200 if player_url is None:
1201 raise ExtractorError('Cannot decrypt signature without player_url')
1202
1203 if player_url.startswith('//'):
1204 player_url = 'https:' + player_url
1205 elif not re.match(r'https?://', player_url):
1206 player_url = compat_urlparse.urljoin(
1207 'https://www.youtube.com', player_url)
1208 try:
1209 player_id = (player_url, self._signature_cache_id(s))
1210 if player_id not in self._player_cache:
1211 func = self._extract_signature_function(
1212 video_id, player_url, s
1213 )
1214 self._player_cache[player_id] = func
1215 func = self._player_cache[player_id]
1216 if self._downloader.params.get('youtube_print_sig_code'):
1217 self._print_sig_code(func, s)
1218 return func(s)
1219 except Exception as e:
1220 tb = traceback.format_exc()
1221 raise ExtractorError(
1222 'Signature extraction failed: ' + tb, cause=e)
1223
1224 def _get_subtitles(self, video_id, webpage):
1225 try:
1226 subs_doc = self._download_xml(
1227 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1228 video_id, note=False)
1229 except ExtractorError as err:
1230 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1231 return {}
1232
1233 sub_lang_list = {}
1234 for track in subs_doc.findall('track'):
1235 lang = track.attrib['lang_code']
1236 if lang in sub_lang_list:
1237 continue
1238 sub_formats = []
1239 for ext in self._SUBTITLE_FORMATS:
1240 params = compat_urllib_parse_urlencode({
1241 'lang': lang,
1242 'v': video_id,
1243 'fmt': ext,
1244 'name': track.attrib['name'].encode('utf-8'),
1245 })
1246 sub_formats.append({
1247 'url': 'https://www.youtube.com/api/timedtext?' + params,
1248 'ext': ext,
1249 })
1250 sub_lang_list[lang] = sub_formats
1251 if not sub_lang_list:
1252 self._downloader.report_warning('video doesn\'t have subtitles')
1253 return {}
1254 return sub_lang_list
1255
1256 def _get_ytplayer_config(self, video_id, webpage):
1257 patterns = (
1258 # User data may contain arbitrary character sequences that may affect
1259 # JSON extraction with regex, e.g. when '};' is contained the second
1260 # regex won't capture the whole JSON. Yet working around by trying more
1261 # concrete regex first keeping in mind proper quoted string handling
1262 # to be implemented in future that will replace this workaround (see
1263 # https://github.com/rg3/youtube-dl/issues/7468,
1264 # https://github.com/rg3/youtube-dl/pull/7599)
1265 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1266 r';ytplayer\.config\s*=\s*({.+?});',
1267 )
1268 config = self._search_regex(
1269 patterns, webpage, 'ytplayer.config', default=None)
1270 if config:
1271 return self._parse_json(
1272 uppercase_escape(config), video_id, fatal=False)
1273
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            if caption_url:
                # Oldest path: the player config exposes a tts url whose
                # listing endpoint enumerates available (translated) tracks.
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build a captions dict by rewriting sub_url's query string
                # for each requested target language and subtitle format.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    base_url = renderer['captionTracks'][0]['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Does not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1375
1376 def _mark_watched(self, video_id, video_info):
1377 playback_url = video_info.get('videostats_playback_base_url', [None])[0]
1378 if not playback_url:
1379 return
1380 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1381 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1382
1383 # cpn generation algorithm is reverse engineered from base.js.
1384 # In fact it works even with dummy cpn.
1385 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1386 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1387
1388 qs.update({
1389 'ver': ['2'],
1390 'cpn': [cpn],
1391 })
1392 playback_url = compat_urlparse.urlunparse(
1393 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1394
1395 self._download_webpage(
1396 playback_url, video_id, 'Marking watched',
1397 'Unable to mark watched', fatal=False)
1398
1399 @staticmethod
1400 def _extract_urls(webpage):
1401 # Embedded YouTube player
1402 entries = [
1403 unescapeHTML(mobj.group('url'))
1404 for mobj in re.finditer(r'''(?x)
1405 (?:
1406 <iframe[^>]+?src=|
1407 data-video-url=|
1408 <embed[^>]+?src=|
1409 embedSWF\(?:\s*|
1410 <object[^>]+data=|
1411 new\s+SWFObject\(
1412 )
1413 (["\'])
1414 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1415 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1416 \1''', webpage)]
1417
1418 # lazyYT YouTube embed
1419 entries.extend(list(map(
1420 unescapeHTML,
1421 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1422
1423 # Wordpress "YouTube Video Importer" plugin
1424 matches = re.findall(r'''(?x)<div[^>]+
1425 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1426 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1427 entries.extend(m[-1] for m in matches)
1428
1429 return entries
1430
1431 @staticmethod
1432 def _extract_url(webpage):
1433 urls = YoutubeIE._extract_urls(webpage)
1434 return urls[0] if urls else None
1435
1436 @classmethod
1437 def extract_id(cls, url):
1438 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1439 if mobj is None:
1440 raise ExtractorError('Invalid URL: %s' % url)
1441 video_id = mobj.group(2)
1442 return video_id
1443
1444 def _extract_annotations(self, video_id):
1445 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1446 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1447
1448 @staticmethod
1449 def _extract_chapters(description, duration):
1450 if not description:
1451 return None
1452 chapter_lines = re.findall(
1453 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1454 description)
1455 if not chapter_lines:
1456 return None
1457 chapters = []
1458 for next_num, (chapter_line, time_point) in enumerate(
1459 chapter_lines, start=1):
1460 start_time = parse_duration(time_point)
1461 if start_time is None:
1462 continue
1463 if start_time > duration:
1464 break
1465 end_time = (duration if next_num == len(chapter_lines)
1466 else parse_duration(chapter_lines[next_num][1]))
1467 if end_time is None:
1468 continue
1469 if end_time > duration:
1470 end_time = duration
1471 if start_time > end_time:
1472 break
1473 chapter_title = re.sub(
1474 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1475 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1476 chapters.append({
1477 'start_time': start_time,
1478 'end_time': end_time,
1479 'title': chapter_title,
1480 })
1481 return chapters
1482
1483 def _real_extract(self, url):
1484 url, smuggled_data = unsmuggle_url(url, {})
1485
1486 proto = (
1487 'http' if self._downloader.params.get('prefer_insecure', False)
1488 else 'https')
1489
1490 start_time = None
1491 end_time = None
1492 parsed_url = compat_urllib_parse_urlparse(url)
1493 for component in [parsed_url.fragment, parsed_url.query]:
1494 query = compat_parse_qs(component)
1495 if start_time is None and 't' in query:
1496 start_time = parse_duration(query['t'][0])
1497 if start_time is None and 'start' in query:
1498 start_time = parse_duration(query['start'][0])
1499 if end_time is None and 'end' in query:
1500 end_time = parse_duration(query['end'][0])
1501
1502 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1503 mobj = re.search(self._NEXT_URL_RE, url)
1504 if mobj:
1505 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1506 video_id = self.extract_id(url)
1507
1508 # Get video webpage
1509 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1510 video_webpage = self._download_webpage(url, video_id)
1511
1512 # Attempt to extract SWF player URL
1513 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1514 if mobj is not None:
1515 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1516 else:
1517 player_url = None
1518
1519 dash_mpds = []
1520
1521 def add_dash_mpd(video_info):
1522 dash_mpd = video_info.get('dashmpd')
1523 if dash_mpd and dash_mpd[0] not in dash_mpds:
1524 dash_mpds.append(dash_mpd[0])
1525
1526 is_live = None
1527 view_count = None
1528
1529 def extract_view_count(v_info):
1530 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1531
1532 # Get video info
1533 embed_webpage = None
1534 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1535 age_gate = True
1536 # We simulate the access to the video from www.youtube.com/v/{video_id}
1537 # this can be viewed without login into Youtube
1538 url = proto + '://www.youtube.com/embed/%s' % video_id
1539 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1540 data = compat_urllib_parse_urlencode({
1541 'video_id': video_id,
1542 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1543 'sts': self._search_regex(
1544 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1545 })
1546 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1547 video_info_webpage = self._download_webpage(
1548 video_info_url, video_id,
1549 note='Refetching age-gated info webpage',
1550 errnote='unable to download video info webpage')
1551 video_info = compat_parse_qs(video_info_webpage)
1552 add_dash_mpd(video_info)
1553 else:
1554 age_gate = False
1555 video_info = None
1556 sts = None
1557 # Try looking directly into the video webpage
1558 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1559 if ytplayer_config:
1560 args = ytplayer_config['args']
1561 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1562 # Convert to the same format returned by compat_parse_qs
1563 video_info = dict((k, [v]) for k, v in args.items())
1564 add_dash_mpd(video_info)
1565 # Rental video is not rented but preview is available (e.g.
1566 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1567 # https://github.com/rg3/youtube-dl/issues/10532)
1568 if not video_info and args.get('ypc_vid'):
1569 return self.url_result(
1570 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1571 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1572 is_live = True
1573 sts = ytplayer_config.get('sts')
1574 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1575 # We also try looking in get_video_info since it may contain different dashmpd
1576 # URL that points to a DASH manifest with possibly different itag set (some itags
1577 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1578 # manifest pointed by get_video_info's dashmpd).
1579 # The general idea is to take a union of itags of both DASH manifests (for example
1580 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
1581 self.report_video_info_webpage_download(video_id)
1582 for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
1583 query = {
1584 'video_id': video_id,
1585 'ps': 'default',
1586 'eurl': '',
1587 'gl': 'US',
1588 'hl': 'en',
1589 }
1590 if el:
1591 query['el'] = el
1592 if sts:
1593 query['sts'] = sts
1594 video_info_webpage = self._download_webpage(
1595 '%s://www.youtube.com/get_video_info' % proto,
1596 video_id, note=False,
1597 errnote='unable to download video info webpage',
1598 fatal=False, query=query)
1599 if not video_info_webpage:
1600 continue
1601 get_video_info = compat_parse_qs(video_info_webpage)
1602 add_dash_mpd(get_video_info)
1603 if view_count is None:
1604 view_count = extract_view_count(get_video_info)
1605 if not video_info:
1606 video_info = get_video_info
1607 if 'token' in get_video_info:
1608 # Different get_video_info requests may report different results, e.g.
1609 # some may report video unavailability, but some may serve it without
1610 # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
1611 # the original webpage as well as el=info and el=embedded get_video_info
1612 # requests report video unavailability due to geo restriction while
1613 # el=detailpage succeeds and returns valid data). This is probably
1614 # due to YouTube measures against IP ranges of hosting providers.
1615 # Working around by preferring the first succeeded video_info containing
1616 # the token if no such video_info yet was found.
1617 if 'token' not in video_info:
1618 video_info = get_video_info
1619 break
1620
1621 def extract_unavailable_message():
1622 return self._html_search_regex(
1623 r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
1624 video_webpage, 'unavailable message', default=None)
1625
1626 if 'token' not in video_info:
1627 if 'reason' in video_info:
1628 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1629 regions_allowed = self._html_search_meta(
1630 'regionsAllowed', video_webpage, default=None)
1631 countries = regions_allowed.split(',') if regions_allowed else None
1632 self.raise_geo_restricted(
1633 msg=video_info['reason'][0], countries=countries)
1634 reason = video_info['reason'][0]
1635 if 'Invalid parameters' in reason:
1636 unavailable_message = extract_unavailable_message()
1637 if unavailable_message:
1638 reason = unavailable_message
1639 raise ExtractorError(
1640 'YouTube said: %s' % reason,
1641 expected=True, video_id=video_id)
1642 else:
1643 raise ExtractorError(
1644 '"token" parameter not in video info for unknown reason',
1645 video_id=video_id)
1646
1647 # title
1648 if 'title' in video_info:
1649 video_title = video_info['title'][0]
1650 else:
1651 self._downloader.report_warning('Unable to extract video title')
1652 video_title = '_'
1653
1654 # description
1655 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1656 if video_description:
1657
1658 def replace_url(m):
1659 redir_url = compat_urlparse.urljoin(url, m.group(1))
1660 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1661 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1662 qs = compat_parse_qs(parsed_redir_url.query)
1663 q = qs.get('q')
1664 if q and q[0]:
1665 return q[0]
1666 return redir_url
1667
1668 description_original = video_description = re.sub(r'''(?x)
1669 <a\s+
1670 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1671 (?:title|href)="([^"]+)"\s+
1672 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1673 class="[^"]*"[^>]*>
1674 [^<]+\.{3}\s*
1675 </a>
1676 ''', replace_url, video_description)
1677 video_description = clean_html(video_description)
1678 else:
1679 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1680 if fd_mobj:
1681 video_description = unescapeHTML(fd_mobj.group(1))
1682 else:
1683 video_description = ''
1684
1685 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1686 if not self._downloader.params.get('noplaylist'):
1687 entries = []
1688 feed_ids = []
1689 multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
1690 for feed in multifeed_metadata_list.split(','):
1691 # Unquote should take place before split on comma (,) since textual
1692 # fields may contain comma as well (see
1693 # https://github.com/rg3/youtube-dl/issues/8536)
1694 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1695 entries.append({
1696 '_type': 'url_transparent',
1697 'ie_key': 'Youtube',
1698 'url': smuggle_url(
1699 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1700 {'force_singlefeed': True}),
1701 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1702 })
1703 feed_ids.append(feed_data['id'][0])
1704 self.to_screen(
1705 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1706 % (', '.join(feed_ids), video_id))
1707 return self.playlist_result(entries, video_id, video_title, video_description)
1708 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1709
1710 if view_count is None:
1711 view_count = extract_view_count(video_info)
1712
1713 # Check for "rental" videos
1714 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1715 raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
1716
1717 def _extract_filesize(media_url):
1718 return int_or_none(self._search_regex(
1719 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1720
1721 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1722 self.report_rtmp_download()
1723 formats = [{
1724 'format_id': '_rtmp',
1725 'protocol': 'rtmp',
1726 'url': video_info['conn'][0],
1727 'player_url': player_url,
1728 }]
1729 elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1730 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1731 if 'rtmpe%3Dyes' in encoded_url_map:
1732 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1733 formats_spec = {}
1734 fmt_list = video_info.get('fmt_list', [''])[0]
1735 if fmt_list:
1736 for fmt in fmt_list.split(','):
1737 spec = fmt.split('/')
1738 if len(spec) > 1:
1739 width_height = spec[1].split('x')
1740 if len(width_height) == 2:
1741 formats_spec[spec[0]] = {
1742 'resolution': spec[1],
1743 'width': int_or_none(width_height[0]),
1744 'height': int_or_none(width_height[1]),
1745 }
1746 q = qualities(['small', 'medium', 'hd720'])
1747 formats = []
1748 for url_data_str in encoded_url_map.split(','):
1749 url_data = compat_parse_qs(url_data_str)
1750 if 'itag' not in url_data or 'url' not in url_data:
1751 continue
1752 format_id = url_data['itag'][0]
1753 url = url_data['url'][0]
1754
1755 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
1756 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1757 jsplayer_url_json = self._search_regex(
1758 ASSETS_RE,
1759 embed_webpage if age_gate else video_webpage,
1760 'JS player URL (1)', default=None)
1761 if not jsplayer_url_json and not age_gate:
1762 # We need the embed website after all
1763 if embed_webpage is None:
1764 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1765 embed_webpage = self._download_webpage(
1766 embed_url, video_id, 'Downloading embed webpage')
1767 jsplayer_url_json = self._search_regex(
1768 ASSETS_RE, embed_webpage, 'JS player URL')
1769
1770 player_url = json.loads(jsplayer_url_json)
1771 if player_url is None:
1772 player_url_json = self._search_regex(
1773 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1774 video_webpage, 'age gate player URL')
1775 player_url = json.loads(player_url_json)
1776
1777 if 'sig' in url_data:
1778 url += '&signature=' + url_data['sig'][0]
1779 elif 's' in url_data:
1780 encrypted_sig = url_data['s'][0]
1781
1782 if self._downloader.params.get('verbose'):
1783 if player_url is None:
1784 player_version = 'unknown'
1785 player_desc = 'unknown'
1786 else:
1787 if player_url.endswith('swf'):
1788 player_version = self._search_regex(
1789 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1790 'flash player', fatal=False)
1791 player_desc = 'flash player %s' % player_version
1792 else:
1793 player_version = self._search_regex(
1794 [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
1795 r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
1796 player_url,
1797 'html5 player', fatal=False)
1798 player_desc = 'html5 player %s' % player_version
1799
1800 parts_sizes = self._signature_cache_id(encrypted_sig)
1801 self.to_screen('{%s} signature length %s, %s' %
1802 (format_id, parts_sizes, player_desc))
1803
1804 signature = self._decrypt_signature(
1805 encrypted_sig, video_id, player_url, age_gate)
1806 url += '&signature=' + signature
1807 if 'ratebypass' not in url:
1808 url += '&ratebypass=yes'
1809
1810 dct = {
1811 'format_id': format_id,
1812 'url': url,
1813 'player_url': player_url,
1814 }
1815 if format_id in self._formats:
1816 dct.update(self._formats[format_id])
1817 if format_id in formats_spec:
1818 dct.update(formats_spec[format_id])
1819
1820 # Some itags are not included in DASH manifest thus corresponding formats will
1821 # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
1822 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
1823 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1824 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
1825
1826 filesize = int_or_none(url_data.get(
1827 'clen', [None])[0]) or _extract_filesize(url)
1828
1829 quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]
1830
1831 more_fields = {
1832 'filesize': filesize,
1833 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
1834 'width': width,
1835 'height': height,
1836 'fps': int_or_none(url_data.get('fps', [None])[0]),
1837 'format_note': quality,
1838 'quality': q(quality),
1839 }
1840 for key, value in more_fields.items():
1841 if value:
1842 dct[key] = value
1843 type_ = url_data.get('type', [None])[0]
1844 if type_:
1845 type_split = type_.split(';')
1846 kind_ext = type_split[0].split('/')
1847 if len(kind_ext) == 2:
1848 kind, _ = kind_ext
1849 dct['ext'] = mimetype2ext(type_split[0])
1850 if kind in ('audio', 'video'):
1851 codecs = None
1852 for mobj in re.finditer(
1853 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
1854 if mobj.group('key') == 'codecs':
1855 codecs = mobj.group('val')
1856 break
1857 if codecs:
1858 dct.update(parse_codecs(codecs))
1859 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
1860 dct['downloader_options'] = {
1861 # Youtube throttles chunks >~10M
1862 'http_chunk_size': 10485760,
1863 }
1864 formats.append(dct)
1865 elif video_info.get('hlsvp'):
1866 manifest_url = video_info['hlsvp'][0]
1867 formats = []
1868 m3u8_formats = self._extract_m3u8_formats(
1869 manifest_url, video_id, 'mp4', fatal=False)
1870 for a_format in m3u8_formats:
1871 itag = self._search_regex(
1872 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
1873 if itag:
1874 a_format['format_id'] = itag
1875 if itag in self._formats:
1876 dct = self._formats[itag].copy()
1877 dct.update(a_format)
1878 a_format = dct
1879 a_format['player_url'] = player_url
1880 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
1881 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
1882 formats.append(a_format)
1883 else:
1884 error_message = clean_html(video_info.get('reason', [None])[0])
1885 if not error_message:
1886 error_message = extract_unavailable_message()
1887 if error_message:
1888 raise ExtractorError(error_message, expected=True)
1889 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1890
1891 # uploader
1892 video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
1893 if video_uploader:
1894 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
1895 else:
1896 self._downloader.report_warning('unable to extract uploader name')
1897
1898 # uploader_id
1899 video_uploader_id = None
1900 video_uploader_url = None
1901 mobj = re.search(
1902 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
1903 video_webpage)
1904 if mobj is not None:
1905 video_uploader_id = mobj.group('uploader_id')
1906 video_uploader_url = mobj.group('uploader_url')
1907 else:
1908 self._downloader.report_warning('unable to extract uploader nickname')
1909
1910 # thumbnail image
1911 # We try first to get a high quality image:
1912 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1913 video_webpage, re.DOTALL)
1914 if m_thumb is not None:
1915 video_thumbnail = m_thumb.group(1)
1916 elif 'thumbnail_url' not in video_info:
1917 self._downloader.report_warning('unable to extract video thumbnail')
1918 video_thumbnail = None
1919 else: # don't panic if we can't find it
1920 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
1921
1922 # upload date
1923 upload_date = self._html_search_meta(
1924 'datePublished', video_webpage, 'upload date', default=None)
1925 if not upload_date:
1926 upload_date = self._search_regex(
1927 [r'(?s)id="eow-date.*?>(.*?)</span>',
1928 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
1929 video_webpage, 'upload date', default=None)
1930 upload_date = unified_strdate(upload_date)
1931
1932 video_license = self._html_search_regex(
1933 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
1934 video_webpage, 'license', default=None)
1935
1936 m_music = re.search(
1937 r'''(?x)
1938 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
1939 <ul[^>]*>\s*
1940 <li>(?P<title>.+?)
1941 by (?P<creator>.+?)
1942 (?:
1943 \(.+?\)|
1944 <a[^>]*
1945 (?:
1946 \bhref=["\']/red[^>]*>| # drop possible
1947 >\s*Listen ad-free with YouTube Red # YouTube Red ad
1948 )
1949 .*?
1950 )?</li
1951 ''',
1952 video_webpage)
1953 if m_music:
1954 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
1955 video_creator = clean_html(m_music.group('creator'))
1956 else:
1957 video_alt_title = video_creator = None
1958
1959 def extract_meta(field):
1960 return self._html_search_regex(
1961 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
1962 video_webpage, field, default=None)
1963
1964 track = extract_meta('Song')
1965 artist = extract_meta('Artist')
1966
1967 m_episode = re.search(
1968 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*ā€¢\s*E(?P<episode>\d+)</span>',
1969 video_webpage)
1970 if m_episode:
1971 series = m_episode.group('series')
1972 season_number = int(m_episode.group('season'))
1973 episode_number = int(m_episode.group('episode'))
1974 else:
1975 series = season_number = episode_number = None
1976
1977 m_cat_container = self._search_regex(
1978 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1979 video_webpage, 'categories', default=None)
1980 if m_cat_container:
1981 category = self._html_search_regex(
1982 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1983 default=None)
1984 video_categories = None if category is None else [category]
1985 else:
1986 video_categories = None
1987
1988 video_tags = [
1989 unescapeHTML(m.group('content'))
1990 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1991
1992 def _extract_count(count_name):
1993 return str_to_int(self._search_regex(
1994 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
1995 % re.escape(count_name),
1996 video_webpage, count_name, default=None))
1997
1998 like_count = _extract_count('like')
1999 dislike_count = _extract_count('dislike')
2000
2001 # subtitles
2002 video_subtitles = self.extract_subtitles(video_id, video_webpage)
2003 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2004
2005 video_duration = try_get(
2006 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2007 if not video_duration:
2008 video_duration = parse_duration(self._html_search_meta(
2009 'duration', video_webpage, 'video duration'))
2010
2011 # annotations
2012 video_annotations = None
2013 if self._downloader.params.get('writeannotations', False):
2014 video_annotations = self._extract_annotations(video_id)
2015
2016 chapters = self._extract_chapters(description_original, video_duration)
2017
2018 # Look for the DASH manifest
2019 if self._downloader.params.get('youtube_include_dash_manifest', True):
2020 dash_mpd_fatal = True
2021 for mpd_url in dash_mpds:
2022 dash_formats = {}
2023 try:
2024 def decrypt_sig(mobj):
2025 s = mobj.group(1)
2026 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2027 return '/signature/%s' % dec_s
2028
2029 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2030
2031 for df in self._extract_mpd_formats(
2032 mpd_url, video_id, fatal=dash_mpd_fatal,
2033 formats_dict=self._formats):
2034 if not df.get('filesize'):
2035 df['filesize'] = _extract_filesize(df['url'])
2036 # Do not overwrite DASH format found in some previous DASH manifest
2037 if df['format_id'] not in dash_formats:
2038 dash_formats[df['format_id']] = df
2039 # Additional DASH manifests may end up in HTTP Error 403 therefore
2040 # allow them to fail without bug report message if we already have
2041 # some DASH manifest succeeded. This is temporary workaround to reduce
2042 # burst of bug reports until we figure out the reason and whether it
2043 # can be fixed at all.
2044 dash_mpd_fatal = False
2045 except (ExtractorError, KeyError) as e:
2046 self.report_warning(
2047 'Skipping DASH manifest: %r' % e, video_id)
2048 if dash_formats:
2049 # Remove the formats we found through non-DASH, they
2050 # contain less info and it can be wrong, because we use
2051 # fixed values (for example the resolution). See
2052 # https://github.com/rg3/youtube-dl/issues/5774 for an
2053 # example.
2054 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2055 formats.extend(dash_formats.values())
2056
2057 # Check for malformed aspect ratio
2058 stretched_m = re.search(
2059 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2060 video_webpage)
2061 if stretched_m:
2062 w = float(stretched_m.group('w'))
2063 h = float(stretched_m.group('h'))
2064 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2065 # We will only process correct ratios.
2066 if w > 0 and h > 0:
2067 ratio = w / h
2068 for f in formats:
2069 if f.get('vcodec') != 'none':
2070 f['stretched_ratio'] = ratio
2071
2072 self._sort_formats(formats)
2073
2074 self.mark_watched(video_id, video_info)
2075
2076 return {
2077 'id': video_id,
2078 'uploader': video_uploader,
2079 'uploader_id': video_uploader_id,
2080 'uploader_url': video_uploader_url,
2081 'upload_date': upload_date,
2082 'license': video_license,
2083 'creator': video_creator or artist,
2084 'title': video_title,
2085 'alt_title': video_alt_title or track,
2086 'thumbnail': video_thumbnail,
2087 'description': video_description,
2088 'categories': video_categories,
2089 'tags': video_tags,
2090 'subtitles': video_subtitles,
2091 'automatic_captions': automatic_captions,
2092 'duration': video_duration,
2093 'age_limit': 18 if age_gate else 0,
2094 'annotations': video_annotations,
2095 'chapters': chapters,
2096 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2097 'view_count': view_count,
2098 'like_count': like_count,
2099 'dislike_count': dislike_count,
2100 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
2101 'formats': formats,
2102 'is_live': is_live,
2103 'start_time': start_time,
2104 'end_time': end_time,
2105 'series': series,
2106 'season_number': season_number,
2107 'episode_number': episode_number,
2108 'track': track,
2109 'artist': artist,
2110 }
2111
2112
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube playlists, including auto-generated mixes."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            youtube\.com/
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
        'skip': 'This playlist is private',
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2017 čÆčŖžęœ€ę–°å–®ę›² (2/24ꛓꖰ)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'license': 'Standard YouTube License',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    def _real_initialize(self):
        # Playlists may be private; log in when credentials are available.
        self._login()

    def _extract_mix(self, playlist_id):
        """Extract a YouTube mix (an auto-generated playlist).

        Mixes have no regular playlist page, so watch pages are fetched
        repeatedly and the video ids linked from the mix sidebar are
        collected until no new ones appear.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        ids = []
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # Title markup differs between mix flavours; try variants from most
        # to least specific (uses the last fetched webpage).
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist from its playlist page.

        Returns a tuple (has_videos, playlist_result); has_videos is False
        when the URL did not actually serve a playlist with entries.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Handle --no-playlist for playlist URLs pointing at one video.

        Returns a tuple (video_id, result): result is a url_result when
        only the single video should be downloaded, None otherwise.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
                return video_id, None
        return None, None

    def _real_extract(self, url):
        """Dispatch between single-video, mix and regular playlist extraction."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/rg3/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
2408
2409
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for all videos of a YouTube channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors when they match.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        # Build the channel /videos listing URL (overridden in YoutubeUserIE).
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract a channel's videos.

        Preferably resolves the channel to its uploads playlist ('UU' + id)
        and delegates to the playlist extractor; otherwise falls back to
        paging through the channel's /videos listing.
        """
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        try:
            # Probe for at least one entry; an empty listing may carry an
            # alert explaining why (e.g. geo restriction).
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
2499
2500
class YoutubeUserIE(YoutubeChannelIE):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match their URLs
        # as well, so defer to any more specific Youtube*IE first.
        other_yt_ies = (
            klass for (name, klass) in globals().items()
            if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_yt_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # Fill the template with the URL path kind ('user' or 'c'; plain
        # ytuser: URLs have no <user> group and default to 'user') and the
        # user/channel name taken from the URL.
        mobj = re.match(self._VALID_URL, url)
        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
2551
2552
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        display_id = match.group('id')
        channel_url = match.group('base_url')
        webpage = self._download_webpage(url, display_id, fatal=False)
        if not webpage:
            # Page could not be fetched: fall back to the channel/user page.
            return self.url_result(channel_url)
        og_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        # Only hand over to the video extractor when the page really embeds
        # a single video with a well-formed 11-character id.
        if og_type.startswith('video') and video_id and re.match(
                r'^[0-9A-Za-z_-]{11}$', video_id):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(channel_url)
2603
2604
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extract all playlists from a user's or channel's /playlists tab."""
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]
2633
2634
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    # Matches the video id (and, when the anchor carries one, the title
    # attribute) of /watch links on a search results page.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
2637
2638
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Pages through the HTML search results (via the 'spf=navigate' JSON
        wrapper) until `n` videos were collected or no new page is available.
        Raises ExtractorError when the search yields no results at all.
        """

        videos = []
        limit = n

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            # The SPF response is a list whose second element holds the page body.
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop when the page yields nothing new or enough results were
            # collected; '>=' avoids downloading one extra page when exactly
            # `limit` results have already been gathered.
            if not new_videos or len(videos) >= limit:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # A page may overshoot the requested count; trim (guarded, since `n`
        # may be float('inf') and a [:inf] slice would raise).
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
2687
2688
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor that sorts results by upload date, newest first."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # Same search endpoint as the parent class, only with a sort parameter.
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
2694
2695
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The decoded search query doubles as display id and playlist title.
        raw_query = re.match(self._VALID_URL, url).group('query')
        search_query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, search_query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=search_query)
2716
2717
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        # A show is just a collection of playlists: delegate to the
        # playlists extractor on the show's /playlists page.
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
2735
2736
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are personalized, so an account is required up front.
        self._login()

    def _entries(self, page):
        """Yield video results from the feed, following 'load more' pages."""
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        seen_ids = set()  # set, not list: membership test per id must be O(1)
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            new_ids = [video_id for video_id in orderedSet(matches)
                       if video_id not in seen_ids]
            if not new_ids:
                break

            seen_ids.update(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        # The URL itself is ignored beyond routing: the feed name is fixed
        # per subclass, so always fetch the canonical feed page.
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
2788
2789
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # When the URL points at a single video inside the WL list, prefer
        # extracting just that video; otherwise fall back to the whole list.
        video = self._check_download_just_video(url, 'WL')[1]
        if video:
            return video
        return self._extract_playlist('WL')[1]
2809
2810
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds an ordinary playlist id; extract it and
        # hand over to the playlist extractor.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
2821
2822
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Personalized /feed/recommended feed; extraction lives in the base class."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
2828
2829
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Personalized /feed/subscriptions feed; extraction lives in the base class."""
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
2835
2836
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Personalized /feed/history feed; extraction lives in the base class."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
2842
2843
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Matches watch/attribution_link URLs whose query string lost the video
    # id — the typical result of an unquoted '&' splitting the URL in the
    # user's shell.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing can be extracted from a truncated URL — explain the likely
        # shell-quoting mistake instead of failing cryptically.
        message = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in '
            'quotes, like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)
2891
2892
class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    # Watch URLs whose video id is shorter than the required 11 characters.
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A short id can never resolve to a video — refuse with a clear hint
        # that the URL was probably cut off.
        truncated_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url),
            expected=True)