Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youtube.py
debian/control: Canonicalize with `wrap-and-sort -s -a -b`.
[youtubedl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_kwargs,
20 compat_parse_qs,
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
23 compat_urllib_parse_urlencode,
24 compat_urllib_parse_urlparse,
25 compat_urlparse,
26 compat_str,
27 )
28 from ..utils import (
29 clean_html,
30 error_to_compat_str,
31 ExtractorError,
32 float_or_none,
33 get_element_by_attribute,
34 get_element_by_id,
35 int_or_none,
36 mimetype2ext,
37 orderedSet,
38 parse_codecs,
39 parse_duration,
40 qualities,
41 remove_quotes,
42 remove_start,
43 smuggle_url,
44 str_to_int,
45 try_get,
46 unescapeHTML,
47 unified_strdate,
48 unsmuggle_url,
49 uppercase_escape,
50 urlencode_postdata,
51 )
52
53
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # '{0}' is filled with the TL token taken from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        """Set the PREF cookie on .youtube.com so that pages are served with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video id in ``ids``."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.

        The flow is: fetch the login page, look up the account, submit the
        password challenge, optionally submit a two-factor code, and finally
        fetch the CheckCookie URL. Every failing step emits a warning (or
        relies on the non-fatal download helper to report it) and makes this
        method return False.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) as promised by the docstring
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the hidden login form fields plus ``f_req`` (JSON-encoded) to ``url``.

            Returns the parsed JSON response; the checks at the call sites
            treat a result of False as a failed (non-fatal) request.
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything preceding the first '[' before JSON parsing
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Shared by the lookup and the challenge requests
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Return an explicit False (not None) as promised by the docstring
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The conditional must be parenthesized: '%' binds tighter than
            # 'if ... else', which previously dropped the prefix for every
            # message other than INCORRECT_ANSWER_ENTERED.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # Pre-bind so that a challenge other than TWO_STEP_VERIFICATION falls
        # through to the "Unable to extract CheckCookie URL" warning below
        # instead of raising UnboundLocalError.
        check_cookie_url = None

        tfa = try_get(res, lambda x: x[0][0], list)
        if tfa:
            tfa_str = try_get(tfa, lambda x: x[2], compat_str)
            if tfa_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(tfa, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Strip a leading 'G-' from the code if present
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesized for the same precedence reason as the
                    # login warning above.
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # The login counts as successful only if this URL appears in the response
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with ``disable_polymer=true`` added to the query."""
        kwargs.setdefault('query', {})['disable_polymer'] = 'true'
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; do nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
261
262
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Entry extraction for pages that paginate through a "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of ``page`` and of every page reachable via "Load more".

        ``self._process_page`` turns each HTML chunk into entries. Follow-up
        chunks are fetched as JSON from the link found in the load-more
        widget; iteration ends when no such link is present or when the
        fetched chunk is blank.
        """
        widget_html = page
        chunk_html = page
        page_num = 1
        while True:
            for item in self._process_page(chunk_html):
                yield item

            link = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if link is None:
                return

            payload = self._download_json(
                'https://youtube.com/%s' % link.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            chunk_html = payload['content_html']
            if not chunk_html.strip():
                # A "Load more" button may be shown even though there are
                # no further videos to load
                return
            widget_html = payload['load_more_widget_html']
            page_num += 1
285
286
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'Youtube' url_result for every (id, title) pair found in ``content``."""
        for vid, title in self.extract_videos_from_page(content):
            yield self.url_result(vid, 'Youtube', vid, title)

    def extract_videos_from_page(self, page):
        """Return (video id, title) pairs matched by ``self._VIDEO_RE`` in ``page``.

        Each id is reported once, in order of first appearance. When an id
        shows up again, its title is filled in only if the stored one is
        still empty.
        """
        found_ids = []
        found_titles = []
        for match in re.finditer(self._VIDEO_RE, page):
            vid = match.group('id')
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): this compares the 'id' group, not the 'index'
            # group, against '0' - looks unintended; confirm against _VIDEO_RE.
            if vid == '0' and 'index' in match.groupdict():
                continue
            title = unescapeHTML(match.group('title'))
            if title:
                title = title.strip()
            if vid in found_ids:
                pos = found_ids.index(vid)
                if title and not found_titles[pos]:
                    found_titles[pos] = title
            else:
                found_ids.append(vid)
                found_titles.append(title)
        return zip(found_ids, found_titles)
311
312
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result for each distinct playlist linked in ``content``."""
        linked_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        # orderedSet removes repeated ids before results are emitted
        for list_id in orderedSet(linked_ids):
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % list_id, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download ``url`` and return a playlist result built from its paginated entries."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
326
327
328 class YoutubeIE(YoutubeBaseInfoExtractor):
329 IE_DESC = 'YouTube.com'
330 _VALID_URL = r"""(?x)^
331 (
332 (?:https?://|//) # http(s):// or protocol-independent URL
333 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
334 (?:www\.)?deturl\.com/www\.youtube\.com/|
335 (?:www\.)?pwnyoutube\.com/|
336 (?:www\.)?hooktube\.com/|
337 (?:www\.)?yourepeat\.com/|
338 tube\.majestyc\.net/|
339 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
340 (?:.*?\#/)? # handle anchor (#/) redirect urls
341 (?: # the various things that can precede the ID:
342 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
343 |(?: # or the v= param in all its forms
344 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
345 (?:\?|\#!?) # the params delimiter ? or # or #!
346 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
347 v=
348 )
349 ))
350 |(?:
351 youtu\.be| # just youtu.be/xxxx
352 vid\.plus| # or vid.plus/xxxx
353 zwearz\.com/watch| # or zwearz.com/watch/xxxx
354 )/
355 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
356 )
357 )? # all until now is optional -> you can pass the naked ID
358 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
359 (?!.*?\blist=
360 (?:
361 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
362 WL # WL are handled by the watch later IE
363 )
364 )
365 (?(1).+)? # if we found the ID, everything can follow
366 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
367 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
368 _formats = {
369 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
370 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
371 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
372 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
373 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
374 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
375 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
376 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
377 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
378 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
379 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
380 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
381 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
382 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
383 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
384 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
385 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
386 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
387
388
389 # 3D videos
390 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
391 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
392 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
393 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
394 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
395 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
396 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
397
398 # Apple HTTP Live Streaming
399 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
400 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
401 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
402 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
403 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
404 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
405 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
406 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
407
408 # DASH mp4 video
409 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
410 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
411 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
412 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
413 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
414 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
415 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
416 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
417 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
418 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
419 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
420 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
421
422 # Dash mp4 audio
423 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
424 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
425 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
426 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
427 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
428 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
429 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
430
431 # Dash webm
432 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
433 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
434 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
435 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
436 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
437 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
438 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
439 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
440 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
441 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
442 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
443 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
444 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
445 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
446 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
447 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
448 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
449 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
450 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
451 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
452 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
453 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
454
455 # Dash webm audio
456 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
457 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
458
459 # Dash webm audio with opus inside
460 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
461 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
462 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
463
464 # RTMP (unnamed)
465 '_rtmp': {'protocol': 'rtmp'},
466 }
467 _SUBTITLE_FORMATS = ('ttml', 'vtt')
468
469 _GEO_BYPASS = False
470
471 IE_NAME = 'youtube'
472 _TESTS = [
473 {
474 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
475 'info_dict': {
476 'id': 'BaW_jenozKc',
477 'ext': 'mp4',
478 'title': 'youtube-dl test video "\'/\\Ƥā†­š•',
479 'uploader': 'Philipp Hagemeister',
480 'uploader_id': 'phihag',
481 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
482 'upload_date': '20121002',
483 'license': 'Standard YouTube License',
484 'description': 'test chars: "\'/\\Ƥā†­š•\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
485 'categories': ['Science & Technology'],
486 'tags': ['youtube-dl'],
487 'duration': 10,
488 'like_count': int,
489 'dislike_count': int,
490 'start_time': 1,
491 'end_time': 9,
492 }
493 },
494 {
495 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
496 'note': 'Test generic use_cipher_signature video (#897)',
497 'info_dict': {
498 'id': 'UxxajLWwzqY',
499 'ext': 'mp4',
500 'upload_date': '20120506',
501 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
502 'alt_title': 'I Love It (feat. Charli XCX)',
503 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
504 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
505 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
506 'iconic ep', 'iconic', 'love', 'it'],
507 'duration': 180,
508 'uploader': 'Icona Pop',
509 'uploader_id': 'IconaPop',
510 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
511 'license': 'Standard YouTube License',
512 'creator': 'Icona Pop',
513 'track': 'I Love It (feat. Charli XCX)',
514 'artist': 'Icona Pop',
515 }
516 },
517 {
518 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
519 'note': 'Test VEVO video with age protection (#956)',
520 'info_dict': {
521 'id': '07FYdnEawAQ',
522 'ext': 'mp4',
523 'upload_date': '20130703',
524 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
525 'alt_title': 'Tunnel Vision',
526 'description': 'md5:64249768eec3bc4276236606ea996373',
527 'duration': 419,
528 'uploader': 'justintimberlakeVEVO',
529 'uploader_id': 'justintimberlakeVEVO',
530 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
531 'license': 'Standard YouTube License',
532 'creator': 'Justin Timberlake',
533 'track': 'Tunnel Vision',
534 'artist': 'Justin Timberlake',
535 'age_limit': 18,
536 }
537 },
538 {
539 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
540 'note': 'Embed-only video (#1746)',
541 'info_dict': {
542 'id': 'yZIXLfi8CZQ',
543 'ext': 'mp4',
544 'upload_date': '20120608',
545 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
546 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
547 'uploader': 'SET India',
548 'uploader_id': 'setindia',
549 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
550 'license': 'Standard YouTube License',
551 'age_limit': 18,
552 }
553 },
554 {
555 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
556 'note': 'Use the first video ID in the URL',
557 'info_dict': {
558 'id': 'BaW_jenozKc',
559 'ext': 'mp4',
560 'title': 'youtube-dl test video "\'/\\Ƥā†­š•',
561 'uploader': 'Philipp Hagemeister',
562 'uploader_id': 'phihag',
563 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
564 'upload_date': '20121002',
565 'license': 'Standard YouTube License',
566 'description': 'test chars: "\'/\\Ƥā†­š•\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
567 'categories': ['Science & Technology'],
568 'tags': ['youtube-dl'],
569 'duration': 10,
570 'like_count': int,
571 'dislike_count': int,
572 },
573 'params': {
574 'skip_download': True,
575 },
576 },
577 {
578 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
579 'note': '256k DASH audio (format 141) via DASH manifest',
580 'info_dict': {
581 'id': 'a9LDPn-MO4I',
582 'ext': 'm4a',
583 'upload_date': '20121002',
584 'uploader_id': '8KVIDEO',
585 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
586 'description': '',
587 'uploader': '8KVIDEO',
588 'license': 'Standard YouTube License',
589 'title': 'UHDTV TEST 8K VIDEO.mp4'
590 },
591 'params': {
592 'youtube_include_dash_manifest': True,
593 'format': '141',
594 },
595 'skip': 'format 141 not served anymore',
596 },
597 # DASH manifest with encrypted signature
598 {
599 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
600 'info_dict': {
601 'id': 'IB3lcPjvWLA',
602 'ext': 'm4a',
603 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
604 'description': 'md5:1900ed86ee514927b9e00fbead6969a5',
605 'duration': 244,
606 'uploader': 'AfrojackVEVO',
607 'uploader_id': 'AfrojackVEVO',
608 'upload_date': '20131011',
609 'license': 'Standard YouTube License',
610 },
611 'params': {
612 'youtube_include_dash_manifest': True,
613 'format': '141/bestaudio[ext=m4a]',
614 },
615 },
616 # JS player signature function name containing $
617 {
618 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
619 'info_dict': {
620 'id': 'nfWlot6h_JM',
621 'ext': 'm4a',
622 'title': 'Taylor Swift - Shake It Off',
623 'alt_title': 'Shake It Off',
624 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
625 'duration': 242,
626 'uploader': 'TaylorSwiftVEVO',
627 'uploader_id': 'TaylorSwiftVEVO',
628 'upload_date': '20140818',
629 'license': 'Standard YouTube License',
630 'creator': 'Taylor Swift',
631 },
632 'params': {
633 'youtube_include_dash_manifest': True,
634 'format': '141/bestaudio[ext=m4a]',
635 },
636 },
637 # Controversy video
638 {
639 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
640 'info_dict': {
641 'id': 'T4XJQO3qol8',
642 'ext': 'mp4',
643 'duration': 219,
644 'upload_date': '20100909',
645 'uploader': 'TJ Kirk',
646 'uploader_id': 'TheAmazingAtheist',
647 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
648 'license': 'Standard YouTube License',
649 'title': 'Burning Everyone\'s Koran',
650 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
651 }
652 },
653 # Normal age-gate video (No vevo, embed allowed)
654 {
655 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
656 'info_dict': {
657 'id': 'HtVdAasjOgU',
658 'ext': 'mp4',
659 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
660 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
661 'duration': 142,
662 'uploader': 'The Witcher',
663 'uploader_id': 'WitcherGame',
664 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
665 'upload_date': '20140605',
666 'license': 'Standard YouTube License',
667 'age_limit': 18,
668 },
669 },
670 # Age-gate video with encrypted signature
671 {
672 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
673 'info_dict': {
674 'id': '6kLq3WMV1nU',
675 'ext': 'webm',
676 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
677 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
678 'duration': 246,
679 'uploader': 'LloydVEVO',
680 'uploader_id': 'LloydVEVO',
681 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
682 'upload_date': '20110629',
683 'license': 'Standard YouTube License',
684 'age_limit': 18,
685 },
686 },
687 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
688 # YouTube Red ad is not captured for creator
689 {
690 'url': '__2ABJjxzNo',
691 'info_dict': {
692 'id': '__2ABJjxzNo',
693 'ext': 'mp4',
694 'duration': 266,
695 'upload_date': '20100430',
696 'uploader_id': 'deadmau5',
697 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
698 'creator': 'deadmau5',
699 'description': 'md5:12c56784b8032162bb936a5f76d55360',
700 'uploader': 'deadmau5',
701 'license': 'Standard YouTube License',
702 'title': 'Deadmau5 - Some Chords (HD)',
703 'alt_title': 'Some Chords',
704 },
705 'expected_warnings': [
706 'DASH manifest missing',
707 ]
708 },
709 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
710 {
711 'url': 'lqQg6PlCWgI',
712 'info_dict': {
713 'id': 'lqQg6PlCWgI',
714 'ext': 'mp4',
715 'duration': 6085,
716 'upload_date': '20150827',
717 'uploader_id': 'olympic',
718 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
719 'license': 'Standard YouTube License',
720 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
721 'uploader': 'Olympic',
722 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
723 },
724 'params': {
725 'skip_download': 'requires avconv',
726 }
727 },
728 # Non-square pixels
729 {
730 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
731 'info_dict': {
732 'id': '_b-2C3KPAM0',
733 'ext': 'mp4',
734 'stretched_ratio': 16 / 9.,
735 'duration': 85,
736 'upload_date': '20110310',
737 'uploader_id': 'AllenMeow',
738 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
739 'description': 'made by Wacom from Korea | 字幕&åŠ ę²¹ę·»é†‹ by TY\'s Allen | ę„Ÿč¬heylisa00cavey1001同å­øē†±ęƒ…ęä¾›ę¢—åŠēæ»č­Æ',
740 'uploader': 'å­«į„‹į„…',
741 'license': 'Standard YouTube License',
742 'title': '[A-made] č®Šę…‹å¦å­—å¹•ē‰ˆ å¤Ŗ妍 ęˆ‘å°±ę˜Æ這ęØ£ēš„äŗŗ',
743 },
744 },
745 # url_encoded_fmt_stream_map is empty string
746 {
747 'url': 'qEJwOuvDf7I',
748 'info_dict': {
749 'id': 'qEJwOuvDf7I',
750 'ext': 'webm',
751 'title': 'ŠžŠ±ŃŃƒŠ¶Š“ŠµŠ½ŠøŠµ суŠ“ŠµŠ±Š½Š¾Š¹ ŠæрŠ°ŠŗтŠøŠŗŠø ŠæŠ¾ Š²Ń‹Š±Š¾Ń€Š°Š¼ 14 сŠµŠ½Ń‚яŠ±Ń€Ń 2014 Š³Š¾Š“Š° Š² Š”Š°Š½Šŗт-ŠŸŠµŃ‚ŠµŃ€Š±ŃƒŃ€Š³Šµ',
752 'description': '',
753 'upload_date': '20150404',
754 'uploader_id': 'spbelect',
755 'uploader': 'ŠŠ°Š±Š»ŃŽŠ“Š°Ń‚ŠµŠ»Šø ŠŸŠµŃ‚ŠµŃ€Š±ŃƒŃ€Š³Š°',
756 },
757 'params': {
758 'skip_download': 'requires avconv',
759 },
760 'skip': 'This live event has ended.',
761 },
762 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
763 {
764 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
765 'info_dict': {
766 'id': 'FIl7x6_3R5Y',
767 'ext': 'webm',
768 'title': 'md5:7b81415841e02ecd4313668cde88737a',
769 'description': 'md5:116377fd2963b81ec4ce64b542173306',
770 'duration': 220,
771 'upload_date': '20150625',
772 'uploader_id': 'dorappi2000',
773 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
774 'uploader': 'dorappi2000',
775 'license': 'Standard YouTube License',
776 'formats': 'mincount:31',
777 },
778 'skip': 'not actual anymore',
779 },
780 # DASH manifest with segment_list
781 {
782 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
783 'md5': '8ce563a1d667b599d21064e982ab9e31',
784 'info_dict': {
785 'id': 'CsmdDsKjzN8',
786 'ext': 'mp4',
787 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
788 'uploader': 'Airtek',
789 'description': 'RetransmisiĆ³n en directo de la XVIII media maratĆ³n de Zaragoza.',
790 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
791 'license': 'Standard YouTube License',
792 'title': 'RetransmisiĆ³n XVIII Media maratĆ³n Zaragoza 2015',
793 },
794 'params': {
795 'youtube_include_dash_manifest': True,
796 'format': '135', # bestvideo
797 },
798 'skip': 'This live event has ended.',
799 },
800 {
801 # Multifeed videos (multiple cameras), URL is for Main Camera
802 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
803 'info_dict': {
804 'id': 'jqWvoWXjCVs',
805 'title': 'teamPGP: Rocket League Noob Stream',
806 'description': 'md5:dc7872fb300e143831327f1bae3af010',
807 },
808 'playlist': [{
809 'info_dict': {
810 'id': 'jqWvoWXjCVs',
811 'ext': 'mp4',
812 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
813 'description': 'md5:dc7872fb300e143831327f1bae3af010',
814 'duration': 7335,
815 'upload_date': '20150721',
816 'uploader': 'Beer Games Beer',
817 'uploader_id': 'beergamesbeer',
818 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
819 'license': 'Standard YouTube License',
820 },
821 }, {
822 'info_dict': {
823 'id': '6h8e8xoXJzg',
824 'ext': 'mp4',
825 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
826 'description': 'md5:dc7872fb300e143831327f1bae3af010',
827 'duration': 7337,
828 'upload_date': '20150721',
829 'uploader': 'Beer Games Beer',
830 'uploader_id': 'beergamesbeer',
831 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
832 'license': 'Standard YouTube License',
833 },
834 }, {
835 'info_dict': {
836 'id': 'PUOgX5z9xZw',
837 'ext': 'mp4',
838 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
839 'description': 'md5:dc7872fb300e143831327f1bae3af010',
840 'duration': 7337,
841 'upload_date': '20150721',
842 'uploader': 'Beer Games Beer',
843 'uploader_id': 'beergamesbeer',
844 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
845 'license': 'Standard YouTube License',
846 },
847 }, {
848 'info_dict': {
849 'id': 'teuwxikvS5k',
850 'ext': 'mp4',
851 'title': 'teamPGP: Rocket League Noob Stream (zim)',
852 'description': 'md5:dc7872fb300e143831327f1bae3af010',
853 'duration': 7334,
854 'upload_date': '20150721',
855 'uploader': 'Beer Games Beer',
856 'uploader_id': 'beergamesbeer',
857 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
858 'license': 'Standard YouTube License',
859 },
860 }],
861 'params': {
862 'skip_download': True,
863 },
864 },
865 {
866 # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
867 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
868 'info_dict': {
869 'id': 'gVfLd0zydlo',
870 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
871 },
872 'playlist_count': 2,
873 'skip': 'Not multifeed anymore',
874 },
875 {
876 'url': 'https://vid.plus/FlRa-iH7PGw',
877 'only_matching': True,
878 },
879 {
880 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
881 'only_matching': True,
882 },
883 {
884 # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
885 # Also tests cut-off URL expansion in video description (see
886 # https://github.com/rg3/youtube-dl/issues/1892,
887 # https://github.com/rg3/youtube-dl/issues/8164)
888 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
889 'info_dict': {
890 'id': 'lsguqyKfVQg',
891 'ext': 'mp4',
892 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
893 'alt_title': 'Dark Walk - Position Music',
894 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
895 'duration': 133,
896 'upload_date': '20151119',
897 'uploader_id': 'IronSoulElf',
898 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
899 'uploader': 'IronSoulElf',
900 'license': 'Standard YouTube License',
901 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
902 'track': 'Dark Walk - Position Music',
903 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
904 },
905 'params': {
906 'skip_download': True,
907 },
908 },
909 {
910 # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
911 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
912 'only_matching': True,
913 },
914 {
915 # Video with yt:stretch=17:0
916 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
917 'info_dict': {
918 'id': 'Q39EVAstoRM',
919 'ext': 'mp4',
920 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
921 'description': 'md5:ee18a25c350637c8faff806845bddee9',
922 'upload_date': '20151107',
923 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
924 'uploader': 'CH GAMER DROID',
925 },
926 'params': {
927 'skip_download': True,
928 },
929 'skip': 'This video does not exist.',
930 },
931 {
932 # Video licensed under Creative Commons
933 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
934 'info_dict': {
935 'id': 'M4gD1WSo5mA',
936 'ext': 'mp4',
937 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
938 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
939 'duration': 721,
940 'upload_date': '20150127',
941 'uploader_id': 'BerkmanCenter',
942 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
943 'uploader': 'The Berkman Klein Center for Internet & Society',
944 'license': 'Creative Commons Attribution license (reuse allowed)',
945 },
946 'params': {
947 'skip_download': True,
948 },
949 },
950 {
951 # Channel-like uploader_url
952 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
953 'info_dict': {
954 'id': 'eQcmzGIKrzg',
955 'ext': 'mp4',
956 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
957 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
958 'duration': 4060,
959 'upload_date': '20151119',
960 'uploader': 'Bernie Sanders',
961 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
962 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
963 'license': 'Creative Commons Attribution license (reuse allowed)',
964 },
965 'params': {
966 'skip_download': True,
967 },
968 },
969 {
970 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
971 'only_matching': True,
972 },
973 {
974 # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059)
975 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
976 'only_matching': True,
977 },
978 {
979 # Rental video preview
980 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
981 'info_dict': {
982 'id': 'uGpuVWrhIzE',
983 'ext': 'mp4',
984 'title': 'Piku - Trailer',
985 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
986 'upload_date': '20150811',
987 'uploader': 'FlixMatrix',
988 'uploader_id': 'FlixMatrixKaravan',
989 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
990 'license': 'Standard YouTube License',
991 },
992 'params': {
993 'skip_download': True,
994 },
995 'skip': 'This video is not available.',
996 },
997 {
998 # YouTube Red video with episode data
999 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1000 'info_dict': {
1001 'id': 'iqKdEhx-dD4',
1002 'ext': 'mp4',
1003 'title': 'Isolation - Mind Field (Ep 1)',
1004 'description': 'md5:25b78d2f64ae81719f5c96319889b736',
1005 'duration': 2085,
1006 'upload_date': '20170118',
1007 'uploader': 'Vsauce',
1008 'uploader_id': 'Vsauce',
1009 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1010 'license': 'Standard YouTube License',
1011 'series': 'Mind Field',
1012 'season_number': 1,
1013 'episode_number': 1,
1014 },
1015 'params': {
1016 'skip_download': True,
1017 },
1018 'expected_warnings': [
1019 'Skipping DASH manifest',
1020 ],
1021 },
1022 {
1023 # The following content has been identified by the YouTube community
1024 # as inappropriate or offensive to some audiences.
1025 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1026 'info_dict': {
1027 'id': '6SJNVb0GnPI',
1028 'ext': 'mp4',
1029 'title': 'Race Differences in Intelligence',
1030 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1031 'duration': 965,
1032 'upload_date': '20140124',
1033 'uploader': 'New Century Foundation',
1034 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1035 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1036 'license': 'Standard YouTube License',
1037 },
1038 'params': {
1039 'skip_download': True,
1040 },
1041 },
1042 {
1043 # itag 212
1044 'url': '1t24XAntNCY',
1045 'only_matching': True,
1046 },
1047 {
1048 # geo restricted to JP
1049 'url': 'sJL6WA-aGkQ',
1050 'only_matching': True,
1051 },
1052 {
1053 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1054 'only_matching': True,
1055 },
1056 ]
1057
def __init__(self, *args, **kwargs):
    """Set up the extractor with an empty in-memory player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Populated lazily by _decrypt_signature, keyed by
    # (player URL, signature cache id).
    self._player_cache = dict()
1061
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
1065
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
1069
def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
1073
def report_rtmp_download(self):
    """Tell the user that the download will go over RTMP."""
    notice = 'RTMP download detected'
    self.to_screen(notice)
1077
def _signature_cache_id(self, example_sig):
    """Describe a signature by the lengths of its dot-separated parts.

    The lengths are joined with dots, so two signatures with the same
    layout produce the same id.
    """
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
1081
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the given player.

    The player id and type (the file extension, "js" or "swf") are parsed
    out of ``player_url``.  If the downloader's filesystem cache already
    holds an index permutation for this player and signature layout, a
    function applying that permutation is returned.  Otherwise the player
    is downloaded, its decipher routine is extracted, and the permutation
    it performs is stored in the cache before the routine is returned.

    Raises ExtractorError if the player URL cannot be identified or the
    player type is neither "js" nor "swf".
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Invariant: the id is used as a cache file name and must not contain
    # any path component.
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # The cached spec is a list of source indices: output character k
        # is input character cache_spec[k].
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly instead of using ``assert False``: assertions are
        # stripped under ``python -O``, which would leave ``res`` unbound.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the decipher routine over a string whose k-th character has code
    # point k; the code points of the result reveal the permutation.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
1127
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function ``func``.

    ``func`` is applied to a probe string as long as ``example_sig`` whose
    k-th character has code point k, so the code points of the result are
    the source indices of every output character.  Those indices are
    rendered as a sum of ``s[...]`` index and slice expressions and shown
    with ``self.to_screen``.
    """
    def gen_sig_code(idxs):
        # Yield 's[...]' expressions covering idxs, collapsing runs of
        # consecutive indices (step +1 or -1) into a single slice.
        def _genslice(start, end, step):
            # Render the inclusive index run start..end as slice syntax.
            starts = '' if start == 0 else str(start)
            # The slice stop is exclusive, hence end + step; a negative
            # stop is replaced by an open-ended ':'.
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        # step is None while no run is in progress.
        step = None
        # Quell pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    # Still inside the current run.
                    continue
                # The run ended at prev: emit it and start afresh.
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new run starts at prev.
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        # Emit whatever is still pending: the last single index or the
        # run that reaches the end of idxs.
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    # Lengths of the dot-separated parts of the example signature,
    # formatted as a tuple literal.
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
1166
def _parse_sig_js(self, jscode):
    """Build a signature decipher function from the JS player source.

    The name of the player's signature function is located with a regex
    search and the function is extracted with JSInterpreter.  The returned
    callable takes the signature string and passes it to that function as
    its single argument.
    """
    name_patterns = (
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    sig_func_name = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    decipher = JSInterpreter(jscode).extract_function(sig_func_name)

    def apply_decipher(s):
        return decipher([s])

    return apply_decipher
1176
def _parse_sig_swf(self, file_contents):
    """Build a signature decipher function from the SWF player contents.

    The ``decipher`` function of the ``SignatureDecipher`` class is
    extracted with SWFInterpreter.  The returned callable takes the
    signature string and passes it to that function as its single argument.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def apply_decipher(s):
        return decipher([s])

    return apply_decipher
1183
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Decipher the encrypted signature ``s`` using the given player.

    ``player_url`` may be protocol-relative or relative to
    https://www.youtube.com; it is made absolute first.  Decipher functions
    are kept in ``self._player_cache`` per (player URL, signature layout).
    ``age_gate`` is accepted but not used here.

    Raises ExtractorError if ``player_url`` is None or if anything fails
    while obtaining or applying the decipher function.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Turn the player URL into an absolute one.
    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif re.match(r'https?://', player_url) is None:
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        # Any failure is reported as an ExtractorError carrying the
        # formatted traceback.
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(), cause=e)
1210
def _get_subtitles(self, video_id, webpage):
    """Return the available subtitle tracks for ``video_id``.

    The track list is downloaded from the timedtext endpoint.  The result
    maps each language code to a list of ``{'url', 'ext'}`` dicts, one per
    entry of ``self._SUBTITLE_FORMATS``; only the first track seen for a
    language is used.  An empty dict is returned, with a warning, when the
    list cannot be downloaded or contains no tracks.
    """
    try:
        subs_doc = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(
            'unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1242
1243 def _get_ytplayer_config(self, video_id, webpage):
1244 patterns = (
1245 # User data may contain arbitrary character sequences that may affect
1246 # JSON extraction with regex, e.g. when '};' is contained the second
1247 # regex won't capture the whole JSON. Yet working around by trying more
1248 # concrete regex first keeping in mind proper quoted string handling
1249 # to be implemented in future that will replace this workaround (see
1250 # https://github.com/rg3/youtube-dl/issues/7468,
1251 # https://github.com/rg3/youtube-dl/pull/7599)
1252 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1253 r';ytplayer\.config\s*=\s*({.+?});',
1254 )
1255 config = self._search_regex(
1256 patterns, webpage, 'ytplayer.config', default=None)
1257 if config:
1258 return self._parse_json(
1259 uppercase_escape(config), video_id, fatal=False)
1260
def _get_automatic_captions(self, video_id, webpage):
    """Return the automatic captions available for ``video_id``.

    The already downloaded ``webpage`` is passed in so that the player
    config (which holds the caption URLs) does not have to be fetched
    again.

    Three sources are tried in order: the ``ttsurl`` player argument, the
    ``player_response`` argument, and finally the ``caption_tracks`` /
    ``caption_translation_languages`` arguments.

    Returns a dict mapping language codes to lists of ``{'url', 'ext'}``
    dicts, one per entry of ``self._SUBTITLE_FORMATS``.  An empty dict is
    returned, with a warning, when the player config is missing or any
    expected field is absent.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first 'track' node supplies the source language and kind
            # used for every translated caption URL below.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            sub_lang_list = {}
            # One entry per 'target' node, i.e. per translation language.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    params = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list

        def make_captions(sub_url, sub_langs):
            # Build the captions dict from a base caption URL by setting
            # the 'tlang' and 'fmt' query parameters for every
            # language/format combination.
            parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
            caption_qs = compat_parse_qs(parsed_sub_url.query)
            captions = {}
            for sub_lang in sub_langs:
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    caption_qs.update({
                        'tlang': [sub_lang],
                        'fmt': [ext],
                    })
                    sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                        query=compat_urllib_parse_urlencode(caption_qs, True)))
                    sub_formats.append({
                        'url': sub_url,
                        'ext': ext,
                    })
                captions[sub_lang] = sub_formats
            return captions

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        # Only a string player_response is handled here; it is parsed as
        # JSON (non-fatally).
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                # The first caption track provides the base URL for all
                # translation languages.
                base_url = renderer['captionTracks'][0]['baseUrl']
                sub_lang_list = []
                for lang in renderer['translationLanguages']:
                    lang_code = lang.get('languageCode')
                    if lang_code:
                        sub_lang_list.append(lang_code)
                return make_captions(base_url, sub_lang_list)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        # The base URL is the 'u' field of the first comma-separated track.
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        sub_lang_list = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            sub_lang = lang_qs.get('lc', [None])[0]
            if sub_lang:
                sub_lang_list.append(sub_lang)
        return make_captions(caption_url, sub_lang_list)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1362
1363 def _mark_watched(self, video_id, video_info):
1364 playback_url = video_info.get('videostats_playback_base_url', [None])[0]
1365 if not playback_url:
1366 return
1367 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1368 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1369
1370 # cpn generation algorithm is reverse engineered from base.js.
1371 # In fact it works even with dummy cpn.
1372 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1373 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1374
1375 qs.update({
1376 'ver': ['2'],
1377 'cpn': [cpn],
1378 })
1379 playback_url = compat_urlparse.urlunparse(
1380 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1381
1382 self._download_webpage(
1383 playback_url, video_id, 'Marking watched',
1384 'Unable to mark watched', fatal=False)
1385
@staticmethod
def _extract_urls(webpage):
    """Collect YouTube embeds found in the HTML of ``webpage``.

    Three kinds of embed are recognised: player URLs referenced from
    iframe/embed/object markup, ``data-video-url`` attributes or SWFObject
    calls; lazyYT ``data-youtube-id`` attributes; and the ``data-video_id``
    attribute of the Wordpress "YouTube Video Importer" plugin.

    Returns a list of strings (player URLs or bare video ids), in that
    order; the list is empty when nothing is found.
    """
    # Embedded YouTube player.
    # The embedSWF alternative accepts the call form ``embedSWF("...")``
    # as well as a key-style ``embedSWF: "..."``.  It used to read
    # ``embedSWF\(?:\s*``, which requires a literal colon and therefore
    # never matched a plain ``swfobject.embedSWF("url", ...)`` call.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF(?:\(|:)\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(youtube_id)
        for youtube_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # findall returns one tuple per match; the video id is the last group.
    entries.extend(m[-1] for m in matches)

    return entries
1417
@staticmethod
def _extract_url(webpage):
    """Return the first YouTube embed found in ``webpage``, or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1422
@classmethod
def extract_id(cls, url):
    """Return the video id contained in ``url``.

    The URL is matched against ``cls._VALID_URL`` in verbose mode and the
    second group of the match is the id.  Raises ExtractorError when the
    URL does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1430
1431 def _extract_annotations(self, video_id):
1432 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1433 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1434
1435 @staticmethod
1436 def _extract_chapters(description, duration):
1437 if not description:
1438 return None
1439 chapter_lines = re.findall(
1440 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1441 description)
1442 if not chapter_lines:
1443 return None
1444 chapters = []
1445 for next_num, (chapter_line, time_point) in enumerate(
1446 chapter_lines, start=1):
1447 start_time = parse_duration(time_point)
1448 if start_time is None:
1449 continue
1450 if start_time > duration:
1451 break
1452 end_time = (duration if next_num == len(chapter_lines)
1453 else parse_duration(chapter_lines[next_num][1]))
1454 if end_time is None:
1455 continue
1456 if end_time > duration:
1457 end_time = duration
1458 if start_time > end_time:
1459 break
1460 chapter_title = re.sub(
1461 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1462 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1463 chapters.append({
1464 'start_time': start_time,
1465 'end_time': end_time,
1466 'title': chapter_title,
1467 })
1468 return chapters
1469
1470 def _real_extract(self, url):
1471 url, smuggled_data = unsmuggle_url(url, {})
1472
1473 proto = (
1474 'http' if self._downloader.params.get('prefer_insecure', False)
1475 else 'https')
1476
1477 start_time = None
1478 end_time = None
1479 parsed_url = compat_urllib_parse_urlparse(url)
1480 for component in [parsed_url.fragment, parsed_url.query]:
1481 query = compat_parse_qs(component)
1482 if start_time is None and 't' in query:
1483 start_time = parse_duration(query['t'][0])
1484 if start_time is None and 'start' in query:
1485 start_time = parse_duration(query['start'][0])
1486 if end_time is None and 'end' in query:
1487 end_time = parse_duration(query['end'][0])
1488
1489 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1490 mobj = re.search(self._NEXT_URL_RE, url)
1491 if mobj:
1492 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1493 video_id = self.extract_id(url)
1494
1495 # Get video webpage
1496 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1497 video_webpage = self._download_webpage(url, video_id)
1498
1499 # Attempt to extract SWF player URL
1500 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1501 if mobj is not None:
1502 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1503 else:
1504 player_url = None
1505
1506 dash_mpds = []
1507
1508 def add_dash_mpd(video_info):
1509 dash_mpd = video_info.get('dashmpd')
1510 if dash_mpd and dash_mpd[0] not in dash_mpds:
1511 dash_mpds.append(dash_mpd[0])
1512
1513 is_live = None
1514 view_count = None
1515
1516 def extract_view_count(v_info):
1517 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1518
1519 # Get video info
1520 embed_webpage = None
1521 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1522 age_gate = True
1523 # We simulate the access to the video from www.youtube.com/v/{video_id}
1524 # this can be viewed without login into Youtube
1525 url = proto + '://www.youtube.com/embed/%s' % video_id
1526 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1527 data = compat_urllib_parse_urlencode({
1528 'video_id': video_id,
1529 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1530 'sts': self._search_regex(
1531 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1532 })
1533 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1534 video_info_webpage = self._download_webpage(
1535 video_info_url, video_id,
1536 note='Refetching age-gated info webpage',
1537 errnote='unable to download video info webpage')
1538 video_info = compat_parse_qs(video_info_webpage)
1539 add_dash_mpd(video_info)
1540 else:
1541 age_gate = False
1542 video_info = None
1543 sts = None
1544 # Try looking directly into the video webpage
1545 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1546 if ytplayer_config:
1547 args = ytplayer_config['args']
1548 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1549 # Convert to the same format returned by compat_parse_qs
1550 video_info = dict((k, [v]) for k, v in args.items())
1551 add_dash_mpd(video_info)
1552 # Rental video is not rented but preview is available (e.g.
1553 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1554 # https://github.com/rg3/youtube-dl/issues/10532)
1555 if not video_info and args.get('ypc_vid'):
1556 return self.url_result(
1557 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1558 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1559 is_live = True
1560 sts = ytplayer_config.get('sts')
1561 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1562 # We also try looking in get_video_info since it may contain different dashmpd
1563 # URL that points to a DASH manifest with possibly different itag set (some itags
1564 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1565 # manifest pointed by get_video_info's dashmpd).
1566 # The general idea is to take a union of itags of both DASH manifests (for example
1567 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
1568 self.report_video_info_webpage_download(video_id)
1569 for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
1570 query = {
1571 'video_id': video_id,
1572 'ps': 'default',
1573 'eurl': '',
1574 'gl': 'US',
1575 'hl': 'en',
1576 }
1577 if el:
1578 query['el'] = el
1579 if sts:
1580 query['sts'] = sts
1581 video_info_webpage = self._download_webpage(
1582 '%s://www.youtube.com/get_video_info' % proto,
1583 video_id, note=False,
1584 errnote='unable to download video info webpage',
1585 fatal=False, query=query)
1586 if not video_info_webpage:
1587 continue
1588 get_video_info = compat_parse_qs(video_info_webpage)
1589 add_dash_mpd(get_video_info)
1590 if view_count is None:
1591 view_count = extract_view_count(get_video_info)
1592 if not video_info:
1593 video_info = get_video_info
1594 if 'token' in get_video_info:
1595 # Different get_video_info requests may report different results, e.g.
1596 # some may report video unavailability, but some may serve it without
1597 # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
1598 # the original webpage as well as el=info and el=embedded get_video_info
1599 # requests report video unavailability due to geo restriction while
1600 # el=detailpage succeeds and returns valid data). This is probably
1601 # due to YouTube measures against IP ranges of hosting providers.
1602 # Working around by preferring the first succeeded video_info containing
1603 # the token if no such video_info yet was found.
1604 if 'token' not in video_info:
1605 video_info = get_video_info
1606 break
1607
1608 def extract_unavailable_message():
1609 return self._html_search_regex(
1610 r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
1611 video_webpage, 'unavailable message', default=None)
1612
1613 if 'token' not in video_info:
1614 if 'reason' in video_info:
1615 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1616 regions_allowed = self._html_search_meta(
1617 'regionsAllowed', video_webpage, default=None)
1618 countries = regions_allowed.split(',') if regions_allowed else None
1619 self.raise_geo_restricted(
1620 msg=video_info['reason'][0], countries=countries)
1621 reason = video_info['reason'][0]
1622 if 'Invalid parameters' in reason:
1623 unavailable_message = extract_unavailable_message()
1624 if unavailable_message:
1625 reason = unavailable_message
1626 raise ExtractorError(
1627 'YouTube said: %s' % reason,
1628 expected=True, video_id=video_id)
1629 else:
1630 raise ExtractorError(
1631 '"token" parameter not in video info for unknown reason',
1632 video_id=video_id)
1633
1634 # title
1635 if 'title' in video_info:
1636 video_title = video_info['title'][0]
1637 else:
1638 self._downloader.report_warning('Unable to extract video title')
1639 video_title = '_'
1640
1641 # description
1642 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1643 if video_description:
1644
1645 def replace_url(m):
1646 redir_url = compat_urlparse.urljoin(url, m.group(1))
1647 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1648 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1649 qs = compat_parse_qs(parsed_redir_url.query)
1650 q = qs.get('q')
1651 if q and q[0]:
1652 return q[0]
1653 return redir_url
1654
1655 description_original = video_description = re.sub(r'''(?x)
1656 <a\s+
1657 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1658 (?:title|href)="([^"]+)"\s+
1659 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1660 class="[^"]*"[^>]*>
1661 [^<]+\.{3}\s*
1662 </a>
1663 ''', replace_url, video_description)
1664 video_description = clean_html(video_description)
1665 else:
1666 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1667 if fd_mobj:
1668 video_description = unescapeHTML(fd_mobj.group(1))
1669 else:
1670 video_description = ''
1671
1672 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1673 if not self._downloader.params.get('noplaylist'):
1674 entries = []
1675 feed_ids = []
1676 multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
1677 for feed in multifeed_metadata_list.split(','):
1678 # Unquote should take place before split on comma (,) since textual
1679 # fields may contain comma as well (see
1680 # https://github.com/rg3/youtube-dl/issues/8536)
1681 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1682 entries.append({
1683 '_type': 'url_transparent',
1684 'ie_key': 'Youtube',
1685 'url': smuggle_url(
1686 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1687 {'force_singlefeed': True}),
1688 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1689 })
1690 feed_ids.append(feed_data['id'][0])
1691 self.to_screen(
1692 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1693 % (', '.join(feed_ids), video_id))
1694 return self.playlist_result(entries, video_id, video_title, video_description)
1695 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1696
1697 if view_count is None:
1698 view_count = extract_view_count(video_info)
1699
1700 # Check for "rental" videos
1701 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1702 raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
1703
1704 def _extract_filesize(media_url):
1705 return int_or_none(self._search_regex(
1706 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1707
1708 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1709 self.report_rtmp_download()
1710 formats = [{
1711 'format_id': '_rtmp',
1712 'protocol': 'rtmp',
1713 'url': video_info['conn'][0],
1714 'player_url': player_url,
1715 }]
1716 elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1717 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1718 if 'rtmpe%3Dyes' in encoded_url_map:
1719 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1720 formats_spec = {}
1721 fmt_list = video_info.get('fmt_list', [''])[0]
1722 if fmt_list:
1723 for fmt in fmt_list.split(','):
1724 spec = fmt.split('/')
1725 if len(spec) > 1:
1726 width_height = spec[1].split('x')
1727 if len(width_height) == 2:
1728 formats_spec[spec[0]] = {
1729 'resolution': spec[1],
1730 'width': int_or_none(width_height[0]),
1731 'height': int_or_none(width_height[1]),
1732 }
1733 q = qualities(['small', 'medium', 'hd720'])
1734 formats = []
1735 for url_data_str in encoded_url_map.split(','):
1736 url_data = compat_parse_qs(url_data_str)
1737 if 'itag' not in url_data or 'url' not in url_data:
1738 continue
1739 format_id = url_data['itag'][0]
1740 url = url_data['url'][0]
1741
1742 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
1743 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1744 jsplayer_url_json = self._search_regex(
1745 ASSETS_RE,
1746 embed_webpage if age_gate else video_webpage,
1747 'JS player URL (1)', default=None)
1748 if not jsplayer_url_json and not age_gate:
1749 # We need the embed website after all
1750 if embed_webpage is None:
1751 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1752 embed_webpage = self._download_webpage(
1753 embed_url, video_id, 'Downloading embed webpage')
1754 jsplayer_url_json = self._search_regex(
1755 ASSETS_RE, embed_webpage, 'JS player URL')
1756
1757 player_url = json.loads(jsplayer_url_json)
1758 if player_url is None:
1759 player_url_json = self._search_regex(
1760 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1761 video_webpage, 'age gate player URL')
1762 player_url = json.loads(player_url_json)
1763
1764 if 'sig' in url_data:
1765 url += '&signature=' + url_data['sig'][0]
1766 elif 's' in url_data:
1767 encrypted_sig = url_data['s'][0]
1768
1769 if self._downloader.params.get('verbose'):
1770 if player_url is None:
1771 player_version = 'unknown'
1772 player_desc = 'unknown'
1773 else:
1774 if player_url.endswith('swf'):
1775 player_version = self._search_regex(
1776 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1777 'flash player', fatal=False)
1778 player_desc = 'flash player %s' % player_version
1779 else:
1780 player_version = self._search_regex(
1781 [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
1782 r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
1783 player_url,
1784 'html5 player', fatal=False)
1785 player_desc = 'html5 player %s' % player_version
1786
1787 parts_sizes = self._signature_cache_id(encrypted_sig)
1788 self.to_screen('{%s} signature length %s, %s' %
1789 (format_id, parts_sizes, player_desc))
1790
1791 signature = self._decrypt_signature(
1792 encrypted_sig, video_id, player_url, age_gate)
1793 url += '&signature=' + signature
1794 if 'ratebypass' not in url:
1795 url += '&ratebypass=yes'
1796
1797 dct = {
1798 'format_id': format_id,
1799 'url': url,
1800 'player_url': player_url,
1801 }
1802 if format_id in self._formats:
1803 dct.update(self._formats[format_id])
1804 if format_id in formats_spec:
1805 dct.update(formats_spec[format_id])
1806
1807 # Some itags are not included in DASH manifest thus corresponding formats will
1808 # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
1809 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
1810 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1811 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
1812
1813 filesize = int_or_none(url_data.get(
1814 'clen', [None])[0]) or _extract_filesize(url)
1815
1816 quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]
1817
1818 more_fields = {
1819 'filesize': filesize,
1820 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
1821 'width': width,
1822 'height': height,
1823 'fps': int_or_none(url_data.get('fps', [None])[0]),
1824 'format_note': quality,
1825 'quality': q(quality),
1826 }
1827 for key, value in more_fields.items():
1828 if value:
1829 dct[key] = value
1830 type_ = url_data.get('type', [None])[0]
1831 if type_:
1832 type_split = type_.split(';')
1833 kind_ext = type_split[0].split('/')
1834 if len(kind_ext) == 2:
1835 kind, _ = kind_ext
1836 dct['ext'] = mimetype2ext(type_split[0])
1837 if kind in ('audio', 'video'):
1838 codecs = None
1839 for mobj in re.finditer(
1840 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
1841 if mobj.group('key') == 'codecs':
1842 codecs = mobj.group('val')
1843 break
1844 if codecs:
1845 dct.update(parse_codecs(codecs))
1846 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
1847 dct['downloader_options'] = {
1848 # Youtube throttles chunks >~10M
1849 'http_chunk_size': 10485760,
1850 }
1851 formats.append(dct)
1852 elif video_info.get('hlsvp'):
1853 manifest_url = video_info['hlsvp'][0]
1854 formats = []
1855 m3u8_formats = self._extract_m3u8_formats(
1856 manifest_url, video_id, 'mp4', fatal=False)
1857 for a_format in m3u8_formats:
1858 itag = self._search_regex(
1859 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
1860 if itag:
1861 a_format['format_id'] = itag
1862 if itag in self._formats:
1863 dct = self._formats[itag].copy()
1864 dct.update(a_format)
1865 a_format = dct
1866 a_format['player_url'] = player_url
1867 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
1868 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
1869 formats.append(a_format)
1870 else:
1871 error_message = clean_html(video_info.get('reason', [None])[0])
1872 if not error_message:
1873 error_message = extract_unavailable_message()
1874 if error_message:
1875 raise ExtractorError(error_message, expected=True)
1876 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1877
1878 # uploader
1879 video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
1880 if video_uploader:
1881 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
1882 else:
1883 self._downloader.report_warning('unable to extract uploader name')
1884
1885 # uploader_id
1886 video_uploader_id = None
1887 video_uploader_url = None
1888 mobj = re.search(
1889 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
1890 video_webpage)
1891 if mobj is not None:
1892 video_uploader_id = mobj.group('uploader_id')
1893 video_uploader_url = mobj.group('uploader_url')
1894 else:
1895 self._downloader.report_warning('unable to extract uploader nickname')
1896
1897 # thumbnail image
1898 # We try first to get a high quality image:
1899 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1900 video_webpage, re.DOTALL)
1901 if m_thumb is not None:
1902 video_thumbnail = m_thumb.group(1)
1903 elif 'thumbnail_url' not in video_info:
1904 self._downloader.report_warning('unable to extract video thumbnail')
1905 video_thumbnail = None
1906 else: # don't panic if we can't find it
1907 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
1908
1909 # upload date
1910 upload_date = self._html_search_meta(
1911 'datePublished', video_webpage, 'upload date', default=None)
1912 if not upload_date:
1913 upload_date = self._search_regex(
1914 [r'(?s)id="eow-date.*?>(.*?)</span>',
1915 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
1916 video_webpage, 'upload date', default=None)
1917 upload_date = unified_strdate(upload_date)
1918
1919 video_license = self._html_search_regex(
1920 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
1921 video_webpage, 'license', default=None)
1922
1923 m_music = re.search(
1924 r'''(?x)
1925 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
1926 <ul[^>]*>\s*
1927 <li>(?P<title>.+?)
1928 by (?P<creator>.+?)
1929 (?:
1930 \(.+?\)|
1931 <a[^>]*
1932 (?:
1933 \bhref=["\']/red[^>]*>| # drop possible
1934 >\s*Listen ad-free with YouTube Red # YouTube Red ad
1935 )
1936 .*?
1937 )?</li
1938 ''',
1939 video_webpage)
1940 if m_music:
1941 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
1942 video_creator = clean_html(m_music.group('creator'))
1943 else:
1944 video_alt_title = video_creator = None
1945
1946 def extract_meta(field):
1947 return self._html_search_regex(
1948 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
1949 video_webpage, field, default=None)
1950
1951 track = extract_meta('Song')
1952 artist = extract_meta('Artist')
1953
1954 m_episode = re.search(
1955 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*ā€¢\s*E(?P<episode>\d+)</span>',
1956 video_webpage)
1957 if m_episode:
1958 series = m_episode.group('series')
1959 season_number = int(m_episode.group('season'))
1960 episode_number = int(m_episode.group('episode'))
1961 else:
1962 series = season_number = episode_number = None
1963
1964 m_cat_container = self._search_regex(
1965 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1966 video_webpage, 'categories', default=None)
1967 if m_cat_container:
1968 category = self._html_search_regex(
1969 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1970 default=None)
1971 video_categories = None if category is None else [category]
1972 else:
1973 video_categories = None
1974
1975 video_tags = [
1976 unescapeHTML(m.group('content'))
1977 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1978
1979 def _extract_count(count_name):
1980 return str_to_int(self._search_regex(
1981 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
1982 % re.escape(count_name),
1983 video_webpage, count_name, default=None))
1984
1985 like_count = _extract_count('like')
1986 dislike_count = _extract_count('dislike')
1987
1988 # subtitles
1989 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1990 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
1991
1992 video_duration = try_get(
1993 video_info, lambda x: int_or_none(x['length_seconds'][0]))
1994 if not video_duration:
1995 video_duration = parse_duration(self._html_search_meta(
1996 'duration', video_webpage, 'video duration'))
1997
1998 # annotations
1999 video_annotations = None
2000 if self._downloader.params.get('writeannotations', False):
2001 video_annotations = self._extract_annotations(video_id)
2002
2003 chapters = self._extract_chapters(description_original, video_duration)
2004
2005 # Look for the DASH manifest
2006 if self._downloader.params.get('youtube_include_dash_manifest', True):
2007 dash_mpd_fatal = True
2008 for mpd_url in dash_mpds:
2009 dash_formats = {}
2010 try:
2011 def decrypt_sig(mobj):
2012 s = mobj.group(1)
2013 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2014 return '/signature/%s' % dec_s
2015
2016 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2017
2018 for df in self._extract_mpd_formats(
2019 mpd_url, video_id, fatal=dash_mpd_fatal,
2020 formats_dict=self._formats):
2021 if not df.get('filesize'):
2022 df['filesize'] = _extract_filesize(df['url'])
2023 # Do not overwrite DASH format found in some previous DASH manifest
2024 if df['format_id'] not in dash_formats:
2025 dash_formats[df['format_id']] = df
2026 # Additional DASH manifests may end up in HTTP Error 403 therefore
2027 # allow them to fail without bug report message if we already have
2028 # some DASH manifest succeeded. This is temporary workaround to reduce
2029 # burst of bug reports until we figure out the reason and whether it
2030 # can be fixed at all.
2031 dash_mpd_fatal = False
2032 except (ExtractorError, KeyError) as e:
2033 self.report_warning(
2034 'Skipping DASH manifest: %r' % e, video_id)
2035 if dash_formats:
2036 # Remove the formats we found through non-DASH, they
2037 # contain less info and it can be wrong, because we use
2038 # fixed values (for example the resolution). See
2039 # https://github.com/rg3/youtube-dl/issues/5774 for an
2040 # example.
2041 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2042 formats.extend(dash_formats.values())
2043
2044 # Check for malformed aspect ratio
2045 stretched_m = re.search(
2046 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2047 video_webpage)
2048 if stretched_m:
2049 w = float(stretched_m.group('w'))
2050 h = float(stretched_m.group('h'))
2051 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2052 # We will only process correct ratios.
2053 if w > 0 and h > 0:
2054 ratio = w / h
2055 for f in formats:
2056 if f.get('vcodec') != 'none':
2057 f['stretched_ratio'] = ratio
2058
2059 self._sort_formats(formats)
2060
2061 self.mark_watched(video_id, video_info)
2062
2063 return {
2064 'id': video_id,
2065 'uploader': video_uploader,
2066 'uploader_id': video_uploader_id,
2067 'uploader_url': video_uploader_url,
2068 'upload_date': upload_date,
2069 'license': video_license,
2070 'creator': video_creator or artist,
2071 'title': video_title,
2072 'alt_title': video_alt_title or track,
2073 'thumbnail': video_thumbnail,
2074 'description': video_description,
2075 'categories': video_categories,
2076 'tags': video_tags,
2077 'subtitles': video_subtitles,
2078 'automatic_captions': automatic_captions,
2079 'duration': video_duration,
2080 'age_limit': 18 if age_gate else 0,
2081 'annotations': video_annotations,
2082 'chapters': chapters,
2083 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2084 'view_count': view_count,
2085 'like_count': like_count,
2086 'dislike_count': dislike_count,
2087 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
2088 'formats': formats,
2089 'is_live': is_live,
2090 'start_time': start_time,
2091 'end_time': end_time,
2092 'series': series,
2093 'season_number': season_number,
2094 'episode_number': episode_number,
2095 'track': track,
2096 'artist': artist,
2097 }
2098
2099
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube playlist URLs and bare playlist ids.

    Playlist ids starting with ``RD``, ``UL`` or ``PU`` are handled as mixes
    (see ``_extract_mix``); everything else is read from the regular playlist
    page (see ``_extract_playlist``).  When the URL also carries a video id,
    ``--no-playlist`` or an empty playlist makes the extractor hand over to
    the plain video extractor instead.
    """
    IE_DESC = 'YouTube.com playlists'
    # Group 1 captures the id found in a URL, group 2 a bare playlist id.
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            youtube\.com/
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
        'skip': 'This playlist is private',
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            # Title restored from a mis-decoded (mojibake) UTF-8 literal.
            'title': '2017 華語最新單曲 (2/24更新)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'license': 'Standard YouTube License',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }]

    def _real_initialize(self):
        """Run the shared login routine before any extraction happens."""
        self._login()

    def _extract_mix(self, playlist_id):
        """Build a playlist result for a mix by paging through watch pages.

        Each watch page is requested with the last video id seen so far; the
        loop stops as soon as a page contributes no video id that was not
        already collected.  The title is taken from the last downloaded page.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        collected_ids = []
        seed_id = playlist_id[-11:]
        # Loop-invariant: only the seed video in the URL changes per page.
        mix_entry_re = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        for page_num in itertools.count(1):
            page_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
            webpage = self._download_webpage(
                page_url, playlist_id,
                'Downloading page {0} of Youtube mix'.format(page_num))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            fresh_ids = [
                candidate
                for candidate in orderedSet(re.findall(mix_entry_re, webpage))
                if candidate not in collected_ids]
            if not fresh_ids:
                break
            collected_ids.extend(fresh_ids)
            seed_id = collected_ids[-1]

        url_results = self._ids_to_results(collected_ids)

        # Try the known title containers in order of preference; if none is
        # truthy the value found for the last class name is kept as-is.
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break

        return self.playlist_result(
            url_results, playlist_id, clean_html(title_span))

    def _extract_playlist(self, playlist_id):
        """Download the playlist page and turn it into a playlist result.

        Returns a ``(has_videos, playlist)`` tuple.  ``has_videos`` is only
        False when the page has no playlist title and yields no entry at all.

        Raises ExtractorError (expected) when the page carries an alert
        saying the playlist does not exist, is private, or that the request
        parameters are invalid.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604)
        for raw_alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            alert = raw_alert.strip()
            # Check if the playlist exists or is private
            unavailable = re.match(
                r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
            if unavailable:
                reason = unavailable.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            if re.match(r'[^<]*Invalid parameters[^<]*', alert):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            if re.match(r'[^<]*Choose your language[^<]*', alert):
                # Language chooser banner, not an error.
                continue
            self.report_warning('Youtube gives an alert message: ' + alert)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        uploader_id = uploader_url = None
        uploader_link = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if uploader_link:
            uploader_id = uploader_link.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, uploader_link.group('path'))

        has_videos = True
        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        # A fresh entries generator is created for the result; the one probed
        # above has already been advanced.
        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Detect a video id in a playlist URL and honour ``noplaylist``.

        Returns ``(video_id, result)``: ``result`` is a url result for the
        single video when the ``noplaylist`` parameter is set, otherwise
        None.  Both items are None when the URL carries no video id.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if not video_id:
            return None, None

        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    def _real_extract(self, url):
        """Extract a playlist, a mix, or fall back to the single video.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/rg3/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
2391
2392
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for ``/channel/<id>`` URLs.

    Whenever a ``UC...`` channel id can be read from the channel page, the
    work is delegated to the playlist extractor via the matching ``UU...``
    uploads playlist; otherwise the channel's video listing is parsed
    directly.
    """
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that the playlists or live extractors accept."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the videos-listing URL for ``channel_id`` (``url`` is unused here)."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract the channel, preferring its uploads playlist.

        Raises ExtractorError (expected) when the listing yields no entry and
        the page shows an alert message.
        """
        channel_id = self._match_id(url)
        videos_url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        probe_page = self._download_webpage(
            videos_url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = False
        if probe_page is not False:
            channel_playlist_id = self._html_search_meta(
                'channelId', probe_page, 'channel id', default=None)
            if not channel_playlist_id:
                # Second chance: the app deep-link meta tags embed the id.
                app_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    probe_page, 'channel url', default=None)
                if app_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        app_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the 'UC' prefix for 'UU' to address the uploads playlist.
            uploads_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(videos_url, '/playlist?list=%s' % uploads_id), 'YoutubePlaylist')

        listing_page = self._download_webpage(videos_url, channel_id, 'Downloading page #1')
        is_autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', listing_page)

        if is_autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            single_page_entries = []
            for video_id, video_title in self.extract_videos_from_page(listing_page):
                single_page_entries.append(self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title))
            return self.playlist_result(single_page_entries, channel_id)

        # Probe for at least one entry so that an alert shown on an empty
        # listing can be reported as the failure reason.
        try:
            next(self._entries(listing_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                listing_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(listing_page, channel_id), channel_id)
2482
2483
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for user pages (``/user/<name>``, ``/c/<name>``, bare
    ``/<name>``) and the ``ytuser:<name>`` keyword.

    Extraction itself is inherited from YoutubeChannelIE; this class only
    changes URL matching and how the videos-listing URL is built.
    """
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only if no other Youtube*IE in this module does."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex here is too permissive and would match anyway.
        for name, klass in globals().items():
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the videos-listing URL, keeping a ``user``/``c`` path segment.

        URLs without such a segment (bare names, ``ytuser:``) use ``user``.
        """
        matched = re.match(self._VALID_URL, url)
        path_kind = matched.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, matched.group('id'))
2534
2535
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for ".../live" URLs of a user, channel or custom URL."""

    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a /live URL to a video result, else to the base URL.

        The page is downloaded non-fatally. If its og:type starts with
        'video' and its 'videoId' meta value is an 11-character id, that
        video is delegated to YoutubeIE; in every other case the URL with
        the trailing "/live" removed is returned as a url_result.
        """
        match = re.match(self._VALID_URL, url)
        channel_id, base_url = match.group('id', 'base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)

        is_video_page = page_type.startswith('video')
        if is_video_page and video_id and re.match(
                r'^[0-9A-Za-z_-]{11}$', video_id):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
2586
2587
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the "playlists" tab of a YouTube user or channel.

    Defines only the URL pattern and test data; no extraction method is
    overridden in this class.
    """

    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            # Cyrillic channel title; the literal must stay real UTF-8 text
            # (not its mis-decoded byte sequence) for the comparison to hold.
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]
2616
2617
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Playlist base class specialised with the video regex for search pages."""

    # Captures the 11-character video id from a "/watch?v=" href and,
    # when a title="..." attribute follows in the same tag, the title.
    _VIDEO_RE = (
        r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})'
        r'(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?')
2620
2621
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Extractor for "ytsearch" keyword queries against YouTube search."""

    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra query-string parameters merged into the results URL
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded one after another (following the
        "Next" link of each page) until n entries have been collected, a
        page yields no entries, or there is no next link.

        Returns the playlist_result of at most n entries, using query as
        the playlist id.
        Raises ExtractorError (expected) when a page contains a
        'class="search-message' marker.
        """
        videos = []

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as n entries are available: ">=" rather than ">"
            # avoids downloading one more page whose entries would all be
            # cut off by the truncation below.
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # The last page may have pushed the total past n
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
2670
2671
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE that adds a sort parameter to the query."""

    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Merged into the results URL query string by the parent class
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
2677
2678
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extractor for literal youtube.com/results?... search URLs."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the search page at url and return its entries.

        The URL-decoded value of the search_query / q parameter is used
        both as the download id and as the playlist title.
        """
        match = re.match(self._VALID_URL, url)
        raw_query = match.group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)
2699
2700
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for youtube.com/show/<id> pages."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the parent extractor with the show's /playlists URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
2718
2719
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass's _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _entries(self, page):
        """Yield one result per video id not seen before in the feed.

        page is the HTML of the first feed page. Further portions are
        fetched through the "load more" link for as long as one is
        present and the latest portion contributed at least one new id.
        """
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        # A set gives constant-time membership tests; the order of the
        # yielded entries comes from new_ids, not from this container.
        seen_ids = set()
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in seen_ids]
            if not new_ids:
                break

            seen_ids.update(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page for _FEED_NAME and return it as a playlist.

        url itself is not used; the feed URL is built from _FEED_NAME.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
2771
2772
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the "WL" (watch later) playlist."""

    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single-video result if there is one, else the playlist.

        Only the second item of each helper's returned pair is used.
        """
        _first, single_video = self._check_download_just_video(url, 'WL')
        if not single_video:
            _first, playlist = self._extract_playlist('WL')
            return playlist
        return single_video
2792
2793
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the "list=" id on the favourites page and delegate to it.

        url itself is not used; the fixed my_favorites page is fetched.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
2804
2805
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""

    # Values consumed by the YoutubeFeedsInfoExtractor base class
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'

    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
2811
2812
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""

    # Values consumed by the YoutubeFeedsInfoExtractor base class
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'

    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
2818
2819
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""

    # Values consumed by the YoutubeFeedsInfoExtractor base class
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'

    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
2825
2826
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs lacking a video id and reports why."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose-mode pattern: the whitespace and line breaks inside it are
    # ignored by the regex engine.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
2874
2875
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose v= value has only 1 to 10 id characters."""

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)