Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youtube.py
Imported Upstream version 2014.11.21
[youtubedl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import traceback
11
12 from .common import InfoExtractor, SearchInfoExtractor
13 from .subtitles import SubtitlesInfoExtractor
14 from ..jsinterp import JSInterpreter
15 from ..swfinterp import SWFInterpreter
16 from ..utils import (
17 compat_chr,
18 compat_parse_qs,
19 compat_urllib_parse,
20 compat_urllib_request,
21 compat_urlparse,
22 compat_str,
23
24 clean_html,
25 get_element_by_id,
26 get_element_by_attribute,
27 ExtractorError,
28 int_or_none,
29 OnDemandPagedList,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 uppercase_escape,
34 )
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL and report whether a page came back.

        The download is non-fatal, so the result is simply the
        truthiness of whatever _download_webpage returned.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note='Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        An error is also raised when the login response contains the
        "errormsg_0_Passwd" marker.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) as the docstring promises
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail
                # out instead of crashing on match.group() below.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Seeing the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again means the credentials were refused
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL (non-fatal on failure)."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note='Confirming age', errnote='Unable to confirm age',
            fatal=False)

    def _real_initialize(self):
        """Set the language (only when login info exists), log in, confirm age.

        Does nothing without a downloader, and stops at the first step
        that reports failure.
        """
        if self._downloader is None:
            return
        if self._get_login_info()[0] is not None:
            if not self._set_language():
                return
        if not self._login():
            return
        self._confirm_age()
200
201
202 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Short human-readable description of this extractor.
IE_DESC = 'YouTube.com'
# Verbose-mode ((?x)) pattern for the URLs this extractor accepts.
# Group 1 is the optional URL prefix (scheme, host and whatever precedes the
# ID); group 2 is the 11-character video ID, which extract_id() returns.
# Because the prefix is optional, a bare video ID matches as well.
_VALID_URL = r"""(?x)^
(
(?:https?://|//) # http(s):// or protocol-independent URL
(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
(?:www\.)?deturl\.com/www\.youtube\.com/|
(?:www\.)?pwnyoutube\.com/|
(?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
(?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
v=
)
))
|youtu\.be/ # just youtu.be/xxxx
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)? # all until now is optional -> you can pass the naked ID
([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
(?!.*?&list=) # combined list/video URLs are handled by the playlist IE
(?(1).+)? # if we found the ID, everything can follow
$"""
# Captures the value of a next_url query parameter; _real_extract uses it to
# recover the real video URL from a redirect URL (e.g. age verification).
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
# Static metadata for known formats, keyed by format ID string
# (presumably YouTube's "itag" values - confirm against the code that
# consumes this table).  Each value is a dict of known properties such as
# 'ext', 'width', 'height', 'abr', 'fps' and codec names.
# 'acodec': 'none' marks video-only entries and 'vcodec': 'none' marks
# audio-only entries.  The 3D, HLS and DASH groups carry a negative
# 'preference' (-20, -10, -40/-50 respectively).
_formats = {
    '5': {'ext': 'flv', 'width': 400, 'height': 240},
    '6': {'ext': 'flv', 'width': 450, 'height': 270},
    '13': {'ext': '3gp'},
    '17': {'ext': '3gp', 'width': 176, 'height': 144},
    '18': {'ext': 'mp4', 'width': 640, 'height': 360},
    '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
    '34': {'ext': 'flv', 'width': 640, 'height': 360},
    '35': {'ext': 'flv', 'width': 854, 'height': 480},
    '36': {'ext': '3gp', 'width': 320, 'height': 240},
    '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
    '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
    '43': {'ext': 'webm', 'width': 640, 'height': 360},
    '44': {'ext': 'webm', 'width': 854, 'height': 480},
    '45': {'ext': 'webm', 'width': 1280, 'height': 720},
    '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


    # 3d videos
    '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
    '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
    '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
    '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
    '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
    '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
    '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

    # Apple HTTP Live Streaming
    '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
    '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
    '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
    '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
    '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
    '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
    '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

    # DASH mp4 video
    '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
    '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
    '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

    # Dash mp4 audio
    '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
    '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
    '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

    # Dash webm
    '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
    '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
    '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
    '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
    '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
    '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
    '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
    '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
    '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},

    # Dash webm audio
    '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
    '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

    # Dash webm audio with opus inside
    '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
    '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
    '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

    # RTMP (unnamed)
    '_rtmp': {'protocol': 'rtmp'},
}
318
# Extractor name; the base class uses IE_NAME in its "login required" message.
IE_NAME = 'youtube'
# Test cases for this extractor.  Each entry has a 'url' and the 'info_dict'
# expected for it; entries may add a free-text 'note' and 'params' overrides.
# Values such as `int` stand for "any value of this type", and 'md5:...'
# strings look like digests of the expected text (NOTE(review): both are
# interpreted by the test harness, which is not visible here - confirm).
_TESTS = [
    {
        'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
        'info_dict': {
            'id': 'BaW_jenozKc',
            'ext': 'mp4',
            'title': 'youtube-dl test video "\'/\\ä↭𝕐',
            'uploader': 'Philipp Hagemeister',
            'uploader_id': 'phihag',
            'upload_date': '20121002',
            'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
            'categories': ['Science & Technology'],
            'like_count': int,
            'dislike_count': int,
        }
    },
    {
        'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
        'note': 'Test generic use_cipher_signature video (#897)',
        'info_dict': {
            'id': 'UxxajLWwzqY',
            'ext': 'mp4',
            'upload_date': '20120506',
            'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
            'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
            'uploader': 'Icona Pop',
            'uploader_id': 'IconaPop',
        }
    },
    {
        'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
        'note': 'Test VEVO video with age protection (#956)',
        'info_dict': {
            'id': '07FYdnEawAQ',
            'ext': 'mp4',
            'upload_date': '20130703',
            'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
            'description': 'md5:64249768eec3bc4276236606ea996373',
            'uploader': 'justintimberlakeVEVO',
            'uploader_id': 'justintimberlakeVEVO',
        }
    },
    {
        # Protocol-relative URL with a mixed-case host name
        'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
        'note': 'Embed-only video (#1746)',
        'info_dict': {
            'id': 'yZIXLfi8CZQ',
            'ext': 'mp4',
            'upload_date': '20120608',
            'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
            'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
            'uploader': 'SET India',
            'uploader_id': 'setindia'
        }
    },
    {
        'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
        'note': '256k DASH audio (format 141) via DASH manifest',
        'info_dict': {
            'id': 'a9LDPn-MO4I',
            'ext': 'm4a',
            'upload_date': '20121002',
            'uploader_id': '8KVIDEO',
            'description': '',
            'uploader': '8KVIDEO',
            'title': 'UHDTV TEST 8K VIDEO.mp4'
        },
        'params': {
            'youtube_include_dash_manifest': True,
            'format': '141',
        },
    },
    # DASH manifest with encrypted signature
    {
        'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
        'info_dict': {
            'id': 'IB3lcPjvWLA',
            'ext': 'm4a',
            'title': 'Afrojack - The Spark ft. Spree Wilson',
            'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
            'uploader': 'AfrojackVEVO',
            'uploader_id': 'AfrojackVEVO',
            'upload_date': '20131011',
        },
        'params': {
            'youtube_include_dash_manifest': True,
            'format': '141',
        },
    },
]
410
def __init__(self, *args, **kwargs):
    """Initialise the parent extractor and start with an empty player cache.

    All positional and keyword arguments are forwarded unchanged to the
    parent constructor.
    """
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature:
    # (player_url, signature cache id) -> deciphering function
    self._player_cache = dict()
414
def report_video_info_webpage_download(self, video_id):
    """Announce on screen that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
418
def report_information_extraction(self, video_id):
    """Announce on screen that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
422
def report_unavailable_format(self, video_id, format):
    """Announce on screen that the given format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
426
def report_rtmp_download(self):
    """Announce on screen that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
430
def _signature_cache_id(self, example_sig):
    """Describe a signature by the lengths of its dot-separated parts.

    The lengths are converted with compat_str and joined with '.', so
    signatures with the same layout share one identifier.
    """
    part_lengths = [
        compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
434
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a function that deciphers signatures for the given player.

    The player ID and type ('js' or 'swf') are parsed out of player_url.
    A cached index list for this player and signature layout is used when
    the downloader cache has one; otherwise the player is downloaded, the
    signature routine is extracted from it, and the resulting character
    permutation is stored in the cache.

    Raises ExtractorError if player_url cannot be parsed or its type is
    neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Sanity check: the cache key must be a plain name without path parts
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # cache_spec lists, for each output position, the input index to take
        return lambda s: ''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly: an assert would be stripped under -O and leave
        # `res` unbound further down.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose i-th character has code point i;
    # the code points of the output then reveal the permutation applied.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
476
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function *func*.

    func is applied to a probe string whose i-th character has code
    point i; the resulting index list is turned into an expression made
    of slices and single subscripts of ``s`` and shown via to_screen,
    guarded by a check on the part lengths of example_sig.
    """
    def gen_sig_code(idxs):
        """Yield 's[...]' terms covering idxs, merging +1/-1 runs into slices."""
        def _genslice(start, end, step):
            """Format the slice of s running from start to end inclusive."""
            starts = '' if start == 0 else str(start)
            # A stop of end + step below zero is written as an open-ended slice
            ends = (':%d' % (end+step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        step = None
        start = '(Never used)'  # Silence pyflakes warnings - start will be
                                # set as soon as step is set
        # Walk consecutive pairs (prev, i) of the index list
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                # Inside a run: keep going while the stride is unchanged
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new ascending or descending run begins at prev
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        # Emit whatever is pending after the last pair.
        # NOTE: i is only bound by the loop above, so an index list with
        # fewer than two entries would fail here.
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            '    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
515
def _parse_sig_js(self, jscode):
    """Find the signature routine in the JS player code and wrap it.

    Returns a callable that passes the signature string as the single
    argument of the extracted JS function.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def decipher(s):
        return js_function([s])

    return decipher
524
def _parse_sig_swf(self, file_contents):
    """Extract the 'decipher' function of the SWF player's SignatureDecipher class.

    Returns a callable that passes the signature string as the single
    argument of the extracted function.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        return swf_function([s])

    return decipher
531
532 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
533 """Turn the encrypted s field into a working signature"""
534
535 if player_url is None:
536 raise ExtractorError('Cannot decrypt signature without player_url')
537
538 if player_url.startswith('//'):
539 player_url = 'https:' + player_url
540 try:
541 player_id = (player_url, self._signature_cache_id(s))
542 if player_id not in self._player_cache:
543 func = self._extract_signature_function(
544 video_id, player_url, s
545 )
546 self._player_cache[player_id] = func
547 func = self._player_cache[player_id]
548 if self._downloader.params.get('youtube_print_sig_code'):
549 self._print_sig_code(func, s)
550 return func(s)
551 except Exception as e:
552 tb = traceback.format_exc()
553 raise ExtractorError(
554 'Signature extraction failed: ' + tb, cause=e)
555
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping subtitle language codes to download URLs.

    The track list is fetched from the timedtext service; only the first
    track seen for each language is kept.  An empty dict is returned
    (after a warning) when the list cannot be downloaded or is empty.
    The webpage argument is not used here.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    # Each match is a (track name, language code) pair
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

    urls_by_lang = {}
    for track_name, lang in tracks:
        if lang in urls_by_lang:
            continue
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        urls_by_lang[lang] = 'https://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
583
584 def _get_available_automatic_caption(self, video_id, webpage):
585 """We need the webpage for getting the captions url, pass it as an
586 argument to speed up the process."""
587 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
588 self.to_screen('%s: Looking for automatic captions' % video_id)
589 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
590 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
591 if mobj is None:
592 self._downloader.report_warning(err_msg)
593 return {}
594 player_config = json.loads(mobj.group(1))
595 try:
596 args = player_config[u'args']
597 caption_url = args[u'ttsurl']
598 timestamp = args[u'timestamp']
599 # We get the available subtitles
600 list_params = compat_urllib_parse.urlencode({
601 'type': 'list',
602 'tlangs': 1,
603 'asrs': 1,
604 })
605 list_url = caption_url + '&' + list_params
606 caption_list = self._download_xml(list_url, video_id)
607 original_lang_node = caption_list.find('track')
608 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
609 self._downloader.report_warning('Video doesn\'t have automatic captions')
610 return {}
611 original_lang = original_lang_node.attrib['lang_code']
612
613 sub_lang_list = {}
614 for lang_node in caption_list.findall('target'):
615 sub_lang = lang_node.attrib['lang_code']
616 params = compat_urllib_parse.urlencode({
617 'lang': original_lang,
618 'tlang': sub_lang,
619 'fmt': sub_format,
620 'ts': timestamp,
621 'kind': 'asr',
622 })
623 sub_lang_list[sub_lang] = caption_url + '&' + params
624 return sub_lang_list
625 # An extractor error can be raise by the download process if there are
626 # no automatic captions but there are subtitles
627 except (KeyError, ExtractorError):
628 self._downloader.report_warning(err_msg)
629 return {}
630
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) found in url.

    Raises ExtractorError when url does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
638
639 def _extract_from_m3u8(self, manifest_url, video_id):
640 url_map = {}
641 def _get_urls(_manifest):
642 lines = _manifest.split('\n')
643 urls = filter(lambda l: l and not l.startswith('#'),
644 lines)
645 return urls
646 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
647 formats_urls = _get_urls(manifest)
648 for format_url in formats_urls:
649 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
650 url_map[itag] = format_url
651 return url_map
652
653 def _extract_annotations(self, video_id):
654 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
655 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
656
657 def _real_extract(self, url):
658 proto = (
659 'http' if self._downloader.params.get('prefer_insecure', False)
660 else 'https')
661
662 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
663 mobj = re.search(self._NEXT_URL_RE, url)
664 if mobj:
665 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
666 video_id = self.extract_id(url)
667
668 # Get video webpage
669 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
670 pref_cookies = [
671 c for c in self._downloader.cookiejar
672 if c.domain == '.youtube.com' and c.name == 'PREF']
673 for pc in pref_cookies:
674 if 'hl=' in pc.value:
675 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
676 else:
677 if pc.value:
678 pc.value += '&'
679 pc.value += 'hl=en'
680 video_webpage = self._download_webpage(url, video_id)
681
682 # Attempt to extract SWF player URL
683 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
684 if mobj is not None:
685 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
686 else:
687 player_url = None
688
689 # Get video info
690 self.report_video_info_webpage_download(video_id)
691 if re.search(r'player-age-gate-content">', video_webpage) is not None:
692 age_gate = True
693 # We simulate the access to the video from www.youtube.com/v/{video_id}
694 # this can be viewed without login into Youtube
695 data = compat_urllib_parse.urlencode({
696 'video_id': video_id,
697 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
698 'sts': self._search_regex(
699 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
700 })
701 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
702 video_info_webpage = self._download_webpage(
703 video_info_url, video_id,
704 note='Refetching age-gated info webpage',
705 errnote='unable to download video info webpage')
706 video_info = compat_parse_qs(video_info_webpage)
707 else:
708 age_gate = False
709 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
710 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
711 % (video_id, el_type))
712 video_info_webpage = self._download_webpage(video_info_url, video_id,
713 note=False,
714 errnote='unable to download video info webpage')
715 video_info = compat_parse_qs(video_info_webpage)
716 if 'token' in video_info:
717 break
718 if 'token' not in video_info:
719 if 'reason' in video_info:
720 raise ExtractorError(
721 'YouTube said: %s' % video_info['reason'][0],
722 expected=True, video_id=video_id)
723 else:
724 raise ExtractorError(
725 '"token" parameter not in video info for unknown reason',
726 video_id=video_id)
727
728 if 'view_count' in video_info:
729 view_count = int(video_info['view_count'][0])
730 else:
731 view_count = None
732
733 # Check for "rental" videos
734 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
735 raise ExtractorError('"rental" videos not supported')
736
737 # Start extracting information
738 self.report_information_extraction(video_id)
739
740 # uploader
741 if 'author' not in video_info:
742 raise ExtractorError('Unable to extract uploader name')
743 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
744
745 # uploader_id
746 video_uploader_id = None
747 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
748 if mobj is not None:
749 video_uploader_id = mobj.group(1)
750 else:
751 self._downloader.report_warning('unable to extract uploader nickname')
752
753 # title
754 if 'title' in video_info:
755 video_title = video_info['title'][0]
756 else:
757 self._downloader.report_warning('Unable to extract video title')
758 video_title = '_'
759
760 # thumbnail image
761 # We try first to get a high quality image:
762 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
763 video_webpage, re.DOTALL)
764 if m_thumb is not None:
765 video_thumbnail = m_thumb.group(1)
766 elif 'thumbnail_url' not in video_info:
767 self._downloader.report_warning('unable to extract video thumbnail')
768 video_thumbnail = None
769 else: # don't panic if we can't find it
770 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
771
772 # upload date
773 upload_date = None
774 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
775 if mobj is None:
776 mobj = re.search(
777 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
778 video_webpage)
779 if mobj is not None:
780 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
781 upload_date = unified_strdate(upload_date)
782
783 m_cat_container = self._search_regex(
784 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
785 video_webpage, 'categories', fatal=False)
786 if m_cat_container:
787 category = self._html_search_regex(
788 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
789 default=None)
790 video_categories = None if category is None else [category]
791 else:
792 video_categories = None
793
794 # description
795 video_description = get_element_by_id("eow-description", video_webpage)
796 if video_description:
797 video_description = re.sub(r'''(?x)
798 <a\s+
799 (?:[a-zA-Z-]+="[^"]+"\s+)*?
800 title="([^"]+)"\s+
801 (?:[a-zA-Z-]+="[^"]+"\s+)*?
802 class="yt-uix-redirect-link"\s*>
803 [^<]+
804 </a>
805 ''', r'\1', video_description)
806 video_description = clean_html(video_description)
807 else:
808 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
809 if fd_mobj:
810 video_description = unescapeHTML(fd_mobj.group(1))
811 else:
812 video_description = ''
813
814 def _extract_count(count_name):
815 count = self._search_regex(
816 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
817 video_webpage, count_name, default=None)
818 if count is not None:
819 return int(count.replace(',', ''))
820 return None
821 like_count = _extract_count('like')
822 dislike_count = _extract_count('dislike')
823
824 # subtitles
825 video_subtitles = self.extract_subtitles(video_id, video_webpage)
826
827 if self._downloader.params.get('listsubtitles', False):
828 self._list_available_subtitles(video_id, video_webpage)
829 return
830
831 if 'length_seconds' not in video_info:
832 self._downloader.report_warning('unable to extract video duration')
833 video_duration = None
834 else:
835 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
836
837 # annotations
838 video_annotations = None
839 if self._downloader.params.get('writeannotations', False):
840 video_annotations = self._extract_annotations(video_id)
841
842 # Decide which formats to download
843 try:
844 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
845 if not mobj:
846 raise ValueError('Could not find vevo ID')
847 json_code = uppercase_escape(mobj.group(1))
848 ytplayer_config = json.loads(json_code)
849 args = ytplayer_config['args']
850 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
851 # this signatures are encrypted
852 if 'url_encoded_fmt_stream_map' not in args:
853 raise ValueError('No stream_map present') # caught below
854 re_signature = re.compile(r'[&,]s=')
855 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
856 if m_s is not None:
857 self.to_screen('%s: Encrypted signatures detected.' % video_id)
858 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
859 m_s = re_signature.search(args.get('adaptive_fmts', ''))
860 if m_s is not None:
861 if 'adaptive_fmts' in video_info:
862 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
863 else:
864 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
865 except ValueError:
866 pass
867
868 def _map_to_format_list(urlmap):
869 formats = []
870 for itag, video_real_url in urlmap.items():
871 dct = {
872 'format_id': itag,
873 'url': video_real_url,
874 'player_url': player_url,
875 }
876 if itag in self._formats:
877 dct.update(self._formats[itag])
878 formats.append(dct)
879 return formats
880
881 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
882 self.report_rtmp_download()
883 formats = [{
884 'format_id': '_rtmp',
885 'protocol': 'rtmp',
886 'url': video_info['conn'][0],
887 'player_url': player_url,
888 }]
889 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
890 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
891 if 'rtmpe%3Dyes' in encoded_url_map:
892 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
893 url_map = {}
894 for url_data_str in encoded_url_map.split(','):
895 url_data = compat_parse_qs(url_data_str)
896 if 'itag' not in url_data or 'url' not in url_data:
897 continue
898 format_id = url_data['itag'][0]
899 url = url_data['url'][0]
900
901 if 'sig' in url_data:
902 url += '&signature=' + url_data['sig'][0]
903 elif 's' in url_data:
904 encrypted_sig = url_data['s'][0]
905
906 if not age_gate:
907 jsplayer_url_json = self._search_regex(
908 r'"assets":.+?"js":\s*("[^"]+")',
909 video_webpage, 'JS player URL')
910 player_url = json.loads(jsplayer_url_json)
911 if player_url is None:
912 player_url_json = self._search_regex(
913 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
914 video_webpage, 'age gate player URL')
915 player_url = json.loads(player_url_json)
916
917 if self._downloader.params.get('verbose'):
918 if player_url is None:
919 player_version = 'unknown'
920 player_desc = 'unknown'
921 else:
922 if player_url.endswith('swf'):
923 player_version = self._search_regex(
924 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
925 'flash player', fatal=False)
926 player_desc = 'flash player %s' % player_version
927 else:
928 player_version = self._search_regex(
929 r'html5player-([^/]+?)(?:/html5player)?\.js',
930 player_url,
931 'html5 player', fatal=False)
932 player_desc = 'html5 player %s' % player_version
933
934 parts_sizes = self._signature_cache_id(encrypted_sig)
935 self.to_screen('{%s} signature length %s, %s' %
936 (format_id, parts_sizes, player_desc))
937
938 signature = self._decrypt_signature(
939 encrypted_sig, video_id, player_url, age_gate)
940 url += '&signature=' + signature
941 if 'ratebypass' not in url:
942 url += '&ratebypass=yes'
943 url_map[format_id] = url
944 formats = _map_to_format_list(url_map)
945 elif video_info.get('hlsvp'):
946 manifest_url = video_info['hlsvp'][0]
947 url_map = self._extract_from_m3u8(manifest_url, video_id)
948 formats = _map_to_format_list(url_map)
949 else:
950 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
951
952 # Look for the DASH manifest
953 if self._downloader.params.get('youtube_include_dash_manifest', True):
954 try:
955 # The DASH manifest used needs to be the one from the original video_webpage.
956 # The one found in get_video_info seems to be using different signatures.
957 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
958 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
959 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
960 if age_gate:
961 dash_manifest_url = video_info.get('dashmpd')[0]
962 else:
963 dash_manifest_url = ytplayer_config['args']['dashmpd']
964 def decrypt_sig(mobj):
965 s = mobj.group(1)
966 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
967 return '/signature/%s' % dec_s
968 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
969 dash_doc = self._download_xml(
970 dash_manifest_url, video_id,
971 note='Downloading DASH manifest',
972 errnote='Could not download DASH manifest')
973 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
974 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
975 if url_el is None:
976 continue
977 format_id = r.attrib['id']
978 video_url = url_el.text
979 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
980 f = {
981 'format_id': format_id,
982 'url': video_url,
983 'width': int_or_none(r.attrib.get('width')),
984 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
985 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
986 'filesize': filesize,
987 }
988 try:
989 existing_format = next(
990 fo for fo in formats
991 if fo['format_id'] == format_id)
992 except StopIteration:
993 f.update(self._formats.get(format_id, {}))
994 formats.append(f)
995 else:
996 existing_format.update(f)
997
998 except (ExtractorError, KeyError) as e:
999 self.report_warning('Skipping DASH manifest: %r' % e, video_id)
1000
1001 self._sort_formats(formats)
1002
1003 return {
1004 'id': video_id,
1005 'uploader': video_uploader,
1006 'uploader_id': video_uploader_id,
1007 'upload_date': upload_date,
1008 'title': video_title,
1009 'thumbnail': video_thumbnail,
1010 'description': video_description,
1011 'categories': video_categories,
1012 'subtitles': video_subtitles,
1013 'duration': video_duration,
1014 'age_limit': 18 if age_gate else 0,
1015 'annotations': video_annotations,
1016 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1017 'view_count': view_count,
1018 'like_count': like_count,
1019 'dislike_count': dislike_count,
1020 'formats': formats,
1021 }
1022
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract the videos of a YouTube playlist (or auto-generated mix).

    Matches playlist-style URLs as well as bare playlist ids that carry
    one of the known prefixes.
    """
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Attempt to log in before any extraction takes place."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url_result handled by the 'Youtube' extractor."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix (playlist id starting with 'RD')."""
        # A mix is generated from a single video: the playlist id is just
        # 'RD' + video_id, so the last 11 characters name that video
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        # Try the known title containers in order, keeping the first non-empty one
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        mix_video_re = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(mix_video_re, webpage))

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist for *url*.

        When the URL also names a single video and the 'noplaylist' option is
        set, a result for just that video is returned instead.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # A watch URL can carry both a video id and a playlist id
        parsed_url = compat_urlparse.urlparse(url)
        query_dict = compat_urlparse.parse_qs(parsed_url.query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        # Mixes need their own extraction path
        if playlist_id.startswith('RD'):
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(
                'For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"',
                expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        content_html = page
        more_widget_html = page

        # Bail out early when the playlist is missing or private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Walk the first page and then every "load more" continuation
        ids = []
        page_num = 1
        while True:
            # Duplicates within a page are dropped, and so is the link with
            # index 0 (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
1193
1194
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extract a top list addressed as "yttoplist:{channel}:{list title}".

    The channel page is searched for a link whose module title contains the
    requested list title; the linked page is then scraped for video ids.
    """
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Upper bound on downloads of the list page while waiting for the videos
    # to show up; keeps a page that never lists videos from looping forever
    _MAX_ATTEMPTS = 10

    def _real_extract(self, url):
        """Return a playlist result holding the videos of the requested list.

        Raises ExtractorError when the list link cannot be found on the
        channel page, or when no video ids appear within _MAX_ATTEMPTS
        downloads of the list page.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # The title is matched in its URL-encoded form ("title=...")
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos:
        # retry a bounded number of times until we get them
        for attempt in range(self._MAX_ATTEMPTS):
            msg = 'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                'Unable to find any videos for top list "%s" after %d attempts'
                % (title, self._MAX_ATTEMPTS))

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1236
1237
class YoutubeChannelIE(InfoExtractor):
    """Extract every video listed on a YouTube channel."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, in order, without duplicates."""
        seen = set()
        ids_in_page = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in seen:
                continue
            seen.add(video_id)
            ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist result with all videos of the channel in *url*."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        channel_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(channel_url, channel_id)
        autogenerated_match = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_match is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1297
1298
class YoutubeUserIE(InfoExtractor):
    """Extract the uploads of a YouTube user through the GData uploads feed."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other *IE class in this module claims *url*."""
        # The regex of this extractor is too permissive and would also match
        # URLs that another youtube extractor can handle, so those extractors
        # get precedence.
        other_ies = [
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls]
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads.

        Raises ExtractorError if the URL does not match or if a feed page is
        not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield url-type entries for the zero-based feed page *pagenum*."""
            page_size = self._GDATA_PAGE_SIZE
            # The feed's start-index is 1-based
            start_index = pagenum * page_size + 1
            # Last index requested on this page (inclusive)
            end_index = start_index + page_size - 1

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            # A feed without entries marks the end of the uploads
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1370
1371
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube through the GData videos feed ("ytsearch" keyword)."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist of at most *n* search results for *query*.

        Raises ExtractorError when a response page carries no 'items'.
        """
        PAGE_SIZE = 50
        encoded_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))

        video_ids = []
        limit = n
        for pagenum in itertools.count():
            already_requested = PAGE_SIZE * pagenum
            if already_requested >= limit:
                break

            # start-index is 1-based
            result_url = self._API_URL % (encoded_query, already_requested + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids += [video['id'] for video in api_response['items']]

            # Never ask for more than the API reports as available
            limit = min(n, api_response['totalItems'])

        # The last page may overshoot the requested amount
        video_ids = video_ids[:n]
        videos = []
        for video_id in video_ids:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1413
1414
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API query adds orderby=published."""
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same feed as the parent class, plus the orderby=published parameter
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1420
1421
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a YouTube search results URL into a playlist of its result links."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist titled after the search query found in *url*."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse.unquote_plus(raw_query)

        webpage = self._download_webpage(url, query)
        # Only the result list section of the page is scanned
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result heading holds both the title and the link of one item
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'],
                part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1463
1464
class YoutubeShowIE(InfoExtractor):
    """Expand a YouTube show page into one playlist entry per season."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the show's season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
1499
1500
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the extractors that read a feed through
    http://www.youtube.com/feed_ajax
    Subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; takes the paging value as its only argument."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Attempt to log in before any extraction takes place."""
        self._login()

    def _real_extract(self, url):
        """Return a playlist with the videos found on every page of the feed."""
        feed_entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a separate widget, the feed markup itself is searched
            # for the link to the next page
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            for video_id in orderedSet(
                    re.findall(r'"/watch\?v=(.*?)["&]', feed_html)):
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_num += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1546
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1552
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1559
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: the pattern contains backslash escapes meant for the regex
    # engine, which are invalid escape sequences in a plain string literal
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1566
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its playlist and delegate to YoutubePlaylist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url_result pointing at the favourites playlist id."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The playlist id is taken from the first list= parameter on the page
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
1577
1578
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the videos of the subscriptions feed page."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist dict with every video linked from the feed."""
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # Same paging scheme as for playlists, except that the video links
        # carry no index here
        content_html = page
        more_widget_html = page
        ids = []
        page_num = 1
        while True:
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        entries = self._ids_to_results(ids)
        return {
            '_type': 'playlist',
            'title': title,
            'entries': entries,
        }
1615
1616
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that lost their query and explain why."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)