]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youtube.py
Imported Upstream version 2014.11.23
[youtubedl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import traceback
11
12 from .common import InfoExtractor, SearchInfoExtractor
13 from .subtitles import SubtitlesInfoExtractor
14 from ..jsinterp import JSInterpreter
15 from ..swfinterp import SWFInterpreter
16 from ..utils import (
17 compat_chr,
18 compat_parse_qs,
19 compat_urllib_parse,
20 compat_urllib_request,
21 compat_urlparse,
22 compat_str,
23
24 clean_html,
25 get_element_by_id,
26 get_element_by_attribute,
27 ExtractorError,
28 int_or_none,
29 OnDemandPagedList,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 uppercase_escape,
34 )
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Download the language-setting page (_LANG_URL).

        The download is non-fatal; the truthiness of its result is returned.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note='Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) so the documented contract holds
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form
            tfa_hidden = {}
            for field in ('secTok', 'timeStmp'):
                match = re.search(
                    r'id="%s"\n\s+value=\'(.+)\'/>' % field,
                    login_results, re.M | re.U)
                if match is None:
                    # Without this field the TFA form cannot be submitted;
                    # give up instead of crashing on match.group() of None.
                    self._downloader.report_warning(
                        'Failed to get %s - did the page structure change?' % field)
                    return False
                tfa_hidden[field] = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': tfa_hidden['secTok'],
                'timeStmp': tfa_hidden['timeStmp'],
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Still seeing the login form after submitting it: credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-confirmation form to _AGE_URL (non-fatal on failure)."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note='Confirming age', errnote='Unable to confirm age',
            fatal=False)

    def _real_initialize(self):
        """Set the language (when credentials exist), log in and confirm age.

        Each step is skipped when a previous one fails; nothing is done when
        no downloader is attached.
        """
        if self._downloader is None:
            return
        if self._get_login_info()[0] is not None:
            if not self._set_language():
                return
        if not self._login():
            return
        self._confirm_age()
200
201
202 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
203 IE_DESC = 'YouTube.com'
204 _VALID_URL = r"""(?x)^
205 (
206 (?:https?://|//) # http(s):// or protocol-independent URL
207 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
208 (?:www\.)?deturl\.com/www\.youtube\.com/|
209 (?:www\.)?pwnyoutube\.com/|
210 (?:www\.)?yourepeat\.com/|
211 tube\.majestyc\.net/|
212 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
213 (?:.*?\#/)? # handle anchor (#/) redirect urls
214 (?: # the various things that can precede the ID:
215 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
216 |(?: # or the v= param in all its forms
217 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
218 (?:\?|\#!?) # the params delimiter ? or # or #!
219 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
220 v=
221 )
222 ))
223 |youtu\.be/ # just youtu.be/xxxx
224 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
225 )
226 )? # all until now is optional -> you can pass the naked ID
227 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
228 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
229 (?(1).+)? # if we found the ID, everything can follow
230 $"""
231 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
232 _formats = {
233 '5': {'ext': 'flv', 'width': 400, 'height': 240},
234 '6': {'ext': 'flv', 'width': 450, 'height': 270},
235 '13': {'ext': '3gp'},
236 '17': {'ext': '3gp', 'width': 176, 'height': 144},
237 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
238 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
239 '34': {'ext': 'flv', 'width': 640, 'height': 360},
240 '35': {'ext': 'flv', 'width': 854, 'height': 480},
241 '36': {'ext': '3gp', 'width': 320, 'height': 240},
242 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
243 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
244 '43': {'ext': 'webm', 'width': 640, 'height': 360},
245 '44': {'ext': 'webm', 'width': 854, 'height': 480},
246 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
247 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
248
249
250 # 3d videos
251 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
252 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
253 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
254 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
255 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
256 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
257 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
258
259 # Apple HTTP Live Streaming
260 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
261 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
262 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
263 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
264 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
265 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
266 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
267
268 # DASH mp4 video
269 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
274 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
276 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
277 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
278 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
279 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
280
281 # Dash mp4 audio
282 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
283 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
284 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
285
286 # Dash webm
287 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
288 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
289 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
290 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
291 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
292 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
293 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
294 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
298 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
299 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
300 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
301 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
302 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
303 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
304 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
305
306 # Dash webm audio
307 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
308 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
309
310 # Dash webm audio with opus inside
311 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
312 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
313 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
314
315 # RTMP (unnamed)
316 '_rtmp': {'protocol': 'rtmp'},
317 }
318
319 IE_NAME = 'youtube'
320 _TESTS = [
321 {
322 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
323 'info_dict': {
324 'id': 'BaW_jenozKc',
325 'ext': 'mp4',
326 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
327 'uploader': 'Philipp Hagemeister',
328 'uploader_id': 'phihag',
329 'upload_date': '20121002',
330 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
331 'categories': ['Science & Technology'],
332 'like_count': int,
333 'dislike_count': int,
334 }
335 },
336 {
337 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
338 'note': 'Test generic use_cipher_signature video (#897)',
339 'info_dict': {
340 'id': 'UxxajLWwzqY',
341 'ext': 'mp4',
342 'upload_date': '20120506',
343 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
344 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
345 'uploader': 'Icona Pop',
346 'uploader_id': 'IconaPop',
347 }
348 },
349 {
350 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
351 'note': 'Test VEVO video with age protection (#956)',
352 'info_dict': {
353 'id': '07FYdnEawAQ',
354 'ext': 'mp4',
355 'upload_date': '20130703',
356 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
357 'description': 'md5:64249768eec3bc4276236606ea996373',
358 'uploader': 'justintimberlakeVEVO',
359 'uploader_id': 'justintimberlakeVEVO',
360 }
361 },
362 {
363 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
364 'note': 'Embed-only video (#1746)',
365 'info_dict': {
366 'id': 'yZIXLfi8CZQ',
367 'ext': 'mp4',
368 'upload_date': '20120608',
369 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
370 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
371 'uploader': 'SET India',
372 'uploader_id': 'setindia'
373 }
374 },
375 {
376 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
377 'note': '256k DASH audio (format 141) via DASH manifest',
378 'info_dict': {
379 'id': 'a9LDPn-MO4I',
380 'ext': 'm4a',
381 'upload_date': '20121002',
382 'uploader_id': '8KVIDEO',
383 'description': '',
384 'uploader': '8KVIDEO',
385 'title': 'UHDTV TEST 8K VIDEO.mp4'
386 },
387 'params': {
388 'youtube_include_dash_manifest': True,
389 'format': '141',
390 },
391 },
392 # DASH manifest with encrypted signature
393 {
394 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
395 'info_dict': {
396 'id': 'IB3lcPjvWLA',
397 'ext': 'm4a',
398 'title': 'Afrojack - The Spark ft. Spree Wilson',
399 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
400 'uploader': 'AfrojackVEVO',
401 'uploader_id': 'AfrojackVEVO',
402 'upload_date': '20131011',
403 },
404 'params': {
405 'youtube_include_dash_manifest': True,
406 'format': '141',
407 },
408 },
409 # Controversy video
410 {
411 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
412 'info_dict': {
413 'id': 'T4XJQO3qol8',
414 'ext': 'mp4',
415 'upload_date': '20100909',
416 'uploader': 'The Amazing Atheist',
417 'uploader_id': 'TheAmazingAtheist',
418 'title': 'Burning Everyone\'s Koran',
419 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
420 }
421 }
422 ]
423
def __init__(self, *args, **kwargs):
    """Initialise the parent extractor and create an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions, keyed by (player_url, signature cache id)
    self._player_cache = dict()
427
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
431
def report_information_extraction(self, video_id):
    """Print a notice that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
435
def report_unavailable_format(self, video_id, format):
    """Print a notice that the given format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
439
def report_rtmp_download(self):
    """Print a notice that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
443
444 def _signature_cache_id(self, example_sig):
445 """ Return a string representation of a signature """
446 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
447
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that transforms an encrypted signature string.

    The player id and type (file extension) are parsed from player_url.
    A previously stored index list for this player and signature shape is
    loaded from the downloader cache when available; otherwise the player
    is downloaded, parsed, and the resulting index list is stored.

    Raises ExtractorError when the player URL cannot be parsed or the
    player type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Internal invariant: the cache key must be a plain file name
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # The cached spec is a list of source indices, one per output character
        return lambda s: ''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly: an assert would be stripped under -O and leave
        # `res` unbound further down.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose i-th character has code point i, so
    # the output records which input index ends up at each output position.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
489
490 def _print_sig_code(self, func, example_sig):
491 def gen_sig_code(idxs):
492 def _genslice(start, end, step):
493 starts = '' if start == 0 else str(start)
494 ends = (':%d' % (end+step)) if end + step >= 0 else ':'
495 steps = '' if step == 1 else (':%d' % step)
496 return 's[%s%s%s]' % (starts, ends, steps)
497
498 step = None
499 start = '(Never used)' # Quelch pyflakes warnings - start will be
500 # set as soon as step is set
501 for i, prev in zip(idxs[1:], idxs[:-1]):
502 if step is not None:
503 if i - prev == step:
504 continue
505 yield _genslice(start, prev, step)
506 step = None
507 continue
508 if i - prev in [-1, 1]:
509 step = i - prev
510 start = prev
511 continue
512 else:
513 yield 's[%d]' % prev
514 if step is None:
515 yield 's[%d]' % i
516 else:
517 yield _genslice(start, i, step)
518
519 test_string = ''.join(map(compat_chr, range(len(example_sig))))
520 cache_res = func(test_string)
521 cache_spec = [ord(c) for c in cache_res]
522 expr_code = ' + '.join(gen_sig_code(cache_spec))
523 signature_id_tuple = '(%s)' % (
524 ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
525 code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
526 ' return %s\n') % (signature_id_tuple, expr_code)
527 self.to_screen('Extracted signature function:\n' + code)
528
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The function name is located with a regex, extracted through
    JSInterpreter, and wrapped so it can be called with a single string.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')
    js_function = JSInterpreter(jscode).extract_function(sig_func_name)

    def _apply(s):
        # The extracted function takes its arguments as a list
        return js_function([s])
    return _apply
537
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the SWF player contents.

    Extracts the 'decipher' function of the 'SignatureDecipher' class via
    SWFInterpreter and wraps it so it can be called with a single string.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def _apply(s):
        # The extracted function takes its arguments as a list
        return swf_function([s])
    return _apply
544
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    The signature function is looked up in (or added to) the per-instance
    player cache, keyed by player URL and signature shape. Any failure
    while obtaining or applying it is re-raised as ExtractorError with the
    traceback in the message.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative URLs are fetched over https
    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            sig_func = self._player_cache[cache_key]
        else:
            sig_func = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = sig_func
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(sig_func, s)
        return sig_func(s)
    except Exception as e:
        trace_text = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + trace_text, cause=e)
568
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle download URL.

    The track list is fetched from video.google.com; only the first track
    per language code is kept. An empty dict is returned (with a warning)
    when the list cannot be downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    for track_name, lang in tracks:
        if lang in sub_lang_list:
            # Keep the first track seen for each language
            continue
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        sub_lang_list[lang] = 'https://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
596
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target language code to automatic-caption URL.

    The already-downloaded webpage is passed in because the captions URL is
    read from the ytplayer.config JSON embedded in it. An empty dict is
    returned (with a warning) when no automatic captions can be found.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config['args']
        caption_url = player_args['ttsurl']
        timestamp = player_args['timestamp']
        # We get the available subtitles
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(
            caption_url + '&' + list_query, video_id)
        track_node = caption_list.find('track')
        # Only a first track of kind 'asr' counts as automatic captions
        has_asr = (
            track_node is not None and
            track_node.attrib.get('kind') == 'asr')
        if not has_asr:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = track_node.attrib['lang_code']

        sub_lang_list = {}
        for target_node in caption_list.findall('target'):
            target_lang = target_node.attrib['lang_code']
            query = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': target_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[target_lang] = caption_url + '&' + query
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
643
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) found in *url*.

    Raises ExtractorError when the URL does not match _VALID_URL.
    """
    url_match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if url_match is not None:
        return url_match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
651
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict mapping itag to URL.

    Every non-empty manifest line that does not start with '#' is treated
    as a format URL; its itag is taken from the 'itag/<digits>/' path part.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#' directive/comment lines
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
665
def _extract_annotations(self, video_id):
    """Download and return the annotations document for *video_id*."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
669
670 def _real_extract(self, url):
671 proto = (
672 'http' if self._downloader.params.get('prefer_insecure', False)
673 else 'https')
674
675 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
676 mobj = re.search(self._NEXT_URL_RE, url)
677 if mobj:
678 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
679 video_id = self.extract_id(url)
680
681 # Get video webpage
682 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
683 pref_cookies = [
684 c for c in self._downloader.cookiejar
685 if c.domain == '.youtube.com' and c.name == 'PREF']
686 for pc in pref_cookies:
687 if 'hl=' in pc.value:
688 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
689 else:
690 if pc.value:
691 pc.value += '&'
692 pc.value += 'hl=en'
693 video_webpage = self._download_webpage(url, video_id)
694
695 # Attempt to extract SWF player URL
696 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
697 if mobj is not None:
698 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
699 else:
700 player_url = None
701
702 # Get video info
703 self.report_video_info_webpage_download(video_id)
704 if re.search(r'player-age-gate-content">', video_webpage) is not None:
705 age_gate = True
706 # We simulate the access to the video from www.youtube.com/v/{video_id}
707 # this can be viewed without login into Youtube
708 data = compat_urllib_parse.urlencode({
709 'video_id': video_id,
710 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
711 'sts': self._search_regex(
712 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
713 })
714 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
715 video_info_webpage = self._download_webpage(
716 video_info_url, video_id,
717 note='Refetching age-gated info webpage',
718 errnote='unable to download video info webpage')
719 video_info = compat_parse_qs(video_info_webpage)
720 else:
721 age_gate = False
722 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
723 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
724 % (video_id, el_type))
725 video_info_webpage = self._download_webpage(video_info_url, video_id,
726 note=False,
727 errnote='unable to download video info webpage')
728 video_info = compat_parse_qs(video_info_webpage)
729 if 'token' in video_info:
730 break
731 if 'token' not in video_info:
732 if 'reason' in video_info:
733 raise ExtractorError(
734 'YouTube said: %s' % video_info['reason'][0],
735 expected=True, video_id=video_id)
736 else:
737 raise ExtractorError(
738 '"token" parameter not in video info for unknown reason',
739 video_id=video_id)
740
741 if 'view_count' in video_info:
742 view_count = int(video_info['view_count'][0])
743 else:
744 view_count = None
745
746 # Check for "rental" videos
747 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
748 raise ExtractorError('"rental" videos not supported')
749
750 # Start extracting information
751 self.report_information_extraction(video_id)
752
753 # uploader
754 if 'author' not in video_info:
755 raise ExtractorError('Unable to extract uploader name')
756 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
757
758 # uploader_id
759 video_uploader_id = None
760 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
761 if mobj is not None:
762 video_uploader_id = mobj.group(1)
763 else:
764 self._downloader.report_warning('unable to extract uploader nickname')
765
766 # title
767 if 'title' in video_info:
768 video_title = video_info['title'][0]
769 else:
770 self._downloader.report_warning('Unable to extract video title')
771 video_title = '_'
772
773 # thumbnail image
774 # We try first to get a high quality image:
775 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
776 video_webpage, re.DOTALL)
777 if m_thumb is not None:
778 video_thumbnail = m_thumb.group(1)
779 elif 'thumbnail_url' not in video_info:
780 self._downloader.report_warning('unable to extract video thumbnail')
781 video_thumbnail = None
782 else: # don't panic if we can't find it
783 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
784
785 # upload date
786 upload_date = None
787 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
788 if mobj is None:
789 mobj = re.search(
790 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
791 video_webpage)
792 if mobj is not None:
793 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
794 upload_date = unified_strdate(upload_date)
795
796 m_cat_container = self._search_regex(
797 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
798 video_webpage, 'categories', fatal=False)
799 if m_cat_container:
800 category = self._html_search_regex(
801 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
802 default=None)
803 video_categories = None if category is None else [category]
804 else:
805 video_categories = None
806
807 # description
808 video_description = get_element_by_id("eow-description", video_webpage)
809 if video_description:
810 video_description = re.sub(r'''(?x)
811 <a\s+
812 (?:[a-zA-Z-]+="[^"]+"\s+)*?
813 title="([^"]+)"\s+
814 (?:[a-zA-Z-]+="[^"]+"\s+)*?
815 class="yt-uix-redirect-link"\s*>
816 [^<]+
817 </a>
818 ''', r'\1', video_description)
819 video_description = clean_html(video_description)
820 else:
821 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
822 if fd_mobj:
823 video_description = unescapeHTML(fd_mobj.group(1))
824 else:
825 video_description = ''
826
827 def _extract_count(count_name):
828 count = self._search_regex(
829 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
830 video_webpage, count_name, default=None)
831 if count is not None:
832 return int(count.replace(',', ''))
833 return None
834 like_count = _extract_count('like')
835 dislike_count = _extract_count('dislike')
836
837 # subtitles
838 video_subtitles = self.extract_subtitles(video_id, video_webpage)
839
840 if self._downloader.params.get('listsubtitles', False):
841 self._list_available_subtitles(video_id, video_webpage)
842 return
843
844 if 'length_seconds' not in video_info:
845 self._downloader.report_warning('unable to extract video duration')
846 video_duration = None
847 else:
848 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
849
850 # annotations
851 video_annotations = None
852 if self._downloader.params.get('writeannotations', False):
853 video_annotations = self._extract_annotations(video_id)
854
855 # Decide which formats to download
856 try:
857 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
858 if not mobj:
859 raise ValueError('Could not find vevo ID')
860 json_code = uppercase_escape(mobj.group(1))
861 ytplayer_config = json.loads(json_code)
862 args = ytplayer_config['args']
863 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
864 # this signatures are encrypted
865 if 'url_encoded_fmt_stream_map' not in args:
866 raise ValueError('No stream_map present') # caught below
867 re_signature = re.compile(r'[&,]s=')
868 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
869 if m_s is not None:
870 self.to_screen('%s: Encrypted signatures detected.' % video_id)
871 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
872 m_s = re_signature.search(args.get('adaptive_fmts', ''))
873 if m_s is not None:
874 if 'adaptive_fmts' in video_info:
875 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
876 else:
877 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
878 except ValueError:
879 pass
880
881 def _map_to_format_list(urlmap):
882 formats = []
883 for itag, video_real_url in urlmap.items():
884 dct = {
885 'format_id': itag,
886 'url': video_real_url,
887 'player_url': player_url,
888 }
889 if itag in self._formats:
890 dct.update(self._formats[itag])
891 formats.append(dct)
892 return formats
893
894 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
895 self.report_rtmp_download()
896 formats = [{
897 'format_id': '_rtmp',
898 'protocol': 'rtmp',
899 'url': video_info['conn'][0],
900 'player_url': player_url,
901 }]
902 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
903 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
904 if 'rtmpe%3Dyes' in encoded_url_map:
905 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
906 url_map = {}
907 for url_data_str in encoded_url_map.split(','):
908 url_data = compat_parse_qs(url_data_str)
909 if 'itag' not in url_data or 'url' not in url_data:
910 continue
911 format_id = url_data['itag'][0]
912 url = url_data['url'][0]
913
914 if 'sig' in url_data:
915 url += '&signature=' + url_data['sig'][0]
916 elif 's' in url_data:
917 encrypted_sig = url_data['s'][0]
918
919 if not age_gate:
920 jsplayer_url_json = self._search_regex(
921 r'"assets":.+?"js":\s*("[^"]+")',
922 video_webpage, 'JS player URL')
923 player_url = json.loads(jsplayer_url_json)
924 if player_url is None:
925 player_url_json = self._search_regex(
926 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
927 video_webpage, 'age gate player URL')
928 player_url = json.loads(player_url_json)
929
930 if self._downloader.params.get('verbose'):
931 if player_url is None:
932 player_version = 'unknown'
933 player_desc = 'unknown'
934 else:
935 if player_url.endswith('swf'):
936 player_version = self._search_regex(
937 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
938 'flash player', fatal=False)
939 player_desc = 'flash player %s' % player_version
940 else:
941 player_version = self._search_regex(
942 r'html5player-([^/]+?)(?:/html5player)?\.js',
943 player_url,
944 'html5 player', fatal=False)
945 player_desc = 'html5 player %s' % player_version
946
947 parts_sizes = self._signature_cache_id(encrypted_sig)
948 self.to_screen('{%s} signature length %s, %s' %
949 (format_id, parts_sizes, player_desc))
950
951 signature = self._decrypt_signature(
952 encrypted_sig, video_id, player_url, age_gate)
953 url += '&signature=' + signature
954 if 'ratebypass' not in url:
955 url += '&ratebypass=yes'
956 url_map[format_id] = url
957 formats = _map_to_format_list(url_map)
958 elif video_info.get('hlsvp'):
959 manifest_url = video_info['hlsvp'][0]
960 url_map = self._extract_from_m3u8(manifest_url, video_id)
961 formats = _map_to_format_list(url_map)
962 else:
963 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
964
965 # Look for the DASH manifest
966 if self._downloader.params.get('youtube_include_dash_manifest', True):
967 try:
968 # The DASH manifest used needs to be the one from the original video_webpage.
969 # The one found in get_video_info seems to be using different signatures.
970 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
971 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
972 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
973 if age_gate:
974 dash_manifest_url = video_info.get('dashmpd')[0]
975 else:
976 dash_manifest_url = ytplayer_config['args']['dashmpd']
977 def decrypt_sig(mobj):
978 s = mobj.group(1)
979 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
980 return '/signature/%s' % dec_s
981 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
982 dash_doc = self._download_xml(
983 dash_manifest_url, video_id,
984 note='Downloading DASH manifest',
985 errnote='Could not download DASH manifest')
986 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
987 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
988 if url_el is None:
989 continue
990 format_id = r.attrib['id']
991 video_url = url_el.text
992 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
993 f = {
994 'format_id': format_id,
995 'url': video_url,
996 'width': int_or_none(r.attrib.get('width')),
997 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
998 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
999 'filesize': filesize,
1000 }
1001 try:
1002 existing_format = next(
1003 fo for fo in formats
1004 if fo['format_id'] == format_id)
1005 except StopIteration:
1006 f.update(self._formats.get(format_id, {}))
1007 formats.append(f)
1008 else:
1009 existing_format.update(f)
1010
1011 except (ExtractorError, KeyError) as e:
1012 self.report_warning('Skipping DASH manifest: %r' % e, video_id)
1013
1014 self._sort_formats(formats)
1015
1016 return {
1017 'id': video_id,
1018 'uploader': video_uploader,
1019 'uploader_id': video_uploader_id,
1020 'upload_date': upload_date,
1021 'title': video_title,
1022 'thumbnail': video_thumbnail,
1023 'description': video_description,
1024 'categories': video_categories,
1025 'subtitles': video_subtitles,
1026 'duration': video_duration,
1027 'age_limit': 18 if age_gate else 0,
1028 'annotations': video_annotations,
1029 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1030 'view_count': view_count,
1031 'like_count': like_count,
1032 'dislike_count': dislike_count,
1033 'formats': formats,
1034 }
1035
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlist URLs, bare playlist ids and mixes."""
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Turn an iterable of video ids into a list of url results for the 'Youtube' extractor."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Build the playlist result for a mix (a playlist whose id starts with 'RD')."""
        # A mix is generated from a single video: the id of the playlist
        # is just 'RD' + video_id, so the last 11 characters are the video id.
        seed_video_id = playlist_id[-11:]
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            mix_url, playlist_id, 'Downloading Youtube mix')

        # Try the known title containers in order and keep the first one
        # that yields something (or the result of the last attempt).
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        link_re = r'''(?xs)data-video-username=".*?".*?
                   href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(link_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return a playlist result for *url* (or a single video with --no-playlist)."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        parsed_query = compat_urlparse.urlparse(url).query
        query_dict = compat_urlparse.parse_qs(parsed_query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(
                'For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"',
                expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)
        content_html = page
        more_widget_html = page

        # Check if the playlist exists or is private
        unavailable_re = r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>'
        if re.search(unavailable_re, page) is not None:
            raise ExtractorError(
                "The playlist doesn't exist or is private, use --username or "
                "--netrc to access it.",
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []
        page_num = 1
        while True:
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
1206
1207
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo URLs."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Upper bound on how many times the list page is fetched while waiting
    # for it to contain video entries; prevents an endless download loop.
    _MAX_LIST_DOWNLOAD_ATTEMPTS = 10

    def _real_extract(self, url):
        """Locate the named list on the channel page and return its videos.

        Raises ExtractorError if the URL does not match, if the list link
        cannot be found on the channel page, or if the list page still
        contains no video ids after _MAX_LIST_DOWNLOAD_ATTEMPTS downloads.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # Sometimes the webpage doesn't contain the videos: retry, but only
        # a bounded number of times so a permanently empty page cannot make
        # us download it forever.
        for attempt in range(self._MAX_LIST_DOWNLOAD_ATTEMPTS):
            msg = 'Downloading Youtube top list'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                'Unable to find any video in the top list after %d attempts'
                % self._MAX_LIST_DOWNLOAD_ATTEMPTS)
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1249
1250
class YoutubeChannelIE(InfoExtractor):
    """Extractor that lists the videos of a YouTube channel page."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*.

        Ids are returned without duplicates, in order of first appearance.
        """
        # A set gives O(1) duplicate checks; the list keeps the page order.
        seen_ids = set()
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen_ids:
                seen_ids.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist result holding every video found for the channel.

        Raises ExtractorError if *url* does not match _VALID_URL.
        """
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the "load more" widget is no longer offered
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1310
1311
class YoutubeUserIE(InfoExtractor):
    """Extractor that lists the uploads of a YouTube user via the GData feed."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other *IE class in this module accepts *url*."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex here is too permissive and would match too.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist result with the user's uploads.

        Raises ExtractorError if *url* does not match _VALID_URL; the page
        downloader raises it as well when the API response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # The API's start-index is 1-based, pagenum is 0-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index covered by this request (inclusive)
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries on this page: nothing to yield
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1383
1384
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed ("ytsearch" keyword)."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))

        collected_ids = []
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            # start-index is 1-based
            first_index = PAGE_SIZE * pagenum + 1
            data_json = self._download_webpage(
                self._API_URL % (quoted_query, first_index),
                video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected_ids += [item['id'] for item in api_response['items']]

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us past the requested amount
        video_ids = collected_ids[:n] if len(collected_ids) > n else collected_ids
        videos = []
        for video_id in video_ids:
            videos.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1426
1427
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1433
1434
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for YouTube search result page URLs."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result list of the search page into a playlist dict."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse.unquote_plus(raw_query)

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def build_entry(part_code):
            # One <h3> title block holds both the title and the link
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }

        title_blocks = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        entries = [build_entry(block) for block in title_blocks]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1476
1477
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; yields one playlist entry per season link."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist dict whose entries are the show's season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
1512
1513
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; it keeps one %s slot for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        feed_name = self._FEED_NAME
        return 'youtube:%s' % feed_name

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed pages and return a playlist result with all video ids found."""
        feed_entries = []
        paging = 0
        page_index = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_index)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a dedicated widget, look for the paging link in the feed itself
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_index += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1559
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1565
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1572
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: the pattern contains '\.', which is not a valid escape
    # sequence in a normal string literal.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1579
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist id."""
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    IE_NAME = 'youtube:favorites'

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and delegate to YoutubePlaylist."""
        favourites_url = 'https://www.youtube.com/my_favorites'
        webpage = self._download_webpage(favourites_url, 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1590
1591
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor for the subscriptions feed of the logged-in user."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Collect the video ids of every feed page into a playlist dict."""
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        content_html = page
        more_widget_html = page

        page_num = 1
        while True:
            page_ids = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            ids.extend(orderedSet(page_ids))

            more_mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1628
1629
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs that lack a video id and reports a hint."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an (expected) ExtractorError explaining how to quote the URL."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)