]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youtube.py
Fix lintian warning introduced by Ben's patch.
[youtubedl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import time
11 import traceback
12
13 from .common import InfoExtractor, SearchInfoExtractor
14 from .subtitles import SubtitlesInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..utils import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24
25 clean_html,
26 get_element_by_id,
27 get_element_by_attribute,
28 ExtractorError,
29 int_or_none,
30 OnDemandPagedList,
31 unescapeHTML,
32 unified_strdate,
33 orderedSet,
34 uppercase_escape,
35 )
36
37
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com (hl=en), valid for 60 days."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 60 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) as the docstring promises
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail out
                # instead of dereferencing a missing match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being shown the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in.

        Does nothing when no downloader is attached.
        """
        if self._downloader is None:
            return
        self._set_language()
        # The login result is not acted upon: a falsy value does not abort
        # initialization.
        if not self._login():
            return
184
185 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
186 IE_DESC = 'YouTube.com'
187 _VALID_URL = r"""(?x)^
188 (
189 (?:https?://|//) # http(s):// or protocol-independent URL
190 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
191 (?:www\.)?deturl\.com/www\.youtube\.com/|
192 (?:www\.)?pwnyoutube\.com/|
193 (?:www\.)?yourepeat\.com/|
194 tube\.majestyc\.net/|
195 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
196 (?:.*?\#/)? # handle anchor (#/) redirect urls
197 (?: # the various things that can precede the ID:
198 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
199 |(?: # or the v= param in all its forms
200 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
201 (?:\?|\#!?) # the params delimiter ? or # or #!
202 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
203 v=
204 )
205 ))
206 |youtu\.be/ # just youtu.be/xxxx
207 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
208 )
209 )? # all until now is optional -> you can pass the naked ID
210 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
211 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
212 (?(1).+)? # if we found the ID, everything can follow
213 $"""
214 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
215 _formats = {
216 '5': {'ext': 'flv', 'width': 400, 'height': 240},
217 '6': {'ext': 'flv', 'width': 450, 'height': 270},
218 '13': {'ext': '3gp'},
219 '17': {'ext': '3gp', 'width': 176, 'height': 144},
220 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
221 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
222 '34': {'ext': 'flv', 'width': 640, 'height': 360},
223 '35': {'ext': 'flv', 'width': 854, 'height': 480},
224 '36': {'ext': '3gp', 'width': 320, 'height': 240},
225 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
226 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
227 '43': {'ext': 'webm', 'width': 640, 'height': 360},
228 '44': {'ext': 'webm', 'width': 854, 'height': 480},
229 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
230 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
231
232
233 # 3d videos
234 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
235 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
236 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
237 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
238 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
239 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
240 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
241
242 # Apple HTTP Live Streaming
243 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
244 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
245 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
246 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
247 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
248 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
249 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
250
251 # DASH mp4 video
252 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
253 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
254 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
259 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
260 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
261 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
262 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
263
264 # Dash mp4 audio
265 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
266 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
267 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
268
269 # Dash webm
270 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
271 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
272 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
277 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
278 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
279 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
286 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
287 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
288 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
289
290 # Dash webm audio
291 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
292 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
293
294 # Dash webm audio with opus inside
295 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
296 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
297 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
298
299 # RTMP (unnamed)
300 '_rtmp': {'protocol': 'rtmp'},
301 }
302
303 IE_NAME = 'youtube'
304 _TESTS = [
305 {
306 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
307 'info_dict': {
308 'id': 'BaW_jenozKc',
309 'ext': 'mp4',
310 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
311 'uploader': 'Philipp Hagemeister',
312 'uploader_id': 'phihag',
313 'upload_date': '20121002',
314 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
315 'categories': ['Science & Technology'],
316 'like_count': int,
317 'dislike_count': int,
318 }
319 },
320 {
321 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
322 'note': 'Test generic use_cipher_signature video (#897)',
323 'info_dict': {
324 'id': 'UxxajLWwzqY',
325 'ext': 'mp4',
326 'upload_date': '20120506',
327 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
328 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
329 'uploader': 'Icona Pop',
330 'uploader_id': 'IconaPop',
331 }
332 },
333 {
334 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
335 'note': 'Test VEVO video with age protection (#956)',
336 'info_dict': {
337 'id': '07FYdnEawAQ',
338 'ext': 'mp4',
339 'upload_date': '20130703',
340 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
341 'description': 'md5:64249768eec3bc4276236606ea996373',
342 'uploader': 'justintimberlakeVEVO',
343 'uploader_id': 'justintimberlakeVEVO',
344 }
345 },
346 {
347 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
348 'note': 'Embed-only video (#1746)',
349 'info_dict': {
350 'id': 'yZIXLfi8CZQ',
351 'ext': 'mp4',
352 'upload_date': '20120608',
353 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
354 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
355 'uploader': 'SET India',
356 'uploader_id': 'setindia'
357 }
358 },
359 {
360 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
361 'note': '256k DASH audio (format 141) via DASH manifest',
362 'info_dict': {
363 'id': 'a9LDPn-MO4I',
364 'ext': 'm4a',
365 'upload_date': '20121002',
366 'uploader_id': '8KVIDEO',
367 'description': '',
368 'uploader': '8KVIDEO',
369 'title': 'UHDTV TEST 8K VIDEO.mp4'
370 },
371 'params': {
372 'youtube_include_dash_manifest': True,
373 'format': '141',
374 },
375 },
376 # DASH manifest with encrypted signature
377 {
378 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
379 'info_dict': {
380 'id': 'IB3lcPjvWLA',
381 'ext': 'm4a',
382 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
383 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
384 'uploader': 'AfrojackVEVO',
385 'uploader_id': 'AfrojackVEVO',
386 'upload_date': '20131011',
387 },
388 'params': {
389 'youtube_include_dash_manifest': True,
390 'format': '141',
391 },
392 },
393 # Controversy video
394 {
395 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
396 'info_dict': {
397 'id': 'T4XJQO3qol8',
398 'ext': 'mp4',
399 'upload_date': '20100909',
400 'uploader': 'The Amazing Atheist',
401 'uploader_id': 'TheAmazingAtheist',
402 'title': 'Burning Everyone\'s Koran',
403 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
404 }
405 },
406 # Normal age-gate video (No vevo, embed allowed)
407 {
408 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
409 'info_dict': {
410 'id': 'HtVdAasjOgU',
411 'ext': 'mp4',
412 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
413 'description': 'md5:eca57043abae25130f58f655ad9a7771',
414 'uploader': 'The Witcher',
415 'uploader_id': 'WitcherGame',
416 'upload_date': '20140605',
417 },
418 },
419 ]
420
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent constructor and create an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature, keyed by (player_url, signature cache id)
    self._player_cache = dict()
424
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage for video_id is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
428
def report_information_extraction(self, video_id):
    """Print a notice that information for video_id is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
432
def report_unavailable_format(self, video_id, format):
    """Print a notice that the requested format is not available for video_id."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
436
def report_rtmp_download(self):
    """Print a notice that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
440
def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated parts of example_sig.

    For instance a signature shaped like 'abc.de' yields '3.2'.
    """
    part_lengths = [compat_str(len(part)) for part in example_sig.split('.')]
    return '.'.join(part_lengths)
444
445 def _extract_signature_function(self, video_id, player_url, example_sig):
446 id_m = re.match(
447 r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
448 player_url)
449 if not id_m:
450 raise ExtractorError('Cannot identify player %r' % player_url)
451 player_type = id_m.group('ext')
452 player_id = id_m.group('id')
453
454 # Read from filesystem cache
455 func_id = '%s_%s_%s' % (
456 player_type, player_id, self._signature_cache_id(example_sig))
457 assert os.path.basename(func_id) == func_id
458
459 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
460 if cache_spec is not None:
461 return lambda s: ''.join(s[i] for i in cache_spec)
462
463 if player_type == 'js':
464 code = self._download_webpage(
465 player_url, video_id,
466 note='Downloading %s player %s' % (player_type, player_id),
467 errnote='Download of %s failed' % player_url)
468 res = self._parse_sig_js(code)
469 elif player_type == 'swf':
470 urlh = self._request_webpage(
471 player_url, video_id,
472 note='Downloading %s player %s' % (player_type, player_id),
473 errnote='Download of %s failed' % player_url)
474 code = urlh.read()
475 res = self._parse_sig_swf(code)
476 else:
477 assert False, 'Invalid player type %r' % player_type
478
479 if cache_spec is None:
480 test_string = ''.join(map(compat_chr, range(len(example_sig))))
481 cache_res = res(test_string)
482 cache_spec = [ord(c) for c in cache_res]
483
484 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
485 return res
486
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to func for signatures shaped like example_sig.

    func is applied to a probe string whose i-th character has code point
    i; the resulting index list is compressed into slice expressions and
    shown through to_screen.
    """
    def gen_sig_code(idxs):
        """Yield 's[...]' expressions that together reproduce the index list idxs."""
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            # A negative stop would be read as "from the end", so leave it open
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        # With fewer than two indices the pairwise loop below never runs and
        # its loop variable would be unbound afterwards.
        if len(idxs) < 2:
            for idx in idxs:
                yield 's[%d]' % idx
            return

        step = None
        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run ended at prev; emit it and start over from i
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            '    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
525
def _parse_sig_js(self, jscode):
    """Return a one-argument callable wrapping the signature function found in jscode.

    The function name is located with a regex on the player code and the
    function itself is extracted through JSInterpreter.
    """
    funcname = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')
    initial_function = JSInterpreter(jscode).extract_function(funcname)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])
    return decipher
534
def _parse_sig_swf(self, file_contents):
    """Return a one-argument callable wrapping SignatureDecipher.decipher from a SWF player."""
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    searched_class = interpreter.extract_class(TARGET_CLASSNAME)
    initial_function = interpreter.extract_function(searched_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])
    return decipher
541
542 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
543 """Turn the encrypted s field into a working signature"""
544
545 if player_url is None:
546 raise ExtractorError('Cannot decrypt signature without player_url')
547
548 if player_url.startswith('//'):
549 player_url = 'https:' + player_url
550 try:
551 player_id = (player_url, self._signature_cache_id(s))
552 if player_id not in self._player_cache:
553 func = self._extract_signature_function(
554 video_id, player_url, s
555 )
556 self._player_cache[player_id] = func
557 func = self._player_cache[player_id]
558 if self._downloader.params.get('youtube_print_sig_code'):
559 self._print_sig_code(func, s)
560 return func(s)
561 except Exception as e:
562 tb = traceback.format_exc()
563 raise ExtractorError(
564 'Signature extraction failed: ' + tb, cause=e)
565
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle URL for video_id.

    The track list is fetched from the timedtext listing; only the first
    track seen for each language is kept.  An empty dict is returned (with
    a warning) when the listing cannot be downloaded or has no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    for track_name, lang in tracks:
        if lang in sub_lang_list:
            continue
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        sub_lang_list[lang] = 'https://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
593
594 def _get_available_automatic_caption(self, video_id, webpage):
595 """We need the webpage for getting the captions url, pass it as an
596 argument to speed up the process."""
597 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
598 self.to_screen('%s: Looking for automatic captions' % video_id)
599 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
600 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
601 if mobj is None:
602 self._downloader.report_warning(err_msg)
603 return {}
604 player_config = json.loads(mobj.group(1))
605 try:
606 args = player_config['args']
607 caption_url = args['ttsurl']
608 timestamp = args['timestamp']
609 # We get the available subtitles
610 list_params = compat_urllib_parse.urlencode({
611 'type': 'list',
612 'tlangs': 1,
613 'asrs': 1,
614 })
615 list_url = caption_url + '&' + list_params
616 caption_list = self._download_xml(list_url, video_id)
617 original_lang_node = caption_list.find('track')
618 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
619 self._downloader.report_warning('Video doesn\'t have automatic captions')
620 return {}
621 original_lang = original_lang_node.attrib['lang_code']
622
623 sub_lang_list = {}
624 for lang_node in caption_list.findall('target'):
625 sub_lang = lang_node.attrib['lang_code']
626 params = compat_urllib_parse.urlencode({
627 'lang': original_lang,
628 'tlang': sub_lang,
629 'fmt': sub_format,
630 'ts': timestamp,
631 'kind': 'asr',
632 })
633 sub_lang_list[sub_lang] = caption_url + '&' + params
634 return sub_lang_list
635 # An extractor error can be raise by the download process if there are
636 # no automatic captions but there are subtitles
637 except (KeyError, ExtractorError):
638 self._downloader.report_warning(err_msg)
639 return {}
640
641 @classmethod
642 def extract_id(cls, url):
643 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
644 if mobj is None:
645 raise ExtractorError('Invalid URL: %s' % url)
646 video_id = mobj.group(2)
647 return video_id
648
649 def _extract_from_m3u8(self, manifest_url, video_id):
650 url_map = {}
651
652 def _get_urls(_manifest):
653 lines = _manifest.split('\n')
654 urls = filter(lambda l: l and not l.startswith('#'),
655 lines)
656 return urls
657 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
658 formats_urls = _get_urls(manifest)
659 for format_url in formats_urls:
660 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
661 url_map[itag] = format_url
662 return url_map
663
664 def _extract_annotations(self, video_id):
665 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
666 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
667
668 def _real_extract(self, url):
669 proto = (
670 'http' if self._downloader.params.get('prefer_insecure', False)
671 else 'https')
672
673 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
674 mobj = re.search(self._NEXT_URL_RE, url)
675 if mobj:
676 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
677 video_id = self.extract_id(url)
678
679 # Get video webpage
680 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
681 video_webpage = self._download_webpage(url, video_id)
682
683 # Attempt to extract SWF player URL
684 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
685 if mobj is not None:
686 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
687 else:
688 player_url = None
689
690 # Get video info
691 if re.search(r'player-age-gate-content">', video_webpage) is not None:
692 age_gate = True
693 # We simulate the access to the video from www.youtube.com/v/{video_id}
694 # this can be viewed without login into Youtube
695 data = compat_urllib_parse.urlencode({
696 'video_id': video_id,
697 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
698 'sts': self._search_regex(
699 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
700 })
701 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
702 video_info_webpage = self._download_webpage(
703 video_info_url, video_id,
704 note='Refetching age-gated info webpage',
705 errnote='unable to download video info webpage')
706 video_info = compat_parse_qs(video_info_webpage)
707 else:
708 age_gate = False
709 try:
710 # Try looking directly into the video webpage
711 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
712 if not mobj:
713 raise ValueError('Could not find ytplayer.config') # caught below
714 json_code = uppercase_escape(mobj.group(1))
715 ytplayer_config = json.loads(json_code)
716 args = ytplayer_config['args']
717 # Convert to the same format returned by compat_parse_qs
718 video_info = dict((k, [v]) for k, v in args.items())
719 if 'url_encoded_fmt_stream_map' not in args:
720 raise ValueError('No stream_map present') # caught below
721 except ValueError:
722 # We fallback to the get_video_info pages (used by the embed page)
723 self.report_video_info_webpage_download(video_id)
724 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
725 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
726 % (video_id, el_type))
727 video_info_webpage = self._download_webpage(video_info_url,
728 video_id, note=False,
729 errnote='unable to download video info webpage')
730 video_info = compat_parse_qs(video_info_webpage)
731 if 'token' in video_info:
732 break
733 if 'token' not in video_info:
734 if 'reason' in video_info:
735 raise ExtractorError(
736 'YouTube said: %s' % video_info['reason'][0],
737 expected=True, video_id=video_id)
738 else:
739 raise ExtractorError(
740 '"token" parameter not in video info for unknown reason',
741 video_id=video_id)
742
743 if 'view_count' in video_info:
744 view_count = int(video_info['view_count'][0])
745 else:
746 view_count = None
747
748 # Check for "rental" videos
749 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
750 raise ExtractorError('"rental" videos not supported')
751
752 # Start extracting information
753 self.report_information_extraction(video_id)
754
755 # uploader
756 if 'author' not in video_info:
757 raise ExtractorError('Unable to extract uploader name')
758 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
759
760 # uploader_id
761 video_uploader_id = None
762 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
763 if mobj is not None:
764 video_uploader_id = mobj.group(1)
765 else:
766 self._downloader.report_warning('unable to extract uploader nickname')
767
768 # title
769 if 'title' in video_info:
770 video_title = video_info['title'][0]
771 else:
772 self._downloader.report_warning('Unable to extract video title')
773 video_title = '_'
774
775 # thumbnail image
776 # We try first to get a high quality image:
777 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
778 video_webpage, re.DOTALL)
779 if m_thumb is not None:
780 video_thumbnail = m_thumb.group(1)
781 elif 'thumbnail_url' not in video_info:
782 self._downloader.report_warning('unable to extract video thumbnail')
783 video_thumbnail = None
784 else: # don't panic if we can't find it
785 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
786
787 # upload date
788 upload_date = None
789 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
790 if mobj is None:
791 mobj = re.search(
792 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
793 video_webpage)
794 if mobj is not None:
795 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
796 upload_date = unified_strdate(upload_date)
797
798 m_cat_container = self._search_regex(
799 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
800 video_webpage, 'categories', fatal=False)
801 if m_cat_container:
802 category = self._html_search_regex(
803 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
804 default=None)
805 video_categories = None if category is None else [category]
806 else:
807 video_categories = None
808
809 # description
810 video_description = get_element_by_id("eow-description", video_webpage)
811 if video_description:
812 video_description = re.sub(r'''(?x)
813 <a\s+
814 (?:[a-zA-Z-]+="[^"]+"\s+)*?
815 title="([^"]+)"\s+
816 (?:[a-zA-Z-]+="[^"]+"\s+)*?
817 class="yt-uix-redirect-link"\s*>
818 [^<]+
819 </a>
820 ''', r'\1', video_description)
821 video_description = clean_html(video_description)
822 else:
823 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
824 if fd_mobj:
825 video_description = unescapeHTML(fd_mobj.group(1))
826 else:
827 video_description = ''
828
829 def _extract_count(count_name):
830 count = self._search_regex(
831 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
832 video_webpage, count_name, default=None)
833 if count is not None:
834 return int(count.replace(',', ''))
835 return None
836 like_count = _extract_count('like')
837 dislike_count = _extract_count('dislike')
838
839 # subtitles
840 video_subtitles = self.extract_subtitles(video_id, video_webpage)
841
842 if self._downloader.params.get('listsubtitles', False):
843 self._list_available_subtitles(video_id, video_webpage)
844 return
845
846 if 'length_seconds' not in video_info:
847 self._downloader.report_warning('unable to extract video duration')
848 video_duration = None
849 else:
850 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
851
852 # annotations
853 video_annotations = None
854 if self._downloader.params.get('writeannotations', False):
855 video_annotations = self._extract_annotations(video_id)
856
857 def _map_to_format_list(urlmap):
858 formats = []
859 for itag, video_real_url in urlmap.items():
860 dct = {
861 'format_id': itag,
862 'url': video_real_url,
863 'player_url': player_url,
864 }
865 if itag in self._formats:
866 dct.update(self._formats[itag])
867 formats.append(dct)
868 return formats
869
870 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
871 self.report_rtmp_download()
872 formats = [{
873 'format_id': '_rtmp',
874 'protocol': 'rtmp',
875 'url': video_info['conn'][0],
876 'player_url': player_url,
877 }]
878 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
879 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
880 if 'rtmpe%3Dyes' in encoded_url_map:
881 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
882 url_map = {}
883 for url_data_str in encoded_url_map.split(','):
884 url_data = compat_parse_qs(url_data_str)
885 if 'itag' not in url_data or 'url' not in url_data:
886 continue
887 format_id = url_data['itag'][0]
888 url = url_data['url'][0]
889
890 if 'sig' in url_data:
891 url += '&signature=' + url_data['sig'][0]
892 elif 's' in url_data:
893 encrypted_sig = url_data['s'][0]
894
895 if not age_gate:
896 jsplayer_url_json = self._search_regex(
897 r'"assets":.+?"js":\s*("[^"]+")',
898 video_webpage, 'JS player URL')
899 player_url = json.loads(jsplayer_url_json)
900 if player_url is None:
901 player_url_json = self._search_regex(
902 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
903 video_webpage, 'age gate player URL')
904 player_url = json.loads(player_url_json)
905
906 if self._downloader.params.get('verbose'):
907 if player_url is None:
908 player_version = 'unknown'
909 player_desc = 'unknown'
910 else:
911 if player_url.endswith('swf'):
912 player_version = self._search_regex(
913 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
914 'flash player', fatal=False)
915 player_desc = 'flash player %s' % player_version
916 else:
917 player_version = self._search_regex(
918 r'html5player-([^/]+?)(?:/html5player)?\.js',
919 player_url,
920 'html5 player', fatal=False)
921 player_desc = 'html5 player %s' % player_version
922
923 parts_sizes = self._signature_cache_id(encrypted_sig)
924 self.to_screen('{%s} signature length %s, %s' %
925 (format_id, parts_sizes, player_desc))
926
927 signature = self._decrypt_signature(
928 encrypted_sig, video_id, player_url, age_gate)
929 url += '&signature=' + signature
930 if 'ratebypass' not in url:
931 url += '&ratebypass=yes'
932 url_map[format_id] = url
933 formats = _map_to_format_list(url_map)
934 elif video_info.get('hlsvp'):
935 manifest_url = video_info['hlsvp'][0]
936 url_map = self._extract_from_m3u8(manifest_url, video_id)
937 formats = _map_to_format_list(url_map)
938 else:
939 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
940
941 # Look for the DASH manifest
942 if self._downloader.params.get('youtube_include_dash_manifest', True):
943 try:
944 # The DASH manifest used needs to be the one from the original video_webpage.
945 # The one found in get_video_info seems to be using different signatures.
946 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
947 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
948 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
949 dash_manifest_url = video_info.get('dashmpd')[0]
950
951 def decrypt_sig(mobj):
952 s = mobj.group(1)
953 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
954 return '/signature/%s' % dec_s
955 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
956 dash_doc = self._download_xml(
957 dash_manifest_url, video_id,
958 note='Downloading DASH manifest',
959 errnote='Could not download DASH manifest')
960 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
961 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
962 if url_el is None:
963 continue
964 format_id = r.attrib['id']
965 video_url = url_el.text
966 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
967 f = {
968 'format_id': format_id,
969 'url': video_url,
970 'width': int_or_none(r.attrib.get('width')),
971 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
972 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
973 'filesize': filesize,
974 }
975 try:
976 existing_format = next(
977 fo for fo in formats
978 if fo['format_id'] == format_id)
979 except StopIteration:
980 f.update(self._formats.get(format_id, {}))
981 formats.append(f)
982 else:
983 existing_format.update(f)
984
985 except (ExtractorError, KeyError) as e:
986 self.report_warning('Skipping DASH manifest: %r' % e, video_id)
987
988 self._sort_formats(formats)
989
990 return {
991 'id': video_id,
992 'uploader': video_uploader,
993 'uploader_id': video_uploader_id,
994 'upload_date': upload_date,
995 'title': video_title,
996 'thumbnail': video_thumbnail,
997 'description': video_description,
998 'categories': video_categories,
999 'subtitles': video_subtitles,
1000 'duration': video_duration,
1001 'age_limit': 18 if age_gate else 0,
1002 'annotations': video_annotations,
1003 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1004 'view_count': view_count,
1005 'like_count': like_count,
1006 'dislike_count': dislike_count,
1007 'formats': formats,
1008 }
1009
1010
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including auto-generated mixes.

    Accepts the playlist URL forms described by ``_VALID_URL`` as well as
    bare playlist ids that carry one of the known prefixes.
    """
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Run the login routine before any extraction takes place."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' IE."""
        results = []
        for video_id in ids:
            results.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return results

    def _extract_mix(self, playlist_id):
        """Build the playlist result for a mix (an 'RD'-prefixed playlist)."""
        # A mix is generated from a single video: the playlist id is just
        # 'RD' + video_id, so the trailing 11 characters name that video.
        seed_video_id = playlist_id[-11:]
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            mix_url, playlist_id, 'Downloading Youtube mix')

        # Take the first of these class names that yields a non-empty element.
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        id_re = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(id_re, webpage))

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return a playlist result holding one entry per listed video.

        Raises ExtractorError when the URL does not match, when a 'TL' top
        list id is given, or when the playlist page reports that the
        playlist does not exist or is private.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # The URL may additionally point at one specific video
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes need their own extraction routine
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        playlist_url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(playlist_url, playlist_id)

        # Bail out early on a missing or private playlist
        alert = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if alert is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Walk the first page and every "load more" continuation
        content_html = page
        more_widget_html = page
        ids = []
        for page_num in itertools.count(1):
            # Duplicates within a page are dropped, and so is the link with
            # index 0 (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
1181
1182
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for channel top lists given as "yttoplist:channel:title"."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        """Find the list link on the channel page and collect its video ids."""
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')
        query = compat_urllib_parse.urlencode({'title': title})

        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link_re = r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query)
        link = self._html_search_regex(link_re, channel_page, 'list')
        list_url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # The page sometimes comes back without the videos, so keep
        # downloading it until at least one id is found.
        attempt = 0
        while True:
            if attempt > 0:
                msg = 'Downloading Youtube mix' + ', retry #%d' % attempt
            else:
                msg = 'Downloading Youtube mix'
            webpage = self._download_webpage(list_url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1224
1225
class YoutubeChannelIE(InfoExtractor):
    """Extractor that lists the videos of a YouTube channel."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the distinct watch-link video ids of page, in page order."""
        return orderedSet(
            re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page))

    def _real_extract(self, url):
        """Return a playlist result with every video found for the channel.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = match.group(1)

        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated_re = r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"'''
        autogenerated = re.search(autogenerated_re, channel_page) is not None

        if autogenerated:
            # Everything is on this single page; the ajax pages can't be
            # used for such channels because they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Page through the json-based channel_ajax query until the
            # "load more" widget no longer advertises another page
            video_ids = []
            for pagenum in itertools.count(1):
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1285
1286
class YoutubeUserIE(InfoExtractor):
    """Extractor for the uploads of a YouTube user.

    Matches user page URLs as well as the "ytuser:" keyword and lists the
    uploads through the paged GData feed addressed by ``_GDATA_URL``.
    """
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module matches url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match as well.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads.

        Raises ExtractorError when url does not match _VALID_URL; a page
        whose response is not valid JSON raises ExtractorError when that
        page is fetched.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # start-index is 1-based, so the page covers the inclusive range
            # start_index .. start_index + page size - 1
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries on this page: the generator yields nothing
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1360
1361
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed in ``_API_URL``."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        page_size = 50
        video_ids = []
        limit = n
        pagenum = 0

        while page_size * pagenum < limit:
            start_index = page_size * pagenum + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend(
                [video['id'] for video in api_response['items']])

            # Never page beyond what the API reports as available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the total past the requested amount
        if len(video_ids) > n:
            video_ids = video_ids[:n]

        videos = []
        for video_id in video_ids:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1403
1404
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1410
1411
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor that turns a YouTube search results URL into a playlist."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result list of the search page into url entries."""
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def build_entry(part_code):
            # The title is optional (fatal=False); the link is searched
            # with the default fatal behaviour
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }

        # Each result is the content of one "yt-lockup-title" heading
        entries = [
            build_entry(part_code)
            for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1453
1454
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; every season is delegated as a playlist."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the show's season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # Each season of the show is published as its own playlist
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
1489
1490
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for extractors reading a feed from
    http://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed with one %s left for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Run the login routine before any extraction takes place."""
        self._login()

    def _real_extract(self, url):
        """Collect the videos of every feed page into one playlist result."""
        feed_entries = []
        paging = 0
        page_num = 0
        while True:
            page_num += 1
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a separate widget, the paging link is looked up in
            # the feed html itself
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            ids = orderedSet(
                m.group(1)
                for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            paging_match = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if paging_match is None:
                break
            paging = paging_match.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1536
1537
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1543
1544
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
1551
1552
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, like every other _VALID_URL in this module: '\.' is not a
    # valid escape sequence in a plain string literal.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1559
1560
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Hand the playlist id found on the favourites page to YoutubePlaylist."""
        favourites_url = 'https://www.youtube.com/my_favorites'
        webpage = self._download_webpage(
            favourites_url, 'Youtube Favourites videos')
        # The id is taken from the first "list=" parameter on the page
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1571
1572
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor for the subscriptions feed of the logged-in account."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Gather the video ids of the feed and all its continuation pages."""
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # Same paging scheme as for playlists, except that the video links
        # of the feed carry no index
        content_html = page
        more_widget_html = page
        ids = []
        page_num = 0
        while True:
            page_num += 1
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1609
1610
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch URLs lacking a video id and explains the likely cause."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)