# coding: utf-8

from __future__ import unicode_literals


import itertools
import json
import os.path
import re
import traceback

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter
from ..utils import (
    compat_chr,
    compat_parse_qs,
    compat_urllib_parse,
    compat_urllib_request,
    compat_urlparse,
    compat_str,

    clean_html,
    get_element_by_id,
    get_element_by_attribute,
    ExtractorError,
    int_or_none,
    OnDemandPagedList,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    uppercase_escape,
)


class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True, an error is raised when no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note='Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
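        # Illustrative sketch (hypothetical values) of the encoding step
        # above: with login_form_strs = {'Email': 'user@example.com'},
        #     compat_urllib_parse.urlencode({b'Email': b'user@example.com'})
        # yields 'Email=user%40example.com', which is sent as the POST
        # body below.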

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
                                            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note='Confirming age', errnote='Unable to confirm age',
            fatal=False)

    def _real_initialize(self):
        if self._downloader is None:
            return
        if self._get_login_info()[0] is not None:
            if not self._set_language():
                return
        if not self._login():
            return
        self._confirm_age()


class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = 'YouTube.com'
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                       # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                         # handle anchor (#/) redirect urls
                         (?:                                                 # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))               # v/ or embed/ or e/
                             |(?:                                            # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                 # the params delimiter ? or # or #!
                                 (?:.*?&)?                                   # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                         # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                      # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                     # here it is! the YouTube video ID
                     (?!.*?&list=)                                           # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                               # if we found the ID, everything can follow
                     $"""
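    # A few example URLs accepted by _VALID_URL (taken from the tests
    # below): 'http://www.youtube.com/watch?v=BaW_jenozKc',
    # '//www.YouTube.com/watch?v=yZIXLfi8CZQ', and, per the comment in
    # the regex, the naked 11-character video ID itself.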
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},

        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # Dash mov
        '298': {'ext': 'mov', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '299': {'ext': 'mov', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '266': {'ext': 'mov', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = 'youtube'
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack - The Spark ft. Spree Wilson',
                'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
    ]

    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen('%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen('%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a format is unavailable."""
        self.to_screen('%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')

    def _signature_cache_id(self, example_sig):
        """ Return a string representation of a signature """
        return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
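
    # For example (illustrative): the three-part signature 'ABC.DEFG.HI'
    # yields the cache id '3.4.2'.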

    def _extract_signature_function(self, video_id, player_url, example_sig):
        id_m = re.match(
            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            return lambda s: ''.join(s[i] for i in cache_spec)

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_spec is None:
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
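            # This works because the extracted signature functions only
            # rearrange and drop characters: running the function on the
            # probe string '\x00\x01\x02...' and reading back the code
            # points of the result recovers the index permutation, which
            # the cached lambda above replays as plain s[i] lookups.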

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res

    def _print_sig_code(self, func, example_sig):
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quell pyflakes warnings - start will be set as soon as step is
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)

    def _parse_sig_js(self, jscode):
        funcname = self._search_regex(
            r'signature=([$a-zA-Z]+)', jscode,
            'Initial JS player signature function name')
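
        # Illustrative (hypothetical) player JS fragment that the regex
        # above targets:
        #     c.signature=xy(d.s)
        # from which funcname would be extracted as 'xy'.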

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        return lambda s: initial_function([s])

    def _parse_sig_swf(self, file_contents):
        swfi = SWFInterpreter(file_contents)
        TARGET_CLASSNAME = 'SignatureDecipher'
        searched_class = swfi.extract_class(TARGET_CLASSNAME)
        initial_function = swfi.extract_function(searched_class, 'decipher')
        return lambda s: initial_function([s])

    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is None:
            raise ExtractorError('Cannot decrypt signature without player_url')

        if player_url.startswith('//'):
            player_url = 'https:' + player_url
        try:
            player_id = (player_url, self._signature_cache_id(s))
            if player_id not in self._player_cache:
                func = self._extract_signature_function(
                    video_id, player_url, s
                )
                self._player_cache[player_id] = func
            func = self._player_cache[player_id]
            if self._downloader.params.get('youtube_print_sig_code'):
                self._print_sig_code(func, s)
            return func(s)
        except Exception as e:
            tb = traceback.format_exc()
            raise ExtractorError(
                'Signature extraction failed: ' + tb, cause=e)

    def _get_available_subtitles(self, video_id, webpage):
        try:
            sub_list = self._download_webpage(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
            return {}
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

        sub_lang_list = {}
        for l in lang_list:
            lang = l[1]
            if lang in sub_lang_list:
                continue
            params = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(l[0]).encode('utf-8'),
            })
            url = 'https://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return sub_lang_list
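
    # The timedtext listing parsed above is a small XML document whose
    # <track> elements look roughly like this (illustrative sample, not
    # from the original source):
    #     <track id="0" name="" lang_code="en" ... />
    # The regex in _get_available_subtitles pulls out the name/lang_code
    # attribute pairs.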

    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen('%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config['args']
            caption_url = args['ttsurl']
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there
        # are no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}

    @classmethod
    def extract_id(cls, url):
        mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id
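
    # For example (using a URL from the tests above):
    #     YoutubeIE.extract_id('http://www.youtube.com/watch?v=BaW_jenozKc')
    # returns 'BaW_jenozKc'.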

    def _extract_from_m3u8(self, manifest_url, video_id):
        url_map = {}

        def _get_urls(_manifest):
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
                          lines)
            return urls
        manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
        return url_map
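
    # Sketch of the manifest lines this expects (hypothetical URL): the
    # m3u8 playlist alternates '#EXT...' tag lines with media URLs that
    # embed the itag in their path, e.g.
    #     https://manifest.googlevideo.com/.../itag/92/.../index.m3u8
    # so the regex above maps itag '92' to that URL.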

    def _extract_annotations(self, video_id):
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')

    def _real_extract(self, url):
        proto = (
            'http' if self._downloader.params.get('prefer_insecure', False)
            else 'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        pref_cookies = [
            c for c in self._downloader.cookiejar
            if c.domain == '.youtube.com' and c.name == 'PREF']
        for pc in pref_cookies:
            if 'hl=' in pc.value:
                pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
            else:
                if pc.value:
                    pc.value += '&'
                pc.value += 'hl=en'
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id};
            # this page can be viewed without logging in to YouTube
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                'sts': self._search_regex(
                    r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
            })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                  % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                            note=False,
                                                            errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    '"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError('"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError('Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning('unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning('Unable to extract video title')
            video_title = '_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning('unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        m_cat_container = self._search_regex(
            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
            video_webpage, 'categories', fatal=False)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = ''

        def _extract_count(count_name):
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
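
        # Illustrative (hypothetical) watch-page markup matched by
        # _extract_count:
        #     <span id="watch-like" ...>Like 1,234</span>
        # which would be parsed as the integer 1234.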
        like_count = _extract_count('like')
        dislike_count = _extract_count('dislike')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning('unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find ytplayer.config')
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map:
            # these signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError('No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen('%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', ''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        def _map_to_format_list(urlmap):
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' not in url_data or 'url' not in url_data:
                    continue
                format_id = url_data['itag'][0]
                url = url_data['url'][0]

                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]

                    if not age_gate:
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, 'JS player URL')
                        player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)

                    if self._downloader.params.get('verbose'):
                        if player_url is None:
                            player_version = 'unknown'
                            player_desc = 'unknown'
                        else:
                            if player_url.endswith('swf'):
                                player_version = self._search_regex(
                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
                                    player_url,
                                    'html5 player', fatal=False)
                                player_desc = 'html5 player %s' % player_version

                        parts_sizes = self._signature_cache_id(encrypted_sig)
                        self.to_screen('{%s} signature length %s, %s' %
                                       (format_id, parts_sizes, player_desc))

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[format_id] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if self._downloader.params.get('youtube_include_dash_manifest', True):
            try:
                # The DASH manifest used needs to be the one from the original video_webpage.
                # The one found in get_video_info seems to be using different signatures.
                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
                if age_gate:
                    dash_manifest_url = video_info.get('dashmpd')[0]
                else:
                    dash_manifest_url = ytplayer_config['args']['dashmpd']

                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s
                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
                dash_doc = self._download_xml(
                    dash_manifest_url, video_id,
                    note='Downloading DASH manifest',
                    errnote='Could not download DASH manifest')
                for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning('Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }


class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
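    # _VIDEO_RE matches playlist-page entries of the (illustrative) form
    #     href="/watch?v=BaW_jenozKc&amp;list=...&amp;index=2"
    # capturing the video id and its position in the list.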
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video;
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)
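
    # For example (hypothetical id): the mix playlist 'RDBaW_jenozKc' is
    # derived from the video 'BaW_jenozKc', so playlist_id[-11:] recovers
    # the seed video id used to build the watch URL above.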

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)


class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # Sometimes the webpage doesn't contain the videos;
        # retry until we get them
        for i in itertools.count(0):
            msg = 'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)


class YoutubeChannelIE(InfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page;
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)


class YoutubeUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match them too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
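        # For example: with _GDATA_PAGE_SIZE == 50, download_page(0)
        # requests start-index=1 and download_page(1) requests
        # start-index=51.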

        def download_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)


class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
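
    # For example: 'ytsearch5:youtube-dl' resolves to query 'youtube-dl'
    # with n == 5; page 1 requests start-index=1, page 2 start-index=51,
    # and so on, until enough ids are collected.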


class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'


class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(mobj.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        part_codes = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        entries = []
        for part_code in part_codes:
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            part_url = compat_urlparse.urljoin(
                'https://www.youtube.com/', part_url_snippet)
            entries.append({
                '_type': 'url',
                'url': part_url,
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }


class YoutubeShowIE(InfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
        entries = [
            self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in m_seasons
        ]
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }


class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
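
    # For example, for YoutubeRecommendedIE (below) this yields:
    #     https://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=recommended&paging=%s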

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       '%s feed' % self._FEED_NAME,
                                       'Downloading page %s' % i)
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)


class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'


class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _PERSONAL_FEED = True


class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'


class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')


class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            new_ids = orderedSet(matches)
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }


class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            'or simply youtube-dl BaW_jenozKc .',
            expected=True)