Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youtube.py
debian/changelog: Annotate with bugs being closed.
[youtubedl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import struct
11 import traceback
12 import zlib
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from .subtitles import SubtitlesInfoExtractor
16 from ..jsinterp import JSInterpreter
17 from ..utils import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24
25 clean_html,
26 get_cachedir,
27 get_element_by_id,
28 get_element_by_attribute,
29 ExtractorError,
30 int_or_none,
31 PagedList,
32 unescapeHTML,
33 unified_strdate,
34 orderedSet,
35 write_json_file,
36 uppercase_escape,
37 )
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors.

    Bundles the session preparation shared by the YouTube extractors:
    forcing the interface language, logging in with the configured
    credentials and confirming the age gate.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request the language-setting URL.

        Returns True when a (non-empty) page came back, False otherwise;
        the download is non-fatal.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the credentials from ``_get_login_info()``.

        Returns True on a successful login and False in every other case
        (no credentials, login page or login request could not be
        downloaded, credentials rejected).

        Raises ExtractorError when no credentials are available and
        ``_LOGIN_REQUIRED`` is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Report failure with an explicit False, like every other
            # failure path of this method.
            return False

        # Hidden form token that has to be echoed back in the login POST
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were
        # not accepted.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to ``_AGE_URL``; returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first
        step that does not succeed. Does nothing without a downloader."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
130
131
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    """Information extractor for single YouTube videos."""
    IE_DESC = u'YouTube.com'
    # Verbose-mode pattern: group 1 is the optional URL prefix, group 2 is
    # the 11 character video ID.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)? # handle anchor (#/) redirect urls
                         (?: # the various things that can precede the ID:
                             (?:(?:v|embed|e)/) # v/ or embed/ or e/
                             |(?: # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?) # the params delimiter ? or # or #!
                                 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/ # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static properties of the known formats, keyed by format code (as a
    # string). Entries with 'acodec': 'none' carry video only, entries
    # with 'vcodec': 'none' carry audio only.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = u'youtube'
    # Test cases: a URL plus the metadata expected for it; "params"
    # carries per-test options.
    # NOTE(review): presumably consumed by the project's download test
    # runner - confirm.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
320
321
@classmethod
def suitable(cls, url):
    """Tell whether this IE should handle ``url``.

    URLs claimed by YoutubePlaylistIE are always rejected; anything else
    is accepted when it matches ``_VALID_URL``.
    """
    return (not YoutubePlaylistIE.suitable(url)
            and re.match(cls._VALID_URL, url) is not None)
327
def __init__(self, *args, **kwargs):
    """Initialise the base extractors and create an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps (player_url, signature length) to the deciphering function;
    # filled by _decrypt_signature.
    self._player_cache = {}
331
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage of ``video_id`` is being fetched."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
335
def report_information_extraction(self, video_id):
    """Announce that information is being extracted for ``video_id``."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
339
def report_unavailable_format(self, video_id, format):
    """Announce that the requested ``format`` is not available for ``video_id``."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
343
def report_rtmp_download(self):
    """Announce that the download is going to use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
347
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a callable that deciphers signatures of length ``slen``.

    The player type ('js' or 'swf') and player id are taken from the end
    of ``player_url``. When a cache directory is configured, a previously
    stored permutation is used if present; otherwise the player is
    downloaded and parsed, and the resulting permutation is written to
    the cache (a failure to write only produces a warning).

    Raises ExtractorError if ``player_url`` does not have the expected
    ``...-<id>.<ext>`` shape.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        # Fail with a readable error instead of AttributeError on None
        raise ExtractorError(u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # func_id becomes a file name; it must not contain path separators
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            # The cache stores, for every output position, the index of
            # the input character that ends up there.
            return lambda s: u''.join(s[i] for i in cache_spec)
        except IOError:
            pass  # No cache available
        except ValueError:
            # Corrupt cache file: treat it as a miss; it gets rewritten
            # below once the player has been parsed again.
            self._downloader.report_warning(
                u'Ignoring unreadable signature cache file %r' % cache_fn)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        assert False, 'Invalid player type %r' % player_type

    if cache_enabled:
        try:
            # Run the function on a string of distinct characters to
            # recover the permutation it applies.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
404
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to the signature function ``func``.

    ``func`` is applied to a string of ``slen`` distinct characters; the
    resulting permutation is rendered as a sum of index and slice
    expressions on ``s`` and shown with ``to_screen``.
    """
    def slice_expr(first, last, stride):
        # Render the run first..last (inclusive) as a slice of s
        lower = u'' if first == 0 else str(first)
        bound = last + stride
        upper = u':' if bound < 0 else (u':%d' % bound)
        suffix = u'' if stride == 1 else (u':%d' % stride)
        return u's[%s%s%s]' % (lower, upper, suffix)

    def gen_sig_code(idxs):
        stride = None
        run_start = '(Never used)'  # only meaningful while stride is set
        for cur, before in zip(idxs[1:], idxs[:-1]):
            delta = cur - before
            if stride is not None:
                if delta != stride:
                    # The current run ends at the previous index
                    yield slice_expr(run_start, before, stride)
                    stride = None
            elif delta in [-1, 1]:
                # A new ascending or descending run starts here
                stride = delta
                run_start = before
            else:
                yield u's[%d]' % before
        # Emit whatever is still pending for the last index
        if stride is None:
            yield u's[%d]' % cur
        else:
            yield slice_expr(run_start, cur, stride)

    probe = u''.join(map(compat_chr, range(slen)))
    permutation = [ord(c) for c in func(probe)]
    expr_code = u' + '.join(gen_sig_code(permutation))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
440
def _parse_sig_js(self, jscode):
    """Build the signature function from the JavaScript player code.

    The name following ``signature=`` in ``jscode`` is looked up and
    extracted with JSInterpreter; the returned callable passes the
    signature string to it as a one-element argument list.
    """
    funcname = self._search_regex(
        r'signature=([$a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')
    initial_function = JSInterpreter(jscode).extract_function(funcname)

    def decipher(s):
        return initial_function([s])

    return decipher
449
450 def _parse_sig_swf(self, file_contents):
451 if file_contents[1:3] != b'WS':
452 raise ExtractorError(
453 u'Not an SWF file; header is %r' % file_contents[:3])
454 if file_contents[:1] == b'C':
455 content = zlib.decompress(file_contents[8:])
456 else:
457 raise NotImplementedError(u'Unsupported compression format %r' %
458 file_contents[:1])
459
460 def extract_tags(content):
461 pos = 0
462 while pos < len(content):
463 header16 = struct.unpack('<H', content[pos:pos+2])[0]
464 pos += 2
465 tag_code = header16 >> 6
466 tag_len = header16 & 0x3f
467 if tag_len == 0x3f:
468 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
469 pos += 4
470 assert pos+tag_len <= len(content)
471 yield (tag_code, content[pos:pos+tag_len])
472 pos += tag_len
473
474 code_tag = next(tag
475 for tag_code, tag in extract_tags(content)
476 if tag_code == 82)
477 p = code_tag.index(b'\0', 4) + 1
478 code_reader = io.BytesIO(code_tag[p:])
479
480 # Parse ABC (AVM2 ByteCode)
481 def read_int(reader=None):
482 if reader is None:
483 reader = code_reader
484 res = 0
485 shift = 0
486 for _ in range(5):
487 buf = reader.read(1)
488 assert len(buf) == 1
489 b = struct.unpack('<B', buf)[0]
490 res = res | ((b & 0x7f) << shift)
491 if b & 0x80 == 0:
492 break
493 shift += 7
494 return res
495
496 def u30(reader=None):
497 res = read_int(reader)
498 assert res & 0xf0000000 == 0
499 return res
500 u32 = read_int
501
502 def s32(reader=None):
503 v = read_int(reader)
504 if v & 0x80000000 != 0:
505 v = - ((v ^ 0xffffffff) + 1)
506 return v
507
508 def read_string(reader=None):
509 if reader is None:
510 reader = code_reader
511 slen = u30(reader)
512 resb = reader.read(slen)
513 assert len(resb) == slen
514 return resb.decode('utf-8')
515
516 def read_bytes(count, reader=None):
517 if reader is None:
518 reader = code_reader
519 resb = reader.read(count)
520 assert len(resb) == count
521 return resb
522
523 def read_byte(reader=None):
524 resb = read_bytes(1, reader=reader)
525 res = struct.unpack('<B', resb)[0]
526 return res
527
528 # minor_version + major_version
529 read_bytes(2 + 2)
530
531 # Constant pool
532 int_count = u30()
533 for _c in range(1, int_count):
534 s32()
535 uint_count = u30()
536 for _c in range(1, uint_count):
537 u32()
538 double_count = u30()
539 read_bytes((double_count-1) * 8)
540 string_count = u30()
541 constant_strings = [u'']
542 for _c in range(1, string_count):
543 s = read_string()
544 constant_strings.append(s)
545 namespace_count = u30()
546 for _c in range(1, namespace_count):
547 read_bytes(1) # kind
548 u30() # name
549 ns_set_count = u30()
550 for _c in range(1, ns_set_count):
551 count = u30()
552 for _c2 in range(count):
553 u30()
554 multiname_count = u30()
555 MULTINAME_SIZES = {
556 0x07: 2, # QName
557 0x0d: 2, # QNameA
558 0x0f: 1, # RTQName
559 0x10: 1, # RTQNameA
560 0x11: 0, # RTQNameL
561 0x12: 0, # RTQNameLA
562 0x09: 2, # Multiname
563 0x0e: 2, # MultinameA
564 0x1b: 1, # MultinameL
565 0x1c: 1, # MultinameLA
566 }
567 multinames = [u'']
568 for _c in range(1, multiname_count):
569 kind = u30()
570 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
571 if kind == 0x07:
572 u30() # namespace_idx
573 name_idx = u30()
574 multinames.append(constant_strings[name_idx])
575 else:
576 multinames.append('[MULTINAME kind: %d]' % kind)
577 for _c2 in range(MULTINAME_SIZES[kind]):
578 u30()
579
580 # Methods
581 method_count = u30()
582 MethodInfo = collections.namedtuple(
583 'MethodInfo',
584 ['NEED_ARGUMENTS', 'NEED_REST'])
585 method_infos = []
586 for method_id in range(method_count):
587 param_count = u30()
588 u30() # return type
589 for _ in range(param_count):
590 u30() # param type
591 u30() # name index (always 0 for youtube)
592 flags = read_byte()
593 if flags & 0x08 != 0:
594 # Options present
595 option_count = u30()
596 for c in range(option_count):
597 u30() # val
598 read_bytes(1) # kind
599 if flags & 0x80 != 0:
600 # Param names present
601 for _ in range(param_count):
602 u30() # param name
603 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
604 method_infos.append(mi)
605
606 # Metadata
607 metadata_count = u30()
608 for _c in range(metadata_count):
609 u30() # name
610 item_count = u30()
611 for _c2 in range(item_count):
612 u30() # key
613 u30() # value
614
615 def parse_traits_info():
616 trait_name_idx = u30()
617 kind_full = read_byte()
618 kind = kind_full & 0x0f
619 attrs = kind_full >> 4
620 methods = {}
621 if kind in [0x00, 0x06]: # Slot or Const
622 u30() # Slot id
623 u30() # type_name_idx
624 vindex = u30()
625 if vindex != 0:
626 read_byte() # vkind
627 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
628 u30() # disp_id
629 method_idx = u30()
630 methods[multinames[trait_name_idx]] = method_idx
631 elif kind == 0x04: # Class
632 u30() # slot_id
633 u30() # classi
634 elif kind == 0x05: # Function
635 u30() # slot_id
636 function_idx = u30()
637 methods[function_idx] = multinames[trait_name_idx]
638 else:
639 raise ExtractorError(u'Unsupported trait kind %d' % kind)
640
641 if attrs & 0x4 != 0: # Metadata present
642 metadata_count = u30()
643 for _c3 in range(metadata_count):
644 u30() # metadata index
645
646 return methods
647
648 # Classes
649 TARGET_CLASSNAME = u'SignatureDecipher'
650 searched_idx = multinames.index(TARGET_CLASSNAME)
651 searched_class_id = None
652 class_count = u30()
653 for class_id in range(class_count):
654 name_idx = u30()
655 if name_idx == searched_idx:
656 # We found the class we're looking for!
657 searched_class_id = class_id
658 u30() # super_name idx
659 flags = read_byte()
660 if flags & 0x08 != 0: # Protected namespace is present
661 u30() # protected_ns_idx
662 intrf_count = u30()
663 for _c2 in range(intrf_count):
664 u30()
665 u30() # iinit
666 trait_count = u30()
667 for _c2 in range(trait_count):
668 parse_traits_info()
669
670 if searched_class_id is None:
671 raise ExtractorError(u'Target class %r not found' %
672 TARGET_CLASSNAME)
673
674 method_names = {}
675 method_idxs = {}
676 for class_id in range(class_count):
677 u30() # cinit
678 trait_count = u30()
679 for _c2 in range(trait_count):
680 trait_methods = parse_traits_info()
681 if class_id == searched_class_id:
682 method_names.update(trait_methods.items())
683 method_idxs.update(dict(
684 (idx, name)
685 for name, idx in trait_methods.items()))
686
687 # Scripts
688 script_count = u30()
689 for _c in range(script_count):
690 u30() # init
691 trait_count = u30()
692 for _c2 in range(trait_count):
693 parse_traits_info()
694
695 # Method bodies
696 method_body_count = u30()
697 Method = collections.namedtuple('Method', ['code', 'local_count'])
698 methods = {}
699 for _c in range(method_body_count):
700 method_idx = u30()
701 u30() # max_stack
702 local_count = u30()
703 u30() # init_scope_depth
704 u30() # max_scope_depth
705 code_length = u30()
706 code = read_bytes(code_length)
707 if method_idx in method_idxs:
708 m = Method(code, local_count)
709 methods[method_idxs[method_idx]] = m
710 exception_count = u30()
711 for _c2 in range(exception_count):
712 u30() # from
713 u30() # to
714 u30() # target
715 u30() # exc_type
716 u30() # var_name
717 trait_count = u30()
718 for _c2 in range(trait_count):
719 parse_traits_info()
720
721 assert p + code_reader.tell() == len(code_tag)
722 assert len(methods) == len(method_idxs)
723
724 method_pyfunctions = {}
725
726 def extract_function(func_name):
727 if func_name in method_pyfunctions:
728 return method_pyfunctions[func_name]
729 if func_name not in methods:
730 raise ExtractorError(u'Cannot find function %r' % func_name)
731 m = methods[func_name]
732
733 def resfunc(args):
734 registers = ['(this)'] + list(args) + [None] * m.local_count
735 stack = []
736 coder = io.BytesIO(m.code)
737 while True:
738 opcode = struct.unpack('!B', coder.read(1))[0]
739 if opcode == 36: # pushbyte
740 v = struct.unpack('!B', coder.read(1))[0]
741 stack.append(v)
742 elif opcode == 44: # pushstring
743 idx = u30(coder)
744 stack.append(constant_strings[idx])
745 elif opcode == 48: # pushscope
746 # We don't implement the scope register, so we'll just
747 # ignore the popped value
748 stack.pop()
749 elif opcode == 70: # callproperty
750 index = u30(coder)
751 mname = multinames[index]
752 arg_count = u30(coder)
753 args = list(reversed(
754 [stack.pop() for _ in range(arg_count)]))
755 obj = stack.pop()
756 if mname == u'split':
757 assert len(args) == 1
758 assert isinstance(args[0], compat_str)
759 assert isinstance(obj, compat_str)
760 if args[0] == u'':
761 res = list(obj)
762 else:
763 res = obj.split(args[0])
764 stack.append(res)
765 elif mname == u'slice':
766 assert len(args) == 1
767 assert isinstance(args[0], int)
768 assert isinstance(obj, list)
769 res = obj[args[0]:]
770 stack.append(res)
771 elif mname == u'join':
772 assert len(args) == 1
773 assert isinstance(args[0], compat_str)
774 assert isinstance(obj, list)
775 res = args[0].join(obj)
776 stack.append(res)
777 elif mname in method_pyfunctions:
778 stack.append(method_pyfunctions[mname](args))
779 else:
780 raise NotImplementedError(
781 u'Unsupported property %r on %r'
782 % (mname, obj))
783 elif opcode == 72: # returnvalue
784 res = stack.pop()
785 return res
786 elif opcode == 79: # callpropvoid
787 index = u30(coder)
788 mname = multinames[index]
789 arg_count = u30(coder)
790 args = list(reversed(
791 [stack.pop() for _ in range(arg_count)]))
792 obj = stack.pop()
793 if mname == u'reverse':
794 assert isinstance(obj, list)
795 obj.reverse()
796 else:
797 raise NotImplementedError(
798 u'Unsupported (void) property %r on %r'
799 % (mname, obj))
800 elif opcode == 93: # findpropstrict
801 index = u30(coder)
802 mname = multinames[index]
803 res = extract_function(mname)
804 stack.append(res)
805 elif opcode == 97: # setproperty
806 index = u30(coder)
807 value = stack.pop()
808 idx = stack.pop()
809 obj = stack.pop()
810 assert isinstance(obj, list)
811 assert isinstance(idx, int)
812 obj[idx] = value
813 elif opcode == 98: # getlocal
814 index = u30(coder)
815 stack.append(registers[index])
816 elif opcode == 99: # setlocal
817 index = u30(coder)
818 value = stack.pop()
819 registers[index] = value
820 elif opcode == 102: # getproperty
821 index = u30(coder)
822 pname = multinames[index]
823 if pname == u'length':
824 obj = stack.pop()
825 assert isinstance(obj, list)
826 stack.append(len(obj))
827 else: # Assume attribute access
828 idx = stack.pop()
829 assert isinstance(idx, int)
830 obj = stack.pop()
831 assert isinstance(obj, list)
832 stack.append(obj[idx])
833 elif opcode == 128: # coerce
834 u30(coder)
835 elif opcode == 133: # coerce_s
836 assert isinstance(stack[-1], (type(None), compat_str))
837 elif opcode == 164: # modulo
838 value2 = stack.pop()
839 value1 = stack.pop()
840 res = value1 % value2
841 stack.append(res)
842 elif opcode == 208: # getlocal_0
843 stack.append(registers[0])
844 elif opcode == 209: # getlocal_1
845 stack.append(registers[1])
846 elif opcode == 210: # getlocal_2
847 stack.append(registers[2])
848 elif opcode == 211: # getlocal_3
849 stack.append(registers[3])
850 elif opcode == 214: # setlocal_2
851 registers[2] = stack.pop()
852 elif opcode == 215: # setlocal_3
853 registers[3] = stack.pop()
854 else:
855 raise NotImplementedError(
856 u'Unsupported opcode %d' % opcode)
857
858 method_pyfunctions[func_name] = resfunc
859 return resfunc
860
861 initial_function = extract_function(u'decipher')
862 return lambda s: initial_function([s])
863
864 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
865 """Turn the encrypted s field into a working signature"""
866
867 if player_url is not None:
868 if player_url.startswith(u'//'):
869 player_url = u'https:' + player_url
870 try:
871 player_id = (player_url, len(s))
872 if player_id not in self._player_cache:
873 func = self._extract_signature_function(
874 video_id, player_url, len(s)
875 )
876 self._player_cache[player_id] = func
877 func = self._player_cache[player_id]
878 if self._downloader.params.get('youtube_print_sig_code'):
879 self._print_sig_code(func, len(s))
880 return func(s)
881 except Exception:
882 tb = traceback.format_exc()
883 self._downloader.report_warning(
884 u'Automatic signature extraction failed: ' + tb)
885
886 self._downloader.report_warning(
887 u'Warning: Falling back to static signature algorithm')
888
889 return self._static_decrypt_signature(
890 s, video_id, player_url, age_gate)
891
892 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
893 if age_gate:
894 # The videos with age protection use another player, so the
895 # algorithms can be different.
896 if len(s) == 86:
897 return s[2:63] + s[82] + s[64:82] + s[63]
898
899 if len(s) == 93:
900 return s[86:29:-1] + s[88] + s[28:5:-1]
901 elif len(s) == 92:
902 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
903 elif len(s) == 91:
904 return s[84:27:-1] + s[86] + s[26:5:-1]
905 elif len(s) == 90:
906 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
907 elif len(s) == 89:
908 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
909 elif len(s) == 88:
910 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
911 elif len(s) == 87:
912 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
913 elif len(s) == 86:
914 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
915 elif len(s) == 85:
916 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
917 elif len(s) == 84:
918 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
919 elif len(s) == 83:
920 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
921 elif len(s) == 82:
922 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
923 elif len(s) == 81:
924 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
925 elif len(s) == 80:
926 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
927 elif len(s) == 79:
928 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
929
930 else:
931 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
932
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping subtitle language codes to download URLs.

    The track list is downloaded from the timedtext listing endpoint and
    one URL is built per listed track, using the 'subtitlesformat'
    downloader parameter (default 'srt').  ``webpage`` is not used.

    An empty dict is returned, after a warning, when the listing cannot
    be downloaded or when it contains no tracks.
    """
    listing_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_listing = self._download_webpage(listing_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    subtitle_urls = {}
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_listing)
    for track_name, lang_code in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang_code,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        # A later track with the same language code replaces the earlier one.
        subtitle_urls[lang_code] = u'https://www.youtube.com/api/timedtext?' + query

    if subtitle_urls:
        return subtitle_urls
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
958
def _get_available_automatic_caption(self, video_id, webpage):
    """Return the automatic (ASR) caption URLs advertised by the watch page.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    The player configuration embedded in ``webpage`` supplies the caption
    base URL ('ttsurl') and a timestamp; the caption list is then
    downloaded and one URL is built per listed translation target.

    Returns a dict mapping target language code to caption URL.  An empty
    dict is returned, after a warning, when the player configuration is
    missing or cannot be parsed, when required keys are absent, when the
    first track is not of kind 'asr', or when a download fails.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id

    # Same pattern as the one used for the player config in _real_extract:
    # literal dot, flexible whitespace around '='.
    mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', webpage)
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}

    try:
        # uppercase_escape mirrors the preprocessing _real_extract applies
        # to this same JSON blob before parsing it.
        player_config = json.loads(uppercase_escape(mobj.group(1)))
    except ValueError:
        # The non-greedy capture may stop early or the blob may be
        # malformed; treat it like a missing configuration.
        self._downloader.report_warning(err_msg)
        return {}

    try:
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1005
@classmethod
def extract_id(cls, url):
    """Return the video id captured by group 2 of ``cls._VALID_URL``.

    The pattern is matched at the start of ``url`` in verbose mode.
    Raises ExtractorError when ``url`` does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
1013
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download the manifest at ``manifest_url`` and map itags to URLs.

    Every non-empty manifest line that does not start with '#' is treated
    as a format URL; its itag is taken from the 'itag/<digits>/' part of
    the URL.  Returns a dict of itag -> format URL.
    """
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')

    itag_to_url = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#'-prefixed lines.
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        itag_to_url[itag] = line
    return itag_to_url
1027
def _extract_annotations(self, video_id):
    """Download and return the annotations document for ``video_id``."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note=u'Searching for annotations.',
        errnote=u'Unable to download video annotations.')
1031
def _real_extract(self, url):
    """Extract metadata and the available formats for a single video.

    Steps, in order: resolve a ``next_url`` redirection if present,
    download the watch page, download the get_video_info data (using the
    embedded-player variant when the page shows an age gate), scrape
    uploader/title/thumbnail/date/category/description/like counts,
    collect subtitles and optional annotations, build the format list
    from rtmp 'conn', the url-encoded stream maps or the HLS manifest
    (decrypting signatures where needed), and optionally merge in the
    formats of the DASH manifest.

    Returns the info dict for the video, or None when the
    'listsubtitles' downloader parameter is set (the available subtitles
    are listed instead).

    Raises ExtractorError when the video info has no 'token', for rental
    videos, when the uploader is missing, for rtmpe streams, or when no
    usable stream information is found.
    """
    proto = (
        u'http' if self._downloader.params.get('prefer_insecure', False)
        else u'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes escaping the URL inside the page source.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'player_embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try each 'el' variant in turn and keep the first answer that
        # carries a token (the last answer is kept if none does).
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                u'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                u'"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Turn '/', ',' and '-' into spaces and collapse whitespace before
        # handing the text to unified_strdate.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    m_cat_container = get_element_by_id("eow-category", video_webpage)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace each redirect link by the value of its title attribute.
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Integer inside <... class="klass">1,234</span>, or None.
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download
    # Stays None when the player config cannot be located or parsed; the
    # DASH section below checks for that instead of hitting an unbound name.
    ytplayer_config = None
    try:
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        json_code = uppercase_escape(mobj.group(1))
        ytplayer_config = json.loads(json_code)
        args = ytplayer_config['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        # Best effort: without a usable player config the stream maps from
        # get_video_info are used unchanged.
        pass

    def _map_to_format_list(urlmap):
        # One format dict per itag, enriched from self._formats when known.
        # player_url is read when this helper is called, so it reflects any
        # reassignment made while the URL map was being built.
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if (self._downloader.params.get('youtube_include_dash_manifest', False)):
        try:
            # The DASH manifest used needs to be the one from the original video_webpage.
            # The one found in get_video_info seems to be using different signatures.
            # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
            # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
            # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
            if age_gate:
                # Plain indexing so that a missing entry raises the KeyError
                # handled below rather than a TypeError on None.
                dash_manifest_url = video_info['dashmpd'][0]
            else:
                if ytplayer_config is None:
                    # No player config was parsed above; skip DASH through
                    # the KeyError handler below.
                    raise KeyError('ytplayer.config')
                dash_manifest_url = ytplayer_config['args']['dashmpd']
            def decrypt_sig(mobj):
                s = mobj.group(1)
                dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                return '/signature/%s' % dec_s
            dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
            dash_doc = self._download_xml(
                dash_manifest_url, video_id,
                note=u'Downloading DASH manifest',
                errnote=u'Could not download DASH manifest')
            for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Update the format already collected under this id, or
                # append a new one seeded from self._formats.
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

    self._sort_formats(formats)

    return {
        'id':           video_id,
        'uploader':     video_uploader,
        'uploader_id':  video_uploader_id,
        'upload_date':  upload_date,
        'title':        video_title,
        'thumbnail':    video_thumbnail,
        'description':  video_description,
        'categories':   video_categories,
        'subtitles':    video_subtitles,
        'duration':     video_duration,
        'age_limit':    18 if age_gate else 0,
        'annotations':  video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count':   view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats':      formats,
    }
1376
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube.com playlists, including 'RD' mixes."""

    IE_NAME = u'youtube:playlist'
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'

    def _real_initialize(self):
        """Log in before any extraction (see _login on the base class)."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' IE."""
        results = []
        for video_id in ids:
            results.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return results

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix identified by ``playlist_id``."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        seed_video_id = playlist_id[-11:]
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(mix_url, playlist_id, u'Downloading Youtube mix')

        def element_with_class(class_name):
            return get_element_by_attribute('class', class_name, webpage)

        # First class name that yields a non-empty element wins.
        title_span = element_with_class('playlist-title')
        if not title_span:
            title_span = element_with_class('title long-title')
        if not title_span:
            title_span = element_with_class('title')
        title = clean_html(title_span)

        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        video_ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        entries = self._ids_to_results(video_ids)

        return self.playlist_result(entries, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist (or single video) result for ``url``.

        A URL carrying a 'v' query parameter yields just that video when
        the 'noplaylist' parameter is set.  'RD' ids are delegated to
        _extract_mix; 'TL' ids are rejected with a hint.  Otherwise the
        playlist page and its "load more" continuations are walked.

        Raises ExtractorError for an unmatched URL, for 'TL' ids and for
        playlists reported as missing or private.
        """
        # Extract playlist id
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = id_match.group(1) or id_match.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        playlist_url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(playlist_url, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page)
        if unavailable is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        collected_ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            page_ids = orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0')
            collected_ids.extend(page_ids)

            more_match = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            page_num += 1
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        entries = self._ids_to_results(collected_ids)
        return self.playlist_result(entries, playlist_id, playlist_title)
1487
1488
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the 'yttoplist:{channel}:{list title}' keyword."""

    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    # Upper bound on re-downloads of the list page when it comes back
    # without any video; keeps the retry loop from running forever.
    _MAX_PAGE_RETRIES = 10

    def _real_extract(self, url):
        """Return the playlist result for the top list named in ``url``.

        The channel page is searched for a link whose query contains the
        url-encoded list title; that page is then downloaded (and
        re-downloaded up to _MAX_PAGE_RETRIES times while it contains no
        videos) and its video ids are collected.

        Raises ExtractorError when the link cannot be found or when no
        video shows up after all retries.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them, but give up after a bounded number of
        # attempts instead of looping forever
        for i in range(self._MAX_PAGE_RETRIES + 1):
            msg = u'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        if not ids:
            raise ExtractorError(
                u'Unable to find any video in top list "%s" after %d retries'
                % (title, self._MAX_PAGE_RETRIES))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1519
1520
class YoutubeChannelIE(InfoExtractor):
    """Extractor for YouTube.com channels."""

    IE_NAME = u'youtube:channel'
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from ``page``, first occurrence order."""
        unique_ids = []
        for candidate in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if candidate in unique_ids:
                continue
            unique_ids.append(candidate)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result with every video found for the channel.

        Raises ExtractorError when ``url`` does not match _VALID_URL.
        """
        # Extract channel id
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = url_match.group(1)

        # Download channel page
        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    ajax_url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1575
1576
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploads (URL or "ytuser" keyword)."""

    IE_NAME = u'youtube:user'
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other '...IE' class in this module claims ``url``."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match as well.
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist result of the user's uploads.

        Raises ExtractorError when ``url`` does not match _VALID_URL; the
        page generator raises it when an API response is not valid JSON.
        """
        # Extract username
        user_match = re.match(self._VALID_URL, url)
        if user_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = user_match.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            page_size = self._GDATA_PAGE_SIZE
            start_index = pagenum * page_size + 1

            page = self._download_webpage(
                self._GDATA_URL % (username, page_size, start_index),
                username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + page_size))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            feed = response['feed']
            # A feed without 'entry' yields nothing for this page.
            if 'entry' in feed:
                # Extract video identifiers
                for entry in feed['entry']:
                    entry_title = entry['title']['$t']
                    entry_id = entry['id']['$t'].split('/')[-1]
                    yield {
                        '_type': 'url',
                        'url': entry_id,
                        'ie_key': 'Youtube',
                        'id': entry_id,
                        'title': entry_title,
                    }

        paged_entries = PagedList(download_page, self._GDATA_PAGE_SIZE)
        return self.playlist_result(paged_entries, playlist_title=username)
1637
1638
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor for YouTube.com searches ('ytsearch' keyword)."""

    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        encoded_query = None
        found_ids = []
        limit = n
        offset = 0
        page_number = 1

        # 'limit' shrinks to the total reported by the API once known.
        while offset < limit:
            encoded_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))
            result_url = self._API_URL % (encoded_query, offset + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % page_number,
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            found_ids += [video['id'] for video in api_response['items']]

            limit = min(n, api_response['totalItems'])
            offset += PAGE_SIZE
            page_number += 1

        if len(found_ids) > n:
            found_ids = found_ids[:n]
        videos = []
        for video_id in found_ids:
            videos.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1680
1681
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE using 'orderby=published' ('ytsearchdate')."""

    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same endpoint as the parent class, with '&orderby=published' appended.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1687
1688
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for YouTube.com search result page URLs."""

    IE_NAME = u'youtube:search_url'
    IE_DESC = u'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Return a playlist dict with one url entry per search result."""
        url_match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(url_match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')

        def entry_from_snippet(snippet):
            # The title is optional (fatal=False); the link is required.
            item_title = self._html_search_regex(
                r'(?s)title="([^"]+)"', snippet, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', item_href),
                'title': item_title,
            }

        snippets = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        entries = [entry_from_snippet(snippet) for snippet in snippets]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1723
1724
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube.com (multi-season) shows."""

    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list with one YoutubePlaylist url_result per season link."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))

        season_results = []
        for season_path in season_paths:
            season_results.append(
                self.url_result('https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        return season_results
1738
1739
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the extractors reading their videos from
    http://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; the remaining '%s' is the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction (see _login on the base class)."""
        self._login()

    def _real_extract(self, url):
        """Return a playlist result with every video listed in the feed.

        Pages are requested until a page no longer advertises a
        'load more' link carrying a paging value.
        """
        entries = []
        paging = 0
        page_number = 1
        while True:
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       u'%s feed' % self._FEED_NAME,
                                       u'Downloading page %s' % page_number)
            page_number += 1
            feed_html = info.get('feed_html') or info.get('content_html')

            # Unique ids on this page, in order of first appearance.
            page_ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in page_ids:
                entries.append(self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1784
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1790
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1796
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'watch_later' feed, loaded as a personal feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1803
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'history' feed, loaded as a personal feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the other feed extractors: '\.' in a non-raw literal is
    # an invalid escape sequence that newer Python versions warn about.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1810
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the my_favorites page to its playlist."""
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'

    def _real_extract(self, url):
        """Return a URL result handing the favourites playlist id to YoutubePlaylist."""
        # The matched url is not used; the favourites page is always fetched
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The first list=... parameter found in the page is taken as the playlist id
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
1821
1822
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution_link URLs cut off after their first query parameter.

    Extraction always fails with a hint about quoting the URL in the shell.
    """
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining the likely cause."""
        quoting_hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(quoting_hint, expected=True)