]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youtube.py
Imported Upstream version 2013.11.11
[youtubedl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import socket
11 import string
12 import struct
13 import traceback
14 import xml.etree.ElementTree
15 import zlib
16
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
19 from ..utils import (
20 compat_chr,
21 compat_http_client,
22 compat_parse_qs,
23 compat_urllib_error,
24 compat_urllib_parse,
25 compat_urllib_request,
26 compat_urlparse,
27 compat_str,
28
29 clean_html,
30 get_cachedir,
31 get_element_by_id,
32 ExtractorError,
33 unescapeHTML,
34 unified_strdate,
35 orderedSet,
36 write_json_file,
37 )
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors

    The session setup performed by _real_initialize() is: set the interface
    language, log in (when credentials are available) and confirm the age.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request _LANG_URL so that later pages are served in English.

        Returns True on success. On a network error a warning is reported
        and False is returned.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in with the credentials returned by _get_login_info().

        Returns True when the login succeeded and False when no login was
        performed or it failed (a warning is reported for failures).

        Raises ExtractorError when no credentials are available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # The login form carries a hidden GALX value that is sent back
        # together with the credentials.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Being served the login form again means the credentials were
            # rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """Submit the age confirmation form to _AGE_URL.

        Returns True on success and raises ExtractorError on a network
        error.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # POST data must be bytes on Python 3; encode it the same way
        # _login() encodes its form data.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm the age, in that order.

        Stops at the first step that does not succeed; nothing is done when
        no downloader is attached.
        """
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
138
139
140 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
141 IE_DESC = u'YouTube.com'
142 _VALID_URL = r"""^
143 (
144 (?:https?://)? # http(s):// (optional)
145 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
146 tube\.majestyc\.net/|
147 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
148 (?:.*?\#/)? # handle anchor (#/) redirect urls
149 (?: # the various things that can precede the ID:
150 (?:(?:v|embed|e)/) # v/ or embed/ or e/
151 |(?: # or the v= param in all its forms
152 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
153 (?:\?|\#!?) # the params delimiter ? or # or #!
154 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
155 v=
156 )
157 ))
158 |youtu\.be/ # just youtu.be/xxxx
159 )
160 )? # all until now is optional -> you can pass the naked ID
161 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
162 (?(1).+)? # if we found the ID, everything can follow
163 $"""
164 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
165 # Listed in order of quality
166 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
167 # Apple HTTP Live Streaming
168 '96', '95', '94', '93', '92', '132', '151',
169 # 3D
170 '85', '84', '102', '83', '101', '82', '100',
171 # Dash video
172 '138', '137', '248', '136', '247', '135', '246',
173 '245', '244', '134', '243', '133', '242', '160',
174 # Dash audio
175 '141', '172', '140', '171', '139',
176 ]
177 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
178 # Apple HTTP Live Streaming
179 '96', '95', '94', '93', '92', '132', '151',
180 # 3D
181 '85', '102', '84', '101', '83', '100', '82',
182 # Dash video
183 '138', '248', '137', '247', '136', '246', '245',
184 '244', '135', '243', '134', '242', '133', '160',
185 # Dash audio
186 '172', '141', '171', '140', '139',
187 ]
188 _video_formats_map = {
189 'flv': ['35', '34', '6', '5'],
190 '3gp': ['36', '17', '13'],
191 'mp4': ['38', '37', '22', '18'],
192 'webm': ['46', '45', '44', '43'],
193 }
194 _video_extensions = {
195 '13': '3gp',
196 '17': '3gp',
197 '18': 'mp4',
198 '22': 'mp4',
199 '36': '3gp',
200 '37': 'mp4',
201 '38': 'mp4',
202 '43': 'webm',
203 '44': 'webm',
204 '45': 'webm',
205 '46': 'webm',
206
207 # 3d videos
208 '82': 'mp4',
209 '83': 'mp4',
210 '84': 'mp4',
211 '85': 'mp4',
212 '100': 'webm',
213 '101': 'webm',
214 '102': 'webm',
215
216 # Apple HTTP Live Streaming
217 '92': 'mp4',
218 '93': 'mp4',
219 '94': 'mp4',
220 '95': 'mp4',
221 '96': 'mp4',
222 '132': 'mp4',
223 '151': 'mp4',
224
225 # Dash mp4
226 '133': 'mp4',
227 '134': 'mp4',
228 '135': 'mp4',
229 '136': 'mp4',
230 '137': 'mp4',
231 '138': 'mp4',
232 '160': 'mp4',
233
234 # Dash mp4 audio
235 '139': 'm4a',
236 '140': 'm4a',
237 '141': 'm4a',
238
239 # Dash webm
240 '171': 'webm',
241 '172': 'webm',
242 '242': 'webm',
243 '243': 'webm',
244 '244': 'webm',
245 '245': 'webm',
246 '246': 'webm',
247 '247': 'webm',
248 '248': 'webm',
249 }
250 _video_dimensions = {
251 '5': '240x400',
252 '6': '???',
253 '13': '???',
254 '17': '144x176',
255 '18': '360x640',
256 '22': '720x1280',
257 '34': '360x640',
258 '35': '480x854',
259 '36': '240x320',
260 '37': '1080x1920',
261 '38': '3072x4096',
262 '43': '360x640',
263 '44': '480x854',
264 '45': '720x1280',
265 '46': '1080x1920',
266 '82': '360p',
267 '83': '480p',
268 '84': '720p',
269 '85': '1080p',
270 '92': '240p',
271 '93': '360p',
272 '94': '480p',
273 '95': '720p',
274 '96': '1080p',
275 '100': '360p',
276 '101': '480p',
277 '102': '720p',
278 '132': '240p',
279 '151': '72p',
280 '133': '240p',
281 '134': '360p',
282 '135': '480p',
283 '136': '720p',
284 '137': '1080p',
285 '138': '>1080p',
286 '139': '48k',
287 '140': '128k',
288 '141': '256k',
289 '160': '192p',
290 '171': '128k',
291 '172': '256k',
292 '242': '240p',
293 '243': '360p',
294 '244': '480p',
295 '245': '480p',
296 '246': '480p',
297 '247': '720p',
298 '248': '1080p',
299 }
300 _special_itags = {
301 '82': '3D',
302 '83': '3D',
303 '84': '3D',
304 '85': '3D',
305 '100': '3D',
306 '101': '3D',
307 '102': '3D',
308 '133': 'DASH Video',
309 '134': 'DASH Video',
310 '135': 'DASH Video',
311 '136': 'DASH Video',
312 '137': 'DASH Video',
313 '138': 'DASH Video',
314 '139': 'DASH Audio',
315 '140': 'DASH Audio',
316 '141': 'DASH Audio',
317 '160': 'DASH Video',
318 '171': 'DASH Audio',
319 '172': 'DASH Audio',
320 '242': 'DASH Video',
321 '243': 'DASH Video',
322 '244': 'DASH Video',
323 '245': 'DASH Video',
324 '246': 'DASH Video',
325 '247': 'DASH Video',
326 '248': 'DASH Video',
327 }
328
329 IE_NAME = u'youtube'
330 _TESTS = [
331 {
332 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
333 u"file": u"BaW_jenozKc.mp4",
334 u"info_dict": {
335 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
336 u"uploader": u"Philipp Hagemeister",
337 u"uploader_id": u"phihag",
338 u"upload_date": u"20121002",
339 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
340 }
341 },
342 {
343 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
344 u"file": u"UxxajLWwzqY.mp4",
345 u"note": u"Test generic use_cipher_signature video (#897)",
346 u"info_dict": {
347 u"upload_date": u"20120506",
348 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
349 u"description": u"md5:5b292926389560516e384ac437c0ec07",
350 u"uploader": u"Icona Pop",
351 u"uploader_id": u"IconaPop"
352 }
353 },
354 {
355 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
356 u"file": u"07FYdnEawAQ.mp4",
357 u"note": u"Test VEVO video with age protection (#956)",
358 u"info_dict": {
359 u"upload_date": u"20130703",
360 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
361 u"description": u"md5:64249768eec3bc4276236606ea996373",
362 u"uploader": u"justintimberlakeVEVO",
363 u"uploader_id": u"justintimberlakeVEVO"
364 }
365 },
366 ]
367
368
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE.

    URLs accepted by YoutubePlaylistIE are rejected here, even when they
    also match _VALID_URL.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
374
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty signature function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature(); keyed by (player_url, signature length)
    self._player_cache = dict()
378
def report_video_webpage_download(self, video_id):
    """Tell the user that the video webpage is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
382
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
386
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
390
def report_unavailable_format(self, video_id, format):
    """Tell the user that the given format is not available for the video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
394
def report_rtmp_download(self):
    """Tell the user that the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
398
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a function deciphering signatures of length slen.

    The function is loaded from the filesystem cache when possible.
    Otherwise the player at player_url is downloaded and parsed (as
    JavaScript or SWF depending on its extension) and, when caching is
    enabled, the result is written to the cache as a list of indices.

    Raises ExtractorError when the player id/type cannot be read from
    player_url or the player type is neither 'js' nor 'swf'.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        raise ExtractorError(u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # func_id becomes a file name; it must not contain path components
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except (IOError, ValueError):
            # No cache available, or the cache file is not valid JSON;
            # fall through and rebuild it from the player.
            pass

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # An explicit raise, because an assert is stripped under -O and
        # would leave res unbound.
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Run the function on a string whose characters are their own
            # indices; the result spells out the permutation to store.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best effort: report the problem and carry on.
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
455
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to func for signatures of length slen.

    func is applied to a string whose characters are their own indices,
    and the resulting index list is rendered as a sum of s[...] indexing
    and slicing expressions.
    """
    def _slice_expr(first, last, stride):
        # Render the inclusive run first..last as a Python slice of s.
        lower = u'' if first == 0 else str(first)
        bound = last + stride
        upper = u':' if bound < 0 else (u':%d' % bound)
        stride_part = (u':%d' % stride) if stride != 1 else u''
        return u's[%s%s%s]' % (lower, upper, stride_part)

    def _expressions(idxs):
        # Group runs of consecutive (ascending or descending) indices into
        # slices; everything else becomes a single-index lookup.
        stride = None
        run_start = '(Never used)'  # Quelch pyflakes warnings - run_start
                                    # is set as soon as stride is set
        for prev, cur in zip(idxs[:-1], idxs[1:]):
            delta = cur - prev
            if stride is not None:
                if delta != stride:
                    # The current run ends at prev
                    yield _slice_expr(run_start, prev, stride)
                    stride = None
                continue
            if delta in (-1, 1):
                stride = delta
                run_start = prev
            else:
                yield u's[%d]' % prev
        if stride is None:
            yield u's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = u''.join(map(compat_chr, range(slen)))
    permutation = [ord(c) for c in func(probe)]
    expr_code = u' + '.join(_expressions(permutation))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
491
492 def _parse_sig_js(self, jscode):
493 funcname = self._search_regex(
494 r'signature=([a-zA-Z]+)', jscode,
495 u'Initial JS player signature function name')
496
497 functions = {}
498
499 def argidx(varname):
500 return string.lowercase.index(varname)
501
502 def interpret_statement(stmt, local_vars, allow_recursion=20):
503 if allow_recursion < 0:
504 raise ExtractorError(u'Recursion limit reached')
505
506 if stmt.startswith(u'var '):
507 stmt = stmt[len(u'var '):]
508 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
509 r'=(?P<expr>.*)$', stmt)
510 if ass_m:
511 if ass_m.groupdict().get('index'):
512 def assign(val):
513 lvar = local_vars[ass_m.group('out')]
514 idx = interpret_expression(ass_m.group('index'),
515 local_vars, allow_recursion)
516 assert isinstance(idx, int)
517 lvar[idx] = val
518 return val
519 expr = ass_m.group('expr')
520 else:
521 def assign(val):
522 local_vars[ass_m.group('out')] = val
523 return val
524 expr = ass_m.group('expr')
525 elif stmt.startswith(u'return '):
526 assign = lambda v: v
527 expr = stmt[len(u'return '):]
528 else:
529 raise ExtractorError(
530 u'Cannot determine left side of statement in %r' % stmt)
531
532 v = interpret_expression(expr, local_vars, allow_recursion)
533 return assign(v)
534
535 def interpret_expression(expr, local_vars, allow_recursion):
536 if expr.isdigit():
537 return int(expr)
538
539 if expr.isalpha():
540 return local_vars[expr]
541
542 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
543 if m:
544 member = m.group('member')
545 val = local_vars[m.group('in')]
546 if member == 'split("")':
547 return list(val)
548 if member == 'join("")':
549 return u''.join(val)
550 if member == 'length':
551 return len(val)
552 if member == 'reverse()':
553 return val[::-1]
554 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
555 if slice_m:
556 idx = interpret_expression(
557 slice_m.group('idx'), local_vars, allow_recursion-1)
558 return val[idx:]
559
560 m = re.match(
561 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
562 if m:
563 val = local_vars[m.group('in')]
564 idx = interpret_expression(m.group('idx'), local_vars,
565 allow_recursion-1)
566 return val[idx]
567
568 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
569 if m:
570 a = interpret_expression(m.group('a'),
571 local_vars, allow_recursion)
572 b = interpret_expression(m.group('b'),
573 local_vars, allow_recursion)
574 return a % b
575
576 m = re.match(
577 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
578 if m:
579 fname = m.group('func')
580 if fname not in functions:
581 functions[fname] = extract_function(fname)
582 argvals = [int(v) if v.isdigit() else local_vars[v]
583 for v in m.group('args').split(',')]
584 return functions[fname](argvals)
585 raise ExtractorError(u'Unsupported JS expression %r' % expr)
586
587 def extract_function(funcname):
588 func_m = re.search(
589 r'function ' + re.escape(funcname) +
590 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
591 jscode)
592 argnames = func_m.group('args').split(',')
593
594 def resf(args):
595 local_vars = dict(zip(argnames, args))
596 for stmt in func_m.group('code').split(';'):
597 res = interpret_statement(stmt, local_vars)
598 return res
599 return resf
600
601 initial_function = extract_function(funcname)
602 return lambda s: initial_function([s])
603
604 def _parse_sig_swf(self, file_contents):
605 if file_contents[1:3] != b'WS':
606 raise ExtractorError(
607 u'Not an SWF file; header is %r' % file_contents[:3])
608 if file_contents[:1] == b'C':
609 content = zlib.decompress(file_contents[8:])
610 else:
611 raise NotImplementedError(u'Unsupported compression format %r' %
612 file_contents[:1])
613
614 def extract_tags(content):
615 pos = 0
616 while pos < len(content):
617 header16 = struct.unpack('<H', content[pos:pos+2])[0]
618 pos += 2
619 tag_code = header16 >> 6
620 tag_len = header16 & 0x3f
621 if tag_len == 0x3f:
622 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
623 pos += 4
624 assert pos+tag_len <= len(content)
625 yield (tag_code, content[pos:pos+tag_len])
626 pos += tag_len
627
628 code_tag = next(tag
629 for tag_code, tag in extract_tags(content)
630 if tag_code == 82)
631 p = code_tag.index(b'\0', 4) + 1
632 code_reader = io.BytesIO(code_tag[p:])
633
634 # Parse ABC (AVM2 ByteCode)
635 def read_int(reader=None):
636 if reader is None:
637 reader = code_reader
638 res = 0
639 shift = 0
640 for _ in range(5):
641 buf = reader.read(1)
642 assert len(buf) == 1
643 b = struct.unpack('<B', buf)[0]
644 res = res | ((b & 0x7f) << shift)
645 if b & 0x80 == 0:
646 break
647 shift += 7
648 return res
649
650 def u30(reader=None):
651 res = read_int(reader)
652 assert res & 0xf0000000 == 0
653 return res
654 u32 = read_int
655
656 def s32(reader=None):
657 v = read_int(reader)
658 if v & 0x80000000 != 0:
659 v = - ((v ^ 0xffffffff) + 1)
660 return v
661
662 def read_string(reader=None):
663 if reader is None:
664 reader = code_reader
665 slen = u30(reader)
666 resb = reader.read(slen)
667 assert len(resb) == slen
668 return resb.decode('utf-8')
669
670 def read_bytes(count, reader=None):
671 if reader is None:
672 reader = code_reader
673 resb = reader.read(count)
674 assert len(resb) == count
675 return resb
676
677 def read_byte(reader=None):
678 resb = read_bytes(1, reader=reader)
679 res = struct.unpack('<B', resb)[0]
680 return res
681
682 # minor_version + major_version
683 read_bytes(2 + 2)
684
685 # Constant pool
686 int_count = u30()
687 for _c in range(1, int_count):
688 s32()
689 uint_count = u30()
690 for _c in range(1, uint_count):
691 u32()
692 double_count = u30()
693 read_bytes((double_count-1) * 8)
694 string_count = u30()
695 constant_strings = [u'']
696 for _c in range(1, string_count):
697 s = read_string()
698 constant_strings.append(s)
699 namespace_count = u30()
700 for _c in range(1, namespace_count):
701 read_bytes(1) # kind
702 u30() # name
703 ns_set_count = u30()
704 for _c in range(1, ns_set_count):
705 count = u30()
706 for _c2 in range(count):
707 u30()
708 multiname_count = u30()
709 MULTINAME_SIZES = {
710 0x07: 2, # QName
711 0x0d: 2, # QNameA
712 0x0f: 1, # RTQName
713 0x10: 1, # RTQNameA
714 0x11: 0, # RTQNameL
715 0x12: 0, # RTQNameLA
716 0x09: 2, # Multiname
717 0x0e: 2, # MultinameA
718 0x1b: 1, # MultinameL
719 0x1c: 1, # MultinameLA
720 }
721 multinames = [u'']
722 for _c in range(1, multiname_count):
723 kind = u30()
724 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
725 if kind == 0x07:
726 u30() # namespace_idx
727 name_idx = u30()
728 multinames.append(constant_strings[name_idx])
729 else:
730 multinames.append('[MULTINAME kind: %d]' % kind)
731 for _c2 in range(MULTINAME_SIZES[kind]):
732 u30()
733
734 # Methods
735 method_count = u30()
736 MethodInfo = collections.namedtuple(
737 'MethodInfo',
738 ['NEED_ARGUMENTS', 'NEED_REST'])
739 method_infos = []
740 for method_id in range(method_count):
741 param_count = u30()
742 u30() # return type
743 for _ in range(param_count):
744 u30() # param type
745 u30() # name index (always 0 for youtube)
746 flags = read_byte()
747 if flags & 0x08 != 0:
748 # Options present
749 option_count = u30()
750 for c in range(option_count):
751 u30() # val
752 read_bytes(1) # kind
753 if flags & 0x80 != 0:
754 # Param names present
755 for _ in range(param_count):
756 u30() # param name
757 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
758 method_infos.append(mi)
759
760 # Metadata
761 metadata_count = u30()
762 for _c in range(metadata_count):
763 u30() # name
764 item_count = u30()
765 for _c2 in range(item_count):
766 u30() # key
767 u30() # value
768
769 def parse_traits_info():
770 trait_name_idx = u30()
771 kind_full = read_byte()
772 kind = kind_full & 0x0f
773 attrs = kind_full >> 4
774 methods = {}
775 if kind in [0x00, 0x06]: # Slot or Const
776 u30() # Slot id
777 u30() # type_name_idx
778 vindex = u30()
779 if vindex != 0:
780 read_byte() # vkind
781 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
782 u30() # disp_id
783 method_idx = u30()
784 methods[multinames[trait_name_idx]] = method_idx
785 elif kind == 0x04: # Class
786 u30() # slot_id
787 u30() # classi
788 elif kind == 0x05: # Function
789 u30() # slot_id
790 function_idx = u30()
791 methods[function_idx] = multinames[trait_name_idx]
792 else:
793 raise ExtractorError(u'Unsupported trait kind %d' % kind)
794
795 if attrs & 0x4 != 0: # Metadata present
796 metadata_count = u30()
797 for _c3 in range(metadata_count):
798 u30() # metadata index
799
800 return methods
801
802 # Classes
803 TARGET_CLASSNAME = u'SignatureDecipher'
804 searched_idx = multinames.index(TARGET_CLASSNAME)
805 searched_class_id = None
806 class_count = u30()
807 for class_id in range(class_count):
808 name_idx = u30()
809 if name_idx == searched_idx:
810 # We found the class we're looking for!
811 searched_class_id = class_id
812 u30() # super_name idx
813 flags = read_byte()
814 if flags & 0x08 != 0: # Protected namespace is present
815 u30() # protected_ns_idx
816 intrf_count = u30()
817 for _c2 in range(intrf_count):
818 u30()
819 u30() # iinit
820 trait_count = u30()
821 for _c2 in range(trait_count):
822 parse_traits_info()
823
824 if searched_class_id is None:
825 raise ExtractorError(u'Target class %r not found' %
826 TARGET_CLASSNAME)
827
828 method_names = {}
829 method_idxs = {}
830 for class_id in range(class_count):
831 u30() # cinit
832 trait_count = u30()
833 for _c2 in range(trait_count):
834 trait_methods = parse_traits_info()
835 if class_id == searched_class_id:
836 method_names.update(trait_methods.items())
837 method_idxs.update(dict(
838 (idx, name)
839 for name, idx in trait_methods.items()))
840
841 # Scripts
842 script_count = u30()
843 for _c in range(script_count):
844 u30() # init
845 trait_count = u30()
846 for _c2 in range(trait_count):
847 parse_traits_info()
848
849 # Method bodies
850 method_body_count = u30()
851 Method = collections.namedtuple('Method', ['code', 'local_count'])
852 methods = {}
853 for _c in range(method_body_count):
854 method_idx = u30()
855 u30() # max_stack
856 local_count = u30()
857 u30() # init_scope_depth
858 u30() # max_scope_depth
859 code_length = u30()
860 code = read_bytes(code_length)
861 if method_idx in method_idxs:
862 m = Method(code, local_count)
863 methods[method_idxs[method_idx]] = m
864 exception_count = u30()
865 for _c2 in range(exception_count):
866 u30() # from
867 u30() # to
868 u30() # target
869 u30() # exc_type
870 u30() # var_name
871 trait_count = u30()
872 for _c2 in range(trait_count):
873 parse_traits_info()
874
875 assert p + code_reader.tell() == len(code_tag)
876 assert len(methods) == len(method_idxs)
877
878 method_pyfunctions = {}
879
880 def extract_function(func_name):
881 if func_name in method_pyfunctions:
882 return method_pyfunctions[func_name]
883 if func_name not in methods:
884 raise ExtractorError(u'Cannot find function %r' % func_name)
885 m = methods[func_name]
886
887 def resfunc(args):
888 registers = ['(this)'] + list(args) + [None] * m.local_count
889 stack = []
890 coder = io.BytesIO(m.code)
891 while True:
892 opcode = struct.unpack('!B', coder.read(1))[0]
893 if opcode == 36: # pushbyte
894 v = struct.unpack('!B', coder.read(1))[0]
895 stack.append(v)
896 elif opcode == 44: # pushstring
897 idx = u30(coder)
898 stack.append(constant_strings[idx])
899 elif opcode == 48: # pushscope
900 # We don't implement the scope register, so we'll just
901 # ignore the popped value
902 stack.pop()
903 elif opcode == 70: # callproperty
904 index = u30(coder)
905 mname = multinames[index]
906 arg_count = u30(coder)
907 args = list(reversed(
908 [stack.pop() for _ in range(arg_count)]))
909 obj = stack.pop()
910 if mname == u'split':
911 assert len(args) == 1
912 assert isinstance(args[0], compat_str)
913 assert isinstance(obj, compat_str)
914 if args[0] == u'':
915 res = list(obj)
916 else:
917 res = obj.split(args[0])
918 stack.append(res)
919 elif mname == u'slice':
920 assert len(args) == 1
921 assert isinstance(args[0], int)
922 assert isinstance(obj, list)
923 res = obj[args[0]:]
924 stack.append(res)
925 elif mname == u'join':
926 assert len(args) == 1
927 assert isinstance(args[0], compat_str)
928 assert isinstance(obj, list)
929 res = args[0].join(obj)
930 stack.append(res)
931 elif mname in method_pyfunctions:
932 stack.append(method_pyfunctions[mname](args))
933 else:
934 raise NotImplementedError(
935 u'Unsupported property %r on %r'
936 % (mname, obj))
937 elif opcode == 72: # returnvalue
938 res = stack.pop()
939 return res
940 elif opcode == 79: # callpropvoid
941 index = u30(coder)
942 mname = multinames[index]
943 arg_count = u30(coder)
944 args = list(reversed(
945 [stack.pop() for _ in range(arg_count)]))
946 obj = stack.pop()
947 if mname == u'reverse':
948 assert isinstance(obj, list)
949 obj.reverse()
950 else:
951 raise NotImplementedError(
952 u'Unsupported (void) property %r on %r'
953 % (mname, obj))
954 elif opcode == 93: # findpropstrict
955 index = u30(coder)
956 mname = multinames[index]
957 res = extract_function(mname)
958 stack.append(res)
959 elif opcode == 97: # setproperty
960 index = u30(coder)
961 value = stack.pop()
962 idx = stack.pop()
963 obj = stack.pop()
964 assert isinstance(obj, list)
965 assert isinstance(idx, int)
966 obj[idx] = value
967 elif opcode == 98: # getlocal
968 index = u30(coder)
969 stack.append(registers[index])
970 elif opcode == 99: # setlocal
971 index = u30(coder)
972 value = stack.pop()
973 registers[index] = value
974 elif opcode == 102: # getproperty
975 index = u30(coder)
976 pname = multinames[index]
977 if pname == u'length':
978 obj = stack.pop()
979 assert isinstance(obj, list)
980 stack.append(len(obj))
981 else: # Assume attribute access
982 idx = stack.pop()
983 assert isinstance(idx, int)
984 obj = stack.pop()
985 assert isinstance(obj, list)
986 stack.append(obj[idx])
987 elif opcode == 128: # coerce
988 u30(coder)
989 elif opcode == 133: # coerce_s
990 assert isinstance(stack[-1], (type(None), compat_str))
991 elif opcode == 164: # modulo
992 value2 = stack.pop()
993 value1 = stack.pop()
994 res = value1 % value2
995 stack.append(res)
996 elif opcode == 208: # getlocal_0
997 stack.append(registers[0])
998 elif opcode == 209: # getlocal_1
999 stack.append(registers[1])
1000 elif opcode == 210: # getlocal_2
1001 stack.append(registers[2])
1002 elif opcode == 211: # getlocal_3
1003 stack.append(registers[3])
1004 elif opcode == 214: # setlocal_2
1005 registers[2] = stack.pop()
1006 elif opcode == 215: # setlocal_3
1007 registers[3] = stack.pop()
1008 else:
1009 raise NotImplementedError(
1010 u'Unsupported opcode %d' % opcode)
1011
1012 method_pyfunctions[func_name] = resfunc
1013 return resfunc
1014
1015 initial_function = extract_function(u'decipher')
1016 return lambda s: initial_function([s])
1017
1018 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1019 """Turn the encrypted s field into a working signature"""
1020
1021 if player_url is not None:
1022 try:
1023 player_id = (player_url, len(s))
1024 if player_id not in self._player_cache:
1025 func = self._extract_signature_function(
1026 video_id, player_url, len(s)
1027 )
1028 self._player_cache[player_id] = func
1029 func = self._player_cache[player_id]
1030 if self._downloader.params.get('youtube_print_sig_code'):
1031 self._print_sig_code(func, len(s))
1032 return func(s)
1033 except Exception:
1034 tb = traceback.format_exc()
1035 self._downloader.report_warning(
1036 u'Automatic signature extraction failed: ' + tb)
1037
1038 self._downloader.report_warning(
1039 u'Warning: Falling back to static signature algorithm')
1040
1041 return self._static_decrypt_signature(
1042 s, video_id, player_url, age_gate)
1043
def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
    """Unscramble signature ``s`` using the hard-coded per-length recipes.

    The recipe is selected from ``len(s)`` and ``age_gate`` only;
    ``video_id`` and ``player_url`` are accepted but not used here.
    Raises ExtractorError when there is no recipe for that length.
    """
    sig_len = len(s)

    # The videos with age protection use another player, so the
    # algorithms can be different.
    if age_gate and sig_len == 86:
        return s[2:63] + s[82] + s[64:82] + s[63]

    if sig_len == 93:
        return s[86:29:-1] + s[88] + s[28:5:-1]
    if sig_len == 92:
        return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
    if sig_len == 91:
        return s[84:27:-1] + s[86] + s[26:5:-1]
    if sig_len == 90:
        return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
    if sig_len == 89:
        return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
    if sig_len == 88:
        return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
    if sig_len == 87:
        return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
    if sig_len == 86:
        return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
    if sig_len == 85:
        return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
    if sig_len == 84:
        return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
    if sig_len == 83:
        return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
    if sig_len == 82:
        return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
    if sig_len == 81:
        return (s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0]
                + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9])
    if sig_len == 80:
        return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
    if sig_len == 79:
        return (s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0]
                + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9])

    raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % sig_len)
1084
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to the subtitle URL for the video.

    ``webpage`` is not used by this method. An empty dict is returned
    (after a warning) when the track list cannot be downloaded or when it
    lists no tracks.
    """
    try:
        track_listing = self._download_webpage(
            'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    subtitle_urls = {}
    # Every match is a (track name, language code) pair.
    for track_name, lang_code in re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_listing):
        query = compat_urllib_parse.urlencode({
            'lang': lang_code,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat'),
            'name': track_name.encode('utf-8'),
        })
        subtitle_urls[lang_code] = u'http://www.youtube.com/api/timedtext?' + query

    if subtitle_urls:
        return subtitle_urls
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
1110
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target language code to automatic-caption URL.

    The watch page is passed in as ``webpage`` so the captions URL can be
    read from its player configuration without downloading it again.
    An empty dict is returned (after a warning) when no automatic captions
    can be found.
    """
    sub_format = self._downloader.params.get('subtitlesformat')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config[u'args']
        caption_url = player_args[u'ttsurl']
        timestamp = player_args[u'timestamp']
        # Ask for the list of available tracks and translation targets
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_page = self._download_webpage(caption_url + '&' + list_query, video_id)
        caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
        first_track = caption_list.find('track')
        is_asr_track = first_track is not None and first_track.attrib.get('kind') == 'asr'
        if not is_asr_track:
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = first_track.attrib['lang_code']

        caption_urls = {}
        for target_node in caption_list.findall('target'):
            target_lang = target_node.attrib['lang_code']
            query = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': target_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            caption_urls[target_lang] = caption_url + '&' + query
        return caption_urls
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1158
def _print_formats(self, formats):
    """Print a header, then one line per itag in ``formats``.

    Each line shows the itag, its extension ('flv' when unknown), its
    dimensions ('???' when unknown) and, for special itags, their note.
    """
    print('Available formats:')
    for itag in formats:
        extension = self._video_extensions.get(itag, 'flv')
        dimensions = self._video_dimensions.get(itag, '???')
        if itag in self._special_itags:
            note = ' (' + self._special_itags[itag] + ')'
        else:
            note = ''
        print('%s\t:\t%s\t[%s]%s' % (itag, extension, dimensions, note))
1165
def _extract_id(self, url):
    """Return the second capture group of ``_VALID_URL`` matched on ``url``.

    The pattern is applied in verbose mode. Raises ExtractorError when the
    URL does not match.
    """
    match = re.match(self._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
1172
def _get_video_url_list(self, url_map):
    """
    Transform a dictionary in the format {itag:url} to a list of (itag, url)
    with the requested formats.

    The selection is driven by the 'format', 'format_limit' and
    'prefer_free_formats' options. When 'listformats' is set, the known
    formats are printed and None is returned. Raises ExtractorError when
    no known or no requested format is available.
    """
    params = self._downloader.params
    req_format = params.get('format', None)
    format_limit = params.get('format_limit', None)
    if params.get('prefer_free_formats', False):
        available_formats = self._available_formats_prefer_free
    else:
        available_formats = self._available_formats

    # With a valid limit, only that format and the ones listed after it count
    format_list = available_formats
    if format_limit is not None and format_limit in available_formats:
        format_list = available_formats[available_formats.index(format_limit):]

    existing_formats = [itag for itag in format_list if itag in url_map]
    if not existing_formats:
        raise ExtractorError(u'no known formats available for video')
    if params.get('listformats', None):
        self._print_formats(existing_formats)
        return

    if req_format is None or req_format == 'best':
        best = existing_formats[0]
        return [(best, url_map[best])]
    if req_format == 'worst':
        worst = existing_formats[-1]
        return [(worst, url_map[worst])]
    if req_format in ('-1', 'all'):
        return [(itag, url_map[itag]) for itag in existing_formats]

    # Specific formats. We pick the first in a slash-delimited sequence.
    # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
    # available in the specified format. For example,
    # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
    # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
    # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
    video_url_list = None
    for wanted in req_format.split('/'):
        if wanted in url_map:
            candidates = [wanted]
        elif wanted in self._video_formats_map:
            candidates = [itag for itag in self._video_formats_map[wanted] if itag in url_map]
        else:
            candidates = []
        if candidates:
            chosen = candidates[0]
            video_url_list = [(chosen, url_map[chosen])]
            break
    if video_url_list is None:
        raise ExtractorError(u'requested format not available')
    return video_url_list
1221
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an HLS manifest and return a {itag: url} dict of its entries.

    Every non-blank manifest line that does not start with '#' is taken as
    a format URL; its itag is read from the 'itag/<digits>/' part of that
    URL through ``_search_regex``.
    """
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    url_map = {}
    for raw_line in manifest.split('\n'):
        # Playlist lines may end in CRLF: strip so that no '\r' stays glued
        # to a URL and a blank '\r' line is not mistaken for one.
        format_url = raw_line.strip()
        if not format_url or format_url.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
        url_map[itag] = format_url
    return url_map
1235
def _extract_annotations(self, video_id):
    """Download and return the in-video annotations document for the video."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note=u'Searching for annotations.',
        errnote=u'Unable to download video annotations.')
1239
def _real_extract(self, url):
    """Extract the metadata and the format URLs of a single video.

    Returns a list with one info dictionary per selected format, or None
    when only a listing was requested ('listsubtitles', or no URL list
    came back from the format selection). Raises ExtractorError when the
    pages cannot be downloaded or the needed data is missing.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    next_url_match = re.search(self._NEXT_URL_RE, url)
    if next_url_match:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(next_url_match.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    watch_url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(watch_url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    swf_match = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    player_url = None
    if swf_match is not None:
        # Drop the backslashes that escape characters of the embedded URL
        player_url = re.sub(r'\\(.)', r'\1', swf_match.group(1))

    # Get video info
    self.report_video_info_webpage_download(video_id)
    age_gate = re.search(r'player-age-gate-content">', video_webpage) is not None
    if age_gate:
        self.report_age_confirmation()
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'el': 'embedded',
            'gl': 'US',
            'hl': 'en',
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'asv': 3,
            'sts': '1588',
        })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note=False,
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        # Try each 'el' variant in turn until one answer carries a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note=False,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    uploader_match = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if uploader_match is None:
        self._downloader.report_warning(u'unable to extract uploader nickname')
    else:
        video_uploader_id = uploader_match.group(1)

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    thumb_match = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
    if thumb_match is not None:
        video_thumbnail = thumb_match.group(1)
    elif 'thumbnail_url' in video_info:
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
    else:  # don't panic if we can't find it
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None

    # upload date
    upload_date = None
    date_match = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if date_match is not None:
        # Turn the separators into spaces and collapse runs of whitespace
        date_text = ' '.join(re.sub(r'[/,-]', r' ', date_match.group(1)).split())
        upload_date = unified_strdate(date_text)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        meta_match = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if meta_match:
            video_description = unescapeHTML(meta_match.group(1))
        else:
            video_description = u''

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' in video_info:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
    else:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = ''

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download

    # Prefer the stream maps embedded in the watch page when they carry
    # encrypted signatures; a ValueError just means "keep video_info as is".
    try:
        config_match = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not config_match:
            raise ValueError('Could not find vevo ID')
        info = json.loads(config_match.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        if re_signature.search(args['url_encoded_fmt_stream_map']) is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        if re_signature.search(args.get('adaptive_fmts', u'')) is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_url = url_data['url'][0]
            if 'sig' in url_data:
                format_url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                if self._downloader.params.get('verbose'):
                    if age_gate:
                        if player_url is None:
                            player_version = 'unknown'
                        else:
                            player_version = self._search_regex(
                                r'-(.+)\.swf$', player_url,
                                u'flash player', fatal=False)
                        player_desc = 'flash player %s' % player_version
                    else:
                        player_version = self._search_regex(
                            r'html5player-(.+?)\.js', video_webpage,
                            'html5 player', fatal=False)
                        player_desc = u'html5 player %s' % player_version

                    parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                    self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                   (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                if not age_gate:
                    jsplayer_url_json = self._search_regex(
                        r'"assets":.+?"js":\s*("[^"]+")',
                        video_webpage, u'JS player URL')
                    player_url = json.loads(jsplayer_url_json)

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                format_url += '&signature=' + signature
            if 'ratebypass' not in format_url:
                format_url += '&ratebypass=yes'
            url_map[url_data['itag'][0]] = format_url
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    results = []
    for itag, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(itag, 'flv')

        if itag in self._special_itags:
            special_note = ' (' + self._special_itags[itag] + ')'
        else:
            special_note = ''
        video_format = '{0} - {1}{2}'.format(
            itag if itag else video_extension,
            self._video_dimensions.get(itag, '???'),
            special_note)

        results.append({
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'format_id': itag,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        })
    return results
1492
class YoutubePlaylistIE(InfoExtractor):
    """List the videos of a playlist by paging through the GData feed."""
    IE_DESC = u'YouTube.com playlists'
    IE_NAME = u'youtube:playlist'
    # Verbose-mode pattern: always match it with re.VERBOSE
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for ``url``.

        When the URL also names a video and 'noplaylist' is set, a URL
        result for just that video is returned instead.
        """
        # Extract playlist id
        id_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = id_match.group(1) or id_match.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
            self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        # Download playlist videos from API
        indexed_urls = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
                break
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in feed['entry']:
                position = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    indexed_urls.append((
                        position,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        # Sorting the (position, url) pairs restores the playlist order
        ordered_urls = [pair[1] for pair in sorted(indexed_urls)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1570
1571
class YoutubeChannelIE(InfoExtractor):
    """List the videos of a channel through the paged c4_browse_ajax query."""
    IE_DESC = u'YouTube.com channels'
    IE_NAME = u'youtube:channel'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from ``page``, first occurrence order."""
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id not in unique_ids:
                unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result of the channel."""
        # Extract channel id
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = id_match.group(1)
        video_ids = []

        # Download all channel pages using the json-based channel_ajax query
        for pagenum in itertools.count(1):
            page_url = self._MORE_PAGES_URL % (pagenum, channel_id)
            raw_page = self._download_webpage(page_url, channel_id,
                                              u'Downloading page #%s' % pagenum)
            page = json.loads(raw_page)

            video_ids.extend(self.extract_videos_from_page(page['content_html']))

            # Stop at the first page that no longer offers a "load more" widget
            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids
        ]
        return [self.playlist_result(url_entries, channel_id)]
1615
1616
class YoutubeUserIE(InfoExtractor):
    """List the uploads of a user by paging through the GData uploads feed."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    IE_NAME = u'youtube:user'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other ``*IE`` class of this module claims ``url``."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and it would match as well.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result of the user."""
        # Extract username
        name_match = re.match(self._VALID_URL, url)
        if name_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = name_match.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        page_size = self._GDATA_PAGE_SIZE

        for pagenum in itertools.count(0):
            start_index = pagenum * page_size + 1

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (start_index, start_index + page_size))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of the page size
                break

            # Extract video identifiers: the last path component of each entry id
            ids_in_page = [entry['id']['$t'].split('/')[-1] for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < page_size:
                break

        url_results = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids
        ]
        return [self.playlist_result(url_results, playlist_title=username)]
1681
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed, 50 results per page."""
    IE_DESC = u'YouTube.com searches'
    IE_NAME = u'youtube:search'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        limit = n
        pagenum = 0

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend([video['id'] for video in api_response['items']])

            # Never ask for more than the API says exists
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the count past n
        video_ids = video_ids[:n]
        videos = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids
        ]
        return self.playlist_result(videos, query)
1723
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same search as the parent class, with the API ordering by publish date."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1728
class YoutubeShowIE(InfoExtractor):
    """Turn a show page into one playlist URL result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    IE_NAME = u'youtube:show'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of playlist URL results, one per season link found."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        return [
            self.url_result('https://www.youtube.com' + season_path, 'YoutubePlaylist')
            for season_path in season_paths
        ]
1742
1743
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; its single %s slot takes the paging offset."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return all of its videos as one playlist."""
        feed_entries = []
        # The step argument of itertools.count is available only in 2.7 or
        # higher, so the offset is computed by hand
        for page_index in itertools.count(0):
            paging = page_index * self._PAGING_STEP
            raw_info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                              u'%s feed' % self._FEED_NAME,
                                              u'Downloading page %s' % page_index)
            info = json.loads(raw_info)
            feed_html = info['feed_html']
            id_matches = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            video_ids = orderedSet(match.group(1) for match in id_matches)
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in video_ids)
            # A null 'paging' value marks the last page
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1785
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1791
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1797
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # This feed is loaded as a personal feed, 100 entries per page
    _PERSONAL_FEED = True
    _PAGING_STEP = 100
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1805
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page of the account to its playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a playlist URL result for the list id found on the favourites page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The playlist id is whatever follows the first 'list=' on the page
        playlist_id = self._search_regex(r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1816
1817
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs cut off after 'feature=...' and explain the likely cause."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'

    def _real_extract(self, url):
        """Always raise an expected ExtractorError carrying a quoting hint."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
            u' (or simply  youtube-dl BaW_jenozKc  ).'
        )
        raise ExtractorError(hint, expected=True)