14 import xml
.etree
.ElementTree
17 from .common
import InfoExtractor
, SearchInfoExtractor
18 from .subtitles
import SubtitlesInfoExtractor
25 compat_urllib_request
,
39 class YoutubeBaseInfoExtractor(InfoExtractor
):
40 """Provide base functions for Youtube extractors"""
41 _LOGIN_URL
= 'https://accounts.google.com/ServiceLogin'
42 _LANG_URL
= r
'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
43 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
44 _NETRC_MACHINE
= 'youtube'
45 # If True it will raise an error if no login info is provided
46 _LOGIN_REQUIRED
= False
48 def report_lang(self
):
49 """Report attempt to set language."""
50 self
.to_screen(u
'Setting language')
52 def _set_language(self
):
53 request
= compat_urllib_request
.Request(self
._LANG
_URL
)
56 compat_urllib_request
.urlopen(request
).read()
57 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
58 self
._downloader
.report_warning(u
'unable to set language: %s' % compat_str(err
))
63 (username
, password
) = self
._get
_login
_info
()
64 # No authentication to be performed
66 if self
._LOGIN
_REQUIRED
:
67 raise ExtractorError(u
'No login info available, needed for using %s.' % self
.IE_NAME
, expected
=True)
70 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
)
72 login_page
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
73 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
74 self
._downloader
.report_warning(u
'unable to fetch login page: %s' % compat_str(err
))
77 galx
= self
._search
_regex
(r
'(?s)<input.+?name="GALX".+?value="(.+?)"',
78 login_page
, u
'Login GALX parameter')
82 u
'continue': u
'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
86 u
'PersistentCookie': u
'yes',
88 u
'bgresponse': u
'js_disabled',
89 u
'checkConnection': u
'',
90 u
'checkedDomains': u
'youtube',
95 u
'signIn': u
'Sign in',
97 u
'service': u
'youtube',
101 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
103 login_form
= dict((k
.encode('utf-8'), v
.encode('utf-8')) for k
,v
in login_form_strs
.items())
104 login_data
= compat_urllib_parse
.urlencode(login_form
).encode('ascii')
105 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, login_data
)
108 login_results
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
109 if re
.search(r
'(?i)<form[^>]* id="gaia_loginform"', login_results
) is not None:
110 self
._downloader
.report_warning(u
'unable to log in: bad username or password')
112 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
113 self
._downloader
.report_warning(u
'unable to log in: %s' % compat_str(err
))
117 def _confirm_age(self
):
120 'action_confirm': 'Confirm',
122 request
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
))
124 self
.report_age_confirmation()
125 compat_urllib_request
.urlopen(request
).read().decode('utf-8')
126 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
127 raise ExtractorError(u
'Unable to confirm age: %s' % compat_str(err
))
130 def _real_initialize(self
):
131 if self
._downloader
is None:
133 if not self
._set
_language
():
135 if not self
._login
():
140 class YoutubeIE(YoutubeBaseInfoExtractor
, SubtitlesInfoExtractor
):
141 IE_DESC
= u
'YouTube.com'
144 (?:https?://)? # http(s):// (optional)
145 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
146 tube\.majestyc\.net/|
147 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
148 (?:.*?\#/)? # handle anchor (#/) redirect urls
149 (?: # the various things that can precede the ID:
150 (?:(?:v|embed|e)/) # v/ or embed/ or e/
151 |(?: # or the v= param in all its forms
152 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
153 (?:\?|\#!?) # the params delimiter ? or # or #!
154 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
158 |youtu\.be/ # just youtu.be/xxxx
160 )? # all until now is optional -> you can pass the naked ID
161 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
162 (?(1).+)? # if we found the ID, everything can follow
164 _NEXT_URL_RE
= r
'[\?&]next_url=([^&]+)'
165 # Listed in order of quality
166 _available_formats
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
167 # Apple HTTP Live Streaming
168 '96', '95', '94', '93', '92', '132', '151',
170 '85', '84', '102', '83', '101', '82', '100',
172 '138', '137', '248', '136', '247', '135', '246',
173 '245', '244', '134', '243', '133', '242', '160',
175 '141', '172', '140', '171', '139',
177 _available_formats_prefer_free
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
178 # Apple HTTP Live Streaming
179 '96', '95', '94', '93', '92', '132', '151',
181 '85', '102', '84', '101', '83', '100', '82',
183 '138', '248', '137', '247', '136', '246', '245',
184 '244', '135', '243', '134', '242', '133', '160',
186 '172', '141', '171', '140', '139',
188 _video_formats_map
= {
189 'flv': ['35', '34', '6', '5'],
190 '3gp': ['36', '17', '13'],
191 'mp4': ['38', '37', '22', '18'],
192 'webm': ['46', '45', '44', '43'],
194 _video_extensions
= {
216 # Apple HTTP Live Streaming
250 _video_dimensions
= {
332 u
"url": u
"http://www.youtube.com/watch?v=BaW_jenozKc",
333 u
"file": u
"BaW_jenozKc.mp4",
335 u
"title": u
"youtube-dl test video \"'/\\ä↭𝕐",
336 u
"uploader": u
"Philipp Hagemeister",
337 u
"uploader_id": u
"phihag",
338 u
"upload_date": u
"20121002",
339 u
"description": u
"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
343 u
"url": u
"http://www.youtube.com/watch?v=UxxajLWwzqY",
344 u
"file": u
"UxxajLWwzqY.mp4",
345 u
"note": u
"Test generic use_cipher_signature video (#897)",
347 u
"upload_date": u
"20120506",
348 u
"title": u
"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
349 u
"description": u
"md5:5b292926389560516e384ac437c0ec07",
350 u
"uploader": u
"Icona Pop",
351 u
"uploader_id": u
"IconaPop"
355 u
"url": u
"https://www.youtube.com/watch?v=07FYdnEawAQ",
356 u
"file": u
"07FYdnEawAQ.mp4",
357 u
"note": u
"Test VEVO video with age protection (#956)",
359 u
"upload_date": u
"20130703",
360 u
"title": u
"Justin Timberlake - Tunnel Vision (Explicit)",
361 u
"description": u
"md5:64249768eec3bc4276236606ea996373",
362 u
"uploader": u
"justintimberlakeVEVO",
363 u
"uploader_id": u
"justintimberlakeVEVO"
370 def suitable(cls
, url
):
371 """Receives a URL and returns True if suitable for this IE."""
372 if YoutubePlaylistIE
.suitable(url
): return False
373 return re
.match(cls
._VALID
_URL
, url
, re
.VERBOSE
) is not None
    def __init__(self, *args, **kwargs):
        # Forward construction to the parent extractor classes, then set up
        # a per-instance cache of extracted signature-decryption functions
        # (filled lazily by _decrypt_signature, keyed by (player_url, sig length)).
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}
379 def report_video_webpage_download(self
, video_id
):
380 """Report attempt to download video webpage."""
381 self
.to_screen(u
'%s: Downloading video webpage' % video_id
)
383 def report_video_info_webpage_download(self
, video_id
):
384 """Report attempt to download video info webpage."""
385 self
.to_screen(u
'%s: Downloading video info webpage' % video_id
)
387 def report_information_extraction(self
, video_id
):
388 """Report attempt to extract video information."""
389 self
.to_screen(u
'%s: Extracting video information' % video_id
)
391 def report_unavailable_format(self
, video_id
, format
):
392 """Report extracted video URL."""
393 self
.to_screen(u
'%s: Format %s not available' % (video_id
, format
))
395 def report_rtmp_download(self
):
396 """Indicate the download will use the RTMP protocol."""
397 self
.to_screen(u
'RTMP download detected')
399 def _extract_signature_function(self
, video_id
, player_url
, slen
):
400 id_m
= re
.match(r
'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
402 player_type
= id_m
.group('ext')
403 player_id
= id_m
.group('id')
405 # Read from filesystem cache
406 func_id
= '%s_%s_%d' % (player_type
, player_id
, slen
)
407 assert os
.path
.basename(func_id
) == func_id
408 cache_dir
= get_cachedir(self
._downloader
.params
)
410 cache_enabled
= cache_dir
is not None
412 cache_fn
= os
.path
.join(os
.path
.expanduser(cache_dir
),
416 with io
.open(cache_fn
, 'r', encoding
='utf-8') as cachef
:
417 cache_spec
= json
.load(cachef
)
418 return lambda s
: u
''.join(s
[i
] for i
in cache_spec
)
420 pass # No cache available
422 if player_type
== 'js':
423 code
= self
._download
_webpage
(
424 player_url
, video_id
,
425 note
=u
'Downloading %s player %s' % (player_type
, player_id
),
426 errnote
=u
'Download of %s failed' % player_url
)
427 res
= self
._parse
_sig
_js
(code
)
428 elif player_type
== 'swf':
429 urlh
= self
._request
_webpage
(
430 player_url
, video_id
,
431 note
=u
'Downloading %s player %s' % (player_type
, player_id
),
432 errnote
=u
'Download of %s failed' % player_url
)
434 res
= self
._parse
_sig
_swf
(code
)
436 assert False, 'Invalid player type %r' % player_type
440 test_string
= u
''.join(map(compat_chr
, range(slen
)))
441 cache_res
= res(test_string
)
442 cache_spec
= [ord(c
) for c
in cache_res
]
444 os
.makedirs(os
.path
.dirname(cache_fn
))
445 except OSError as ose
:
446 if ose
.errno
!= errno
.EEXIST
:
448 write_json_file(cache_spec
, cache_fn
)
450 tb
= traceback
.format_exc()
451 self
._downloader
.report_warning(
452 u
'Writing cache to %r failed: %s' % (cache_fn
, tb
))
456 def _print_sig_code(self
, func
, slen
):
457 def gen_sig_code(idxs
):
458 def _genslice(start
, end
, step
):
459 starts
= u
'' if start
== 0 else str(start
)
460 ends
= (u
':%d' % (end
+step
)) if end
+ step
>= 0 else u
':'
461 steps
= u
'' if step
== 1 else (u
':%d' % step
)
462 return u
's[%s%s%s]' % (starts
, ends
, steps
)
465 start
= '(Never used)' # Quelch pyflakes warnings - start will be
466 # set as soon as step is set
467 for i
, prev
in zip(idxs
[1:], idxs
[:-1]):
471 yield _genslice(start
, prev
, step
)
474 if i
- prev
in [-1, 1]:
479 yield u
's[%d]' % prev
483 yield _genslice(start
, i
, step
)
485 test_string
= u
''.join(map(compat_chr
, range(slen
)))
486 cache_res
= func(test_string
)
487 cache_spec
= [ord(c
) for c
in cache_res
]
488 expr_code
= u
' + '.join(gen_sig_code(cache_spec
))
489 code
= u
'if len(s) == %d:\n return %s\n' % (slen
, expr_code
)
490 self
.to_screen(u
'Extracted signature function:\n' + code
)
492 def _parse_sig_js(self
, jscode
):
493 funcname
= self
._search
_regex
(
494 r
'signature=([a-zA-Z]+)', jscode
,
495 u
'Initial JS player signature function name')
500 return string
.lowercase
.index(varname
)
502 def interpret_statement(stmt
, local_vars
, allow_recursion
=20):
503 if allow_recursion
< 0:
504 raise ExtractorError(u
'Recursion limit reached')
506 if stmt
.startswith(u
'var '):
507 stmt
= stmt
[len(u
'var '):]
508 ass_m
= re
.match(r
'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
509 r
'=(?P<expr>.*)$', stmt
)
511 if ass_m
.groupdict().get('index'):
513 lvar
= local_vars
[ass_m
.group('out')]
514 idx
= interpret_expression(ass_m
.group('index'),
515 local_vars
, allow_recursion
)
516 assert isinstance(idx
, int)
519 expr
= ass_m
.group('expr')
522 local_vars
[ass_m
.group('out')] = val
524 expr
= ass_m
.group('expr')
525 elif stmt
.startswith(u
'return '):
527 expr
= stmt
[len(u
'return '):]
529 raise ExtractorError(
530 u
'Cannot determine left side of statement in %r' % stmt
)
532 v
= interpret_expression(expr
, local_vars
, allow_recursion
)
535 def interpret_expression(expr
, local_vars
, allow_recursion
):
540 return local_vars
[expr
]
542 m
= re
.match(r
'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr
)
544 member
= m
.group('member')
545 val
= local_vars
[m
.group('in')]
546 if member
== 'split("")':
548 if member
== 'join("")':
550 if member
== 'length':
552 if member
== 'reverse()':
554 slice_m
= re
.match(r
'slice\((?P<idx>.*)\)', member
)
556 idx
= interpret_expression(
557 slice_m
.group('idx'), local_vars
, allow_recursion
-1)
561 r
'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr
)
563 val
= local_vars
[m
.group('in')]
564 idx
= interpret_expression(m
.group('idx'), local_vars
,
568 m
= re
.match(r
'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr
)
570 a
= interpret_expression(m
.group('a'),
571 local_vars
, allow_recursion
)
572 b
= interpret_expression(m
.group('b'),
573 local_vars
, allow_recursion
)
577 r
'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr
)
579 fname
= m
.group('func')
580 if fname
not in functions
:
581 functions
[fname
] = extract_function(fname
)
582 argvals
= [int(v
) if v
.isdigit() else local_vars
[v
]
583 for v
in m
.group('args').split(',')]
584 return functions
[fname
](argvals
)
585 raise ExtractorError(u
'Unsupported JS expression %r' % expr
)
587 def extract_function(funcname
):
589 r
'function ' + re
.escape(funcname
) +
590 r
'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
592 argnames
= func_m
.group('args').split(',')
595 local_vars
= dict(zip(argnames
, args
))
596 for stmt
in func_m
.group('code').split(';'):
597 res
= interpret_statement(stmt
, local_vars
)
601 initial_function
= extract_function(funcname
)
602 return lambda s
: initial_function([s
])
604 def _parse_sig_swf(self
, file_contents
):
605 if file_contents
[1:3] != b
'WS':
606 raise ExtractorError(
607 u
'Not an SWF file; header is %r' % file_contents
[:3])
608 if file_contents
[:1] == b
'C':
609 content
= zlib
.decompress(file_contents
[8:])
611 raise NotImplementedError(u
'Unsupported compression format %r' %
614 def extract_tags(content
):
616 while pos
< len(content
):
617 header16
= struct
.unpack('<H', content
[pos
:pos
+2])[0]
619 tag_code
= header16
>> 6
620 tag_len
= header16
& 0x3f
622 tag_len
= struct
.unpack('<I', content
[pos
:pos
+4])[0]
624 assert pos
+tag_len
<= len(content
)
625 yield (tag_code
, content
[pos
:pos
+tag_len
])
629 for tag_code
, tag
in extract_tags(content
)
631 p
= code_tag
.index(b
'\0', 4) + 1
632 code_reader
= io
.BytesIO(code_tag
[p
:])
634 # Parse ABC (AVM2 ByteCode)
635 def read_int(reader
=None):
643 b
= struct
.unpack('<B', buf
)[0]
644 res
= res |
((b
& 0x7f) << shift
)
650 def u30(reader
=None):
651 res
= read_int(reader
)
652 assert res
& 0xf0000000 == 0
656 def s32(reader
=None):
658 if v
& 0x80000000 != 0:
659 v
= - ((v ^
0xffffffff) + 1)
662 def read_string(reader
=None):
666 resb
= reader
.read(slen
)
667 assert len(resb
) == slen
668 return resb
.decode('utf-8')
670 def read_bytes(count
, reader
=None):
673 resb
= reader
.read(count
)
674 assert len(resb
) == count
677 def read_byte(reader
=None):
678 resb
= read_bytes(1, reader
=reader
)
679 res
= struct
.unpack('<B', resb
)[0]
682 # minor_version + major_version
687 for _c
in range(1, int_count
):
690 for _c
in range(1, uint_count
):
693 read_bytes((double_count
-1) * 8)
695 constant_strings
= [u
'']
696 for _c
in range(1, string_count
):
698 constant_strings
.append(s
)
699 namespace_count
= u30()
700 for _c
in range(1, namespace_count
):
704 for _c
in range(1, ns_set_count
):
706 for _c2
in range(count
):
708 multiname_count
= u30()
717 0x0e: 2, # MultinameA
718 0x1b: 1, # MultinameL
719 0x1c: 1, # MultinameLA
722 for _c
in range(1, multiname_count
):
724 assert kind
in MULTINAME_SIZES
, u
'Invalid multiname kind %r' % kind
726 u30() # namespace_idx
728 multinames
.append(constant_strings
[name_idx
])
730 multinames
.append('[MULTINAME kind: %d]' % kind
)
731 for _c2
in range(MULTINAME_SIZES
[kind
]):
736 MethodInfo
= collections
.namedtuple(
738 ['NEED_ARGUMENTS', 'NEED_REST'])
740 for method_id
in range(method_count
):
743 for _
in range(param_count
):
745 u30() # name index (always 0 for youtube)
747 if flags
& 0x08 != 0:
750 for c
in range(option_count
):
753 if flags
& 0x80 != 0:
754 # Param names present
755 for _
in range(param_count
):
757 mi
= MethodInfo(flags
& 0x01 != 0, flags
& 0x04 != 0)
758 method_infos
.append(mi
)
761 metadata_count
= u30()
762 for _c
in range(metadata_count
):
765 for _c2
in range(item_count
):
769 def parse_traits_info():
770 trait_name_idx
= u30()
771 kind_full
= read_byte()
772 kind
= kind_full
& 0x0f
773 attrs
= kind_full
>> 4
775 if kind
in [0x00, 0x06]: # Slot or Const
777 u30() # type_name_idx
781 elif kind
in [0x01, 0x02, 0x03]: # Method / Getter / Setter
784 methods
[multinames
[trait_name_idx
]] = method_idx
785 elif kind
== 0x04: # Class
788 elif kind
== 0x05: # Function
791 methods
[function_idx
] = multinames
[trait_name_idx
]
793 raise ExtractorError(u
'Unsupported trait kind %d' % kind
)
795 if attrs
& 0x4 != 0: # Metadata present
796 metadata_count
= u30()
797 for _c3
in range(metadata_count
):
798 u30() # metadata index
803 TARGET_CLASSNAME
= u
'SignatureDecipher'
804 searched_idx
= multinames
.index(TARGET_CLASSNAME
)
805 searched_class_id
= None
807 for class_id
in range(class_count
):
809 if name_idx
== searched_idx
:
810 # We found the class we're looking for!
811 searched_class_id
= class_id
812 u30() # super_name idx
814 if flags
& 0x08 != 0: # Protected namespace is present
815 u30() # protected_ns_idx
817 for _c2
in range(intrf_count
):
821 for _c2
in range(trait_count
):
824 if searched_class_id
is None:
825 raise ExtractorError(u
'Target class %r not found' %
830 for class_id
in range(class_count
):
833 for _c2
in range(trait_count
):
834 trait_methods
= parse_traits_info()
835 if class_id
== searched_class_id
:
836 method_names
.update(trait_methods
.items())
837 method_idxs
.update(dict(
839 for name
, idx
in trait_methods
.items()))
843 for _c
in range(script_count
):
846 for _c2
in range(trait_count
):
850 method_body_count
= u30()
851 Method
= collections
.namedtuple('Method', ['code', 'local_count'])
853 for _c
in range(method_body_count
):
857 u30() # init_scope_depth
858 u30() # max_scope_depth
860 code
= read_bytes(code_length
)
861 if method_idx
in method_idxs
:
862 m
= Method(code
, local_count
)
863 methods
[method_idxs
[method_idx
]] = m
864 exception_count
= u30()
865 for _c2
in range(exception_count
):
872 for _c2
in range(trait_count
):
875 assert p
+ code_reader
.tell() == len(code_tag
)
876 assert len(methods
) == len(method_idxs
)
878 method_pyfunctions
= {}
880 def extract_function(func_name
):
881 if func_name
in method_pyfunctions
:
882 return method_pyfunctions
[func_name
]
883 if func_name
not in methods
:
884 raise ExtractorError(u
'Cannot find function %r' % func_name
)
885 m
= methods
[func_name
]
888 registers
= ['(this)'] + list(args
) + [None] * m
.local_count
890 coder
= io
.BytesIO(m
.code
)
892 opcode
= struct
.unpack('!B', coder
.read(1))[0]
893 if opcode
== 36: # pushbyte
894 v
= struct
.unpack('!B', coder
.read(1))[0]
896 elif opcode
== 44: # pushstring
898 stack
.append(constant_strings
[idx
])
899 elif opcode
== 48: # pushscope
900 # We don't implement the scope register, so we'll just
901 # ignore the popped value
903 elif opcode
== 70: # callproperty
905 mname
= multinames
[index
]
906 arg_count
= u30(coder
)
907 args
= list(reversed(
908 [stack
.pop() for _
in range(arg_count
)]))
910 if mname
== u
'split':
911 assert len(args
) == 1
912 assert isinstance(args
[0], compat_str
)
913 assert isinstance(obj
, compat_str
)
917 res
= obj
.split(args
[0])
919 elif mname
== u
'slice':
920 assert len(args
) == 1
921 assert isinstance(args
[0], int)
922 assert isinstance(obj
, list)
925 elif mname
== u
'join':
926 assert len(args
) == 1
927 assert isinstance(args
[0], compat_str
)
928 assert isinstance(obj
, list)
929 res
= args
[0].join(obj
)
931 elif mname
in method_pyfunctions
:
932 stack
.append(method_pyfunctions
[mname
](args
))
934 raise NotImplementedError(
935 u
'Unsupported property %r on %r'
937 elif opcode
== 72: # returnvalue
940 elif opcode
== 79: # callpropvoid
942 mname
= multinames
[index
]
943 arg_count
= u30(coder
)
944 args
= list(reversed(
945 [stack
.pop() for _
in range(arg_count
)]))
947 if mname
== u
'reverse':
948 assert isinstance(obj
, list)
951 raise NotImplementedError(
952 u
'Unsupported (void) property %r on %r'
954 elif opcode
== 93: # findpropstrict
956 mname
= multinames
[index
]
957 res
= extract_function(mname
)
959 elif opcode
== 97: # setproperty
964 assert isinstance(obj
, list)
965 assert isinstance(idx
, int)
967 elif opcode
== 98: # getlocal
969 stack
.append(registers
[index
])
970 elif opcode
== 99: # setlocal
973 registers
[index
] = value
974 elif opcode
== 102: # getproperty
976 pname
= multinames
[index
]
977 if pname
== u
'length':
979 assert isinstance(obj
, list)
980 stack
.append(len(obj
))
981 else: # Assume attribute access
983 assert isinstance(idx
, int)
985 assert isinstance(obj
, list)
986 stack
.append(obj
[idx
])
987 elif opcode
== 128: # coerce
989 elif opcode
== 133: # coerce_s
990 assert isinstance(stack
[-1], (type(None), compat_str
))
991 elif opcode
== 164: # modulo
994 res
= value1
% value2
996 elif opcode
== 208: # getlocal_0
997 stack
.append(registers
[0])
998 elif opcode
== 209: # getlocal_1
999 stack
.append(registers
[1])
1000 elif opcode
== 210: # getlocal_2
1001 stack
.append(registers
[2])
1002 elif opcode
== 211: # getlocal_3
1003 stack
.append(registers
[3])
1004 elif opcode
== 214: # setlocal_2
1005 registers
[2] = stack
.pop()
1006 elif opcode
== 215: # setlocal_3
1007 registers
[3] = stack
.pop()
1009 raise NotImplementedError(
1010 u
'Unsupported opcode %d' % opcode
)
1012 method_pyfunctions
[func_name
] = resfunc
1015 initial_function
= extract_function(u
'decipher')
1016 return lambda s
: initial_function([s
])
1018 def _decrypt_signature(self
, s
, video_id
, player_url
, age_gate
=False):
1019 """Turn the encrypted s field into a working signature"""
1021 if player_url
is not None:
1023 player_id
= (player_url
, len(s
))
1024 if player_id
not in self
._player
_cache
:
1025 func
= self
._extract
_signature
_function
(
1026 video_id
, player_url
, len(s
)
1028 self
._player
_cache
[player_id
] = func
1029 func
= self
._player
_cache
[player_id
]
1030 if self
._downloader
.params
.get('youtube_print_sig_code'):
1031 self
._print
_sig
_code
(func
, len(s
))
1034 tb
= traceback
.format_exc()
1035 self
._downloader
.report_warning(
1036 u
'Automatic signature extraction failed: ' + tb
)
1038 self
._downloader
.report_warning(
1039 u
'Warning: Falling back to static signature algorithm')
1041 return self
._static
_decrypt
_signature
(
1042 s
, video_id
, player_url
, age_gate
)
1044 def _static_decrypt_signature(self
, s
, video_id
, player_url
, age_gate
):
1046 # The videos with age protection use another player, so the
1047 # algorithms can be different.
1049 return s
[2:63] + s
[82] + s
[64:82] + s
[63]
1052 return s
[86:29:-1] + s
[88] + s
[28:5:-1]
1054 return s
[25] + s
[3:25] + s
[0] + s
[26:42] + s
[79] + s
[43:79] + s
[91] + s
[80:83]
1056 return s
[84:27:-1] + s
[86] + s
[26:5:-1]
1058 return s
[25] + s
[3:25] + s
[2] + s
[26:40] + s
[77] + s
[41:77] + s
[89] + s
[78:81]
1060 return s
[84:78:-1] + s
[87] + s
[77:60:-1] + s
[0] + s
[59:3:-1]
1062 return s
[7:28] + s
[87] + s
[29:45] + s
[55] + s
[46:55] + s
[2] + s
[56:87] + s
[28]
1064 return s
[6:27] + s
[4] + s
[28:39] + s
[27] + s
[40:59] + s
[2] + s
[60:]
1066 return s
[80:72:-1] + s
[16] + s
[71:39:-1] + s
[72] + s
[38:16:-1] + s
[82] + s
[15::-1]
1068 return s
[3:11] + s
[0] + s
[12:55] + s
[84] + s
[56:84]
1070 return s
[78:70:-1] + s
[14] + s
[69:37:-1] + s
[70] + s
[36:14:-1] + s
[80] + s
[:14][::-1]
1072 return s
[80:63:-1] + s
[0] + s
[62:0:-1] + s
[63]
1074 return s
[80:37:-1] + s
[7] + s
[36:7:-1] + s
[0] + s
[6:0:-1] + s
[37]
1076 return s
[56] + s
[79:56:-1] + s
[41] + s
[55:41:-1] + s
[80] + s
[40:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
1078 return s
[1:19] + s
[0] + s
[20:68] + s
[19] + s
[69:80]
1080 return s
[54] + s
[77:54:-1] + s
[39] + s
[53:39:-1] + s
[78] + s
[38:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
1083 raise ExtractorError(u
'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s
)))
1085 def _get_available_subtitles(self
, video_id
, webpage
):
1087 sub_list
= self
._download
_webpage
(
1088 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
,
1089 video_id
, note
=False)
1090 except ExtractorError
as err
:
1091 self
._downloader
.report_warning(u
'unable to download video subtitles: %s' % compat_str(err
))
1093 lang_list
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list
)
1098 params
= compat_urllib_parse
.urlencode({
1101 'fmt': self
._downloader
.params
.get('subtitlesformat'),
1102 'name': l
[0].encode('utf-8'),
1104 url
= u
'http://www.youtube.com/api/timedtext?' + params
1105 sub_lang_list
[lang
] = url
1106 if not sub_lang_list
:
1107 self
._downloader
.report_warning(u
'video doesn\'t have subtitles')
1109 return sub_lang_list
1111 def _get_available_automatic_caption(self
, video_id
, webpage
):
1112 """We need the webpage for getting the captions url, pass it as an
1113 argument to speed up the process."""
1114 sub_format
= self
._downloader
.params
.get('subtitlesformat')
1115 self
.to_screen(u
'%s: Looking for automatic captions' % video_id
)
1116 mobj
= re
.search(r
';ytplayer.config = ({.*?});', webpage
)
1117 err_msg
= u
'Couldn\'t find automatic captions for %s' % video_id
1119 self
._downloader
.report_warning(err_msg
)
1121 player_config
= json
.loads(mobj
.group(1))
1123 args
= player_config
[u
'args']
1124 caption_url
= args
[u
'ttsurl']
1125 timestamp
= args
[u
'timestamp']
1126 # We get the available subtitles
1127 list_params
= compat_urllib_parse
.urlencode({
1132 list_url
= caption_url
+ '&' + list_params
1133 list_page
= self
._download
_webpage
(list_url
, video_id
)
1134 caption_list
= xml
.etree
.ElementTree
.fromstring(list_page
.encode('utf-8'))
1135 original_lang_node
= caption_list
.find('track')
1136 if original_lang_node
is None or original_lang_node
.attrib
.get('kind') != 'asr' :
1137 self
._downloader
.report_warning(u
'Video doesn\'t have automatic captions')
1139 original_lang
= original_lang_node
.attrib
['lang_code']
1142 for lang_node
in caption_list
.findall('target'):
1143 sub_lang
= lang_node
.attrib
['lang_code']
1144 params
= compat_urllib_parse
.urlencode({
1145 'lang': original_lang
,
1151 sub_lang_list
[sub_lang
] = caption_url
+ '&' + params
1152 return sub_lang_list
1153 # An extractor error can be raise by the download process if there are
1154 # no automatic captions but there are subtitles
1155 except (KeyError, ExtractorError
):
1156 self
._downloader
.report_warning(err_msg
)
1159 def _print_formats(self
, formats
):
1160 print('Available formats:')
1162 print('%s\t:\t%s\t[%s]%s' %(x
, self
._video
_extensions
.get(x
, 'flv'),
1163 self
._video
_dimensions
.get(x
, '???'),
1164 ' ('+self
._special
_itags
[x
]+')' if x
in self
._special
_itags
else ''))
1166 def _extract_id(self
, url
):
1167 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
1169 raise ExtractorError(u
'Invalid URL: %s' % url
)
1170 video_id
= mobj
.group(2)
1173 def _get_video_url_list(self
, url_map
):
1175 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1176 with the requested formats.
1178 req_format
= self
._downloader
.params
.get('format', None)
1179 format_limit
= self
._downloader
.params
.get('format_limit', None)
1180 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
1181 if format_limit
is not None and format_limit
in available_formats
:
1182 format_list
= available_formats
[available_formats
.index(format_limit
):]
1184 format_list
= available_formats
1185 existing_formats
= [x
for x
in format_list
if x
in url_map
]
1186 if len(existing_formats
) == 0:
1187 raise ExtractorError(u
'no known formats available for video')
1188 if self
._downloader
.params
.get('listformats', None):
1189 self
._print
_formats
(existing_formats
)
1191 if req_format
is None or req_format
== 'best':
1192 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
1193 elif req_format
== 'worst':
1194 video_url_list
= [(existing_formats
[-1], url_map
[existing_formats
[-1]])] # worst quality
1195 elif req_format
in ('-1', 'all'):
1196 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
1198 # Specific formats. We pick the first in a slash-delimeted sequence.
1199 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1200 # available in the specified format. For example,
1201 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1202 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1203 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1204 req_formats
= req_format
.split('/')
1205 video_url_list
= None
1206 for rf
in req_formats
:
1208 video_url_list
= [(rf
, url_map
[rf
])]
1210 if rf
in self
._video
_formats
_map
:
1211 for srf
in self
._video
_formats
_map
[rf
]:
1213 video_url_list
= [(srf
, url_map
[srf
])]
1218 if video_url_list
is None:
1219 raise ExtractorError(u
'requested format not available')
1220 return video_url_list
1222 def _extract_from_m3u8(self
, manifest_url
, video_id
):
1224 def _get_urls(_manifest
):
1225 lines
= _manifest
.split('\n')
1226 urls
= filter(lambda l
: l
and not l
.startswith('#'),
1229 manifest
= self
._download
_webpage
(manifest_url
, video_id
, u
'Downloading formats manifest')
1230 formats_urls
= _get_urls(manifest
)
1231 for format_url
in formats_urls
:
1232 itag
= self
._search
_regex
(r
'itag/(\d+?)/', format_url
, 'itag')
1233 url_map
[itag
] = format_url
1236 def _extract_annotations(self
, video_id
):
1237 url
= 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1238 return self
._download
_webpage
(url
, video_id
, note
=u
'Searching for annotations.', errnote
=u
'Unable to download video annotations.')
1240 def _real_extract(self
, url
):
1241 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1242 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
1244 url
= 'https://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/')
1245 video_id
= self
._extract
_id
(url
)
1248 self
.report_video_webpage_download(video_id
)
1249 url
= 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1250 request
= compat_urllib_request
.Request(url
)
1252 video_webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
1253 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
1254 raise ExtractorError(u
'Unable to download video webpage: %s' % compat_str(err
))
1256 video_webpage
= video_webpage_bytes
.decode('utf-8', 'ignore')
1258 # Attempt to extract SWF player URL
1259 mobj
= re
.search(r
'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
1260 if mobj
is not None:
1261 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
1266 self
.report_video_info_webpage_download(video_id
)
1267 if re
.search(r
'player-age-gate-content">', video_webpage
) is not None:
1268 self
.report_age_confirmation()
1270 # We simulate the access to the video from www.youtube.com/v/{video_id}
1271 # this can be viewed without login into Youtube
1272 data
= compat_urllib_parse
.urlencode({'video_id': video_id
,
1276 'eurl': 'https://youtube.googleapis.com/v/' + video_id
,
1280 video_info_url
= 'https://www.youtube.com/get_video_info?' + data
1281 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
1283 errnote
='unable to download video info webpage')
1284 video_info
= compat_parse_qs(video_info_webpage
)
1287 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1288 video_info_url
= ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1289 % (video_id
, el_type
))
1290 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
1292 errnote
='unable to download video info webpage')
1293 video_info
= compat_parse_qs(video_info_webpage
)
1294 if 'token' in video_info
:
1296 if 'token' not in video_info
:
1297 if 'reason' in video_info
:
1298 raise ExtractorError(u
'YouTube said: %s' % video_info
['reason'][0], expected
=True)
1300 raise ExtractorError(u
'"token" parameter not in video info for unknown reason')
1302 # Check for "rental" videos
1303 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
1304 raise ExtractorError(u
'"rental" videos not supported')
1306 # Start extracting information
1307 self
.report_information_extraction(video_id
)
1310 if 'author' not in video_info
:
1311 raise ExtractorError(u
'Unable to extract uploader name')
1312 video_uploader
= compat_urllib_parse
.unquote_plus(video_info
['author'][0])
1315 video_uploader_id
= None
1316 mobj
= re
.search(r
'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage
)
1317 if mobj
is not None:
1318 video_uploader_id
= mobj
.group(1)
1320 self
._downloader
.report_warning(u
'unable to extract uploader nickname')
1323 if 'title' in video_info
:
1324 video_title
= compat_urllib_parse
.unquote_plus(video_info
['title'][0])
1326 self
._downloader
.report_warning(u
'Unable to extract video title')
1330 # We try first to get a high quality image:
1331 m_thumb
= re
.search(r
'<span itemprop="thumbnail".*?href="(.*?)">',
1332 video_webpage
, re
.DOTALL
)
1333 if m_thumb
is not None:
1334 video_thumbnail
= m_thumb
.group(1)
1335 elif 'thumbnail_url' not in video_info
:
1336 self
._downloader
.report_warning(u
'unable to extract video thumbnail')
1337 video_thumbnail
= None
1338 else: # don't panic if we can't find it
1339 video_thumbnail
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0])
1343 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
1344 if mobj
is not None:
1345 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
1346 upload_date
= unified_strdate(upload_date
)
1349 video_description
= get_element_by_id("eow-description", video_webpage
)
1350 if video_description
:
1351 video_description
= clean_html(video_description
)
1353 fd_mobj
= re
.search(r
'<meta name="description" content="([^"]+)"', video_webpage
)
1355 video_description
= unescapeHTML(fd_mobj
.group(1))
1357 video_description
= u
''
1360 video_subtitles
= self
.extract_subtitles(video_id
, video_webpage
)
1362 if self
._downloader
.params
.get('listsubtitles', False):
1363 self
._list
_available
_subtitles
(video_id
, video_webpage
)
1366 if 'length_seconds' not in video_info
:
1367 self
._downloader
.report_warning(u
'unable to extract video duration')
1370 video_duration
= compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0])
1373 video_annotations
= None
1374 if self
._downloader
.params
.get('writeannotations', False):
1375 video_annotations
= self
._extract
_annotations
(video_id
)
1377 # Decide which formats to download
1380 mobj
= re
.search(r
';ytplayer.config = ({.*?});', video_webpage
)
1382 raise ValueError('Could not find vevo ID')
1383 info
= json
.loads(mobj
.group(1))
1385 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1386 # this signatures are encrypted
1387 if 'url_encoded_fmt_stream_map' not in args
:
1388 raise ValueError(u
'No stream_map present') # caught below
1389 re_signature
= re
.compile(r
'[&,]s=')
1390 m_s
= re_signature
.search(args
['url_encoded_fmt_stream_map'])
1392 self
.to_screen(u
'%s: Encrypted signatures detected.' % video_id
)
1393 video_info
['url_encoded_fmt_stream_map'] = [args
['url_encoded_fmt_stream_map']]
1394 m_s
= re_signature
.search(args
.get('adaptive_fmts', u
''))
1396 if 'adaptive_fmts' in video_info
:
1397 video_info
['adaptive_fmts'][0] += ',' + args
['adaptive_fmts']
1399 video_info
['adaptive_fmts'] = [args
['adaptive_fmts']]
1403 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
1404 self
.report_rtmp_download()
1405 video_url_list
= [(None, video_info
['conn'][0])]
1406 elif len(video_info
.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info
.get('adaptive_fmts', [])) >= 1:
1407 encoded_url_map
= video_info
.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info
.get('adaptive_fmts',[''])[0]
1408 if 'rtmpe%3Dyes' in encoded_url_map
:
1409 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected
=True)
1411 for url_data_str
in encoded_url_map
.split(','):
1412 url_data
= compat_parse_qs(url_data_str
)
1413 if 'itag' in url_data
and 'url' in url_data
:
1414 url
= url_data
['url'][0]
1415 if 'sig' in url_data
:
1416 url
+= '&signature=' + url_data
['sig'][0]
1417 elif 's' in url_data
:
1418 encrypted_sig
= url_data
['s'][0]
1419 if self
._downloader
.params
.get('verbose'):
1421 if player_url
is None:
1422 player_version
= 'unknown'
1424 player_version
= self
._search
_regex
(
1425 r
'-(.+)\.swf$', player_url
,
1426 u
'flash player', fatal
=False)
1427 player_desc
= 'flash player %s' % player_version
1429 player_version
= self
._search
_regex
(
1430 r
'html5player-(.+?)\.js', video_webpage
,
1431 'html5 player', fatal
=False)
1432 player_desc
= u
'html5 player %s' % player_version
1434 parts_sizes
= u
'.'.join(compat_str(len(part
)) for part
in encrypted_sig
.split('.'))
1435 self
.to_screen(u
'encrypted signature length %d (%s), itag %s, %s' %
1436 (len(encrypted_sig
), parts_sizes
, url_data
['itag'][0], player_desc
))
1439 jsplayer_url_json
= self
._search
_regex
(
1440 r
'"assets":.+?"js":\s*("[^"]+")',
1441 video_webpage
, u
'JS player URL')
1442 player_url
= json
.loads(jsplayer_url_json
)
1444 signature
= self
._decrypt
_signature
(
1445 encrypted_sig
, video_id
, player_url
, age_gate
)
1446 url
+= '&signature=' + signature
1447 if 'ratebypass' not in url
:
1448 url
+= '&ratebypass=yes'
1449 url_map
[url_data
['itag'][0]] = url
1450 video_url_list
= self
._get
_video
_url
_list
(url_map
)
1451 if not video_url_list
:
1453 elif video_info
.get('hlsvp'):
1454 manifest_url
= video_info
['hlsvp'][0]
1455 url_map
= self
._extract
_from
_m
3u8(manifest_url
, video_id
)
1456 video_url_list
= self
._get
_video
_url
_list
(url_map
)
1457 if not video_url_list
:
1461 raise ExtractorError(u
'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1464 for itag
, video_real_url
in video_url_list
:
1466 video_extension
= self
._video
_extensions
.get(itag
, 'flv')
1468 video_format
= '{0} - {1}{2}'.format(itag
if itag
else video_extension
,
1469 self
._video
_dimensions
.get(itag
, '???'),
1470 ' ('+self
._special
_itags
[itag
]+')' if itag
in self
._special
_itags
else '')
1474 'url': video_real_url
,
1475 'uploader': video_uploader
,
1476 'uploader_id': video_uploader_id
,
1477 'upload_date': upload_date
,
1478 'title': video_title
,
1479 'ext': video_extension
,
1480 'format': video_format
,
1482 'thumbnail': video_thumbnail
,
1483 'description': video_description
,
1484 'player_url': player_url
,
1485 'subtitles': video_subtitles
,
1486 'duration': video_duration
,
1487 'age_limit': 18 if age_gate
else 0,
1488 'annotations': video_annotations
,
1489 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id
,
1493 class YoutubePlaylistIE(InfoExtractor
):
1494 IE_DESC
= u
'YouTube.com playlists'
1495 _VALID_URL
= r
"""(?:
1500 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1501 \? (?:.*?&)*? (?:p|a|list)=
1504 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
1507 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
1509 _TEMPLATE_URL
= 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1511 IE_NAME
= u
'youtube:playlist'
def suitable(cls, url):
    """Return True when *url* matches this extractor's (verbose-mode) URL pattern."""
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
1518 def _real_extract(self
, url
):
1519 # Extract playlist id
1520 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
1522 raise ExtractorError(u
'Invalid URL: %s' % url
)
1523 playlist_id
= mobj
.group(1) or mobj
.group(2)
1525 # Check if it's a video-specific URL
1526 query_dict
= compat_urlparse
.parse_qs(compat_urlparse
.urlparse(url
).query
)
1527 if 'v' in query_dict
:
1528 video_id
= query_dict
['v'][0]
1529 if self
._downloader
.params
.get('noplaylist'):
1530 self
.to_screen(u
'Downloading just video %s because of --no-playlist' % video_id
)
1531 return self
.url_result('https://www.youtube.com/watch?v=' + video_id
, 'Youtube')
1533 self
.to_screen(u
'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id
, video_id
))
1535 # Download playlist videos from API
1538 for page_num
in itertools
.count(1):
1539 start_index
= self
._MAX
_RESULTS
* (page_num
- 1) + 1
1540 if start_index
>= 1000:
1541 self
._downloader
.report_warning(u
'Max number of results reached')
1543 url
= self
._TEMPLATE
_URL
% (playlist_id
, self
._MAX
_RESULTS
, start_index
)
1544 page
= self
._download
_webpage
(url
, playlist_id
, u
'Downloading page #%s' % page_num
)
1547 response
= json
.loads(page
)
1548 except ValueError as err
:
1549 raise ExtractorError(u
'Invalid JSON in API response: ' + compat_str(err
))
1551 if 'feed' not in response
:
1552 raise ExtractorError(u
'Got a malformed response from YouTube API')
1553 playlist_title
= response
['feed']['title']['$t']
1554 if 'entry' not in response
['feed']:
1555 # Number of videos is a multiple of self._MAX_RESULTS
1558 for entry
in response
['feed']['entry']:
1559 index
= entry
['yt$position']['$t']
1560 if 'media$group' in entry
and 'yt$videoid' in entry
['media$group']:
1563 'https://www.youtube.com/watch?v=' + entry
['media$group']['yt$videoid']['$t']
1566 videos
= [v
[1] for v
in sorted(videos
)]
1568 url_results
= [self
.url_result(vurl
, 'Youtube') for vurl
in videos
]
1569 return [self
.playlist_result(url_results
, playlist_id
, playlist_title
)]
1572 class YoutubeChannelIE(InfoExtractor
):
1573 IE_DESC
= u
'YouTube.com channels'
1574 _VALID_URL
= r
"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1575 _MORE_PAGES_INDICATOR
= 'yt-uix-load-more'
1576 _MORE_PAGES_URL
= 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1577 IE_NAME
= u
'youtube:channel'
1579 def extract_videos_from_page(self
, page
):
1581 for mobj
in re
.finditer(r
'href="/watch\?v=([0-9A-Za-z_-]+)&?', page
):
1582 if mobj
.group(1) not in ids_in_page
:
1583 ids_in_page
.append(mobj
.group(1))
1586 def _real_extract(self
, url
):
1587 # Extract channel id
1588 mobj
= re
.match(self
._VALID
_URL
, url
)
1590 raise ExtractorError(u
'Invalid URL: %s' % url
)
1592 # Download channel page
1593 channel_id
= mobj
.group(1)
1596 # Download all channel pages using the json-based channel_ajax query
1597 for pagenum
in itertools
.count(1):
1598 url
= self
._MORE
_PAGES
_URL
% (pagenum
, channel_id
)
1599 page
= self
._download
_webpage
(url
, channel_id
,
1600 u
'Downloading page #%s' % pagenum
)
1602 page
= json
.loads(page
)
1604 ids_in_page
= self
.extract_videos_from_page(page
['content_html'])
1605 video_ids
.extend(ids_in_page
)
1607 if self
._MORE
_PAGES
_INDICATOR
not in page
['load_more_widget_html']:
1610 self
._downloader
.to_screen(u
'[youtube] Channel %s: Found %i videos' % (channel_id
, len(video_ids
)))
1612 urls
= ['http://www.youtube.com/watch?v=%s' % id for id in video_ids
]
1613 url_entries
= [self
.url_result(eurl
, 'Youtube') for eurl
in urls
]
1614 return [self
.playlist_result(url_entries
, channel_id
)]
1617 class YoutubeUserIE(InfoExtractor
):
1618 IE_DESC
= u
'YouTube.com user videos (URL or "ytuser" keyword)'
1619 _VALID_URL
= r
'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
1620 _TEMPLATE_URL
= 'http://gdata.youtube.com/feeds/api/users/%s'
1621 _GDATA_PAGE_SIZE
= 50
1622 _GDATA_URL
= 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
1623 IE_NAME
= u
'youtube:user'
def suitable(cls, url):
    """Claim *url* only when no other YouTube extractor matches it.

    The user-URL regex is very permissive, so every other, more specific
    *IE class defined in this module gets the first chance at the URL.
    """
    for ie_name, ie_class in globals().items():
        if ie_name.endswith('IE') and ie_class is not cls and ie_class.suitable(url):
            return False
    return super(YoutubeUserIE, cls).suitable(url)
1633 def _real_extract(self
, url
):
1635 mobj
= re
.match(self
._VALID
_URL
, url
)
1637 raise ExtractorError(u
'Invalid URL: %s' % url
)
1639 username
= mobj
.group(1)
1641 # Download video ids using YouTube Data API. Result size per
1642 # query is limited (currently to 50 videos) so we need to query
1643 # page by page until there are no video ids - it means we got
1648 for pagenum
in itertools
.count(0):
1649 start_index
= pagenum
* self
._GDATA
_PAGE
_SIZE
+ 1
1651 gdata_url
= self
._GDATA
_URL
% (username
, self
._GDATA
_PAGE
_SIZE
, start_index
)
1652 page
= self
._download
_webpage
(gdata_url
, username
,
1653 u
'Downloading video ids from %d to %d' % (start_index
, start_index
+ self
._GDATA
_PAGE
_SIZE
))
1656 response
= json
.loads(page
)
1657 except ValueError as err
:
1658 raise ExtractorError(u
'Invalid JSON in API response: ' + compat_str(err
))
1659 if 'entry' not in response
['feed']:
1660 # Number of videos is a multiple of self._MAX_RESULTS
1663 # Extract video identifiers
1665 for entry
in response
['feed']['entry']:
1666 ids_in_page
.append(entry
['id']['$t'].split('/')[-1])
1667 video_ids
.extend(ids_in_page
)
1669 # A little optimization - if current page is not
1670 # "full", ie. does not contain PAGE_SIZE video ids then
1671 # we can assume that this page is the last one - there
1672 # are no more ids on further pages - no need to query
1675 if len(ids_in_page
) < self
._GDATA
_PAGE
_SIZE
:
1678 urls
= ['http://www.youtube.com/watch?v=%s' % video_id
for video_id
in video_ids
]
1679 url_results
= [self
.url_result(rurl
, 'Youtube') for rurl
in urls
]
1680 return [self
.playlist_result(url_results
, playlist_title
= username
)]
1682 class YoutubeSearchIE(SearchInfoExtractor
):
1683 IE_DESC
= u
'YouTube.com searches'
1684 _API_URL
= 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1686 IE_NAME
= u
'youtube:search'
1687 _SEARCH_KEY
= 'ytsearch'
def report_download_page(self, query, pagenum):
    """Report attempt to download search page with given number."""
    message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
    self._downloader.to_screen(message)
1693 def _get_n_results(self
, query
, n
):
1694 """Get a specified number of results for a query"""
1700 while (50 * pagenum
) < limit
:
1701 self
.report_download_page(query
, pagenum
+1)
1702 result_url
= self
._API
_URL
% (compat_urllib_parse
.quote_plus(query
), (50*pagenum
)+1)
1703 request
= compat_urllib_request
.Request(result_url
)
1705 data
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
1706 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
1707 raise ExtractorError(u
'Unable to download API page: %s' % compat_str(err
))
1708 api_response
= json
.loads(data
)['data']
1710 if not 'items' in api_response
:
1711 raise ExtractorError(u
'[youtube] No video results')
1713 new_ids
= list(video
['id'] for video
in api_response
['items'])
1714 video_ids
+= new_ids
1716 limit
= min(n
, api_response
['totalItems'])
1719 if len(video_ids
) > n
:
1720 video_ids
= video_ids
[:n
]
1721 videos
= [self
.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids
]
1722 return self
.playlist_result(videos
, query
)
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Identical to YoutubeSearchIE, but the API orders results by upload date."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same gdata endpoint as the parent, plus orderby=published.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube (multi-season) show pages."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Resolve a show page into one playlist result per season."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches
        ]
1744 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor
):
1746 Base class for extractors that fetch info from
1747 http://www.youtube.com/feed_ajax
1748 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1750 _LOGIN_REQUIRED
= True
1752 # use action_load_personal_feed instead of action_load_system_feed
1753 _PERSONAL_FEED
= False
1756 def _FEED_TEMPLATE(self
):
1757 action
= 'action_load_system_feed'
1758 if self
._PERSONAL
_FEED
:
1759 action
= 'action_load_personal_feed'
1760 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action
, self
._FEED
_NAME
)
1764 return u
'youtube:%s' % self
._FEED
_NAME
1766 def _real_initialize(self
):
1769 def _real_extract(self
, url
):
1771 # The step argument is available only in 2.7 or higher
1772 for i
in itertools
.count(0):
1773 paging
= i
*self
._PAGING
_STEP
1774 info
= self
._download
_webpage
(self
._FEED
_TEMPLATE
% paging
,
1775 u
'%s feed' % self
._FEED
_NAME
,
1776 u
'Downloading page %s' % i
)
1777 info
= json
.loads(info
)
1778 feed_html
= info
['feed_html']
1779 m_ids
= re
.finditer(r
'"/watch\?v=(.*?)["&]', feed_html
)
1780 ids
= orderedSet(m
.group(1) for m
in m_ids
)
1781 feed_entries
.extend(self
.url_result(id, 'Youtube') for id in ids
)
1782 if info
['paging'] is None:
1784 return self
.playlist_result(feed_entries
, playlist_title
=self
._PLAYLIST
_TITLE
)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended-videos feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's Watch Later list."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    # Watch Later is per-user, so it must be fetched via the personal-feed action.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourite videos (requires login)."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the favourites playlist id on the logged-in page and delegate to the playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
1818 class YoutubeTruncatedURLIE(InfoExtractor
):
1819 IE_NAME
= 'youtube:truncated_url'
1820 IE_DESC
= False # Do not list
1821 _VALID_URL
= r
'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1823 def _real_extract(self
, url
):
1824 raise ExtractorError(
1825 u
'Did you forget to quote the URL? Remember that & is a meta '
1826 u
'character in most shells, so you want to put the URL in quotes, '
1828 u
'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1829 u
' (or simply youtube-dl BaW_jenozKc ).',