15 from .common
import InfoExtractor
, SearchInfoExtractor
16 from .subtitles
import SubtitlesInfoExtractor
21 compat_urllib_request
,
28 get_element_by_attribute
,
36 class YoutubeBaseInfoExtractor(InfoExtractor
):
37 """Provide base functions for Youtube extractors"""
38 _LOGIN_URL
= 'https://accounts.google.com/ServiceLogin'
39 _LANG_URL
= r
'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
40 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
41 _NETRC_MACHINE
= 'youtube'
42 # If True it will raise an error if no login info is provided
43 _LOGIN_REQUIRED
= False
45 def _set_language(self
):
46 return bool(self
._download
_webpage
(
48 note
=u
'Setting language', errnote
='unable to set language',
52 (username
, password
) = self
._get
_login
_info
()
53 # No authentication to be performed
55 if self
._LOGIN
_REQUIRED
:
56 raise ExtractorError(u
'No login info available, needed for using %s.' % self
.IE_NAME
, expected
=True)
59 login_page
= self
._download
_webpage
(
60 self
._LOGIN
_URL
, None,
61 note
=u
'Downloading login page',
62 errnote
=u
'unable to fetch login page', fatal
=False)
63 if login_page
is False:
66 galx
= self
._search
_regex
(r
'(?s)<input.+?name="GALX".+?value="(.+?)"',
67 login_page
, u
'Login GALX parameter')
71 u
'continue': u
'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
75 u
'PersistentCookie': u
'yes',
77 u
'bgresponse': u
'js_disabled',
78 u
'checkConnection': u
'',
79 u
'checkedDomains': u
'youtube',
84 u
'signIn': u
'Sign in',
86 u
'service': u
'youtube',
90 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
92 login_form
= dict((k
.encode('utf-8'), v
.encode('utf-8')) for k
,v
in login_form_strs
.items())
93 login_data
= compat_urllib_parse
.urlencode(login_form
).encode('ascii')
95 req
= compat_urllib_request
.Request(self
._LOGIN
_URL
, login_data
)
96 login_results
= self
._download
_webpage
(
98 note
=u
'Logging in', errnote
=u
'unable to log in', fatal
=False)
99 if login_results
is False:
101 if re
.search(r
'(?i)<form[^>]* id="gaia_loginform"', login_results
) is not None:
102 self
._downloader
.report_warning(u
'unable to log in: bad username or password')
106 def _confirm_age(self
):
109 'action_confirm': 'Confirm',
111 req
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
))
113 self
._download
_webpage
(
115 note
=u
'Confirming age', errnote
=u
'Unable to confirm age')
118 def _real_initialize(self
):
119 if self
._downloader
is None:
121 if not self
._set
_language
():
123 if not self
._login
():
128 class YoutubeIE(YoutubeBaseInfoExtractor
, SubtitlesInfoExtractor
):
129 IE_DESC
= u
'YouTube.com'
130 _VALID_URL
= r
"""(?x)^
132 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
133 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
134 tube\.majestyc\.net/|
135 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
136 (?:.*?\#/)? # handle anchor (#/) redirect urls
137 (?: # the various things that can precede the ID:
138 (?:(?:v|embed|e)/) # v/ or embed/ or e/
139 |(?: # or the v= param in all its forms
140 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
141 (?:\?|\#!?) # the params delimiter ? or # or #!
142 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
146 |youtu\.be/ # just youtu.be/xxxx
148 )? # all until now is optional -> you can pass the naked ID
149 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
150 (?(1).+)? # if we found the ID, everything can follow
152 _NEXT_URL_RE
= r
'[\?&]next_url=([^&]+)'
153 # Listed in order of quality
154 _available_formats
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
155 # Apple HTTP Live Streaming
156 '96', '95', '94', '93', '92', '132', '151',
158 '85', '84', '102', '83', '101', '82', '100',
160 '138', '137', '248', '136', '247', '135', '246',
161 '245', '244', '134', '243', '133', '242', '160',
163 '141', '172', '140', '171', '139',
165 _available_formats_prefer_free
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
166 # Apple HTTP Live Streaming
167 '96', '95', '94', '93', '92', '132', '151',
169 '85', '102', '84', '101', '83', '100', '82',
171 '138', '248', '137', '247', '136', '246', '245',
172 '244', '135', '243', '134', '242', '133', '160',
174 '172', '141', '171', '140', '139',
176 _video_formats_map
= {
177 'flv': ['35', '34', '6', '5'],
178 '3gp': ['36', '17', '13'],
179 'mp4': ['38', '37', '22', '18'],
180 'webm': ['46', '45', '44', '43'],
182 _video_extensions
= {
204 # Apple HTTP Live Streaming
238 _video_dimensions
= {
320 u
"url": u
"http://www.youtube.com/watch?v=BaW_jenozKc",
321 u
"file": u
"BaW_jenozKc.mp4",
323 u
"title": u
"youtube-dl test video \"'/\\ä↭𝕐",
324 u
"uploader": u
"Philipp Hagemeister",
325 u
"uploader_id": u
"phihag",
326 u
"upload_date": u
"20121002",
327 u
"description": u
"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
331 u
"url": u
"http://www.youtube.com/watch?v=UxxajLWwzqY",
332 u
"file": u
"UxxajLWwzqY.mp4",
333 u
"note": u
"Test generic use_cipher_signature video (#897)",
335 u
"upload_date": u
"20120506",
336 u
"title": u
"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
337 u
"description": u
"md5:5b292926389560516e384ac437c0ec07",
338 u
"uploader": u
"Icona Pop",
339 u
"uploader_id": u
"IconaPop"
343 u
"url": u
"https://www.youtube.com/watch?v=07FYdnEawAQ",
344 u
"file": u
"07FYdnEawAQ.mp4",
345 u
"note": u
"Test VEVO video with age protection (#956)",
347 u
"upload_date": u
"20130703",
348 u
"title": u
"Justin Timberlake - Tunnel Vision (Explicit)",
349 u
"description": u
"md5:64249768eec3bc4276236606ea996373",
350 u
"uploader": u
"justintimberlakeVEVO",
351 u
"uploader_id": u
"justintimberlakeVEVO"
355 u
"url": u
"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
356 u
"file": u
"yZIXLfi8CZQ.mp4",
357 u
"note": u
"Embed-only video (#1746)",
359 u
"upload_date": u
"20120608",
360 u
"title": u
"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
361 u
"description": u
"md5:09b78bd971f1e3e289601dfba15ca4f7",
362 u
"uploader": u
"SET India",
363 u
"uploader_id": u
"setindia"
370 def suitable(cls
, url
):
371 """Receives a URL and returns True if suitable for this IE."""
372 if YoutubePlaylistIE
.suitable(url
): return False
373 return re
.match(cls
._VALID
_URL
, url
) is not None
375 def __init__(self
, *args
, **kwargs
):
376 super(YoutubeIE
, self
).__init
__(*args
, **kwargs
)
377 self
._player
_cache
= {}
379 def report_video_info_webpage_download(self
, video_id
):
380 """Report attempt to download video info webpage."""
381 self
.to_screen(u
'%s: Downloading video info webpage' % video_id
)
383 def report_information_extraction(self
, video_id
):
384 """Report attempt to extract video information."""
385 self
.to_screen(u
'%s: Extracting video information' % video_id
)
387 def report_unavailable_format(self
, video_id
, format
):
388 """Report extracted video URL."""
389 self
.to_screen(u
'%s: Format %s not available' % (video_id
, format
))
391 def report_rtmp_download(self
):
392 """Indicate the download will use the RTMP protocol."""
393 self
.to_screen(u
'RTMP download detected')
395 def _extract_signature_function(self
, video_id
, player_url
, slen
):
396 id_m
= re
.match(r
'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
398 player_type
= id_m
.group('ext')
399 player_id
= id_m
.group('id')
401 # Read from filesystem cache
402 func_id
= '%s_%s_%d' % (player_type
, player_id
, slen
)
403 assert os
.path
.basename(func_id
) == func_id
404 cache_dir
= get_cachedir(self
._downloader
.params
)
406 cache_enabled
= cache_dir
is not None
408 cache_fn
= os
.path
.join(os
.path
.expanduser(cache_dir
),
412 with io
.open(cache_fn
, 'r', encoding
='utf-8') as cachef
:
413 cache_spec
= json
.load(cachef
)
414 return lambda s
: u
''.join(s
[i
] for i
in cache_spec
)
416 pass # No cache available
418 if player_type
== 'js':
419 code
= self
._download
_webpage
(
420 player_url
, video_id
,
421 note
=u
'Downloading %s player %s' % (player_type
, player_id
),
422 errnote
=u
'Download of %s failed' % player_url
)
423 res
= self
._parse
_sig
_js
(code
)
424 elif player_type
== 'swf':
425 urlh
= self
._request
_webpage
(
426 player_url
, video_id
,
427 note
=u
'Downloading %s player %s' % (player_type
, player_id
),
428 errnote
=u
'Download of %s failed' % player_url
)
430 res
= self
._parse
_sig
_swf
(code
)
432 assert False, 'Invalid player type %r' % player_type
436 test_string
= u
''.join(map(compat_chr
, range(slen
)))
437 cache_res
= res(test_string
)
438 cache_spec
= [ord(c
) for c
in cache_res
]
440 os
.makedirs(os
.path
.dirname(cache_fn
))
441 except OSError as ose
:
442 if ose
.errno
!= errno
.EEXIST
:
444 write_json_file(cache_spec
, cache_fn
)
446 tb
= traceback
.format_exc()
447 self
._downloader
.report_warning(
448 u
'Writing cache to %r failed: %s' % (cache_fn
, tb
))
452 def _print_sig_code(self
, func
, slen
):
453 def gen_sig_code(idxs
):
454 def _genslice(start
, end
, step
):
455 starts
= u
'' if start
== 0 else str(start
)
456 ends
= (u
':%d' % (end
+step
)) if end
+ step
>= 0 else u
':'
457 steps
= u
'' if step
== 1 else (u
':%d' % step
)
458 return u
's[%s%s%s]' % (starts
, ends
, steps
)
461 start
= '(Never used)' # Quelch pyflakes warnings - start will be
462 # set as soon as step is set
463 for i
, prev
in zip(idxs
[1:], idxs
[:-1]):
467 yield _genslice(start
, prev
, step
)
470 if i
- prev
in [-1, 1]:
475 yield u
's[%d]' % prev
479 yield _genslice(start
, i
, step
)
481 test_string
= u
''.join(map(compat_chr
, range(slen
)))
482 cache_res
= func(test_string
)
483 cache_spec
= [ord(c
) for c
in cache_res
]
484 expr_code
= u
' + '.join(gen_sig_code(cache_spec
))
485 code
= u
'if len(s) == %d:\n return %s\n' % (slen
, expr_code
)
486 self
.to_screen(u
'Extracted signature function:\n' + code
)
488 def _parse_sig_js(self
, jscode
):
489 funcname
= self
._search
_regex
(
490 r
'signature=([a-zA-Z]+)', jscode
,
491 u
'Initial JS player signature function name')
496 return string
.lowercase
.index(varname
)
498 def interpret_statement(stmt
, local_vars
, allow_recursion
=20):
499 if allow_recursion
< 0:
500 raise ExtractorError(u
'Recursion limit reached')
502 if stmt
.startswith(u
'var '):
503 stmt
= stmt
[len(u
'var '):]
504 ass_m
= re
.match(r
'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
505 r
'=(?P<expr>.*)$', stmt
)
507 if ass_m
.groupdict().get('index'):
509 lvar
= local_vars
[ass_m
.group('out')]
510 idx
= interpret_expression(ass_m
.group('index'),
511 local_vars
, allow_recursion
)
512 assert isinstance(idx
, int)
515 expr
= ass_m
.group('expr')
518 local_vars
[ass_m
.group('out')] = val
520 expr
= ass_m
.group('expr')
521 elif stmt
.startswith(u
'return '):
523 expr
= stmt
[len(u
'return '):]
525 raise ExtractorError(
526 u
'Cannot determine left side of statement in %r' % stmt
)
528 v
= interpret_expression(expr
, local_vars
, allow_recursion
)
531 def interpret_expression(expr
, local_vars
, allow_recursion
):
536 return local_vars
[expr
]
538 m
= re
.match(r
'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr
)
540 member
= m
.group('member')
541 val
= local_vars
[m
.group('in')]
542 if member
== 'split("")':
544 if member
== 'join("")':
546 if member
== 'length':
548 if member
== 'reverse()':
550 slice_m
= re
.match(r
'slice\((?P<idx>.*)\)', member
)
552 idx
= interpret_expression(
553 slice_m
.group('idx'), local_vars
, allow_recursion
-1)
557 r
'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr
)
559 val
= local_vars
[m
.group('in')]
560 idx
= interpret_expression(m
.group('idx'), local_vars
,
564 m
= re
.match(r
'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr
)
566 a
= interpret_expression(m
.group('a'),
567 local_vars
, allow_recursion
)
568 b
= interpret_expression(m
.group('b'),
569 local_vars
, allow_recursion
)
573 r
'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr
)
575 fname
= m
.group('func')
576 if fname
not in functions
:
577 functions
[fname
] = extract_function(fname
)
578 argvals
= [int(v
) if v
.isdigit() else local_vars
[v
]
579 for v
in m
.group('args').split(',')]
580 return functions
[fname
](argvals
)
581 raise ExtractorError(u
'Unsupported JS expression %r' % expr
)
583 def extract_function(funcname
):
585 r
'function ' + re
.escape(funcname
) +
586 r
'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
588 argnames
= func_m
.group('args').split(',')
591 local_vars
= dict(zip(argnames
, args
))
592 for stmt
in func_m
.group('code').split(';'):
593 res
= interpret_statement(stmt
, local_vars
)
597 initial_function
= extract_function(funcname
)
598 return lambda s
: initial_function([s
])
600 def _parse_sig_swf(self
, file_contents
):
601 if file_contents
[1:3] != b
'WS':
602 raise ExtractorError(
603 u
'Not an SWF file; header is %r' % file_contents
[:3])
604 if file_contents
[:1] == b
'C':
605 content
= zlib
.decompress(file_contents
[8:])
607 raise NotImplementedError(u
'Unsupported compression format %r' %
610 def extract_tags(content
):
612 while pos
< len(content
):
613 header16
= struct
.unpack('<H', content
[pos
:pos
+2])[0]
615 tag_code
= header16
>> 6
616 tag_len
= header16
& 0x3f
618 tag_len
= struct
.unpack('<I', content
[pos
:pos
+4])[0]
620 assert pos
+tag_len
<= len(content
)
621 yield (tag_code
, content
[pos
:pos
+tag_len
])
625 for tag_code
, tag
in extract_tags(content
)
627 p
= code_tag
.index(b
'\0', 4) + 1
628 code_reader
= io
.BytesIO(code_tag
[p
:])
630 # Parse ABC (AVM2 ByteCode)
631 def read_int(reader
=None):
639 b
= struct
.unpack('<B', buf
)[0]
640 res
= res |
((b
& 0x7f) << shift
)
646 def u30(reader
=None):
647 res
= read_int(reader
)
648 assert res
& 0xf0000000 == 0
652 def s32(reader
=None):
654 if v
& 0x80000000 != 0:
655 v
= - ((v ^
0xffffffff) + 1)
658 def read_string(reader
=None):
662 resb
= reader
.read(slen
)
663 assert len(resb
) == slen
664 return resb
.decode('utf-8')
666 def read_bytes(count
, reader
=None):
669 resb
= reader
.read(count
)
670 assert len(resb
) == count
673 def read_byte(reader
=None):
674 resb
= read_bytes(1, reader
=reader
)
675 res
= struct
.unpack('<B', resb
)[0]
678 # minor_version + major_version
683 for _c
in range(1, int_count
):
686 for _c
in range(1, uint_count
):
689 read_bytes((double_count
-1) * 8)
691 constant_strings
= [u
'']
692 for _c
in range(1, string_count
):
694 constant_strings
.append(s
)
695 namespace_count
= u30()
696 for _c
in range(1, namespace_count
):
700 for _c
in range(1, ns_set_count
):
702 for _c2
in range(count
):
704 multiname_count
= u30()
713 0x0e: 2, # MultinameA
714 0x1b: 1, # MultinameL
715 0x1c: 1, # MultinameLA
718 for _c
in range(1, multiname_count
):
720 assert kind
in MULTINAME_SIZES
, u
'Invalid multiname kind %r' % kind
722 u30() # namespace_idx
724 multinames
.append(constant_strings
[name_idx
])
726 multinames
.append('[MULTINAME kind: %d]' % kind
)
727 for _c2
in range(MULTINAME_SIZES
[kind
]):
732 MethodInfo
= collections
.namedtuple(
734 ['NEED_ARGUMENTS', 'NEED_REST'])
736 for method_id
in range(method_count
):
739 for _
in range(param_count
):
741 u30() # name index (always 0 for youtube)
743 if flags
& 0x08 != 0:
746 for c
in range(option_count
):
749 if flags
& 0x80 != 0:
750 # Param names present
751 for _
in range(param_count
):
753 mi
= MethodInfo(flags
& 0x01 != 0, flags
& 0x04 != 0)
754 method_infos
.append(mi
)
757 metadata_count
= u30()
758 for _c
in range(metadata_count
):
761 for _c2
in range(item_count
):
765 def parse_traits_info():
766 trait_name_idx
= u30()
767 kind_full
= read_byte()
768 kind
= kind_full
& 0x0f
769 attrs
= kind_full
>> 4
771 if kind
in [0x00, 0x06]: # Slot or Const
773 u30() # type_name_idx
777 elif kind
in [0x01, 0x02, 0x03]: # Method / Getter / Setter
780 methods
[multinames
[trait_name_idx
]] = method_idx
781 elif kind
== 0x04: # Class
784 elif kind
== 0x05: # Function
787 methods
[function_idx
] = multinames
[trait_name_idx
]
789 raise ExtractorError(u
'Unsupported trait kind %d' % kind
)
791 if attrs
& 0x4 != 0: # Metadata present
792 metadata_count
= u30()
793 for _c3
in range(metadata_count
):
794 u30() # metadata index
799 TARGET_CLASSNAME
= u
'SignatureDecipher'
800 searched_idx
= multinames
.index(TARGET_CLASSNAME
)
801 searched_class_id
= None
803 for class_id
in range(class_count
):
805 if name_idx
== searched_idx
:
806 # We found the class we're looking for!
807 searched_class_id
= class_id
808 u30() # super_name idx
810 if flags
& 0x08 != 0: # Protected namespace is present
811 u30() # protected_ns_idx
813 for _c2
in range(intrf_count
):
817 for _c2
in range(trait_count
):
820 if searched_class_id
is None:
821 raise ExtractorError(u
'Target class %r not found' %
826 for class_id
in range(class_count
):
829 for _c2
in range(trait_count
):
830 trait_methods
= parse_traits_info()
831 if class_id
== searched_class_id
:
832 method_names
.update(trait_methods
.items())
833 method_idxs
.update(dict(
835 for name
, idx
in trait_methods
.items()))
839 for _c
in range(script_count
):
842 for _c2
in range(trait_count
):
846 method_body_count
= u30()
847 Method
= collections
.namedtuple('Method', ['code', 'local_count'])
849 for _c
in range(method_body_count
):
853 u30() # init_scope_depth
854 u30() # max_scope_depth
856 code
= read_bytes(code_length
)
857 if method_idx
in method_idxs
:
858 m
= Method(code
, local_count
)
859 methods
[method_idxs
[method_idx
]] = m
860 exception_count
= u30()
861 for _c2
in range(exception_count
):
868 for _c2
in range(trait_count
):
871 assert p
+ code_reader
.tell() == len(code_tag
)
872 assert len(methods
) == len(method_idxs
)
874 method_pyfunctions
= {}
876 def extract_function(func_name
):
877 if func_name
in method_pyfunctions
:
878 return method_pyfunctions
[func_name
]
879 if func_name
not in methods
:
880 raise ExtractorError(u
'Cannot find function %r' % func_name
)
881 m
= methods
[func_name
]
884 registers
= ['(this)'] + list(args
) + [None] * m
.local_count
886 coder
= io
.BytesIO(m
.code
)
888 opcode
= struct
.unpack('!B', coder
.read(1))[0]
889 if opcode
== 36: # pushbyte
890 v
= struct
.unpack('!B', coder
.read(1))[0]
892 elif opcode
== 44: # pushstring
894 stack
.append(constant_strings
[idx
])
895 elif opcode
== 48: # pushscope
896 # We don't implement the scope register, so we'll just
897 # ignore the popped value
899 elif opcode
== 70: # callproperty
901 mname
= multinames
[index
]
902 arg_count
= u30(coder
)
903 args
= list(reversed(
904 [stack
.pop() for _
in range(arg_count
)]))
906 if mname
== u
'split':
907 assert len(args
) == 1
908 assert isinstance(args
[0], compat_str
)
909 assert isinstance(obj
, compat_str
)
913 res
= obj
.split(args
[0])
915 elif mname
== u
'slice':
916 assert len(args
) == 1
917 assert isinstance(args
[0], int)
918 assert isinstance(obj
, list)
921 elif mname
== u
'join':
922 assert len(args
) == 1
923 assert isinstance(args
[0], compat_str
)
924 assert isinstance(obj
, list)
925 res
= args
[0].join(obj
)
927 elif mname
in method_pyfunctions
:
928 stack
.append(method_pyfunctions
[mname
](args
))
930 raise NotImplementedError(
931 u
'Unsupported property %r on %r'
933 elif opcode
== 72: # returnvalue
936 elif opcode
== 79: # callpropvoid
938 mname
= multinames
[index
]
939 arg_count
= u30(coder
)
940 args
= list(reversed(
941 [stack
.pop() for _
in range(arg_count
)]))
943 if mname
== u
'reverse':
944 assert isinstance(obj
, list)
947 raise NotImplementedError(
948 u
'Unsupported (void) property %r on %r'
950 elif opcode
== 93: # findpropstrict
952 mname
= multinames
[index
]
953 res
= extract_function(mname
)
955 elif opcode
== 97: # setproperty
960 assert isinstance(obj
, list)
961 assert isinstance(idx
, int)
963 elif opcode
== 98: # getlocal
965 stack
.append(registers
[index
])
966 elif opcode
== 99: # setlocal
969 registers
[index
] = value
970 elif opcode
== 102: # getproperty
972 pname
= multinames
[index
]
973 if pname
== u
'length':
975 assert isinstance(obj
, list)
976 stack
.append(len(obj
))
977 else: # Assume attribute access
979 assert isinstance(idx
, int)
981 assert isinstance(obj
, list)
982 stack
.append(obj
[idx
])
983 elif opcode
== 128: # coerce
985 elif opcode
== 133: # coerce_s
986 assert isinstance(stack
[-1], (type(None), compat_str
))
987 elif opcode
== 164: # modulo
990 res
= value1
% value2
992 elif opcode
== 208: # getlocal_0
993 stack
.append(registers
[0])
994 elif opcode
== 209: # getlocal_1
995 stack
.append(registers
[1])
996 elif opcode
== 210: # getlocal_2
997 stack
.append(registers
[2])
998 elif opcode
== 211: # getlocal_3
999 stack
.append(registers
[3])
1000 elif opcode
== 214: # setlocal_2
1001 registers
[2] = stack
.pop()
1002 elif opcode
== 215: # setlocal_3
1003 registers
[3] = stack
.pop()
1005 raise NotImplementedError(
1006 u
'Unsupported opcode %d' % opcode
)
1008 method_pyfunctions
[func_name
] = resfunc
1011 initial_function
= extract_function(u
'decipher')
1012 return lambda s
: initial_function([s
])
1014 def _decrypt_signature(self
, s
, video_id
, player_url
, age_gate
=False):
1015 """Turn the encrypted s field into a working signature"""
1017 if player_url
is not None:
1018 if player_url
.startswith(u
'//'):
1019 player_url
= u
'https:' + player_url
1021 player_id
= (player_url
, len(s
))
1022 if player_id
not in self
._player
_cache
:
1023 func
= self
._extract
_signature
_function
(
1024 video_id
, player_url
, len(s
)
1026 self
._player
_cache
[player_id
] = func
1027 func
= self
._player
_cache
[player_id
]
1028 if self
._downloader
.params
.get('youtube_print_sig_code'):
1029 self
._print
_sig
_code
(func
, len(s
))
1032 tb
= traceback
.format_exc()
1033 self
._downloader
.report_warning(
1034 u
'Automatic signature extraction failed: ' + tb
)
1036 self
._downloader
.report_warning(
1037 u
'Warning: Falling back to static signature algorithm')
1039 return self
._static
_decrypt
_signature
(
1040 s
, video_id
, player_url
, age_gate
)
1042 def _static_decrypt_signature(self
, s
, video_id
, player_url
, age_gate
):
1044 # The videos with age protection use another player, so the
1045 # algorithms can be different.
1047 return s
[2:63] + s
[82] + s
[64:82] + s
[63]
1050 return s
[86:29:-1] + s
[88] + s
[28:5:-1]
1052 return s
[25] + s
[3:25] + s
[0] + s
[26:42] + s
[79] + s
[43:79] + s
[91] + s
[80:83]
1054 return s
[84:27:-1] + s
[86] + s
[26:5:-1]
1056 return s
[25] + s
[3:25] + s
[2] + s
[26:40] + s
[77] + s
[41:77] + s
[89] + s
[78:81]
1058 return s
[84:78:-1] + s
[87] + s
[77:60:-1] + s
[0] + s
[59:3:-1]
1060 return s
[7:28] + s
[87] + s
[29:45] + s
[55] + s
[46:55] + s
[2] + s
[56:87] + s
[28]
1062 return s
[6:27] + s
[4] + s
[28:39] + s
[27] + s
[40:59] + s
[2] + s
[60:]
1064 return s
[80:72:-1] + s
[16] + s
[71:39:-1] + s
[72] + s
[38:16:-1] + s
[82] + s
[15::-1]
1066 return s
[3:11] + s
[0] + s
[12:55] + s
[84] + s
[56:84]
1068 return s
[78:70:-1] + s
[14] + s
[69:37:-1] + s
[70] + s
[36:14:-1] + s
[80] + s
[:14][::-1]
1070 return s
[80:63:-1] + s
[0] + s
[62:0:-1] + s
[63]
1072 return s
[80:37:-1] + s
[7] + s
[36:7:-1] + s
[0] + s
[6:0:-1] + s
[37]
1074 return s
[56] + s
[79:56:-1] + s
[41] + s
[55:41:-1] + s
[80] + s
[40:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
1076 return s
[1:19] + s
[0] + s
[20:68] + s
[19] + s
[69:80]
1078 return s
[54] + s
[77:54:-1] + s
[39] + s
[53:39:-1] + s
[78] + s
[38:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
1081 raise ExtractorError(u
'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s
)))
1083 def _get_available_subtitles(self
, video_id
, webpage
):
1085 sub_list
= self
._download
_webpage
(
1086 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
,
1087 video_id
, note
=False)
1088 except ExtractorError
as err
:
1089 self
._downloader
.report_warning(u
'unable to download video subtitles: %s' % compat_str(err
))
1091 lang_list
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list
)
1096 params
= compat_urllib_parse
.urlencode({
1099 'fmt': self
._downloader
.params
.get('subtitlesformat', 'srt'),
1100 'name': l
[0].encode('utf-8'),
1102 url
= u
'http://www.youtube.com/api/timedtext?' + params
1103 sub_lang_list
[lang
] = url
1104 if not sub_lang_list
:
1105 self
._downloader
.report_warning(u
'video doesn\'t have subtitles')
1107 return sub_lang_list
1109 def _get_available_automatic_caption(self
, video_id
, webpage
):
1110 """We need the webpage for getting the captions url, pass it as an
1111 argument to speed up the process."""
1112 sub_format
= self
._downloader
.params
.get('subtitlesformat', 'srt')
1113 self
.to_screen(u
'%s: Looking for automatic captions' % video_id
)
1114 mobj
= re
.search(r
';ytplayer.config = ({.*?});', webpage
)
1115 err_msg
= u
'Couldn\'t find automatic captions for %s' % video_id
1117 self
._downloader
.report_warning(err_msg
)
1119 player_config
= json
.loads(mobj
.group(1))
1121 args
= player_config
[u
'args']
1122 caption_url
= args
[u
'ttsurl']
1123 timestamp
= args
[u
'timestamp']
1124 # We get the available subtitles
1125 list_params
= compat_urllib_parse
.urlencode({
1130 list_url
= caption_url
+ '&' + list_params
1131 caption_list
= self
._download
_xml
(list_url
, video_id
)
1132 original_lang_node
= caption_list
.find('track')
1133 if original_lang_node
is None or original_lang_node
.attrib
.get('kind') != 'asr' :
1134 self
._downloader
.report_warning(u
'Video doesn\'t have automatic captions')
1136 original_lang
= original_lang_node
.attrib
['lang_code']
1139 for lang_node
in caption_list
.findall('target'):
1140 sub_lang
= lang_node
.attrib
['lang_code']
1141 params
= compat_urllib_parse
.urlencode({
1142 'lang': original_lang
,
1148 sub_lang_list
[sub_lang
] = caption_url
+ '&' + params
1149 return sub_lang_list
1150 # An extractor error can be raise by the download process if there are
1151 # no automatic captions but there are subtitles
1152 except (KeyError, ExtractorError
):
1153 self
._downloader
.report_warning(err_msg
)
1156 def _print_formats(self
, formats
):
1157 print('Available formats:')
1159 print('%s\t:\t%s\t[%s]%s' %(x
, self
._video
_extensions
.get(x
, 'flv'),
1160 self
._video
_dimensions
.get(x
, '???'),
1161 ' ('+self
._special
_itags
[x
]+')' if x
in self
._special
_itags
else ''))
1163 def _extract_id(self
, url
):
1164 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
1166 raise ExtractorError(u
'Invalid URL: %s' % url
)
1167 video_id
= mobj
.group(2)
1170 def _get_video_url_list(self
, url_map
):
1172 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1173 with the requested formats.
1175 req_format
= self
._downloader
.params
.get('format', None)
1176 format_limit
= self
._downloader
.params
.get('format_limit', None)
1177 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
1178 if format_limit
is not None and format_limit
in available_formats
:
1179 format_list
= available_formats
[available_formats
.index(format_limit
):]
1181 format_list
= available_formats
1182 existing_formats
= [x
for x
in format_list
if x
in url_map
]
1183 if len(existing_formats
) == 0:
1184 raise ExtractorError(u
'no known formats available for video')
1185 if self
._downloader
.params
.get('listformats', None):
1186 self
._print
_formats
(existing_formats
)
1188 if req_format
is None or req_format
== 'best':
1189 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
1190 elif req_format
== 'worst':
1191 video_url_list
= [(existing_formats
[-1], url_map
[existing_formats
[-1]])] # worst quality
1192 elif req_format
in ('-1', 'all'):
1193 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
1195 # Specific formats. We pick the first in a slash-delimeted sequence.
1196 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1197 # available in the specified format. For example,
1198 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1199 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1200 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1201 req_formats
= req_format
.split('/')
1202 video_url_list
= None
1203 for rf
in req_formats
:
1205 video_url_list
= [(rf
, url_map
[rf
])]
1207 if rf
in self
._video
_formats
_map
:
1208 for srf
in self
._video
_formats
_map
[rf
]:
1210 video_url_list
= [(srf
, url_map
[srf
])]
1215 if video_url_list
is None:
1216 raise ExtractorError(u
'requested format not available')
1217 return video_url_list
1219 def _extract_from_m3u8(self
, manifest_url
, video_id
):
1221 def _get_urls(_manifest
):
1222 lines
= _manifest
.split('\n')
1223 urls
= filter(lambda l
: l
and not l
.startswith('#'),
1226 manifest
= self
._download
_webpage
(manifest_url
, video_id
, u
'Downloading formats manifest')
1227 formats_urls
= _get_urls(manifest
)
1228 for format_url
in formats_urls
:
1229 itag
= self
._search
_regex
(r
'itag/(\d+?)/', format_url
, 'itag')
1230 url_map
[itag
] = format_url
1233 def _extract_annotations(self
, video_id
):
1234 url
= 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1235 return self
._download
_webpage
(url
, video_id
, note
=u
'Searching for annotations.', errnote
=u
'Unable to download video annotations.')
1237 def _real_extract(self
, url
):
1238 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1239 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
1241 url
= 'https://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/')
1242 video_id
= self
._extract
_id
(url
)
1245 url
= 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1246 video_webpage
= self
._download
_webpage
(url
, video_id
)
1248 # Attempt to extract SWF player URL
1249 mobj
= re
.search(r
'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
1250 if mobj
is not None:
1251 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
1256 self
.report_video_info_webpage_download(video_id
)
1257 if re
.search(r
'player-age-gate-content">', video_webpage
) is not None:
1258 self
.report_age_confirmation()
1260 # We simulate the access to the video from www.youtube.com/v/{video_id}
1261 # this can be viewed without login into Youtube
1262 data
= compat_urllib_parse
.urlencode({'video_id': video_id
,
1263 'el': 'player_embedded',
1266 'eurl': 'https://youtube.googleapis.com/v/' + video_id
,
1270 video_info_url
= 'https://www.youtube.com/get_video_info?' + data
1271 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
1273 errnote
='unable to download video info webpage')
1274 video_info
= compat_parse_qs(video_info_webpage
)
1277 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1278 video_info_url
= ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1279 % (video_id
, el_type
))
1280 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
1282 errnote
='unable to download video info webpage')
1283 video_info
= compat_parse_qs(video_info_webpage
)
1284 if 'token' in video_info
:
1286 if 'token' not in video_info
:
1287 if 'reason' in video_info
:
1288 raise ExtractorError(u
'YouTube said: %s' % video_info
['reason'][0], expected
=True)
1290 raise ExtractorError(u
'"token" parameter not in video info for unknown reason')
1292 if 'view_count' in video_info
:
1293 view_count
= int(video_info
['view_count'][0])
1297 # Check for "rental" videos
1298 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
1299 raise ExtractorError(u
'"rental" videos not supported')
1301 # Start extracting information
1302 self
.report_information_extraction(video_id
)
1305 if 'author' not in video_info
:
1306 raise ExtractorError(u
'Unable to extract uploader name')
1307 video_uploader
= compat_urllib_parse
.unquote_plus(video_info
['author'][0])
1310 video_uploader_id
= None
1311 mobj
= re
.search(r
'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage
)
1312 if mobj
is not None:
1313 video_uploader_id
= mobj
.group(1)
1315 self
._downloader
.report_warning(u
'unable to extract uploader nickname')
1318 if 'title' in video_info
:
1319 video_title
= compat_urllib_parse
.unquote_plus(video_info
['title'][0])
1321 self
._downloader
.report_warning(u
'Unable to extract video title')
1325 # We try first to get a high quality image:
1326 m_thumb
= re
.search(r
'<span itemprop="thumbnail".*?href="(.*?)">',
1327 video_webpage
, re
.DOTALL
)
1328 if m_thumb
is not None:
1329 video_thumbnail
= m_thumb
.group(1)
1330 elif 'thumbnail_url' not in video_info
:
1331 self
._downloader
.report_warning(u
'unable to extract video thumbnail')
1332 video_thumbnail
= None
1333 else: # don't panic if we can't find it
1334 video_thumbnail
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0])
1338 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
1339 if mobj
is not None:
1340 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
1341 upload_date
= unified_strdate(upload_date
)
1344 video_description
= get_element_by_id("eow-description", video_webpage
)
1345 if video_description
:
1346 video_description
= re
.sub(r
'''(?x)
1348 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1350 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1351 class="yt-uix-redirect-link"\s*>
1354 ''', r
'\1', video_description
)
1355 video_description
= clean_html(video_description
)
1357 fd_mobj
= re
.search(r
'<meta name="description" content="([^"]+)"', video_webpage
)
1359 video_description
= unescapeHTML(fd_mobj
.group(1))
1361 video_description
= u
''
1363 def _extract_count(klass
):
1364 count
= self
._search
_regex
(
1365 r
'class="%s">([\d,]+)</span>' % re
.escape(klass
),
1366 video_webpage
, klass
, default
=None)
1367 if count
is not None:
1368 return int(count
.replace(',', ''))
1370 like_count
= _extract_count(u
'likes-count')
1371 dislike_count
= _extract_count(u
'dislikes-count')
1374 video_subtitles
= self
.extract_subtitles(video_id
, video_webpage
)
1376 if self
._downloader
.params
.get('listsubtitles', False):
1377 self
._list
_available
_subtitles
(video_id
, video_webpage
)
1380 if 'length_seconds' not in video_info
:
1381 self
._downloader
.report_warning(u
'unable to extract video duration')
1382 video_duration
= None
1384 video_duration
= int(compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0]))
1387 video_annotations
= None
1388 if self
._downloader
.params
.get('writeannotations', False):
1389 video_annotations
= self
._extract
_annotations
(video_id
)
1391 # Decide which formats to download
1394 mobj
= re
.search(r
';ytplayer.config = ({.*?});', video_webpage
)
1396 raise ValueError('Could not find vevo ID')
1397 info
= json
.loads(mobj
.group(1))
1399 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1400 # this signatures are encrypted
1401 if 'url_encoded_fmt_stream_map' not in args
:
1402 raise ValueError(u
'No stream_map present') # caught below
1403 re_signature
= re
.compile(r
'[&,]s=')
1404 m_s
= re_signature
.search(args
['url_encoded_fmt_stream_map'])
1406 self
.to_screen(u
'%s: Encrypted signatures detected.' % video_id
)
1407 video_info
['url_encoded_fmt_stream_map'] = [args
['url_encoded_fmt_stream_map']]
1408 m_s
= re_signature
.search(args
.get('adaptive_fmts', u
''))
1410 if 'adaptive_fmts' in video_info
:
1411 video_info
['adaptive_fmts'][0] += ',' + args
['adaptive_fmts']
1413 video_info
['adaptive_fmts'] = [args
['adaptive_fmts']]
1417 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
1418 self
.report_rtmp_download()
1419 video_url_list
= [(None, video_info
['conn'][0])]
1420 elif len(video_info
.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info
.get('adaptive_fmts', [])) >= 1:
1421 encoded_url_map
= video_info
.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info
.get('adaptive_fmts',[''])[0]
1422 if 'rtmpe%3Dyes' in encoded_url_map
:
1423 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected
=True)
1425 for url_data_str
in encoded_url_map
.split(','):
1426 url_data
= compat_parse_qs(url_data_str
)
1427 if 'itag' in url_data
and 'url' in url_data
:
1428 url
= url_data
['url'][0]
1429 if 'sig' in url_data
:
1430 url
+= '&signature=' + url_data
['sig'][0]
1431 elif 's' in url_data
:
1432 encrypted_sig
= url_data
['s'][0]
1433 if self
._downloader
.params
.get('verbose'):
1435 if player_url
is None:
1436 player_version
= 'unknown'
1438 player_version
= self
._search
_regex
(
1439 r
'-(.+)\.swf$', player_url
,
1440 u
'flash player', fatal
=False)
1441 player_desc
= 'flash player %s' % player_version
1443 player_version
= self
._search
_regex
(
1444 r
'html5player-(.+?)\.js', video_webpage
,
1445 'html5 player', fatal
=False)
1446 player_desc
= u
'html5 player %s' % player_version
1448 parts_sizes
= u
'.'.join(compat_str(len(part
)) for part
in encrypted_sig
.split('.'))
1449 self
.to_screen(u
'encrypted signature length %d (%s), itag %s, %s' %
1450 (len(encrypted_sig
), parts_sizes
, url_data
['itag'][0], player_desc
))
1453 jsplayer_url_json
= self
._search
_regex
(
1454 r
'"assets":.+?"js":\s*("[^"]+")',
1455 video_webpage
, u
'JS player URL')
1456 player_url
= json
.loads(jsplayer_url_json
)
1458 signature
= self
._decrypt
_signature
(
1459 encrypted_sig
, video_id
, player_url
, age_gate
)
1460 url
+= '&signature=' + signature
1461 if 'ratebypass' not in url
:
1462 url
+= '&ratebypass=yes'
1463 url_map
[url_data
['itag'][0]] = url
1464 video_url_list
= self
._get
_video
_url
_list
(url_map
)
1465 if not video_url_list
:
1467 elif video_info
.get('hlsvp'):
1468 manifest_url
= video_info
['hlsvp'][0]
1469 url_map
= self
._extract
_from
_m
3u8(manifest_url
, video_id
)
1470 video_url_list
= self
._get
_video
_url
_list
(url_map
)
1471 if not video_url_list
:
1475 raise ExtractorError(u
'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1478 for itag
, video_real_url
in video_url_list
:
1480 video_extension
= self
._video
_extensions
.get(itag
, 'flv')
1482 video_format
= '{0} - {1}{2}'.format(itag
if itag
else video_extension
,
1483 self
._video
_dimensions
.get(itag
, '???'),
1484 ' ('+self
._special
_itags
[itag
]+')' if itag
in self
._special
_itags
else '')
1488 'url': video_real_url
,
1489 'uploader': video_uploader
,
1490 'uploader_id': video_uploader_id
,
1491 'upload_date': upload_date
,
1492 'title': video_title
,
1493 'ext': video_extension
,
1494 'format': video_format
,
1496 'thumbnail': video_thumbnail
,
1497 'description': video_description
,
1498 'player_url': player_url
,
1499 'subtitles': video_subtitles
,
1500 'duration': video_duration
,
1501 'age_limit': 18 if age_gate
else 0,
1502 'annotations': video_annotations
,
1503 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id
,
1504 'view_count': view_count
,
1505 'like_count': like_count
,
1506 'dislike_count': dislike_count
,
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists (including mixes and channel-upload lists).

    Matches playlist/course/artist URLs as well as watch URLs that carry a
    ``list=`` parameter.  Requires login support from the base class.
    """
    IE_DESC = u'YouTube.com playlists'
    # Group 1: playlist id taken from a query parameter (p/a/list) or /p/ path.
    # Group 2: a bare playlist id given directly.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    # Presence of this marker in a page means there is a further page to fetch.
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with whitespace/comments, so re.VERBOSE is
        # required here instead of the default matching done by the base class.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):
        # Log in (if credentials were supplied) before extraction starts.
        self._login()

    def _ids_to_results(self, ids):
        # Wrap each video id in a url-result dict pointing at the Youtube IE.
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated 'mix' playlist.

        The mixes are generated from a single video; the id of the playlist
        is just 'RD' + video_id, so the seed watch page is fetched and the
        member videos are scraped from it.
        """
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        # The title markup differs between layouts; try the long form first.
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
            get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL: a watch URL with both v= and
        # list= can mean "just this video" when --no-playlist is given.
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        # Extract the video ids from the playlist pages, following the
        # "next" pagination marker until it disappears.
        ids = []

        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

        # The last fetched page still holds the playlist metadata.
        playlist_title = self._og_search_title(page)

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for YouTube 'top list' charts addressed via a pseudo-URL.

    The pseudo-URL scheme is ``yttoplist:{channel}:{list title}``; the channel
    page is searched for a playlist link whose query matches the given title.
    """
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # Build the url-encoded title fragment and look for a playlist link
        # containing it on the channel page.
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for i in itertools.count(0):
            # NOTE(review): the message says 'mix' — looks copied from
            # _extract_mix; confirm the intended wording.
            msg = u'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
class YoutubeChannelIE(InfoExtractor):
    """Extractor for YouTube channel pages (all uploads of a channel).

    Auto-generated channels expose all videos on one page; regular channels
    are paged through the JSON ``c4_browse_ajax`` endpoint.
    """
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # Marker in the 'load more' widget HTML meaning more pages exist.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from *page*, in first-seen order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Auto-generated channels carry a telltale CSS class in the header.
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploaded videos via the GData API.

    Accepts user URLs as well as the ``ytuser:`` keyword, and pages through
    the API 50 ids at a time.
    """
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum number of results the GData API returns per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors; the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        url_results = []

        for pagenum in itertools.count(0):
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                url_results.append({
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                })

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(entries) < self._GDATA_PAGE_SIZE:
                break

        return self.playlist_result(url_results, playlist_title=username)
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor for the ``ytsearch`` keyword, backed by the GData API."""
    IE_DESC = u'YouTube.com searches'
    # %s: url-quoted query; %i: 1-based start index. 50 results per page.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # limit starts at the requested count and is tightened once the API
        # reports how many results actually exist.
        limit = n

        while (50 * pagenum) < limit:
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested count; trim the excess.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE that orders results by publication date."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    # Same API endpoint as the parent, with orderby=published appended.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season YouTube shows.

    Each season of a show is published as its own playlist, so this
    extractor returns one playlist url-result per season found on the
    show page.
    """
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        season_results = []
        for season in season_matches:
            season_url = 'https://www.youtube.com' + season.group(1)
            season_results.append(self.url_result(season_url, 'YoutubePlaylist'))
        return season_results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are per-account, so credentials are mandatory.
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Template URL for the feed endpoint; '%%s' leaves a slot for the
        # paging token filled in by _real_extract.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        # Derived from the subclass's feed name, e.g. 'youtube:subscriptions'.
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # A null paging token means this was the last page.
            if info['paging'] is None:
                break
            paging = info['paging']
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's 'watch later' list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # The watch-later list is account-specific, so use the personal feed action.
    _PERSONAL_FEED = True
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's watch history."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # NOTE(review): this pattern uses a u'' prefix while sibling classes use
    # r'' — harmless here (no ambiguous escapes), but worth normalizing.
    _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Watch history is account-specific, so use the personal feed action.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourite videos.

    The favourites page embeds an ordinary playlist, so the page is fetched
    only to discover the playlist id, and the actual extraction is delegated
    to YoutubePlaylistIE.
    """
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are per-account, so credentials are mandatory.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1902 class YoutubeTruncatedURLIE(InfoExtractor
):
1903 IE_NAME
= 'youtube:truncated_url'
1904 IE_DESC
= False # Do not list
1905 _VALID_URL
= r
'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1907 def _real_extract(self
, url
):
1908 raise ExtractorError(
1909 u
'Did you forget to quote the URL? Remember that & is a meta '
1910 u
'character in most shells, so you want to put the URL in quotes, '
1912 u
'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1913 u
' (or simply youtube-dl BaW_jenozKc ).',