14 import xml
.etree
.ElementTree
17 from .common
import InfoExtractor
, SearchInfoExtractor
18 from .subtitles
import SubtitlesInfoExtractor
25 compat_urllib_request
,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Endpoint used to submit the Google account login form.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Forces the site language to English so pages can be parsed reliably.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Age-verification confirmation endpoint.
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name used to look up credentials in the user's .netrc file.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False
48 def report_lang(self
):
49 """Report attempt to set language."""
50 self
.to_screen(u
'Setting language')
52 def _set_language(self
):
53 request
= compat_urllib_request
.Request(self
._LANG
_URL
)
56 compat_urllib_request
.urlopen(request
).read()
57 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
58 self
._downloader
.report_warning(u
'unable to set language: %s' % compat_str(err
))
63 (username
, password
) = self
._get
_login
_info
()
64 # No authentication to be performed
66 if self
._LOGIN
_REQUIRED
:
67 raise ExtractorError(u
'No login info available, needed for using %s.' % self
.IE_NAME
, expected
=True)
70 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
)
72 login_page
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
73 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
74 self
._downloader
.report_warning(u
'unable to fetch login page: %s' % compat_str(err
))
79 match
= re
.search(re
.compile(r
'<input.+?name="GALX".+?value="(.+?)"', re
.DOTALL
), login_page
)
82 match
= re
.search(re
.compile(r
'<input.+?name="dsh".+?value="(.+?)"', re
.DOTALL
), login_page
)
88 u
'continue': u
'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
92 u
'PersistentCookie': u
'yes',
94 u
'bgresponse': u
'js_disabled',
95 u
'checkConnection': u
'',
96 u
'checkedDomains': u
'youtube',
102 u
'signIn': u
'Sign in',
104 u
'service': u
'youtube',
108 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
110 login_form
= dict((k
.encode('utf-8'), v
.encode('utf-8')) for k
,v
in login_form_strs
.items())
111 login_data
= compat_urllib_parse
.urlencode(login_form
).encode('ascii')
112 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, login_data
)
115 login_results
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
116 if re
.search(r
'(?i)<form[^>]* id="gaia_loginform"', login_results
) is not None:
117 self
._downloader
.report_warning(u
'unable to log in: bad username or password')
119 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
120 self
._downloader
.report_warning(u
'unable to log in: %s' % compat_str(err
))
124 def _confirm_age(self
):
127 'action_confirm': 'Confirm',
129 request
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
))
131 self
.report_age_confirmation()
132 compat_urllib_request
.urlopen(request
).read().decode('utf-8')
133 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
134 raise ExtractorError(u
'Unable to confirm age: %s' % compat_str(err
))
137 def _real_initialize(self
):
138 if self
._downloader
is None:
140 if not self
._set
_language
():
142 if not self
._login
():
# Main extractor for individual YouTube videos; combines the login/age-gate
# helpers from YoutubeBaseInfoExtractor with subtitle support.
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
151 (?:https?://)? # http(s):// (optional)
152 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
153 tube\.majestyc\.net/|
154 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
155 (?:.*?\#/)? # handle anchor (#/) redirect urls
156 (?: # the various things that can precede the ID:
157 (?:(?:v|embed|e)/) # v/ or embed/ or e/
158 |(?: # or the v= param in all its forms
159 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
160 (?:\?|\#!?) # the params delimiter ? or # or #!
161 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
165 |youtu\.be/ # just youtu.be/xxxx
167 )? # all until now is optional -> you can pass the naked ID
168 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
169 (?(1).+)? # if we found the ID, everything can follow
171 _NEXT_URL_RE
= r
'[\?&]next_url=([^&]+)'
172 # Listed in order of quality
173 _available_formats
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
174 # Apple HTTP Live Streaming
175 '96', '95', '94', '93', '92', '132', '151',
177 '85', '84', '102', '83', '101', '82', '100',
179 '138', '137', '248', '136', '247', '135', '246',
180 '245', '244', '134', '243', '133', '242', '160',
182 '141', '172', '140', '171', '139',
184 _available_formats_prefer_free
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
185 # Apple HTTP Live Streaming
186 '96', '95', '94', '93', '92', '132', '151',
188 '85', '102', '84', '101', '83', '100', '82',
190 '138', '248', '137', '247', '136', '246', '245',
191 '244', '135', '243', '134', '242', '133', '160',
193 '172', '141', '171', '140', '139',
195 _video_formats_map
= {
196 'flv': ['35', '34', '6', '5'],
197 '3gp': ['36', '17', '13'],
198 'mp4': ['38', '37', '22', '18'],
199 'webm': ['46', '45', '44', '43'],
201 _video_extensions
= {
223 # Apple HTTP Live Streaming
255 _video_dimensions
= {
337 u
"url": u
"http://www.youtube.com/watch?v=BaW_jenozKc",
338 u
"file": u
"BaW_jenozKc.mp4",
340 u
"title": u
"youtube-dl test video \"'/\\ä↭𝕐",
341 u
"uploader": u
"Philipp Hagemeister",
342 u
"uploader_id": u
"phihag",
343 u
"upload_date": u
"20121002",
344 u
"description": u
"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
348 u
"url": u
"http://www.youtube.com/watch?v=1ltcDfZMA3U",
349 u
"file": u
"1ltcDfZMA3U.flv",
350 u
"note": u
"Test VEVO video (#897)",
352 u
"upload_date": u
"20070518",
353 u
"title": u
"Maps - It Will Find You",
354 u
"description": u
"Music video by Maps performing It Will Find You.",
355 u
"uploader": u
"MuteUSA",
356 u
"uploader_id": u
"MuteUSA"
360 u
"url": u
"http://www.youtube.com/watch?v=UxxajLWwzqY",
361 u
"file": u
"UxxajLWwzqY.mp4",
362 u
"note": u
"Test generic use_cipher_signature video (#897)",
364 u
"upload_date": u
"20120506",
365 u
"title": u
"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
366 u
"description": u
"md5:5b292926389560516e384ac437c0ec07",
367 u
"uploader": u
"Icona Pop",
368 u
"uploader_id": u
"IconaPop"
372 u
"url": u
"https://www.youtube.com/watch?v=07FYdnEawAQ",
373 u
"file": u
"07FYdnEawAQ.mp4",
374 u
"note": u
"Test VEVO video with age protection (#956)",
376 u
"upload_date": u
"20130703",
377 u
"title": u
"Justin Timberlake - Tunnel Vision (Explicit)",
378 u
"description": u
"md5:64249768eec3bc4276236606ea996373",
379 u
"uploader": u
"justintimberlakeVEVO",
380 u
"uploader_id": u
"justintimberlakeVEVO"
387 def suitable(cls
, url
):
388 """Receives a URL and returns True if suitable for this IE."""
389 if YoutubePlaylistIE
.suitable(url
): return False
390 return re
.match(cls
._VALID
_URL
, url
, re
.VERBOSE
) is not None
    def __init__(self, *args, **kwargs):
        """Initialize the extractor and the per-player signature cache."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps a player URL to its extracted signature-decryption function,
        # so each player script is downloaded and parsed at most once.
        self._player_cache = {}
396 def report_video_webpage_download(self
, video_id
):
397 """Report attempt to download video webpage."""
398 self
.to_screen(u
'%s: Downloading video webpage' % video_id
)
400 def report_video_info_webpage_download(self
, video_id
):
401 """Report attempt to download video info webpage."""
402 self
.to_screen(u
'%s: Downloading video info webpage' % video_id
)
404 def report_information_extraction(self
, video_id
):
405 """Report attempt to extract video information."""
406 self
.to_screen(u
'%s: Extracting video information' % video_id
)
408 def report_unavailable_format(self
, video_id
, format
):
409 """Report extracted video URL."""
410 self
.to_screen(u
'%s: Format %s not available' % (video_id
, format
))
412 def report_rtmp_download(self
):
413 """Indicate the download will use the RTMP protocol."""
414 self
.to_screen(u
'RTMP download detected')
416 def _extract_signature_function(self
, video_id
, player_url
, slen
):
417 id_m
= re
.match(r
'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
419 player_type
= id_m
.group('ext')
420 player_id
= id_m
.group('id')
422 # Read from filesystem cache
423 func_id
= '%s_%s_%d' % (player_type
, player_id
, slen
)
424 assert os
.path
.basename(func_id
) == func_id
425 cache_dir
= get_cachedir(self
._downloader
.params
)
427 cache_enabled
= cache_dir
is not None
429 cache_fn
= os
.path
.join(os
.path
.expanduser(cache_dir
),
433 with io
.open(cache_fn
, 'r', encoding
='utf-8') as cachef
:
434 cache_spec
= json
.load(cachef
)
435 return lambda s
: u
''.join(s
[i
] for i
in cache_spec
)
437 pass # No cache available
439 if player_type
== 'js':
440 code
= self
._download
_webpage
(
441 player_url
, video_id
,
442 note
=u
'Downloading %s player %s' % (player_type
, player_id
),
443 errnote
=u
'Download of %s failed' % player_url
)
444 res
= self
._parse
_sig
_js
(code
)
445 elif player_type
== 'swf':
446 urlh
= self
._request
_webpage
(
447 player_url
, video_id
,
448 note
=u
'Downloading %s player %s' % (player_type
, player_id
),
449 errnote
=u
'Download of %s failed' % player_url
)
451 res
= self
._parse
_sig
_swf
(code
)
453 assert False, 'Invalid player type %r' % player_type
457 test_string
= u
''.join(map(compat_chr
, range(slen
)))
458 cache_res
= res(test_string
)
459 cache_spec
= [ord(c
) for c
in cache_res
]
461 os
.makedirs(os
.path
.dirname(cache_fn
))
462 except OSError as ose
:
463 if ose
.errno
!= errno
.EEXIST
:
465 write_json_file(cache_spec
, cache_fn
)
467 tb
= traceback
.format_exc()
468 self
._downloader
.report_warning(
469 u
'Writing cache to %r failed: %s' % (cache_fn
, tb
))
473 def _print_sig_code(self
, func
, slen
):
474 def gen_sig_code(idxs
):
475 def _genslice(start
, end
, step
):
476 starts
= u
'' if start
== 0 else str(start
)
477 ends
= (u
':%d' % (end
+step
)) if end
+ step
>= 0 else u
':'
478 steps
= u
'' if step
== 1 else (u
':%d' % step
)
479 return u
's[%s%s%s]' % (starts
, ends
, steps
)
482 start
= '(Never used)' # Quelch pyflakes warnings - start will be
483 # set as soon as step is set
484 for i
, prev
in zip(idxs
[1:], idxs
[:-1]):
488 yield _genslice(start
, prev
, step
)
491 if i
- prev
in [-1, 1]:
496 yield u
's[%d]' % prev
500 yield _genslice(start
, i
, step
)
502 test_string
= u
''.join(map(compat_chr
, range(slen
)))
503 cache_res
= func(test_string
)
504 cache_spec
= [ord(c
) for c
in cache_res
]
505 expr_code
= u
' + '.join(gen_sig_code(cache_spec
))
506 code
= u
'if len(s) == %d:\n return %s\n' % (slen
, expr_code
)
507 self
.to_screen(u
'Extracted signature function:\n' + code
)
509 def _parse_sig_js(self
, jscode
):
510 funcname
= self
._search
_regex
(
511 r
'signature=([a-zA-Z]+)', jscode
,
512 u
'Initial JS player signature function name')
517 return string
.lowercase
.index(varname
)
519 def interpret_statement(stmt
, local_vars
, allow_recursion
=20):
520 if allow_recursion
< 0:
521 raise ExtractorError(u
'Recursion limit reached')
523 if stmt
.startswith(u
'var '):
524 stmt
= stmt
[len(u
'var '):]
525 ass_m
= re
.match(r
'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
526 r
'=(?P<expr>.*)$', stmt
)
528 if ass_m
.groupdict().get('index'):
530 lvar
= local_vars
[ass_m
.group('out')]
531 idx
= interpret_expression(ass_m
.group('index'),
532 local_vars
, allow_recursion
)
533 assert isinstance(idx
, int)
536 expr
= ass_m
.group('expr')
539 local_vars
[ass_m
.group('out')] = val
541 expr
= ass_m
.group('expr')
542 elif stmt
.startswith(u
'return '):
544 expr
= stmt
[len(u
'return '):]
546 raise ExtractorError(
547 u
'Cannot determine left side of statement in %r' % stmt
)
549 v
= interpret_expression(expr
, local_vars
, allow_recursion
)
552 def interpret_expression(expr
, local_vars
, allow_recursion
):
557 return local_vars
[expr
]
559 m
= re
.match(r
'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr
)
561 member
= m
.group('member')
562 val
= local_vars
[m
.group('in')]
563 if member
== 'split("")':
565 if member
== 'join("")':
567 if member
== 'length':
569 if member
== 'reverse()':
571 slice_m
= re
.match(r
'slice\((?P<idx>.*)\)', member
)
573 idx
= interpret_expression(
574 slice_m
.group('idx'), local_vars
, allow_recursion
-1)
578 r
'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr
)
580 val
= local_vars
[m
.group('in')]
581 idx
= interpret_expression(m
.group('idx'), local_vars
,
585 m
= re
.match(r
'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr
)
587 a
= interpret_expression(m
.group('a'),
588 local_vars
, allow_recursion
)
589 b
= interpret_expression(m
.group('b'),
590 local_vars
, allow_recursion
)
594 r
'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr
)
596 fname
= m
.group('func')
597 if fname
not in functions
:
598 functions
[fname
] = extract_function(fname
)
599 argvals
= [int(v
) if v
.isdigit() else local_vars
[v
]
600 for v
in m
.group('args').split(',')]
601 return functions
[fname
](argvals
)
602 raise ExtractorError(u
'Unsupported JS expression %r' % expr
)
604 def extract_function(funcname
):
606 r
'function ' + re
.escape(funcname
) +
607 r
'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
609 argnames
= func_m
.group('args').split(',')
612 local_vars
= dict(zip(argnames
, args
))
613 for stmt
in func_m
.group('code').split(';'):
614 res
= interpret_statement(stmt
, local_vars
)
618 initial_function
= extract_function(funcname
)
619 return lambda s
: initial_function([s
])
621 def _parse_sig_swf(self
, file_contents
):
622 if file_contents
[1:3] != b
'WS':
623 raise ExtractorError(
624 u
'Not an SWF file; header is %r' % file_contents
[:3])
625 if file_contents
[:1] == b
'C':
626 content
= zlib
.decompress(file_contents
[8:])
628 raise NotImplementedError(u
'Unsupported compression format %r' %
631 def extract_tags(content
):
633 while pos
< len(content
):
634 header16
= struct
.unpack('<H', content
[pos
:pos
+2])[0]
636 tag_code
= header16
>> 6
637 tag_len
= header16
& 0x3f
639 tag_len
= struct
.unpack('<I', content
[pos
:pos
+4])[0]
641 assert pos
+tag_len
<= len(content
)
642 yield (tag_code
, content
[pos
:pos
+tag_len
])
646 for tag_code
, tag
in extract_tags(content
)
648 p
= code_tag
.index(b
'\0', 4) + 1
649 code_reader
= io
.BytesIO(code_tag
[p
:])
651 # Parse ABC (AVM2 ByteCode)
652 def read_int(reader
=None):
660 b
= struct
.unpack('<B', buf
)[0]
661 res
= res |
((b
& 0x7f) << shift
)
667 def u30(reader
=None):
668 res
= read_int(reader
)
669 assert res
& 0xf0000000 == 0
673 def s32(reader
=None):
675 if v
& 0x80000000 != 0:
676 v
= - ((v ^
0xffffffff) + 1)
679 def read_string(reader
=None):
683 resb
= reader
.read(slen
)
684 assert len(resb
) == slen
685 return resb
.decode('utf-8')
687 def read_bytes(count
, reader
=None):
690 resb
= reader
.read(count
)
691 assert len(resb
) == count
694 def read_byte(reader
=None):
695 resb
= read_bytes(1, reader
=reader
)
696 res
= struct
.unpack('<B', resb
)[0]
699 # minor_version + major_version
704 for _c
in range(1, int_count
):
707 for _c
in range(1, uint_count
):
710 read_bytes((double_count
-1) * 8)
712 constant_strings
= [u
'']
713 for _c
in range(1, string_count
):
715 constant_strings
.append(s
)
716 namespace_count
= u30()
717 for _c
in range(1, namespace_count
):
721 for _c
in range(1, ns_set_count
):
723 for _c2
in range(count
):
725 multiname_count
= u30()
734 0x0e: 2, # MultinameA
735 0x1b: 1, # MultinameL
736 0x1c: 1, # MultinameLA
739 for _c
in range(1, multiname_count
):
741 assert kind
in MULTINAME_SIZES
, u
'Invalid multiname kind %r' % kind
743 u30() # namespace_idx
745 multinames
.append(constant_strings
[name_idx
])
747 multinames
.append('[MULTINAME kind: %d]' % kind
)
748 for _c2
in range(MULTINAME_SIZES
[kind
]):
753 MethodInfo
= collections
.namedtuple(
755 ['NEED_ARGUMENTS', 'NEED_REST'])
757 for method_id
in range(method_count
):
760 for _
in range(param_count
):
762 u30() # name index (always 0 for youtube)
764 if flags
& 0x08 != 0:
767 for c
in range(option_count
):
770 if flags
& 0x80 != 0:
771 # Param names present
772 for _
in range(param_count
):
774 mi
= MethodInfo(flags
& 0x01 != 0, flags
& 0x04 != 0)
775 method_infos
.append(mi
)
778 metadata_count
= u30()
779 for _c
in range(metadata_count
):
782 for _c2
in range(item_count
):
786 def parse_traits_info():
787 trait_name_idx
= u30()
788 kind_full
= read_byte()
789 kind
= kind_full
& 0x0f
790 attrs
= kind_full
>> 4
792 if kind
in [0x00, 0x06]: # Slot or Const
794 u30() # type_name_idx
798 elif kind
in [0x01, 0x02, 0x03]: # Method / Getter / Setter
801 methods
[multinames
[trait_name_idx
]] = method_idx
802 elif kind
== 0x04: # Class
805 elif kind
== 0x05: # Function
808 methods
[function_idx
] = multinames
[trait_name_idx
]
810 raise ExtractorError(u
'Unsupported trait kind %d' % kind
)
812 if attrs
& 0x4 != 0: # Metadata present
813 metadata_count
= u30()
814 for _c3
in range(metadata_count
):
815 u30() # metadata index
820 TARGET_CLASSNAME
= u
'SignatureDecipher'
821 searched_idx
= multinames
.index(TARGET_CLASSNAME
)
822 searched_class_id
= None
824 for class_id
in range(class_count
):
826 if name_idx
== searched_idx
:
827 # We found the class we're looking for!
828 searched_class_id
= class_id
829 u30() # super_name idx
831 if flags
& 0x08 != 0: # Protected namespace is present
832 u30() # protected_ns_idx
834 for _c2
in range(intrf_count
):
838 for _c2
in range(trait_count
):
841 if searched_class_id
is None:
842 raise ExtractorError(u
'Target class %r not found' %
847 for class_id
in range(class_count
):
850 for _c2
in range(trait_count
):
851 trait_methods
= parse_traits_info()
852 if class_id
== searched_class_id
:
853 method_names
.update(trait_methods
.items())
854 method_idxs
.update(dict(
856 for name
, idx
in trait_methods
.items()))
860 for _c
in range(script_count
):
863 for _c2
in range(trait_count
):
867 method_body_count
= u30()
868 Method
= collections
.namedtuple('Method', ['code', 'local_count'])
870 for _c
in range(method_body_count
):
874 u30() # init_scope_depth
875 u30() # max_scope_depth
877 code
= read_bytes(code_length
)
878 if method_idx
in method_idxs
:
879 m
= Method(code
, local_count
)
880 methods
[method_idxs
[method_idx
]] = m
881 exception_count
= u30()
882 for _c2
in range(exception_count
):
889 for _c2
in range(trait_count
):
892 assert p
+ code_reader
.tell() == len(code_tag
)
893 assert len(methods
) == len(method_idxs
)
895 method_pyfunctions
= {}
897 def extract_function(func_name
):
898 if func_name
in method_pyfunctions
:
899 return method_pyfunctions
[func_name
]
900 if func_name
not in methods
:
901 raise ExtractorError(u
'Cannot find function %r' % func_name
)
902 m
= methods
[func_name
]
905 registers
= ['(this)'] + list(args
) + [None] * m
.local_count
907 coder
= io
.BytesIO(m
.code
)
909 opcode
= struct
.unpack('!B', coder
.read(1))[0]
910 if opcode
== 36: # pushbyte
911 v
= struct
.unpack('!B', coder
.read(1))[0]
913 elif opcode
== 44: # pushstring
915 stack
.append(constant_strings
[idx
])
916 elif opcode
== 48: # pushscope
917 # We don't implement the scope register, so we'll just
918 # ignore the popped value
920 elif opcode
== 70: # callproperty
922 mname
= multinames
[index
]
923 arg_count
= u30(coder
)
924 args
= list(reversed(
925 [stack
.pop() for _
in range(arg_count
)]))
927 if mname
== u
'split':
928 assert len(args
) == 1
929 assert isinstance(args
[0], compat_str
)
930 assert isinstance(obj
, compat_str
)
934 res
= obj
.split(args
[0])
936 elif mname
== u
'slice':
937 assert len(args
) == 1
938 assert isinstance(args
[0], int)
939 assert isinstance(obj
, list)
942 elif mname
== u
'join':
943 assert len(args
) == 1
944 assert isinstance(args
[0], compat_str
)
945 assert isinstance(obj
, list)
946 res
= args
[0].join(obj
)
948 elif mname
in method_pyfunctions
:
949 stack
.append(method_pyfunctions
[mname
](args
))
951 raise NotImplementedError(
952 u
'Unsupported property %r on %r'
954 elif opcode
== 72: # returnvalue
957 elif opcode
== 79: # callpropvoid
959 mname
= multinames
[index
]
960 arg_count
= u30(coder
)
961 args
= list(reversed(
962 [stack
.pop() for _
in range(arg_count
)]))
964 if mname
== u
'reverse':
965 assert isinstance(obj
, list)
968 raise NotImplementedError(
969 u
'Unsupported (void) property %r on %r'
971 elif opcode
== 93: # findpropstrict
973 mname
= multinames
[index
]
974 res
= extract_function(mname
)
976 elif opcode
== 97: # setproperty
981 assert isinstance(obj
, list)
982 assert isinstance(idx
, int)
984 elif opcode
== 98: # getlocal
986 stack
.append(registers
[index
])
987 elif opcode
== 99: # setlocal
990 registers
[index
] = value
991 elif opcode
== 102: # getproperty
993 pname
= multinames
[index
]
994 if pname
== u
'length':
996 assert isinstance(obj
, list)
997 stack
.append(len(obj
))
998 else: # Assume attribute access
1000 assert isinstance(idx
, int)
1002 assert isinstance(obj
, list)
1003 stack
.append(obj
[idx
])
1004 elif opcode
== 128: # coerce
1006 elif opcode
== 133: # coerce_s
1007 assert isinstance(stack
[-1], (type(None), compat_str
))
1008 elif opcode
== 164: # modulo
1009 value2
= stack
.pop()
1010 value1
= stack
.pop()
1011 res
= value1
% value2
1013 elif opcode
== 208: # getlocal_0
1014 stack
.append(registers
[0])
1015 elif opcode
== 209: # getlocal_1
1016 stack
.append(registers
[1])
1017 elif opcode
== 210: # getlocal_2
1018 stack
.append(registers
[2])
1019 elif opcode
== 211: # getlocal_3
1020 stack
.append(registers
[3])
1021 elif opcode
== 214: # setlocal_2
1022 registers
[2] = stack
.pop()
1023 elif opcode
== 215: # setlocal_3
1024 registers
[3] = stack
.pop()
1026 raise NotImplementedError(
1027 u
'Unsupported opcode %d' % opcode
)
1029 method_pyfunctions
[func_name
] = resfunc
1032 initial_function
= extract_function(u
'decipher')
1033 return lambda s
: initial_function([s
])
1035 def _decrypt_signature(self
, s
, video_id
, player_url
, age_gate
=False):
1036 """Turn the encrypted s field into a working signature"""
1038 if player_url
is not None:
1040 if player_url
not in self
._player
_cache
:
1041 func
= self
._extract
_signature
_function
(
1042 video_id
, player_url
, len(s
)
1044 self
._player
_cache
[player_url
] = func
1045 func
= self
._player
_cache
[player_url
]
1046 if self
._downloader
.params
.get('youtube_print_sig_code'):
1047 self
._print
_sig
_code
(func
, len(s
))
1050 tb
= traceback
.format_exc()
1051 self
._downloader
.report_warning(
1052 u
'Automatic signature extraction failed: ' + tb
)
1054 self
._downloader
.report_warning(
1055 u
'Warning: Falling back to static signature algorithm')
1057 return self
._static
_decrypt
_signature
(
1058 s
, video_id
, player_url
, age_gate
)
1060 def _static_decrypt_signature(self
, s
, video_id
, player_url
, age_gate
):
1062 # The videos with age protection use another player, so the
1063 # algorithms can be different.
1065 return s
[2:63] + s
[82] + s
[64:82] + s
[63]
1068 return s
[86:29:-1] + s
[88] + s
[28:5:-1]
1070 return s
[25] + s
[3:25] + s
[0] + s
[26:42] + s
[79] + s
[43:79] + s
[91] + s
[80:83]
1072 return s
[84:27:-1] + s
[86] + s
[26:5:-1]
1074 return s
[25] + s
[3:25] + s
[2] + s
[26:40] + s
[77] + s
[41:77] + s
[89] + s
[78:81]
1076 return s
[84:78:-1] + s
[87] + s
[77:60:-1] + s
[0] + s
[59:3:-1]
1078 return s
[7:28] + s
[87] + s
[29:45] + s
[55] + s
[46:55] + s
[2] + s
[56:87] + s
[28]
1080 return s
[6:27] + s
[4] + s
[28:39] + s
[27] + s
[40:59] + s
[2] + s
[60:]
1082 return s
[80:72:-1] + s
[16] + s
[71:39:-1] + s
[72] + s
[38:16:-1] + s
[82] + s
[15::-1]
1084 return s
[3:11] + s
[0] + s
[12:55] + s
[84] + s
[56:84]
1086 return s
[78:70:-1] + s
[14] + s
[69:37:-1] + s
[70] + s
[36:14:-1] + s
[80] + s
[:14][::-1]
1088 return s
[80:63:-1] + s
[0] + s
[62:0:-1] + s
[63]
1090 return s
[12] + s
[79:12:-1] + s
[80] + s
[11::-1]
1092 return s
[56] + s
[79:56:-1] + s
[41] + s
[55:41:-1] + s
[80] + s
[40:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
1094 return s
[1:19] + s
[0] + s
[20:68] + s
[19] + s
[69:80]
1096 return s
[54] + s
[77:54:-1] + s
[39] + s
[53:39:-1] + s
[78] + s
[38:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
1099 raise ExtractorError(u
'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s
)))
1101 def _get_available_subtitles(self
, video_id
):
1103 sub_list
= self
._download
_webpage
(
1104 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
,
1105 video_id
, note
=False)
1106 except ExtractorError
as err
:
1107 self
._downloader
.report_warning(u
'unable to download video subtitles: %s' % compat_str(err
))
1109 lang_list
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list
)
1114 params
= compat_urllib_parse
.urlencode({
1117 'fmt': self
._downloader
.params
.get('subtitlesformat'),
1119 url
= u
'http://www.youtube.com/api/timedtext?' + params
1120 sub_lang_list
[lang
] = url
1121 if not sub_lang_list
:
1122 self
._downloader
.report_warning(u
'video doesn\'t have subtitles')
1124 return sub_lang_list
1126 def _get_available_automatic_caption(self
, video_id
, webpage
):
1127 """We need the webpage for getting the captions url, pass it as an
1128 argument to speed up the process."""
1129 sub_format
= self
._downloader
.params
.get('subtitlesformat')
1130 self
.to_screen(u
'%s: Looking for automatic captions' % video_id
)
1131 mobj
= re
.search(r
';ytplayer.config = ({.*?});', webpage
)
1132 err_msg
= u
'Couldn\'t find automatic captions for %s' % video_id
1134 self
._downloader
.report_warning(err_msg
)
1136 player_config
= json
.loads(mobj
.group(1))
1138 args
= player_config
[u
'args']
1139 caption_url
= args
[u
'ttsurl']
1140 timestamp
= args
[u
'timestamp']
1141 # We get the available subtitles
1142 list_params
= compat_urllib_parse
.urlencode({
1147 list_url
= caption_url
+ '&' + list_params
1148 list_page
= self
._download
_webpage
(list_url
, video_id
)
1149 caption_list
= xml
.etree
.ElementTree
.fromstring(list_page
.encode('utf-8'))
1150 original_lang_node
= caption_list
.find('track')
1151 if original_lang_node
.attrib
.get('kind') != 'asr' :
1152 self
._downloader
.report_warning(u
'Video doesn\'t have automatic captions')
1154 original_lang
= original_lang_node
.attrib
['lang_code']
1157 for lang_node
in caption_list
.findall('target'):
1158 sub_lang
= lang_node
.attrib
['lang_code']
1159 params
= compat_urllib_parse
.urlencode({
1160 'lang': original_lang
,
1166 sub_lang_list
[sub_lang
] = caption_url
+ '&' + params
1167 return sub_lang_list
1168 # An extractor error can be raise by the download process if there are
1169 # no automatic captions but there are subtitles
1170 except (KeyError, ExtractorError
):
1171 self
._downloader
.report_warning(err_msg
)
1174 def _print_formats(self
, formats
):
1175 print('Available formats:')
1177 print('%s\t:\t%s\t[%s]%s' %(x
, self
._video
_extensions
.get(x
, 'flv'),
1178 self
._video
_dimensions
.get(x
, '???'),
1179 ' ('+self
._special
_itags
[x
]+')' if x
in self
._special
_itags
else ''))
1181 def _extract_id(self
, url
):
1182 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
1184 raise ExtractorError(u
'Invalid URL: %s' % url
)
1185 video_id
= mobj
.group(2)
1188 def _get_video_url_list(self
, url_map
):
1190 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1191 with the requested formats.
1193 req_format
= self
._downloader
.params
.get('format', None)
1194 format_limit
= self
._downloader
.params
.get('format_limit', None)
1195 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
1196 if format_limit
is not None and format_limit
in available_formats
:
1197 format_list
= available_formats
[available_formats
.index(format_limit
):]
1199 format_list
= available_formats
1200 existing_formats
= [x
for x
in format_list
if x
in url_map
]
1201 if len(existing_formats
) == 0:
1202 raise ExtractorError(u
'no known formats available for video')
1203 if self
._downloader
.params
.get('listformats', None):
1204 self
._print
_formats
(existing_formats
)
1206 if req_format
is None or req_format
== 'best':
1207 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
1208 elif req_format
== 'worst':
1209 video_url_list
= [(existing_formats
[-1], url_map
[existing_formats
[-1]])] # worst quality
1210 elif req_format
in ('-1', 'all'):
1211 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
1213 # Specific formats. We pick the first in a slash-delimeted sequence.
1214 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1215 # available in the specified format. For example,
1216 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1217 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1218 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1219 req_formats
= req_format
.split('/')
1220 video_url_list
= None
1221 for rf
in req_formats
:
1223 video_url_list
= [(rf
, url_map
[rf
])]
1225 if rf
in self
._video
_formats
_map
:
1226 for srf
in self
._video
_formats
_map
[rf
]:
1228 video_url_list
= [(srf
, url_map
[srf
])]
1233 if video_url_list
is None:
1234 raise ExtractorError(u
'requested format not available')
1235 return video_url_list
1237 def _extract_from_m3u8(self
, manifest_url
, video_id
):
1239 def _get_urls(_manifest
):
1240 lines
= _manifest
.split('\n')
1241 urls
= filter(lambda l
: l
and not l
.startswith('#'),
1244 manifest
= self
._download
_webpage
(manifest_url
, video_id
, u
'Downloading formats manifest')
1245 formats_urls
= _get_urls(manifest
)
1246 for format_url
in formats_urls
:
1247 itag
= self
._search
_regex
(r
'itag/(\d+?)/', format_url
, 'itag')
1248 url_map
[itag
] = format_url
1251 def _real_extract(self
, url
):
1252 if re
.match(r
'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url
):
1253 self
._downloader
.report_warning(u
'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1255 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1256 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
1258 url
= 'https://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/')
1259 video_id
= self
._extract
_id
(url
)
1262 self
.report_video_webpage_download(video_id
)
1263 url
= 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1264 request
= compat_urllib_request
.Request(url
)
1266 video_webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
1267 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
1268 raise ExtractorError(u
'Unable to download video webpage: %s' % compat_str(err
))
1270 video_webpage
= video_webpage_bytes
.decode('utf-8', 'ignore')
1272 # Attempt to extract SWF player URL
1273 mobj
= re
.search(r
'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
1274 if mobj
is not None:
1275 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
1280 self
.report_video_info_webpage_download(video_id
)
1281 if re
.search(r
'player-age-gate-content">', video_webpage
) is not None:
1282 self
.report_age_confirmation()
1284 # We simulate the access to the video from www.youtube.com/v/{video_id}
1285 # this can be viewed without login into Youtube
1286 data
= compat_urllib_parse
.urlencode({'video_id': video_id
,
1290 'eurl': 'https://youtube.googleapis.com/v/' + video_id
,
1294 video_info_url
= 'https://www.youtube.com/get_video_info?' + data
1295 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
1297 errnote
='unable to download video info webpage')
1298 video_info
= compat_parse_qs(video_info_webpage
)
1301 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1302 video_info_url
= ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1303 % (video_id
, el_type
))
1304 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
1306 errnote
='unable to download video info webpage')
1307 video_info
= compat_parse_qs(video_info_webpage
)
1308 if 'token' in video_info
:
1310 if 'token' not in video_info
:
1311 if 'reason' in video_info
:
1312 raise ExtractorError(u
'YouTube said: %s' % video_info
['reason'][0], expected
=True)
1314 raise ExtractorError(u
'"token" parameter not in video info for unknown reason')
1316 # Check for "rental" videos
1317 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
1318 raise ExtractorError(u
'"rental" videos not supported')
1320 # Start extracting information
1321 self
.report_information_extraction(video_id
)
1324 if 'author' not in video_info
:
1325 raise ExtractorError(u
'Unable to extract uploader name')
1326 video_uploader
= compat_urllib_parse
.unquote_plus(video_info
['author'][0])
1329 video_uploader_id
= None
1330 mobj
= re
.search(r
'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage
)
1331 if mobj
is not None:
1332 video_uploader_id
= mobj
.group(1)
1334 self
._downloader
.report_warning(u
'unable to extract uploader nickname')
1337 if 'title' in video_info
:
1338 video_title
= compat_urllib_parse
.unquote_plus(video_info
['title'][0])
1340 self
._downloader
.report_warning(u
'Unable to extract video title')
1344 # We try first to get a high quality image:
1345 m_thumb
= re
.search(r
'<span itemprop="thumbnail".*?href="(.*?)">',
1346 video_webpage
, re
.DOTALL
)
1347 if m_thumb
is not None:
1348 video_thumbnail
= m_thumb
.group(1)
1349 elif 'thumbnail_url' not in video_info
:
1350 self
._downloader
.report_warning(u
'unable to extract video thumbnail')
1351 video_thumbnail
= None
1352 else: # don't panic if we can't find it
1353 video_thumbnail
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0])
1357 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
1358 if mobj
is not None:
1359 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
1360 upload_date
= unified_strdate(upload_date
)
1363 video_description
= get_element_by_id("eow-description", video_webpage
)
1364 if video_description
:
1365 video_description
= clean_html(video_description
)
1367 fd_mobj
= re
.search(r
'<meta name="description" content="([^"]+)"', video_webpage
)
1369 video_description
= unescapeHTML(fd_mobj
.group(1))
1371 video_description
= u
''
1374 video_subtitles
= self
.extract_subtitles(video_id
, video_webpage
)
1376 if self
._downloader
.params
.get('listsubtitles', False):
1377 self
._list
_available
_subtitles
(video_id
, video_webpage
)
1380 if 'length_seconds' not in video_info
:
1381 self
._downloader
.report_warning(u
'unable to extract video duration')
1384 video_duration
= compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0])
1386 # Decide which formats to download
1389 mobj
= re
.search(r
';ytplayer.config = ({.*?});', video_webpage
)
1391 raise ValueError('Could not find vevo ID')
1392 info
= json
.loads(mobj
.group(1))
1394 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1395 # this signatures are encrypted
1396 if 'url_encoded_fmt_stream_map' not in args
:
1397 raise ValueError(u
'No stream_map present') # caught below
1398 m_s
= re
.search(r
'[&,]s=', args
['url_encoded_fmt_stream_map'])
1400 self
.to_screen(u
'%s: Encrypted signatures detected.' % video_id
)
1401 video_info
['url_encoded_fmt_stream_map'] = [args
['url_encoded_fmt_stream_map']]
1402 m_s
= re
.search(r
'[&,]s=', args
.get('adaptive_fmts', u
''))
1404 if 'url_encoded_fmt_stream_map' in video_info
:
1405 video_info
['url_encoded_fmt_stream_map'][0] += ',' + args
['adaptive_fmts']
1407 video_info
['url_encoded_fmt_stream_map'] = [args
['adaptive_fmts']]
1408 elif 'adaptive_fmts' in video_info
:
1409 if 'url_encoded_fmt_stream_map' in video_info
:
1410 video_info
['url_encoded_fmt_stream_map'][0] += ',' + video_info
['adaptive_fmts'][0]
1412 video_info
['url_encoded_fmt_stream_map'] = video_info
['adaptive_fmts']
1416 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
1417 self
.report_rtmp_download()
1418 video_url_list
= [(None, video_info
['conn'][0])]
1419 elif 'url_encoded_fmt_stream_map' in video_info
and len(video_info
['url_encoded_fmt_stream_map']) >= 1:
1420 if 'rtmpe%3Dyes' in video_info
['url_encoded_fmt_stream_map'][0]:
1421 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected
=True)
1423 for url_data_str
in video_info
['url_encoded_fmt_stream_map'][0].split(','):
1424 url_data
= compat_parse_qs(url_data_str
)
1425 if 'itag' in url_data
and 'url' in url_data
:
1426 url
= url_data
['url'][0]
1427 if 'sig' in url_data
:
1428 url
+= '&signature=' + url_data
['sig'][0]
1429 elif 's' in url_data
:
1430 encrypted_sig
= url_data
['s'][0]
1431 if self
._downloader
.params
.get('verbose'):
1433 if player_url
is None:
1434 player_version
= 'unknown'
1436 player_version
= self
._search
_regex
(
1437 r
'-(.+)\.swf$', player_url
,
1438 u
'flash player', fatal
=False)
1439 player_desc
= 'flash player %s' % player_version
1441 player_version
= self
._search
_regex
(
1442 r
'html5player-(.+?)\.js', video_webpage
,
1443 'html5 player', fatal
=False)
1444 player_desc
= u
'html5 player %s' % player_version
1446 parts_sizes
= u
'.'.join(compat_str(len(part
)) for part
in encrypted_sig
.split('.'))
1447 self
.to_screen(u
'encrypted signature length %d (%s), itag %s, %s' %
1448 (len(encrypted_sig
), parts_sizes
, url_data
['itag'][0], player_desc
))
1451 jsplayer_url_json
= self
._search
_regex
(
1452 r
'"assets":.+?"js":\s*("[^"]+")',
1453 video_webpage
, u
'JS player URL')
1454 player_url
= json
.loads(jsplayer_url_json
)
1456 signature
= self
._decrypt
_signature
(
1457 encrypted_sig
, video_id
, player_url
, age_gate
)
1458 url
+= '&signature=' + signature
1459 if 'ratebypass' not in url
:
1460 url
+= '&ratebypass=yes'
1461 url_map
[url_data
['itag'][0]] = url
1462 video_url_list
= self
._get
_video
_url
_list
(url_map
)
1463 if not video_url_list
:
1465 elif video_info
.get('hlsvp'):
1466 manifest_url
= video_info
['hlsvp'][0]
1467 url_map
= self
._extract
_from
_m
3u8(manifest_url
, video_id
)
1468 video_url_list
= self
._get
_video
_url
_list
(url_map
)
1469 if not video_url_list
:
1473 raise ExtractorError(u
'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1476 for format_param
, video_real_url
in video_url_list
:
1478 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
1480 video_format
= '{0} - {1}{2}'.format(format_param
if format_param
else video_extension
,
1481 self
._video
_dimensions
.get(format_param
, '???'),
1482 ' ('+self
._special
_itags
[format_param
]+')' if format_param
in self
._special
_itags
else '')
1486 'url': video_real_url
,
1487 'uploader': video_uploader
,
1488 'uploader_id': video_uploader_id
,
1489 'upload_date': upload_date
,
1490 'title': video_title
,
1491 'ext': video_extension
,
1492 'format': video_format
,
1493 'thumbnail': video_thumbnail
,
1494 'description': video_description
,
1495 'player_url': player_url
,
1496 'subtitles': video_subtitles
,
1497 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract all videos of a YouTube playlist as a playlist result.

    FIX: the --no-playlist hint used to print u'Downloading playlist PL%s',
    hard-coding a 'PL' prefix that is wrong for EC/UU/FL playlists and
    doubles the prefix for PL ones; the id is now printed as-is.

    NOTE(review): parts of the verbose _VALID_URL regex were missing from
    the mangled extraction and were reconstructed — confirm upstream.
    """
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50  # gdata API page size
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose regex, so re.VERBOSE must be passed here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        # Download playlist videos from API
        videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The gdata API refuses start indices beyond 1000.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract every video of a YouTube channel as a playlist result."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, de-duplicated, in order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop when the widget no longer offers a "load more" link.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the gdata API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't claim a URL that any other youtube extractor can handle:
        # this class's regex is too permissive and would match them too.
        other_ies = iter(klass for (name, klass) in globals().items()
                         if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Extract video identifiers
            ids_in_page = []
            for entry in response['feed']['entry']:
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Handle 'ytsearchN:<query>' searches via the gdata API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n  # shrinks to totalItems once known

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Resolve a show page into one playlist result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
                for season in m_seasons]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # Items per feed_ajax page (value reconstructed from mangled source — confirm).
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Personal subscriptions feed; behavior lives in the feeds base class."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Recommended-videos feed; behavior lives in the feeds base class."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Watch-later list; uses the personal feed_ajax action."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Reconstructed from a dropped source line — TODO confirm upstream value.
    _PAGING_STEP = 100
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The favourites page embeds a list= parameter naming the playlist.
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')