from __future__ import unicode_literals

import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_parse_qs,
    compat_shlex_quote,
    compat_socket_create_connection,
    compat_str,
    compat_struct_pack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)

def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

DATE_FORMATS = (
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref

def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            # Clean up the temporary file on failure
            os.remove(tf.name)
        except OSError:
            pass
        raise

if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n

def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text

def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]

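# Editor's usage sketch for the xpath helpers above (illustrative, not part
# of the original file; the document and names are made up):
#   >>> doc = compat_etree_fromstring('<root><v id="x">hello</v></root>')
#   >>> xpath_text(doc, './v')
#   'hello'
#   >>> xpath_attr(doc, './v', 'id')
#   'x'
#   >>> xpath_text(doc, './missing', default=None) is None
#   True
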
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)

def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist

class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs

def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)

def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

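# Editor's usage sketch (illustrative, not part of the original file): in
# restricted mode accents are transliterated via ACCENT_CHARS, whitespace
# becomes '_' and ':' inside timestamps becomes '_':
#   >>> sanitize_filename('Déjà vu: 12:34', restricted=True)
#   'Deja_vu_-_12_34'
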
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)

def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/rg3/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

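# Editor's usage sketch (illustrative, not part of the original file):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com/')
#   'https://example.com/'
#   >>> sanitize_url('rmtpe://host/stream')
#   'rtmpe://host/stream'
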
def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):
    if sys.version_info >= (3, 0):
        return b
    if not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')

def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs

def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg

class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    pass


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.msg = msg
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg


class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass

def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc

def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers

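# Editor's usage sketch (illustrative, not part of the original file): the
# internal marker header suppresses compression and is itself removed:
#   >>> handle_youtubedl_headers(
#   ...     {'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip'})
#   {}
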
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response

def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection

class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)

class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response

def extract_timezone(date_str):
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str

, delimiter
='T', timezone
=None):
1147 """ Return a UNIX timestamp from the given date """
1149 if date_str
is None:
1152 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
1154 if timezone
is None:
1155 timezone
, date_str
= extract_timezone(date_str
)
1158 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
1159 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
1160 return calendar
.timegm(dt
.timetuple())
1165 def date_formats(day_first
=True):
1166 return DATE_FORMATS_DAY_FIRST
if day_first
else DATE_FORMATS_MONTH_FIRST
1169 def unified_strdate(date_str
, day_first
=True):
1170 """Return a string with the date in the format YYYYMMDD"""
1172 if date_str
is None:
1176 date_str
= date_str
.replace(',', ' ')
1177 # Remove AM/PM + timezone
1178 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1179 _
, date_str
= extract_timezone(date_str
)
1181 for expression
in date_formats(day_first
):
1183 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
1186 if upload_date
is None:
1187 timetuple
= email
.utils
.parsedate_tz(date_str
)
1190 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
1193 if upload_date
is not None:
1194 return compat_str(upload_date
)
1197 def unified_timestamp(date_str
, day_first
=True):
1198 if date_str
is None:
1201 date_str
= re
.sub(r
'[,|]', '', date_str
)
1203 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
1204 timezone
, date_str
= extract_timezone(date_str
)
1206 # Remove AM/PM + timezone
1207 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1209 # Remove unrecognized timezones from ISO 8601 alike timestamps
1210 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
1212 date_str
= date_str
[:-len(m
.group('tz'))]
1214 # Python only supports microseconds, so remove nanoseconds
1215 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
1217 date_str
= m
.group(1)
1219 for expression
in date_formats(day_first
):
1221 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
1222 return calendar
.timegm(dt
.timetuple())
1225 timetuple
= email
.utils
.parsedate_tz(date_str
)
1227 return calendar
.timegm(timetuple
) + pm_delta
* 3600
def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format

def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str

class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())

def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res

def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True

def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()

def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)

# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)

class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)

def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)

def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data

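# Editor's usage sketch (illustrative, not part of the original file): the
# two functions round-trip extractor-internal data through the URL fragment:
#   >>> url = smuggle_url('http://example.com/v', {'id': 42})
#   >>> unsmuggle_url(url)
#   ('http://example.com/v', {'id': 42})
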
def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)

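# Editor's usage sketch (illustrative, not part of the original file):
#   >>> format_bytes(500)
#   '500.00B'
#   >>> format_bytes(1536)
#   '1.50KiB'
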
def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)

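# Editor's usage sketch (illustrative, not part of the original file):
# decimal and binary units resolve to different byte counts:
#   >>> parse_filesize('1.5 gigabytes')
#   1500000000
#   >>> parse_filesize('1.5 gibibytes')
#   1610612736
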
def parse_count(s):
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)

def parse_resolution(s):
    if s is None:
        return {}

    mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'\b(\d+)[pPiI]\b', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}

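# Editor's usage sketch (illustrative, not part of the original file):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4K')
#   {'height': 2160}
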
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)

):
1749 assert isinstance(title
, compat_str
)
1751 # ctypes in Jython is not complete
1752 # http://bugs.jython.org/issue2148
1753 if sys
.platform
.startswith('java'):
1757 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
1761 # LoadLibrary in Windows Python 2.7.13 only expects
1762 # a bytestring, but since unicode_literals turns
1763 # every string into a unicode string, it fails.
1765 title_bytes
= title
.encode('utf-8')
1766 buf
= ctypes
.create_string_buffer(len(title_bytes
))
1767 buf
.value
= title_bytes
1769 libc
.prctl(15, buf
, 0, 0, 0)
1770 except AttributeError:
1771 return # Strange libc, just skip this
def remove_start(s, start):
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    return s[:-len(end)] if s is not None and s.endswith(end) else s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]


def base_url(url):
    return re.match(r'https?://[^?#&]+/', url).group()

def urljoin(base, path):
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:https?:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)

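# Editor's usage sketch (illustrative, not part of the original file):
#   >>> urljoin('http://example.com/a/', 'b/c.mp4')
#   'http://example.com/a/b/c.mp4'
#   >>> urljoin('http://example.com/a/', '//cdn.example.com/x')
#   '//cdn.example.com/x'
#   >>> urljoin(None, 'b/c.mp4') is None
#   True
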
class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'HEAD'


class PUTRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'PUT'


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default

def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default


def bool_or_none(v, default=None):
    return v if isinstance(v, bool) else default


def strip_or_none(v):
    return None if v is None else v.strip()

def parse_duration(s):
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration

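# Editor's usage sketch (illustrative, not part of the original file): the
# three patterns above cover clock-style, verbose and ISO 8601 durations:
#   >>> parse_duration('1:02:30')
#   3750.0
#   >>> parse_duration('2h 30m 5s')
#   9005.0
#   >>> parse_duration('PT1H30M')
#   5400.0
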
def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)

def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe


def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)


def detect_exe_version(output, version_re=None, unrecognized='present'):
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized

class PagedList(object):
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res

class InAdvancePagedList(PagedList):
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res

def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def lowercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")

def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()

):
2105 if not isinstance(url
, compat_str
):
2106 url
= url
.decode('utf-8', 'replace')
2107 BOM_UTF8
= '\xef\xbb\xbf'
2108 if url
.startswith(BOM_UTF8
):
2109 url
= url
[len(BOM_UTF8
):]
2111 if url
.startswith(('#', ';', ']')):
2115 with contextlib
.closing(batch_fd
) as fd
:
2116 return [url
for url
in map(fixup
, fd
) if url
]
2119 def urlencode_postdata(*args
, **kargs
):
2120 return compat_urllib_parse_urlencode(*args
, **kargs
).encode('ascii')
def update_url_query(url, query):
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))


def update_Request(req, url=None, data=None, headers={}, query={}):
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req

def _multipart_encode_impl(data, boundary):
    content_type = 'multipart/form-data; boundary=%s' % boundary

    out = b''
    for k, v in data.items():
        out += b'--' + boundary.encode('ascii') + b'\r\n'
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary.encode('ascii') in content:
            raise ValueError('Boundary overlaps with data')
        out += content

    out += b'--' + boundary.encode('ascii') + b'--\r\n'

    return out, content_type


def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None

    return out, content_type

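# Editor's usage sketch (illustrative, not part of the original file), with
# a fixed boundary for reproducibility:
#   >>> body, ctype = multipart_encode({'field': 'value'}, boundary='XXXX')
#   >>> ctype
#   'multipart/form-data; boundary=XXXX'
#   >>> body
#   b'--XXXX\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--XXXX--\r\n'
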
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)

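# Editor's usage sketch (illustrative, not part of the original file):
#   >>> d = {'a': None, 'b': '', 'c': 'x'}
#   >>> dict_get(d, ('a', 'b', 'c'))
#   'x'
#   >>> dict_get(d, ('a', 'b'), skip_false_values=False)
#   ''
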
def try_get(src, getter, expected_type=None):
    if not isinstance(getter, (list, tuple)):
        getter = [getter]
    for get in getter:
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            pass
        else:
            if expected_type is None or isinstance(v, expected_type):
                return v


def merge_dicts(*dicts):
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if (k not in merged or
                    (isinstance(v, compat_str) and v and
                     isinstance(merged[k], compat_str) and
                     not merged[k])):
                merged[k] = v
    return merged


def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)

US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}

def parse_age_limit(s):
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None

def strip_jsonp(code):
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)



def js_to_json(code):
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
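
# Illustrative example (not from the original source): unquoted keys,
# single-quoted strings and hex literals become valid JSON:
#     js_to_json("{abc: 'def', ghi: 0x10}") == '{"abc": "def", "ghi": 16}'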


def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
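
# Illustrative usage (not from the original source):
#     q = qualities(['240p', '360p', '720p'])
#     q('360p') == 1; q('720p') == 2; q('4320p') == -1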


DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'


def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s


def version_tuple(v):
    return tuple(int(e) for e in re.split(r'[-.]', v))


def is_outdated_version(version, limit, assume_new=True):
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new


def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')


def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)


def error_to_compat_str(err):
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str


def mimetype2ext(mt):
    if mt is None:
        return None

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }.get(res, res)
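
# Illustrative examples (not from the original source):
#     mimetype2ext('audio/mpeg')                == 'mp3'
#     mimetype2ext('application/x-mpegurl')     == 'm3u8'
#     mimetype2ext('video/mp4; codecs="avc1"')  == 'mp4'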


def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        if len(splited_codecs) == 2:
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            return {
                'vcodec': 'none',
                'acodec': splited_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
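
# Illustrative example (not from the original source):
#     parse_codecs('avc1.42001e, mp4a.40.2')
#         == {'vcodec': 'avc1.42001e', 'acodec': 'mp4a.40.2'}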


def urlhandle_detect_ext(url_handle):
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))


def encode_data_uri(data, mime_type):
    return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))


def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)


def determine_protocol(info_dict):
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme


def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
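
# Illustrative example (not from the original source):
#     render_table(['format', 'note'], [['22', '720p'], ['18', '360p']])
# yields columns left-aligned to the widest cell:
#     format note
#     22     720p
#     18     360p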


def _match_one(filter_part, dct):
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None or
            m.group('strval') is not None or
                # If the original field is a string and the matching comparison
                # value is a number we should respect the origin of the original
                # field and process the comparison value as a string (see
                # https://github.com/rg3/youtube-dl/issues/11082).
                actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)


def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or False """
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
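
# Illustrative examples (not from the original source; keys are made up):
#     match_str('duration > 60 & description',
#               {'duration': 100, 'description': 'x'})  -> True
#     match_str('!is_live', {'is_live': False})          -> True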


def match_filter_func(filter_str):
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func


def parse_dfxp_time_expr(time_expr):
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))


def srt_subtitles_timecode(seconds):
    return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
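
# Illustrative example (not from the original source):
#     srt_subtitles_timecode(3661.5) == '01:01:01,500'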


def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)


def cli_option(params, command_option, param):
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []


def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
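
# Illustrative examples (not from the original source; the option and
# parameter names are made up):
#     cli_bool_option({'check': True}, '--check', 'check')
#         == ['--check', 'true']
#     cli_bool_option({'check': True}, '--check', 'check', separator='=')
#         == ['--check=true']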


def cli_valueless_option(params, command_option, param, expected_value=True):
    param = params.get(param)
    return [command_option] if param == expected_value else []


def cli_configuration_args(params, param, default=[]):
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args


class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        # (full ISO 639-1 -> ISO 639-2/T mapping omitted here)
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name


class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
3044 'AX': 'Åland Islands',
3047 'AS': 'American Samoa',
3052 'AG': 'Antigua and Barbuda',
3069 'BO': 'Bolivia, Plurinational State of',
3070 'BQ': 'Bonaire, Sint Eustatius and Saba',
3071 'BA': 'Bosnia and Herzegovina',
3073 'BV': 'Bouvet Island',
3075 'IO': 'British Indian Ocean Territory',
3076 'BN': 'Brunei Darussalam',
3078 'BF': 'Burkina Faso',
3084 'KY': 'Cayman Islands',
3085 'CF': 'Central African Republic',
3089 'CX': 'Christmas Island',
3090 'CC': 'Cocos (Keeling) Islands',
3094 'CD': 'Congo, the Democratic Republic of the',
3095 'CK': 'Cook Islands',
3097 'CI': 'Côte d\'Ivoire',
3102 'CZ': 'Czech Republic',
3106 'DO': 'Dominican Republic',
3109 'SV': 'El Salvador',
3110 'GQ': 'Equatorial Guinea',
3114 'FK': 'Falkland Islands (Malvinas)',
3115 'FO': 'Faroe Islands',
3119 'GF': 'French Guiana',
3120 'PF': 'French Polynesia',
3121 'TF': 'French Southern Territories',
3136 'GW': 'Guinea-Bissau',
3139 'HM': 'Heard Island and McDonald Islands',
3140 'VA': 'Holy See (Vatican City State)',
3147 'IR': 'Iran, Islamic Republic of',
3150 'IM': 'Isle of Man',
3160 'KP': 'Korea, Democratic People\'s Republic of',
3161 'KR': 'Korea, Republic of',
3164 'LA': 'Lao People\'s Democratic Republic',
3170 'LI': 'Liechtenstein',
3174 'MK': 'Macedonia, the Former Yugoslav Republic of',
3181 'MH': 'Marshall Islands',
3187 'FM': 'Micronesia, Federated States of',
3188 'MD': 'Moldova, Republic of',
3199 'NL': 'Netherlands',
3200 'NC': 'New Caledonia',
3201 'NZ': 'New Zealand',
3206 'NF': 'Norfolk Island',
3207 'MP': 'Northern Mariana Islands',
3212 'PS': 'Palestine, State of',
3214 'PG': 'Papua New Guinea',
3217 'PH': 'Philippines',
3221 'PR': 'Puerto Rico',
3225 'RU': 'Russian Federation',
3227 'BL': 'Saint Barthélemy',
3228 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3229 'KN': 'Saint Kitts and Nevis',
3230 'LC': 'Saint Lucia',
3231 'MF': 'Saint Martin (French part)',
3232 'PM': 'Saint Pierre and Miquelon',
3233 'VC': 'Saint Vincent and the Grenadines',
3236 'ST': 'Sao Tome and Principe',
3237 'SA': 'Saudi Arabia',
3241 'SL': 'Sierra Leone',
3243 'SX': 'Sint Maarten (Dutch part)',
3246 'SB': 'Solomon Islands',
3248 'ZA': 'South Africa',
3249 'GS': 'South Georgia and the South Sandwich Islands',
3250 'SS': 'South Sudan',
3255 'SJ': 'Svalbard and Jan Mayen',
3258 'CH': 'Switzerland',
3259 'SY': 'Syrian Arab Republic',
3260 'TW': 'Taiwan, Province of China',
3262 'TZ': 'Tanzania, United Republic of',
3264 'TL': 'Timor-Leste',
3268 'TT': 'Trinidad and Tobago',
3271 'TM': 'Turkmenistan',
3272 'TC': 'Turks and Caicos Islands',
3276 'AE': 'United Arab Emirates',
3277 'GB': 'United Kingdom',
3278 'US': 'United States',
3279 'UM': 'United States Minor Outlying Islands',
3283 'VE': 'Venezuela, Bolivarian Republic of',
3285 'VG': 'Virgin Islands, British',
3286 'VI': 'Virgin Islands, U.S.',
3287 'WF': 'Wallis and Futuna',
3288 'EH': 'Western Sahara',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())


class GeoUtils(object):
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '85.94.160.0/19',
3304 'AE': '94.200.0.0/13',
3305 'AF': '149.54.0.0/17',
3306 'AG': '209.59.64.0/18',
3307 'AI': '204.14.248.0/21',
3308 'AL': '46.99.0.0/16',
3309 'AM': '46.70.0.0/15',
3310 'AO': '105.168.0.0/13',
3311 'AP': '159.117.192.0/21',
3312 'AR': '181.0.0.0/12',
3313 'AS': '202.70.112.0/20',
3314 'AT': '84.112.0.0/13',
3315 'AU': '1.128.0.0/11',
3316 'AW': '181.41.0.0/18',
3317 'AZ': '5.191.0.0/16',
3318 'BA': '31.176.128.0/17',
3319 'BB': '65.48.128.0/17',
3320 'BD': '114.130.0.0/16',
3322 'BF': '129.45.128.0/17',
3323 'BG': '95.42.0.0/15',
3324 'BH': '37.131.0.0/17',
3325 'BI': '154.117.192.0/18',
3326 'BJ': '137.255.0.0/16',
3327 'BL': '192.131.134.0/24',
3328 'BM': '196.12.64.0/18',
3329 'BN': '156.31.0.0/16',
3330 'BO': '161.56.0.0/16',
3331 'BQ': '161.0.80.0/20',
3332 'BR': '152.240.0.0/12',
3333 'BS': '24.51.64.0/18',
3334 'BT': '119.2.96.0/19',
3335 'BW': '168.167.0.0/16',
3336 'BY': '178.120.0.0/13',
3337 'BZ': '179.42.192.0/18',
3338 'CA': '99.224.0.0/11',
3339 'CD': '41.243.0.0/16',
3340 'CF': '196.32.200.0/21',
3341 'CG': '197.214.128.0/17',
3342 'CH': '85.0.0.0/13',
3343 'CI': '154.232.0.0/14',
3344 'CK': '202.65.32.0/19',
3345 'CL': '152.172.0.0/14',
3346 'CM': '165.210.0.0/15',
3347 'CN': '36.128.0.0/10',
3348 'CO': '181.240.0.0/12',
3349 'CR': '201.192.0.0/12',
3350 'CU': '152.206.0.0/15',
3351 'CV': '165.90.96.0/19',
3352 'CW': '190.88.128.0/17',
3353 'CY': '46.198.0.0/15',
3354 'CZ': '88.100.0.0/14',
3356 'DJ': '197.241.0.0/17',
3357 'DK': '87.48.0.0/12',
3358 'DM': '192.243.48.0/20',
3359 'DO': '152.166.0.0/15',
3360 'DZ': '41.96.0.0/12',
3361 'EC': '186.68.0.0/15',
3362 'EE': '90.190.0.0/15',
3363 'EG': '156.160.0.0/11',
3364 'ER': '196.200.96.0/20',
3365 'ES': '88.0.0.0/11',
3366 'ET': '196.188.0.0/14',
3367 'EU': '2.16.0.0/13',
3368 'FI': '91.152.0.0/13',
3369 'FJ': '144.120.0.0/16',
3370 'FM': '119.252.112.0/20',
3371 'FO': '88.85.32.0/19',
3373 'GA': '41.158.0.0/15',
3375 'GD': '74.122.88.0/21',
3376 'GE': '31.146.0.0/16',
3377 'GF': '161.22.64.0/18',
3378 'GG': '62.68.160.0/19',
3379 'GH': '45.208.0.0/14',
3380 'GI': '85.115.128.0/19',
3381 'GL': '88.83.0.0/19',
3382 'GM': '160.182.0.0/15',
3383 'GN': '197.149.192.0/18',
3384 'GP': '104.250.0.0/19',
3385 'GQ': '105.235.224.0/20',
3386 'GR': '94.64.0.0/13',
3387 'GT': '168.234.0.0/16',
3388 'GU': '168.123.0.0/16',
3389 'GW': '197.214.80.0/20',
3390 'GY': '181.41.64.0/18',
3391 'HK': '113.252.0.0/14',
3392 'HN': '181.210.0.0/16',
3393 'HR': '93.136.0.0/13',
3394 'HT': '148.102.128.0/17',
3395 'HU': '84.0.0.0/14',
3396 'ID': '39.192.0.0/10',
3397 'IE': '87.32.0.0/12',
3398 'IL': '79.176.0.0/13',
3399 'IM': '5.62.80.0/20',
3400 'IN': '117.192.0.0/10',
3401 'IO': '203.83.48.0/21',
3402 'IQ': '37.236.0.0/14',
3403 'IR': '2.176.0.0/12',
3404 'IS': '82.221.0.0/16',
3405 'IT': '79.0.0.0/10',
3406 'JE': '87.244.64.0/18',
3407 'JM': '72.27.0.0/17',
3408 'JO': '176.29.0.0/16',
3409 'JP': '126.0.0.0/8',
3410 'KE': '105.48.0.0/12',
3411 'KG': '158.181.128.0/17',
3412 'KH': '36.37.128.0/17',
3413 'KI': '103.25.140.0/22',
3414 'KM': '197.255.224.0/20',
3415 'KN': '198.32.32.0/19',
3416 'KP': '175.45.176.0/22',
3417 'KR': '175.192.0.0/10',
3418 'KW': '37.36.0.0/14',
3419 'KY': '64.96.0.0/15',
3420 'KZ': '2.72.0.0/13',
3421 'LA': '115.84.64.0/18',
3422 'LB': '178.135.0.0/16',
3423 'LC': '192.147.231.0/24',
3424 'LI': '82.117.0.0/19',
3425 'LK': '112.134.0.0/15',
3426 'LR': '41.86.0.0/19',
3427 'LS': '129.232.0.0/17',
3428 'LT': '78.56.0.0/13',
3429 'LU': '188.42.0.0/16',
3430 'LV': '46.109.0.0/16',
3431 'LY': '41.252.0.0/14',
3432 'MA': '105.128.0.0/11',
3433 'MC': '88.209.64.0/18',
3434 'MD': '37.246.0.0/16',
3435 'ME': '178.175.0.0/17',
3436 'MF': '74.112.232.0/21',
3437 'MG': '154.126.0.0/17',
3438 'MH': '117.103.88.0/21',
3439 'MK': '77.28.0.0/15',
3440 'ML': '154.118.128.0/18',
3441 'MM': '37.111.0.0/17',
3442 'MN': '49.0.128.0/17',
3443 'MO': '60.246.0.0/16',
3444 'MP': '202.88.64.0/20',
3445 'MQ': '109.203.224.0/19',
3446 'MR': '41.188.64.0/18',
3447 'MS': '208.90.112.0/22',
3448 'MT': '46.11.0.0/16',
3449 'MU': '105.16.0.0/12',
3450 'MV': '27.114.128.0/18',
3451 'MW': '105.234.0.0/16',
3452 'MX': '187.192.0.0/11',
3453 'MY': '175.136.0.0/13',
3454 'MZ': '197.218.0.0/15',
3455 'NA': '41.182.0.0/16',
3456 'NC': '101.101.0.0/18',
3457 'NE': '197.214.0.0/18',
3458 'NF': '203.17.240.0/22',
3459 'NG': '105.112.0.0/12',
3460 'NI': '186.76.0.0/15',
3461 'NL': '145.96.0.0/11',
3462 'NO': '84.208.0.0/13',
3463 'NP': '36.252.0.0/15',
3464 'NR': '203.98.224.0/19',
3465 'NU': '49.156.48.0/22',
3466 'NZ': '49.224.0.0/14',
3467 'OM': '5.36.0.0/15',
3468 'PA': '186.72.0.0/15',
3469 'PE': '186.160.0.0/14',
3470 'PF': '123.50.64.0/18',
3471 'PG': '124.240.192.0/19',
3472 'PH': '49.144.0.0/13',
3473 'PK': '39.32.0.0/11',
3474 'PL': '83.0.0.0/11',
3475 'PM': '70.36.0.0/20',
3476 'PR': '66.50.0.0/16',
3477 'PS': '188.161.0.0/16',
3478 'PT': '85.240.0.0/13',
3479 'PW': '202.124.224.0/20',
3480 'PY': '181.120.0.0/14',
3481 'QA': '37.210.0.0/15',
3482 'RE': '139.26.0.0/16',
3483 'RO': '79.112.0.0/13',
3484 'RS': '178.220.0.0/14',
3485 'RU': '5.136.0.0/13',
3486 'RW': '105.178.0.0/15',
3487 'SA': '188.48.0.0/13',
3488 'SB': '202.1.160.0/19',
3489 'SC': '154.192.0.0/11',
3490 'SD': '154.96.0.0/13',
3491 'SE': '78.64.0.0/12',
3492 'SG': '152.56.0.0/14',
3493 'SI': '188.196.0.0/14',
3494 'SK': '78.98.0.0/15',
3495 'SL': '197.215.0.0/17',
3496 'SM': '89.186.32.0/19',
3497 'SN': '41.82.0.0/15',
3498 'SO': '197.220.64.0/19',
3499 'SR': '186.179.128.0/17',
3500 'SS': '105.235.208.0/21',
3501 'ST': '197.159.160.0/19',
3502 'SV': '168.243.0.0/16',
3503 'SX': '190.102.0.0/20',
3505 'SZ': '41.84.224.0/19',
3506 'TC': '65.255.48.0/20',
3507 'TD': '154.68.128.0/19',
3508 'TG': '196.168.0.0/14',
3509 'TH': '171.96.0.0/13',
3510 'TJ': '85.9.128.0/18',
3511 'TK': '27.96.24.0/21',
3512 'TL': '180.189.160.0/20',
3513 'TM': '95.85.96.0/19',
3514 'TN': '197.0.0.0/11',
3515 'TO': '175.176.144.0/21',
3516 'TR': '78.160.0.0/11',
3517 'TT': '186.44.0.0/15',
3518 'TV': '202.2.96.0/19',
3519 'TW': '120.96.0.0/11',
3520 'TZ': '156.156.0.0/14',
3521 'UA': '93.72.0.0/13',
3522 'UG': '154.224.0.0/13',
3524 'UY': '167.56.0.0/13',
3525 'UZ': '82.215.64.0/18',
3526 'VA': '212.77.0.0/19',
3527 'VC': '24.92.144.0/20',
3528 'VE': '186.88.0.0/13',
3529 'VG': '172.103.64.0/18',
3530 'VI': '146.226.0.0/16',
3531 'VN': '14.160.0.0/11',
3532 'VU': '202.80.32.0/20',
3533 'WF': '117.20.32.0/21',
3534 'WS': '202.4.32.0/19',
3535 'YE': '134.35.0.0/16',
3536 'YT': '41.242.116.0/22',
3537 'ZA': '41.0.0.0/11',
3538 'ZM': '165.56.0.0/13',
3539 'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
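
# Illustrative usage (not from the original source): either a two-letter
# country code or an explicit CIDR block can be passed:
#     GeoUtils.random_ipv4('JP')          # random address in 126.0.0.0/8
#     GeoUtils.random_ipv4('1.2.3.0/24')  # random address in that block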


class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers take care of wrapping the
            # socket with SOCKS
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)


# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387

def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s


def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
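
# Illustrative round trip (not from the original source):
#     bytes_to_long(b'\x01\x00') == 256
#     long_to_bytes(256)         == b'\x01\x00'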


def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted


def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data


def encode_base_n(num, n, table=None):
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if table is None:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
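
# Illustrative examples (not from the original source):
#     encode_base_n(255, 16) == 'ff'
#     encode_base_n(36, 36)  == '10'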


def decode_packed_codes(code):
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)


def parse_m3u8_attributes(attrib):
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
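
# Illustrative example (not from the original source): quoted values may
# contain commas and are returned unquoted:
#     parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
#         == {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}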


def urshift(val, n):
    return val >> n if val >= 0 else (val + 0x100000000) >> n
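
# Illustrative example (not from the original source): Python's >> on a
# negative value is arithmetic, whereas urshift emulates a logical (unsigned
# 32-bit) shift:
#     -1 >> 28        == -1
#     urshift(-1, 28) == 15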


# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels


def write_xattr(path, key, value):
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")


def random_birthday(year_field, month_field, day_field):
    return {
        year_field: str(random.randint(1950, 1995)),
        month_field: str(random.randint(1, 12)),
        day_field: str(random.randint(1, 31)),
    }