4 from __future__
import unicode_literals
34 import xml
.etree
.ElementTree
38 compat_HTMLParseError
,
42 compat_etree_fromstring
,
45 compat_html_entities_html5
,
51 compat_socket_create_connection
,
57 compat_urllib_parse_urlencode
,
58 compat_urllib_parse_urlparse
,
59 compat_urllib_parse_unquote_plus
,
60 compat_urllib_request
,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc component.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly, so the SOCKS schemes
    are appended to that registry here.
    """
    registry = compat_urlparse.uses_netloc
    for proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proxy_scheme not in registry:
            registry.append(proxy_scheme)
80 # This is not clearly defined otherwise
81 compiled_regex_type
= type(re
.compile(''))
84 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
85 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
86 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
87 'Accept-Encoding': 'gzip, deflate',
88 'Accept-Language': 'en-us,en;q=0.5',
93 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
99 ENGLISH_MONTH_NAMES
= [
100 'January', 'February', 'March', 'April', 'May', 'June',
101 'July', 'August', 'September', 'October', 'November', 'December']
104 'en': ENGLISH_MONTH_NAMES
,
106 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
107 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
111 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
112 'flv', 'f4v', 'f4a', 'f4b',
113 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
114 'mkv', 'mka', 'mk3d',
123 'f4f', 'f4m', 'm3u8', 'smil')
125 # needed for sanitizing filenames in restricted mode
126 ACCENT_CHARS
= dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
127 itertools
.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
128 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
151 '%Y-%m-%d %H:%M:%S.%f',
154 '%Y-%m-%dT%H:%M:%SZ',
155 '%Y-%m-%dT%H:%M:%S.%fZ',
156 '%Y-%m-%dT%H:%M:%S.%f0Z',
158 '%Y-%m-%dT%H:%M:%S.%f',
161 '%b %d %Y at %H:%M:%S',
163 '%B %d %Y at %H:%M:%S',
166 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
167 DATE_FORMATS_DAY_FIRST
.extend([
176 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
177 DATE_FORMATS_MONTH_FIRST
.extend([
185 PACKED_CODES_RE
= r
"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
188 def preferredencoding():
189 """Get preferred encoding.
191 Returns the best encoding scheme for the system, based on
192 locale.getpreferredencoding() and some further tweaks.
195 pref
= locale
.getpreferredencoding()
203 def write_json_file(obj
, fn
):
204 """ Encode obj as JSON and write it to fn, atomically if possible """
206 fn
= encodeFilename(fn
)
207 if sys
.version_info
< (3, 0) and sys
.platform
!= 'win32':
208 encoding
= get_filesystem_encoding()
209 # os.path.basename returns a bytes object, but NamedTemporaryFile
210 # will fail if the filename contains non ascii characters unless we
211 # use a unicode object
212 path_basename
= lambda f
: os
.path
.basename(fn
).decode(encoding
)
213 # the same for os.path.dirname
214 path_dirname
= lambda f
: os
.path
.dirname(fn
).decode(encoding
)
216 path_basename
= os
.path
.basename
217 path_dirname
= os
.path
.dirname
221 'prefix': path_basename(fn
) + '.',
222 'dir': path_dirname(fn
),
226 # In Python 2.x, json.dump expects a bytestream.
227 # In Python 3.x, it writes to a character stream
228 if sys
.version_info
< (3, 0):
236 tf
= tempfile
.NamedTemporaryFile(**compat_kwargs(args
))
241 if sys
.platform
== 'win32':
242 # Need to remove existing file on Windows, else os.rename raises
243 # WindowsError or FileExistsError.
248 os
.rename(tf
.name
, fn
)
257 if sys
.version_info
>= (2, 7):
258 def find_xpath_attr(node
, xpath
, key
, val
=None):
259 """ Find the xpath xpath[@key=val] """
260 assert re
.match(r
'^[a-zA-Z_-]+$', key
)
261 expr
= xpath
+ ('[@%s]' % key
if val
is None else "[@%s='%s']" % (key
, val
))
262 return node
.find(expr
)
264 def find_xpath_attr(node
, xpath
, key
, val
=None):
265 for f
in node
.findall(compat_xpath(xpath
)):
266 if key
not in f
.attrib
:
268 if val
is None or f
.attrib
.get(key
) == val
:
272 # On python2.6 the xml.etree.ElementTree.Element methods don't support
273 # the namespace parameter
276 def xpath_with_ns(path
, ns_map
):
277 components
= [c
.split(':') for c
in path
.split('/')]
281 replaced
.append(c
[0])
284 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
285 return '/'.join(replaced
)
288 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
289 def _find_xpath(xpath
):
290 return node
.find(compat_xpath(xpath
))
292 if isinstance(xpath
, (str, compat_str
)):
293 n
= _find_xpath(xpath
)
301 if default
is not NO_DEFAULT
:
304 name
= xpath
if name
is None else name
305 raise ExtractorError('Could not find XML element %s' % name
)
311 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
312 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
313 if n
is None or n
== default
:
316 if default
is not NO_DEFAULT
:
319 name
= xpath
if name
is None else name
320 raise ExtractorError('Could not find XML element\'s text %s' % name
)
326 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
327 n
= find_xpath_attr(node
, xpath
, key
)
329 if default
is not NO_DEFAULT
:
332 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
333 raise ExtractorError('Could not find XML attribute %s' % name
)
def get_element_by_id(id, html):
    """Return the content of the tag with the given ID attribute, or None.

    NOTE: the parameter keeps its historical name `id` (shadowing the
    builtin) so keyword callers stay compatible.
    """
    attribute_name = 'id'
    return get_element_by_attribute(attribute_name, id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the given class, or None."""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose `attribute` matches `value`.

    Returns None when nothing matches; escape_value has the same meaning as
    in get_elements_by_attribute().
    """
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    return next(iter(matches), None)
def get_elements_by_class(class_name, html):
    """Return the contents of all tags carrying the given CSS class, as a list."""
    # Match the class as a whole token anywhere inside the (possibly
    # multi-valued) class attribute; \b keeps "foo" from matching "foobar".
    class_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_re, html, escape_value=False)
362 def get_elements_by_attribute(attribute, value, html, escape_value=True):
363 """Return the content of the tag with the specified attribute in the passed HTML document"""
365 value = re.escape(value) if escape_value else value
368 for m in re.finditer(r'''(?xs)
370 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
372 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
376 ''' % (re.escape(attribute), value), html):
377 res = m.group('content
')
379 if res.startswith('"') or res.startswith("'"):
382 retlist.append(unescapeHTML(res))
387 class HTMLAttributeParser(compat_HTMLParser):
388 """Trivial HTML parser to gather the attributes for a single element"""
391 compat_HTMLParser.__init__(self)
393 def handle_starttag(self, tag, attrs):
394 self.attrs = dict(attrs)
397 def extract_attributes(html_element):
398 """Given a string for an HTML element such as
400 a="foo" B="bar" c="&98;az" d=boz
401 empty= noval entity="&"
404 Decode and return a dictionary of attributes.
406 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
407 'empty
': '', 'noval
': None, 'entity
': '&',
408 'sq
': '"', 'dq': '\''
410 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
411 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
413 parser = HTMLAttributeParser()
415 parser.feed(html_element)
417 # Older Python may throw HTMLParseError in case of malformed HTML
418 except compat_HTMLParseError:
423 def clean_html(html):
424 """Clean an HTML snippet into a readable string"""
426 if html is None: # Convenience for sanitizing descriptions etc.
430 html = html.replace('\n', ' ')
431 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
432 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
434 html = re.sub('<.*?>', '', html)
435 # Replace html entities
436 html = unescapeHTML(html)
440 def sanitize_open(filename, open_mode):
441 """Try to open the given filename, and slightly tweak it if this fails.
443 Attempts to open the given filename. If this fails, it tries to change
444 the filename slightly, step by step, until it's either able to open it
445 or it fails and raises a final exception, like the standard open()
448 It returns the tuple (stream, definitive_file_name).
452 if sys.platform == 'win32':
454 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
455 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
456 stream = open(encodeFilename(filename), open_mode)
457 return (stream, filename)
458 except (IOError, OSError) as err:
459 if err.errno in (errno.EACCES,):
462 # In case of error, try to remove win32 forbidden chars
463 alt_filename = sanitize_path(filename)
464 if alt_filename == filename:
467 # An exception here should be caught in the caller
468 stream = open(encodeFilename(alt_filename), open_mode)
469 return (stream, alt_filename)
472 def timeconvert(timestr):
473 """Convert RFC 2822 defined time string into system timestamp"""
475 timetuple = email.utils.parsedate_tz(timestr)
476 if timetuple is not None:
477 timestamp = email.utils.mktime_tz(timetuple)
481 def sanitize_filename(s, restricted=False, is_id=False):
482 """Sanitizes a string so it could be used as part of a filename.
483 If restricted is set, use a stricter subset of allowed characters.
484 Set is_id if this is not an arbitrary string, but an ID that should be kept
487 def replace_insane(char):
488 if restricted and char in ACCENT_CHARS:
489 return ACCENT_CHARS[char]
490 if char == '?' or ord(char) < 32 or ord(char) == 127:
493 return '' if restricted else '\''
495 return '_
-' if restricted else ' -'
496 elif char in '\\/|
*<>':
498 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
500 if restricted
and ord(char
) > 127:
505 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
506 result
= ''.join(map(replace_insane
, s
))
508 while '__' in result
:
509 result
= result
.replace('__', '_')
510 result
= result
.strip('_')
511 # Common case of "Foreign band name - English song title"
512 if restricted
and result
.startswith('-_'):
514 if result
.startswith('-'):
515 result
= '_' + result
[len('-'):]
516 result
= result
.lstrip('.')
522 def sanitize_path(s
):
523 """Sanitizes and normalizes path on Windows"""
524 if sys
.platform
!= 'win32':
526 drive_or_unc
, _
= os
.path
.splitdrive(s
)
527 if sys
.version_info
< (2, 7) and not drive_or_unc
:
528 drive_or_unc
, _
= os
.path
.splitunc(s
)
529 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
533 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
534 for path_part
in norm_path
]
536 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
537 return os
.path
.join(*sanitized_path
)
540 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
541 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend the `http:` scheme to protocol-relative URLs (//host/path).

    Any other URL is returned unchanged. This mitigates failures caused by
    sites handing out URLs without a protocol.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalizing protocol-relative URLs."""
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
551 """Expand shell variables and ~"""
552 return os
.path
.expandvars(compat_expanduser(s
))
555 def orderedSet(iterable
):
556 """ Remove all duplicates from the input iterable """
564 def _htmlentity_transform(entity_with_semicolon
):
565 """Transforms an HTML entity to a character."""
566 entity
= entity_with_semicolon
[:-1]
568 # Known non-numeric HTML entity
569 if entity
in compat_html_entities
.name2codepoint
:
570 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
572 # TODO: HTML5 allows entities without a semicolon. For example,
573 # 'Éric' should be decoded as 'Éric'.
574 if entity_with_semicolon
in compat_html_entities_html5
:
575 return compat_html_entities_html5
[entity_with_semicolon
]
577 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
579 numstr
= mobj
.group(1)
580 if numstr
.startswith('x'):
582 numstr
= '0%s' % numstr
585 # See https://github.com/rg3/youtube-dl/issues/7518
587 return compat_chr(int(numstr
, base
))
591 # Unknown entity in name, return its literal representation
592 return '&%s;' % entity
598 assert type(s
) == compat_str
601 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
604 def get_subprocess_encoding():
605 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
606 # For subprocess calls, encode with locale encoding
607 # Refer to http://stackoverflow.com/a/9951851/35070
608 encoding
= preferredencoding()
610 encoding
= sys
.getfilesystemencoding()
616 def encodeFilename(s
, for_subprocess
=False):
618 @param s The name of the file
621 assert type(s
) == compat_str
623 # Python 3 has a Unicode API
624 if sys
.version_info
>= (3, 0):
627 # Pass '' directly to use Unicode APIs on Windows 2000 and up
628 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
629 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
630 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
633 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
634 if sys
.platform
.startswith('java'):
637 return s
.encode(get_subprocess_encoding(), 'ignore')
640 def decodeFilename(b
, for_subprocess
=False):
642 if sys
.version_info
>= (3, 0):
645 if not isinstance(b
, bytes):
648 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
def decodeArgument(b):
    """Decode a subprocess argument (inverse of encodeArgument)."""
    return decodeFilename(b, for_subprocess=True)
664 def decodeOption(optval
):
667 if isinstance(optval
, bytes):
668 optval
= optval
.decode(preferredencoding())
670 assert isinstance(optval
, compat_str
)
674 def formatSeconds(secs
):
676 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
678 return '%d:%02d' % (secs
// 60, secs
% 60)
683 def make_HTTPS_handler(params
, **kwargs
):
684 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
685 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
686 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
687 if opts_no_check_certificate
:
688 context
.check_hostname
= False
689 context
.verify_mode
= ssl
.CERT_NONE
691 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
694 # (create_default_context present but HTTPSHandler has no context=)
697 if sys
.version_info
< (3, 2):
698 return YoutubeDLHTTPSHandler(params
, **kwargs
)
700 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
701 context
.verify_mode
= (ssl
.CERT_NONE
702 if opts_no_check_certificate
703 else ssl
.CERT_REQUIRED
)
704 context
.set_default_verify_paths()
705 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
708 def bug_reports_message():
709 if ytdl_is_updateable():
710 update_cmd
= 'type youtube-dl -U to update'
712 update_cmd
= 'see https://yt-dl.org/update on how to update'
713 msg
= '; please report this issue on https://yt-dl.org/bug .'
714 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
715 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class YoutubeDLError(Exception):
    """Root of the YoutubeDL exception hierarchy; all project errors derive from it."""
    pass
724 class ExtractorError(YoutubeDLError
):
725 """Error during info extraction."""
727 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
728 """ tb, if given, is the original traceback (so that it can be printed out).
729 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
732 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
734 if video_id
is not None:
735 msg
= video_id
+ ': ' + msg
737 msg
+= ' (caused by %r)' % cause
739 msg
+= bug_reports_message()
740 super(ExtractorError
, self
).__init
__(msg
)
743 self
.exc_info
= sys
.exc_info() # preserve original exception
745 self
.video_id
= video_id
747 def format_traceback(self
):
748 if self
.traceback
is None:
750 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the supplied URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression failed to match."""
    pass
765 class GeoRestrictedError(ExtractorError
):
766 """Geographic restriction Error exception.
768 This exception may be thrown when a video is not available from your
769 geographic location due to geographic restrictions imposed by a website.
771 def __init__(self
, msg
, countries
=None):
772 super(GeoRestrictedError
, self
).__init
__(msg
, expected
=True)
774 self
.countries
= countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    Raised by FileDownloader objects when downloading fails and they are
    not configured to continue on errors; carries the relevant message.
    """

    def __init__(self, msg, exc_info=None):
        """Create the error.

        exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info()).
        """
        super(DownloadError, self).__init__(msg)
        # Keep the originating sys.exc_info() triple for later reporting.
        self.exc_info = exc_info
class SameFileError(YoutubeDLError):
    """Raised by FileDownloader objects when multiple files would have to be
    downloaded to the same file on disk.
    """
    pass
800 class PostProcessingError(YoutubeDLError
):
801 """Post Processing exception.
803 This exception may be raised by PostProcessor's .run() method to
804 indicate an error in the postprocessing task.
807 def __init__(self
, msg
):
808 super(PostProcessingError
, self
).__init
__(msg
)
class MaxDownloadsReached(YoutubeDLError):
    """Signals that the --max-downloads limit has been reached."""
    pass
class UnavailableVideoError(YoutubeDLError):
    """Raised when a video is requested in a format that is not available
    for that video.
    """
    pass
class ContentTooShortError(YoutubeDLError):
    """Raised by FileDownloader objects when a downloaded file is smaller
    than what the server announced first, indicating the connection was
    probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(
            downloaded, expected)
        super(ContentTooShortError, self).__init__(message)
        # Both counts are in bytes.
        self.downloaded = downloaded
        self.expected = expected
843 class XAttrMetadataError(YoutubeDLError
):
844 def __init__(self
, code
=None, msg
='Unknown error'):
845 super(XAttrMetadataError
, self
).__init
__(msg
)
849 # Parsing code and msg
850 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
) or
851 'No space left' in self
.msg
or 'Disk quota excedded' in self
.msg
):
852 self
.reason
= 'NO_SPACE'
853 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
854 self
.reason
= 'VALUE_TOO_LONG'
856 self
.reason
= 'NOT_SUPPORTED'
class XAttrUnavailableError(YoutubeDLError):
    # NOTE(review): presumably raised when xattr support is missing on the
    # system — confirm against the xattr post-processing code.
    pass
863 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
864 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
865 # expected HTTP responses to meet HTTP/1.0 or later (see also
866 # https://github.com/rg3/youtube-dl/issues/6727)
867 if sys
.version_info
< (3, 0):
868 kwargs
[b
'strict'] = True
869 hc
= http_class(*args
, **kwargs
)
870 source_address
= ydl_handler
._params
.get('source_address')
871 if source_address
is not None:
872 sa
= (source_address
, 0)
873 if hasattr(hc
, 'source_address'): # Python 2.7+
874 hc
.source_address
= sa
876 def _hc_connect(self
, *args
, **kwargs
):
877 sock
= compat_socket_create_connection(
878 (self
.host
, self
.port
), self
.timeout
, sa
)
880 self
.sock
= ssl
.wrap_socket(
881 sock
, self
.key_file
, self
.cert_file
,
882 ssl_version
=ssl
.PROTOCOL_TLSv1
)
885 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Resolve internal pseudo-headers before the real HTTP request is made.

    When the marker header 'Youtubedl-no-compression' is present, a new dict
    is returned with both the marker and any Accept-Encoding header removed
    (so the server will not compress the response). Otherwise the input
    mapping is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    result = {}
    for name, value in headers.items():
        if name.lower() != 'accept-encoding':
            result[name] = value
    del result['Youtubedl-no-compression']
    return result
900 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
901 """Handler for HTTP requests and responses.
903 This class, when installed with an OpenerDirector, automatically adds
904 the standard headers to every HTTP request and handles gzipped and
905 deflated responses from web servers. If compression is to be avoided in
906 a particular request, the original request in the program code only has
907 to include the HTTP header "Youtubedl-no-compression", which will be
908 removed before making the real request.
910 Part of this code was copied from:
912 http://techknack.net/python-urllib2-handlers/
914 Andrew Rowls, the author of that code, agreed to release it to the
918 def __init__(self
, params
, *args
, **kwargs
):
919 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
920 self
._params
= params
922 def http_open(self
, req
):
923 conn_class
= compat_http_client
.HTTPConnection
925 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
927 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
928 del req
.headers
['Ytdl-socks-proxy']
930 return self
.do_open(functools
.partial(
931 _create_http_connection
, self
, conn_class
, False),
937 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
939 return zlib
.decompress(data
)
941 def http_request(self
, req
):
942 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
943 # always respected by websites, some tend to give out URLs with non percent-encoded
944 # non-ASCII characters (see telemb.py, ard.py [#3412])
945 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
946 # To work around aforementioned issue we will replace request's original URL with
947 # percent-encoded one
948 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
949 # the code of this workaround has been moved here from YoutubeDL.urlopen()
950 url
= req
.get_full_url()
951 url_escaped
= escape_url(url
)
953 # Substitute URL if any change after escaping
954 if url
!= url_escaped
:
955 req
= update_Request(req
, url
=url_escaped
)
957 for h
, v
in std_headers
.items():
958 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
959 # The dict keys are capitalized because of this bug by urllib
960 if h
.capitalize() not in req
.headers
:
963 req
.headers
= handle_youtubedl_headers(req
.headers
)
965 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
966 # Python 2.6 is brain-dead when it comes to fragments
967 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
968 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
972 def http_response(self
, req
, resp
):
975 if resp
.headers
.get('Content-encoding', '') == 'gzip':
976 content
= resp
.read()
977 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
979 uncompressed
= io
.BytesIO(gz
.read())
980 except IOError as original_ioerror
:
981 # There may be junk add the end of the file
982 # See http://stackoverflow.com/q/4928560/35070 for details
983 for i
in range(1, 1024):
985 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
986 uncompressed
= io
.BytesIO(gz
.read())
991 raise original_ioerror
992 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
993 resp
.msg
= old_resp
.msg
994 del resp
.headers
['Content-encoding']
996 if resp
.headers
.get('Content-encoding', '') == 'deflate':
997 gz
= io
.BytesIO(self
.deflate(resp
.read()))
998 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
999 resp
.msg
= old_resp
.msg
1000 del resp
.headers
['Content-encoding']
1001 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1002 # https://github.com/rg3/youtube-dl/issues/6457).
1003 if 300 <= resp
.code
< 400:
1004 location
= resp
.headers
.get('Location')
1006 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1007 if sys
.version_info
>= (3, 0):
1008 location
= location
.encode('iso-8859-1').decode('utf-8')
1010 location
= location
.decode('utf-8')
1011 location_escaped
= escape_url(location
)
1012 if location
!= location_escaped
:
1013 del resp
.headers
['Location']
1014 if sys
.version_info
< (3, 0):
1015 location_escaped
= location_escaped
.encode('utf-8')
1016 resp
.headers
['Location'] = location_escaped
1019 https_request
= http_request
1020 https_response
= http_response
1023 def make_socks_conn_class(base_class
, socks_proxy
):
1024 assert issubclass(base_class
, (
1025 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
1027 url_components
= compat_urlparse
.urlparse(socks_proxy
)
1028 if url_components
.scheme
.lower() == 'socks5':
1029 socks_type
= ProxyType
.SOCKS5
1030 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
1031 socks_type
= ProxyType
.SOCKS4
1032 elif url_components
.scheme
.lower() == 'socks4a':
1033 socks_type
= ProxyType
.SOCKS4A
1035 def unquote_if_non_empty(s
):
1038 return compat_urllib_parse_unquote_plus(s
)
1042 url_components
.hostname
, url_components
.port
or 1080,
1044 unquote_if_non_empty(url_components
.username
),
1045 unquote_if_non_empty(url_components
.password
),
1048 class SocksConnection(base_class
):
1050 self
.sock
= sockssocket()
1051 self
.sock
.setproxy(*proxy_args
)
1052 if type(self
.timeout
) in (int, float):
1053 self
.sock
.settimeout(self
.timeout
)
1054 self
.sock
.connect((self
.host
, self
.port
))
1056 if isinstance(self
, compat_http_client
.HTTPSConnection
):
1057 if hasattr(self
, '_context'): # Python > 2.6
1058 self
.sock
= self
._context
.wrap_socket(
1059 self
.sock
, server_hostname
=self
.host
)
1061 self
.sock
= ssl
.wrap_socket(self
.sock
)
1063 return SocksConnection
1066 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
1067 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
1068 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
1069 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
1070 self
._params
= params
1072 def https_open(self
, req
):
1074 conn_class
= self
._https
_conn
_class
1076 if hasattr(self
, '_context'): # python > 2.6
1077 kwargs
['context'] = self
._context
1078 if hasattr(self
, '_check_hostname'): # python 3.x
1079 kwargs
['check_hostname'] = self
._check
_hostname
1081 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
1083 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
1084 del req
.headers
['Ytdl-socks-proxy']
1086 return self
.do_open(functools
.partial(
1087 _create_http_connection
, self
, conn_class
, True),
1091 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
1092 def __init__(self
, cookiejar
=None):
1093 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
1095 def http_response(self
, request
, response
):
1096 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1097 # characters in Set-Cookie HTTP header of last response (see
1098 # https://github.com/rg3/youtube-dl/issues/6769).
1099 # In order to at least prevent crashing we will percent encode Set-Cookie
1100 # header before HTTPCookieProcessor starts processing it.
1101 # if sys.version_info < (3, 0) and response.headers:
1102 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1103 # set_cookie = response.headers.get(set_cookie_header)
1105 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1106 # if set_cookie != set_cookie_escaped:
1107 # del response.headers[set_cookie_header]
1108 # response.headers[set_cookie_header] = set_cookie_escaped
1109 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
1111 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
1112 https_response
= http_response
1115 def extract_timezone(date_str
):
1117 r
'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1120 timezone
= datetime
.timedelta()
1122 date_str
= date_str
[:-len(m
.group('tz'))]
1123 if not m
.group('sign'):
1124 timezone
= datetime
.timedelta()
1126 sign
= 1 if m
.group('sign') == '+' else -1
1127 timezone
= datetime
.timedelta(
1128 hours
=sign
* int(m
.group('hours')),
1129 minutes
=sign
* int(m
.group('minutes')))
1130 return timezone
, date_str
1133 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
1134 """ Return a UNIX timestamp from the given date """
1136 if date_str
is None:
1139 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
1141 if timezone
is None:
1142 timezone
, date_str
= extract_timezone(date_str
)
1145 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
1146 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
1147 return calendar
.timegm(dt
.timetuple())
def date_formats(day_first=True):
    """Return the strptime pattern list for the requested day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1156 def unified_strdate(date_str
, day_first
=True):
1157 """Return a string with the date in the format YYYYMMDD"""
1159 if date_str
is None:
1163 date_str
= date_str
.replace(',', ' ')
1164 # Remove AM/PM + timezone
1165 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1166 _
, date_str
= extract_timezone(date_str
)
1168 for expression
in date_formats(day_first
):
1170 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
1173 if upload_date
is None:
1174 timetuple
= email
.utils
.parsedate_tz(date_str
)
1177 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
1180 if upload_date
is not None:
1181 return compat_str(upload_date
)
1184 def unified_timestamp(date_str
, day_first
=True):
1185 if date_str
is None:
1188 date_str
= re
.sub(r
'[,|]', '', date_str
)
1190 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
1191 timezone
, date_str
= extract_timezone(date_str
)
1193 # Remove AM/PM + timezone
1194 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1196 # Remove unrecognized timezones from ISO 8601 alike timestamps
1197 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
1199 date_str
= date_str
[:-len(m
.group('tz'))]
1201 for expression
in date_formats(day_first
):
1203 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
1204 return calendar
.timegm(dt
.timetuple())
1207 timetuple
= email
.utils
.parsedate_tz(date_str
)
1209 return calendar
.timegm(timetuple
) + pm_delta
* 3600
1212 def determine_ext(url
, default_ext
='unknown_video'):
1215 guess
= url
.partition('?')[0].rpartition('.')[2]
1216 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
1218 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1219 elif guess
.rstrip('/') in KNOWN_EXTENSIONS
:
1220 return guess
.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name from the media file name.

    Drops the media extension (everything after the last '.') and
    appends the language code and subtitle format as new extensions.
    """
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1229 def date_from_str(date_str
):
1231 Return a datetime object from a string in the format YYYYMMDD or
1232 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1233 today
= datetime
.date
.today()
1234 if date_str
in ('now', 'today'):
1236 if date_str
== 'yesterday':
1237 return today
- datetime
.timedelta(days
=1)
1238 match
= re
.match(r
'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
1239 if match
is not None:
1240 sign
= match
.group('sign')
1241 time
= int(match
.group('time'))
1244 unit
= match
.group('unit')
1245 # A bad approximation?
1249 elif unit
== 'year':
1253 delta
= datetime
.timedelta(**{unit
: time
})
1254 return today
+ delta
1255 return datetime
.datetime
.strptime(date_str
, '%Y%m%d').date()
1258 def hyphenate_date(date_str
):
1260 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1261 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
1262 if match
is not None:
1263 return '-'.join(match
.groups())
1268 class DateRange(object):
1269 """Represents a time interval between two dates"""
1271 def __init__(self
, start
=None, end
=None):
1272 """start and end must be strings in the format accepted by date"""
1273 if start
is not None:
1274 self
.start
= date_from_str(start
)
1276 self
.start
= datetime
.datetime
.min.date()
1278 self
.end
= date_from_str(end
)
1280 self
.end
= datetime
.datetime
.max.date()
1281 if self
.start
> self
.end
:
1282 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
1286 """Returns a range that only contains the given day"""
1287 return cls(day
, day
)
1289 def __contains__(self
, date
):
1290 """Check if the date is in the range"""
1291 if not isinstance(date
, datetime
.date
):
1292 date
= date_from_str(date
)
1293 return self
.start
<= date
<= self
.end
1296 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
1299 def platform_name():
1300 """ Returns the platform name as a compat_str """
1301 res
= platform
.platform()
1302 if isinstance(res
, bytes):
1303 res
= res
.decode(preferredencoding())
1305 assert isinstance(res
, compat_str
)
1309 def _windows_write_string(s
, out
):
1310 """ Returns True if the string was written using special methods,
1311 False if it has yet to be written out."""
1312 # Adapted from http://stackoverflow.com/a/3259271/35070
1315 import ctypes
.wintypes
1323 fileno
= out
.fileno()
1324 except AttributeError:
1325 # If the output stream doesn't have a fileno, it's virtual
1327 except io
.UnsupportedOperation
:
1328 # Some strange Windows pseudo files?
1330 if fileno
not in WIN_OUTPUT_IDS
:
1333 GetStdHandle
= ctypes
.WINFUNCTYPE(
1334 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
1335 (b
'GetStdHandle', ctypes
.windll
.kernel32
))
1336 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
1338 WriteConsoleW
= ctypes
.WINFUNCTYPE(
1339 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
1340 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
1341 ctypes
.wintypes
.LPVOID
)((b
'WriteConsoleW', ctypes
.windll
.kernel32
))
1342 written
= ctypes
.wintypes
.DWORD(0)
1344 GetFileType
= ctypes
.WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)((b
'GetFileType', ctypes
.windll
.kernel32
))
1345 FILE_TYPE_CHAR
= 0x0002
1346 FILE_TYPE_REMOTE
= 0x8000
1347 GetConsoleMode
= ctypes
.WINFUNCTYPE(
1348 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
1349 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
1350 (b
'GetConsoleMode', ctypes
.windll
.kernel32
))
1351 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
1353 def not_a_console(handle
):
1354 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
1356 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
or
1357 GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
1359 if not_a_console(h
):
1362 def next_nonbmp_pos(s
):
1364 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
1365 except StopIteration:
1369 count
= min(next_nonbmp_pos(s
), 1024)
1371 ret
= WriteConsoleW(
1372 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
1374 raise OSError('Failed to write string')
1375 if not count
: # We just wrote a non-BMP character
1376 assert written
.value
== 2
1379 assert written
.value
> 0
1380 s
= s
[written
.value
:]
1384 def write_string(s
, out
=None, encoding
=None):
1387 assert type(s
) == compat_str
1389 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
1390 if _windows_write_string(s
, out
):
1393 if ('b' in getattr(out
, 'mode', '') or
1394 sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
1395 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
1397 elif hasattr(out
, 'buffer'):
1398 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
1399 byt
= s
.encode(enc
, 'ignore')
1400 out
.buffer.write(byt
)
1406 def bytes_to_intlist(bs
):
1409 if isinstance(bs
[0], int): # Python 3
1412 return [ord(c
) for c
in bs
]
1415 def intlist_to_bytes(xs
):
1418 return compat_struct_pack('%dB' % len(xs
), *xs
)
1421 # Cross-platform file locking
1422 if sys
.platform
== 'win32':
1423 import ctypes
.wintypes
1426 class OVERLAPPED(ctypes
.Structure
):
1428 ('Internal', ctypes
.wintypes
.LPVOID
),
1429 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
1430 ('Offset', ctypes
.wintypes
.DWORD
),
1431 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
1432 ('hEvent', ctypes
.wintypes
.HANDLE
),
1435 kernel32
= ctypes
.windll
.kernel32
1436 LockFileEx
= kernel32
.LockFileEx
1437 LockFileEx
.argtypes
= [
1438 ctypes
.wintypes
.HANDLE
, # hFile
1439 ctypes
.wintypes
.DWORD
, # dwFlags
1440 ctypes
.wintypes
.DWORD
, # dwReserved
1441 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1442 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1443 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1445 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
1446 UnlockFileEx
= kernel32
.UnlockFileEx
1447 UnlockFileEx
.argtypes
= [
1448 ctypes
.wintypes
.HANDLE
, # hFile
1449 ctypes
.wintypes
.DWORD
, # dwReserved
1450 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1451 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1452 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1454 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
1455 whole_low
= 0xffffffff
1456 whole_high
= 0x7fffffff
1458 def _lock_file(f
, exclusive
):
1459 overlapped
= OVERLAPPED()
1460 overlapped
.Offset
= 0
1461 overlapped
.OffsetHigh
= 0
1462 overlapped
.hEvent
= 0
1463 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
1464 handle
= msvcrt
.get_osfhandle(f
.fileno())
1465 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
1466 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1467 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
1469 def _unlock_file(f
):
1470 assert f
._lock
_file
_overlapped
_p
1471 handle
= msvcrt
.get_osfhandle(f
.fileno())
1472 if not UnlockFileEx(handle
, 0,
1473 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1474 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
1477 # Some platforms, such as Jython, is missing fcntl
1481 def _lock_file(f
, exclusive
):
1482 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
1484 def _unlock_file(f
):
1485 fcntl
.flock(f
, fcntl
.LOCK_UN
)
1487 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
1489 def _lock_file(f
, exclusive
):
1490 raise IOError(UNSUPPORTED_MSG
)
1492 def _unlock_file(f
):
1493 raise IOError(UNSUPPORTED_MSG
)
1496 class locked_file(object):
1497 def __init__(self
, filename
, mode
, encoding
=None):
1498 assert mode
in ['r', 'a', 'w']
1499 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
1502 def __enter__(self
):
1503 exclusive
= self
.mode
!= 'r'
1505 _lock_file(self
.f
, exclusive
)
1511 def __exit__(self
, etype
, value
, traceback
):
1513 _unlock_file(self
.f
)
1520 def write(self
, *args
):
1521 return self
.f
.write(*args
)
1523 def read(self
, *args
):
1524 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8'.

    sys.getfilesystemencoding() may return None on some Python 2
    setups; normalize that to a usable codec name.
    """
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1532 def shell_quote(args
):
1534 encoding
= get_filesystem_encoding()
1536 if isinstance(a
, bytes):
1537 # We may get a filename encoded with 'encodeFilename'
1538 a
= a
.decode(encoding
)
1539 quoted_args
.append(compat_shlex_quote(a
))
1540 return ' '.join(quoted_args
)
1543 def smuggle_url(url
, data
):
1544 """ Pass additional data in a URL for internal use. """
1546 url
, idata
= unsmuggle_url(url
, {})
1548 sdata
= compat_urllib_parse_urlencode(
1549 {'__youtubedl_smuggle': json
.dumps(data
)})
1550 return url
+ '#' + sdata
1553 def unsmuggle_url(smug_url
, default
=None):
1554 if '#__youtubedl_smuggle' not in smug_url
:
1555 return smug_url
, default
1556 url
, _
, sdata
= smug_url
.rpartition('#')
1557 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
1558 data
= json
.loads(jsond
)
1562 def format_bytes(bytes):
1565 if type(bytes) is str:
1566 bytes = float(bytes)
1570 exponent
= int(math
.log(bytes, 1024.0))
1571 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
1572 converted
= float(bytes) / float(1024 ** exponent
)
1573 return '%.2f%s' % (converted
, suffix
)
1576 def lookup_unit_table(unit_table
, s
):
1577 units_re
= '|'.join(re
.escape(u
) for u
in unit_table
)
1579 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
, s
)
1582 num_str
= m
.group('num').replace(',', '.')
1583 mult
= unit_table
[m
.group('unit')]
1584 return int(float(num_str
) * mult
)
1587 def parse_filesize(s
):
1591 # The lower-case forms are of course incorrect and unofficial,
1592 # but we support those too
1609 'megabytes': 1000 ** 2,
1610 'mebibytes': 1024 ** 2,
1616 'gigabytes': 1000 ** 3,
1617 'gibibytes': 1024 ** 3,
1623 'terabytes': 1000 ** 4,
1624 'tebibytes': 1024 ** 4,
1630 'petabytes': 1000 ** 5,
1631 'pebibytes': 1024 ** 5,
1637 'exabytes': 1000 ** 6,
1638 'exbibytes': 1024 ** 6,
1644 'zettabytes': 1000 ** 7,
1645 'zebibytes': 1024 ** 7,
1651 'yottabytes': 1000 ** 8,
1652 'yobibytes': 1024 ** 8,
1655 return lookup_unit_table(_UNIT_TABLE
, s
)
1664 if re
.match(r
'^[\d,.]+$', s
):
1665 return str_to_int(s
)
1676 return lookup_unit_table(_UNIT_TABLE
, s
)
1679 def month_by_name(name
, lang
='en'):
1680 """ Return the number of a month by (locale-independently) English name """
1682 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
1685 return month_names
.index(name
) + 1
1690 def month_by_abbreviation(abbrev
):
1691 """ Return the number of a month by (locale-independently) English
1695 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
1700 def fix_xml_ampersands(xml_str
):
1701 """Replace all the '&' by '&' in XML"""
1703 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1708 def setproctitle(title
):
1709 assert isinstance(title
, compat_str
)
1711 # ctypes in Jython is not complete
1712 # http://bugs.jython.org/issue2148
1713 if sys
.platform
.startswith('java'):
1717 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
1721 # LoadLibrary in Windows Python 2.7.13 only expects
1722 # a bytestring, but since unicode_literals turns
1723 # every string into a unicode string, it fails.
1725 title_bytes
= title
.encode('utf-8')
1726 buf
= ctypes
.create_string_buffer(len(title_bytes
))
1727 buf
.value
= title_bytes
1729 libc
.prctl(15, buf
, 0, 0, 0)
1730 except AttributeError:
1731 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix start removed.

    Passes s through unchanged when it is None or does not begin
    with start.
    """
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Return s with the suffix end removed.

    Passes s through unchanged when it is None, does not end with
    end, or when end is empty.

    Fix: the original returned s[:-len(end)], which for an empty
    suffix is s[:-0] == s[:0] == '' (since -0 == 0), silently
    truncating the whole string; an empty suffix is now a no-op.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1742 def remove_quotes(s
):
1743 if s
is None or len(s
) < 2:
1745 for quote
in ('"', "'", ):
1746 if s
[0] == quote
and s
[-1] == quote
:
def url_basename(url):
    """Return the last path component of url.

    Query string and fragment are ignored; leading/trailing slashes
    on the path are stripped before splitting.
    """
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
1757 return re
.match(r
'https?://[^?#&]+/', url
).group()
1760 def urljoin(base
, path
):
1761 if isinstance(path
, bytes):
1762 path
= path
.decode('utf-8')
1763 if not isinstance(path
, compat_str
) or not path
:
1765 if re
.match(r
'^(?:https?:)?//', path
):
1767 if isinstance(base
, bytes):
1768 base
= base
.decode('utf-8')
1769 if not isinstance(base
, compat_str
) or not re
.match(
1770 r
'^(?:https?:)?//', base
):
1772 return compat_urlparse
.urljoin(base
, path
)
1775 class HEADRequest(compat_urllib_request
.Request
):
1776 def get_method(self
):
1780 class PUTRequest(compat_urllib_request
.Request
):
1781 def get_method(self
):
1785 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
1788 v
= getattr(v
, get_attr
, None)
1794 return int(v
) * invscale
// scale
def str_or_none(v, default=None):
    """Coerce v to compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1803 def str_to_int(int_str
):
1804 """ A more relaxed version of int_or_none """
1807 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
1811 def float_or_none(v
, scale
=1, invscale
=1, default
=None):
1815 return float(v
) * invscale
/ scale
def bool_or_none(v, default=None):
    """Return v if it is a genuine bool, otherwise default.

    Note that truthy non-bool values (e.g. 1, 'yes') are NOT
    converted; they fall through to default.
    """
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v):
    """Return v.strip(), or None when v is None."""
    if v is None:
        return None
    return v.strip()
1828 def parse_duration(s
):
1829 if not isinstance(s
, compat_basestring
):
1834 days
, hours
, mins
, secs
, ms
= [None] * 5
1835 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
1837 days
, hours
, mins
, secs
, ms
= m
.groups()
1842 [0-9]+\s*y(?:ears?)?\s*
1845 [0-9]+\s*m(?:onths?)?\s*
1848 [0-9]+\s*w(?:eeks?)?\s*
1851 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1855 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1858 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1861 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1864 days
, hours
, mins
, secs
, ms
= m
.groups()
1866 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
1868 hours
, mins
= m
.groups()
1874 duration
+= float(secs
)
1876 duration
+= float(mins
) * 60
1878 duration
+= float(hours
) * 60 * 60
1880 duration
+= float(days
) * 24 * 60 * 60
1882 duration
+= float(ms
)
1886 def prepend_extension(filename
, ext
, expected_real_ext
=None):
1887 name
, real_ext
= os
.path
.splitext(filename
)
1889 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
1890 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
1891 else '{0}.{1}'.format(filename
, ext
))
1894 def replace_extension(filename
, ext
, expected_real_ext
=None):
1895 name
, real_ext
= os
.path
.splitext(filename
)
1896 return '{0}.{1}'.format(
1897 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
1901 def check_executable(exe
, args
=[]):
1902 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1903 args can be a list of arguments for a short output (like -version) """
1905 subprocess
.Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate()
1911 def get_exe_version(exe
, args
=['--version'],
1912 version_re
=None, unrecognized
='present'):
1913 """ Returns the version of the specified executable,
1914 or False if the executable is not present """
1916 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1917 # SIGTTOU if youtube-dl is run in the background.
1918 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1919 out
, _
= subprocess
.Popen(
1920 [encodeArgument(exe
)] + args
,
1921 stdin
=subprocess
.PIPE
,
1922 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate()
1925 if isinstance(out
, bytes): # Python 2.x
1926 out
= out
.decode('ascii', 'ignore')
1927 return detect_exe_version(out
, version_re
, unrecognized
)
1930 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
1931 assert isinstance(output
, compat_str
)
1932 if version_re
is None:
1933 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
1934 m
= re
.search(version_re
, output
)
1941 class PagedList(object):
1943 # This is only useful for tests
1944 return len(self
.getslice())
1947 class OnDemandPagedList(PagedList
):
1948 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
1949 self
._pagefunc
= pagefunc
1950 self
._pagesize
= pagesize
1951 self
._use
_cache
= use_cache
1955 def getslice(self
, start
=0, end
=None):
1957 for pagenum
in itertools
.count(start
// self
._pagesize
):
1958 firstid
= pagenum
* self
._pagesize
1959 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
1960 if start
>= nextfirstid
:
1965 page_results
= self
._cache
.get(pagenum
)
1966 if page_results
is None:
1967 page_results
= list(self
._pagefunc
(pagenum
))
1969 self
._cache
[pagenum
] = page_results
1972 start
% self
._pagesize
1973 if firstid
<= start
< nextfirstid
1977 ((end
- 1) % self
._pagesize
) + 1
1978 if (end
is not None and firstid
<= end
<= nextfirstid
)
1981 if startv
!= 0 or endv
is not None:
1982 page_results
= page_results
[startv
:endv
]
1983 res
.extend(page_results
)
1985 # A little optimization - if current page is not "full", ie. does
1986 # not contain page_size videos then we can assume that this page
1987 # is the last one - there are no more ids on further pages -
1988 # i.e. no need to query again.
1989 if len(page_results
) + startv
< self
._pagesize
:
1992 # If we got the whole page, but the next page is not interesting,
1993 # break out early as well
1994 if end
== nextfirstid
:
1999 class InAdvancePagedList(PagedList
):
2000 def __init__(self
, pagefunc
, pagecount
, pagesize
):
2001 self
._pagefunc
= pagefunc
2002 self
._pagecount
= pagecount
2003 self
._pagesize
= pagesize
2005 def getslice(self
, start
=0, end
=None):
2007 start_page
= start
// self
._pagesize
2009 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
2010 skip_elems
= start
- start_page
* self
._pagesize
2011 only_more
= None if end
is None else end
- start
2012 for pagenum
in range(start_page
, end_page
):
2013 page
= list(self
._pagefunc
(pagenum
))
2015 page
= page
[skip_elems
:]
2017 if only_more
is not None:
2018 if len(page
) < only_more
:
2019 only_more
-= len(page
)
2021 page
= page
[:only_more
]
2028 def uppercase_escape(s
):
2029 unicode_escape
= codecs
.getdecoder('unicode_escape')
2031 r
'\\U[0-9a-fA-F]{8}',
2032 lambda m
: unicode_escape(m
.group(0))[0],
2036 def lowercase_escape(s
):
2037 unicode_escape
= codecs
.getdecoder('unicode_escape')
2039 r
'\\u[0-9a-fA-F]{4}',
2040 lambda m
: unicode_escape(m
.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986."""
    # On Python 2, quote() needs a byte string, so encode unicode
    # input first; Python 3 handles str directly.
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2051 def escape_url(url
):
2052 """Escape URL as suggested by RFC 3986"""
2053 url_parsed
= compat_urllib_parse_urlparse(url
)
2054 return url_parsed
._replace
(
2055 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
2056 path
=escape_rfc3986(url_parsed
.path
),
2057 params
=escape_rfc3986(url_parsed
.params
),
2058 query
=escape_rfc3986(url_parsed
.query
),
2059 fragment
=escape_rfc3986(url_parsed
.fragment
)
2063 def read_batch_urls(batch_fd
):
2065 if not isinstance(url
, compat_str
):
2066 url
= url
.decode('utf-8', 'replace')
2067 BOM_UTF8
= '\xef\xbb\xbf'
2068 if url
.startswith(BOM_UTF8
):
2069 url
= url
[len(BOM_UTF8
):]
2071 if url
.startswith(('#', ';', ']')):
2075 with contextlib
.closing(batch_fd
) as fd
:
2076 return [url
for url
in map(fixup
, fd
) if url
]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2083 def update_url_query(url
, query
):
2086 parsed_url
= compat_urlparse
.urlparse(url
)
2087 qs
= compat_parse_qs(parsed_url
.query
)
2089 return compat_urlparse
.urlunparse(parsed_url
._replace
(
2090 query
=compat_urllib_parse_urlencode(qs
, True)))
2093 def update_Request(req
, url
=None, data
=None, headers
={}, query
={}):
2094 req_headers
= req
.headers
.copy()
2095 req_headers
.update(headers
)
2096 req_data
= data
or req
.data
2097 req_url
= update_url_query(url
or req
.get_full_url(), query
)
2098 req_get_method
= req
.get_method()
2099 if req_get_method
== 'HEAD':
2100 req_type
= HEADRequest
2101 elif req_get_method
== 'PUT':
2102 req_type
= PUTRequest
2104 req_type
= compat_urllib_request
.Request
2106 req_url
, data
=req_data
, headers
=req_headers
,
2107 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
2108 if hasattr(req
, 'timeout'):
2109 new_req
.timeout
= req
.timeout
2113 def _multipart_encode_impl(data
, boundary
):
2114 content_type
= 'multipart/form-data; boundary=%s' % boundary
2117 for k
, v
in data
.items():
2118 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
2119 if isinstance(k
, compat_str
):
2120 k
= k
.encode('utf-8')
2121 if isinstance(v
, compat_str
):
2122 v
= v
.encode('utf-8')
2123 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2124 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2125 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
2126 if boundary
.encode('ascii') in content
:
2127 raise ValueError('Boundary overlaps with data')
2130 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
2132 return out
, content_type
2135 def multipart_encode(data
, boundary
=None):
2137 Encode a dict to RFC 7578-compliant form-data
2140 A dict where keys and values can be either Unicode or bytes-like
2143 If specified a Unicode object, it's used as the boundary. Otherwise
2144 a random boundary is generated.
2146 Reference: https://tools.ietf.org/html/rfc7578
2148 has_specified_boundary
= boundary
is not None
2151 if boundary
is None:
2152 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
2155 out
, content_type
= _multipart_encode_impl(data
, boundary
)
2158 if has_specified_boundary
:
2162 return out
, content_type
2165 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
2166 if isinstance(key_or_keys
, (list, tuple)):
2167 for key
in key_or_keys
:
2168 if key
not in d
or d
[key
] is None or skip_false_values
and not d
[key
]:
2172 return d
.get(key_or_keys
, default
)
2175 def try_get(src
, getter
, expected_type
=None):
2176 if not isinstance(getter
, (list, tuple)):
2181 except (AttributeError, KeyError, TypeError, IndexError):
2184 if expected_type
is None or isinstance(v
, expected_type
):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as compat_str, decoding byte strings.

    Already-unicode input is returned untouched; anything else is
    passed to compat_str with the given encoding and error handler.
    NOTE: the encoding default is evaluated once at import time
    (kept as-is for interface compatibility).
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2201 TV_PARENTAL_GUIDELINES
= {
2211 def parse_age_limit(s
):
2213 return s
if 0 <= s
<= 21 else None
2214 if not isinstance(s
, compat_basestring
):
2216 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
2218 return int(m
.group('age'))
2220 return US_RATINGS
[s
]
2221 return TV_PARENTAL_GUIDELINES
.get(s
)
2224 def strip_jsonp(code
):
2227 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
2228 (?:\s*&&\s*(?P=func_name))?
2229 \s*\(\s*(?P<callback_data>.*)\);?
2230 \s*?(?://[^\n]*)*$''',
2231 r
'\g<callback_data>', code
)
2234 def js_to_json(code
):
2235 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2236 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
2238 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
2239 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
2244 if v
in ('true', 'false', 'null'):
2246 elif v
.startswith('/*') or v
.startswith('//') or v
== ',':
2249 if v
[0] in ("'", '"'):
2250 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
2255 }.get(m
.group(0), m
.group(0)), v
[1:-1])
2257 for regex
, base
in INTEGER_TABLE
:
2258 im
= re
.match(regex
, v
)
2260 i
= int(im
.group(1), base
)
2261 return '"%d":' % i
if v
.endswith(':') else '%d' % i
2265 return re
.sub(r
'''(?sx)
2266 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2267 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2268 {comment}|,(?={skip}[\]}}])|
2269 [a-zA-Z_][.a-zA-Z_0-9]*|
2270 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2272 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
2275 def qualities(quality_ids
):
2276 """ Get a numeric quality value out of a list of possible values """
2279 return quality_ids
.index(qid
)
2285 DEFAULT_OUTTMPL
= '%(title)s-%(id)s.%(ext)s'
2288 def limit_length(s
, length
):
2289 """ Add ellipses to overly long strings """
2294 return s
[:length
- len(ELLIPSES
)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' or '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
, limit
, assume_new
=True):
2304 return not assume_new
2306 return version_tuple(version
) < version_tuple(limit
)
2308 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when running from a zip bundle or a frozen binary.
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)
def error_to_compat_str(err):
    """Stringify an exception in a way that is safe on both Python 2 and 3."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to a conventional file extension.

    NOTE(review): parts of the lookup tables were reconstructed — confirm the
    exact entry set against the canonical file.
    """
    if mt is None:
        return None

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    # Otherwise key on the (lowercased) subtype, ignoring any ';' parameters.
    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }.get(res, res)
def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # Classify by the first dotted component (e.g. 'avc1' of 'avc1.64001f').
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Nothing recognized — fall back on positional convention.
        # NOTE(review): this fallback was reconstructed from a partial view;
        # confirm against the canonical file.
        if len(splited_codecs) == 2:
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            return {
                'vcodec': 'none',
                'acodec': splited_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess the file extension for a response: Content-Disposition filename
    first, then the Content-Type MIME mapping."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI carrying `data` base64-encoded."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    # Blocked when the viewer's limit is below the content's requirement.
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Honour a byte-order mark when present; longer BOMs are listed first so
    # UTF-32 is not misread as UTF-16.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Derive the download protocol for a format dict: explicit 'protocol'
    wins, then the URL scheme/extension heuristics."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell per column determines that column's field width.
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    # Left-align every column but let the last one run free.
    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
2483 def _match_one(filter_part
, dct
):
2484 COMPARISON_OPERATORS
= {
2492 operator_rex
= re
.compile(r
'''(?x)\s*
2494 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2496 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2497 (?P<quote>["\'])(?P
<quotedstrval
>(?
:\\.|
(?
!(?P
=quote
)|
\\).)+?
)(?P
=quote
)|
2498 (?P
<strval
>(?
![0-9.])[a
-z0
-9A
-Z
]*)
2501 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2502 m = operator_rex.search(filter_part)
2504 op = COMPARISON_OPERATORS[m.group('op')]
2505 actual_value = dct.get(m.group('key'))
2506 if (m.group('quotedstrval') is not None or
2507 m.group('strval') is not None or
2508 # If the original field is a string and matching comparisonvalue is
2509 # a number we should respect the origin of the original field
2510 # and process comparison value as a string (see
2511 # https://github.com/rg3/youtube-dl/issues/11082).
2512 actual_value is not None and m.group('intval') is not None and
2513 isinstance(actual_value, compat_str)):
2514 if m.group('op') not in ('=', '!='):
2516 'Operator %s does not support string values!' % m.group('op'))
2517 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2518 quote = m.group('quote')
2519 if quote is not None:
2520 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
2523 comparison_value = int(m.group('intval'))
2525 comparison_value = parse_filesize(m.group('intval'))
2526 if comparison_value is None:
2527 comparison_value = parse_filesize(m.group('intval') + 'B')
2528 if comparison_value is None:
2530 'Invalid integer value %r in filter part %r' % (
2531 m.group('intval'), filter_part))
2532 if actual_value is None:
2533 return m.group('none_inclusive')
2534 return op(actual_value, comparison_value)
2537 '': lambda v: v is not None,
2538 '!': lambda v: v is None,
2540 operator_rex = re.compile(r'''(?x
)\s
*
2541 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
2543 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2544 m = operator_rex.search(filter_part)
2546 op = UNARY_OPERATORS[m.group('op')]
2547 actual_value = dct.get(m.group('key'))
2548 return op(actual_value)
2550 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-expressions conjunctively.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Wrap `filter_str` into a callable: returns None when the video passes,
    otherwise a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML time expression ('12.3s' or 'HH:MM:SS[.frames]') to seconds;
    None for empty/unrecognized input."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # A ':' before the fraction is treated like a decimal point.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Older TTML namespaces are normalized to the current ones up front.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip styling already applied by an enclosing element.
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-feed the serialized node through an event parser to produce
        # SRT-flavoured markup.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; repeat while a parent style is defined later
    # than its child. NOTE(review): loop structure reconstructed from a
    # partial view — confirm against the canonical file.
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id')
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Turn a params entry into ['--opt', value], or [] when unset."""
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean params entry as CLI arguments.

    With `separator` the result is a single '--opt<sep>value' token, otherwise
    two tokens ['--opt', value].
    """
    param = params.get(param)
    assert isinstance(param, bool)
    rendered = true_value if param else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare flag when the params entry equals `expected_value`."""
    param = params.get(param)
    return [command_option] if param == expected_value else []
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under `param`, or `default`.

    NOTE: `default=[]` is kept for interface compatibility; the list is never
    mutated here.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
2778 class ISO639Utils(object):
2779 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2968 def short2long(cls, code):
2969 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2970 return cls._lang_map.get(code[:2])
2973 def long2short(cls, code):
2974 """Convert language code from ISO 639-2/T to ISO 639-1"""
2975 for short_name, long_name in cls._lang_map.items():
2976 if long_name == code:
2980 class ISO3166Utils(object):
2981 # From http://data.okfn.org/data/core/country-list
2983 'AF': 'Afghanistan',
2984 'AX': 'Åland Islands',
2987 'AS': 'American Samoa',
2992 'AG': 'Antigua and Barbuda',
3009 'BO': 'Bolivia, Plurinational State of',
3010 'BQ': 'Bonaire, Sint Eustatius and Saba',
3011 'BA': 'Bosnia and Herzegovina',
3013 'BV': 'Bouvet Island',
3015 'IO': 'British Indian Ocean Territory',
3016 'BN': 'Brunei Darussalam',
3018 'BF': 'Burkina Faso',
3024 'KY': 'Cayman Islands',
3025 'CF': 'Central African Republic',
3029 'CX': 'Christmas Island',
3030 'CC': 'Cocos (Keeling) Islands',
3034 'CD': 'Congo, the Democratic Republic of the',
3035 'CK': 'Cook Islands',
3037 'CI': 'Côte d\'Ivoire',
3042 'CZ': 'Czech Republic',
3046 'DO': 'Dominican Republic',
3049 'SV': 'El Salvador',
3050 'GQ': 'Equatorial Guinea',
3054 'FK': 'Falkland Islands (Malvinas)',
3055 'FO': 'Faroe Islands',
3059 'GF': 'French Guiana',
3060 'PF': 'French Polynesia',
3061 'TF': 'French Southern Territories',
3076 'GW': 'Guinea-Bissau',
3079 'HM': 'Heard Island and McDonald Islands',
3080 'VA': 'Holy See (Vatican City State)',
3087 'IR': 'Iran, Islamic Republic of',
3090 'IM': 'Isle of Man',
3100 'KP': 'Korea, Democratic People\'s Republic of',
3101 'KR': 'Korea, Republic of',
3104 'LA': 'Lao People\'s Democratic Republic',
3110 'LI': 'Liechtenstein',
3114 'MK': 'Macedonia, the Former Yugoslav Republic of',
3121 'MH': 'Marshall Islands',
3127 'FM': 'Micronesia, Federated States of',
3128 'MD': 'Moldova, Republic of',
3139 'NL': 'Netherlands',
3140 'NC': 'New Caledonia',
3141 'NZ': 'New Zealand',
3146 'NF': 'Norfolk Island',
3147 'MP': 'Northern Mariana Islands',
3152 'PS': 'Palestine, State of',
3154 'PG': 'Papua New Guinea',
3157 'PH': 'Philippines',
3161 'PR': 'Puerto Rico',
3165 'RU': 'Russian Federation',
3167 'BL': 'Saint Barthélemy',
3168 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3169 'KN': 'Saint Kitts and Nevis',
3170 'LC': 'Saint Lucia',
3171 'MF': 'Saint Martin (French part)',
3172 'PM': 'Saint Pierre and Miquelon',
3173 'VC': 'Saint Vincent and the Grenadines',
3176 'ST': 'Sao Tome and Principe',
3177 'SA': 'Saudi Arabia',
3181 'SL': 'Sierra Leone',
3183 'SX': 'Sint Maarten (Dutch part)',
3186 'SB': 'Solomon Islands',
3188 'ZA': 'South Africa',
3189 'GS': 'South Georgia and the South Sandwich Islands',
3190 'SS': 'South Sudan',
3195 'SJ': 'Svalbard and Jan Mayen',
3198 'CH': 'Switzerland',
3199 'SY': 'Syrian Arab Republic',
3200 'TW': 'Taiwan, Province of China',
3202 'TZ': 'Tanzania, United Republic of',
3204 'TL': 'Timor-Leste',
3208 'TT': 'Trinidad and Tobago',
3211 'TM': 'Turkmenistan',
3212 'TC': 'Turks and Caicos Islands',
3216 'AE': 'United Arab Emirates',
3217 'GB': 'United Kingdom',
3218 'US': 'United States',
3219 'UM': 'United States Minor Outlying Islands',
3223 'VE': 'Venezuela, Bolivarian Republic of',
3225 'VG': 'Virgin Islands, British',
3226 'VI': 'Virgin Islands, U.S.',
3227 'WF': 'Wallis and Futuna',
3228 'EH': 'Western Sahara',
3235 def short2full(cls, code):
3236 """Convert an ISO 3166-2 country code to the corresponding full name"""
3237 return cls._country_map.get(code.upper())
3240 class GeoUtils(object):
3241 # Major IPv4 address blocks per country
3243 'AD': '85.94.160.0/19',
3244 'AE': '94.200.0.0/13',
3245 'AF': '149.54.0.0/17',
3246 'AG': '209.59.64.0/18',
3247 'AI': '204.14.248.0/21',
3248 'AL': '46.99.0.0/16',
3249 'AM': '46.70.0.0/15',
3250 'AO': '105.168.0.0/13',
3251 'AP': '159.117.192.0/21',
3252 'AR': '181.0.0.0/12',
3253 'AS': '202.70.112.0/20',
3254 'AT': '84.112.0.0/13',
3255 'AU': '1.128.0.0/11',
3256 'AW': '181.41.0.0/18',
3257 'AZ': '5.191.0.0/16',
3258 'BA': '31.176.128.0/17',
3259 'BB': '65.48.128.0/17',
3260 'BD': '114.130.0.0/16',
3262 'BF': '129.45.128.0/17',
3263 'BG': '95.42.0.0/15',
3264 'BH': '37.131.0.0/17',
3265 'BI': '154.117.192.0/18',
3266 'BJ': '137.255.0.0/16',
3267 'BL': '192.131.134.0/24',
3268 'BM': '196.12.64.0/18',
3269 'BN': '156.31.0.0/16',
3270 'BO': '161.56.0.0/16',
3271 'BQ': '161.0.80.0/20',
3272 'BR': '152.240.0.0/12',
3273 'BS': '24.51.64.0/18',
3274 'BT': '119.2.96.0/19',
3275 'BW': '168.167.0.0/16',
3276 'BY': '178.120.0.0/13',
3277 'BZ': '179.42.192.0/18',
3278 'CA': '99.224.0.0/11',
3279 'CD': '41.243.0.0/16',
3280 'CF': '196.32.200.0/21',
3281 'CG': '197.214.128.0/17',
3282 'CH': '85.0.0.0/13',
3283 'CI': '154.232.0.0/14',
3284 'CK': '202.65.32.0/19',
3285 'CL': '152.172.0.0/14',
3286 'CM': '165.210.0.0/15',
3287 'CN': '36.128.0.0/10',
3288 'CO': '181.240.0.0/12',
3289 'CR': '201.192.0.0/12',
3290 'CU': '152.206.0.0/15',
3291 'CV': '165.90.96.0/19',
3292 'CW': '190.88.128.0/17',
3293 'CY': '46.198.0.0/15',
3294 'CZ': '88.100.0.0/14',
3296 'DJ': '197.241.0.0/17',
3297 'DK': '87.48.0.0/12',
3298 'DM': '192.243.48.0/20',
3299 'DO': '152.166.0.0/15',
3300 'DZ': '41.96.0.0/12',
3301 'EC': '186.68.0.0/15',
3302 'EE': '90.190.0.0/15',
3303 'EG': '156.160.0.0/11',
3304 'ER': '196.200.96.0/20',
3305 'ES': '88.0.0.0/11',
3306 'ET': '196.188.0.0/14',
3307 'EU': '2.16.0.0/13',
3308 'FI': '91.152.0.0/13',
3309 'FJ': '144.120.0.0/16',
3310 'FM': '119.252.112.0/20',
3311 'FO': '88.85.32.0/19',
3313 'GA': '41.158.0.0/15',
3315 'GD': '74.122.88.0/21',
3316 'GE': '31.146.0.0/16',
3317 'GF': '161.22.64.0/18',
3318 'GG': '62.68.160.0/19',
3319 'GH': '45.208.0.0/14',
3320 'GI': '85.115.128.0/19',
3321 'GL': '88.83.0.0/19',
3322 'GM': '160.182.0.0/15',
3323 'GN': '197.149.192.0/18',
3324 'GP': '104.250.0.0/19',
3325 'GQ': '105.235.224.0/20',
3326 'GR': '94.64.0.0/13',
3327 'GT': '168.234.0.0/16',
3328 'GU': '168.123.0.0/16',
3329 'GW': '197.214.80.0/20',
3330 'GY': '181.41.64.0/18',
3331 'HK': '113.252.0.0/14',
3332 'HN': '181.210.0.0/16',
3333 'HR': '93.136.0.0/13',
3334 'HT': '148.102.128.0/17',
3335 'HU': '84.0.0.0/14',
3336 'ID': '39.192.0.0/10',
3337 'IE': '87.32.0.0/12',
3338 'IL': '79.176.0.0/13',
3339 'IM': '5.62.80.0/20',
3340 'IN': '117.192.0.0/10',
3341 'IO': '203.83.48.0/21',
3342 'IQ': '37.236.0.0/14',
3343 'IR': '2.176.0.0/12',
3344 'IS': '82.221.0.0/16',
3345 'IT': '79.0.0.0/10',
3346 'JE': '87.244.64.0/18',
3347 'JM': '72.27.0.0/17',
3348 'JO': '176.29.0.0/16',
3349 'JP': '126.0.0.0/8',
3350 'KE': '105.48.0.0/12',
3351 'KG': '158.181.128.0/17',
3352 'KH': '36.37.128.0/17',
3353 'KI': '103.25.140.0/22',
3354 'KM': '197.255.224.0/20',
3355 'KN': '198.32.32.0/19',
3356 'KP': '175.45.176.0/22',
3357 'KR': '175.192.0.0/10',
3358 'KW': '37.36.0.0/14',
3359 'KY': '64.96.0.0/15',
3360 'KZ': '2.72.0.0/13',
3361 'LA': '115.84.64.0/18',
3362 'LB': '178.135.0.0/16',
3363 'LC': '192.147.231.0/24',
3364 'LI': '82.117.0.0/19',
3365 'LK': '112.134.0.0/15',
3366 'LR': '41.86.0.0/19',
3367 'LS': '129.232.0.0/17',
3368 'LT': '78.56.0.0/13',
3369 'LU': '188.42.0.0/16',
3370 'LV': '46.109.0.0/16',
3371 'LY': '41.252.0.0/14',
3372 'MA': '105.128.0.0/11',
3373 'MC': '88.209.64.0/18',
3374 'MD': '37.246.0.0/16',
3375 'ME': '178.175.0.0/17',
3376 'MF': '74.112.232.0/21',
3377 'MG': '154.126.0.0/17',
3378 'MH': '117.103.88.0/21',
3379 'MK': '77.28.0.0/15',
3380 'ML': '154.118.128.0/18',
3381 'MM': '37.111.0.0/17',
3382 'MN': '49.0.128.0/17',
3383 'MO': '60.246.0.0/16',
3384 'MP': '202.88.64.0/20',
3385 'MQ': '109.203.224.0/19',
3386 'MR': '41.188.64.0/18',
3387 'MS': '208.90.112.0/22',
3388 'MT': '46.11.0.0/16',
3389 'MU': '105.16.0.0/12',
3390 'MV': '27.114.128.0/18',
3391 'MW': '105.234.0.0/16',
3392 'MX': '187.192.0.0/11',
3393 'MY': '175.136.0.0/13',
3394 'MZ': '197.218.0.0/15',
3395 'NA': '41.182.0.0/16',
3396 'NC': '101.101.0.0/18',
3397 'NE': '197.214.0.0/18',
3398 'NF': '203.17.240.0/22',
3399 'NG': '105.112.0.0/12',
3400 'NI': '186.76.0.0/15',
3401 'NL': '145.96.0.0/11',
3402 'NO': '84.208.0.0/13',
3403 'NP': '36.252.0.0/15',
3404 'NR': '203.98.224.0/19',
3405 'NU': '49.156.48.0/22',
3406 'NZ': '49.224.0.0/14',
3407 'OM': '5.36.0.0/15',
3408 'PA': '186.72.0.0/15',
3409 'PE': '186.160.0.0/14',
3410 'PF': '123.50.64.0/18',
3411 'PG': '124.240.192.0/19',
3412 'PH': '49.144.0.0/13',
3413 'PK': '39.32.0.0/11',
3414 'PL': '83.0.0.0/11',
3415 'PM': '70.36.0.0/20',
3416 'PR': '66.50.0.0/16',
3417 'PS': '188.161.0.0/16',
3418 'PT': '85.240.0.0/13',
3419 'PW': '202.124.224.0/20',
3420 'PY': '181.120.0.0/14',
3421 'QA': '37.210.0.0/15',
3422 'RE': '139.26.0.0/16',
3423 'RO': '79.112.0.0/13',
3424 'RS': '178.220.0.0/14',
3425 'RU': '5.136.0.0/13',
3426 'RW': '105.178.0.0/15',
3427 'SA': '188.48.0.0/13',
3428 'SB': '202.1.160.0/19',
3429 'SC': '154.192.0.0/11',
3430 'SD': '154.96.0.0/13',
3431 'SE': '78.64.0.0/12',
3432 'SG': '152.56.0.0/14',
3433 'SI': '188.196.0.0/14',
3434 'SK': '78.98.0.0/15',
3435 'SL': '197.215.0.0/17',
3436 'SM': '89.186.32.0/19',
3437 'SN': '41.82.0.0/15',
3438 'SO': '197.220.64.0/19',
3439 'SR': '186.179.128.0/17',
3440 'SS': '105.235.208.0/21',
3441 'ST': '197.159.160.0/19',
3442 'SV': '168.243.0.0/16',
3443 'SX': '190.102.0.0/20',
3445 'SZ': '41.84.224.0/19',
3446 'TC': '65.255.48.0/20',
3447 'TD': '154.68.128.0/19',
3448 'TG': '196.168.0.0/14',
3449 'TH': '171.96.0.0/13',
3450 'TJ': '85.9.128.0/18',
3451 'TK': '27.96.24.0/21',
3452 'TL': '180.189.160.0/20',
3453 'TM': '95.85.96.0/19',
3454 'TN': '197.0.0.0/11',
3455 'TO': '175.176.144.0/21',
3456 'TR': '78.160.0.0/11',
3457 'TT': '186.44.0.0/15',
3458 'TV': '202.2.96.0/19',
3459 'TW': '120.96.0.0/11',
3460 'TZ': '156.156.0.0/14',
3461 'UA': '93.72.0.0/13',
3462 'UG': '154.224.0.0/13',
3464 'UY': '167.56.0.0/13',
3465 'UZ': '82.215.64.0/18',
3466 'VA': '212.77.0.0/19',
3467 'VC': '24.92.144.0/20',
3468 'VE': '186.88.0.0/13',
3469 'VG': '172.103.64.0/18',
3470 'VI': '146.226.0.0/16',
3471 'VN': '14.160.0.0/11',
3472 'VU': '202.80.32.0/20',
3473 'WF': '117.20.32.0/21',
3474 'WS': '202.4.32.0/19',
3475 'YE': '134.35.0.0/16',
3476 'YT': '41.242.116.0/22',
3477 'ZA': '41.0.0.0/11',
3478 'ZM': '165.56.0.0/13',
3479 'ZW': '41.85.192.0/19',
3483 def random_ipv4(cls, code):
3484 block = cls._country_ip_map.get(code.upper())
3487 addr, preflen = block.split('/')
3488 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3489 addr_max = addr_min | (0xffffffff >> int(preflen))
3490 return compat_str(socket.inet_ntoa(
3491 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler honouring a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387

def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        # Peel off 32 bits at a time, big-endian.
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        # Left-pad to a multiple of 4 so we can consume 32-bit words.
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is interpreted little-endian, hence the reversal.
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int} length        target length
    @returns {int[]}           padded data

    Raises ValueError when the data cannot fit (at least 11 bytes of
    overhead are required: 0x00 0x02 <PS, >= 8 bytes> 0x00).
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 section 7.2.1: the padding string PS must consist of
    # NONZERO pseudo-random octets — a zero byte would be read as the
    # padding terminator by the decrypting side. (The original used
    # randint(0, 254), which could emit 0.)
    pseudo_random = [random.randint(1, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode non-negative integer `num` in base `n` using `table` digits
    (defaults to 0-9a-zA-Z)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        digits.append(table[num % n])
        num = num // n
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Decode JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfucasted_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    # Map each base-n token back to its original symbol (or itself when the
    # symbol slot is empty).
    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfucasted_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=val,KEY="quoted",...') into a dict."""
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            # Strip the surrounding double quotes.
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript '>>>' semantics)."""
    if val >= 0:
        return val >> n
    # Map a negative value onto its unsigned 32-bit representation first.
    return (val + 0x100000000) >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, payload, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Assumes 8-bit RGB (3 bytes per pixel) — TODO confirm upstream only
    # feeds such images here.
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed by one filter-type byte.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            # Fall back to the setfattr / xattr command-line tools.
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Generate a plausible random birthday as a dict of string-valued fields
    keyed by the given field names."""
    return {
        year_field: str(random.randint(1950, 1995)),
        month_field: str(random.randint(1, 12)),
        day_field: str(random.randint(1, 31)),
    }