4 from __future__
import unicode_literals
33 import xml
.etree
.ElementTree
40 compat_etree_fromstring
,
42 compat_html_entities_html5
,
48 compat_socket_create_connection
,
54 compat_urllib_parse_urlencode
,
55 compat_urllib_parse_urlparse
,
56 compat_urllib_parse_unquote_plus
,
57 compat_urllib_request
,
def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    netloc_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in netloc_schemes:
            netloc_schemes.append(scheme)
77 # This is not clearly defined otherwise
78 compiled_regex_type
= type(re
.compile(''))
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
91 ENGLISH_MONTH_NAMES
= [
92 'January', 'February', 'March', 'April', 'May', 'June',
93 'July', 'August', 'September', 'October', 'November', 'December']
96 'en': ENGLISH_MONTH_NAMES
,
98 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
99 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
103 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
104 'flv', 'f4v', 'f4a', 'f4b',
105 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
106 'mkv', 'mka', 'mk3d',
115 'f4f', 'f4m', 'm3u8', 'smil')
117 # needed for sanitizing filenames in restricted mode
118 ACCENT_CHARS
= dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
119 itertools
.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
120 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
136 '%Y-%m-%d %H:%M:%S.%f',
139 '%Y-%m-%dT%H:%M:%SZ',
140 '%Y-%m-%dT%H:%M:%S.%fZ',
141 '%Y-%m-%dT%H:%M:%S.%f0Z',
143 '%Y-%m-%dT%H:%M:%S.%f',
146 '%b %d %Y at %H:%M:%S',
149 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
150 DATE_FORMATS_DAY_FIRST
.extend([
159 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
160 DATE_FORMATS_MONTH_FIRST
.extend([
168 PACKED_CODES_RE
= r
"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
171 def preferredencoding():
172 """Get preferred encoding.
174 Returns the best encoding scheme for the system, based on
175 locale.getpreferredencoding() and some further tweaks.
178 pref
= locale
.getpreferredencoding()
186 def write_json_file(obj
, fn
):
187 """ Encode obj as JSON and write it to fn, atomically if possible """
189 fn
= encodeFilename(fn
)
190 if sys
.version_info
< (3, 0) and sys
.platform
!= 'win32':
191 encoding
= get_filesystem_encoding()
192 # os.path.basename returns a bytes object, but NamedTemporaryFile
193 # will fail if the filename contains non ascii characters unless we
194 # use a unicode object
195 path_basename
= lambda f
: os
.path
.basename(fn
).decode(encoding
)
196 # the same for os.path.dirname
197 path_dirname
= lambda f
: os
.path
.dirname(fn
).decode(encoding
)
199 path_basename
= os
.path
.basename
200 path_dirname
= os
.path
.dirname
204 'prefix': path_basename(fn
) + '.',
205 'dir': path_dirname(fn
),
209 # In Python 2.x, json.dump expects a bytestream.
210 # In Python 3.x, it writes to a character stream
211 if sys
.version_info
< (3, 0):
219 tf
= tempfile
.NamedTemporaryFile(**compat_kwargs(args
))
224 if sys
.platform
== 'win32':
225 # Need to remove existing file on Windows, else os.rename raises
226 # WindowsError or FileExistsError.
231 os
.rename(tf
.name
, fn
)
240 if sys
.version_info
>= (2, 7):
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    # Build the attribute predicate: presence test when no value given,
    # exact-match test otherwise.
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = "[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
247 def find_xpath_attr(node
, xpath
, key
, val
=None):
248 for f
in node
.findall(compat_xpath(xpath
)):
249 if key
not in f
.attrib
:
251 if val
is None or f
.attrib
.get(key
) == val
:
255 # On python2.6 the xml.etree.ElementTree.Element methods don't support
256 # the namespace parameter
259 def xpath_with_ns(path
, ns_map
):
260 components
= [c
.split(':') for c
in path
.split('/')]
264 replaced
.append(c
[0])
267 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
268 return '/'.join(replaced
)
271 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
272 def _find_xpath(xpath
):
273 return node
.find(compat_xpath(xpath
))
275 if isinstance(xpath
, (str, compat_str
)):
276 n
= _find_xpath(xpath
)
284 if default
is not NO_DEFAULT
:
287 name
= xpath
if name
is None else name
288 raise ExtractorError('Could not find XML element %s' % name
)
294 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
295 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
296 if n
is None or n
== default
:
299 if default
is not NO_DEFAULT
:
302 name
= xpath
if name
is None else name
303 raise ExtractorError('Could not find XML element\'s text %s' % name
)
309 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
310 n
= find_xpath_attr(node
, xpath
, key
)
312 if default
is not NO_DEFAULT
:
315 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
316 raise ExtractorError('Could not find XML attribute %s' % name
)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over the generic attribute matcher; the value is
    # regex-escaped there (escape_value defaults to True).
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains class_name."""
    # Match class_name as a whole word anywhere inside the attribute value;
    # escape_value=False because we pass a regex, not a literal value.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', class_value_re, html, escape_value=False)
333 def get_element_by_attribute(attribute, value, html, escape_value=True):
334 """Return the content of the tag with the specified attribute in the passed HTML document"""
336 value = re.escape(value) if escape_value else value
338 m = re.search(r'''(?xs)
340 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'))*?
342 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'))*?
346 ''' % (re.escape(attribute), value), html)
350 res = m.group('content
')
352 if res.startswith('"') or res.startswith("'"):
355 return unescapeHTML(res)
358 class HTMLAttributeParser(compat_HTMLParser):
359 """Trivial HTML parser to gather the attributes for a single element"""
362 compat_HTMLParser.__init__(self)
364 def handle_starttag(self, tag, attrs):
365 self.attrs = dict(attrs)
368 def extract_attributes(html_element):
369 """Given a string for an HTML element such as
371 a="foo" B="bar" c="&98;az" d=boz
372 empty= noval entity="&"
375 Decode and return a dictionary of attributes.
377 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
378 'empty
': '', 'noval
': None, 'entity
': '&',
379 'sq
': '"', 'dq': '\''
381 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
382 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
384 parser = HTMLAttributeParser()
385 parser.feed(html_element)
390 def clean_html(html):
391 """Clean an HTML snippet into a readable string"""
393 if html is None: # Convenience for sanitizing descriptions etc.
397 html = html.replace('\n', ' ')
398 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
399 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
401 html = re.sub('<.*?>', '', html)
402 # Replace html entities
403 html = unescapeHTML(html)
407 def sanitize_open(filename, open_mode):
408 """Try to open the given filename, and slightly tweak it if this fails.
410 Attempts to open the given filename. If this fails, it tries to change
411 the filename slightly, step by step, until it's either able to open it
412 or it fails and raises a final exception, like the standard open()
415 It returns the tuple (stream, definitive_file_name).
419 if sys.platform == 'win32':
421 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
422 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
423 stream = open(encodeFilename(filename), open_mode)
424 return (stream, filename)
425 except (IOError, OSError) as err:
426 if err.errno in (errno.EACCES,):
429 # In case of error, try to remove win32 forbidden chars
430 alt_filename = sanitize_path(filename)
431 if alt_filename == filename:
434 # An exception here should be caught in the caller
435 stream = open(encodeFilename(alt_filename), open_mode)
436 return (stream, alt_filename)
439 def timeconvert(timestr):
440 """Convert RFC 2822 defined time string into system timestamp"""
442 timetuple = email.utils.parsedate_tz(timestr)
443 if timetuple is not None:
444 timestamp = email.utils.mktime_tz(timetuple)
448 def sanitize_filename(s, restricted=False, is_id=False):
449 """Sanitizes a string so it could be used as part of a filename.
450 If restricted is set, use a stricter subset of allowed characters.
451 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
453 def replace_insane(char):
454 if restricted and char in ACCENT_CHARS:
455 return ACCENT_CHARS[char]
456 if char == '?' or ord(char) < 32 or ord(char) == 127:
459 return '' if restricted else '\''
461 return '_
-' if restricted else ' -'
462 elif char in '\\/|
*<>':
464 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
466 if restricted
and ord(char
) > 127:
471 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
472 result
= ''.join(map(replace_insane
, s
))
474 while '__' in result
:
475 result
= result
.replace('__', '_')
476 result
= result
.strip('_')
477 # Common case of "Foreign band name - English song title"
478 if restricted
and result
.startswith('-_'):
480 if result
.startswith('-'):
481 result
= '_' + result
[len('-'):]
482 result
= result
.lstrip('.')
488 def sanitize_path(s
):
489 """Sanitizes and normalizes path on Windows"""
490 if sys
.platform
!= 'win32':
492 drive_or_unc
, _
= os
.path
.splitdrive(s
)
493 if sys
.version_info
< (2, 7) and not drive_or_unc
:
494 drive_or_unc
, _
= os
.path
.splitunc(s
)
495 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
499 path_part
if path_part
in ['.', '..'] else re
.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part
)
500 for path_part
in norm_path
]
502 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
503 return os
.path
.join(*sanitized_path
)
506 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
507 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prefix scheme-relative ('//host/...') URLs with the http: scheme.

    Mitigates failures on URLs published without a protocol; all other
    URLs are returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request for the sanitized form of url.

    Extra positional and keyword arguments are forwarded to the Request
    constructor untouched.
    """
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
516 def orderedSet(iterable
):
517 """ Remove all duplicates from the input iterable """
525 def _htmlentity_transform(entity_with_semicolon
):
526 """Transforms an HTML entity to a character."""
527 entity
= entity_with_semicolon
[:-1]
529 # Known non-numeric HTML entity
530 if entity
in compat_html_entities
.name2codepoint
:
531 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
533 # TODO: HTML5 allows entities without a semicolon. For example,
534 # 'Éric' should be decoded as 'Éric'.
535 if entity_with_semicolon
in compat_html_entities_html5
:
536 return compat_html_entities_html5
[entity_with_semicolon
]
538 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
540 numstr
= mobj
.group(1)
541 if numstr
.startswith('x'):
543 numstr
= '0%s' % numstr
546 # See https://github.com/rg3/youtube-dl/issues/7518
548 return compat_chr(int(numstr
, base
))
552 # Unknown entity in name, return its literal representation
553 return '&%s;' % entity
559 assert type(s
) == compat_str
562 r
'&([^;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
565 def get_subprocess_encoding():
566 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
567 # For subprocess calls, encode with locale encoding
568 # Refer to http://stackoverflow.com/a/9951851/35070
569 encoding
= preferredencoding()
571 encoding
= sys
.getfilesystemencoding()
577 def encodeFilename(s
, for_subprocess
=False):
579 @param s The name of the file
582 assert type(s
) == compat_str
584 # Python 3 has a Unicode API
585 if sys
.version_info
>= (3, 0):
588 # Pass '' directly to use Unicode APIs on Windows 2000 and up
589 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
590 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
591 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
594 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
595 if sys
.platform
.startswith('java'):
598 return s
.encode(get_subprocess_encoding(), 'ignore')
601 def decodeFilename(b
, for_subprocess
=False):
603 if sys
.version_info
>= (3, 0):
606 if not isinstance(b
, bytes):
609 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for subprocess use.

    Text (compat_str) values go straight to encodeFilename with
    for_subprocess=True; anything else is treated as a legacy ASCII byte
    string and decoded first.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    # Decode a command-line argument byte string back to text via
    # decodeFilename with for_subprocess=True.
    return decodeFilename(b, True)
625 def decodeOption(optval
):
628 if isinstance(optval
, bytes):
629 optval
= optval
.decode(preferredencoding())
631 assert isinstance(optval
, compat_str
)
635 def formatSeconds(secs
):
637 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
639 return '%d:%02d' % (secs
// 60, secs
% 60)
644 def make_HTTPS_handler(params
, **kwargs
):
645 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
646 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
647 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
648 if opts_no_check_certificate
:
649 context
.check_hostname
= False
650 context
.verify_mode
= ssl
.CERT_NONE
652 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
655 # (create_default_context present but HTTPSHandler has no context=)
658 if sys
.version_info
< (3, 2):
659 return YoutubeDLHTTPSHandler(params
, **kwargs
)
661 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
662 context
.verify_mode
= (ssl
.CERT_NONE
663 if opts_no_check_certificate
664 else ssl
.CERT_REQUIRED
)
665 context
.set_default_verify_paths()
666 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
669 def bug_reports_message():
670 if ytdl_is_updateable():
671 update_cmd
= 'type youtube-dl -U to update'
673 update_cmd
= 'see https://yt-dl.org/update on how to update'
674 msg
= '; please report this issue on https://yt-dl.org/bug .'
675 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
676 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
680 class ExtractorError(Exception):
681 """Error during info extraction."""
683 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
684 """ tb, if given, is the original traceback (so that it can be printed out).
685 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
688 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
690 if video_id
is not None:
691 msg
= video_id
+ ': ' + msg
693 msg
+= ' (caused by %r)' % cause
695 msg
+= bug_reports_message()
696 super(ExtractorError
, self
).__init
__(msg
)
699 self
.exc_info
= sys
.exc_info() # preserve original exception
701 self
.video_id
= video_id
703 def format_traceback(self
):
704 if self
.traceback
is None:
706 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor supports (marked as expected)."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression fails to match."""
    pass
class DownloadError(Exception):
    """Download Error exception.

    Raised by FileDownloader objects when downloading fails and they are
    not configured to continue on errors.  Carries the user-facing error
    message; the original exception info, when known, is kept in
    ``exc_info``.
    """

    def __init__(self, msg, exc_info=None):
        # exc_info: the original sys.exc_info() triple that caused the
        # trouble, if given.
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Raised by FileDownloader objects when they detect that multiple
    files would have to be downloaded to the same path on disk."""
    pass
744 class PostProcessingError(Exception):
745 """Post Processing exception.
747 This exception may be raised by PostProcessor's .run() method to
748 indicate an error in the postprocessing task.
751 def __init__(self
, msg
):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Raised when a video is requested in a format that is not available
    for that video."""
    pass
class ContentTooShortError(Exception):
    """Raised by FileDownloader objects when a downloaded file is smaller
    than what the server announced first, indicating the connection was
    probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are in bytes.
        self.downloaded = downloaded
        self.expected = expected
783 class XAttrMetadataError(Exception):
784 def __init__(self
, code
=None, msg
='Unknown error'):
785 super(XAttrMetadataError
, self
).__init
__(msg
)
789 # Parsing code and msg
790 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
) or
791 'No space left' in self
.msg
or 'Disk quota excedded' in self
.msg
):
792 self
.reason
= 'NO_SPACE'
793 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
794 self
.reason
= 'VALUE_TOO_LONG'
796 self
.reason
= 'NOT_SUPPORTED'
class XAttrUnavailableError(Exception):
    pass
803 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
804 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
805 # expected HTTP responses to meet HTTP/1.0 or later (see also
806 # https://github.com/rg3/youtube-dl/issues/6727)
807 if sys
.version_info
< (3, 0):
808 kwargs
[b
'strict'] = True
809 hc
= http_class(*args
, **kwargs
)
810 source_address
= ydl_handler
._params
.get('source_address')
811 if source_address
is not None:
812 sa
= (source_address
, 0)
813 if hasattr(hc
, 'source_address'): # Python 2.7+
814 hc
.source_address
= sa
816 def _hc_connect(self
, *args
, **kwargs
):
817 sock
= compat_socket_create_connection(
818 (self
.host
, self
.port
), self
.timeout
, sa
)
820 self
.sock
= ssl
.wrap_socket(
821 sock
, self
.key_file
, self
.cert_file
,
822 ssl_version
=ssl
.PROTOCOL_TLSv1
)
825 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Strip youtube-dl's internal pseudo-headers from a header dict.

    When 'Youtubedl-no-compression' is present it means "do not send
    Accept-Encoding": a new dict is returned without any Accept-Encoding
    entry (matched case-insensitively) and without the marker itself.
    Dicts without the marker are returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered_headers = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    del filtered_headers['Youtubedl-no-compression']
    return filtered_headers
840 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
841 """Handler for HTTP requests and responses.
843 This class, when installed with an OpenerDirector, automatically adds
844 the standard headers to every HTTP request and handles gzipped and
845 deflated responses from web servers. If compression is to be avoided in
846 a particular request, the original request in the program code only has
847 to include the HTTP header "Youtubedl-no-compression", which will be
848 removed before making the real request.
850 Part of this code was copied from:
852 http://techknack.net/python-urllib2-handlers/
854 Andrew Rowls, the author of that code, agreed to release it to the
858 def __init__(self
, params
, *args
, **kwargs
):
859 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
860 self
._params
= params
862 def http_open(self
, req
):
863 conn_class
= compat_http_client
.HTTPConnection
865 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
867 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
868 del req
.headers
['Ytdl-socks-proxy']
870 return self
.do_open(functools
.partial(
871 _create_http_connection
, self
, conn_class
, False),
877 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
879 return zlib
.decompress(data
)
882 def addinfourl_wrapper(stream
, headers
, url
, code
):
883 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
884 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
885 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
889 def http_request(self
, req
):
890 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
891 # always respected by websites, some tend to give out URLs with non percent-encoded
892 # non-ASCII characters (see telemb.py, ard.py [#3412])
893 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
894 # To work around aforementioned issue we will replace request's original URL with
895 # percent-encoded one
896 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
897 # the code of this workaround has been moved here from YoutubeDL.urlopen()
898 url
= req
.get_full_url()
899 url_escaped
= escape_url(url
)
901 # Substitute URL if any change after escaping
902 if url
!= url_escaped
:
903 req
= update_Request(req
, url
=url_escaped
)
905 for h
, v
in std_headers
.items():
906 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
907 # The dict keys are capitalized because of this bug by urllib
908 if h
.capitalize() not in req
.headers
:
911 req
.headers
= handle_youtubedl_headers(req
.headers
)
913 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
914 # Python 2.6 is brain-dead when it comes to fragments
915 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
916 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
920 def http_response(self
, req
, resp
):
923 if resp
.headers
.get('Content-encoding', '') == 'gzip':
924 content
= resp
.read()
925 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
927 uncompressed
= io
.BytesIO(gz
.read())
928 except IOError as original_ioerror
:
929 # There may be junk add the end of the file
930 # See http://stackoverflow.com/q/4928560/35070 for details
931 for i
in range(1, 1024):
933 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
934 uncompressed
= io
.BytesIO(gz
.read())
939 raise original_ioerror
940 resp
= self
.addinfourl_wrapper(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
941 resp
.msg
= old_resp
.msg
942 del resp
.headers
['Content-encoding']
944 if resp
.headers
.get('Content-encoding', '') == 'deflate':
945 gz
= io
.BytesIO(self
.deflate(resp
.read()))
946 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
947 resp
.msg
= old_resp
.msg
948 del resp
.headers
['Content-encoding']
949 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
950 # https://github.com/rg3/youtube-dl/issues/6457).
951 if 300 <= resp
.code
< 400:
952 location
= resp
.headers
.get('Location')
954 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
955 if sys
.version_info
>= (3, 0):
956 location
= location
.encode('iso-8859-1').decode('utf-8')
958 location
= location
.decode('utf-8')
959 location_escaped
= escape_url(location
)
960 if location
!= location_escaped
:
961 del resp
.headers
['Location']
962 if sys
.version_info
< (3, 0):
963 location_escaped
= location_escaped
.encode('utf-8')
964 resp
.headers
['Location'] = location_escaped
967 https_request
= http_request
968 https_response
= http_response
971 def make_socks_conn_class(base_class
, socks_proxy
):
972 assert issubclass(base_class
, (
973 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
975 url_components
= compat_urlparse
.urlparse(socks_proxy
)
976 if url_components
.scheme
.lower() == 'socks5':
977 socks_type
= ProxyType
.SOCKS5
978 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
979 socks_type
= ProxyType
.SOCKS4
980 elif url_components
.scheme
.lower() == 'socks4a':
981 socks_type
= ProxyType
.SOCKS4A
983 def unquote_if_non_empty(s
):
986 return compat_urllib_parse_unquote_plus(s
)
990 url_components
.hostname
, url_components
.port
or 1080,
992 unquote_if_non_empty(url_components
.username
),
993 unquote_if_non_empty(url_components
.password
),
996 class SocksConnection(base_class
):
998 self
.sock
= sockssocket()
999 self
.sock
.setproxy(*proxy_args
)
1000 if type(self
.timeout
) in (int, float):
1001 self
.sock
.settimeout(self
.timeout
)
1002 self
.sock
.connect((self
.host
, self
.port
))
1004 if isinstance(self
, compat_http_client
.HTTPSConnection
):
1005 if hasattr(self
, '_context'): # Python > 2.6
1006 self
.sock
= self
._context
.wrap_socket(
1007 self
.sock
, server_hostname
=self
.host
)
1009 self
.sock
= ssl
.wrap_socket(self
.sock
)
1011 return SocksConnection
1014 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
1015 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
1016 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
1017 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
1018 self
._params
= params
1020 def https_open(self
, req
):
1022 conn_class
= self
._https
_conn
_class
1024 if hasattr(self
, '_context'): # python > 2.6
1025 kwargs
['context'] = self
._context
1026 if hasattr(self
, '_check_hostname'): # python 3.x
1027 kwargs
['check_hostname'] = self
._check
_hostname
1029 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
1031 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
1032 del req
.headers
['Ytdl-socks-proxy']
1034 return self
.do_open(functools
.partial(
1035 _create_http_connection
, self
, conn_class
, True),
1039 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
1040 def __init__(self
, cookiejar
=None):
1041 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
1043 def http_response(self
, request
, response
):
1044 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1045 # characters in Set-Cookie HTTP header of last response (see
1046 # https://github.com/rg3/youtube-dl/issues/6769).
1047 # In order to at least prevent crashing we will percent encode Set-Cookie
1048 # header before HTTPCookieProcessor starts processing it.
1049 # if sys.version_info < (3, 0) and response.headers:
1050 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1051 # set_cookie = response.headers.get(set_cookie_header)
1053 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1054 # if set_cookie != set_cookie_escaped:
1055 # del response.headers[set_cookie_header]
1056 # response.headers[set_cookie_header] = set_cookie_escaped
1057 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
1059 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
1060 https_response
= http_response
1063 def extract_timezone(date_str
):
1065 r
'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1068 timezone
= datetime
.timedelta()
1070 date_str
= date_str
[:-len(m
.group('tz'))]
1071 if not m
.group('sign'):
1072 timezone
= datetime
.timedelta()
1074 sign
= 1 if m
.group('sign') == '+' else -1
1075 timezone
= datetime
.timedelta(
1076 hours
=sign
* int(m
.group('hours')),
1077 minutes
=sign
* int(m
.group('minutes')))
1078 return timezone
, date_str
1081 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
1082 """ Return a UNIX timestamp from the given date """
1084 if date_str
is None:
1087 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
1089 if timezone
is None:
1090 timezone
, date_str
= extract_timezone(date_str
)
1093 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
1094 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
1095 return calendar
.timegm(dt
.timetuple())
def date_formats(day_first=True):
    """Select the date-format table matching the day/month ordering preference."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1104 def unified_strdate(date_str
, day_first
=True):
1105 """Return a string with the date in the format YYYYMMDD"""
1107 if date_str
is None:
1111 date_str
= date_str
.replace(',', ' ')
1112 # Remove AM/PM + timezone
1113 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1114 _
, date_str
= extract_timezone(date_str
)
1116 for expression
in date_formats(day_first
):
1118 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
1121 if upload_date
is None:
1122 timetuple
= email
.utils
.parsedate_tz(date_str
)
1125 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
1128 if upload_date
is not None:
1129 return compat_str(upload_date
)
1132 def unified_timestamp(date_str
, day_first
=True):
1133 if date_str
is None:
1136 date_str
= date_str
.replace(',', ' ')
1138 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
1139 timezone
, date_str
= extract_timezone(date_str
)
1141 # Remove AM/PM + timezone
1142 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1144 for expression
in date_formats(day_first
):
1146 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
1147 return calendar
.timegm(dt
.timetuple())
1150 timetuple
= email
.utils
.parsedate_tz(date_str
)
1152 return calendar
.timegm(timetuple
) + pm_delta
* 3600
1155 def determine_ext(url
, default_ext
='unknown_video'):
1158 guess
= url
.partition('?')[0].rpartition('.')[2]
1159 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
1161 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1162 elif guess
.rstrip('/') in KNOWN_EXTENSIONS
:
1163 return guess
.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle filename: drop the last extension of *filename*
    and append '<sub_lang>.<sub_format>'."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1172 def date_from_str(date_str
):
1174 Return a datetime object from a string in the format YYYYMMDD or
1175 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1176 today
= datetime
.date
.today()
1177 if date_str
in ('now', 'today'):
1179 if date_str
== 'yesterday':
1180 return today
- datetime
.timedelta(days
=1)
1181 match
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
1182 if match
is not None:
1183 sign
= match
.group('sign')
1184 time
= int(match
.group('time'))
1187 unit
= match
.group('unit')
1188 # A bad approximation?
1192 elif unit
== 'year':
1196 delta
= datetime
.timedelta(**{unit
: time
})
1197 return today
+ delta
1198 return datetime
.datetime
.strptime(date_str
, '%Y%m%d').date()
1201 def hyphenate_date(date_str
):
1203 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1204 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
1205 if match
is not None:
1206 return '-'.join(match
.groups())
1211 class DateRange(object):
1212 """Represents a time interval between two dates"""
1214 def __init__(self
, start
=None, end
=None):
1215 """start and end must be strings in the format accepted by date"""
1216 if start
is not None:
1217 self
.start
= date_from_str(start
)
1219 self
.start
= datetime
.datetime
.min.date()
1221 self
.end
= date_from_str(end
)
1223 self
.end
= datetime
.datetime
.max.date()
1224 if self
.start
> self
.end
:
1225 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
1229 """Returns a range that only contains the given day"""
1230 return cls(day
, day
)
1232 def __contains__(self
, date
):
1233 """Check if the date is in the range"""
1234 if not isinstance(date
, datetime
.date
):
1235 date
= date_from_str(date
)
1236 return self
.start
<= date
<= self
.end
1239 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
1242 def platform_name():
1243 """ Returns the platform name as a compat_str """
1244 res
= platform
.platform()
1245 if isinstance(res
, bytes):
1246 res
= res
.decode(preferredencoding())
1248 assert isinstance(res
, compat_str
)
1252 def _windows_write_string(s
, out
):
1253 """ Returns True if the string was written using special methods,
1254 False if it has yet to be written out."""
1255 # Adapted from http://stackoverflow.com/a/3259271/35070
1258 import ctypes
.wintypes
1266 fileno
= out
.fileno()
1267 except AttributeError:
1268 # If the output stream doesn't have a fileno, it's virtual
1270 except io
.UnsupportedOperation
:
1271 # Some strange Windows pseudo files?
1273 if fileno
not in WIN_OUTPUT_IDS
:
1276 GetStdHandle
= ctypes
.WINFUNCTYPE(
1277 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
1278 (b
'GetStdHandle', ctypes
.windll
.kernel32
))
1279 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
1281 WriteConsoleW
= ctypes
.WINFUNCTYPE(
1282 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
1283 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
1284 ctypes
.wintypes
.LPVOID
)((b
'WriteConsoleW', ctypes
.windll
.kernel32
))
1285 written
= ctypes
.wintypes
.DWORD(0)
1287 GetFileType
= ctypes
.WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)((b
'GetFileType', ctypes
.windll
.kernel32
))
1288 FILE_TYPE_CHAR
= 0x0002
1289 FILE_TYPE_REMOTE
= 0x8000
1290 GetConsoleMode
= ctypes
.WINFUNCTYPE(
1291 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
1292 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
1293 (b
'GetConsoleMode', ctypes
.windll
.kernel32
))
1294 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
1296 def not_a_console(handle
):
1297 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
1299 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
or
1300 GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
1302 if not_a_console(h
):
1305 def next_nonbmp_pos(s
):
1307 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
1308 except StopIteration:
1312 count
= min(next_nonbmp_pos(s
), 1024)
1314 ret
= WriteConsoleW(
1315 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
1317 raise OSError('Failed to write string')
1318 if not count
: # We just wrote a non-BMP character
1319 assert written
.value
== 2
1322 assert written
.value
> 0
1323 s
= s
[written
.value
:]
1327 def write_string(s
, out
=None, encoding
=None):
1330 assert type(s
) == compat_str
1332 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
1333 if _windows_write_string(s
, out
):
1336 if ('b' in getattr(out
, 'mode', '') or
1337 sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
1338 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
1340 elif hasattr(out
, 'buffer'):
1341 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
1342 byt
= s
.encode(enc
, 'ignore')
1343 out
.buffer.write(byt
)
1349 def bytes_to_intlist(bs
):
1352 if isinstance(bs
[0], int): # Python 3
1355 return [ord(c
) for c
in bs
]
1358 def intlist_to_bytes(xs
):
1361 return compat_struct_pack('%dB' % len(xs
), *xs
)
1364 # Cross-platform file locking
1365 if sys
.platform
== 'win32':
1366 import ctypes
.wintypes
1369 class OVERLAPPED(ctypes
.Structure
):
1371 ('Internal', ctypes
.wintypes
.LPVOID
),
1372 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
1373 ('Offset', ctypes
.wintypes
.DWORD
),
1374 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
1375 ('hEvent', ctypes
.wintypes
.HANDLE
),
1378 kernel32
= ctypes
.windll
.kernel32
1379 LockFileEx
= kernel32
.LockFileEx
1380 LockFileEx
.argtypes
= [
1381 ctypes
.wintypes
.HANDLE
, # hFile
1382 ctypes
.wintypes
.DWORD
, # dwFlags
1383 ctypes
.wintypes
.DWORD
, # dwReserved
1384 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1385 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1386 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1388 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
1389 UnlockFileEx
= kernel32
.UnlockFileEx
1390 UnlockFileEx
.argtypes
= [
1391 ctypes
.wintypes
.HANDLE
, # hFile
1392 ctypes
.wintypes
.DWORD
, # dwReserved
1393 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1394 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1395 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1397 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
1398 whole_low
= 0xffffffff
1399 whole_high
= 0x7fffffff
1401 def _lock_file(f
, exclusive
):
1402 overlapped
= OVERLAPPED()
1403 overlapped
.Offset
= 0
1404 overlapped
.OffsetHigh
= 0
1405 overlapped
.hEvent
= 0
1406 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
1407 handle
= msvcrt
.get_osfhandle(f
.fileno())
1408 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
1409 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1410 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
1412 def _unlock_file(f
):
1413 assert f
._lock
_file
_overlapped
_p
1414 handle
= msvcrt
.get_osfhandle(f
.fileno())
1415 if not UnlockFileEx(handle
, 0,
1416 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1417 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
1420 # Some platforms, such as Jython, is missing fcntl
1424 def _lock_file(f
, exclusive
):
1425 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
1427 def _unlock_file(f
):
1428 fcntl
.flock(f
, fcntl
.LOCK_UN
)
1430 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
1432 def _lock_file(f
, exclusive
):
1433 raise IOError(UNSUPPORTED_MSG
)
1435 def _unlock_file(f
):
1436 raise IOError(UNSUPPORTED_MSG
)
1439 class locked_file(object):
1440 def __init__(self
, filename
, mode
, encoding
=None):
1441 assert mode
in ['r', 'a', 'w']
1442 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
1445 def __enter__(self
):
1446 exclusive
= self
.mode
!= 'r'
1448 _lock_file(self
.f
, exclusive
)
1454 def __exit__(self
, etype
, value
, traceback
):
1456 _unlock_file(self
.f
)
1463 def write(self
, *args
):
1464 return self
.f
.write(*args
)
1466 def read(self
, *args
):
1467 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when the
    interpreter reports None."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1475 def shell_quote(args
):
1477 encoding
= get_filesystem_encoding()
1479 if isinstance(a
, bytes):
1480 # We may get a filename encoded with 'encodeFilename'
1481 a
= a
.decode(encoding
)
1482 quoted_args
.append(pipes
.quote(a
))
1483 return ' '.join(quoted_args
)
1486 def smuggle_url(url
, data
):
1487 """ Pass additional data in a URL for internal use. """
1489 url
, idata
= unsmuggle_url(url
, {})
1491 sdata
= compat_urllib_parse_urlencode(
1492 {'__youtubedl_smuggle': json
.dumps(data
)})
1493 return url
+ '#' + sdata
1496 def unsmuggle_url(smug_url
, default
=None):
1497 if '#__youtubedl_smuggle' not in smug_url
:
1498 return smug_url
, default
1499 url
, _
, sdata
= smug_url
.rpartition('#')
1500 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
1501 data
= json
.loads(jsond
)
1505 def format_bytes(bytes):
1508 if type(bytes) is str:
1509 bytes = float(bytes)
1513 exponent
= int(math
.log(bytes, 1024.0))
1514 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
1515 converted
= float(bytes) / float(1024 ** exponent
)
1516 return '%.2f%s' % (converted
, suffix
)
1519 def lookup_unit_table(unit_table
, s
):
1520 units_re
= '|'.join(re
.escape(u
) for u
in unit_table
)
1522 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
, s
)
1525 num_str
= m
.group('num').replace(',', '.')
1526 mult
= unit_table
[m
.group('unit')]
1527 return int(float(num_str
) * mult
)
1530 def parse_filesize(s
):
1534 # The lower-case forms are of course incorrect and unofficial,
1535 # but we support those too
1552 'megabytes': 1000 ** 2,
1553 'mebibytes': 1024 ** 2,
1559 'gigabytes': 1000 ** 3,
1560 'gibibytes': 1024 ** 3,
1566 'terabytes': 1000 ** 4,
1567 'tebibytes': 1024 ** 4,
1573 'petabytes': 1000 ** 5,
1574 'pebibytes': 1024 ** 5,
1580 'exabytes': 1000 ** 6,
1581 'exbibytes': 1024 ** 6,
1587 'zettabytes': 1000 ** 7,
1588 'zebibytes': 1024 ** 7,
1594 'yottabytes': 1000 ** 8,
1595 'yobibytes': 1024 ** 8,
1598 return lookup_unit_table(_UNIT_TABLE
, s
)
1607 if re
.match(r
'^[\d,.]+$', s
):
1608 return str_to_int(s
)
1619 return lookup_unit_table(_UNIT_TABLE
, s
)
1622 def month_by_name(name
, lang
='en'):
1623 """ Return the number of a month by (locale-independently) English name """
1625 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
1628 return month_names
.index(name
) + 1
1633 def month_by_abbreviation(abbrev
):
1634 """ Return the number of a month by (locale-independently) English
1638 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
1643 def fix_xml_ampersands(xml_str
):
1644 """Replace all the '&' by '&' in XML"""
1646 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1651 def setproctitle(title
):
1652 assert isinstance(title
, compat_str
)
1654 # ctypes in Jython is not complete
1655 # http://bugs.jython.org/issue2148
1656 if sys
.platform
.startswith('java'):
1660 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
1663 title_bytes
= title
.encode('utf-8')
1664 buf
= ctypes
.create_string_buffer(len(title_bytes
))
1665 buf
.value
= title_bytes
1667 libc
.prctl(15, buf
, 0, 0, 0)
1668 except AttributeError:
1669 return # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with the prefix *start* removed, if present.

    None passes through unchanged.
    """
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Return *s* with the suffix *end* removed, if present.

    None passes through unchanged. An empty *end* is a no-op: the previous
    one-liner computed ``s[:-len(end)]`` which for ``end == ''`` is
    ``s[:0]`` and wrongly returned the empty string.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1680 def remove_quotes(s
):
1681 if s
is None or len(s
) < 2:
1683 for quote
in ('"', "'", ):
1684 if s
[0] == quote
and s
[-1] == quote
:
def url_basename(url):
    """Return the last path component of *url* (query string and fragment
    are excluded; leading/trailing slashes are stripped first)."""
    path = compat_urlparse.urlparse(url).path
    trimmed = path.strip('/')
    return trimmed.split('/')[-1]
1695 return re
.match(r
'https?://[^?#&]+/', url
).group()
1698 class HEADRequest(compat_urllib_request
.Request
):
1699 def get_method(self
):
1703 class PUTRequest(compat_urllib_request
.Request
):
1704 def get_method(self
):
1708 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
1711 v
= getattr(v
, get_attr
, None)
1717 return int(v
) * invscale
// scale
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, returning *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1726 def str_to_int(int_str
):
1727 """ A more relaxed version of int_or_none """
1730 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
1734 def float_or_none(v
, scale
=1, invscale
=1, default
=None):
1738 return float(v
) * invscale
/ scale
def strip_or_none(v):
    """str.strip() that tolerates None input."""
    if v is None:
        return None
    return v.strip()
1747 def parse_duration(s
):
1748 if not isinstance(s
, compat_basestring
):
1753 days
, hours
, mins
, secs
, ms
= [None] * 5
1754 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s
)
1756 days
, hours
, mins
, secs
, ms
= m
.groups()
1761 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1764 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1767 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1770 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1773 days
, hours
, mins
, secs
, ms
= m
.groups()
1775 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s
)
1777 hours
, mins
= m
.groups()
1783 duration
+= float(secs
)
1785 duration
+= float(mins
) * 60
1787 duration
+= float(hours
) * 60 * 60
1789 duration
+= float(days
) * 24 * 60 * 60
1791 duration
+= float(ms
)
1795 def prepend_extension(filename
, ext
, expected_real_ext
=None):
1796 name
, real_ext
= os
.path
.splitext(filename
)
1798 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
1799 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
1800 else '{0}.{1}'.format(filename
, ext
))
1803 def replace_extension(filename
, ext
, expected_real_ext
=None):
1804 name
, real_ext
= os
.path
.splitext(filename
)
1805 return '{0}.{1}'.format(
1806 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
1810 def check_executable(exe
, args
=[]):
1811 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1812 args can be a list of arguments for a short output (like -version) """
1814 subprocess
.Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate()
1820 def get_exe_version(exe
, args
=['--version'],
1821 version_re
=None, unrecognized
='present'):
1822 """ Returns the version of the specified executable,
1823 or False if the executable is not present """
1825 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1826 # SIGTTOU if youtube-dl is run in the background.
1827 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1828 out
, _
= subprocess
.Popen(
1829 [encodeArgument(exe
)] + args
,
1830 stdin
=subprocess
.PIPE
,
1831 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate()
1834 if isinstance(out
, bytes): # Python 2.x
1835 out
= out
.decode('ascii', 'ignore')
1836 return detect_exe_version(out
, version_re
, unrecognized
)
1839 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
1840 assert isinstance(output
, compat_str
)
1841 if version_re
is None:
1842 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
1843 m
= re
.search(version_re
, output
)
1850 class PagedList(object):
1852 # This is only useful for tests
1853 return len(self
.getslice())
1856 class OnDemandPagedList(PagedList
):
1857 def __init__(self
, pagefunc
, pagesize
, use_cache
=False):
1858 self
._pagefunc
= pagefunc
1859 self
._pagesize
= pagesize
1860 self
._use
_cache
= use_cache
1864 def getslice(self
, start
=0, end
=None):
1866 for pagenum
in itertools
.count(start
// self
._pagesize
):
1867 firstid
= pagenum
* self
._pagesize
1868 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
1869 if start
>= nextfirstid
:
1874 page_results
= self
._cache
.get(pagenum
)
1875 if page_results
is None:
1876 page_results
= list(self
._pagefunc
(pagenum
))
1878 self
._cache
[pagenum
] = page_results
1881 start
% self
._pagesize
1882 if firstid
<= start
< nextfirstid
1886 ((end
- 1) % self
._pagesize
) + 1
1887 if (end
is not None and firstid
<= end
<= nextfirstid
)
1890 if startv
!= 0 or endv
is not None:
1891 page_results
= page_results
[startv
:endv
]
1892 res
.extend(page_results
)
1894 # A little optimization - if current page is not "full", ie. does
1895 # not contain page_size videos then we can assume that this page
1896 # is the last one - there are no more ids on further pages -
1897 # i.e. no need to query again.
1898 if len(page_results
) + startv
< self
._pagesize
:
1901 # If we got the whole page, but the next page is not interesting,
1902 # break out early as well
1903 if end
== nextfirstid
:
1908 class InAdvancePagedList(PagedList
):
1909 def __init__(self
, pagefunc
, pagecount
, pagesize
):
1910 self
._pagefunc
= pagefunc
1911 self
._pagecount
= pagecount
1912 self
._pagesize
= pagesize
1914 def getslice(self
, start
=0, end
=None):
1916 start_page
= start
// self
._pagesize
1918 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
1919 skip_elems
= start
- start_page
* self
._pagesize
1920 only_more
= None if end
is None else end
- start
1921 for pagenum
in range(start_page
, end_page
):
1922 page
= list(self
._pagefunc
(pagenum
))
1924 page
= page
[skip_elems
:]
1926 if only_more
is not None:
1927 if len(page
) < only_more
:
1928 only_more
-= len(page
)
1930 page
= page
[:only_more
]
1937 def uppercase_escape(s
):
1938 unicode_escape
= codecs
.getdecoder('unicode_escape')
1940 r
'\\U[0-9a-fA-F]{8}',
1941 lambda m
: unicode_escape(m
.group(0))[0],
1945 def lowercase_escape(s
):
1946 unicode_escape
= codecs
.getdecoder('unicode_escape')
1948 r
'\\u[0-9a-fA-F]{4}',
1949 lambda m
: unicode_escape(m
.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # quote() on Python 2 needs a byte string; on Python 3 the version
    # check short-circuits and str input is handled natively.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1960 def escape_url(url
):
1961 """Escape URL as suggested by RFC 3986"""
1962 url_parsed
= compat_urllib_parse_urlparse(url
)
1963 return url_parsed
._replace
(
1964 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
1965 path
=escape_rfc3986(url_parsed
.path
),
1966 params
=escape_rfc3986(url_parsed
.params
),
1967 query
=escape_rfc3986(url_parsed
.query
),
1968 fragment
=escape_rfc3986(url_parsed
.fragment
)
1972 def read_batch_urls(batch_fd
):
1974 if not isinstance(url
, compat_str
):
1975 url
= url
.decode('utf-8', 'replace')
1976 BOM_UTF8
= '\xef\xbb\xbf'
1977 if url
.startswith(BOM_UTF8
):
1978 url
= url
[len(BOM_UTF8
):]
1980 if url
.startswith(('#', ';', ']')):
1984 with contextlib
.closing(batch_fd
) as fd
:
1985 return [url
for url
in map(fixup
, fd
) if url
]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes suitable for a
    request body."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1992 def update_url_query(url
, query
):
1995 parsed_url
= compat_urlparse
.urlparse(url
)
1996 qs
= compat_parse_qs(parsed_url
.query
)
1998 return compat_urlparse
.urlunparse(parsed_url
._replace
(
1999 query
=compat_urllib_parse_urlencode(qs
, True)))
2002 def update_Request(req
, url
=None, data
=None, headers
={}, query
={}):
2003 req_headers
= req
.headers
.copy()
2004 req_headers
.update(headers
)
2005 req_data
= data
or req
.data
2006 req_url
= update_url_query(url
or req
.get_full_url(), query
)
2007 req_get_method
= req
.get_method()
2008 if req_get_method
== 'HEAD':
2009 req_type
= HEADRequest
2010 elif req_get_method
== 'PUT':
2011 req_type
= PUTRequest
2013 req_type
= compat_urllib_request
.Request
2015 req_url
, data
=req_data
, headers
=req_headers
,
2016 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
2017 if hasattr(req
, 'timeout'):
2018 new_req
.timeout
= req
.timeout
2022 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
2023 if isinstance(key_or_keys
, (list, tuple)):
2024 for key
in key_or_keys
:
2025 if key
not in d
or d
[key
] is None or skip_false_values
and not d
[key
]:
2029 return d
.get(key_or_keys
, default
)
2032 def try_get(src
, getter
, expected_type
=None):
2035 except (AttributeError, KeyError, TypeError, IndexError):
2038 if expected_type
is None or isinstance(v
, expected_type
):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* unchanged when it is already compat_str; otherwise
    decode it with *encoding*/*errors*.

    NOTE(review): the default for *encoding* is evaluated once, at import
    time — kept as-is to preserve the existing interface.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2055 TV_PARENTAL_GUIDELINES
= {
2065 def parse_age_limit(s
):
2067 return s
if 0 <= s
<= 21 else None
2068 if not isinstance(s
, compat_basestring
):
2070 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
2072 return int(m
.group('age'))
2074 return US_RATINGS
[s
]
2075 return TV_PARENTAL_GUIDELINES
.get(s
)
2078 def strip_jsonp(code
):
2080 r
'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r
'\1', code
)
2083 def js_to_json(code
):
2086 if v
in ('true', 'false', 'null'):
2088 elif v
.startswith('/*') or v
== ',':
2091 if v
[0] in ("'", '"'):
2092 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
2097 }.get(m
.group(0), m
.group(0)), v
[1:-1])
2100 (r
'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
2101 (r
'^(0+[0-7]+)\s*:?$', 8),
2104 for regex
, base
in INTEGER_TABLE
:
2105 im
= re
.match(regex
, v
)
2107 i
= int(im
.group(1), base
)
2108 return '"%d":' % i
if v
.endswith(':') else '%d' % i
2112 return re
.sub(r
'''(?sx)
2113 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2114 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2115 /\*.*?\*/|,(?=\s*[\]}])|
2116 [a-zA-Z_][.a-zA-Z_0-9]*|
2117 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
2122 def qualities(quality_ids
):
2123 """ Get a numeric quality value out of a list of possible values """
2126 return quality_ids
.index(qid
)
2132 DEFAULT_OUTTMPL
= '%(title)s-%(id)s.%(ext)s'
2135 def limit_length(s
, length
):
2136 """ Add ellipses to overly long strings """
2141 return s
[:length
- len(ELLIPSES
)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' or '-' and return a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(int(part) for part in parts)
2149 def is_outdated_version(version
, limit
, assume_new
=True):
2151 return not assume_new
2153 return version_tuple(version
) < version_tuple(limit
)
2155 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updatable when running from a zip bundle or a frozen executable.
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    """Get a short, shell-quoted string representation for a subprocess
    command."""
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2170 def error_to_compat_str(err
):
2172 # On python 2 error byte string must be decoded with proper
2173 # encoding rather than ascii
2174 if sys
.version_info
[0] < 3:
2175 err_str
= err_str
.decode(preferredencoding())
2179 def mimetype2ext(mt
):
2185 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2186 # it's the most popular one
2187 'audio/mpeg': 'mp3',
2192 _
, _
, res
= mt
.rpartition('/')
2193 res
= res
.split(';')[0].strip().lower()
2197 'smptett+xml': 'tt',
2203 'x-mp4-fragmented': 'mp4',
2206 'x-mpegurl': 'm3u8',
2207 'vnd.apple.mpegurl': 'm3u8',
2212 'vnd.ms-sstr+xml': 'ism',
2217 def parse_codecs(codecs_str
):
2218 # http://tools.ietf.org/html/rfc6381
2221 splited_codecs
= list(filter(None, map(
2222 lambda str: str.strip(), codecs_str
.strip().strip(',').split(','))))
2223 vcodec
, acodec
= None, None
2224 for full_codec
in splited_codecs
:
2225 codec
= full_codec
.split('.')[0]
2226 if codec
in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2229 elif codec
in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
2233 write_string('WARNING: Unknown codec %s' % full_codec
, sys
.stderr
)
2234 if not vcodec
and not acodec
:
2235 if len(splited_codecs
) == 2:
2240 elif len(splited_codecs
) == 1:
2247 'vcodec': vcodec
or 'none',
2248 'acodec': acodec
or 'none',
2253 def urlhandle_detect_ext(url_handle
):
2254 getheader
= url_handle
.headers
.get
2256 cd
= getheader('Content-Disposition')
2258 m
= re
.match(r
'attachment;\s*filename="(?P<filename>[^"]+)"', cd
)
2260 e
= determine_ext(m
.group('filename'), default_ext
=None)
2264 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Wrap *data* (bytes) into a base64 'data:' URI with the given MIME
    type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2271 def age_restricted(content_limit
, age_limit
):
2272 """ Returns True iff the content should be blocked """
2274 if age_limit
is None: # No limit set
2276 if content_limit
is None:
2277 return False # Content available for everyone
2278 return age_limit
< content_limit
2281 def is_html(first_bytes
):
2282 """ Detect whether a file contains HTML by examining its first bytes. """
2285 (b
'\xef\xbb\xbf', 'utf-8'),
2286 (b
'\x00\x00\xfe\xff', 'utf-32-be'),
2287 (b
'\xff\xfe\x00\x00', 'utf-32-le'),
2288 (b
'\xff\xfe', 'utf-16-le'),
2289 (b
'\xfe\xff', 'utf-16-be'),
2291 for bom
, enc
in BOMS
:
2292 if first_bytes
.startswith(bom
):
2293 s
= first_bytes
[len(bom
):].decode(enc
, 'replace')
2296 s
= first_bytes
.decode('utf-8', 'replace')
2298 return re
.match(r
'^\s*<', s
)
2301 def determine_protocol(info_dict
):
2302 protocol
= info_dict
.get('protocol')
2303 if protocol
is not None:
2306 url
= info_dict
['url']
2307 if url
.startswith('rtmp'):
2309 elif url
.startswith('mms'):
2311 elif url
.startswith('rtsp'):
2314 ext
= determine_ext(url
)
2320 return compat_urllib_parse_urlparse(url
).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell per column determines that column's field width.
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*table)]
    # Left-align every column but the last; the last is emitted as-is.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in table)
2331 def _match_one(filter_part
, dct
):
2332 COMPARISON_OPERATORS
= {
2340 operator_rex
= re
.compile(r
'''(?x)\s*
2342 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2344 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2345 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2348 ''' % '|'.join(map(re
.escape
, COMPARISON_OPERATORS
.keys())))
2349 m
= operator_rex
.search(filter_part
)
2351 op
= COMPARISON_OPERATORS
[m
.group('op')]
2352 actual_value
= dct
.get(m
.group('key'))
2353 if (m
.group('strval') is not None or
2354 # If the original field is a string and matching comparisonvalue is
2355 # a number we should respect the origin of the original field
2356 # and process comparison value as a string (see
2357 # https://github.com/rg3/youtube-dl/issues/11082).
2358 actual_value
is not None and m
.group('intval') is not None and
2359 isinstance(actual_value
, compat_str
)):
2360 if m
.group('op') not in ('=', '!='):
2362 'Operator %s does not support string values!' % m
.group('op'))
2363 comparison_value
= m
.group('strval') or m
.group('intval')
2366 comparison_value
= int(m
.group('intval'))
2368 comparison_value
= parse_filesize(m
.group('intval'))
2369 if comparison_value
is None:
2370 comparison_value
= parse_filesize(m
.group('intval') + 'B')
2371 if comparison_value
is None:
2373 'Invalid integer value %r in filter part %r' % (
2374 m
.group('intval'), filter_part
))
2375 if actual_value
is None:
2376 return m
.group('none_inclusive')
2377 return op(actual_value
, comparison_value
)
2380 '': lambda v
: v
is not None,
2381 '!': lambda v
: v
is None,
2383 operator_rex
= re
.compile(r
'''(?x)\s*
2384 (?P<op>%s)\s*(?P<key>[a-z_]+)
2386 ''' % '|'.join(map(re
.escape
, UNARY_OPERATORS
.keys())))
2387 m
= operator_rex
.search(filter_part
)
2389 op
= UNARY_OPERATORS
[m
.group('op')]
2390 actual_value
= dct
.get(m
.group('key'))
2391 return op(actual_value
)
2393 raise ValueError('Invalid filter part %r' % filter_part
)
2396 def match_str(filter_str
, dct
):
2397 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2400 _match_one(filter_part
, dct
) for filter_part
in filter_str
.split('&'))
2403 def match_filter_func(filter_str
):
2404 def _match_func(info_dict
):
2405 if match_str(filter_str
, info_dict
):
2408 video_title
= info_dict
.get('title', info_dict
.get('id', 'video'))
2409 return '%s does not pass filter %s, skipping ..' % (video_title
, filter_str
)
2413 def parse_dfxp_time_expr(time_expr
):
2417 mobj
= re
.match(r
'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr
)
2419 return float(mobj
.group('time_offset'))
2421 mobj
= re
.match(r
'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr
)
2423 return 3600 * int(mobj
.group(1)) + 60 * int(mobj
.group(2)) + float(mobj
.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates the float components, matching integer division.
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2430 def dfxp2srt(dfxp_data
):
2431 _x
= functools
.partial(xpath_with_ns
, ns_map
={
2432 'ttml': 'http://www.w3.org/ns/ttml',
2433 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2434 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
2437 class TTMLPElementParser(object):
2440 def start(self
, tag
, attrib
):
2441 if tag
in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2447 def data(self
, data
):
2451 return self
.out
.strip()
2453 def parse_node(node
):
2454 target
= TTMLPElementParser()
2455 parser
= xml
.etree
.ElementTree
.XMLParser(target
=target
)
2456 parser
.feed(xml
.etree
.ElementTree
.tostring(node
))
2457 return parser
.close()
2459 dfxp
= compat_etree_fromstring(dfxp_data
.encode('utf-8'))
2461 paras
= dfxp
.findall(_x('.//ttml:p')) or dfxp
.findall(_x('.//ttaf1:p')) or dfxp
.findall(_x('.//ttaf1_0604:p')) or dfxp
.findall('.//p')
2464 raise ValueError('Invalid dfxp/TTML subtitle')
2466 for para
, index
in zip(paras
, itertools
.count(1)):
2467 begin_time
= parse_dfxp_time_expr(para
.attrib
.get('begin'))
2468 end_time
= parse_dfxp_time_expr(para
.attrib
.get('end'))
2469 dur
= parse_dfxp_time_expr(para
.attrib
.get('dur'))
2470 if begin_time
is None:
2475 end_time
= begin_time
+ dur
2476 out
.append('%d\n%s --> %s\n%s\n\n' % (
2478 srt_subtitles_timecode(begin_time
),
2479 srt_subtitles_timecode(end_time
),
2485 def cli_option(params
, command_option
, param
):
2486 param
= params
.get(param
)
2488 param
= compat_str(param
)
2489 return [command_option
, param
] if param
is not None else []
2492 def cli_bool_option(params
, command_option
, param
, true_value
='true', false_value
='false', separator
=None):
2493 param
= params
.get(param
)
2494 assert isinstance(param
, bool)
2496 return [command_option
+ separator
+ (true_value
if param
else false_value
)]
2497 return [command_option
, true_value
if param
else false_value
]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value,
    otherwise an empty list (flag-style CLI option with no value)."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2505 def cli_configuration_args(params
, param
, default
=[]):
2506 ex_args
= params
.get(param
)
2509 assert isinstance(ex_args
, list)
2513 class ISO639Utils(object):
2514 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
def short2long(cls, code):
    """Convert language code from ISO 639-1 to ISO 639-2/T"""
    # Only the first two characters form the ISO 639-1 code.
    short_code = code[:2]
    return cls._lang_map.get(short_code)
2708 def long2short(cls
, code
):
2709 """Convert language code from ISO 639-2/T to ISO 639-1"""
2710 for short_name
, long_name
in cls
._lang
_map
.items():
2711 if long_name
== code
:
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # Maps ISO 3166-1 alpha-2 country codes to full English country names;
    # consumed by short2full() below.
    # NOTE(review): the `_country_map = {` opener, the closing brace and
    # the majority of the entries are not visible in this chunk; the
    # entries below are kept exactly as found.
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
2970 def short2full(cls
, code
):
2971 """Convert an ISO 3166-2 country code to the corresponding full name"""
2972 return cls
._country
_map
.get(code
.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header.

    Every http/https request is routed through proxy_open(), which picks
    the proxy named in the request's 'Ytdl-request-proxy' header (if any)
    over the handler-wide default.
    """

    def __init__(self, proxies=None):
        # Register default http/https openers that delegate to proxy_open().
        # The lambda's keyword defaults freeze the scheme and bound method
        # at definition time, avoiding the late-binding closure pitfall.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy overrides the handler-level default.
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # Interpret the bytes as a little-endian unsigned integer (reverse,
    # then read the hex big-endian).  `or b'0'` makes empty input encrypt
    # to 0 instead of raising ValueError from int(b'', 16).
    payload = int(binascii.hexlify(data[::-1]) or b'0', 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Render non-negative integer ``num`` in base ``n``.

    ``table`` supplies the digit alphabet; when omitted, the first ``n``
    characters of 0-9a-zA-Z are used.  Raises ValueError when the table
    is too short for the requested base.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Collect digits least-significant first, then reverse and join.
    digits = []
    while num:
        digits.append(table[num % n])
        num = num // n
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Rebuild the token -> symbol mapping; an empty symbol slot maps the
    # base-N token back to itself.
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        token = encode_base_n(idx, base)
        symbol_table[token] = symbols[idx] or token

    # Replace every word token in the packed payload with its symbol.
    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=val,KEY2="v,2"') into a dict.

    Quoted values may contain commas; the surrounding quotes are stripped.
    """
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift of *val* by *n* bits (JS '>>>')."""
    # Map a negative value onto its 32-bit two's-complement representation
    # before shifting, so the result matches an unsigned shift.
    if val < 0:
        val += 0x100000000
    return val >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG bytes and return a (width, height, pixels) tuple.

    NOTE(review): several structural lines of this function (the
    chunk-reading loop header, accumulator initialisations, the
    `_get_pixel` body and parts of the Paeth branch) are not visible in
    this chunk; the visible statements are kept as-is and only annotated.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # The 8-byte PNG signature must open the file and IHDR must be the
    # first chunk type (bytes 4-8 of the first chunk header).
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # struct format strings for big-endian unsigned ints of 1/2/4 bytes.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    # Chunk layout: 4-byte length, 4-byte type, <length> data bytes, CRC.
    # (The loop header iterating over the chunks is not visible here.)
    length = unpack_integer(header[:4])
    chunk_type = header[:4]
    chunk_data = header[:length]
    header = header[length:]
    header = header[4:]  # Skip CRC

    # IHDR (guaranteed first, checked above) carries the image dimensions
    # in its first 8 bytes.
    ihdr = chunks[0]['data']
    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Concatenate every IDAT payload into a single zlib stream.
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']
    # (the guard condition preceding this raise is not visible here)
    raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    def _get_pixel(idx):
    # (body of _get_pixel is not visible in this chunk)

    # Reverse the per-scanline filters; each scanline is prefixed by one
    # filter-type byte (https://www.w3.org/TR/PNG/#9Filters).
    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            # offsets of -3 suggest 3 bytes per pixel (RGB) — TODO confirm
            left = _get_pixel(basex - 3)
            up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                c = _get_pixel(basex - stride - 3)
                # (predictor values a/b and pa/pb/pc are computed in
                # lines not visible in this chunk)
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                color = (color + b) & 0xff
                color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set extended attribute ``key`` = ``value`` (bytes) on file ``path``.

    Backends tried: the pyxattr/xattr Python modules, NTFS Alternate Data
    Streams on Windows, then the setfattr/xattr command-line tools.
    Raises XAttrUnavailableError when no backend is usable and
    XAttrMetadataError when the chosen backend fails.

    NOTE(review): the try/except scaffolding (the `import xattr` attempt,
    its `except ImportError:` and several `else:` lines) is not visible in
    this chunk; the visible statements are kept as-is and only annotated.
    """
    # This mess below finds the best xattr tool for the job
    # try the pyxattr module...
    if hasattr(xattr, 'set'):  # pyxattr
        # Unicode arguments are not supported in python-pyxattr until
        # version 0.5.0; reject older versions outright.
        # See https://github.com/rg3/youtube-dl/issues/5498
        pyxattr_required_version = '0.5.0'
        if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
            # TODO: fallback to CLI tools
            raise XAttrUnavailableError(
                'python-pyxattr is detected but is too old. '
                'youtube-dl requires %s or above while your version is %s. '
                'Falling back to other xattr implementations' % (
                    pyxattr_required_version, xattr.__version__))

        # pyxattr exposes set(); plain python-xattr exposes setxattr().
        setxattr = xattr.set
    setxattr = xattr.setxattr

    # Delegate to whichever module-level setter was selected above.
    setxattr(path, key, value)
    except EnvironmentError as e:
        raise XAttrMetadataError(e.errno, e.strerror)

    # --- fallback paths below (reached on ImportError, not visible here) ---
    if compat_os_name == 'nt':
        # Write xattrs to NTFS Alternate Data Streams:
        # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
        assert ':' not in key
        assert os.path.exists(path)

        ads_fn = path + ':' + key
        with open(ads_fn, 'wb') as f:
        # (the with-body write of `value` is not visible in this chunk)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    # CLI fallback: probe for the setfattr and xattr binaries.
    user_has_setfattr = check_executable('setfattr', ['--version'])
    user_has_xattr = check_executable('xattr', ['-h'])

    if user_has_setfattr or user_has_xattr:

        # The CLI tools take the value as a text argument.
        value = value.decode('utf-8')
        if user_has_setfattr:
            executable = 'setfattr'
            opts = ['-n', key, '-v', value]
        elif user_has_xattr:
            executable = 'xattr'
            opts = ['-w', key, value]

        cmd = ([encodeFilename(executable, True)] +
               [encodeArgument(o) for o in opts] +
               [encodeFilename(path, True)])

        p = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        # Surface a non-zero exit as a metadata error with the tool's stderr.
        stdout, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)

    # On Unix, and can't find pyxattr, setfattr, or xattr.
    if sys.platform.startswith('linux'):
        raise XAttrUnavailableError(
            "Couldn't find a tool to set the xattrs. "
            "Install either the python 'pyxattr' or 'xattr' "
            "modules, or the GNU 'attr' package "
            "(which contains the 'setfattr' tool).")
    raise XAttrUnavailableError(
        "Couldn't find a tool to set the xattrs. "
        "Install either the python 'xattr' module, "
        "or the 'xattr' binary.")