4 from __future__
import unicode_literals
33 import xml
.etree
.ElementTree
40 compat_etree_fromstring
,
42 compat_html_entities_html5
,
48 compat_socket_create_connection
,
54 compat_urllib_parse_urlencode
,
55 compat_urllib_parse_urlparse
,
56 compat_urllib_parse_unquote_plus
,
57 compat_urllib_request
,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc component.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    known_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
77 # This is not clearly defined otherwise
78 compiled_regex_type
= type(re
.compile(''))
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
90 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
96 ENGLISH_MONTH_NAMES
= [
97 'January', 'February', 'March', 'April', 'May', 'June',
98 'July', 'August', 'September', 'October', 'November', 'December']
101 'en': ENGLISH_MONTH_NAMES
,
103 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
104 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
108 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
109 'flv', 'f4v', 'f4a', 'f4b',
110 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
111 'mkv', 'mka', 'mk3d',
120 'f4f', 'f4m', 'm3u8', 'smil')
122 # needed for sanitizing filenames in restricted mode
123 ACCENT_CHARS
= dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
124 itertools
.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
125 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
148 '%Y-%m-%d %H:%M:%S.%f',
151 '%Y-%m-%dT%H:%M:%SZ',
152 '%Y-%m-%dT%H:%M:%S.%fZ',
153 '%Y-%m-%dT%H:%M:%S.%f0Z',
155 '%Y-%m-%dT%H:%M:%S.%f',
158 '%b %d %Y at %H:%M:%S',
161 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
162 DATE_FORMATS_DAY_FIRST
.extend([
171 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
172 DATE_FORMATS_MONTH_FIRST
.extend([
180 PACKED_CODES_RE
= r
"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
183 def preferredencoding():
184 """Get preferred encoding.
186 Returns the best encoding scheme for the system, based on
187 locale.getpreferredencoding() and some further tweaks.
190 pref
= locale
.getpreferredencoding()
198 def write_json_file(obj
, fn
):
199 """ Encode obj as JSON and write it to fn, atomically if possible """
201 fn
= encodeFilename(fn
)
202 if sys
.version_info
< (3, 0) and sys
.platform
!= 'win32':
203 encoding
= get_filesystem_encoding()
204 # os.path.basename returns a bytes object, but NamedTemporaryFile
205 # will fail if the filename contains non ascii characters unless we
206 # use a unicode object
207 path_basename
= lambda f
: os
.path
.basename(fn
).decode(encoding
)
208 # the same for os.path.dirname
209 path_dirname
= lambda f
: os
.path
.dirname(fn
).decode(encoding
)
211 path_basename
= os
.path
.basename
212 path_dirname
= os
.path
.dirname
216 'prefix': path_basename(fn
) + '.',
217 'dir': path_dirname(fn
),
221 # In Python 2.x, json.dump expects a bytestream.
222 # In Python 3.x, it writes to a character stream
223 if sys
.version_info
< (3, 0):
231 tf
= tempfile
.NamedTemporaryFile(**compat_kwargs(args
))
236 if sys
.platform
== 'win32':
237 # Need to remove existing file on Windows, else os.rename raises
238 # WindowsError or FileExistsError.
243 os
.rename(tf
.name
, fn
)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """Find the first element matching xpath[@key] (or xpath[@key='val'])."""
        # Only plain attribute names are supported by this ElementTree fast path.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
259 def find_xpath_attr(node
, xpath
, key
, val
=None):
260 for f
in node
.findall(compat_xpath(xpath
)):
261 if key
not in f
.attrib
:
263 if val
is None or f
.attrib
.get(key
) == val
:
267 # On python2.6 the xml.etree.ElementTree.Element methods don't support
268 # the namespace parameter
271 def xpath_with_ns(path
, ns_map
):
272 components
= [c
.split(':') for c
in path
.split('/')]
276 replaced
.append(c
[0])
279 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
280 return '/'.join(replaced
)
283 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
284 def _find_xpath(xpath
):
285 return node
.find(compat_xpath(xpath
))
287 if isinstance(xpath
, (str, compat_str
)):
288 n
= _find_xpath(xpath
)
296 if default
is not NO_DEFAULT
:
299 name
= xpath
if name
is None else name
300 raise ExtractorError('Could not find XML element %s' % name
)
306 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
307 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
308 if n
is None or n
== default
:
311 if default
is not NO_DEFAULT
:
314 name
= xpath
if name
is None else name
315 raise ExtractorError('Could not find XML element\'s text %s' % name
)
321 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
322 n
= find_xpath_attr(node
, xpath
, key
)
324 if default
is not NO_DEFAULT
:
327 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
328 raise ExtractorError('Could not find XML attribute %s' % name
)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An element's ID is simply its 'id' attribute; delegate to the generic helper.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains class_name."""
    # Match class_name as a whole word anywhere inside the (quoted) class
    # attribute value; the value itself is a regex, so escaping is disabled.
    class_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', class_re, html, escape_value=False)
345 def get_element_by_attribute(attribute, value, html, escape_value=True):
346 """Return the content of the tag with the specified attribute in the passed HTML document"""
348 value = re.escape(value) if escape_value else value
350 m = re.search(r'''(?xs)
352 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'))*?
354 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'))*?
358 ''' % (re.escape(attribute), value), html)
362 res = m.group('content
')
364 if res.startswith('"') or res.startswith("'"):
367 return unescapeHTML(res)
370 class HTMLAttributeParser(compat_HTMLParser):
371 """Trivial HTML parser to gather the attributes for a single element"""
374 compat_HTMLParser.__init__(self)
376 def handle_starttag(self, tag, attrs):
377 self.attrs = dict(attrs)
380 def extract_attributes(html_element):
381 """Given a string for an HTML element such as
383 a="foo" B="bar" c="&98;az" d=boz
384 empty= noval entity="&"
387 Decode and return a dictionary of attributes.
389 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
390 'empty
': '', 'noval
': None, 'entity
': '&',
391 'sq
': '"', 'dq': '\''
393 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
394 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
396 parser = HTMLAttributeParser()
397 parser.feed(html_element)
402 def clean_html(html):
403 """Clean an HTML snippet into a readable string"""
405 if html is None: # Convenience for sanitizing descriptions etc.
409 html = html.replace('\n', ' ')
410 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
411 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
413 html = re.sub('<.*?>', '', html)
414 # Replace html entities
415 html = unescapeHTML(html)
419 def sanitize_open(filename, open_mode):
420 """Try to open the given filename, and slightly tweak it if this fails.
422 Attempts to open the given filename. If this fails, it tries to change
423 the filename slightly, step by step, until it's either able to open it
424 or it fails and raises a final exception, like the standard open()
427 It returns the tuple (stream, definitive_file_name).
431 if sys.platform == 'win32':
433 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
434 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
435 stream = open(encodeFilename(filename), open_mode)
436 return (stream, filename)
437 except (IOError, OSError) as err:
438 if err.errno in (errno.EACCES,):
441 # In case of error, try to remove win32 forbidden chars
442 alt_filename = sanitize_path(filename)
443 if alt_filename == filename:
446 # An exception here should be caught in the caller
447 stream = open(encodeFilename(alt_filename), open_mode)
448 return (stream, alt_filename)
451 def timeconvert(timestr):
452 """Convert RFC 2822 defined time string into system timestamp"""
454 timetuple = email.utils.parsedate_tz(timestr)
455 if timetuple is not None:
456 timestamp = email.utils.mktime_tz(timetuple)
460 def sanitize_filename(s, restricted=False, is_id=False):
461 """Sanitizes a string so it could be used as part of a filename.
462 If restricted is set, use a stricter subset of allowed characters.
463 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
465 def replace_insane(char):
466 if restricted and char in ACCENT_CHARS:
467 return ACCENT_CHARS[char]
468 if char == '?' or ord(char) < 32 or ord(char) == 127:
471 return '' if restricted else '\''
473 return '_
-' if restricted else ' -'
474 elif char in '\\/|
*<>':
476 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
478 if restricted
and ord(char
) > 127:
483 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
484 result
= ''.join(map(replace_insane
, s
))
486 while '__' in result
:
487 result
= result
.replace('__', '_')
488 result
= result
.strip('_')
489 # Common case of "Foreign band name - English song title"
490 if restricted
and result
.startswith('-_'):
492 if result
.startswith('-'):
493 result
= '_' + result
[len('-'):]
494 result
= result
.lstrip('.')
500 def sanitize_path(s
):
501 """Sanitizes and normalizes path on Windows"""
502 if sys
.platform
!= 'win32':
504 drive_or_unc
, _
= os
.path
.splitdrive(s
)
505 if sys
.version_info
< (2, 7) and not drive_or_unc
:
506 drive_or_unc
, _
= os
.path
.splitunc(s
)
507 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
511 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
512 for path_part
in norm_path
]
514 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
515 return os
.path
.join(*sanitized_path
)
518 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
519 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend the http: scheme to protocol-relative URLs; return others unchanged."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request after running the URL through sanitize_url."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
528 def orderedSet(iterable
):
529 """ Remove all duplicates from the input iterable """
537 def _htmlentity_transform(entity_with_semicolon
):
538 """Transforms an HTML entity to a character."""
539 entity
= entity_with_semicolon
[:-1]
541 # Known non-numeric HTML entity
542 if entity
in compat_html_entities
.name2codepoint
:
543 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
545 # TODO: HTML5 allows entities without a semicolon. For example,
546 # 'Éric' should be decoded as 'Éric'.
547 if entity_with_semicolon
in compat_html_entities_html5
:
548 return compat_html_entities_html5
[entity_with_semicolon
]
550 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
552 numstr
= mobj
.group(1)
553 if numstr
.startswith('x'):
555 numstr
= '0%s' % numstr
558 # See https://github.com/rg3/youtube-dl/issues/7518
560 return compat_chr(int(numstr
, base
))
564 # Unknown entity in name, return its literal representation
565 return '&%s;' % entity
571 assert type(s
) == compat_str
574 r
'&([^;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
577 def get_subprocess_encoding():
578 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
579 # For subprocess calls, encode with locale encoding
580 # Refer to http://stackoverflow.com/a/9951851/35070
581 encoding
= preferredencoding()
583 encoding
= sys
.getfilesystemencoding()
589 def encodeFilename(s
, for_subprocess
=False):
591 @param s The name of the file
594 assert type(s
) == compat_str
596 # Python 3 has a Unicode API
597 if sys
.version_info
>= (3, 0):
600 # Pass '' directly to use Unicode APIs on Windows 2000 and up
601 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
602 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
603 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
606 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
607 if sys
.platform
.startswith('java'):
610 return s
.encode(get_subprocess_encoding(), 'ignore')
613 def decodeFilename(b
, for_subprocess
=False):
615 if sys
.version_info
>= (3, 0):
618 if not isinstance(b
, bytes):
621 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument for the current platform.

    Expects a text string (compat_str); legacy byte-string callers are
    tolerated by decoding as ASCII first. Delegates to encodeFilename with
    for_subprocess=True.
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a subprocess argument (counterpart of encodeArgument)."""
    return decodeFilename(b, for_subprocess=True)
637 def decodeOption(optval
):
640 if isinstance(optval
, bytes):
641 optval
= optval
.decode(preferredencoding())
643 assert isinstance(optval
, compat_str
)
647 def formatSeconds(secs
):
649 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
651 return '%d:%02d' % (secs
// 60, secs
% 60)
656 def make_HTTPS_handler(params
, **kwargs
):
657 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
658 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
659 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
660 if opts_no_check_certificate
:
661 context
.check_hostname
= False
662 context
.verify_mode
= ssl
.CERT_NONE
664 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
667 # (create_default_context present but HTTPSHandler has no context=)
670 if sys
.version_info
< (3, 2):
671 return YoutubeDLHTTPSHandler(params
, **kwargs
)
673 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
674 context
.verify_mode
= (ssl
.CERT_NONE
675 if opts_no_check_certificate
676 else ssl
.CERT_REQUIRED
)
677 context
.set_default_verify_paths()
678 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
681 def bug_reports_message():
682 if ytdl_is_updateable():
683 update_cmd
= 'type youtube-dl -U to update'
685 update_cmd
= 'see https://yt-dl.org/update on how to update'
686 msg
= '; please report this issue on https://yt-dl.org/bug .'
687 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
688 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
692 class ExtractorError(Exception):
693 """Error during info extraction."""
695 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
696 """ tb, if given, is the original traceback (so that it can be printed out).
697 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
700 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
702 if video_id
is not None:
703 msg
= video_id
+ ': ' + msg
705 msg
+= ' (caused by %r)' % cause
707 msg
+= bug_reports_message()
708 super(ExtractorError
, self
).__init
__(msg
)
711 self
.exc_info
= sys
.exc_info() # preserve original exception
713 self
.video_id
= video_id
715 def format_traceback(self
):
716 if self
.traceback
is None:
718 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """ExtractorError raised when no extractor can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    May be thrown by FileDownloader objects if they are not configured to
    continue on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """msg is the error text; exc_info, if given, is the original exception
        that caused the trouble (as returned by sys.exc_info())."""
        Exception.__init__(self, msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
756 class PostProcessingError(Exception):
757 """Post Processing exception.
759 This exception may be raised by PostProcessor's .run() method to
760 indicate an error in the postprocessing task.
763 def __init__(self
, msg
):
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit has been reached."""
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller than
    the size the server announced first, indicating the connection was
    probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.expected = expected
        self.downloaded = downloaded
class XAttrMetadataError(Exception):
    """Raised when writing extended file attributes (xattrs) fails.

    self.reason classifies the failure so callers can react appropriately:
      * NO_SPACE       -- disk full or quota exhausted
      * VALUE_TOO_LONG -- attribute value exceeds the OS limit
      * NOT_SUPPORTED  -- anything else
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg.
        # NOTE: matching on 'Disk quota' (instead of the previous misspelled
        # 'Disk quota excedded') covers the real OS message
        # 'Disk quota exceeded' while remaining backward compatible.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
class XAttrUnavailableError(Exception):
    """Raised when extended attributes cannot be used at all
    (presumably no xattr support or tooling is available --
    inferred from the name; confirm against callers)."""
815 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
816 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
817 # expected HTTP responses to meet HTTP/1.0 or later (see also
818 # https://github.com/rg3/youtube-dl/issues/6727)
819 if sys
.version_info
< (3, 0):
820 kwargs
[b
'strict'] = True
821 hc
= http_class(*args
, **kwargs
)
822 source_address
= ydl_handler
._params
.get('source_address')
823 if source_address
is not None:
824 sa
= (source_address
, 0)
825 if hasattr(hc
, 'source_address'): # Python 2.7+
826 hc
.source_address
= sa
828 def _hc_connect(self
, *args
, **kwargs
):
829 sock
= compat_socket_create_connection(
830 (self
.host
, self
.port
), self
.timeout
, sa
)
832 self
.sock
= ssl
.wrap_socket(
833 sock
, self
.key_file
, self
.cert_file
,
834 ssl_version
=ssl
.PROTOCOL_TLSv1
)
837 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    If the marker is present, return a copy of headers with the marker and
    any Accept-Encoding header (matched case-insensitively) removed;
    otherwise return headers unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = {}
    for name, value in headers.items():
        if name.lower() != 'accept-encoding':
            filtered[name] = value
    del filtered['Youtubedl-no-compression']
    return filtered
852 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
853 """Handler for HTTP requests and responses.
855 This class, when installed with an OpenerDirector, automatically adds
856 the standard headers to every HTTP request and handles gzipped and
857 deflated responses from web servers. If compression is to be avoided in
858 a particular request, the original request in the program code only has
859 to include the HTTP header "Youtubedl-no-compression", which will be
860 removed before making the real request.
862 Part of this code was copied from:
864 http://techknack.net/python-urllib2-handlers/
866 Andrew Rowls, the author of that code, agreed to release it to the
870 def __init__(self
, params
, *args
, **kwargs
):
871 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
872 self
._params
= params
874 def http_open(self
, req
):
875 conn_class
= compat_http_client
.HTTPConnection
877 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
879 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
880 del req
.headers
['Ytdl-socks-proxy']
882 return self
.do_open(functools
.partial(
883 _create_http_connection
, self
, conn_class
, False),
889 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
891 return zlib
.decompress(data
)
894 def addinfourl_wrapper(stream
, headers
, url
, code
):
895 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
896 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
897 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
901 def http_request(self
, req
):
902 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
903 # always respected by websites, some tend to give out URLs with non percent-encoded
904 # non-ASCII characters (see telemb.py, ard.py [#3412])
905 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
906 # To work around aforementioned issue we will replace request's original URL with
907 # percent-encoded one
908 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
909 # the code of this workaround has been moved here from YoutubeDL.urlopen()
910 url
= req
.get_full_url()
911 url_escaped
= escape_url(url
)
913 # Substitute URL if any change after escaping
914 if url
!= url_escaped
:
915 req
= update_Request(req
, url
=url_escaped
)
917 for h
, v
in std_headers
.items():
918 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
919 # The dict keys are capitalized because of this bug by urllib
920 if h
.capitalize() not in req
.headers
:
923 req
.headers
= handle_youtubedl_headers(req
.headers
)
925 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
926 # Python 2.6 is brain-dead when it comes to fragments
927 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
928 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
932 def http_response(self
, req
, resp
):
935 if resp
.headers
.get('Content-encoding', '') == 'gzip':
936 content
= resp
.read()
937 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
939 uncompressed
= io
.BytesIO(gz
.read())
940 except IOError as original_ioerror
:
941 # There may be junk at the end of the file
942 # See http://stackoverflow.com/q/4928560/35070 for details
943 for i
in range(1, 1024):
945 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
946 uncompressed
= io
.BytesIO(gz
.read())
951 raise original_ioerror
952 resp
= self
.addinfourl_wrapper(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
953 resp
.msg
= old_resp
.msg
954 del resp
.headers
['Content-encoding']
956 if resp
.headers
.get('Content-encoding', '') == 'deflate':
957 gz
= io
.BytesIO(self
.deflate(resp
.read()))
958 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
959 resp
.msg
= old_resp
.msg
960 del resp
.headers
['Content-encoding']
961 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
962 # https://github.com/rg3/youtube-dl/issues/6457).
963 if 300 <= resp
.code
< 400:
964 location
= resp
.headers
.get('Location')
966 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
967 if sys
.version_info
>= (3, 0):
968 location
= location
.encode('iso-8859-1').decode('utf-8')
970 location
= location
.decode('utf-8')
971 location_escaped
= escape_url(location
)
972 if location
!= location_escaped
:
973 del resp
.headers
['Location']
974 if sys
.version_info
< (3, 0):
975 location_escaped
= location_escaped
.encode('utf-8')
976 resp
.headers
['Location'] = location_escaped
979 https_request
= http_request
980 https_response
= http_response
983 def make_socks_conn_class(base_class
, socks_proxy
):
984 assert issubclass(base_class
, (
985 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
987 url_components
= compat_urlparse
.urlparse(socks_proxy
)
988 if url_components
.scheme
.lower() == 'socks5':
989 socks_type
= ProxyType
.SOCKS5
990 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
991 socks_type
= ProxyType
.SOCKS4
992 elif url_components
.scheme
.lower() == 'socks4a':
993 socks_type
= ProxyType
.SOCKS4A
995 def unquote_if_non_empty(s
):
998 return compat_urllib_parse_unquote_plus(s
)
1002 url_components
.hostname
, url_components
.port
or 1080,
1004 unquote_if_non_empty(url_components
.username
),
1005 unquote_if_non_empty(url_components
.password
),
1008 class SocksConnection(base_class
):
1010 self
.sock
= sockssocket()
1011 self
.sock
.setproxy(*proxy_args
)
1012 if type(self
.timeout
) in (int, float):
1013 self
.sock
.settimeout(self
.timeout
)
1014 self
.sock
.connect((self
.host
, self
.port
))
1016 if isinstance(self
, compat_http_client
.HTTPSConnection
):
1017 if hasattr(self
, '_context'): # Python > 2.6
1018 self
.sock
= self
._context
.wrap_socket(
1019 self
.sock
, server_hostname
=self
.host
)
1021 self
.sock
= ssl
.wrap_socket(self
.sock
)
1023 return SocksConnection
1026 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
1027 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
1028 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
1029 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
1030 self
._params
= params
1032 def https_open(self
, req
):
1034 conn_class
= self
._https
_conn
_class
1036 if hasattr(self
, '_context'): # python > 2.6
1037 kwargs
['context'] = self
._context
1038 if hasattr(self
, '_check_hostname'): # python 3.x
1039 kwargs
['check_hostname'] = self
._check
_hostname
1041 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
1043 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
1044 del req
.headers
['Ytdl-socks-proxy']
1046 return self
.do_open(functools
.partial(
1047 _create_http_connection
, self
, conn_class
, True),
1051 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
1052 def __init__(self
, cookiejar
=None):
1053 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
1055 def http_response(self
, request
, response
):
1056 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1057 # characters in Set-Cookie HTTP header of last response (see
1058 # https://github.com/rg3/youtube-dl/issues/6769).
1059 # In order to at least prevent crashing we will percent encode Set-Cookie
1060 # header before HTTPCookieProcessor starts processing it.
1061 # if sys.version_info < (3, 0) and response.headers:
1062 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1063 # set_cookie = response.headers.get(set_cookie_header)
1065 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1066 # if set_cookie != set_cookie_escaped:
1067 # del response.headers[set_cookie_header]
1068 # response.headers[set_cookie_header] = set_cookie_escaped
1069 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
1071 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
1072 https_response
= http_response
1075 def extract_timezone(date_str
):
1077 r
'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1080 timezone
= datetime
.timedelta()
1082 date_str
= date_str
[:-len(m
.group('tz'))]
1083 if not m
.group('sign'):
1084 timezone
= datetime
.timedelta()
1086 sign
= 1 if m
.group('sign') == '+' else -1
1087 timezone
= datetime
.timedelta(
1088 hours
=sign
* int(m
.group('hours')),
1089 minutes
=sign
* int(m
.group('minutes')))
1090 return timezone
, date_str
1093 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
1094 """ Return a UNIX timestamp from the given date """
1096 if date_str
is None:
1099 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
1101 if timezone
is None:
1102 timezone
, date_str
= extract_timezone(date_str
)
1105 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
1106 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
1107 return calendar
.timegm(dt
.timetuple())
def date_formats(day_first=True):
    """Return the strptime format list to try, honouring day/month ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1116 def unified_strdate(date_str
, day_first
=True):
1117 """Return a string with the date in the format YYYYMMDD"""
1119 if date_str
is None:
1123 date_str
= date_str
.replace(',', ' ')
1124 # Remove AM/PM + timezone
1125 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1126 _
, date_str
= extract_timezone(date_str
)
1128 for expression
in date_formats(day_first
):
1130 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
1133 if upload_date
is None:
1134 timetuple
= email
.utils
.parsedate_tz(date_str
)
1137 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
1140 if upload_date
is not None:
1141 return compat_str(upload_date
)
1144 def unified_timestamp(date_str
, day_first
=True):
1145 if date_str
is None:
1148 date_str
= date_str
.replace(',', ' ')
1150 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
1151 timezone
, date_str
= extract_timezone(date_str
)
1153 # Remove AM/PM + timezone
1154 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1156 for expression
in date_formats(day_first
):
1158 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
1159 return calendar
.timegm(dt
.timetuple())
1162 timetuple
= email
.utils
.parsedate_tz(date_str
)
1164 return calendar
.timegm(timetuple
) + pm_delta
* 3600
1167 def determine_ext(url
, default_ext
='unknown_video'):
1170 guess
= url
.partition('?')[0].rpartition('.')[2]
1171 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
1173 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1174 elif guess
.rstrip('/') in KNOWN_EXTENSIONS
:
1175 return guess
.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name as <base>.<lang>.<format>."""
    # Strip only the last extension; a dot-less filename is kept whole.
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
1184 def date_from_str(date_str
):
1186 Return a datetime object from a string in the format YYYYMMDD or
1187 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1188 today
= datetime
.date
.today()
1189 if date_str
in ('now', 'today'):
1191 if date_str
== 'yesterday':
1192 return today
- datetime
.timedelta(days
=1)
1193 match
= re
.match(r
'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
1194 if match
is not None:
1195 sign
= match
.group('sign')
1196 time
= int(match
.group('time'))
1199 unit
= match
.group('unit')
1200 # A bad approximation?
1204 elif unit
== 'year':
1208 delta
= datetime
.timedelta(**{unit
: time
})
1209 return today
+ delta
1210 return datetime
.datetime
.strptime(date_str
, '%Y%m%d').date()
1213 def hyphenate_date(date_str
):
1215 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1216 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
1217 if match
is not None:
1218 return '-'.join(match
.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            # No lower bound: open the range at the earliest representable date.
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            # No upper bound: open the range at the latest representable date.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    # On Python 2 platform.platform() may return bytes; normalize to text.
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors to the matching GetStdHandle constants.
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle that is invalid, remote, or rejects GetConsoleMode is not
        # a real console, so the WriteConsoleW path must not be used.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) when there is none.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
def write_string(s, out=None, encoding=None):
    """Write the text `s` to `out` (default: stderr), handling the various
    encoding quirks of Python 2/3 and the Windows console."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
def bytes_to_intlist(bs):
    """Convert a bytes/str object into a list of byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        # Python 2 bytes indexes to 1-char strings; map through ord().
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of byte values (0-255) back into a bytes object."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking: define _lock_file/_unlock_file for the
# current platform (Windows LockFileEx, POSIX fcntl, or an unsupported stub).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: maximum byte range expressible in the API.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED alive on the file object for the unlock call.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """Context manager wrapping a file with a cross-platform advisory lock."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Writers take an exclusive lock; readers share.
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when unset."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
def shell_quote(args):
    """Quote a sequence of command-line arguments for display in a shell."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge with any data already smuggled into the URL.
    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Extract (url, data) previously packed by smuggle_url.

    Returns (smug_url, default) when no smuggled payload is present.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Render a byte count as a human-readable string ('1.00KiB').

    Returns 'N/A' for None. Note: parameter shadows the builtin `bytes`;
    kept for interface compatibility.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # log(0) is undefined; zero bytes is trivially exponent 0.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse a '<number><unit>' string using the given unit->multiplier map.

    Returns the value as an int, or None when the string does not match.
    """
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept ',' as a decimal separator (e.g. European formatting).
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
1542 def parse_filesize(s
):
1546 # The lower-case forms are of course incorrect and unofficial,
1547 # but we support those too
1564 'megabytes': 1000 ** 2,
1565 'mebibytes': 1024 ** 2,
1571 'gigabytes': 1000 ** 3,
1572 'gibibytes': 1024 ** 3,
1578 'terabytes': 1000 ** 4,
1579 'tebibytes': 1024 ** 4,
1585 'petabytes': 1000 ** 5,
1586 'pebibytes': 1024 ** 5,
1592 'exabytes': 1000 ** 6,
1593 'exbibytes': 1024 ** 6,
1599 'zettabytes': 1000 ** 7,
1600 'zebibytes': 1024 ** 7,
1606 'yottabytes': 1000 ** 8,
1607 'yobibytes': 1024 ** 8,
1610 return lookup_unit_table(_UNIT_TABLE
, s
)
1619 if re
.match(r
'^[\d,.]+$', s
):
1620 return str_to_int(s
)
1631 return lookup_unit_table(_UNIT_TABLE
, s
)
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    # Fall back to English when the requested language is unknown.
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        return month_names.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
1655 def fix_xml_ampersands(xml_str
):
1656 """Replace all the '&' by '&' in XML"""
1658 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1663 def setproctitle(title
):
1664 assert isinstance(title
, compat_str
)
1666 # ctypes in Jython is not complete
1667 # http://bugs.jython.org/issue2148
1668 if sys
.platform
.startswith('java'):
1672 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
1675 title_bytes
= title
.encode('utf-8')
1676 buf
= ctypes
.create_string_buffer(len(title_bytes
))
1677 buf
.value
= title_bytes
1679 libc
.prctl(15, buf
, 0, 0, 0)
1680 except AttributeError:
1681 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip a leading `start` prefix from s if present; None passes through."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip a trailing `end` suffix from s if present; None passes through."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around s, if any."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of a URL, ignoring query and fragment."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
1707 return re
.match(r
'https?://[^?#&]+/', url
).group()
1710 def urljoin(base
, path
):
1711 if not isinstance(path
, compat_str
) or not path
:
1713 if re
.match(r
'^(?:https?:)?//', path
):
1715 if not isinstance(base
, compat_str
) or not re
.match(r
'^(?:https?:)?//', base
):
1717 return compat_urlparse
.urljoin(base
, path
)
1720 class HEADRequest(compat_urllib_request
.Request
):
1721 def get_method(self
):
1725 class PUTRequest(compat_urllib_request
.Request
):
1726 def get_method(self
):
1730 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
1733 v
= getattr(v
, get_attr
, None)
1739 return int(v
) * invscale
// scale
def str_or_none(v, default=None):
    """Coerce v to compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1748 def str_to_int(int_str
):
1749 """ A more relaxed version of int_or_none """
1752 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
1756 def float_or_none(v
, scale
=1, invscale
=1, default
=None):
1760 return float(v
) * invscale
/ scale
def strip_or_none(v):
    """Strip surrounding whitespace from v; None passes through unchanged."""
    if v is None:
        return None
    return v.strip()
1769 def parse_duration(s
):
1770 if not isinstance(s
, compat_basestring
):
1775 days
, hours
, mins
, secs
, ms
= [None] * 5
1776 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
1778 days
, hours
, mins
, secs
, ms
= m
.groups()
1783 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1786 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1789 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1792 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1795 days
, hours
, mins
, secs
, ms
= m
.groups()
1797 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
1799 hours
, mins
= m
.groups()
1805 duration
+= float(secs
)
1807 duration
+= float(mins
) * 60
1809 duration
+= float(hours
) * 60 * 60
1811 duration
+= float(days
) * 24 * 60 * 60
1813 duration
+= float(ms
)
1817 def prepend_extension(filename
, ext
, expected_real_ext
=None):
1818 name
, real_ext
= os
.path
.splitext(filename
)
1820 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
1821 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
1822 else '{0}.{1}'.format(filename
, ext
))
1825 def replace_extension(filename
, ext
, expected_real_ext
=None):
1826 name
, real_ext
= os
.path
.splitext(filename
)
1827 return '{0}.{1}'.format(
1828 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
1832 def check_executable(exe
, args
=[]):
1833 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1834 args can be a list of arguments for a short output (like -version) """
1836 subprocess
.Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate()
1842 def get_exe_version(exe
, args
=['--version'],
1843 version_re
=None, unrecognized
='present'):
1844 """ Returns the version of the specified executable,
1845 or False if the executable is not present """
1847 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1848 # SIGTTOU if youtube-dl is run in the background.
1849 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1850 out
, _
= subprocess
.Popen(
1851 [encodeArgument(exe
)] + args
,
1852 stdin
=subprocess
.PIPE
,
1853 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate()
1856 if isinstance(out
, bytes): # Python 2.x
1857 out
= out
.decode('ascii', 'ignore')
1858 return detect_exe_version(out
, version_re
, unrecognized
)
1861 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
1862 assert isinstance(output
, compat_str
)
1863 if version_re
is None:
1864 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
1865 m
= re
.search(version_re
, output
)
1872 class PagedList(object):
1874 # This is only useful for tests
1875 return len(self
.getslice())
1878 class OnDemandPagedList(PagedList
):
1879 def __init__(self
, pagefunc
, pagesize
, use_cache
=False):
1880 self
._pagefunc
= pagefunc
1881 self
._pagesize
= pagesize
1882 self
._use
_cache
= use_cache
1886 def getslice(self
, start
=0, end
=None):
1888 for pagenum
in itertools
.count(start
// self
._pagesize
):
1889 firstid
= pagenum
* self
._pagesize
1890 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
1891 if start
>= nextfirstid
:
1896 page_results
= self
._cache
.get(pagenum
)
1897 if page_results
is None:
1898 page_results
= list(self
._pagefunc
(pagenum
))
1900 self
._cache
[pagenum
] = page_results
1903 start
% self
._pagesize
1904 if firstid
<= start
< nextfirstid
1908 ((end
- 1) % self
._pagesize
) + 1
1909 if (end
is not None and firstid
<= end
<= nextfirstid
)
1912 if startv
!= 0 or endv
is not None:
1913 page_results
= page_results
[startv
:endv
]
1914 res
.extend(page_results
)
1916 # A little optimization - if current page is not "full", ie. does
1917 # not contain page_size videos then we can assume that this page
1918 # is the last one - there are no more ids on further pages -
1919 # i.e. no need to query again.
1920 if len(page_results
) + startv
< self
._pagesize
:
1923 # If we got the whole page, but the next page is not interesting,
1924 # break out early as well
1925 if end
== nextfirstid
:
1930 class InAdvancePagedList(PagedList
):
1931 def __init__(self
, pagefunc
, pagecount
, pagesize
):
1932 self
._pagefunc
= pagefunc
1933 self
._pagecount
= pagecount
1934 self
._pagesize
= pagesize
1936 def getslice(self
, start
=0, end
=None):
1938 start_page
= start
// self
._pagesize
1940 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
1941 skip_elems
= start
- start_page
* self
._pagesize
1942 only_more
= None if end
is None else end
- start
1943 for pagenum
in range(start_page
, end_page
):
1944 page
= list(self
._pagefunc
(pagenum
))
1946 page
= page
[skip_elems
:]
1948 if only_more
is not None:
1949 if len(page
) < only_more
:
1950 only_more
-= len(page
)
1952 page
= page
[:only_more
]
1959 def uppercase_escape(s
):
1960 unicode_escape
= codecs
.getdecoder('unicode_escape')
1962 r
'\\U[0-9a-fA-F]{8}',
1963 lambda m
: unicode_escape(m
.group(0))[0],
1967 def lowercase_escape(s
):
1968 unicode_escape
= codecs
.getdecoder('unicode_escape')
1970 r
'\\u[0-9a-fA-F]{4}',
1971 lambda m
: unicode_escape(m
.group(0))[0],
1975 def escape_rfc3986(s
):
1976 """Escape non-ASCII characters as suggested by RFC 3986"""
1977 if sys
.version_info
< (3, 0) and isinstance(s
, compat_str
):
1978 s
= s
.encode('utf-8')
1979 return compat_urllib_parse
.quote(s
, b
"%/;:@&=+$,!~*'()?#[]")
1982 def escape_url(url
):
1983 """Escape URL as suggested by RFC 3986"""
1984 url_parsed
= compat_urllib_parse_urlparse(url
)
1985 return url_parsed
._replace
(
1986 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
1987 path
=escape_rfc3986(url_parsed
.path
),
1988 params
=escape_rfc3986(url_parsed
.params
),
1989 query
=escape_rfc3986(url_parsed
.query
),
1990 fragment
=escape_rfc3986(url_parsed
.fragment
)
1994 def read_batch_urls(batch_fd
):
1996 if not isinstance(url
, compat_str
):
1997 url
= url
.decode('utf-8', 'replace')
1998 BOM_UTF8
= '\xef\xbb\xbf'
1999 if url
.startswith(BOM_UTF8
):
2000 url
= url
[len(BOM_UTF8
):]
2002 if url
.startswith(('#', ';', ']')):
2006 with contextlib
.closing(batch_fd
) as fd
:
2007 return [url
for url
in map(fixup
, fd
) if url
]
2010 def urlencode_postdata(*args
, **kargs
):
2011 return compat_urllib_parse_urlencode(*args
, **kargs
).encode('ascii')
2014 def update_url_query(url
, query
):
2017 parsed_url
= compat_urlparse
.urlparse(url
)
2018 qs
= compat_parse_qs(parsed_url
.query
)
2020 return compat_urlparse
.urlunparse(parsed_url
._replace
(
2021 query
=compat_urllib_parse_urlencode(qs
, True)))
2024 def update_Request(req
, url
=None, data
=None, headers
={}, query
={}):
2025 req_headers
= req
.headers
.copy()
2026 req_headers
.update(headers
)
2027 req_data
= data
or req
.data
2028 req_url
= update_url_query(url
or req
.get_full_url(), query
)
2029 req_get_method
= req
.get_method()
2030 if req_get_method
== 'HEAD':
2031 req_type
= HEADRequest
2032 elif req_get_method
== 'PUT':
2033 req_type
= PUTRequest
2035 req_type
= compat_urllib_request
.Request
2037 req_url
, data
=req_data
, headers
=req_headers
,
2038 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
2039 if hasattr(req
, 'timeout'):
2040 new_req
.timeout
= req
.timeout
2044 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
2045 if isinstance(key_or_keys
, (list, tuple)):
2046 for key
in key_or_keys
:
2047 if key
not in d
or d
[key
] is None or skip_false_values
and not d
[key
]:
2051 return d
.get(key_or_keys
, default
)
2054 def try_get(src
, getter
, expected_type
=None):
2057 except (AttributeError, KeyError, TypeError, IndexError):
2060 if expected_type
is None or isinstance(v
, expected_type
):
2064 def encode_compat_str(string
, encoding
=preferredencoding(), errors
='strict'):
2065 return string
if isinstance(string
, compat_str
) else compat_str(string
, encoding
, errors
)
2077 TV_PARENTAL_GUIDELINES
= {
2087 def parse_age_limit(s
):
2089 return s
if 0 <= s
<= 21 else None
2090 if not isinstance(s
, compat_basestring
):
2092 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
2094 return int(m
.group('age'))
2096 return US_RATINGS
[s
]
2097 return TV_PARENTAL_GUIDELINES
.get(s
)
2100 def strip_jsonp(code
):
2102 r
'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r
'\1', code
)
2105 def js_to_json(code
):
2106 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2107 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
2109 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
2110 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
2115 if v
in ('true', 'false', 'null'):
2117 elif v
.startswith('/*') or v
.startswith('//') or v
== ',':
2120 if v
[0] in ("'", '"'):
2121 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
2126 }.get(m
.group(0), m
.group(0)), v
[1:-1])
2128 for regex
, base
in INTEGER_TABLE
:
2129 im
= re
.match(regex
, v
)
2131 i
= int(im
.group(1), base
)
2132 return '"%d":' % i
if v
.endswith(':') else '%d' % i
2136 return re
.sub(r
'''(?sx)
2137 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2138 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2139 {comment}|,(?={skip}[\]}}])|
2140 [a-zA-Z_][.a-zA-Z_0-9]*|
2141 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2143 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
2146 def qualities(quality_ids
):
2147 """ Get a numeric quality value out of a list of possible values """
2150 return quality_ids
.index(qid
)
2156 DEFAULT_OUTTMPL
= '%(title)s-%(id)s.%(ext)s'
2159 def limit_length(s
, length
):
2160 """ Add ellipses to overly long strings """
2165 return s
[:length
- len(ELLIPSES
)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2173 def is_outdated_version(version
, limit
, assume_new
=True):
2175 return not assume_new
2177 return version_tuple(version
) < version_tuple(limit
)
2179 return not assume_new
2182 def ytdl_is_updateable():
2183 """ Returns if youtube-dl can be updated with -U """
2184 from zipimport
import zipimporter
2186 return isinstance(globals().get('__loader__'), zipimporter
) or hasattr(sys
, 'frozen')
2189 def args_to_str(args
):
2190 # Get a short string representation for a subprocess command
2191 return ' '.join(compat_shlex_quote(a
) for a
in args
)
2194 def error_to_compat_str(err
):
2196 # On python 2 error byte string must be decoded with proper
2197 # encoding rather than ascii
2198 if sys
.version_info
[0] < 3:
2199 err_str
= err_str
.decode(preferredencoding())
2203 def mimetype2ext(mt
):
2209 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2210 # it's the most popular one
2211 'audio/mpeg': 'mp3',
2216 _
, _
, res
= mt
.rpartition('/')
2217 res
= res
.split(';')[0].strip().lower()
2221 'smptett+xml': 'tt',
2227 'x-mp4-fragmented': 'mp4',
2230 'x-mpegurl': 'm3u8',
2231 'vnd.apple.mpegurl': 'm3u8',
2236 'vnd.ms-sstr+xml': 'ism',
2241 def parse_codecs(codecs_str
):
2242 # http://tools.ietf.org/html/rfc6381
2245 splited_codecs
= list(filter(None, map(
2246 lambda str: str.strip(), codecs_str
.strip().strip(',').split(','))))
2247 vcodec
, acodec
= None, None
2248 for full_codec
in splited_codecs
:
2249 codec
= full_codec
.split('.')[0]
2250 if codec
in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2253 elif codec
in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
2257 write_string('WARNING: Unknown codec %s' % full_codec
, sys
.stderr
)
2258 if not vcodec
and not acodec
:
2259 if len(splited_codecs
) == 2:
2264 elif len(splited_codecs
) == 1:
2271 'vcodec': vcodec
or 'none',
2272 'acodec': acodec
or 'none',
2277 def urlhandle_detect_ext(url_handle
):
2278 getheader
= url_handle
.headers
.get
2280 cd
= getheader('Content-Disposition')
2282 m
= re
.match(r
'attachment;\s*filename="(?P<filename>[^"]+)"', cd
)
2284 e
= determine_ext(m
.group('filename'), default_ext
=None)
2288 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Wrap raw bytes into an RFC 2397 base64 data: URI."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Strip a known byte-order mark before decoding, if one is present.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
2325 def determine_protocol(info_dict
):
2326 protocol
= info_dict
.get('protocol')
2327 if protocol
is not None:
2330 url
= info_dict
['url']
2331 if url
.startswith('rtmp'):
2333 elif url
.startswith('mms'):
2335 elif url
.startswith('rtsp'):
2338 ext
= determine_ext(url
)
2344 return compat_urllib_parse_urlparse(url
).scheme
2347 def render_table(header_row
, data
):
2348 """ Render a list of rows, each as a list of values """
2349 table
= [header_row
] + data
2350 max_lens
= [max(len(compat_str(v
)) for v
in col
) for col
in zip(*table
)]
2351 format_str
= ' '.join('%-' + compat_str(ml
+ 1) + 's' for ml
in max_lens
[:-1]) + '%s'
2352 return '\n'.join(format_str
% tuple(row
) for row
in table
)
2355 def _match_one(filter_part
, dct
):
2356 COMPARISON_OPERATORS
= {
2364 operator_rex
= re
.compile(r
'''(?x)\s*
2366 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2368 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2369 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2372 ''' % '|'.join(map(re
.escape
, COMPARISON_OPERATORS
.keys())))
2373 m
= operator_rex
.search(filter_part
)
2375 op
= COMPARISON_OPERATORS
[m
.group('op')]
2376 actual_value
= dct
.get(m
.group('key'))
2377 if (m
.group('strval') is not None or
2378 # If the original field is a string and matching comparisonvalue is
2379 # a number we should respect the origin of the original field
2380 # and process comparison value as a string (see
2381 # https://github.com/rg3/youtube-dl/issues/11082).
2382 actual_value
is not None and m
.group('intval') is not None and
2383 isinstance(actual_value
, compat_str
)):
2384 if m
.group('op') not in ('=', '!='):
2386 'Operator %s does not support string values!' % m
.group('op'))
2387 comparison_value
= m
.group('strval') or m
.group('intval')
2390 comparison_value
= int(m
.group('intval'))
2392 comparison_value
= parse_filesize(m
.group('intval'))
2393 if comparison_value
is None:
2394 comparison_value
= parse_filesize(m
.group('intval') + 'B')
2395 if comparison_value
is None:
2397 'Invalid integer value %r in filter part %r' % (
2398 m
.group('intval'), filter_part
))
2399 if actual_value
is None:
2400 return m
.group('none_inclusive')
2401 return op(actual_value
, comparison_value
)
2404 '': lambda v
: v
is not None,
2405 '!': lambda v
: v
is None,
2407 operator_rex
= re
.compile(r
'''(?x)\s*
2408 (?P<op>%s)\s*(?P<key>[a-z_]+)
2410 ''' % '|'.join(map(re
.escape
, UNARY_OPERATORS
.keys())))
2411 m
= operator_rex
.search(filter_part
)
2413 op
= UNARY_OPERATORS
[m
.group('op')]
2414 actual_value
= dct
.get(m
.group('key'))
2415 return op(actual_value
)
2417 raise ValueError('Invalid filter part %r' % filter_part
)
2420 def match_str(filter_str
, dct
):
2421 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2424 _match_one(filter_part
, dct
) for filter_part
in filter_str
.split('&'))
2427 def match_filter_func(filter_str
):
2428 def _match_func(info_dict
):
2429 if match_str(filter_str
, info_dict
):
2432 video_title
= info_dict
.get('title', info_dict
.get('id', 'video'))
2433 return '%s does not pass filter %s, skipping ..' % (video_title
, filter_str
)
2437 def parse_dfxp_time_expr(time_expr
):
2441 mobj
= re
.match(r
'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr
)
2443 return float(mobj
.group('time_offset'))
2445 mobj
= re
.match(r
'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr
)
2447 return 3600 * int(mobj
.group(1)) + 60 * int(mobj
.group(2)) + float(mobj
.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2454 def dfxp2srt(dfxp_data
):
2455 _x
= functools
.partial(xpath_with_ns
, ns_map
={
2456 'ttml': 'http://www.w3.org/ns/ttml',
2457 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2458 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
2461 class TTMLPElementParser(object):
2464 def start(self
, tag
, attrib
):
2465 if tag
in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2471 def data(self
, data
):
2475 return self
.out
.strip()
2477 def parse_node(node
):
2478 target
= TTMLPElementParser()
2479 parser
= xml
.etree
.ElementTree
.XMLParser(target
=target
)
2480 parser
.feed(xml
.etree
.ElementTree
.tostring(node
))
2481 return parser
.close()
2483 dfxp
= compat_etree_fromstring(dfxp_data
.encode('utf-8'))
2485 paras
= dfxp
.findall(_x('.//ttml:p')) or dfxp
.findall(_x('.//ttaf1:p')) or dfxp
.findall(_x('.//ttaf1_0604:p')) or dfxp
.findall('.//p')
2488 raise ValueError('Invalid dfxp/TTML subtitle')
2490 for para
, index
in zip(paras
, itertools
.count(1)):
2491 begin_time
= parse_dfxp_time_expr(para
.attrib
.get('begin'))
2492 end_time
= parse_dfxp_time_expr(para
.attrib
.get('end'))
2493 dur
= parse_dfxp_time_expr(para
.attrib
.get('dur'))
2494 if begin_time
is None:
2499 end_time
= begin_time
+ dur
2500 out
.append('%d\n%s --> %s\n%s\n\n' % (
2502 srt_subtitles_timecode(begin_time
),
2503 srt_subtitles_timecode(end_time
),
def cli_option(params, command_option, param):
    """Build ['--option', value] from a params dict entry, or [] when unset."""
    param = params.get(param)
    if param:
        # Normalize non-string truthy values (e.g. ints) to text.
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a boolean command-line option from a params dict entry.

    With `separator` set, emits a single '--opt<sep>value' token; otherwise
    emits ['--opt', value]. The param must be present and a bool.
    """
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when the param equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2529 def cli_configuration_args(params
, param
, default
=[]):
2530 ex_args
= params
.get(param
)
2533 assert isinstance(ex_args
, list)
2537 class ISO639Utils(object):
2538 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2727 def short2long(cls
, code
):
2728 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2729 return cls
._lang
_map
.get(code
[:2])
2732 def long2short(cls
, code
):
2733 """Convert language code from ISO 639-2/T to ISO 639-1"""
2734 for short_name
, long_name
in cls
._lang
_map
.items():
2735 if long_name
== code
:
2739 class ISO3166Utils(object):
2740 # From http://data.okfn.org/data/core/country-list
2742 'AF': 'Afghanistan',
2743 'AX': 'Åland Islands',
2746 'AS': 'American Samoa',
2751 'AG': 'Antigua and Barbuda',
2768 'BO': 'Bolivia, Plurinational State of',
2769 'BQ': 'Bonaire, Sint Eustatius and Saba',
2770 'BA': 'Bosnia and Herzegovina',
2772 'BV': 'Bouvet Island',
2774 'IO': 'British Indian Ocean Territory',
2775 'BN': 'Brunei Darussalam',
2777 'BF': 'Burkina Faso',
2783 'KY': 'Cayman Islands',
2784 'CF': 'Central African Republic',
2788 'CX': 'Christmas Island',
2789 'CC': 'Cocos (Keeling) Islands',
2793 'CD': 'Congo, the Democratic Republic of the',
2794 'CK': 'Cook Islands',
2796 'CI': 'Côte d\'Ivoire',
2801 'CZ': 'Czech Republic',
2805 'DO': 'Dominican Republic',
2808 'SV': 'El Salvador',
2809 'GQ': 'Equatorial Guinea',
2813 'FK': 'Falkland Islands (Malvinas)',
2814 'FO': 'Faroe Islands',
2818 'GF': 'French Guiana',
2819 'PF': 'French Polynesia',
2820 'TF': 'French Southern Territories',
2835 'GW': 'Guinea-Bissau',
2838 'HM': 'Heard Island and McDonald Islands',
2839 'VA': 'Holy See (Vatican City State)',
2846 'IR': 'Iran, Islamic Republic of',
2849 'IM': 'Isle of Man',
2859 'KP': 'Korea, Democratic People\'s Republic of',
2860 'KR': 'Korea, Republic of',
2863 'LA': 'Lao People\'s Democratic Republic',
2869 'LI': 'Liechtenstein',
2873 'MK': 'Macedonia, the Former Yugoslav Republic of',
2880 'MH': 'Marshall Islands',
2886 'FM': 'Micronesia, Federated States of',
2887 'MD': 'Moldova, Republic of',
2898 'NL': 'Netherlands',
2899 'NC': 'New Caledonia',
2900 'NZ': 'New Zealand',
2905 'NF': 'Norfolk Island',
2906 'MP': 'Northern Mariana Islands',
2911 'PS': 'Palestine, State of',
2913 'PG': 'Papua New Guinea',
2916 'PH': 'Philippines',
2920 'PR': 'Puerto Rico',
2924 'RU': 'Russian Federation',
2926 'BL': 'Saint Barthélemy',
2927 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2928 'KN': 'Saint Kitts and Nevis',
2929 'LC': 'Saint Lucia',
2930 'MF': 'Saint Martin (French part)',
2931 'PM': 'Saint Pierre and Miquelon',
2932 'VC': 'Saint Vincent and the Grenadines',
2935 'ST': 'Sao Tome and Principe',
2936 'SA': 'Saudi Arabia',
2940 'SL': 'Sierra Leone',
2942 'SX': 'Sint Maarten (Dutch part)',
2945 'SB': 'Solomon Islands',
2947 'ZA': 'South Africa',
2948 'GS': 'South Georgia and the South Sandwich Islands',
2949 'SS': 'South Sudan',
2954 'SJ': 'Svalbard and Jan Mayen',
2957 'CH': 'Switzerland',
2958 'SY': 'Syrian Arab Republic',
2959 'TW': 'Taiwan, Province of China',
2961 'TZ': 'Tanzania, United Republic of',
2963 'TL': 'Timor-Leste',
2967 'TT': 'Trinidad and Tobago',
2970 'TM': 'Turkmenistan',
2971 'TC': 'Turks and Caicos Islands',
2975 'AE': 'United Arab Emirates',
2976 'GB': 'United Kingdom',
2977 'US': 'United States',
2978 'UM': 'United States Minor Outlying Islands',
2982 'VE': 'Venezuela, Bolivarian Republic of',
2984 'VG': 'Virgin Islands, British',
2985 'VI': 'Virgin Islands, U.S.',
2986 'WF': 'Wallis and Futuna',
2987 'EH': 'Western Sahara',
@classmethod
def short2full(cls, code):
    """Convert an ISO 3166-2 country code to the corresponding full name"""
    # Normalise to upper case so lookups are case-insensitive.
    normalized = code.upper()
    return cls._country_map.get(normalized)
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that lets each individual request pick its own
    proxy via the internal 'Ytdl-request-proxy' header; the sentinel value
    '__noproxy__' disables proxying for that request entirely."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # Bind the loop variable and the bound method as lambda defaults
            # so each generated <scheme>_open handler keeps its own scheme
            # instead of closing over the last loop value (late binding).
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy override, if the caller set one.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            # Strip the internal header so it is never sent on the wire.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        # SOCKS proxies are not handled here: flag the request with an
        # internal header and let the http/https handlers deal with it.
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # Interpret the reversed (little-endian) bytes as one large integer,
    # then apply textbook RSA: payload ** exponent mod modulus.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer ``num`` as a string in base ``n``.

    table: optional custom digit alphabet; defaults to the first ``n``
        characters of 0-9, a-z, A-Z (so n may be at most 62 by default).
    Returns the base-n string representation of ``num``.
    Raises ValueError when ``n`` exceeds the table length or ``num`` is
    negative.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if table is None:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    # Floor division of a negative number never reaches 0, so the loop
    # below would never terminate; reject negatives explicitly.
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """Unpack JavaScript that was obfuscated with Dean Edwards' packer."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Build the token -> identifier table: each index, written in base n,
    # maps to its symbol (or to itself when the symbol slot is empty).
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list string into a dict of key -> value."""
    info = {}
    for key, val in re.findall(
            r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Quoted-string values keep their surrounding double quotes in the
        # match; strip them here.
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
def urshift(val, n):
    """Logical (unsigned) right shift of a 32-bit value, like JS's >>>."""
    if val < 0:
        # Reinterpret the negative number as its unsigned 32-bit two's
        # complement equivalent before shifting.
        val += 0x100000000
    return val >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG byte string into (width, height, pixels).

    pixels is a list of rows; each row is a flat list of byte values.
    NOTE(review): stride = width * 3 assumes 3 bytes per pixel (8-bit RGB,
    no alpha, no interlacing); bit depth and colour type from IHDR are not
    checked — confirm callers only feed such PNGs.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # Validate the 8-byte PNG signature and that the first chunk is IHDR.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # struct formats for big-endian integers of 1, 2 and 4 bytes.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Split the stream into chunks: 4-byte length, 4-byte type,
    # `length` data bytes, then a 4-byte CRC which is skipped unverified.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is the first chunk (verified by the signature check above).
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # Image data may span several IDAT chunks; concatenate them all.
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    # The IDAT payload is zlib-compressed filtered scanlines.
    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3  # bytes per scanline (3 bytes/pixel assumed)
    pixels = []

    # Fetch an already-decoded byte by absolute (unfiltered) byte index.
    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed with one filter-type byte.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # "left" is the corresponding channel of the previous pixel
            # (3 bytes back); "up" is the same byte one scanline above.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Undo the per-scanline filter (https://www.w3.org/TR/PNG/#9Filters).
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Choose the predictor closest to p; ties favour a, then b.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Write the extended attribute ``key`` = ``value`` (bytes) on ``path``.

    Tries, in order: the pyxattr/xattr Python modules; NTFS Alternate Data
    Streams on Windows; the setfattr/xattr command-line tools.
    Raises XAttrMetadataError when a write attempt fails and
    XAttrUnavailableError when no usable implementation can be found.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            # Normalise OS errors into the project's xattr error type.
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            # Fall back to whichever CLI tool exists on this system.
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The CLI tools take the value as a text argument.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    # Surface the tool's exit code and stderr to the caller.
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")