4 from __future__
import unicode_literals
36 import xml
.etree
.ElementTree
40 compat_HTMLParseError
,
45 compat_ctypes_WINFUNCTYPE
,
46 compat_etree_fromstring
,
49 compat_html_entities_html5
,
61 compat_urllib_parse_urlencode
,
62 compat_urllib_parse_urlparse
,
63 compat_urllib_parse_unquote_plus
,
64 compat_urllib_request
,
def register_socks_protocols():
    """Teach urlparse that SOCKS proxy schemes carry a netloc.

    In Python < 2.6.5, urlsplit() suffers from
    https://bugs.python.org/issue7904: URLs whose scheme is missing from
    urlparse.uses_netloc are not handled correctly, so the SOCKS schemes
    are appended here (idempotently) before any proxy URL gets parsed.
    """
    known = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known:
            known.append(scheme)
# This is not clearly defined otherwise
# Concrete type of a compiled regular-expression object; older `re` modules
# exposed no public name for it, so it is captured here (presumably for
# isinstance() checks elsewhere in the file — confirm at call sites).
compiled_regex_type = type(re.compile(''))
88 def random_user_agent():
89 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1668 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1672 'User-Agent': random_user_agent(),
1673 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1674 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1675 'Accept-Encoding': 'gzip, deflate',
1676 'Accept-Language': 'en-us,en;q=0.5',
1681 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Unique sentinel: lets the xpath_* helpers below distinguish "caller gave
# no default" (raise on failure) from an explicit default value of None.
NO_DEFAULT = object()
# Full English month names in calendar order (index 0 == January); also used
# as the 'en' entry of the MONTH_NAMES table defined just below.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1692 'en': ENGLISH_MONTH_NAMES
,
1694 'janvier', 'fƩvrier', 'mars', 'avril', 'mai', 'juin',
1695 'juillet', 'aoƻt', 'septembre', 'octobre', 'novembre', 'dƩcembre'],
1698 KNOWN_EXTENSIONS
= (
1699 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1700 'flv', 'f4v', 'f4a', 'f4b',
1701 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1702 'mkv', 'mka', 'mk3d',
1705 'asf', 'wmv', 'wma',
1711 'f4f', 'f4m', 'm3u8', 'smil')
1713 # needed for sanitizing filenames in restricted mode
1714 ACCENT_CHARS
= dict(zip('ĆĆĆĆĆĆ
ĆĆĆĆĆĆĆĆĆĆĆĆĆĆĆĆĆÅĆÅĆĆĆĆÅ°ĆĆĆĆ Ć”Ć¢Ć£Ć¤Ć„Ć¦Ć§ĆØĆ©ĆŖƫƬĆĆ®ĆÆĆ°Ć±Ć²Ć³Ć“ĆµĆ¶ÅĆøÅĆ¹ĆŗĆ»Ć¼Å±Ć½Ć¾Ćæ',
1715 itertools
.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
1716 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1739 '%Y/%m/%d %H:%M:%S',
1741 '%Y-%m-%d %H:%M:%S',
1742 '%Y-%m-%d %H:%M:%S.%f',
1745 '%Y-%m-%dT%H:%M:%SZ',
1746 '%Y-%m-%dT%H:%M:%S.%fZ',
1747 '%Y-%m-%dT%H:%M:%S.%f0Z',
1748 '%Y-%m-%dT%H:%M:%S',
1749 '%Y-%m-%dT%H:%M:%S.%f',
1751 '%b %d %Y at %H:%M',
1752 '%b %d %Y at %H:%M:%S',
1753 '%B %d %Y at %H:%M',
1754 '%B %d %Y at %H:%M:%S',
1757 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1758 DATE_FORMATS_DAY_FIRST
.extend([
1764 '%d/%m/%Y %H:%M:%S',
1767 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1768 DATE_FORMATS_MONTH_FIRST
.extend([
1773 '%m/%d/%Y %H:%M:%S',
# Tail of "packed" (P.A.C.K.E.R.-style) JavaScript: captures the payload,
# two numeric parameters and the '|'-separated keyword table.
# NOTE(review): group semantics inferred from the unpacker convention —
# confirm against the decoder that consumes this pattern.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Case-insensitive, dot-matches-newline pattern for a JSON-LD block:
# <script ... type="application/ld+json" ...>payload</script>.
# The backreference \1 forces the closing quote around the type value to
# match the opening one (or both to be absent); the payload is exposed as
# the named group 'json_ld' (non-greedy, so the nearest </script> ends it).
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
1780 def preferredencoding():
1781 """Get preferred encoding.
1783 Returns the best encoding scheme for the system, based on
1784 locale.getpreferredencoding() and some further tweaks.
1787 pref = locale.getpreferredencoding()
1795 def write_json_file(obj, fn):
1796 """ Encode obj as JSON and write it to fn, atomically if possible """
1798 fn = encodeFilename(fn)
1799 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1800 encoding = get_filesystem_encoding()
1801 # os.path.basename returns a bytes object, but NamedTemporaryFile
1802 # will fail if the filename contains non ascii characters unless we
1803 # use a unicode object
1804 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1805 # the same for os.path.dirname
1806 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1808 path_basename = os.path.basename
1809 path_dirname = os.path.dirname
1813 'prefix
': path_basename(fn) + '.',
1814 'dir': path_dirname(fn),
1818 # In Python 2.x, json.dump expects a bytestream.
1819 # In Python 3.x, it writes to a character stream
1820 if sys.version_info < (3, 0):
1825 'encoding
': 'utf
-8',
1828 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1833 if sys.platform == 'win32
':
1834 # Need to remove existing file on Windows, else os.rename raises
1835 # WindowsError or FileExistsError.
1843 os.chmod(tf.name, 0o666 & ~mask)
1846 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Only plain attribute names are supported; anything fancier would
        # need quoting inside the predicate below.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
1862 def find_xpath_attr(node, xpath, key, val=None):
1863 for f in node.findall(compat_xpath(xpath)):
1864 if key not in f.attrib:
1866 if val is None or f.attrib.get(key) == val:
1870 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1871 # the namespace parameter
1874 def xpath_with_ns(path
, ns_map
):
1875 components
= [c
.split(':') for c
in path
.split('/')]
1877 for c
in components
:
1879 replaced
.append(c
[0])
1882 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
1883 return '/'.join(replaced
)
1886 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1887 def _find_xpath(xpath
):
1888 return node
.find(compat_xpath(xpath
))
1890 if isinstance(xpath
, (str, compat_str
)):
1891 n
= _find_xpath(xpath
)
1899 if default
is not NO_DEFAULT
:
1902 name
= xpath
if name
is None else name
1903 raise ExtractorError('Could not find XML element %s' % name
)
1909 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1910 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1911 if n
is None or n
== default
:
1914 if default
is not NO_DEFAULT
:
1917 name
= xpath
if name
is None else name
1918 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1924 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1925 n
= find_xpath_attr(node
, xpath
, key
)
1927 if default
is not NO_DEFAULT
:
1930 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1931 raise ExtractorError('Could not find XML attribute %s' % name
)
1934 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NB: the parameter deliberately shadows the builtin `id`; renaming it
    # would break callers that pass it as a keyword argument.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag with attribute=value in the passed HTML document, or None."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # A class attribute may hold several space-separated names, so match the
    # requested one anywhere within the quoted value, on word boundaries.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
1960 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1961 """Return the content of the tag with the specified attribute in the passed HTML document"""
1963 value = re.escape(value) if escape_value else value
1966 for m in re.finditer(r'''(?xs)
1968 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1970 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1974 ''' % (re.escape(attribute), value), html):
1975 res = m.group('content
')
1977 if res.startswith('"') or res.startswith("'"):
1980 retlist.append(unescapeHTML(res))
1985 class HTMLAttributeParser(compat_HTMLParser):
1986 """Trivial HTML parser to gather the attributes for a single element"""
1989 compat_HTMLParser.__init__(self)
1991 def handle_starttag(self, tag, attrs):
1992 self.attrs = dict(attrs)
1995 def extract_attributes(html_element):
1996 """Given a string for an HTML element such as
1998 a="foo" B="bar" c="&98;az" d=boz
1999 empty= noval entity="&"
2002 Decode and return a dictionary of attributes.
2004 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
2005 'empty
': '', 'noval
': None, 'entity
': '&',
2006 'sq
': '"', 'dq': '\''
2008 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2009 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2011 parser = HTMLAttributeParser()
2013 parser.feed(html_element)
2015 # Older Python may throw HTMLParseError in case of malformed HTML
2016 except compat_HTMLParseError:
2021 def clean_html(html):
2022 """Clean an HTML snippet into a readable string"""
2024 if html is None: # Convenience for sanitizing descriptions etc.
2028 html = html.replace('\n', ' ')
2029 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2030 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2032 html = re.sub('<.*?>', '', html)
2033 # Replace html entities
2034 html = unescapeHTML(html)
2038 def sanitize_open(filename, open_mode):
2039 """Try to open the given filename, and slightly tweak it if this fails.
2041 Attempts to open the given filename. If this fails, it tries to change
2042 the filename slightly, step by step, until it's either able to open it
2043 or it fails and raises a final exception, like the standard open()
2046 It returns the tuple (stream, definitive_file_name).
2050 if sys.platform == 'win32':
2052 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2053 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2054 stream = open(encodeFilename(filename), open_mode)
2055 return (stream, filename)
2056 except (IOError, OSError) as err:
2057 if err.errno in (errno.EACCES,):
2060 # In case of error, try to remove win32 forbidden chars
2061 alt_filename = sanitize_path(filename)
2062 if alt_filename == filename:
2065 # An exception here should be caught in the caller
2066 stream = open(encodeFilename(alt_filename), open_mode)
2067 return (stream, alt_filename)
2070 def timeconvert(timestr):
2071 """Convert RFC 2822 defined time string into system timestamp"""
2073 timetuple = email.utils.parsedate_tz(timestr)
2074 if timetuple is not None:
2075 timestamp = email.utils.mktime_tz(timetuple)
2079 def sanitize_filename(s, restricted=False, is_id=False):
2080 """Sanitizes a string so it could be used as part of a filename.
2081 If restricted is set, use a stricter subset of allowed characters.
2082 Set is_id if this is not an arbitrary string, but an ID that should be kept
2085 def replace_insane(char):
2086 if restricted and char in ACCENT_CHARS:
2087 return ACCENT_CHARS[char]
2088 if char == '?' or ord(char) < 32 or ord(char) == 127:
2091 return '' if restricted else '\''
2093 return '_
-' if restricted else ' -'
2094 elif char in '\\/|
*<>':
2096 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2098 if restricted
and ord(char
) > 127:
2103 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2104 result
= ''.join(map(replace_insane
, s
))
2106 while '__' in result
:
2107 result
= result
.replace('__', '_')
2108 result
= result
.strip('_')
2109 # Common case of "Foreign band name - English song title"
2110 if restricted
and result
.startswith('-_'):
2112 if result
.startswith('-'):
2113 result
= '_' + result
[len('-'):]
2114 result
= result
.lstrip('.')
2120 def sanitize_path(s
):
2121 """Sanitizes and normalizes path on Windows"""
2122 if sys
.platform
!= 'win32':
2124 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2125 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2126 drive_or_unc
, _
= os
.path
.splitunc(s
)
2127 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2131 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2132 for path_part
in norm_path
]
2134 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2135 return os
.path
.join(*sanitized_path
)
2138 def sanitize_url(url
):
2139 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
2140 # the number of unwanted failures due to missing protocol
2141 if url
.startswith('//'):
2142 return 'http:%s' % url
2143 # Fix some common typos seen so far
2145 # https://github.com/ytdl-org/youtube-dl/issues/15649
2146 (r
'^httpss://', r
'https://'),
2147 # https://bx1.be/lives/direct-tv/
2148 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
2150 for mistake
, fixup
in COMMON_TYPOS
:
2151 if re
.match(mistake
, url
):
2152 return re
.sub(mistake
, fixup
, url
)
def sanitized_Request(url, *args, **kwargs):
    """Construct a urllib Request with the URL normalised via sanitize_url()."""
    clean = sanitize_url(url)
    return compat_urllib_request.Request(clean, *args, **kwargs)
2161 """Expand shell variables and ~"""
2162 return os
.path
.expandvars(compat_expanduser(s
))
2165 def orderedSet(iterable
):
2166 """ Remove all duplicates from the input iterable """
2174 def _htmlentity_transform(entity_with_semicolon
):
2175 """Transforms an HTML entity to a character."""
2176 entity
= entity_with_semicolon
[:-1]
2178 # Known non-numeric HTML entity
2179 if entity
in compat_html_entities
.name2codepoint
:
2180 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2182 # TODO: HTML5 allows entities without a semicolon. For example,
2183 # 'Éric' should be decoded as 'Ćric'.
2184 if entity_with_semicolon
in compat_html_entities_html5
:
2185 return compat_html_entities_html5
[entity_with_semicolon
]
2187 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2188 if mobj
is not None:
2189 numstr
= mobj
.group(1)
2190 if numstr
.startswith('x'):
2192 numstr
= '0%s' % numstr
2195 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2197 return compat_chr(int(numstr
, base
))
2201 # Unknown entity in name, return its literal representation
2202 return '&%s;' % entity
2205 def unescapeHTML(s
):
2208 assert type(s
) == compat_str
2211 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
2214 def get_subprocess_encoding():
2215 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2216 # For subprocess calls, encode with locale encoding
2217 # Refer to http://stackoverflow.com/a/9951851/35070
2218 encoding
= preferredencoding()
2220 encoding
= sys
.getfilesystemencoding()
2221 if encoding
is None:
2226 def encodeFilename(s
, for_subprocess
=False):
2228 @param s The name of the file
2231 assert type(s
) == compat_str
2233 # Python 3 has a Unicode API
2234 if sys
.version_info
>= (3, 0):
2237 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2238 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2239 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2240 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2243 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2244 if sys
.platform
.startswith('java'):
2247 return s
.encode(get_subprocess_encoding(), 'ignore')
2250 def decodeFilename(b
, for_subprocess
=False):
2252 if sys
.version_info
>= (3, 0):
2255 if not isinstance(b
, bytes):
2258 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a program argument for subprocess use, via encodeFilename()."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    # Inverse of encodeArgument(): decode a subprocess argument back into
    # text via decodeFilename() with for_subprocess=True.
    return decodeFilename(b, True)
2274 def decodeOption(optval
):
2277 if isinstance(optval
, bytes):
2278 optval
= optval
.decode(preferredencoding())
2280 assert isinstance(optval
, compat_str
)
2284 def formatSeconds(secs
):
2286 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
2288 return '%d:%02d' % (secs
// 60, secs
% 60)
2293 def make_HTTPS_handler(params
, **kwargs
):
2294 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2295 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2296 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2297 if opts_no_check_certificate
:
2298 context
.check_hostname
= False
2299 context
.verify_mode
= ssl
.CERT_NONE
2301 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2304 # (create_default_context present but HTTPSHandler has no context=)
2307 if sys
.version_info
< (3, 2):
2308 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2309 else: # Python < 3.4
2310 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2311 context
.verify_mode
= (ssl
.CERT_NONE
2312 if opts_no_check_certificate
2313 else ssl
.CERT_REQUIRED
)
2314 context
.set_default_verify_paths()
2315 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2318 def bug_reports_message():
2319 if ytdl_is_updateable():
2320 update_cmd
= 'type youtube-dl -U to update'
2322 update_cmd
= 'see https://yt-dl.org/update on how to update'
2323 msg
= '; please report this issue on https://yt-dl.org/bug .'
2324 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
2325 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class YoutubeDLError(Exception):
    """Root of the YoutubeDL exception hierarchy."""
2334 class ExtractorError(YoutubeDLError
):
2335 """Error during info extraction."""
2337 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
2338 """ tb, if given, is the original traceback (so that it can be printed out).
2339 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
2342 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
2344 if video_id
is not None:
2345 msg
= video_id
+ ': ' + msg
2347 msg
+= ' (caused by %r)' % cause
2349 msg
+= bug_reports_message()
2350 super(ExtractorError
, self
).__init
__(msg
)
2353 self
.exc_info
= sys
.exc_info() # preserve original exception
2355 self
.video_id
= video_id
2357 def format_traceback(self
):
2358 if self
.traceback
is None:
2360 return ''.join(traceback
.format_tb(self
.traceback
))
2363 class UnsupportedError(ExtractorError
):
2364 def __init__(self
, url
):
2365 super(UnsupportedError
, self
).__init
__(
2366 'Unsupported URL: %s' % url
, expected
=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a mandatory regular-expression search produced no match."""
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None):
        # Geo blocks are an expected condition, not a youtube-dl bug.
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        # NOTE(review): presumably a list of country codes where the video
        # is available — confirm with raisers.
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    Raised by FileDownloader objects when they are not configured to
    continue on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info())."""
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(YoutubeDLError):
    """Same File exception.

    Raised by FileDownloader objects when multiple files would have to be
    downloaded to the same path on disk.
    """
2410 class PostProcessingError(YoutubeDLError
):
2411 """Post Processing exception.
2413 This exception may be raised by PostProcessor's .run() method to
2414 indicate an error in the postprocessing task.
2417 def __init__(self
, msg
):
2418 super(PostProcessingError
, self
).__init
__(msg
)
class MaxDownloadsReached(YoutubeDLError):
    """Signals that the --max-downloads limit has been hit."""
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not available
    for that video.
    """
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    Raised by FileDownloader objects when a download turns out smaller
    than the size the server announced first, indicating the connection
    was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        super(ContentTooShortError, self).__init__(message)
        # Both byte counts are preserved for the caller to inspect.
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(YoutubeDLError):
    """Error raised when writing extended file attributes (xattrs) fails.

    The failure is classified into self.reason — 'NO_SPACE',
    'VALUE_TOO_LONG' or 'NOT_SUPPORTED' — based on the errno code and/or
    the error message text.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg
                # BUG FIX: the historical check only looked for the
                # misspelling 'excedded' and so never matched the real
                # "Disk quota exceeded" message; accept both spellings to
                # stay backward-compatible.
                or 'Disk quota excedded' in self.msg
                or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
class XAttrUnavailableError(YoutubeDLError):
    """Error for unusable/missing extended-attribute (xattr) support.

    NOTE(review): purpose inferred from the name — confirm with raisers.
    """
2473 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2474 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2475 # expected HTTP responses to meet HTTP/1.0 or later (see also
2476 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2477 if sys
.version_info
< (3, 0):
2478 kwargs
['strict'] = True
2479 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2480 source_address
= ydl_handler
._params
.get('source_address')
2482 if source_address
is not None:
2483 # This is to workaround _create_connection() from socket where it will try all
2484 # address data from getaddrinfo() including IPv6. This filters the result from
2485 # getaddrinfo() based on the source_address value.
2486 # This is based on the cpython socket.create_connection() function.
2487 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2488 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2489 host
, port
= address
2491 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2492 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2493 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2494 if addrs
and not ip_addrs
:
2495 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2497 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2498 % (ip_version
, source_address
[0]))
2499 for res
in ip_addrs
:
2500 af
, socktype
, proto
, canonname
, sa
= res
2503 sock
= socket
.socket(af
, socktype
, proto
)
2504 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2505 sock
.settimeout(timeout
)
2506 sock
.bind(source_address
)
2508 err
= None # Explicitly break reference cycle
2510 except socket
.error
as _
:
2512 if sock
is not None:
2517 raise socket
.error('getaddrinfo returns an empty list')
2518 if hasattr(hc
, '_create_connection'):
2519 hc
._create
_connection
= _create_connection
2520 sa
= (source_address
, 0)
2521 if hasattr(hc
, 'source_address'): # Python 2.7+
2522 hc
.source_address
= sa
2524 def _hc_connect(self
, *args
, **kwargs
):
2525 sock
= _create_connection(
2526 (self
.host
, self
.port
), self
.timeout
, sa
)
2528 self
.sock
= ssl
.wrap_socket(
2529 sock
, self
.key_file
, self
.cert_file
,
2530 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2533 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, return a copy of *headers* without the
    marker and without any 'Accept-Encoding' header (matched
    case-insensitively); otherwise return *headers* unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict((name, value) for name, value in headers.items()
                    if name.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
2548 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2549 """Handler for HTTP requests and responses.
2551 This class, when installed with an OpenerDirector, automatically adds
2552 the standard headers to every HTTP request and handles gzipped and
2553 deflated responses from web servers. If compression is to be avoided in
2554 a particular request, the original request in the program code only has
2555 to include the HTTP header "Youtubedl-no-compression", which will be
2556 removed before making the real request.
2558 Part of this code was copied from:
2560 http://techknack.net/python-urllib2-handlers/
2562 Andrew Rowls, the author of that code, agreed to release it to the
2566 def __init__(self
, params
, *args
, **kwargs
):
2567 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
2568 self
._params
= params
2570 def http_open(self
, req
):
2571 conn_class
= compat_http_client
.HTTPConnection
2573 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2575 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2576 del req
.headers
['Ytdl-socks-proxy']
2578 return self
.do_open(functools
.partial(
2579 _create_http_connection
, self
, conn_class
, False),
2585 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2587 return zlib
.decompress(data
)
2589 def http_request(self
, req
):
2590 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2591 # always respected by websites, some tend to give out URLs with non percent-encoded
2592 # non-ASCII characters (see telemb.py, ard.py [#3412])
2593 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2594 # To work around aforementioned issue we will replace request's original URL with
2595 # percent-encoded one
2596 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2597 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2598 url
= req
.get_full_url()
2599 url_escaped
= escape_url(url
)
2601 # Substitute URL if any change after escaping
2602 if url
!= url_escaped
:
2603 req
= update_Request(req
, url
=url_escaped
)
2605 for h
, v
in std_headers
.items():
2606 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2607 # The dict keys are capitalized because of this bug by urllib
2608 if h
.capitalize() not in req
.headers
:
2609 req
.add_header(h
, v
)
2611 req
.headers
= handle_youtubedl_headers(req
.headers
)
2613 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2614 # Python 2.6 is brain-dead when it comes to fragments
2615 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2616 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2620 def http_response(self
, req
, resp
):
2623 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2624 content
= resp
.read()
2625 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2627 uncompressed
= io
.BytesIO(gz
.read())
2628 except IOError as original_ioerror
:
2629 # There may be junk add the end of the file
2630 # See http://stackoverflow.com/q/4928560/35070 for details
2631 for i
in range(1, 1024):
2633 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2634 uncompressed
= io
.BytesIO(gz
.read())
2639 raise original_ioerror
2640 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2641 resp
.msg
= old_resp
.msg
2642 del resp
.headers
['Content-encoding']
2644 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2645 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2646 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2647 resp
.msg
= old_resp
.msg
2648 del resp
.headers
['Content-encoding']
2649 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2650 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2651 if 300 <= resp
.code
< 400:
2652 location
= resp
.headers
.get('Location')
2654 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2655 if sys
.version_info
>= (3, 0):
2656 location
= location
.encode('iso-8859-1').decode('utf-8')
2658 location
= location
.decode('utf-8')
2659 location_escaped
= escape_url(location
)
2660 if location
!= location_escaped
:
2661 del resp
.headers
['Location']
2662 if sys
.version_info
< (3, 0):
2663 location_escaped
= location_escaped
.encode('utf-8')
2664 resp
.headers
['Location'] = location_escaped
    # HTTPS traffic gets exactly the same request/response massaging as HTTP.
    https_request = http_request
    https_response = http_response
2671 def make_socks_conn_class(base_class
, socks_proxy
):
2672 assert issubclass(base_class
, (
2673 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2675 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2676 if url_components
.scheme
.lower() == 'socks5':
2677 socks_type
= ProxyType
.SOCKS5
2678 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2679 socks_type
= ProxyType
.SOCKS4
2680 elif url_components
.scheme
.lower() == 'socks4a':
2681 socks_type
= ProxyType
.SOCKS4A
2683 def unquote_if_non_empty(s
):
2686 return compat_urllib_parse_unquote_plus(s
)
2690 url_components
.hostname
, url_components
.port
or 1080,
2692 unquote_if_non_empty(url_components
.username
),
2693 unquote_if_non_empty(url_components
.password
),
2696 class SocksConnection(base_class
):
2698 self
.sock
= sockssocket()
2699 self
.sock
.setproxy(*proxy_args
)
2700 if type(self
.timeout
) in (int, float):
2701 self
.sock
.settimeout(self
.timeout
)
2702 self
.sock
.connect((self
.host
, self
.port
))
2704 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2705 if hasattr(self
, '_context'): # Python > 2.6
2706 self
.sock
= self
._context
.wrap_socket(
2707 self
.sock
, server_hostname
=self
.host
)
2709 self
.sock
= ssl
.wrap_socket(self
.sock
)
2711 return SocksConnection
2714 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
2715 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
2716 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
2717 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
2718 self
._params
= params
2720 def https_open(self
, req
):
2722 conn_class
= self
._https
_conn
_class
2724 if hasattr(self
, '_context'): # python > 2.6
2725 kwargs
['context'] = self
._context
2726 if hasattr(self
, '_check_hostname'): # python 3.x
2727 kwargs
['check_hostname'] = self
._check
_hostname
2729 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2731 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2732 del req
.headers
['Ytdl-socks-proxy']
2734 return self
.do_open(functools
.partial(
2735 _create_http_connection
, self
, conn_class
, True),
2739 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2741 See [1] for cookie file format.
2743 1. https://curl.haxx.se/docs/http-cookies.html
2745 _HTTPONLY_PREFIX
= '#HttpOnly_'
2747 _HEADER
= '''# Netscape HTTP Cookie File
2748 # This file is generated by youtube-dl. Do not edit.
2751 _CookieFileEntry
= collections
.namedtuple(
2753 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2755 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2757 Save cookies to a file.
2759 Most of the code is taken from CPython 3.8 and slightly adapted
2760 to support cookie files with UTF-8 in both python 2 and 3.
2762 if filename
is None:
2763 if self
.filename
is not None:
2764 filename
= self
.filename
2766 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2768 # Store session cookies with `expires` set to 0 instead of an empty
2771 if cookie
.expires
is None:
2774 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2775 f
.write(self
._HEADER
)
2778 if not ignore_discard
and cookie
.discard
:
2780 if not ignore_expires
and cookie
.is_expired(now
):
2786 if cookie
.domain
.startswith('.'):
2787 initial_dot
= 'TRUE'
2789 initial_dot
= 'FALSE'
2790 if cookie
.expires
is not None:
2791 expires
= compat_str(cookie
.expires
)
2794 if cookie
.value
is None:
2795 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2796 # with no name, whereas http.cookiejar regards it as a
2797 # cookie with no value.
2802 value
= cookie
.value
2804 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2805 secure
, expires
, name
, value
]) + '\n')
2807 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2808 """Load cookies from a file."""
2809 if filename
is None:
2810 if self
.filename
is not None:
2811 filename
= self
.filename
2813 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2815 def prepare_line(line
):
2816 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2817 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2818 # comments and empty lines are fine
2819 if line
.startswith('#') or not line
.strip():
2821 cookie_list
= line
.split('\t')
2822 if len(cookie_list
) != self
._ENTRY
_LEN
:
2823 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
2824 cookie
= self
._CookieFileEntry
(*cookie_list
)
2825 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
2826 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
2830 with io
.open(filename
, encoding
='utf-8') as f
:
2833 cf
.write(prepare_line(line
))
2834 except compat_cookiejar
.LoadError
as e
:
2836 'WARNING: skipping cookie file entry due to %s: %r\n'
2837 % (e
, line
), sys
.stderr
)
2840 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2841 # Session cookies are denoted by either `expires` field set to
2842 # an empty string or 0. MozillaCookieJar only recognizes the former
2843 # (see [1]). So we need force the latter to be recognized as session
2844 # cookies on our own.
2845 # Session cookies may be important for cookies-based authentication,
2846 # e.g. usually, when user does not check 'Remember me' check box while
2847 # logging in on a site, some important cookies are stored as session
2848 # cookies so that not recognizing them will result in failed login.
2849 # 1. https://bugs.python.org/issue17164
2851 # Treat `expires=0` cookies as session cookies
2852 if cookie
.expires
== 0:
2853 cookie
.expires
= None
2854 cookie
.discard
= True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also routes HTTPS traffic through the HTTP hooks."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """Redirect handler working around Python 2 byte-string redirect URLs."""

    if sys.version_info[0] < 3:
        def redirect_request(self, req, fp, code, msg, headers, newurl):
            # On python 2 urlh.geturl() may sometimes return redirect URL
            # as byte string instead of unicode. This workaround allows
            # to force it always return unicode.
            return compat_urllib_request.HTTPRedirectHandler.redirect_request(
                self, req, fp, code, msg, headers, compat_str(newurl))
def extract_timezone(date_str):
    """Split a trailing UTC offset (or 'Z') off *date_str*.

    Returns (timedelta, remaining_date_str); the delta is zero when no
    explicit offset is present.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # Bare 'Z' suffix means UTC
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # strptime('%S') does not understand fractional seconds; drop them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        moment = datetime.datetime.strptime(date_str, fmt) - timezone
    except ValueError:
        return None
    return calendar.timegm(moment.timetuple())
def date_formats(day_first=True):
    """Return the strptime patterns to try, ordered by day/month preference."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string."""
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            moment = (datetime.datetime.strptime(date_str, expression)
                      - timezone + datetime.timedelta(hours=pm_delta))
            return calendar.timegm(moment.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from *url*, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the subtitle filename for *filename* with language and format suffix."""
    new_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, new_ext, expected_real_ext)
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation: months/years become 30/365 days.
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        return today + datetime.timedelta(**{unit + 's': amount})
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # On Python 2 this may come back as a byte string
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
3089 def _windows_write_string(s
, out
):
3090 """ Returns True if the string was written using special methods,
3091 False if it has yet to be written out."""
3092 # Adapted from http://stackoverflow.com/a/3259271/35070
3095 import ctypes
.wintypes
3103 fileno
= out
.fileno()
3104 except AttributeError:
3105 # If the output stream doesn't have a fileno, it's virtual
3107 except io
.UnsupportedOperation
:
3108 # Some strange Windows pseudo files?
3110 if fileno
not in WIN_OUTPUT_IDS
:
3113 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3114 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3115 ('GetStdHandle', ctypes
.windll
.kernel32
))
3116 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3118 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3119 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3120 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3121 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3122 written
= ctypes
.wintypes
.DWORD(0)
3124 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3125 FILE_TYPE_CHAR
= 0x0002
3126 FILE_TYPE_REMOTE
= 0x8000
3127 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3128 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3129 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3130 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3131 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3133 def not_a_console(handle
):
3134 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3136 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3137 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3139 if not_a_console(h
):
3142 def next_nonbmp_pos(s
):
3144 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3145 except StopIteration:
3149 count
= min(next_nonbmp_pos(s
), 1024)
3151 ret
= WriteConsoleW(
3152 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3154 raise OSError('Failed to write string')
3155 if not count
: # We just wrote a non-BMP character
3156 assert written
.value
== 2
3159 assert written
.value
> 0
3160 s
= s
[written
.value
:]
def write_string(s, out=None, encoding=None):
    """Write the unicode string *s* to *out* (default stderr), coping with
    Windows consoles and byte-mode streams."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '')
            or sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    return [ord(c) for c in bs]  # Python 2: indexing yields 1-char strings
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack integer byte values into a byte string."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
3201 # Cross-platform file locking
3202 if sys
.platform
== 'win32':
3203 import ctypes
.wintypes
3206 class OVERLAPPED(ctypes
.Structure
):
3208 ('Internal', ctypes
.wintypes
.LPVOID
),
3209 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3210 ('Offset', ctypes
.wintypes
.DWORD
),
3211 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3212 ('hEvent', ctypes
.wintypes
.HANDLE
),
3215 kernel32
= ctypes
.windll
.kernel32
3216 LockFileEx
= kernel32
.LockFileEx
3217 LockFileEx
.argtypes
= [
3218 ctypes
.wintypes
.HANDLE
, # hFile
3219 ctypes
.wintypes
.DWORD
, # dwFlags
3220 ctypes
.wintypes
.DWORD
, # dwReserved
3221 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3222 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3223 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3225 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3226 UnlockFileEx
= kernel32
.UnlockFileEx
3227 UnlockFileEx
.argtypes
= [
3228 ctypes
.wintypes
.HANDLE
, # hFile
3229 ctypes
.wintypes
.DWORD
, # dwReserved
3230 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3231 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3232 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3234 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3235 whole_low
= 0xffffffff
3236 whole_high
= 0x7fffffff
3238 def _lock_file(f
, exclusive
):
3239 overlapped
= OVERLAPPED()
3240 overlapped
.Offset
= 0
3241 overlapped
.OffsetHigh
= 0
3242 overlapped
.hEvent
= 0
3243 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3244 handle
= msvcrt
.get_osfhandle(f
.fileno())
3245 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3246 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3247 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3249 def _unlock_file(f
):
3250 assert f
._lock
_file
_overlapped
_p
3251 handle
= msvcrt
.get_osfhandle(f
.fileno())
3252 if not UnlockFileEx(handle
, 0,
3253 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3254 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3257 # Some platforms, such as Jython, is missing fcntl
3261 def _lock_file(f
, exclusive
):
3262 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3264 def _unlock_file(f
):
3265 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3267 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3269 def _lock_file(f
, exclusive
):
3270 raise IOError(UNSUPPORTED_MSG
)
3272 def _unlock_file(f
):
3273 raise IOError(UNSUPPORTED_MSG
)
class locked_file(object):
    """Context manager wrapping an open file with an advisory lock."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'  # writers need an exclusive lock
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
def shell_quote(args):
    """Quote a list of arguments for display as a shell command line."""
    quoted = []
    encoding = get_filesystem_encoding()
    for arg in args:
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(encoding)
        quoted.append(compat_shlex_quote(arg))
    return ' '.join(quoted)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge any data already smuggled into the URL
    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: return (url, data) or (url, default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
def format_bytes(bytes):
    """Render a byte count as a human-readable string such as '1.00KiB'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log(0) is undefined; zero bytes use the plain 'B' suffix
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    return '%.2f%s' % (float(bytes) / float(1024 ** exponent), suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' from *s* using the multipliers in *unit_table*."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept a comma as the decimal separator
    num = float(m.group('num').replace(',', '.'))
    return int(num * unit_table[m.group('unit')])
3367 def parse_filesize(s
):
3371 # The lower-case forms are of course incorrect and unofficial,
3372 # but we support those too
3389 'megabytes': 1000 ** 2,
3390 'mebibytes': 1024 ** 2,
3396 'gigabytes': 1000 ** 3,
3397 'gibibytes': 1024 ** 3,
3403 'terabytes': 1000 ** 4,
3404 'tebibytes': 1024 ** 4,
3410 'petabytes': 1000 ** 5,
3411 'pebibytes': 1024 ** 5,
3417 'exabytes': 1000 ** 6,
3418 'exbibytes': 1024 ** 6,
3424 'zettabytes': 1000 ** 7,
3425 'zebibytes': 1024 ** 7,
3431 'yottabytes': 1000 ** 8,
3432 'yobibytes': 1024 ** 8,
3435 return lookup_unit_table(_UNIT_TABLE
, s
)
3444 if re
.match(r
'^[\d,.]+$', s
):
3445 return str_to_int(s
)
3456 return lookup_unit_table(_UNIT_TABLE
, s
)
def parse_resolution(s):
    """Extract width/height from strings like '1920x1080', '720p' or '4k'."""
    if s is None:
        return {}

    mobj = re.search(r'\b(?P<w>\d+)\s*[xXĆ]\s*(?P<h>\d+)\b', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'\b(\d+)[pPiI]\b', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(mobj.group(1)) * 540}

    return {}
def parse_bitrate(s):
    """Extract an integer kbps value from strings like '1000 kbps'."""
    if not isinstance(s, compat_str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        return month_names.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving valid entities alone."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort rename of the current process via libc prctl (Linux only)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip *end* from the end of *s* when present."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Drop one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of *url* (query/fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
3567 return re
.match(r
'https?://[^?#&]+/', url
).group()
def urljoin(base, path):
    """Join *path* onto *base*, returning None for unusable inputs."""
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    # Absolute (or protocol-relative) paths need no base
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
class HEADRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always HEAD."""
    def get_method(self):
        return 'HEAD'
class PUTRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always PUT."""
    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to int (scaled by invscale/scale), or return *default*."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Coerce *v* to compat_str unless it is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if isinstance(int_str, compat_str):
        # Strip thousands separators and a leading '+'
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to float (scaled by invscale/scale), or return *default*."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def bool_or_none(v, default=None):
    """Pass booleans through unchanged; anything else becomes *default*."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return v.strip() for strings, *default* for everything else."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
def url_or_none(url):
    """Return *url* (stripped) when it looks like a scheme:// or // URL."""
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url):
        return url
    return None
3646 def parse_duration(s
):
3647 if not isinstance(s
, compat_basestring
):
3652 days
, hours
, mins
, secs
, ms
= [None] * 5
3653 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3655 days
, hours
, mins
, secs
, ms
= m
.groups()
3660 [0-9]+\s*y(?:ears?)?\s*
3663 [0-9]+\s*m(?:onths?)?\s*
3666 [0-9]+\s*w(?:eeks?)?\s*
3669 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3673 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3676 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3679 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3682 days
, hours
, mins
, secs
, ms
= m
.groups()
3684 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3686 hours
, mins
= m
.groups()
3692 duration
+= float(secs
)
3694 duration
+= float(mins
) * 60
3696 duration
+= float(hours
) * 60 * 60
3698 duration
+= float(days
) * 24 * 60 * 60
3700 duration
+= float(ms
)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension: foo.mp4 -> foo.ext.mp4.

    When *expected_real_ext* is given and does not match the actual
    extension, *ext* is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the file extension for *ext*; keep the old one attached when it
    does not match *expected_real_ext*."""
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        base = name
    else:
        base = filename
    return '{0}.{1}'.format(base, ext)
, args
=[]):
3720 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3721 args can be a list of arguments for a short output (like -version) """
3723 subprocess
.Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized) if out else False
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Pull a version string out of *output*, or return *unrecognized*."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    return unrecognized
class PagedList(object):
    """Base class for lazily paged sequences; subclasses provide getslice."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily via *pagefunc*, with optional caching."""

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """Paged list whose total page count is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences embedded in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # quote() on Python 2 needs bytes input
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # IDNA-encode the host, percent-escape every other component
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping comments and a UTF-8 BOM."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Comment lines start with '#', ';' or ']'
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
def update_url_query(url, query):
    """Merge the *query* parameters into *url*'s existing query string."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Rebuild *req* with optionally replaced URL, data, headers and query,
    preserving the original HTTP method."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def _multipart_encode_impl(data, boundary):
    """Encode *data* as multipart/form-data with the given *boundary*."""
    content_type = 'multipart/form-data; boundary=%s' % boundary

    out = b''
    for k, v in data.items():
        out += b'--' + boundary.encode('ascii') + b'\r\n'
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary.encode('ascii') in content:
            raise ValueError('Boundary overlaps with data')
        out += content

    out += b'--' + boundary.encode('ascii') + b'--\r\n'

    return out, content_type
def multipart_encode(data, boundary=None):
    """
    Encode a dict to RFC 7578-compliant form-data.

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    """
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            # A caller-specified boundary that collides is an error; a random
            # one is simply regenerated.
            if has_specified_boundary:
                raise
            boundary = None

    return out, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of several, in dict *d*."""
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d or d[key] is None:
            continue
        if skip_false_values and not d[key]:
            continue
        return d[key]
    return default
def try_get(src, getter, expected_type=None):
    """Apply getter callables to *src*, swallowing lookup errors; optionally
    require the result to be of *expected_type*."""
    if not isinstance(getter, (list, tuple)):
        getter = [getter]
    for get in getter:
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            pass
        else:
            if expected_type is None or isinstance(v, expected_type):
                return v
def merge_dicts(*dicts):
    """Merge dicts left to right; earlier non-empty values win, but an
    empty string may be overwritten by a later non-empty one."""
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if (k not in merged
                    or (isinstance(v, compat_str) and v
                        and isinstance(merged[k], compat_str)
                        and not merged[k])):
                merged[k] = v
    return merged
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as compat_str, decoding byte strings with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
4033 TV_PARENTAL_GUIDELINES
= {
def parse_age_limit(s):
    """Normalize an age limit (int, 'NN+', US rating or TV guideline) to an int."""
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None
def strip_jsonp(code):
    """Strip a JSONP wrapper, leaving only the callback payload."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
4069 def js_to_json(code
):
4070 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
4071 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
4073 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
4074 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
4079 if v
in ('true', 'false', 'null'):
4081 elif v
.startswith('/*') or v
.startswith('//') or v
== ',':
4084 if v
[0] in ("'", '"'):
4085 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
4090 }.get(m
.group(0), m
.group(0)), v
[1:-1])
4092 for regex
, base
in INTEGER_TABLE
:
4093 im
= re
.match(regex
, v
)
4095 i
= int(im
.group(1), base
)
4096 return '"%d":' % i
if v
.endswith(':') else '%d' % i
4100 return re
.sub(r
'''(?sx)
4101 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4102 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4103 {comment}|,(?={skip}[\]}}])|
4104 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4105 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4107 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
4120 DEFAULT_OUTTMPL
= '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(int(piece) for piece in re.split(r'[-.]', v))
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; unparsable input falls back to *assume_new*."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updatable when running from a zip bundle or a frozen executable
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(compat_shlex_quote, args))
def error_to_compat_str(err):
    """Stringify an exception, decoding byte messages on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to a filename extension; unknown subtypes are
    returned as-is (lowercased, parameters stripped)."""
    if mt is None:
        return None

    # full-type lookups that cannot be derived from the subtype alone
    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
    }.get(res, res)
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into video/audio codec.

    Returns {'vcodec': ..., 'acodec': ...} with 'none' for a missing side,
    or {} when nothing could be determined.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    # NOTE: the previous version used `lambda str: str.strip()`, shadowing
    # the built-in `str`; a comprehension avoids that.
    split_codecs = [c.strip() for c in codecs_str.strip().strip(',').split(',') if c.strip()]
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Neither side recognised: a 2-element list is assumed video+audio
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a response: Content-Disposition filename
    first, Content-Type MIME mapping as fallback."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI from raw bytes and a MIME type."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:
        # No limit set
        return False
    if content_limit is None:
        # Content available for everyone
        return False
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # BOMs and the encodings they announce; UTF-32 entries precede UTF-16 so
    # a 4-byte BOM is not mistaken for its 2-byte prefix
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for marker, encoding in BOMS:
        if first_bytes.startswith(marker):
            decoded = first_bytes[len(marker):].decode(encoding, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
def determine_protocol(info_dict):
    """Determine the download protocol for an info dict, preferring an
    explicit 'protocol' entry, then URL prefix, then extension, then scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # widest cell per column decides that column's width
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    # every column but the last is left-padded to width+1; last is free-form
    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
    rendered_rows = [format_str % tuple(row) for row in table]
    return '\n'.join(rendered_rows)
4314 def _match_one(filter_part
, dct
):
4315 COMPARISON_OPERATORS
= {
4323 operator_rex
= re
.compile(r
'''(?x)\s*
4325 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
4327 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
4328 (?P<quote>["\'])(?P
<quotedstrval
>(?
:\\.|
(?
!(?P
=quote
)|
\\).)+?
)(?P
=quote
)|
4329 (?P
<strval
>(?
![0-9.])[a
-z0
-9A
-Z
]*)
4332 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
4333 m = operator_rex.search(filter_part)
4335 op = COMPARISON_OPERATORS[m.group('op')]
4336 actual_value = dct.get(m.group('key'))
4337 if (m.group('quotedstrval') is not None
4338 or m.group('strval') is not None
4339 # If the original field is a string and matching comparisonvalue is
4340 # a number we should respect the origin of the original field
4341 # and process comparison value as a string (see
4342 # https://github.com/ytdl-org/youtube-dl/issues/11082).
4343 or actual_value is not None and m.group('intval') is not None
4344 and isinstance(actual_value, compat_str)):
4345 if m.group('op') not in ('=', '!='):
4347 'Operator %s does not support string values!' % m.group('op'))
4348 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
4349 quote = m.group('quote')
4350 if quote is not None:
4351 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
4354 comparison_value = int(m.group('intval'))
4356 comparison_value = parse_filesize(m.group('intval'))
4357 if comparison_value is None:
4358 comparison_value = parse_filesize(m.group('intval') + 'B')
4359 if comparison_value is None:
4361 'Invalid integer value %r in filter part %r' % (
4362 m.group('intval'), filter_part))
4363 if actual_value is None:
4364 return m.group('none_inclusive')
4365 return op(actual_value, comparison_value)
4368 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
4369 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
4371 operator_rex = re.compile(r'''(?x
)\s
*
4372 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
4374 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
4375 m = operator_rex.search(filter_part)
4377 op = UNARY_OPERATORS[m.group('op')]
4378 actual_value = dct.get(m.group('key'))
4379 return op(actual_value)
4381 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&'-separated clauses must all hold
    clauses = filter_str.split('&')
    return all(_match_one(clause, dct) for clause in clauses)
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None to keep a video, or a
    human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('12.5s' or 'HH:MM:SS[.f]') into
    seconds as a float; returns None when unparsable."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, mins, secs = mobj.groups()
        # frame-style 'SS:fff' is treated as a decimal fraction
        return 3600 * int(hours) + 60 * int(mins) + float(secs.replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    # %d truncates the float fields toward zero
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        def __init__(self):
            # FIX: these used to be shared mutable CLASS attributes, so every
            # parser instance mutated the same lists; keep state per instance.
            self._out = ''
            self._unclosed_elements = []
            self._applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # skip attributes already applied by an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # normalize legacy TTAF namespaces to the current TTML ones
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    # parent not resolved yet — retry on next pass
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Turn a param into ['--opt', value] for a CLI tool, or [] if unset."""
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI args; [] when the param is unset."""
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    rendered = true_value if param else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare flag when the param equals the expected value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return extra CLI args stored under `param`, or `default` when unset.
    NOTE(review): the shared `default=[]` list is returned to callers —
    mutating the result would mutate the default; confirm callers don't."""
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
4612 class ISO639Utils(object):
4613 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4672 'iw': 'heb', # Replaced by he in 1989 revision
4682 'in': 'ind', # Replaced by id in 1989 revision
4797 'ji': 'yid', # Replaced by yi in 1989 revision
4805 def short2long(cls, code):
4806 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4807 return cls._lang_map.get(code[:2])
4810 def long2short(cls, code):
4811 """Convert language code from ISO 639-2/T to ISO 639-1"""
4812 for short_name, long_name in cls._lang_map.items():
4813 if long_name == code:
4817 class ISO3166Utils(object):
4818 # From http://data.okfn.org/data/core/country-list
4820 'AF': 'Afghanistan',
4821 'AX': 'Ć
land Islands',
4824 'AS': 'American Samoa',
4829 'AG': 'Antigua and Barbuda',
4846 'BO': 'Bolivia, Plurinational State of',
4847 'BQ': 'Bonaire, Sint Eustatius and Saba',
4848 'BA': 'Bosnia and Herzegovina',
4850 'BV': 'Bouvet Island',
4852 'IO': 'British Indian Ocean Territory',
4853 'BN': 'Brunei Darussalam',
4855 'BF': 'Burkina Faso',
4861 'KY': 'Cayman Islands',
4862 'CF': 'Central African Republic',
4866 'CX': 'Christmas Island',
4867 'CC': 'Cocos (Keeling) Islands',
4871 'CD': 'Congo, the Democratic Republic of the',
4872 'CK': 'Cook Islands',
4874 'CI': 'CĆ“te d\'Ivoire',
4879 'CZ': 'Czech Republic',
4883 'DO': 'Dominican Republic',
4886 'SV': 'El Salvador',
4887 'GQ': 'Equatorial Guinea',
4891 'FK': 'Falkland Islands (Malvinas)',
4892 'FO': 'Faroe Islands',
4896 'GF': 'French Guiana',
4897 'PF': 'French Polynesia',
4898 'TF': 'French Southern Territories',
4913 'GW': 'Guinea-Bissau',
4916 'HM': 'Heard Island and McDonald Islands',
4917 'VA': 'Holy See (Vatican City State)',
4924 'IR': 'Iran, Islamic Republic of',
4927 'IM': 'Isle of Man',
4937 'KP': 'Korea, Democratic People\'s Republic of',
4938 'KR': 'Korea, Republic of',
4941 'LA': 'Lao People\'s Democratic Republic',
4947 'LI': 'Liechtenstein',
4951 'MK': 'Macedonia, the Former Yugoslav Republic of',
4958 'MH': 'Marshall Islands',
4964 'FM': 'Micronesia, Federated States of',
4965 'MD': 'Moldova, Republic of',
4976 'NL': 'Netherlands',
4977 'NC': 'New Caledonia',
4978 'NZ': 'New Zealand',
4983 'NF': 'Norfolk Island',
4984 'MP': 'Northern Mariana Islands',
4989 'PS': 'Palestine, State of',
4991 'PG': 'Papua New Guinea',
4994 'PH': 'Philippines',
4998 'PR': 'Puerto Rico',
5002 'RU': 'Russian Federation',
5004 'BL': 'Saint BarthƩlemy',
5005 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5006 'KN': 'Saint Kitts and Nevis',
5007 'LC': 'Saint Lucia',
5008 'MF': 'Saint Martin (French part)',
5009 'PM': 'Saint Pierre and Miquelon',
5010 'VC': 'Saint Vincent and the Grenadines',
5013 'ST': 'Sao Tome and Principe',
5014 'SA': 'Saudi Arabia',
5018 'SL': 'Sierra Leone',
5020 'SX': 'Sint Maarten (Dutch part)',
5023 'SB': 'Solomon Islands',
5025 'ZA': 'South Africa',
5026 'GS': 'South Georgia and the South Sandwich Islands',
5027 'SS': 'South Sudan',
5032 'SJ': 'Svalbard and Jan Mayen',
5035 'CH': 'Switzerland',
5036 'SY': 'Syrian Arab Republic',
5037 'TW': 'Taiwan, Province of China',
5039 'TZ': 'Tanzania, United Republic of',
5041 'TL': 'Timor-Leste',
5045 'TT': 'Trinidad and Tobago',
5048 'TM': 'Turkmenistan',
5049 'TC': 'Turks and Caicos Islands',
5053 'AE': 'United Arab Emirates',
5054 'GB': 'United Kingdom',
5055 'US': 'United States',
5056 'UM': 'United States Minor Outlying Islands',
5060 'VE': 'Venezuela, Bolivarian Republic of',
5062 'VG': 'Virgin Islands, British',
5063 'VI': 'Virgin Islands, U.S.',
5064 'WF': 'Wallis and Futuna',
5065 'EH': 'Western Sahara',
5072 def short2full(cls, code):
5073 """Convert an ISO 3166-2 country code to the corresponding full name"""
5074 return cls._country_map.get(code.upper())
5077 class GeoUtils(object):
5078 # Major IPv4 address blocks per country
5080 'AD': '46.172.224.0/19',
5081 'AE': '94.200.0.0/13',
5082 'AF': '149.54.0.0/17',
5083 'AG': '209.59.64.0/18',
5084 'AI': '204.14.248.0/21',
5085 'AL': '46.99.0.0/16',
5086 'AM': '46.70.0.0/15',
5087 'AO': '105.168.0.0/13',
5088 'AP': '182.50.184.0/21',
5089 'AQ': '23.154.160.0/24',
5090 'AR': '181.0.0.0/12',
5091 'AS': '202.70.112.0/20',
5092 'AT': '77.116.0.0/14',
5093 'AU': '1.128.0.0/11',
5094 'AW': '181.41.0.0/18',
5095 'AX': '185.217.4.0/22',
5096 'AZ': '5.197.0.0/16',
5097 'BA': '31.176.128.0/17',
5098 'BB': '65.48.128.0/17',
5099 'BD': '114.130.0.0/16',
5101 'BF': '102.178.0.0/15',
5102 'BG': '95.42.0.0/15',
5103 'BH': '37.131.0.0/17',
5104 'BI': '154.117.192.0/18',
5105 'BJ': '137.255.0.0/16',
5106 'BL': '185.212.72.0/23',
5107 'BM': '196.12.64.0/18',
5108 'BN': '156.31.0.0/16',
5109 'BO': '161.56.0.0/16',
5110 'BQ': '161.0.80.0/20',
5111 'BR': '191.128.0.0/12',
5112 'BS': '24.51.64.0/18',
5113 'BT': '119.2.96.0/19',
5114 'BW': '168.167.0.0/16',
5115 'BY': '178.120.0.0/13',
5116 'BZ': '179.42.192.0/18',
5117 'CA': '99.224.0.0/11',
5118 'CD': '41.243.0.0/16',
5119 'CF': '197.242.176.0/21',
5120 'CG': '160.113.0.0/16',
5121 'CH': '85.0.0.0/13',
5122 'CI': '102.136.0.0/14',
5123 'CK': '202.65.32.0/19',
5124 'CL': '152.172.0.0/14',
5125 'CM': '102.244.0.0/14',
5126 'CN': '36.128.0.0/10',
5127 'CO': '181.240.0.0/12',
5128 'CR': '201.192.0.0/12',
5129 'CU': '152.206.0.0/15',
5130 'CV': '165.90.96.0/19',
5131 'CW': '190.88.128.0/17',
5132 'CY': '31.153.0.0/16',
5133 'CZ': '88.100.0.0/14',
5135 'DJ': '197.241.0.0/17',
5136 'DK': '87.48.0.0/12',
5137 'DM': '192.243.48.0/20',
5138 'DO': '152.166.0.0/15',
5139 'DZ': '41.96.0.0/12',
5140 'EC': '186.68.0.0/15',
5141 'EE': '90.190.0.0/15',
5142 'EG': '156.160.0.0/11',
5143 'ER': '196.200.96.0/20',
5144 'ES': '88.0.0.0/11',
5145 'ET': '196.188.0.0/14',
5146 'EU': '2.16.0.0/13',
5147 'FI': '91.152.0.0/13',
5148 'FJ': '144.120.0.0/16',
5149 'FK': '80.73.208.0/21',
5150 'FM': '119.252.112.0/20',
5151 'FO': '88.85.32.0/19',
5153 'GA': '41.158.0.0/15',
5155 'GD': '74.122.88.0/21',
5156 'GE': '31.146.0.0/16',
5157 'GF': '161.22.64.0/18',
5158 'GG': '62.68.160.0/19',
5159 'GH': '154.160.0.0/12',
5160 'GI': '95.164.0.0/16',
5161 'GL': '88.83.0.0/19',
5162 'GM': '160.182.0.0/15',
5163 'GN': '197.149.192.0/18',
5164 'GP': '104.250.0.0/19',
5165 'GQ': '105.235.224.0/20',
5166 'GR': '94.64.0.0/13',
5167 'GT': '168.234.0.0/16',
5168 'GU': '168.123.0.0/16',
5169 'GW': '197.214.80.0/20',
5170 'GY': '181.41.64.0/18',
5171 'HK': '113.252.0.0/14',
5172 'HN': '181.210.0.0/16',
5173 'HR': '93.136.0.0/13',
5174 'HT': '148.102.128.0/17',
5175 'HU': '84.0.0.0/14',
5176 'ID': '39.192.0.0/10',
5177 'IE': '87.32.0.0/12',
5178 'IL': '79.176.0.0/13',
5179 'IM': '5.62.80.0/20',
5180 'IN': '117.192.0.0/10',
5181 'IO': '203.83.48.0/21',
5182 'IQ': '37.236.0.0/14',
5183 'IR': '2.176.0.0/12',
5184 'IS': '82.221.0.0/16',
5185 'IT': '79.0.0.0/10',
5186 'JE': '87.244.64.0/18',
5187 'JM': '72.27.0.0/17',
5188 'JO': '176.29.0.0/16',
5189 'JP': '133.0.0.0/8',
5190 'KE': '105.48.0.0/12',
5191 'KG': '158.181.128.0/17',
5192 'KH': '36.37.128.0/17',
5193 'KI': '103.25.140.0/22',
5194 'KM': '197.255.224.0/20',
5195 'KN': '198.167.192.0/19',
5196 'KP': '175.45.176.0/22',
5197 'KR': '175.192.0.0/10',
5198 'KW': '37.36.0.0/14',
5199 'KY': '64.96.0.0/15',
5200 'KZ': '2.72.0.0/13',
5201 'LA': '115.84.64.0/18',
5202 'LB': '178.135.0.0/16',
5203 'LC': '24.92.144.0/20',
5204 'LI': '82.117.0.0/19',
5205 'LK': '112.134.0.0/15',
5206 'LR': '102.183.0.0/16',
5207 'LS': '129.232.0.0/17',
5208 'LT': '78.56.0.0/13',
5209 'LU': '188.42.0.0/16',
5210 'LV': '46.109.0.0/16',
5211 'LY': '41.252.0.0/14',
5212 'MA': '105.128.0.0/11',
5213 'MC': '88.209.64.0/18',
5214 'MD': '37.246.0.0/16',
5215 'ME': '178.175.0.0/17',
5216 'MF': '74.112.232.0/21',
5217 'MG': '154.126.0.0/17',
5218 'MH': '117.103.88.0/21',
5219 'MK': '77.28.0.0/15',
5220 'ML': '154.118.128.0/18',
5221 'MM': '37.111.0.0/17',
5222 'MN': '49.0.128.0/17',
5223 'MO': '60.246.0.0/16',
5224 'MP': '202.88.64.0/20',
5225 'MQ': '109.203.224.0/19',
5226 'MR': '41.188.64.0/18',
5227 'MS': '208.90.112.0/22',
5228 'MT': '46.11.0.0/16',
5229 'MU': '105.16.0.0/12',
5230 'MV': '27.114.128.0/18',
5231 'MW': '102.70.0.0/15',
5232 'MX': '187.192.0.0/11',
5233 'MY': '175.136.0.0/13',
5234 'MZ': '197.218.0.0/15',
5235 'NA': '41.182.0.0/16',
5236 'NC': '101.101.0.0/18',
5237 'NE': '197.214.0.0/18',
5238 'NF': '203.17.240.0/22',
5239 'NG': '105.112.0.0/12',
5240 'NI': '186.76.0.0/15',
5241 'NL': '145.96.0.0/11',
5242 'NO': '84.208.0.0/13',
5243 'NP': '36.252.0.0/15',
5244 'NR': '203.98.224.0/19',
5245 'NU': '49.156.48.0/22',
5246 'NZ': '49.224.0.0/14',
5247 'OM': '5.36.0.0/15',
5248 'PA': '186.72.0.0/15',
5249 'PE': '186.160.0.0/14',
5250 'PF': '123.50.64.0/18',
5251 'PG': '124.240.192.0/19',
5252 'PH': '49.144.0.0/13',
5253 'PK': '39.32.0.0/11',
5254 'PL': '83.0.0.0/11',
5255 'PM': '70.36.0.0/20',
5256 'PR': '66.50.0.0/16',
5257 'PS': '188.161.0.0/16',
5258 'PT': '85.240.0.0/13',
5259 'PW': '202.124.224.0/20',
5260 'PY': '181.120.0.0/14',
5261 'QA': '37.210.0.0/15',
5262 'RE': '102.35.0.0/16',
5263 'RO': '79.112.0.0/13',
5264 'RS': '93.86.0.0/15',
5265 'RU': '5.136.0.0/13',
5266 'RW': '41.186.0.0/16',
5267 'SA': '188.48.0.0/13',
5268 'SB': '202.1.160.0/19',
5269 'SC': '154.192.0.0/11',
5270 'SD': '102.120.0.0/13',
5271 'SE': '78.64.0.0/12',
5272 'SG': '8.128.0.0/10',
5273 'SI': '188.196.0.0/14',
5274 'SK': '78.98.0.0/15',
5275 'SL': '102.143.0.0/17',
5276 'SM': '89.186.32.0/19',
5277 'SN': '41.82.0.0/15',
5278 'SO': '154.115.192.0/18',
5279 'SR': '186.179.128.0/17',
5280 'SS': '105.235.208.0/21',
5281 'ST': '197.159.160.0/19',
5282 'SV': '168.243.0.0/16',
5283 'SX': '190.102.0.0/20',
5285 'SZ': '41.84.224.0/19',
5286 'TC': '65.255.48.0/20',
5287 'TD': '154.68.128.0/19',
5288 'TG': '196.168.0.0/14',
5289 'TH': '171.96.0.0/13',
5290 'TJ': '85.9.128.0/18',
5291 'TK': '27.96.24.0/21',
5292 'TL': '180.189.160.0/20',
5293 'TM': '95.85.96.0/19',
5294 'TN': '197.0.0.0/11',
5295 'TO': '175.176.144.0/21',
5296 'TR': '78.160.0.0/11',
5297 'TT': '186.44.0.0/15',
5298 'TV': '202.2.96.0/19',
5299 'TW': '120.96.0.0/11',
5300 'TZ': '156.156.0.0/14',
5301 'UA': '37.52.0.0/14',
5302 'UG': '102.80.0.0/13',
5304 'UY': '167.56.0.0/13',
5305 'UZ': '84.54.64.0/18',
5306 'VA': '212.77.0.0/19',
5307 'VC': '207.191.240.0/21',
5308 'VE': '186.88.0.0/13',
5309 'VG': '66.81.192.0/20',
5310 'VI': '146.226.0.0/16',
5311 'VN': '14.160.0.0/11',
5312 'VU': '202.80.32.0/20',
5313 'WF': '117.20.32.0/21',
5314 'WS': '202.4.32.0/19',
5315 'YE': '134.35.0.0/16',
5316 'YT': '41.242.116.0/22',
5317 'ZA': '41.0.0.0/11',
5318 'ZM': '102.144.0.0/13',
5319 'ZW': '102.177.192.0/18',
5323 def random_ipv4(cls, code_or_block):
5324 if len(code_or_block) == 2:
5325 block = cls._country_ip_map.get(code_or_block.upper())
5329 block = code_or_block
5330 addr, preflen = block.split('/')
5331 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5332 addr_max = addr_min | (0xffffffff >> int(preflen))
5333 return compat_str(socket.inet_ntoa(
5334 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler honouring a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Install default http/https openers that funnel into proxy_open
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5362 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5363 # released into Public Domain
5364 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    s = b''
    n = int(n)
    # after much testing, this algorithm was deemed to be the fastest
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    s = s.lstrip(b'\000')
    if not s:
        # only happens when n == 0
        s = b'\000'
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        # front-pad to a multiple of 4 so we can consume 32-bit words
        extra = 4 - length % 4
        s = b'\000' * extra + s
        length += extra
    for offset in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[offset:offset + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # interpret the reversed bytes as a little-endian integer payload
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # FIX: RFC 8017 requires the PS padding octets to be NONZERO; the old
    # randint(0, 254) could emit 0, which truncates the payload on decryption.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer in base `n` using `table` as digits
    (default: 0-9a-zA-Z truncated to the base)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with the common P.A.C.K.E.R. scheme."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    # build the base-n token -> symbol mapping used by the packed source
    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Rotate characters of `s` by `shift` positions within `alphabet`;
    characters outside the alphabet are kept as-is."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(ch):
        if ch not in alphabet:
            return ch
        return alphabet[(alphabet.index(ch) + shift) % size]

    return ''.join(rotate(ch) for ch in s)
def rot47(s):
    """ROT47: rotate the 94 printable ASCII characters by 47 positions."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=val,KEY2="quoted"') into a dict,
    stripping surrounding quotes from quoted values."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's >>> operator)."""
    if val >= 0:
        return val >> n
    # map negative ints onto their 32-bit two's-complement value first
    return (val + 0x100000000) >> n
5507 # Based on png2str() written by @gdkchan and improved by @yokrysty
5508 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a (24-bit RGB, non-interlaced) PNG into (width, height, pixels)
    where pixels is a list of rows of byte values."""
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # walk the chunk stream: length, type, data, CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # image data may be split over several IDAT chunks
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # each scanline is prefixed by one filter-type byte
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Write an extended attribute, trying in order: the pyxattr/xattr Python
    modules, NTFS alternate data streams on Windows, then the setfattr/xattr
    CLI tools. Raises XAttrUnavailableError or XAttrMetadataError on failure."""
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)
            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Generate a plausible random birthday (1950-1995) keyed by the given
    form-field names, each value rendered as a string."""
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    day_offset = random.randint(0, (end_date - start_date).days)
    birthday = start_date + datetime.timedelta(day_offset)
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }