4 from __future__
import unicode_literals
36 import xml
.etree
.ElementTree
40 compat_HTMLParseError
,
45 compat_ctypes_WINFUNCTYPE
,
46 compat_etree_fromstring
,
49 compat_html_entities_html5
,
61 compat_urllib_parse_urlencode
,
62 compat_urllib_parse_urlparse
,
63 compat_urllib_parse_unquote_plus
,
64 compat_urllib_request
,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose protocol is not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    known_schemes = compat_urlparse.uses_netloc
    for proto in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proto not in known_schemes:
            known_schemes.append(proto)
84 # This is not clearly defined otherwise
85 compiled_regex_type
= type(re
.compile(''))
88 def random_user_agent():
89 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1668 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1672 'User-Agent': random_user_agent(),
1673 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1674 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1675 'Accept-Encoding': 'gzip, deflate',
1676 'Accept-Language': 'en-us,en;q=0.5',
1681 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Unique sentinel: lets callers distinguish "no default supplied" from an
# explicit default of None.
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1692 'en': ENGLISH_MONTH_NAMES
,
1694 'janvier', 'fƩvrier', 'mars', 'avril', 'mai', 'juin',
1695 'juillet', 'aoƻt', 'septembre', 'octobre', 'novembre', 'dƩcembre'],
1698 KNOWN_EXTENSIONS
= (
1699 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1700 'flv', 'f4v', 'f4a', 'f4b',
1701 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1702 'mkv', 'mka', 'mk3d',
1705 'asf', 'wmv', 'wma',
1711 'f4f', 'f4m', 'm3u8', 'smil')
1713 # needed for sanitizing filenames in restricted mode
# needed for sanitizing filenames in restricted mode
# Maps accented Latin-1/Latin Extended characters (and the AE/OE/TH/ss
# ligatures) to plain-ASCII replacements. The key string in the original file
# was mojibake; restored here to match the replacement sequence below.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1739 '%Y/%m/%d %H:%M:%S',
1741 '%Y-%m-%d %H:%M:%S',
1742 '%Y-%m-%d %H:%M:%S.%f',
1745 '%Y-%m-%dT%H:%M:%SZ',
1746 '%Y-%m-%dT%H:%M:%S.%fZ',
1747 '%Y-%m-%dT%H:%M:%S.%f0Z',
1748 '%Y-%m-%dT%H:%M:%S',
1749 '%Y-%m-%dT%H:%M:%S.%f',
1751 '%b %d %Y at %H:%M',
1752 '%b %d %Y at %H:%M:%S',
1753 '%B %d %Y at %H:%M',
1754 '%B %d %Y at %H:%M:%S',
1757 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1758 DATE_FORMATS_DAY_FIRST
.extend([
1764 '%d/%m/%Y %H:%M:%S',
1767 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1768 DATE_FORMATS_MONTH_FIRST
.extend([
1773 '%m/%d/%Y %H:%M:%S',
# Matches the tail of a p.a.c.k.e.r.-style packed JavaScript payload:
# captures the packed code, the radix, the symbol count and the
# '|'-separated symbol table.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the body of a <script type="application/ld+json"> element into the
# named group 'json_ld'; \1 backreferences the (possibly absent) quote
# character around the type attribute value.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
1780 def preferredencoding():
1781 """Get preferred encoding.
1783 Returns the best encoding scheme for the system, based on
1784 locale.getpreferredencoding() and some further tweaks.
1787 pref = locale.getpreferredencoding()
1795 def write_json_file(obj, fn):
1796 """ Encode obj as JSON and write it to fn, atomically if possible """
1798 fn = encodeFilename(fn)
1799 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1800 encoding = get_filesystem_encoding()
1801 # os.path.basename returns a bytes object, but NamedTemporaryFile
1802 # will fail if the filename contains non ascii characters unless we
1803 # use a unicode object
1804 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1805 # the same for os.path.dirname
1806 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1808 path_basename = os.path.basename
1809 path_dirname = os.path.dirname
1813 'prefix
': path_basename(fn) + '.',
1814 'dir': path_dirname(fn),
1818 # In Python 2.x, json.dump expects a bytestream.
1819 # In Python 3.x, it writes to a character stream
1820 if sys.version_info < (3, 0):
1825 'encoding
': 'utf
-8',
1828 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1833 if sys.platform == 'win32
':
1834 # Need to remove existing file on Windows, else os.rename raises
1835 # WindowsError or FileExistsError.
1843 os.chmod(tf.name, 0o666 & ~mask)
1846 os.rename(tf.name, fn)
1855 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    # Attribute names are interpolated into the expression, so restrict them
    # to a safe character set first.
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = "[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
1862 def find_xpath_attr(node, xpath, key, val=None):
1863 for f in node.findall(compat_xpath(xpath)):
1864 if key not in f.attrib:
1866 if val is None or f.attrib.get(key) == val:
1870 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1871 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form.

    `path` is a '/'-separated XPath; any step written as 'prefix:tag' has its
    prefix looked up in `ns_map` (prefix -> namespace URI) and is rewritten to
    ElementTree's '{uri}tag' notation. Steps without a prefix pass through
    unchanged. Raises KeyError for prefixes missing from `ns_map`.
    """
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            # No namespace prefix on this step.
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
1886 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1887 def _find_xpath(xpath
):
1888 return node
.find(compat_xpath(xpath
))
1890 if isinstance(xpath
, (str, compat_str
)):
1891 n
= _find_xpath(xpath
)
1899 if default
is not NO_DEFAULT
:
1902 name
= xpath
if name
is None else name
1903 raise ExtractorError('Could not find XML element %s' % name
)
1909 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1910 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1911 if n
is None or n
== default
:
1914 if default
is not NO_DEFAULT
:
1917 name
= xpath
if name
is None else name
1918 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1924 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1925 n
= find_xpath_attr(node
, xpath
, key
)
1927 if default
is not NO_DEFAULT
:
1930 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1931 raise ExtractorError('Could not find XML attribute %s' % name
)
1934 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is simply an attribute lookup with the attribute fixed.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag carrying the given attribute/value
    pair in the passed HTML document, or None when nothing matches."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # A class attribute holds a whitespace-separated list, so match the class
    # name as a whole word anywhere inside the attribute value; the pattern is
    # pre-built, hence escape_value=False.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
1960 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1961 """Return the content of the tag with the specified attribute in the passed HTML document"""
1963 value = re.escape(value) if escape_value else value
1966 for m in re.finditer(r'''(?xs)
1968 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1970 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1974 ''' % (re.escape(attribute), value), html):
1975 res = m.group('content
')
1977 if res.startswith('"') or res.startswith("'"):
1980 retlist.append(unescapeHTML(res))
1985 class HTMLAttributeParser(compat_HTMLParser):
1986 """Trivial HTML parser to gather the attributes for a single element"""
1989 compat_HTMLParser.__init__(self)
1991 def handle_starttag(self, tag, attrs):
1992 self.attrs = dict(attrs)
1995 def extract_attributes(html_element):
1996 """Given a string for an HTML element such as
1998 a="foo" B="bar" c="&98;az" d=boz
1999 empty= noval entity="&"
2002 Decode and return a dictionary of attributes.
2004 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
2005 'empty
': '', 'noval
': None, 'entity
': '&',
2006 'sq
': '"', 'dq': '\''
2008 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2009 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2011 parser = HTMLAttributeParser()
2013 parser.feed(html_element)
2015 # Older Python may throw HTMLParseError in case of malformed HTML
2016 except compat_HTMLParseError:
2021 def clean_html(html):
2022 """Clean an HTML snippet into a readable string"""
2024 if html is None: # Convenience for sanitizing descriptions etc.
2028 html = html.replace('\n', ' ')
2029 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2030 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2032 html = re.sub('<.*?>', '', html)
2033 # Replace html entities
2034 html = unescapeHTML(html)
2038 def sanitize_open(filename, open_mode):
2039 """Try to open the given filename, and slightly tweak it if this fails.
2041 Attempts to open the given filename. If this fails, it tries to change
2042 the filename slightly, step by step, until it's either able to open it
2043 or it fails and raises a final exception, like the standard open()
2046 It returns the tuple (stream, definitive_file_name).
2050 if sys.platform == 'win32':
2052 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2053 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2054 stream = open(encodeFilename(filename), open_mode)
2055 return (stream, filename)
2056 except (IOError, OSError) as err:
2057 if err.errno in (errno.EACCES,):
2060 # In case of error, try to remove win32 forbidden chars
2061 alt_filename = sanitize_path(filename)
2062 if alt_filename == filename:
2065 # An exception here should be caught in the caller
2066 stream = open(encodeFilename(alt_filename), open_mode)
2067 return (stream, alt_filename)
2070 def timeconvert(timestr):
2071 """Convert RFC 2822 defined time string into system timestamp"""
2073 timetuple = email.utils.parsedate_tz(timestr)
2074 if timetuple is not None:
2075 timestamp = email.utils.mktime_tz(timetuple)
2079 def sanitize_filename(s, restricted=False, is_id=False):
2080 """Sanitizes a string so it could be used as part of a filename.
2081 If restricted is set, use a stricter subset of allowed characters.
2082 Set is_id if this is not an arbitrary string, but an ID that should be kept
2085 def replace_insane(char):
2086 if restricted and char in ACCENT_CHARS:
2087 return ACCENT_CHARS[char]
2088 if char == '?' or ord(char) < 32 or ord(char) == 127:
2091 return '' if restricted else '\''
2093 return '_
-' if restricted else ' -'
2094 elif char in '\\/|
*<>':
2096 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2098 if restricted
and ord(char
) > 127:
2103 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2104 result
= ''.join(map(replace_insane
, s
))
2106 while '__' in result
:
2107 result
= result
.replace('__', '_')
2108 result
= result
.strip('_')
2109 # Common case of "Foreign band name - English song title"
2110 if restricted
and result
.startswith('-_'):
2112 if result
.startswith('-'):
2113 result
= '_' + result
[len('-'):]
2114 result
= result
.lstrip('.')
2120 def sanitize_path(s
):
2121 """Sanitizes and normalizes path on Windows"""
2122 if sys
.platform
!= 'win32':
2124 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2125 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2126 drive_or_unc
, _
= os
.path
.splitunc(s
)
2127 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2131 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2132 for path_part
in norm_path
]
2134 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2135 return os
.path
.join(*sanitized_path
)
def sanitize_url(url):
    """Normalize a URL before it is fetched.

    Prepends protocol-less ('//...') URLs with the `http:` scheme in order to
    mitigate the number of unwanted failures due to missing protocol, and
    repairs a few scheme typos that have been seen in the wild. Any other URL
    is returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request after running the URL through
    sanitize_url()."""
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
2161 """Expand shell variables and ~"""
2162 return os
.path
.expandvars(compat_expanduser(s
))
2165 def orderedSet(iterable
):
2166 """ Remove all duplicates from the input iterable """
2174 def _htmlentity_transform(entity_with_semicolon
):
2175 """Transforms an HTML entity to a character."""
2176 entity
= entity_with_semicolon
[:-1]
2178 # Known non-numeric HTML entity
2179 if entity
in compat_html_entities
.name2codepoint
:
2180 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2182 # TODO: HTML5 allows entities without a semicolon. For example,
2183 # 'Éric' should be decoded as 'Ćric'.
2184 if entity_with_semicolon
in compat_html_entities_html5
:
2185 return compat_html_entities_html5
[entity_with_semicolon
]
2187 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2188 if mobj
is not None:
2189 numstr
= mobj
.group(1)
2190 if numstr
.startswith('x'):
2192 numstr
= '0%s' % numstr
2195 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2197 return compat_chr(int(numstr
, base
))
2201 # Unknown entity in name, return its literal representation
2202 return '&%s;' % entity
2205 def unescapeHTML(s
):
2208 assert type(s
) == compat_str
2211 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
2214 def get_subprocess_encoding():
2215 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2216 # For subprocess calls, encode with locale encoding
2217 # Refer to http://stackoverflow.com/a/9951851/35070
2218 encoding
= preferredencoding()
2220 encoding
= sys
.getfilesystemencoding()
2221 if encoding
is None:
2226 def encodeFilename(s
, for_subprocess
=False):
2228 @param s The name of the file
2231 assert type(s
) == compat_str
2233 # Python 3 has a Unicode API
2234 if sys
.version_info
>= (3, 0):
2237 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2238 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2239 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2240 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2243 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2244 if sys
.platform
.startswith('java'):
2247 return s
.encode(get_subprocess_encoding(), 'ignore')
2250 def decodeFilename(b
, for_subprocess
=False):
2252 if sys
.version_info
>= (3, 0):
2255 if not isinstance(b
, bytes):
2258 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument, tolerating legacy byte strings."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    """Decode a subprocess argument (inverse of encodeArgument)."""
    return decodeFilename(b, for_subprocess=True)
2274 def decodeOption(optval
):
2277 if isinstance(optval
, bytes):
2278 optval
= optval
.decode(preferredencoding())
2280 assert isinstance(optval
, compat_str
)
2284 def formatSeconds(secs
):
2286 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
2288 return '%d:%02d' % (secs
// 60, secs
% 60)
2293 def make_HTTPS_handler(params
, **kwargs
):
2294 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2295 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2296 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2297 if opts_no_check_certificate
:
2298 context
.check_hostname
= False
2299 context
.verify_mode
= ssl
.CERT_NONE
2301 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2304 # (create_default_context present but HTTPSHandler has no context=)
2307 if sys
.version_info
< (3, 2):
2308 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2309 else: # Python < 3.4
2310 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2311 context
.verify_mode
= (ssl
.CERT_NONE
2312 if opts_no_check_certificate
2313 else ssl
.CERT_REQUIRED
)
2314 context
.set_default_verify_paths()
2315 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2318 def bug_reports_message():
2319 if ytdl_is_updateable():
2320 update_cmd
= 'type youtube-dl -U to update'
2322 update_cmd
= 'see https://yt-dl.org/update on how to update'
2323 msg
= '; please report this issue on https://yt-dl.org/bug .'
2324 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
2325 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class YoutubeDLError(Exception):
    """Root of the YoutubeDL exception hierarchy; every package-specific
    error derives from this class."""
2334 class ExtractorError(YoutubeDLError
):
2335 """Error during info extraction."""
2337 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
2338 """ tb, if given, is the original traceback (so that it can be printed out).
2339 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
2342 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
2344 if video_id
is not None:
2345 msg
= video_id
+ ': ' + msg
2347 msg
+= ' (caused by %r)' % cause
2349 msg
+= bug_reports_message()
2350 super(ExtractorError
, self
).__init
__(msg
)
2353 self
.exc_info
= sys
.exc_info() # preserve original exception
2355 self
.video_id
= video_id
2357 def format_traceback(self
):
2358 if self
.traceback
is None:
2360 return ''.join(traceback
.format_tb(self
.traceback
))
2363 class UnsupportedError(ExtractorError
):
2364 def __init__(self
, url
):
2365 super(UnsupportedError
, self
).__init
__(
2366 'Unsupported URL: %s' % url
, expected
=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a mandatory regular expression fails to match."""
class GeoRestrictedError(ExtractorError):
    """Raised when a video cannot be served to the caller's geographic
    location because of restrictions imposed by the website.
    """

    def __init__(self, msg, countries=None):
        # Geo blocks are an expected condition, not a youtube-dl bug.
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        # Optional list of country codes from which the video is reachable.
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download failure.

    Thrown by FileDownloader objects that are not configured to continue on
    errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info())."""
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(YoutubeDLError):
    """Raised by FileDownloader objects when multiple downloads would have to
    be written to the same file on disk."""
2410 class PostProcessingError(YoutubeDLError
):
2411 """Post Processing exception.
2413 This exception may be raised by PostProcessor's .run() method to
2414 indicate an error in the postprocessing task.
2417 def __init__(self
, msg
):
2418 super(PostProcessingError
, self
).__init
__(msg
)
class MaxDownloadsReached(YoutubeDLError):
    """Raised once the --max-downloads limit has been hit."""
class UnavailableVideoError(YoutubeDLError):
    """Raised when a video is requested in a format that does not exist for
    that video."""
class ContentTooShortError(YoutubeDLError):
    """Raised by FileDownloader objects when a downloaded file is smaller than
    the size the server announced, which usually indicates an interrupted
    connection.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Keep both byte counts so callers can decide how to recover.
        self.downloaded = downloaded
        self.expected = expected
2453 class XAttrMetadataError(YoutubeDLError
):
2454 def __init__(self
, code
=None, msg
='Unknown error'):
2455 super(XAttrMetadataError
, self
).__init
__(msg
)
2459 # Parsing code and msg
2460 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2461 or 'No space left' in self
.msg
or 'Disk quota excedded' in self
.msg
):
2462 self
.reason
= 'NO_SPACE'
2463 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2464 self
.reason
= 'VALUE_TOO_LONG'
2466 self
.reason
= 'NOT_SUPPORTED'
2469 class XAttrUnavailableError(YoutubeDLError
):
2473 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2474 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2475 # expected HTTP responses to meet HTTP/1.0 or later (see also
2476 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2477 if sys
.version_info
< (3, 0):
2478 kwargs
['strict'] = True
2479 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2480 source_address
= ydl_handler
._params
.get('source_address')
2482 if source_address
is not None:
2483 # This is to workaround _create_connection() from socket where it will try all
2484 # address data from getaddrinfo() including IPv6. This filters the result from
2485 # getaddrinfo() based on the source_address value.
2486 # This is based on the cpython socket.create_connection() function.
2487 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2488 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2489 host
, port
= address
2491 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2492 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2493 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2494 if addrs
and not ip_addrs
:
2495 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2497 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2498 % (ip_version
, source_address
[0]))
2499 for res
in ip_addrs
:
2500 af
, socktype
, proto
, canonname
, sa
= res
2503 sock
= socket
.socket(af
, socktype
, proto
)
2504 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2505 sock
.settimeout(timeout
)
2506 sock
.bind(source_address
)
2508 err
= None # Explicitly break reference cycle
2510 except socket
.error
as _
:
2512 if sock
is not None:
2517 raise socket
.error('getaddrinfo returns an empty list')
2518 if hasattr(hc
, '_create_connection'):
2519 hc
._create
_connection
= _create_connection
2520 sa
= (source_address
, 0)
2521 if hasattr(hc
, 'source_address'): # Python 2.7+
2522 hc
.source_address
= sa
2524 def _hc_connect(self
, *args
, **kwargs
):
2525 sock
= _create_connection(
2526 (self
.host
, self
.port
), self
.timeout
, sa
)
2528 self
.sock
= ssl
.wrap_socket(
2529 sock
, self
.key_file
, self
.cert_file
,
2530 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2533 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Strip youtube-dl's internal 'Youtubedl-no-compression' marker header.

    When the marker is present, a new mapping is returned with the marker
    removed and any Accept-Encoding header (matched case-insensitively)
    dropped, so the request goes out uncompressed. Otherwise the original
    mapping is returned untouched.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    cleaned = {k: v for k, v in headers.items() if k.lower() != 'accept-encoding'}
    del cleaned['Youtubedl-no-compression']
    return cleaned
2548 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2549 """Handler for HTTP requests and responses.
2551 This class, when installed with an OpenerDirector, automatically adds
2552 the standard headers to every HTTP request and handles gzipped and
2553 deflated responses from web servers. If compression is to be avoided in
2554 a particular request, the original request in the program code only has
2555 to include the HTTP header "Youtubedl-no-compression", which will be
2556 removed before making the real request.
2558 Part of this code was copied from:
2560 http://techknack.net/python-urllib2-handlers/
2562 Andrew Rowls, the author of that code, agreed to release it to the
2566 def __init__(self
, params
, *args
, **kwargs
):
2567 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
2568 self
._params
= params
2570 def http_open(self
, req
):
2571 conn_class
= compat_http_client
.HTTPConnection
2573 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2575 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2576 del req
.headers
['Ytdl-socks-proxy']
2578 return self
.do_open(functools
.partial(
2579 _create_http_connection
, self
, conn_class
, False),
2585 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2587 return zlib
.decompress(data
)
2589 def http_request(self
, req
):
2590 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2591 # always respected by websites, some tend to give out URLs with non percent-encoded
2592 # non-ASCII characters (see telemb.py, ard.py [#3412])
2593 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2594 # To work around aforementioned issue we will replace request's original URL with
2595 # percent-encoded one
2596 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2597 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2598 url
= req
.get_full_url()
2599 url_escaped
= escape_url(url
)
2601 # Substitute URL if any change after escaping
2602 if url
!= url_escaped
:
2603 req
= update_Request(req
, url
=url_escaped
)
2605 for h
, v
in std_headers
.items():
2606 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2607 # The dict keys are capitalized because of this bug by urllib
2608 if h
.capitalize() not in req
.headers
:
2609 req
.add_header(h
, v
)
2611 req
.headers
= handle_youtubedl_headers(req
.headers
)
2613 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2614 # Python 2.6 is brain-dead when it comes to fragments
2615 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2616 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2620 def http_response(self
, req
, resp
):
2623 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2624 content
= resp
.read()
2625 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2627 uncompressed
= io
.BytesIO(gz
.read())
2628 except IOError as original_ioerror
:
2629 # There may be junk add the end of the file
2630 # See http://stackoverflow.com/q/4928560/35070 for details
2631 for i
in range(1, 1024):
2633 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2634 uncompressed
= io
.BytesIO(gz
.read())
2639 raise original_ioerror
2640 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2641 resp
.msg
= old_resp
.msg
2642 del resp
.headers
['Content-encoding']
2644 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2645 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2646 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2647 resp
.msg
= old_resp
.msg
2648 del resp
.headers
['Content-encoding']
2649 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2650 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2651 if 300 <= resp
.code
< 400:
2652 location
= resp
.headers
.get('Location')
2654 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2655 if sys
.version_info
>= (3, 0):
2656 location
= location
.encode('iso-8859-1').decode('utf-8')
2658 location
= location
.decode('utf-8')
2659 location_escaped
= escape_url(location
)
2660 if location
!= location_escaped
:
2661 del resp
.headers
['Location']
2662 if sys
.version_info
< (3, 0):
2663 location_escaped
= location_escaped
.encode('utf-8')
2664 resp
.headers
['Location'] = location_escaped
2667 https_request
= http_request
2668 https_response
= http_response
2671 def make_socks_conn_class(base_class
, socks_proxy
):
2672 assert issubclass(base_class
, (
2673 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2675 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2676 if url_components
.scheme
.lower() == 'socks5':
2677 socks_type
= ProxyType
.SOCKS5
2678 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2679 socks_type
= ProxyType
.SOCKS4
2680 elif url_components
.scheme
.lower() == 'socks4a':
2681 socks_type
= ProxyType
.SOCKS4A
2683 def unquote_if_non_empty(s
):
2686 return compat_urllib_parse_unquote_plus(s
)
2690 url_components
.hostname
, url_components
.port
or 1080,
2692 unquote_if_non_empty(url_components
.username
),
2693 unquote_if_non_empty(url_components
.password
),
2696 class SocksConnection(base_class
):
2698 self
.sock
= sockssocket()
2699 self
.sock
.setproxy(*proxy_args
)
2700 if type(self
.timeout
) in (int, float):
2701 self
.sock
.settimeout(self
.timeout
)
2702 self
.sock
.connect((self
.host
, self
.port
))
2704 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2705 if hasattr(self
, '_context'): # Python > 2.6
2706 self
.sock
= self
._context
.wrap_socket(
2707 self
.sock
, server_hostname
=self
.host
)
2709 self
.sock
= ssl
.wrap_socket(self
.sock
)
2711 return SocksConnection
2714 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
2715 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
2716 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
2717 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
2718 self
._params
= params
2720 def https_open(self
, req
):
2722 conn_class
= self
._https
_conn
_class
2724 if hasattr(self
, '_context'): # python > 2.6
2725 kwargs
['context'] = self
._context
2726 if hasattr(self
, '_check_hostname'): # python 3.x
2727 kwargs
['check_hostname'] = self
._check
_hostname
2729 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2731 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2732 del req
.headers
['Ytdl-socks-proxy']
2734 return self
.do_open(functools
.partial(
2735 _create_http_connection
, self
, conn_class
, True),
2739 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2741 See [1] for cookie file format.
2743 1. https://curl.haxx.se/docs/http-cookies.html
2745 _HTTPONLY_PREFIX
= '#HttpOnly_'
2747 _HEADER
= '''# Netscape HTTP Cookie File
2748 # This file is generated by youtube-dl. Do not edit.
2751 _CookieFileEntry
= collections
.namedtuple(
2753 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2755 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2757 Save cookies to a file.
2759 Most of the code is taken from CPython 3.8 and slightly adapted
2760 to support cookie files with UTF-8 in both python 2 and 3.
2762 if filename
is None:
2763 if self
.filename
is not None:
2764 filename
= self
.filename
2766 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2768 # Store session cookies with `expires` set to 0 instead of an empty
2771 if cookie
.expires
is None:
2774 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2775 f
.write(self
._HEADER
)
2778 if not ignore_discard
and cookie
.discard
:
2780 if not ignore_expires
and cookie
.is_expired(now
):
2786 if cookie
.domain
.startswith('.'):
2787 initial_dot
= 'TRUE'
2789 initial_dot
= 'FALSE'
2790 if cookie
.expires
is not None:
2791 expires
= compat_str(cookie
.expires
)
2794 if cookie
.value
is None:
2795 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2796 # with no name, whereas http.cookiejar regards it as a
2797 # cookie with no value.
2802 value
= cookie
.value
2804 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2805 secure
, expires
, name
, value
]) + '\n')
2807 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2808 """Load cookies from a file."""
2809 if filename
is None:
2810 if self
.filename
is not None:
2811 filename
= self
.filename
2813 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2815 def prepare_line(line
):
2816 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2817 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2818 # comments and empty lines are fine
2819 if line
.startswith('#') or not line
.strip():
2821 cookie_list
= line
.split('\t')
2822 if len(cookie_list
) != self
._ENTRY
_LEN
:
2823 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
2824 cookie
= self
._CookieFileEntry
(*cookie_list
)
2825 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
2826 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
2830 with io
.open(filename
, encoding
='utf-8') as f
:
2833 cf
.write(prepare_line(line
))
2834 except compat_cookiejar
.LoadError
as e
:
2836 'WARNING: skipping cookie file entry due to %s: %r\n'
2837 % (e
, line
), sys
.stderr
)
2840 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2841 # Session cookies are denoted by either `expires` field set to
2842 # an empty string or 0. MozillaCookieJar only recognizes the former
2843 # (see [1]). So we need force the latter to be recognized as session
2844 # cookies on our own.
2845 # Session cookies may be important for cookies-based authentication,
2846 # e.g. usually, when user does not check 'Remember me' check box while
2847 # logging in on a site, some important cookies are stored as session
2848 # cookies so that not recognizing them will result in failed login.
2849 # 1. https://bugs.python.org/issue17164
2851 # Treat `expires=0` cookies as session cookies
2852 if cookie
.expires
== 0:
2853 cookie
.expires
= None
2854 cookie
.discard
= True
2857 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
2858 def __init__(self
, cookiejar
=None):
2859 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
2861 def http_response(self
, request
, response
):
2862 # Python 2 will choke on next HTTP request in row if there are non-ASCII
2863 # characters in Set-Cookie HTTP header of last response (see
2864 # https://github.com/ytdl-org/youtube-dl/issues/6769).
2865 # In order to at least prevent crashing we will percent encode Set-Cookie
2866 # header before HTTPCookieProcessor starts processing it.
2867 # if sys.version_info < (3, 0) and response.headers:
2868 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
2869 # set_cookie = response.headers.get(set_cookie_header)
2871 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
2872 # if set_cookie != set_cookie_escaped:
2873 # del response.headers[set_cookie_header]
2874 # response.headers[set_cookie_header] = set_cookie_escaped
2875 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
2877 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
2878 https_response
= http_response
2881 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
2882 if sys
.version_info
[0] < 3:
2883 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
2884 # On python 2 urlh.geturl() may sometimes return redirect URL
2885 # as byte string instead of unicode. This workaround allows
2886 # to force it always return unicode.
2887 return compat_urllib_request
.HTTPRedirectHandler
.redirect_request(self
, req
, fp
, code
, msg
, headers
, compat_str(newurl
))
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns a (datetime.timedelta, remaining_date_str) pair; the delta is
    zero when the string ends in 'Z' or carries no recognizable offset.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        return datetime.timedelta(), date_str
    # Drop the matched designator from the date string.
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # A literal 'Z' (UTC) carries no numeric offset.
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # strptime cannot digest fractional seconds, so drop them up front.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        moment = datetime.datetime.strptime(date_str, fmt) - timezone
        return calendar.timegm(moment.timetuple())
    except ValueError:
        # Unparsable input yields None rather than raising.
        return None
def date_formats(day_first=True):
    """Pick the strptime pattern list matching the expected day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
2931 def unified_strdate(date_str
, day_first
=True):
2932 """Return a string with the date in the format YYYYMMDD"""
2934 if date_str
is None:
2938 date_str
= date_str
.replace(',', ' ')
2939 # Remove AM/PM + timezone
2940 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
2941 _
, date_str
= extract_timezone(date_str
)
2943 for expression
in date_formats(day_first
):
2945 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
2948 if upload_date
is None:
2949 timetuple
= email
.utils
.parsedate_tz(date_str
)
2952 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
2955 if upload_date
is not None:
2956 return compat_str(upload_date
)
2959 def unified_timestamp(date_str
, day_first
=True):
2960 if date_str
is None:
2963 date_str
= re
.sub(r
'[,|]', '', date_str
)
2965 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
2966 timezone
, date_str
= extract_timezone(date_str
)
2968 # Remove AM/PM + timezone
2969 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
2971 # Remove unrecognized timezones from ISO 8601 alike timestamps
2972 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
2974 date_str
= date_str
[:-len(m
.group('tz'))]
2976 # Python only supports microseconds, so remove nanoseconds
2977 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
2979 date_str
= m
.group(1)
2981 for expression
in date_formats(day_first
):
2983 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
2984 return calendar
.timegm(dt
.timetuple())
2987 timetuple
= email
.utils
.parsedate_tz(date_str
)
2989 return calendar
.timegm(timetuple
) + pm_delta
* 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from *url*, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    # Text after the last dot, ignoring any query string.
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = candidate.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle file name by swapping the media extension for '<lang>.<format>'."""
    subtitle_ext = '.'.join((sub_lang, sub_format))
    return replace_extension(filename, subtitle_ext, expected_real_ext)
3009 def date_from_str(date_str
):
3011 Return a datetime object from a string in the format YYYYMMDD or
3012 (now|today)[+-][0-9](day|week|month|year)(s)?"""
3013 today
= datetime
.date
.today()
3014 if date_str
in ('now', 'today'):
3016 if date_str
== 'yesterday':
3017 return today
- datetime
.timedelta(days
=1)
3018 match
= re
.match(r
'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
3019 if match
is not None:
3020 sign
= match
.group('sign')
3021 time
= int(match
.group('time'))
3024 unit
= match
.group('unit')
3025 # A bad approximation?
3029 elif unit
== 'year':
3033 delta
= datetime
.timedelta(**{unit
: time
})
3034 return today
+ delta
3035 return datetime
.datetime
.strptime(date_str
, '%Y%m%d').date()
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that do not look like YYYYMMDD are returned untouched.
    """
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if parts is None else '-'.join(parts.groups())
3048 class DateRange(object):
3049 """Represents a time interval between two dates"""
3051 def __init__(self
, start
=None, end
=None):
3052 """start and end must be strings in the format accepted by date"""
3053 if start
is not None:
3054 self
.start
= date_from_str(start
)
3056 self
.start
= datetime
.datetime
.min.date()
3058 self
.end
= date_from_str(end
)
3060 self
.end
= datetime
.datetime
.max.date()
3061 if self
.start
> self
.end
:
3062 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
3066 """Returns a range that only contains the given day"""
3067 return cls(day
, day
)
3069 def __contains__(self
, date
):
3070 """Check if the date is in the range"""
3071 if not isinstance(date
, datetime
.date
):
3072 date
= date_from_str(date
)
3073 return self
.start
<= date
<= self
.end
3076 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
3079 def platform_name():
3080 """ Returns the platform name as a compat_str """
3081 res
= platform
.platform()
3082 if isinstance(res
, bytes):
3083 res
= res
.decode(preferredencoding())
3085 assert isinstance(res
, compat_str
)
3089 def _windows_write_string(s
, out
):
3090 """ Returns True if the string was written using special methods,
3091 False if it has yet to be written out."""
3092 # Adapted from http://stackoverflow.com/a/3259271/35070
3095 import ctypes
.wintypes
3103 fileno
= out
.fileno()
3104 except AttributeError:
3105 # If the output stream doesn't have a fileno, it's virtual
3107 except io
.UnsupportedOperation
:
3108 # Some strange Windows pseudo files?
3110 if fileno
not in WIN_OUTPUT_IDS
:
3113 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3114 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3115 ('GetStdHandle', ctypes
.windll
.kernel32
))
3116 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3118 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3119 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3120 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3121 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3122 written
= ctypes
.wintypes
.DWORD(0)
3124 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3125 FILE_TYPE_CHAR
= 0x0002
3126 FILE_TYPE_REMOTE
= 0x8000
3127 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3128 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3129 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3130 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3131 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3133 def not_a_console(handle
):
3134 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3136 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3137 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3139 if not_a_console(h
):
3142 def next_nonbmp_pos(s
):
3144 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3145 except StopIteration:
3149 count
= min(next_nonbmp_pos(s
), 1024)
3151 ret
= WriteConsoleW(
3152 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3154 raise OSError('Failed to write string')
3155 if not count
: # We just wrote a non-BMP character
3156 assert written
.value
== 2
3159 assert written
.value
> 0
3160 s
= s
[written
.value
:]
3164 def write_string(s
, out
=None, encoding
=None):
3167 assert type(s
) == compat_str
3169 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3170 if _windows_write_string(s
, out
):
3173 if ('b' in getattr(out
, 'mode', '')
3174 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3175 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3177 elif hasattr(out
, 'buffer'):
3178 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3179 byt
= s
.encode(enc
, 'ignore')
3180 out
.buffer.write(byt
)
3186 def bytes_to_intlist(bs
):
3189 if isinstance(bs
[0], int): # Python 3
3192 return [ord(c
) for c
in bs
]
3195 def intlist_to_bytes(xs
):
3198 return compat_struct_pack('%dB' % len(xs
), *xs
)
3201 # Cross-platform file locking
3202 if sys
.platform
== 'win32':
3203 import ctypes
.wintypes
3206 class OVERLAPPED(ctypes
.Structure
):
3208 ('Internal', ctypes
.wintypes
.LPVOID
),
3209 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3210 ('Offset', ctypes
.wintypes
.DWORD
),
3211 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3212 ('hEvent', ctypes
.wintypes
.HANDLE
),
3215 kernel32
= ctypes
.windll
.kernel32
3216 LockFileEx
= kernel32
.LockFileEx
3217 LockFileEx
.argtypes
= [
3218 ctypes
.wintypes
.HANDLE
, # hFile
3219 ctypes
.wintypes
.DWORD
, # dwFlags
3220 ctypes
.wintypes
.DWORD
, # dwReserved
3221 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3222 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3223 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3225 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3226 UnlockFileEx
= kernel32
.UnlockFileEx
3227 UnlockFileEx
.argtypes
= [
3228 ctypes
.wintypes
.HANDLE
, # hFile
3229 ctypes
.wintypes
.DWORD
, # dwReserved
3230 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3231 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3232 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3234 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3235 whole_low
= 0xffffffff
3236 whole_high
= 0x7fffffff
3238 def _lock_file(f
, exclusive
):
3239 overlapped
= OVERLAPPED()
3240 overlapped
.Offset
= 0
3241 overlapped
.OffsetHigh
= 0
3242 overlapped
.hEvent
= 0
3243 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3244 handle
= msvcrt
.get_osfhandle(f
.fileno())
3245 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3246 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3247 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3249 def _unlock_file(f
):
3250 assert f
._lock
_file
_overlapped
_p
3251 handle
= msvcrt
.get_osfhandle(f
.fileno())
3252 if not UnlockFileEx(handle
, 0,
3253 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3254 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3257 # Some platforms, such as Jython, is missing fcntl
3261 def _lock_file(f
, exclusive
):
3262 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3264 def _unlock_file(f
):
3265 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3267 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3269 def _lock_file(f
, exclusive
):
3270 raise IOError(UNSUPPORTED_MSG
)
3272 def _unlock_file(f
):
3273 raise IOError(UNSUPPORTED_MSG
)
3276 class locked_file(object):
3277 def __init__(self
, filename
, mode
, encoding
=None):
3278 assert mode
in ['r', 'a', 'w']
3279 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3282 def __enter__(self
):
3283 exclusive
= self
.mode
!= 'r'
3285 _lock_file(self
.f
, exclusive
)
3291 def __exit__(self
, etype
, value
, traceback
):
3293 _unlock_file(self
.f
)
3300 def write(self
, *args
):
3301 return self
.f
.write(*args
)
3303 def read(self
, *args
):
3304 return self
.f
.read(*args
)
3307 def get_filesystem_encoding():
3308 encoding
= sys
.getfilesystemencoding()
3309 return encoding
if encoding
is not None else 'utf-8'
3312 def shell_quote(args
):
3314 encoding
= get_filesystem_encoding()
3316 if isinstance(a
, bytes):
3317 # We may get a filename encoded with 'encodeFilename'
3318 a
= a
.decode(encoding
)
3319 quoted_args
.append(compat_shlex_quote(a
))
3320 return ' '.join(quoted_args
)
3323 def smuggle_url(url
, data
):
3324 """ Pass additional data in a URL for internal use. """
3326 url
, idata
= unsmuggle_url(url
, {})
3328 sdata
= compat_urllib_parse_urlencode(
3329 {'__youtubedl_smuggle': json
.dumps(data
)})
3330 return url
+ '#' + sdata
3333 def unsmuggle_url(smug_url
, default
=None):
3334 if '#__youtubedl_smuggle' not in smug_url
:
3335 return smug_url
, default
3336 url
, _
, sdata
= smug_url
.rpartition('#')
3337 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
3338 data
= json
.loads(jsond
)
3342 def format_bytes(bytes):
3345 if type(bytes) is str:
3346 bytes = float(bytes)
3350 exponent
= int(math
.log(bytes, 1024.0))
3351 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
3352 converted
= float(bytes) / float(1024 ** exponent
)
3353 return '%.2f%s' % (converted
, suffix
)
3356 def lookup_unit_table(unit_table
, s
):
3357 units_re
= '|'.join(re
.escape(u
) for u
in unit_table
)
3359 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
, s
)
3362 num_str
= m
.group('num').replace(',', '.')
3363 mult
= unit_table
[m
.group('unit')]
3364 return int(float(num_str
) * mult
)
3367 def parse_filesize(s
):
3371 # The lower-case forms are of course incorrect and unofficial,
3372 # but we support those too
3389 'megabytes': 1000 ** 2,
3390 'mebibytes': 1024 ** 2,
3396 'gigabytes': 1000 ** 3,
3397 'gibibytes': 1024 ** 3,
3403 'terabytes': 1000 ** 4,
3404 'tebibytes': 1024 ** 4,
3410 'petabytes': 1000 ** 5,
3411 'pebibytes': 1024 ** 5,
3417 'exabytes': 1000 ** 6,
3418 'exbibytes': 1024 ** 6,
3424 'zettabytes': 1000 ** 7,
3425 'zebibytes': 1024 ** 7,
3431 'yottabytes': 1000 ** 8,
3432 'yobibytes': 1024 ** 8,
3435 return lookup_unit_table(_UNIT_TABLE
, s
)
3444 if re
.match(r
'^[\d,.]+$', s
):
3445 return str_to_int(s
)
3456 return lookup_unit_table(_UNIT_TABLE
, s
)
def parse_resolution(s):
    """Extract width/height hints ('1920x1080', '720p', '4k') from a string."""
    if s is None:
        return {}

    # '<width> x <height>' pairs (also tolerates the Unicode times sign).
    found = re.search(r'\b(?P<w>\d+)\s*[xXĆ]\s*(?P<h>\d+)\b', s)
    if found:
        return {
            'width': int(found.group('w')),
            'height': int(found.group('h')),
        }

    # '720p' / '1080i' style labels only carry the height.
    found = re.search(r'\b(\d+)[pPiI]\b', s)
    if found:
        return {'height': int(found.group(1))}

    # '4k' / '8k' marketing labels; each 'k' step maps to 540 lines.
    found = re.search(r'\b([48])[kK]\b', s)
    if found:
        return {'height': int(found.group(1)) * 540}

    return {}
def parse_bitrate(s):
    """Parse '128 kbps'-style strings into an int; non-strings / no match give None."""
    if not isinstance(s, compat_str):
        return
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in names:
        return names.index(name) + 1
    return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    abbrevs = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev in abbrevs:
        return abbrevs.index(abbrev) + 1
    return None
3510 def fix_xml_ampersands(xml_str
):
3511 """Replace all the '&' by '&' in XML"""
3513 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
3518 def setproctitle(title
):
3519 assert isinstance(title
, compat_str
)
3521 # ctypes in Jython is not complete
3522 # http://bugs.jython.org/issue2148
3523 if sys
.platform
.startswith('java'):
3527 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3531 # LoadLibrary in Windows Python 2.7.13 only expects
3532 # a bytestring, but since unicode_literals turns
3533 # every string into a unicode string, it fails.
3535 title_bytes
= title
.encode('utf-8')
3536 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3537 buf
.value
= title_bytes
3539 libc
.prctl(15, buf
, 0, 0, 0)
3540 except AttributeError:
3541 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present; tolerates s=None."""
    if s is None:
        return s
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip *end* from the end of *s* when present; tolerates s=None.

    An empty *end* must be an explicit no-op: without the `end` truthiness
    guard, `s[:-len('')]` evaluates to `s[:0]` and wrongly wipes the whole
    string.
    """
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    for q in ('"', "'"):
        if s[0] == q and s[-1] == q:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path segment of *url* (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').split('/')[-1]
3567 return re
.match(r
'https?://[^?#&]+/', url
).group()
3570 def urljoin(base
, path
):
3571 if isinstance(path
, bytes):
3572 path
= path
.decode('utf-8')
3573 if not isinstance(path
, compat_str
) or not path
:
3575 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
3577 if isinstance(base
, bytes):
3578 base
= base
.decode('utf-8')
3579 if not isinstance(base
, compat_str
) or not re
.match(
3580 r
'^(?:https?:)?//', base
):
3582 return compat_urlparse
.urljoin(base
, path
)
3585 class HEADRequest(compat_urllib_request
.Request
):
3586 def get_method(self
):
3590 class PUTRequest(compat_urllib_request
.Request
):
3591 def get_method(self
):
3595 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
3598 v
= getattr(v
, get_attr
, None)
3604 return int(v
) * invscale
// scale
3605 except (ValueError, TypeError):
def str_or_none(v, default=None):
    """Stringify *v* via compat_str, mapping None to *default*."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if isinstance(int_str, compat_str):
        # Tolerate thousands separators and a leading '+'.
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to float rescaled by invscale/scale; *default* on None or failure."""
    if v is None:
        return default
    try:
        return invscale * float(v) / scale
    except (ValueError, TypeError):
        return default
def bool_or_none(v, default=None):
    """Pass booleans through unchanged; anything else becomes *default*."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return v.strip() for strings, *default* for everything else."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
def url_or_none(url):
    """Return *url* stripped when it looks like a scheme:// or protocol-relative URL, else None."""
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url):
        return url
    return None
3646 def parse_duration(s
):
3647 if not isinstance(s
, compat_basestring
):
3652 days
, hours
, mins
, secs
, ms
= [None] * 5
3653 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3655 days
, hours
, mins
, secs
, ms
= m
.groups()
3660 [0-9]+\s*y(?:ears?)?\s*
3663 [0-9]+\s*m(?:onths?)?\s*
3666 [0-9]+\s*w(?:eeks?)?\s*
3669 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3673 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3676 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3679 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3682 days
, hours
, mins
, secs
, ms
= m
.groups()
3684 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3686 hours
, mins
= m
.groups()
3692 duration
+= float(secs
)
3694 duration
+= float(mins
) * 60
3696 duration
+= float(hours
) * 60 * 60
3698 duration
+= float(days
) * 24 * 60 * 60
3700 duration
+= float(ms
)
3704 def prepend_extension(filename
, ext
, expected_real_ext
=None):
3705 name
, real_ext
= os
.path
.splitext(filename
)
3707 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
3708 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
3709 else '{0}.{1}'.format(filename
, ext
))
3712 def replace_extension(filename
, ext
, expected_real_ext
=None):
3713 name
, real_ext
= os
.path
.splitext(filename
)
3714 return '{0}.{1}'.format(
3715 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
3719 def check_executable(exe
, args
=[]):
3720 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3721 args can be a list of arguments for a short output (like -version) """
3723 subprocess
.Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate()
3729 def get_exe_version(exe
, args
=['--version'],
3730 version_re
=None, unrecognized
='present'):
3731 """ Returns the version of the specified executable,
3732 or False if the executable is not present """
3734 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3735 # SIGTTOU if youtube-dl is run in the background.
3736 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3737 out
, _
= subprocess
.Popen(
3738 [encodeArgument(exe
)] + args
,
3739 stdin
=subprocess
.PIPE
,
3740 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate()
3743 if isinstance(out
, bytes): # Python 2.x
3744 out
= out
.decode('ascii', 'ignore')
3745 return detect_exe_version(out
, version_re
, unrecognized
)
3748 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3749 assert isinstance(output
, compat_str
)
3750 if version_re
is None:
3751 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3752 m
= re
.search(version_re
, output
)
3759 class PagedList(object):
3761 # This is only useful for tests
3762 return len(self
.getslice())
3765 class OnDemandPagedList(PagedList
):
3766 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
3767 self
._pagefunc
= pagefunc
3768 self
._pagesize
= pagesize
3769 self
._use
_cache
= use_cache
3773 def getslice(self
, start
=0, end
=None):
3775 for pagenum
in itertools
.count(start
// self
._pagesize
):
3776 firstid
= pagenum
* self
._pagesize
3777 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
3778 if start
>= nextfirstid
:
3783 page_results
= self
._cache
.get(pagenum
)
3784 if page_results
is None:
3785 page_results
= list(self
._pagefunc
(pagenum
))
3787 self
._cache
[pagenum
] = page_results
3790 start
% self
._pagesize
3791 if firstid
<= start
< nextfirstid
3795 ((end
- 1) % self
._pagesize
) + 1
3796 if (end
is not None and firstid
<= end
<= nextfirstid
)
3799 if startv
!= 0 or endv
is not None:
3800 page_results
= page_results
[startv
:endv
]
3801 res
.extend(page_results
)
3803 # A little optimization - if current page is not "full", ie. does
3804 # not contain page_size videos then we can assume that this page
3805 # is the last one - there are no more ids on further pages -
3806 # i.e. no need to query again.
3807 if len(page_results
) + startv
< self
._pagesize
:
3810 # If we got the whole page, but the next page is not interesting,
3811 # break out early as well
3812 if end
== nextfirstid
:
3817 class InAdvancePagedList(PagedList
):
3818 def __init__(self
, pagefunc
, pagecount
, pagesize
):
3819 self
._pagefunc
= pagefunc
3820 self
._pagecount
= pagecount
3821 self
._pagesize
= pagesize
3823 def getslice(self
, start
=0, end
=None):
3825 start_page
= start
// self
._pagesize
3827 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
3828 skip_elems
= start
- start_page
* self
._pagesize
3829 only_more
= None if end
is None else end
- start
3830 for pagenum
in range(start_page
, end_page
):
3831 page
= list(self
._pagefunc
(pagenum
))
3833 page
= page
[skip_elems
:]
3835 if only_more
is not None:
3836 if len(page
) < only_more
:
3837 only_more
-= len(page
)
3839 page
= page
[:only_more
]
3846 def uppercase_escape(s
):
3847 unicode_escape
= codecs
.getdecoder('unicode_escape')
3849 r
'\\U[0-9a-fA-F]{8}',
3850 lambda m
: unicode_escape(m
.group(0))[0],
3854 def lowercase_escape(s
):
3855 unicode_escape
= codecs
.getdecoder('unicode_escape')
3857 r
'\\u[0-9a-fA-F]{4}',
3858 lambda m
: unicode_escape(m
.group(0))[0],
3862 def escape_rfc3986(s
):
3863 """Escape non-ASCII characters as suggested by RFC 3986"""
3864 if sys
.version_info
< (3, 0) and isinstance(s
, compat_str
):
3865 s
= s
.encode('utf-8')
3866 return compat_urllib_parse
.quote(s
, b
"%/;:@&=+$,!~*'()?#[]")
3869 def escape_url(url
):
3870 """Escape URL as suggested by RFC 3986"""
3871 url_parsed
= compat_urllib_parse_urlparse(url
)
3872 return url_parsed
._replace
(
3873 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
3874 path
=escape_rfc3986(url_parsed
.path
),
3875 params
=escape_rfc3986(url_parsed
.params
),
3876 query
=escape_rfc3986(url_parsed
.query
),
3877 fragment
=escape_rfc3986(url_parsed
.fragment
)
3881 def read_batch_urls(batch_fd
):
3883 if not isinstance(url
, compat_str
):
3884 url
= url
.decode('utf-8', 'replace')
3885 BOM_UTF8
= '\xef\xbb\xbf'
3886 if url
.startswith(BOM_UTF8
):
3887 url
= url
[len(BOM_UTF8
):]
3889 if url
.startswith(('#', ';', ']')):
3893 with contextlib
.closing(batch_fd
) as fd
:
3894 return [url
for url
in map(fixup
, fd
) if url
]
3897 def urlencode_postdata(*args
, **kargs
):
3898 return compat_urllib_parse_urlencode(*args
, **kargs
).encode('ascii')
3901 def update_url_query(url
, query
):
3904 parsed_url
= compat_urlparse
.urlparse(url
)
3905 qs
= compat_parse_qs(parsed_url
.query
)
3907 return compat_urlparse
.urlunparse(parsed_url
._replace
(
3908 query
=compat_urllib_parse_urlencode(qs
, True)))
3911 def update_Request(req
, url
=None, data
=None, headers
={}, query
={}):
3912 req_headers
= req
.headers
.copy()
3913 req_headers
.update(headers
)
3914 req_data
= data
or req
.data
3915 req_url
= update_url_query(url
or req
.get_full_url(), query
)
3916 req_get_method
= req
.get_method()
3917 if req_get_method
== 'HEAD':
3918 req_type
= HEADRequest
3919 elif req_get_method
== 'PUT':
3920 req_type
= PUTRequest
3922 req_type
= compat_urllib_request
.Request
3924 req_url
, data
=req_data
, headers
=req_headers
,
3925 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
3926 if hasattr(req
, 'timeout'):
3927 new_req
.timeout
= req
.timeout
3931 def _multipart_encode_impl(data
, boundary
):
3932 content_type
= 'multipart/form-data; boundary=%s' % boundary
3935 for k
, v
in data
.items():
3936 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
3937 if isinstance(k
, compat_str
):
3938 k
= k
.encode('utf-8')
3939 if isinstance(v
, compat_str
):
3940 v
= v
.encode('utf-8')
3941 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3942 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3943 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
3944 if boundary
.encode('ascii') in content
:
3945 raise ValueError('Boundary overlaps with data')
3948 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
3950 return out
, content_type
3953 def multipart_encode(data
, boundary
=None):
3955 Encode a dict to RFC 7578-compliant form-data
3958 A dict where keys and values can be either Unicode or bytes-like
3961 If specified a Unicode object, it's used as the boundary. Otherwise
3962 a random boundary is generated.
3964 Reference: https://tools.ietf.org/html/rfc7578
3966 has_specified_boundary
= boundary
is not None
3969 if boundary
is None:
3970 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
3973 out
, content_type
= _multipart_encode_impl(data
, boundary
)
3976 if has_specified_boundary
:
3980 return out
, content_type
3983 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
3984 if isinstance(key_or_keys
, (list, tuple)):
3985 for key
in key_or_keys
:
3986 if key
not in d
or d
[key
] is None or skip_false_values
and not d
[key
]:
3990 return d
.get(key_or_keys
, default
)
3993 def try_get(src
, getter
, expected_type
=None):
3994 if not isinstance(getter
, (list, tuple)):
3999 except (AttributeError, KeyError, TypeError, IndexError):
4002 if expected_type
is None or isinstance(v
, expected_type
):
4006 def merge_dicts(*dicts
):
4008 for a_dict
in dicts
:
4009 for k
, v
in a_dict
.items():
4013 or (isinstance(v
, compat_str
) and v
4014 and isinstance(merged
[k
], compat_str
)
4015 and not merged
[k
])):
4020 def encode_compat_str(string
, encoding
=preferredencoding(), errors
='strict'):
4021 return string
if isinstance(string
, compat_str
) else compat_str(string
, encoding
, errors
)
4033 TV_PARENTAL_GUIDELINES
= {
4043 def parse_age_limit(s
):
4045 return s
if 0 <= s
<= 21 else None
4046 if not isinstance(s
, compat_basestring
):
4048 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
4050 return int(m
.group('age'))
4052 return US_RATINGS
[s
]
4053 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
4055 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
4059 def strip_jsonp(code
):
4062 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
4063 (?:\s*&&\s*(?P=func_name))?
4064 \s*\(\s*(?P<callback_data>.*)\);?
4065 \s*?(?://[^\n]*)*$''',
4066 r
'\g<callback_data>', code
)
4069 def js_to_json(code
):
4070 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
4071 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
4073 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
4074 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
4079 if v
in ('true', 'false', 'null'):
4081 elif v
.startswith('/*') or v
.startswith('//') or v
== ',':
4084 if v
[0] in ("'", '"'):
4085 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
4090 }.get(m
.group(0), m
.group(0)), v
[1:-1])
4092 for regex
, base
in INTEGER_TABLE
:
4093 im
= re
.match(regex
, v
)
4095 i
= int(im
.group(1), base
)
4096 return '"%d":' % i
if v
.endswith(':') else '%d' % i
4100 return re
.sub(r
'''(?sx)
4101 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4102 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4103 {comment}|,(?={skip}[\]}}])|
4104 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4105 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4107 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
4110 def qualities(quality_ids
):
4111 """ Get a numeric quality value out of a list of possible values """
4114 return quality_ids
.index(qid
)
4120 DEFAULT_OUTTMPL
= '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ellipses = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ellipses)] + ellipses
def version_tuple(v):
    """Turn a dotted/dashed version string into a tuple of ints for comparison."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; unparsable/empty input yields `not assume_new`."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, threshold = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < threshold
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zip bundle or a frozen (e.g. py2exe) build.
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Return a text representation of *err* safe on both Python 2 and 3."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
4167 def mimetype2ext(mt
):
4173 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
4174 # it's the most popular one
4175 'audio/mpeg': 'mp3',
4180 _
, _
, res
= mt
.rpartition('/')
4181 res
= res
.split(';')[0].strip().lower()
4185 'smptett+xml': 'tt',
4189 'x-mp4-fragmented': 'mp4',
4190 'x-ms-sami': 'sami',
4193 'x-mpegurl': 'm3u8',
4194 'vnd.apple.mpegurl': 'm3u8',
4198 'vnd.ms-sstr+xml': 'ism',
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string (e.g. "avc1.64001f, mp4a.40.2")
    into {'vcodec': ..., 'acodec': ...} ('none' for a missing track).

    Returns {} when nothing could be determined.
    # http://tools.ietf.org/html/rfc6381
    """
    if not codecs_str:
        return {}
    # Fix: the original used `lambda str: str.strip()`, shadowing the
    # builtin `str`; a comprehension avoids that and is clearer.
    split_codecs = [
        c.strip() for c in codecs_str.strip().strip(',').split(',')
        if c.strip()]
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Nothing recognized: assume "video, audio" when exactly two
        # entries are present.
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response's headers: first from the
    Content-Disposition filename, then from the Content-Type."""
    header_of = url_handle.headers.get

    disposition = header_of('Content-Disposition')
    if disposition:
        match = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if match:
            ext = determine_ext(match.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(header_of('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 ``data:`` URI for *data* with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No limit set, or content available for everyone.
        return False
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    # Honour a byte-order mark when present; otherwise assume UTF-8.
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
def determine_protocol(info_dict):
    """Work out the download protocol for *info_dict*: an explicit
    'protocol' entry wins, then URL prefix, then extension, then scheme."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell per column decides that column's width.
    widths = [max(len(compat_str(cell)) for cell in col) for col in zip(*table)]
    cols_fmt = ['%-' + compat_str(w + 1) + 's' for w in widths[:-1]]
    row_fmt = ' '.join(cols_fmt) + '%s'
    return '\n'.join(row_fmt % tuple(row) for row in table)
4313 def _match_one(filter_part
, dct
):
4314 COMPARISON_OPERATORS
= {
4322 operator_rex
= re
.compile(r
'''(?x)\s*
4324 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
4326 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
4327 (?P<quote>["\'])(?P
<quotedstrval
>(?
:\\.|
(?
!(?P
=quote
)|
\\).)+?
)(?P
=quote
)|
4328 (?P
<strval
>(?
![0-9.])[a
-z0
-9A
-Z
]*)
4331 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
4332 m = operator_rex.search(filter_part)
4334 op = COMPARISON_OPERATORS[m.group('op')]
4335 actual_value = dct.get(m.group('key'))
4336 if (m.group('quotedstrval') is not None
4337 or m.group('strval') is not None
4338 # If the original field is a string and matching comparisonvalue is
4339 # a number we should respect the origin of the original field
4340 # and process comparison value as a string (see
4341 # https://github.com/ytdl-org/youtube-dl/issues/11082).
4342 or actual_value is not None and m.group('intval') is not None
4343 and isinstance(actual_value, compat_str)):
4344 if m.group('op') not in ('=', '!='):
4346 'Operator %s does not support string values!' % m.group('op'))
4347 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
4348 quote = m.group('quote')
4349 if quote is not None:
4350 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
4353 comparison_value = int(m.group('intval'))
4355 comparison_value = parse_filesize(m.group('intval'))
4356 if comparison_value is None:
4357 comparison_value = parse_filesize(m.group('intval') + 'B')
4358 if comparison_value is None:
4360 'Invalid integer value %r in filter part %r' % (
4361 m.group('intval'), filter_part))
4362 if actual_value is None:
4363 return m.group('none_inclusive')
4364 return op(actual_value, comparison_value)
4367 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
4368 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
4370 operator_rex = re.compile(r'''(?x
)\s
*
4371 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
4373 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
4374 m = operator_rex.search(filter_part)
4376 op = UNARY_OPERATORS[m.group('op')]
4377 actual_value = dct.get(m.group('key'))
4378 return op(actual_value)
4380 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in parts)
def match_filter_func(filter_str):
    """Build a match-filter callback: the callback returns None to accept a
    video, or a skip-message string to reject it."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression to seconds (float); None when the
    expression is empty or unrecognized."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, secs = mobj.groups()
        # A ':' before the fraction means frames in some files; treat it
        # as a decimal point, as the original did.
        return 3600 * int(hours) + 60 * int(minutes) + float(secs.replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format *seconds* as an SRT timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates the float parts, matching the original one-liner.
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
4417 def dfxp2srt(dfxp_data):
4419 @param dfxp_data A
bytes-like
object containing DFXP data
4420 @returns A
unicode object containing converted SRT data
4422 LEGACY_NAMESPACES = (
4423 (b'http://www.w3.org/ns/ttml', [
4424 b'http://www.w3.org/2004/11/ttaf1',
4425 b'http://www.w3.org/2006/04/ttaf1',
4426 b'http://www.w3.org/2006/10/ttaf1',
4428 (b'http://www.w3.org/ns/ttml#styling', [
4429 b'http://www.w3.org/ns/ttml#style',
4433 SUPPORTED_STYLING = [
4442 _x = functools.partial(xpath_with_ns, ns_map={
4443 'xml': 'http://www.w3.org/XML/1998/namespace',
4444 'ttml': 'http://www.w3.org/ns/ttml',
4445 'tts': 'http://www.w3.org/ns/ttml#styling',
4451 class TTMLPElementParser(object):
4453 _unclosed_elements = []
4454 _applied_styles = []
4456 def start(self, tag, attrib):
4457 if tag in (_x('ttml:br'), 'br'):
4460 unclosed_elements = []
4462 element_style_id = attrib.get('style')
4464 style.update(default_style)
4465 if element_style_id:
4466 style.update(styles.get(element_style_id, {}))
4467 for prop in SUPPORTED_STYLING:
4468 prop_val = attrib.get(_x('tts:' + prop))
4470 style[prop] = prop_val
4473 for k, v in sorted(style.items()):
4474 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4477 font += ' color="%s"' % v
4478 elif k == 'fontSize':
4479 font += ' size="%s"' % v
4480 elif k == 'fontFamily':
4481 font += ' face="%s"' % v
4482 elif k == 'fontWeight' and v == 'bold':
4484 unclosed_elements.append('b')
4485 elif k == 'fontStyle' and v == 'italic':
4487 unclosed_elements.append('i')
4488 elif k == 'textDecoration' and v == 'underline':
4490 unclosed_elements.append('u')
4492 self._out += '<font' + font + '>'
4493 unclosed_elements.append('font')
4495 if self._applied_styles:
4496 applied_style.update(self._applied_styles[-1])
4497 applied_style.update(style)
4498 self._applied_styles.append(applied_style)
4499 self._unclosed_elements.append(unclosed_elements)
4502 if tag not in (_x('ttml:br'), 'br'):
4503 unclosed_elements = self._unclosed_elements.pop()
4504 for element in reversed(unclosed_elements):
4505 self._out += '</%s>' % element
4506 if unclosed_elements and self._applied_styles:
4507 self._applied_styles.pop()
4509 def data(self, data):
4513 return self._out.strip()
4515 def parse_node(node):
4516 target = TTMLPElementParser()
4517 parser = xml.etree.ElementTree.XMLParser(target=target)
4518 parser.feed(xml.etree.ElementTree.tostring(node))
4519 return parser.close()
4521 for k, v in LEGACY_NAMESPACES:
4523 dfxp_data = dfxp_data.replace(ns, k)
4525 dfxp = compat_etree_fromstring(dfxp_data)
4527 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4530 raise ValueError('Invalid dfxp/TTML subtitle')
4534 for style in dfxp.findall(_x('.//ttml:style')):
4535 style_id = style.get('id') or style.get(_x('xml:id'))
4538 parent_style_id = style.get('style')
4540 if parent_style_id not in styles:
4543 styles[style_id] = styles[parent_style_id].copy()
4544 for prop in SUPPORTED_STYLING:
4545 prop_val = style.get(_x('tts:' + prop))
4547 styles.setdefault(style_id, {})[prop] = prop_val
4553 for p in ('body', 'div'):
4554 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4557 style = styles.get(ele.get('style'))
4560 default_style.update(style)
4562 for para, index in zip(paras, itertools.count(1)):
4563 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4564 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4565 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4566 if begin_time is None:
4571 end_time = begin_time + dur
4572 out.append('%d\n%s --> %s\n%s\n\n' % (
4574 srt_subtitles_timecode(begin_time),
4575 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Render ``[command_option, value]`` for a set option, else []."""
    value = params.get(param)
    if value:
        value = compat_str(value)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean option as CLI arguments; [] when unset."""
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit ``[command_option]`` when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the list stored at params[param], or *default* when unset."""
    # NOTE(review): the mutable default list is shared across calls and
    # returned as-is; callers must not mutate the result — confirm.
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
4611 class ISO639Utils(object):
4612 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4671 'iw': 'heb', # Replaced by he in 1989 revision
4681 'in': 'ind', # Replaced by id in 1989 revision
4796 'ji': 'yid', # Replaced by yi in 1989 revision
4804 def short2long(cls, code):
4805 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4806 return cls._lang_map.get(code[:2])
4809 def long2short(cls, code):
4810 """Convert language code from ISO 639-2/T to ISO 639-1"""
4811 for short_name, long_name in cls._lang_map.items():
4812 if long_name == code:
4816 class ISO3166Utils(object):
4817 # From http://data.okfn.org/data/core/country-list
4819 'AF': 'Afghanistan',
4820 'AX': 'Ć
land Islands',
4823 'AS': 'American Samoa',
4828 'AG': 'Antigua and Barbuda',
4845 'BO': 'Bolivia, Plurinational State of',
4846 'BQ': 'Bonaire, Sint Eustatius and Saba',
4847 'BA': 'Bosnia and Herzegovina',
4849 'BV': 'Bouvet Island',
4851 'IO': 'British Indian Ocean Territory',
4852 'BN': 'Brunei Darussalam',
4854 'BF': 'Burkina Faso',
4860 'KY': 'Cayman Islands',
4861 'CF': 'Central African Republic',
4865 'CX': 'Christmas Island',
4866 'CC': 'Cocos (Keeling) Islands',
4870 'CD': 'Congo, the Democratic Republic of the',
4871 'CK': 'Cook Islands',
4873 'CI': 'CĆ“te d\'Ivoire',
4878 'CZ': 'Czech Republic',
4882 'DO': 'Dominican Republic',
4885 'SV': 'El Salvador',
4886 'GQ': 'Equatorial Guinea',
4890 'FK': 'Falkland Islands (Malvinas)',
4891 'FO': 'Faroe Islands',
4895 'GF': 'French Guiana',
4896 'PF': 'French Polynesia',
4897 'TF': 'French Southern Territories',
4912 'GW': 'Guinea-Bissau',
4915 'HM': 'Heard Island and McDonald Islands',
4916 'VA': 'Holy See (Vatican City State)',
4923 'IR': 'Iran, Islamic Republic of',
4926 'IM': 'Isle of Man',
4936 'KP': 'Korea, Democratic People\'s Republic of',
4937 'KR': 'Korea, Republic of',
4940 'LA': 'Lao People\'s Democratic Republic',
4946 'LI': 'Liechtenstein',
4950 'MK': 'Macedonia, the Former Yugoslav Republic of',
4957 'MH': 'Marshall Islands',
4963 'FM': 'Micronesia, Federated States of',
4964 'MD': 'Moldova, Republic of',
4975 'NL': 'Netherlands',
4976 'NC': 'New Caledonia',
4977 'NZ': 'New Zealand',
4982 'NF': 'Norfolk Island',
4983 'MP': 'Northern Mariana Islands',
4988 'PS': 'Palestine, State of',
4990 'PG': 'Papua New Guinea',
4993 'PH': 'Philippines',
4997 'PR': 'Puerto Rico',
5001 'RU': 'Russian Federation',
5003 'BL': 'Saint BarthƩlemy',
5004 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5005 'KN': 'Saint Kitts and Nevis',
5006 'LC': 'Saint Lucia',
5007 'MF': 'Saint Martin (French part)',
5008 'PM': 'Saint Pierre and Miquelon',
5009 'VC': 'Saint Vincent and the Grenadines',
5012 'ST': 'Sao Tome and Principe',
5013 'SA': 'Saudi Arabia',
5017 'SL': 'Sierra Leone',
5019 'SX': 'Sint Maarten (Dutch part)',
5022 'SB': 'Solomon Islands',
5024 'ZA': 'South Africa',
5025 'GS': 'South Georgia and the South Sandwich Islands',
5026 'SS': 'South Sudan',
5031 'SJ': 'Svalbard and Jan Mayen',
5034 'CH': 'Switzerland',
5035 'SY': 'Syrian Arab Republic',
5036 'TW': 'Taiwan, Province of China',
5038 'TZ': 'Tanzania, United Republic of',
5040 'TL': 'Timor-Leste',
5044 'TT': 'Trinidad and Tobago',
5047 'TM': 'Turkmenistan',
5048 'TC': 'Turks and Caicos Islands',
5052 'AE': 'United Arab Emirates',
5053 'GB': 'United Kingdom',
5054 'US': 'United States',
5055 'UM': 'United States Minor Outlying Islands',
5059 'VE': 'Venezuela, Bolivarian Republic of',
5061 'VG': 'Virgin Islands, British',
5062 'VI': 'Virgin Islands, U.S.',
5063 'WF': 'Wallis and Futuna',
5064 'EH': 'Western Sahara',
5071 def short2full(cls, code):
5072 """Convert an ISO 3166-2 country code to the corresponding full name"""
5073 return cls._country_map.get(code.upper())
5076 class GeoUtils(object):
5077 # Major IPv4 address blocks per country
5079 'AD': '46.172.224.0/19',
5080 'AE': '94.200.0.0/13',
5081 'AF': '149.54.0.0/17',
5082 'AG': '209.59.64.0/18',
5083 'AI': '204.14.248.0/21',
5084 'AL': '46.99.0.0/16',
5085 'AM': '46.70.0.0/15',
5086 'AO': '105.168.0.0/13',
5087 'AP': '182.50.184.0/21',
5088 'AQ': '23.154.160.0/24',
5089 'AR': '181.0.0.0/12',
5090 'AS': '202.70.112.0/20',
5091 'AT': '77.116.0.0/14',
5092 'AU': '1.128.0.0/11',
5093 'AW': '181.41.0.0/18',
5094 'AX': '185.217.4.0/22',
5095 'AZ': '5.197.0.0/16',
5096 'BA': '31.176.128.0/17',
5097 'BB': '65.48.128.0/17',
5098 'BD': '114.130.0.0/16',
5100 'BF': '102.178.0.0/15',
5101 'BG': '95.42.0.0/15',
5102 'BH': '37.131.0.0/17',
5103 'BI': '154.117.192.0/18',
5104 'BJ': '137.255.0.0/16',
5105 'BL': '185.212.72.0/23',
5106 'BM': '196.12.64.0/18',
5107 'BN': '156.31.0.0/16',
5108 'BO': '161.56.0.0/16',
5109 'BQ': '161.0.80.0/20',
5110 'BR': '191.128.0.0/12',
5111 'BS': '24.51.64.0/18',
5112 'BT': '119.2.96.0/19',
5113 'BW': '168.167.0.0/16',
5114 'BY': '178.120.0.0/13',
5115 'BZ': '179.42.192.0/18',
5116 'CA': '99.224.0.0/11',
5117 'CD': '41.243.0.0/16',
5118 'CF': '197.242.176.0/21',
5119 'CG': '160.113.0.0/16',
5120 'CH': '85.0.0.0/13',
5121 'CI': '102.136.0.0/14',
5122 'CK': '202.65.32.0/19',
5123 'CL': '152.172.0.0/14',
5124 'CM': '102.244.0.0/14',
5125 'CN': '36.128.0.0/10',
5126 'CO': '181.240.0.0/12',
5127 'CR': '201.192.0.0/12',
5128 'CU': '152.206.0.0/15',
5129 'CV': '165.90.96.0/19',
5130 'CW': '190.88.128.0/17',
5131 'CY': '31.153.0.0/16',
5132 'CZ': '88.100.0.0/14',
5134 'DJ': '197.241.0.0/17',
5135 'DK': '87.48.0.0/12',
5136 'DM': '192.243.48.0/20',
5137 'DO': '152.166.0.0/15',
5138 'DZ': '41.96.0.0/12',
5139 'EC': '186.68.0.0/15',
5140 'EE': '90.190.0.0/15',
5141 'EG': '156.160.0.0/11',
5142 'ER': '196.200.96.0/20',
5143 'ES': '88.0.0.0/11',
5144 'ET': '196.188.0.0/14',
5145 'EU': '2.16.0.0/13',
5146 'FI': '91.152.0.0/13',
5147 'FJ': '144.120.0.0/16',
5148 'FK': '80.73.208.0/21',
5149 'FM': '119.252.112.0/20',
5150 'FO': '88.85.32.0/19',
5152 'GA': '41.158.0.0/15',
5154 'GD': '74.122.88.0/21',
5155 'GE': '31.146.0.0/16',
5156 'GF': '161.22.64.0/18',
5157 'GG': '62.68.160.0/19',
5158 'GH': '154.160.0.0/12',
5159 'GI': '95.164.0.0/16',
5160 'GL': '88.83.0.0/19',
5161 'GM': '160.182.0.0/15',
5162 'GN': '197.149.192.0/18',
5163 'GP': '104.250.0.0/19',
5164 'GQ': '105.235.224.0/20',
5165 'GR': '94.64.0.0/13',
5166 'GT': '168.234.0.0/16',
5167 'GU': '168.123.0.0/16',
5168 'GW': '197.214.80.0/20',
5169 'GY': '181.41.64.0/18',
5170 'HK': '113.252.0.0/14',
5171 'HN': '181.210.0.0/16',
5172 'HR': '93.136.0.0/13',
5173 'HT': '148.102.128.0/17',
5174 'HU': '84.0.0.0/14',
5175 'ID': '39.192.0.0/10',
5176 'IE': '87.32.0.0/12',
5177 'IL': '79.176.0.0/13',
5178 'IM': '5.62.80.0/20',
5179 'IN': '117.192.0.0/10',
5180 'IO': '203.83.48.0/21',
5181 'IQ': '37.236.0.0/14',
5182 'IR': '2.176.0.0/12',
5183 'IS': '82.221.0.0/16',
5184 'IT': '79.0.0.0/10',
5185 'JE': '87.244.64.0/18',
5186 'JM': '72.27.0.0/17',
5187 'JO': '176.29.0.0/16',
5188 'JP': '133.0.0.0/8',
5189 'KE': '105.48.0.0/12',
5190 'KG': '158.181.128.0/17',
5191 'KH': '36.37.128.0/17',
5192 'KI': '103.25.140.0/22',
5193 'KM': '197.255.224.0/20',
5194 'KN': '198.167.192.0/19',
5195 'KP': '175.45.176.0/22',
5196 'KR': '175.192.0.0/10',
5197 'KW': '37.36.0.0/14',
5198 'KY': '64.96.0.0/15',
5199 'KZ': '2.72.0.0/13',
5200 'LA': '115.84.64.0/18',
5201 'LB': '178.135.0.0/16',
5202 'LC': '24.92.144.0/20',
5203 'LI': '82.117.0.0/19',
5204 'LK': '112.134.0.0/15',
5205 'LR': '102.183.0.0/16',
5206 'LS': '129.232.0.0/17',
5207 'LT': '78.56.0.0/13',
5208 'LU': '188.42.0.0/16',
5209 'LV': '46.109.0.0/16',
5210 'LY': '41.252.0.0/14',
5211 'MA': '105.128.0.0/11',
5212 'MC': '88.209.64.0/18',
5213 'MD': '37.246.0.0/16',
5214 'ME': '178.175.0.0/17',
5215 'MF': '74.112.232.0/21',
5216 'MG': '154.126.0.0/17',
5217 'MH': '117.103.88.0/21',
5218 'MK': '77.28.0.0/15',
5219 'ML': '154.118.128.0/18',
5220 'MM': '37.111.0.0/17',
5221 'MN': '49.0.128.0/17',
5222 'MO': '60.246.0.0/16',
5223 'MP': '202.88.64.0/20',
5224 'MQ': '109.203.224.0/19',
5225 'MR': '41.188.64.0/18',
5226 'MS': '208.90.112.0/22',
5227 'MT': '46.11.0.0/16',
5228 'MU': '105.16.0.0/12',
5229 'MV': '27.114.128.0/18',
5230 'MW': '102.70.0.0/15',
5231 'MX': '187.192.0.0/11',
5232 'MY': '175.136.0.0/13',
5233 'MZ': '197.218.0.0/15',
5234 'NA': '41.182.0.0/16',
5235 'NC': '101.101.0.0/18',
5236 'NE': '197.214.0.0/18',
5237 'NF': '203.17.240.0/22',
5238 'NG': '105.112.0.0/12',
5239 'NI': '186.76.0.0/15',
5240 'NL': '145.96.0.0/11',
5241 'NO': '84.208.0.0/13',
5242 'NP': '36.252.0.0/15',
5243 'NR': '203.98.224.0/19',
5244 'NU': '49.156.48.0/22',
5245 'NZ': '49.224.0.0/14',
5246 'OM': '5.36.0.0/15',
5247 'PA': '186.72.0.0/15',
5248 'PE': '186.160.0.0/14',
5249 'PF': '123.50.64.0/18',
5250 'PG': '124.240.192.0/19',
5251 'PH': '49.144.0.0/13',
5252 'PK': '39.32.0.0/11',
5253 'PL': '83.0.0.0/11',
5254 'PM': '70.36.0.0/20',
5255 'PR': '66.50.0.0/16',
5256 'PS': '188.161.0.0/16',
5257 'PT': '85.240.0.0/13',
5258 'PW': '202.124.224.0/20',
5259 'PY': '181.120.0.0/14',
5260 'QA': '37.210.0.0/15',
5261 'RE': '102.35.0.0/16',
5262 'RO': '79.112.0.0/13',
5263 'RS': '93.86.0.0/15',
5264 'RU': '5.136.0.0/13',
5265 'RW': '41.186.0.0/16',
5266 'SA': '188.48.0.0/13',
5267 'SB': '202.1.160.0/19',
5268 'SC': '154.192.0.0/11',
5269 'SD': '102.120.0.0/13',
5270 'SE': '78.64.0.0/12',
5271 'SG': '8.128.0.0/10',
5272 'SI': '188.196.0.0/14',
5273 'SK': '78.98.0.0/15',
5274 'SL': '102.143.0.0/17',
5275 'SM': '89.186.32.0/19',
5276 'SN': '41.82.0.0/15',
5277 'SO': '154.115.192.0/18',
5278 'SR': '186.179.128.0/17',
5279 'SS': '105.235.208.0/21',
5280 'ST': '197.159.160.0/19',
5281 'SV': '168.243.0.0/16',
5282 'SX': '190.102.0.0/20',
5284 'SZ': '41.84.224.0/19',
5285 'TC': '65.255.48.0/20',
5286 'TD': '154.68.128.0/19',
5287 'TG': '196.168.0.0/14',
5288 'TH': '171.96.0.0/13',
5289 'TJ': '85.9.128.0/18',
5290 'TK': '27.96.24.0/21',
5291 'TL': '180.189.160.0/20',
5292 'TM': '95.85.96.0/19',
5293 'TN': '197.0.0.0/11',
5294 'TO': '175.176.144.0/21',
5295 'TR': '78.160.0.0/11',
5296 'TT': '186.44.0.0/15',
5297 'TV': '202.2.96.0/19',
5298 'TW': '120.96.0.0/11',
5299 'TZ': '156.156.0.0/14',
5300 'UA': '37.52.0.0/14',
5301 'UG': '102.80.0.0/13',
5303 'UY': '167.56.0.0/13',
5304 'UZ': '84.54.64.0/18',
5305 'VA': '212.77.0.0/19',
5306 'VC': '207.191.240.0/21',
5307 'VE': '186.88.0.0/13',
5308 'VG': '66.81.192.0/20',
5309 'VI': '146.226.0.0/16',
5310 'VN': '14.160.0.0/11',
5311 'VU': '202.80.32.0/20',
5312 'WF': '117.20.32.0/21',
5313 'WS': '202.4.32.0/19',
5314 'YE': '134.35.0.0/16',
5315 'YT': '41.242.116.0/22',
5316 'ZA': '41.0.0.0/11',
5317 'ZM': '102.144.0.0/13',
5318 'ZW': '102.177.192.0/18',
5322 def random_ipv4(cls, code_or_block):
5323 if len(code_or_block) == 2:
5324 block = cls._country_ip_map.get(code_or_block.upper())
5328 block = code_or_block
5329 addr, preflen = block.split('/')
5330 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5331 addr_max = addr_min | (0xffffffff >> int(preflen))
5332 return compat_str(socket.inet_ntoa(
5333 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Install http/https openers that funnel through proxy_open with a
        # '__noproxy__' placeholder; real proxies from `proxies` override it.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5361 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5362 # released into Public Domain
5363 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # Emit 32 bits at a time, most significant word first.
    out = b''
    n = int(n)
    while n > 0:
        out = compat_struct_pack('>I', n & 0xffffffff) + out
        n = n >> 32
    # Strip the leading zero bytes left over from whole-word packing.
    for idx in range(len(out)):
        if out[idx] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        out = b'\000'
        idx = 0
    out = out[idx:]
    # Re-pad the front to a multiple of blocksize when requested.
    if blocksize > 0 and len(out) % blocksize:
        out = (blocksize - len(out) % blocksize) * b'\000' + out
    return out
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    total = 0
    length = len(s)
    # Left-pad with zero bytes so we can consume exact 32-bit words.
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for offset in range(0, length, 4):
        total = (total << 32) + compat_struct_unpack('>I', s[offset:offset + 4])[0]
    return total
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # Reversing the bytes makes the payload a little-endian integer.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Block type 2 layout: 0x00 0x02 <random filler> 0x00 <data>
    filler = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + filler + [0] + data
def encode_base_n(num, n, table=None):
    """Encode non-negative integer *num* in base *n* using *table*
    (default table: digits, lowercase, uppercase)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        digits.append(table[num % n])
        num = num // n
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Decode a P.A.C.K.E.R.-obfuscated JavaScript blob back to source."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfucasted_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Map each base-n token back to the symbol it replaced.
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        key = encode_base_n(idx, base)
        symbol_table[key] = symbols[idx] or key

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfucasted_code)
def caesar(s, alphabet, shift):
    """Shift every character of *s* that occurs in *alphabet* by *shift*
    positions (wrapping); characters outside the alphabet pass through."""
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for ch in s:
        if ch in alphabet:
            shifted.append(alphabet[(alphabet.index(ch) + shift) % size])
        else:
            shifted.append(ch)
    return ''.join(shifted)
def rot47(s):
    """Apply the ROT47 cipher: a Caesar shift of 47 over the printable
    ASCII range '!'..'~'."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=val,KEY="quoted",...') into a
    dict, stripping surrounding quotes from quoted values."""
    attr_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in re.findall(attr_re, attrib)
    }
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's ``>>>`` operator)."""
    if val >= 0:
        return val >> n
    return (val + 0x100000000) >> n
5506 # Based on png2str() written by @gdkchan and improved by @yokrysty
5507 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5508 def decode_png(png_data
):
5509 # Reference: https://www.w3.org/TR/PNG/
5510 header
= png_data
[8:]
5512 if png_data
[:8] != b
'\x89PNG\x0d\x0a\x1a\x0a' or header
[4:8] != b
'IHDR':
5513 raise IOError('Not a valid PNG file.')
5515 int_map
= {1: '>B', 2: '>H', 4: '>I'}
5516 unpack_integer
= lambda x
: compat_struct_unpack(int_map
[len(x
)], x
)[0]
5521 length
= unpack_integer(header
[:4])
5524 chunk_type
= header
[:4]
5527 chunk_data
= header
[:length
]
5528 header
= header
[length
:]
5530 header
= header
[4:] # Skip CRC
5538 ihdr
= chunks
[0]['data']
5540 width
= unpack_integer(ihdr
[:4])
5541 height
= unpack_integer(ihdr
[4:8])
5545 for chunk
in chunks
:
5546 if chunk
['type'] == b
'IDAT':
5547 idat
+= chunk
['data']
5550 raise IOError('Unable to read PNG data.')
5552 decompressed_data
= bytearray(zlib
.decompress(idat
))
5557 def _get_pixel(idx
):
5562 for y
in range(height
):
5563 basePos
= y
* (1 + stride
)
5564 filter_type
= decompressed_data
[basePos
]
5568 pixels
.append(current_row
)
5570 for x
in range(stride
):
5571 color
= decompressed_data
[1 + basePos
+ x
]
5572 basex
= y
* stride
+ x
5577 left
= _get_pixel(basex
- 3)
5579 up
= _get_pixel(basex
- stride
)
5581 if filter_type
== 1: # Sub
5582 color
= (color
+ left
) & 0xff
5583 elif filter_type
== 2: # Up
5584 color
= (color
+ up
) & 0xff
5585 elif filter_type
== 3: # Average
5586 color
= (color
+ ((left
+ up
) >> 1)) & 0xff
5587 elif filter_type
== 4: # Paeth
5593 c
= _get_pixel(basex
- stride
- 3)
5601 if pa
<= pb
and pa
<= pc
:
5602 color
= (color
+ a
) & 0xff
5604 color
= (color
+ b
) & 0xff
5606 color
= (color
+ c
) & 0xff
5608 current_row
.append(color
)
5610 return width
, height
, pixels
5613 def write_xattr(path
, key
, value
):
5614 # This mess below finds the best xattr tool for the job
5616 # try the pyxattr module...
5619 if hasattr(xattr
, 'set'): # pyxattr
5620 # Unicode arguments are not supported in python-pyxattr until
5622 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5623 pyxattr_required_version
= '0.5.0'
5624 if version_tuple(xattr
.__version
__) < version_tuple(pyxattr_required_version
):
5625 # TODO: fallback to CLI tools
5626 raise XAttrUnavailableError(
5627 'python-pyxattr is detected but is too old. '
5628 'youtube-dl requires %s or above while your version is %s. '
5629 'Falling back to other xattr implementations' % (
5630 pyxattr_required_version
, xattr
.__version
__))
5632 setxattr
= xattr
.set
5634 setxattr
= xattr
.setxattr
5637 setxattr(path
, key
, value
)
5638 except EnvironmentError as e
:
5639 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5642 if compat_os_name
== 'nt':
5643 # Write xattrs to NTFS Alternate Data Streams:
5644 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5645 assert ':' not in key
5646 assert os
.path
.exists(path
)
5648 ads_fn
= path
+ ':' + key
5650 with open(ads_fn
, 'wb') as f
:
5652 except EnvironmentError as e
:
5653 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5655 user_has_setfattr
= check_executable('setfattr', ['--version'])
5656 user_has_xattr
= check_executable('xattr', ['-h'])
5658 if user_has_setfattr
or user_has_xattr
:
5660 value
= value
.decode('utf-8')
5661 if user_has_setfattr
:
5662 executable
= 'setfattr'
5663 opts
= ['-n', key
, '-v', value
]
5664 elif user_has_xattr
:
5665 executable
= 'xattr'
5666 opts
= ['-w', key
, value
]
5668 cmd
= ([encodeFilename(executable
, True)]
5669 + [encodeArgument(o
) for o
in opts
]
5670 + [encodeFilename(path
, True)])
5673 p
= subprocess
.Popen(
5674 cmd
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
, stdin
=subprocess
.PIPE
)
5675 except EnvironmentError as e
:
5676 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5677 stdout
, stderr
= p
.communicate()
5678 stderr
= stderr
.decode('utf-8', 'replace')
5679 if p
.returncode
!= 0:
5680 raise XAttrMetadataError(p
.returncode
, stderr
)
5683 # On Unix, and can't find pyxattr, setfattr, or xattr.
5684 if sys
.platform
.startswith('linux'):
5685 raise XAttrUnavailableError(
5686 "Couldn't find a tool to set the xattrs. "
5687 "Install either the python 'pyxattr' or 'xattr' "
5688 "modules, or the GNU 'attr' package "
5689 "(which contains the 'setfattr' tool).")
5691 raise XAttrUnavailableError(
5692 "Couldn't find a tool to set the xattrs. "
5693 "Install either the python 'xattr' module, "
5694 "or the 'xattr' binary.")
5697 def random_birthday(year_field
, month_field
, day_field
):
5698 start_date
= datetime
.date(1950, 1, 1)
5699 end_date
= datetime
.date(1995, 12, 31)
5700 offset
= random
.randint(0, (end_date
- start_date
).days
)
5701 random_date
= start_date
+ datetime
.timedelta(offset
)
5703 year_field
: str(random_date
.year
),
5704 month_field
: str(random_date
.month
),
5705 day_field
: str(random_date
.day
),