4 from __future__
import unicode_literals
36 import xml
.etree
.ElementTree
40 compat_HTMLParseError
,
45 compat_ctypes_WINFUNCTYPE
,
46 compat_etree_fromstring
,
49 compat_html_entities_html5
,
61 compat_urllib_parse_urlencode
,
62 compat_urllib_parse_urlparse
,
63 compat_urllib_parse_unquote_plus
,
64 compat_urllib_request
,
def register_socks_protocols():
    """'Register' the SOCKS URL schemes with the urlparse machinery.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose protocols are not listed
    in urlparse.uses_netloc are not handled correctly, so every socks
    scheme is appended to that registry when missing.
    """
    for proto in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proto not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(proto)
# Type object of a compiled regular expression, used for isinstance()
# checks elsewhere (the concrete class is not publicly exposed by `re`).
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
88 def random_user_agent():
89 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1668 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1672 'User-Agent': random_user_agent(),
1673 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1674 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1675 'Accept-Encoding': 'gzip, deflate',
1676 'Accept-Language': 'en-us,en;q=0.5',
1681 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Unique sentinel distinguishing "no default supplied" from an explicit
# default of None (used by the xpath_* helpers below).
NO_DEFAULT = object()
# English month names in calendar order (index = month number - 1)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1692 'en': ENGLISH_MONTH_NAMES
,
1694 'janvier', 'fƩvrier', 'mars', 'avril', 'mai', 'juin',
1695 'juillet', 'aoƻt', 'septembre', 'octobre', 'novembre', 'dƩcembre'],
1698 KNOWN_EXTENSIONS
= (
1699 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1700 'flv', 'f4v', 'f4a', 'f4b',
1701 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1702 'mkv', 'mka', 'mk3d',
1705 'asf', 'wmv', 'wma',
1711 'f4f', 'f4m', 'm3u8', 'smil')
1713 # needed for sanitizing filenames in restricted mode
1714 ACCENT_CHARS
= dict(zip('ĆĆĆĆĆĆ
ĆĆĆĆĆĆĆĆĆĆĆĆĆĆĆĆĆÅĆÅĆĆĆĆÅ°ĆĆĆĆ Ć”Ć¢Ć£Ć¤Ć„Ć¦Ć§ĆØĆ©ĆŖƫƬĆĆ®ĆÆĆ°Ć±Ć²Ć³Ć“ĆµĆ¶ÅĆøÅĆ¹ĆŗĆ»Ć¼Å±Ć½Ć¾Ćæ',
1715 itertools
.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
1716 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1739 '%Y/%m/%d %H:%M:%S',
1741 '%Y-%m-%d %H:%M:%S',
1742 '%Y-%m-%d %H:%M:%S.%f',
1745 '%Y-%m-%dT%H:%M:%SZ',
1746 '%Y-%m-%dT%H:%M:%S.%fZ',
1747 '%Y-%m-%dT%H:%M:%S.%f0Z',
1748 '%Y-%m-%dT%H:%M:%S',
1749 '%Y-%m-%dT%H:%M:%S.%f',
1751 '%b %d %Y at %H:%M',
1752 '%b %d %Y at %H:%M:%S',
1753 '%B %d %Y at %H:%M',
1754 '%B %d %Y at %H:%M:%S',
1757 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1758 DATE_FORMATS_DAY_FIRST
.extend([
1764 '%d/%m/%Y %H:%M:%S',
1767 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1768 DATE_FORMATS_MONTH_FIRST
.extend([
1773 '%m/%d/%Y %H:%M:%S',
# Matches the argument list of "packed" JavaScript payloads
# (eval(function(p,a,c,k,e,d){...}('...',N,N,'...'.split('|')) style):
# captures the packed code, the radix, the symbol count and the
# '|'-joined symbol dictionary.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element; the payload is
# captured in the named group "json_ld". (?is) = case-insensitive +
# DOTALL so the payload may span multiple lines; \1 backreferences the
# (optional) quote character around the type value.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
1780 def preferredencoding():
1781 """Get preferred encoding.
1783 Returns the best encoding scheme for the system, based on
1784 locale.getpreferredencoding() and some further tweaks.
1787 pref = locale.getpreferredencoding()
1795 def write_json_file(obj, fn):
1796 """ Encode obj as JSON and write it to fn, atomically if possible """
1798 fn = encodeFilename(fn)
1799 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1800 encoding = get_filesystem_encoding()
1801 # os.path.basename returns a bytes object, but NamedTemporaryFile
1802 # will fail if the filename contains non ascii characters unless we
1803 # use a unicode object
1804 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1805 # the same for os.path.dirname
1806 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1808 path_basename = os.path.basename
1809 path_dirname = os.path.dirname
1813 'prefix
': path_basename(fn) + '.',
1814 'dir': path_dirname(fn),
1818 # In Python 2.x, json.dump expects a bytestream.
1819 # In Python 3.x, it writes to a character stream
1820 if sys.version_info < (3, 0):
1825 'encoding
': 'utf
-8',
1828 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1833 if sys.platform == 'win32
':
1834 # Need to remove existing file on Windows, else os.rename raises
1835 # WindowsError or FileExistsError.
1840 os.rename(tf.name, fn)
1849 if sys.version_info >= (2, 7):
1850 def find_xpath_attr(node, xpath, key, val=None):
1851 """ Find the xpath xpath[@key=val] """
1852 assert re.match(r'^
[a
-zA
-Z_
-]+$
', key)
1853 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
1854 return node.find(expr)
1856 def find_xpath_attr(node, xpath, key, val=None):
1857 for f in node.findall(compat_xpath(xpath)):
1858 if key not in f.attrib:
1860 if val is None or f.attrib.get(key) == val:
1864 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1865 # the namespace parameter
1868 def xpath_with_ns(path
, ns_map
):
1869 components
= [c
.split(':') for c
in path
.split('/')]
1871 for c
in components
:
1873 replaced
.append(c
[0])
1876 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
1877 return '/'.join(replaced
)
1880 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1881 def _find_xpath(xpath
):
1882 return node
.find(compat_xpath(xpath
))
1884 if isinstance(xpath
, (str, compat_str
)):
1885 n
= _find_xpath(xpath
)
1893 if default
is not NO_DEFAULT
:
1896 name
= xpath
if name
is None else name
1897 raise ExtractorError('Could not find XML element %s' % name
)
1903 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1904 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1905 if n
is None or n
== default
:
1908 if default
is not NO_DEFAULT
:
1911 name
= xpath
if name
is None else name
1912 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1918 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1919 n
= find_xpath_attr(node
, xpath
, key
)
1921 if default
is not NO_DEFAULT
:
1924 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1925 raise ExtractorError('Could not find XML attribute %s' % name
)
1928 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given ID attribute in the
    passed HTML document, or None when no such tag exists."""
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the
    passed HTML document, or None when nothing matches."""
    matches = get_elements_by_class(class_name, html)
    if matches:
        return matches[0]
    return None
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose `attribute` equals `value`
    in the passed HTML document, or None when nothing matches."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if matches:
        return matches[0]
    return None
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed
    HTML document as a list."""
    # Match class_name as a whole word anywhere inside the class attribute
    # value; escaping is disabled because the pattern itself is a regex.
    class_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_re, html, escape_value=False)
1954 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1955 """Return the content of the tag with the specified attribute in the passed HTML document"""
1957 value = re.escape(value) if escape_value else value
1960 for m in re.finditer(r'''(?xs)
1962 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1964 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1968 ''' % (re.escape(attribute), value), html):
1969 res = m.group('content
')
1971 if res.startswith('"') or res.startswith("'"):
1974 retlist.append(unescapeHTML(res))
1979 class HTMLAttributeParser(compat_HTMLParser):
1980 """Trivial HTML parser to gather the attributes for a single element"""
1983 compat_HTMLParser.__init__(self)
1985 def handle_starttag(self, tag, attrs):
1986 self.attrs = dict(attrs)
1989 def extract_attributes(html_element):
1990 """Given a string for an HTML element such as
1992 a="foo" B="bar" c="&98;az" d=boz
1993 empty= noval entity="&"
1996 Decode and return a dictionary of attributes.
1998 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
1999 'empty
': '', 'noval
': None, 'entity
': '&',
2000 'sq
': '"', 'dq': '\''
2002 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2003 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2005 parser = HTMLAttributeParser()
2007 parser.feed(html_element)
2009 # Older Python may throw HTMLParseError in case of malformed HTML
2010 except compat_HTMLParseError:
2015 def clean_html(html):
2016 """Clean an HTML snippet into a readable string"""
2018 if html is None: # Convenience for sanitizing descriptions etc.
2022 html = html.replace('\n', ' ')
2023 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2024 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2026 html = re.sub('<.*?>', '', html)
2027 # Replace html entities
2028 html = unescapeHTML(html)
2032 def sanitize_open(filename, open_mode):
2033 """Try to open the given filename, and slightly tweak it if this fails.
2035 Attempts to open the given filename. If this fails, it tries to change
2036 the filename slightly, step by step, until it's either able to open it
2037 or it fails and raises a final exception, like the standard open()
2040 It returns the tuple (stream, definitive_file_name).
2044 if sys.platform == 'win32':
2046 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2047 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2048 stream = open(encodeFilename(filename), open_mode)
2049 return (stream, filename)
2050 except (IOError, OSError) as err:
2051 if err.errno in (errno.EACCES,):
2054 # In case of error, try to remove win32 forbidden chars
2055 alt_filename = sanitize_path(filename)
2056 if alt_filename == filename:
2059 # An exception here should be caught in the caller
2060 stream = open(encodeFilename(alt_filename), open_mode)
2061 return (stream, alt_filename)
2064 def timeconvert(timestr):
2065 """Convert RFC 2822 defined time string into system timestamp"""
2067 timetuple = email.utils.parsedate_tz(timestr)
2068 if timetuple is not None:
2069 timestamp = email.utils.mktime_tz(timetuple)
2073 def sanitize_filename(s, restricted=False, is_id=False):
2074 """Sanitizes a string so it could be used as part of a filename.
2075 If restricted is set, use a stricter subset of allowed characters.
2076 Set is_id if this is not an arbitrary string, but an ID that should be kept
2079 def replace_insane(char):
2080 if restricted and char in ACCENT_CHARS:
2081 return ACCENT_CHARS[char]
2082 if char == '?' or ord(char) < 32 or ord(char) == 127:
2085 return '' if restricted else '\''
2087 return '_
-' if restricted else ' -'
2088 elif char in '\\/|
*<>':
2090 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2092 if restricted
and ord(char
) > 127:
2097 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2098 result
= ''.join(map(replace_insane
, s
))
2100 while '__' in result
:
2101 result
= result
.replace('__', '_')
2102 result
= result
.strip('_')
2103 # Common case of "Foreign band name - English song title"
2104 if restricted
and result
.startswith('-_'):
2106 if result
.startswith('-'):
2107 result
= '_' + result
[len('-'):]
2108 result
= result
.lstrip('.')
2114 def sanitize_path(s
):
2115 """Sanitizes and normalizes path on Windows"""
2116 if sys
.platform
!= 'win32':
2118 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2119 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2120 drive_or_unc
, _
= os
.path
.splitunc(s
)
2121 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2125 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2126 for path_part
in norm_path
]
2128 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2129 return os
.path
.join(*sanitized_path
)
2132 def sanitize_url(url
):
2133 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
2134 # the number of unwanted failures due to missing protocol
2135 if url
.startswith('//'):
2136 return 'http:%s' % url
2137 # Fix some common typos seen so far
2139 # https://github.com/ytdl-org/youtube-dl/issues/15649
2140 (r
'^httpss://', r
'https://'),
2141 # https://bx1.be/lives/direct-tv/
2142 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
2144 for mistake
, fixup
in COMMON_TYPOS
:
2145 if re
.match(mistake
, url
):
2146 return re
.sub(mistake
, fixup
, url
)
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request after running the URL through
    sanitize_url() (scheme fix-ups for protocol-less/typo'd URLs)."""
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
2155 """Expand shell variables and ~"""
2156 return os
.path
.expandvars(compat_expanduser(s
))
2159 def orderedSet(iterable
):
2160 """ Remove all duplicates from the input iterable """
2168 def _htmlentity_transform(entity_with_semicolon
):
2169 """Transforms an HTML entity to a character."""
2170 entity
= entity_with_semicolon
[:-1]
2172 # Known non-numeric HTML entity
2173 if entity
in compat_html_entities
.name2codepoint
:
2174 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2176 # TODO: HTML5 allows entities without a semicolon. For example,
2177 # 'Éric' should be decoded as 'Ćric'.
2178 if entity_with_semicolon
in compat_html_entities_html5
:
2179 return compat_html_entities_html5
[entity_with_semicolon
]
2181 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2182 if mobj
is not None:
2183 numstr
= mobj
.group(1)
2184 if numstr
.startswith('x'):
2186 numstr
= '0%s' % numstr
2189 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2191 return compat_chr(int(numstr
, base
))
2195 # Unknown entity in name, return its literal representation
2196 return '&%s;' % entity
2199 def unescapeHTML(s
):
2202 assert type(s
) == compat_str
2205 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
2208 def get_subprocess_encoding():
2209 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2210 # For subprocess calls, encode with locale encoding
2211 # Refer to http://stackoverflow.com/a/9951851/35070
2212 encoding
= preferredencoding()
2214 encoding
= sys
.getfilesystemencoding()
2215 if encoding
is None:
2220 def encodeFilename(s
, for_subprocess
=False):
2222 @param s The name of the file
2225 assert type(s
) == compat_str
2227 # Python 3 has a Unicode API
2228 if sys
.version_info
>= (3, 0):
2231 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2232 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2233 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2234 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2237 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2238 if sys
.platform
.startswith('java'):
2241 return s
.encode(get_subprocess_encoding(), 'ignore')
2244 def decodeFilename(b
, for_subprocess
=False):
2246 if sys
.version_info
>= (3, 0):
2249 if not isinstance(b
, bytes):
2252 return b
.decode(get_subprocess_encoding(), 'ignore')
2255 def encodeArgument(s
):
2256 if not isinstance(s
, compat_str
):
2257 # Legacy code that uses byte strings
2258 # Uncomment the following line after fixing all post processors
2259 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
2260 s
= s
.decode('ascii')
2261 return encodeFilename(s
, True)
def decodeArgument(b):
    """Decode a subprocess argument byte string; see decodeFilename."""
    return decodeFilename(b, for_subprocess=True)
2268 def decodeOption(optval
):
2271 if isinstance(optval
, bytes):
2272 optval
= optval
.decode(preferredencoding())
2274 assert isinstance(optval
, compat_str
)
2278 def formatSeconds(secs
):
2280 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
2282 return '%d:%02d' % (secs
// 60, secs
% 60)
2287 def make_HTTPS_handler(params
, **kwargs
):
2288 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2289 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2290 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2291 if opts_no_check_certificate
:
2292 context
.check_hostname
= False
2293 context
.verify_mode
= ssl
.CERT_NONE
2295 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2298 # (create_default_context present but HTTPSHandler has no context=)
2301 if sys
.version_info
< (3, 2):
2302 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2303 else: # Python < 3.4
2304 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2305 context
.verify_mode
= (ssl
.CERT_NONE
2306 if opts_no_check_certificate
2307 else ssl
.CERT_REQUIRED
)
2308 context
.set_default_verify_paths()
2309 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2312 def bug_reports_message():
2313 if ytdl_is_updateable():
2314 update_cmd
= 'type youtube-dl -U to update'
2316 update_cmd
= 'see https://yt-dl.org/update on how to update'
2317 msg
= '; please report this issue on https://yt-dl.org/bug .'
2318 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
2319 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
2323 class YoutubeDLError(Exception):
2324 """Base exception for YoutubeDL errors."""
2328 class ExtractorError(YoutubeDLError
):
2329 """Error during info extraction."""
2331 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
2332 """ tb, if given, is the original traceback (so that it can be printed out).
2333 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
2336 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
2338 if video_id
is not None:
2339 msg
= video_id
+ ': ' + msg
2341 msg
+= ' (caused by %r)' % cause
2343 msg
+= bug_reports_message()
2344 super(ExtractorError
, self
).__init
__(msg
)
2347 self
.exc_info
= sys
.exc_info() # preserve original exception
2349 self
.video_id
= video_id
2351 def format_traceback(self
):
2352 if self
.traceback
is None:
2354 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
2364 class RegexNotFoundError(ExtractorError
):
2365 """Error when a regex didn't match"""
2369 class GeoRestrictedError(ExtractorError
):
2370 """Geographic restriction Error exception.
2372 This exception may be thrown when a video is not available from your
2373 geographic location due to geographic restrictions imposed by a website.
2375 def __init__(self
, msg
, countries
=None):
2376 super(GeoRestrictedError
, self
).__init
__(msg
, expected
=True)
2378 self
.countries
= countries
2381 class DownloadError(YoutubeDLError
):
2382 """Download Error exception.
2384 This exception may be thrown by FileDownloader objects if they are not
2385 configured to continue on errors. They will contain the appropriate
2389 def __init__(self
, msg
, exc_info
=None):
2390 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
2391 super(DownloadError
, self
).__init
__(msg
)
2392 self
.exc_info
= exc_info
2395 class SameFileError(YoutubeDLError
):
2396 """Same File exception.
2398 This exception will be thrown by FileDownloader objects if they detect
2399 multiple files would have to be downloaded to the same file on disk.
2404 class PostProcessingError(YoutubeDLError
):
2405 """Post Processing exception.
2407 This exception may be raised by PostProcessor's .run() method to
2408 indicate an error in the postprocessing task.
2411 def __init__(self
, msg
):
2412 super(PostProcessingError
, self
).__init
__(msg
)
2416 class MaxDownloadsReached(YoutubeDLError
):
2417 """ --max-downloads limit has been reached. """
2421 class UnavailableVideoError(YoutubeDLError
):
2422 """Unavailable Format exception.
2424 This exception will be thrown when a video is requested
2425 in a format that is not available for that video.
2430 class ContentTooShortError(YoutubeDLError
):
2431 """Content Too Short exception.
2433 This exception may be raised by FileDownloader objects when a file they
2434 download is too small for what the server announced first, indicating
2435 the connection was probably interrupted.
2438 def __init__(self
, downloaded
, expected
):
2439 super(ContentTooShortError
, self
).__init
__(
2440 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded
, expected
)
2443 self
.downloaded
= downloaded
2444 self
.expected
= expected
2447 class XAttrMetadataError(YoutubeDLError
):
2448 def __init__(self
, code
=None, msg
='Unknown error'):
2449 super(XAttrMetadataError
, self
).__init
__(msg
)
2453 # Parsing code and msg
2454 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2455 or 'No space left' in self
.msg
or 'Disk quota excedded' in self
.msg
):
2456 self
.reason
= 'NO_SPACE'
2457 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2458 self
.reason
= 'VALUE_TOO_LONG'
2460 self
.reason
= 'NOT_SUPPORTED'
2463 class XAttrUnavailableError(YoutubeDLError
):
2467 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2468 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2469 # expected HTTP responses to meet HTTP/1.0 or later (see also
2470 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2471 if sys
.version_info
< (3, 0):
2472 kwargs
['strict'] = True
2473 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2474 source_address
= ydl_handler
._params
.get('source_address')
2476 if source_address
is not None:
2477 # This is to workaround _create_connection() from socket where it will try all
2478 # address data from getaddrinfo() including IPv6. This filters the result from
2479 # getaddrinfo() based on the source_address value.
2480 # This is based on the cpython socket.create_connection() function.
2481 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2482 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2483 host
, port
= address
2485 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2486 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2487 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2488 if addrs
and not ip_addrs
:
2489 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2491 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2492 % (ip_version
, source_address
[0]))
2493 for res
in ip_addrs
:
2494 af
, socktype
, proto
, canonname
, sa
= res
2497 sock
= socket
.socket(af
, socktype
, proto
)
2498 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2499 sock
.settimeout(timeout
)
2500 sock
.bind(source_address
)
2502 err
= None # Explicitly break reference cycle
2504 except socket
.error
as _
:
2506 if sock
is not None:
2511 raise socket
.error('getaddrinfo returns an empty list')
2512 if hasattr(hc
, '_create_connection'):
2513 hc
._create
_connection
= _create_connection
2514 sa
= (source_address
, 0)
2515 if hasattr(hc
, 'source_address'): # Python 2.7+
2516 hc
.source_address
= sa
2518 def _hc_connect(self
, *args
, **kwargs
):
2519 sock
= _create_connection(
2520 (self
.host
, self
.port
), self
.timeout
, sa
)
2522 self
.sock
= ssl
.wrap_socket(
2523 sock
, self
.key_file
, self
.cert_file
,
2524 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2527 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Resolve youtube-dl internal pseudo-headers into plain HTTP headers.

    When the 'Youtubedl-no-compression' marker is present, a new dict is
    returned with every 'Accept-Encoding' header (case-insensitive) dropped
    and the marker itself removed. Without the marker the input mapping is
    returned unchanged (same object).
    """
    result = headers
    if 'Youtubedl-no-compression' in result:
        kept = [
            (key, value) for key, value in result.items()
            if key.lower() != 'accept-encoding']
        result = dict(kept)
        del result['Youtubedl-no-compression']

    return result
2542 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2543 """Handler for HTTP requests and responses.
2545 This class, when installed with an OpenerDirector, automatically adds
2546 the standard headers to every HTTP request and handles gzipped and
2547 deflated responses from web servers. If compression is to be avoided in
2548 a particular request, the original request in the program code only has
2549 to include the HTTP header "Youtubedl-no-compression", which will be
2550 removed before making the real request.
2552 Part of this code was copied from:
2554 http://techknack.net/python-urllib2-handlers/
2556 Andrew Rowls, the author of that code, agreed to release it to the
2560 def __init__(self
, params
, *args
, **kwargs
):
2561 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
2562 self
._params
= params
2564 def http_open(self
, req
):
2565 conn_class
= compat_http_client
.HTTPConnection
2567 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2569 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2570 del req
.headers
['Ytdl-socks-proxy']
2572 return self
.do_open(functools
.partial(
2573 _create_http_connection
, self
, conn_class
, False),
2579 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2581 return zlib
.decompress(data
)
2583 def http_request(self
, req
):
2584 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2585 # always respected by websites, some tend to give out URLs with non percent-encoded
2586 # non-ASCII characters (see telemb.py, ard.py [#3412])
2587 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2588 # To work around aforementioned issue we will replace request's original URL with
2589 # percent-encoded one
2590 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2591 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2592 url
= req
.get_full_url()
2593 url_escaped
= escape_url(url
)
2595 # Substitute URL if any change after escaping
2596 if url
!= url_escaped
:
2597 req
= update_Request(req
, url
=url_escaped
)
2599 for h
, v
in std_headers
.items():
2600 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2601 # The dict keys are capitalized because of this bug by urllib
2602 if h
.capitalize() not in req
.headers
:
2603 req
.add_header(h
, v
)
2605 req
.headers
= handle_youtubedl_headers(req
.headers
)
2607 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2608 # Python 2.6 is brain-dead when it comes to fragments
2609 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2610 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2614 def http_response(self
, req
, resp
):
2617 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2618 content
= resp
.read()
2619 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2621 uncompressed
= io
.BytesIO(gz
.read())
2622 except IOError as original_ioerror
:
2623 # There may be junk add the end of the file
2624 # See http://stackoverflow.com/q/4928560/35070 for details
2625 for i
in range(1, 1024):
2627 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2628 uncompressed
= io
.BytesIO(gz
.read())
2633 raise original_ioerror
2634 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2635 resp
.msg
= old_resp
.msg
2636 del resp
.headers
['Content-encoding']
2638 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2639 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2640 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2641 resp
.msg
= old_resp
.msg
2642 del resp
.headers
['Content-encoding']
2643 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2644 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2645 if 300 <= resp
.code
< 400:
2646 location
= resp
.headers
.get('Location')
2648 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2649 if sys
.version_info
>= (3, 0):
2650 location
= location
.encode('iso-8859-1').decode('utf-8')
2652 location
= location
.decode('utf-8')
2653 location_escaped
= escape_url(location
)
2654 if location
!= location_escaped
:
2655 del resp
.headers
['Location']
2656 if sys
.version_info
< (3, 0):
2657 location_escaped
= location_escaped
.encode('utf-8')
2658 resp
.headers
['Location'] = location_escaped
2661 https_request
= http_request
2662 https_response
= http_response
2665 def make_socks_conn_class(base_class
, socks_proxy
):
2666 assert issubclass(base_class
, (
2667 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2669 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2670 if url_components
.scheme
.lower() == 'socks5':
2671 socks_type
= ProxyType
.SOCKS5
2672 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2673 socks_type
= ProxyType
.SOCKS4
2674 elif url_components
.scheme
.lower() == 'socks4a':
2675 socks_type
= ProxyType
.SOCKS4A
2677 def unquote_if_non_empty(s
):
2680 return compat_urllib_parse_unquote_plus(s
)
2684 url_components
.hostname
, url_components
.port
or 1080,
2686 unquote_if_non_empty(url_components
.username
),
2687 unquote_if_non_empty(url_components
.password
),
2690 class SocksConnection(base_class
):
2692 self
.sock
= sockssocket()
2693 self
.sock
.setproxy(*proxy_args
)
2694 if type(self
.timeout
) in (int, float):
2695 self
.sock
.settimeout(self
.timeout
)
2696 self
.sock
.connect((self
.host
, self
.port
))
2698 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2699 if hasattr(self
, '_context'): # Python > 2.6
2700 self
.sock
= self
._context
.wrap_socket(
2701 self
.sock
, server_hostname
=self
.host
)
2703 self
.sock
= ssl
.wrap_socket(self
.sock
)
2705 return SocksConnection
2708 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
2709 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
2710 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
2711 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
2712 self
._params
= params
2714 def https_open(self
, req
):
2716 conn_class
= self
._https
_conn
_class
2718 if hasattr(self
, '_context'): # python > 2.6
2719 kwargs
['context'] = self
._context
2720 if hasattr(self
, '_check_hostname'): # python 3.x
2721 kwargs
['check_hostname'] = self
._check
_hostname
2723 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2725 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2726 del req
.headers
['Ytdl-socks-proxy']
2728 return self
.do_open(functools
.partial(
2729 _create_http_connection
, self
, conn_class
, True),
2733 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2735 See [1] for cookie file format.
2737 1. https://curl.haxx.se/docs/http-cookies.html
2739 _HTTPONLY_PREFIX
= '#HttpOnly_'
2741 _HEADER
= '''# Netscape HTTP Cookie File
2742 # This file is generated by youtube-dl. Do not edit.
2745 _CookieFileEntry
= collections
.namedtuple(
2747 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2749 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2751 Save cookies to a file.
2753 Most of the code is taken from CPython 3.8 and slightly adapted
2754 to support cookie files with UTF-8 in both python 2 and 3.
2756 if filename
is None:
2757 if self
.filename
is not None:
2758 filename
= self
.filename
2760 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2762 # Store session cookies with `expires` set to 0 instead of an empty
2765 if cookie
.expires
is None:
2768 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2769 f
.write(self
._HEADER
)
2772 if not ignore_discard
and cookie
.discard
:
2774 if not ignore_expires
and cookie
.is_expired(now
):
2780 if cookie
.domain
.startswith('.'):
2781 initial_dot
= 'TRUE'
2783 initial_dot
= 'FALSE'
2784 if cookie
.expires
is not None:
2785 expires
= compat_str(cookie
.expires
)
2788 if cookie
.value
is None:
2789 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2790 # with no name, whereas http.cookiejar regards it as a
2791 # cookie with no value.
2796 value
= cookie
.value
2798 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2799 secure
, expires
, name
, value
]) + '\n')
2801 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2802 """Load cookies from a file."""
2803 if filename
is None:
2804 if self
.filename
is not None:
2805 filename
= self
.filename
2807 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2809 def prepare_line(line
):
2810 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2811 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2812 # comments and empty lines are fine
2813 if line
.startswith('#') or not line
.strip():
2815 cookie_list
= line
.split('\t')
2816 if len(cookie_list
) != self
._ENTRY
_LEN
:
2817 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
2818 cookie
= self
._CookieFileEntry
(*cookie_list
)
2819 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
2820 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
2824 with io
.open(filename
, encoding
='utf-8') as f
:
2827 cf
.write(prepare_line(line
))
2828 except compat_cookiejar
.LoadError
as e
:
2830 'WARNING: skipping cookie file entry due to %s: %r\n'
2831 % (e
, line
), sys
.stderr
)
2834 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2835 # Session cookies are denoted by either `expires` field set to
2836 # an empty string or 0. MozillaCookieJar only recognizes the former
2837 # (see [1]). So we need force the latter to be recognized as session
2838 # cookies on our own.
2839 # Session cookies may be important for cookies-based authentication,
2840 # e.g. usually, when user does not check 'Remember me' check box while
2841 # logging in on a site, some important cookies are stored as session
2842 # cookies so that not recognizing them will result in failed login.
2843 # 1. https://bugs.python.org/issue17164
2845 # Treat `expires=0` cookies as session cookies
2846 if cookie
.expires
== 0:
2847 cookie
.expires
= None
2848 cookie
.discard
= True
2851 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
2852 def __init__(self
, cookiejar
=None):
2853 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
2855 def http_response(self
, request
, response
):
2856 # Python 2 will choke on next HTTP request in row if there are non-ASCII
2857 # characters in Set-Cookie HTTP header of last response (see
2858 # https://github.com/ytdl-org/youtube-dl/issues/6769).
2859 # In order to at least prevent crashing we will percent encode Set-Cookie
2860 # header before HTTPCookieProcessor starts processing it.
2861 # if sys.version_info < (3, 0) and response.headers:
2862 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
2863 # set_cookie = response.headers.get(set_cookie_header)
2865 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
2866 # if set_cookie != set_cookie_escaped:
2867 # del response.headers[set_cookie_header]
2868 # response.headers[set_cookie_header] = set_cookie_escaped
2869 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
2871 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
2872 https_response
= http_response
2875 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
2876 if sys
.version_info
[0] < 3:
2877 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
2878 # On python 2 urlh.geturl() may sometimes return redirect URL
2879 # as byte string instead of unicode. This workaround allows
2880 # to force it always return unicode.
2881 return compat_urllib_request
.HTTPRedirectHandler
.redirect_request(self
, req
, fp
, code
, msg
, headers
, compat_str(newurl
))
def extract_timezone(date_str):
    """Split a trailing UTC offset (or literal 'Z') off *date_str*.

    Returns a (datetime.timedelta, remaining_date_str) tuple; the delta is
    zero when no explicit numeric offset is present.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m is None:
        # No recognizable timezone suffix at all.
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # Bare 'Z' suffix: UTC, zero offset.
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # strptime cannot digest fractional seconds here; drop them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        normalized = datetime.datetime.strptime(date_str, fmt) - timezone
        return calendar.timegm(normalized.timetuple())
    except ValueError:
        # Unparsable date -> None (implicit in the original as well).
        return None
2921 def date_formats(day_first
=True):
2922 return DATE_FORMATS_DAY_FIRST
if day_first
else DATE_FORMATS_MONTH_FIRST
2925 def unified_strdate(date_str
, day_first
=True):
2926 """Return a string with the date in the format YYYYMMDD"""
2928 if date_str
is None:
2932 date_str
= date_str
.replace(',', ' ')
2933 # Remove AM/PM + timezone
2934 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
2935 _
, date_str
= extract_timezone(date_str
)
2937 for expression
in date_formats(day_first
):
2939 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
2942 if upload_date
is None:
2943 timetuple
= email
.utils
.parsedate_tz(date_str
)
2946 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
2949 if upload_date
is not None:
2950 return compat_str(upload_date
)
2953 def unified_timestamp(date_str
, day_first
=True):
2954 if date_str
is None:
2957 date_str
= re
.sub(r
'[,|]', '', date_str
)
2959 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
2960 timezone
, date_str
= extract_timezone(date_str
)
2962 # Remove AM/PM + timezone
2963 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
2965 # Remove unrecognized timezones from ISO 8601 alike timestamps
2966 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
2968 date_str
= date_str
[:-len(m
.group('tz'))]
2970 # Python only supports microseconds, so remove nanoseconds
2971 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
2973 date_str
= m
.group(1)
2975 for expression
in date_formats(day_first
):
2977 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
2978 return calendar
.timegm(dt
.timetuple())
2981 timetuple
= email
.utils
.parsedate_tz(date_str
)
2983 return calendar
.timegm(timetuple
) + pm_delta
* 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a media file extension from *url*, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
2999 def subtitles_filename(filename
, sub_lang
, sub_format
, expected_real_ext
=None):
3000 return replace_extension(filename
, sub_lang
+ '.' + sub_format
, expected_real_ext
)
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    m = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if m is not None:
        amount = int(m.group('time'))
        if m.group('sign') == '-':
            amount = -amount
        unit = m.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        unit += 's'
        return today + datetime.timedelta(**{unit: amount})
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if m is None:
        # Not a plain YYYYMMDD string -- hand it back untouched.
        return date_str
    return '-'.join(m.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Unbounded ends default to the extreme representable dates.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
3073 def platform_name():
3074 """ Returns the platform name as a compat_str """
3075 res
= platform
.platform()
3076 if isinstance(res
, bytes):
3077 res
= res
.decode(preferredencoding())
3079 assert isinstance(res
, compat_str
)
3083 def _windows_write_string(s
, out
):
3084 """ Returns True if the string was written using special methods,
3085 False if it has yet to be written out."""
3086 # Adapted from http://stackoverflow.com/a/3259271/35070
3089 import ctypes
.wintypes
3097 fileno
= out
.fileno()
3098 except AttributeError:
3099 # If the output stream doesn't have a fileno, it's virtual
3101 except io
.UnsupportedOperation
:
3102 # Some strange Windows pseudo files?
3104 if fileno
not in WIN_OUTPUT_IDS
:
3107 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3108 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3109 ('GetStdHandle', ctypes
.windll
.kernel32
))
3110 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3112 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3113 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3114 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3115 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3116 written
= ctypes
.wintypes
.DWORD(0)
3118 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3119 FILE_TYPE_CHAR
= 0x0002
3120 FILE_TYPE_REMOTE
= 0x8000
3121 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3122 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3123 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3124 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3125 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3127 def not_a_console(handle
):
3128 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3130 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3131 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3133 if not_a_console(h
):
3136 def next_nonbmp_pos(s
):
3138 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3139 except StopIteration:
3143 count
= min(next_nonbmp_pos(s
), 1024)
3145 ret
= WriteConsoleW(
3146 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3148 raise OSError('Failed to write string')
3149 if not count
: # We just wrote a non-BMP character
3150 assert written
.value
== 2
3153 assert written
.value
> 0
3154 s
= s
[written
.value
:]
3158 def write_string(s
, out
=None, encoding
=None):
3161 assert type(s
) == compat_str
3163 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3164 if _windows_write_string(s
, out
):
3167 if ('b' in getattr(out
, 'mode', '')
3168 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3169 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3171 elif hasattr(out
, 'buffer'):
3172 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3173 byt
= s
.encode(enc
, 'ignore')
3174 out
.buffer.write(byt
)
def bytes_to_intlist(bs):
    """Return the byte values of *bs* as a list of ints (Python 2/3 safe)."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # Python 2: indexing a byte string yields 1-char strings.
    return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255) back into a byte string."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
3195 # Cross-platform file locking
3196 if sys
.platform
== 'win32':
3197 import ctypes
.wintypes
3200 class OVERLAPPED(ctypes
.Structure
):
3202 ('Internal', ctypes
.wintypes
.LPVOID
),
3203 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3204 ('Offset', ctypes
.wintypes
.DWORD
),
3205 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3206 ('hEvent', ctypes
.wintypes
.HANDLE
),
3209 kernel32
= ctypes
.windll
.kernel32
3210 LockFileEx
= kernel32
.LockFileEx
3211 LockFileEx
.argtypes
= [
3212 ctypes
.wintypes
.HANDLE
, # hFile
3213 ctypes
.wintypes
.DWORD
, # dwFlags
3214 ctypes
.wintypes
.DWORD
, # dwReserved
3215 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3216 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3217 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3219 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3220 UnlockFileEx
= kernel32
.UnlockFileEx
3221 UnlockFileEx
.argtypes
= [
3222 ctypes
.wintypes
.HANDLE
, # hFile
3223 ctypes
.wintypes
.DWORD
, # dwReserved
3224 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3225 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3226 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3228 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3229 whole_low
= 0xffffffff
3230 whole_high
= 0x7fffffff
3232 def _lock_file(f
, exclusive
):
3233 overlapped
= OVERLAPPED()
3234 overlapped
.Offset
= 0
3235 overlapped
.OffsetHigh
= 0
3236 overlapped
.hEvent
= 0
3237 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3238 handle
= msvcrt
.get_osfhandle(f
.fileno())
3239 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3240 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3241 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3243 def _unlock_file(f
):
3244 assert f
._lock
_file
_overlapped
_p
3245 handle
= msvcrt
.get_osfhandle(f
.fileno())
3246 if not UnlockFileEx(handle
, 0,
3247 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3248 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3251 # Some platforms, such as Jython, is missing fcntl
3255 def _lock_file(f
, exclusive
):
3256 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3258 def _unlock_file(f
):
3259 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3261 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3263 def _lock_file(f
, exclusive
):
3264 raise IOError(UNSUPPORTED_MSG
)
3266 def _unlock_file(f
):
3267 raise IOError(UNSUPPORTED_MSG
)
3270 class locked_file(object):
3271 def __init__(self
, filename
, mode
, encoding
=None):
3272 assert mode
in ['r', 'a', 'w']
3273 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3276 def __enter__(self
):
3277 exclusive
= self
.mode
!= 'r'
3279 _lock_file(self
.f
, exclusive
)
3285 def __exit__(self
, etype
, value
, traceback
):
3287 _unlock_file(self
.f
)
3294 def write(self
, *args
):
3295 return self
.f
.write(*args
)
3297 def read(self
, *args
):
3298 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
3306 def shell_quote(args
):
3308 encoding
= get_filesystem_encoding()
3310 if isinstance(a
, bytes):
3311 # We may get a filename encoded with 'encodeFilename'
3312 a
= a
.decode(encoding
)
3313 quoted_args
.append(compat_shlex_quote(a
))
3314 return ' '.join(quoted_args
)
3317 def smuggle_url(url
, data
):
3318 """ Pass additional data in a URL for internal use. """
3320 url
, idata
= unsmuggle_url(url
, {})
3322 sdata
= compat_urllib_parse_urlencode(
3323 {'__youtubedl_smuggle': json
.dumps(data
)})
3324 return url
+ '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): split embedded data off the URL fragment.

    Returns (url, data); *default* is used when nothing was smuggled.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
def format_bytes(bytes):
    """Render a byte count as a human-readable string, e.g. '1.50KiB'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log(0) is undefined; zero bytes stay in the plain-'B' bucket.
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    return '%.2f%s' % (float(bytes) / float(1024 ** exponent), suffix)
def lookup_unit_table(unit_table, s):
    """Parse a '<number><unit>' string via *unit_table* ({unit: multiplier}).

    Returns the value in base units as an int, or None when *s* does not
    start with a recognizable number/unit pair.
    """
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept ',' as a decimal separator as well.
    num = float(m.group('num').replace(',', '.'))
    return int(num * unit_table[m.group('unit')])
3361 def parse_filesize(s
):
3365 # The lower-case forms are of course incorrect and unofficial,
3366 # but we support those too
3383 'megabytes': 1000 ** 2,
3384 'mebibytes': 1024 ** 2,
3390 'gigabytes': 1000 ** 3,
3391 'gibibytes': 1024 ** 3,
3397 'terabytes': 1000 ** 4,
3398 'tebibytes': 1024 ** 4,
3404 'petabytes': 1000 ** 5,
3405 'pebibytes': 1024 ** 5,
3411 'exabytes': 1000 ** 6,
3412 'exbibytes': 1024 ** 6,
3418 'zettabytes': 1000 ** 7,
3419 'zebibytes': 1024 ** 7,
3425 'yottabytes': 1000 ** 8,
3426 'yobibytes': 1024 ** 8,
3429 return lookup_unit_table(_UNIT_TABLE
, s
)
3438 if re
.match(r
'^[\d,.]+$', s
):
3439 return str_to_int(s
)
3450 return lookup_unit_table(_UNIT_TABLE
, s
)
def parse_resolution(s):
    """Extract width/height from strings like '1920x1080', '720p' or '4k'."""
    if s is None:
        return {}

    m = re.search(r'\b(?P<w>\d+)\s*[xXĆ]\s*(?P<h>\d+)\b', s)
    if m:
        return {
            'width': int(m.group('w')),
            'height': int(m.group('h')),
        }

    m = re.search(r'\b(\d+)[pPiI]\b', s)
    if m:
        return {'height': int(m.group(1))}

    m = re.search(r'\b([48])[kK]\b', s)
    if m:
        # '4k' -> 2160, '8k' -> 4320 (multiples of 540).
        return {'height': int(m.group(1)) * 540}

    return {}
3475 def parse_bitrate(s
):
3476 if not isinstance(s
, compat_str
):
3478 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
3480 return int(mobj
.group(1))
3483 def month_by_name(name
, lang
='en'):
3484 """ Return the number of a month by (locale-independently) English name """
3486 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
3489 return month_names
.index(name
) + 1
3494 def month_by_abbreviation(abbrev
):
3495 """ Return the number of a month by (locale-independently) English
3499 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in *xml_str* with '&amp;' (existing entities untouched)."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
3512 def setproctitle(title
):
3513 assert isinstance(title
, compat_str
)
3515 # ctypes in Jython is not complete
3516 # http://bugs.jython.org/issue2148
3517 if sys
.platform
.startswith('java'):
3521 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3525 # LoadLibrary in Windows Python 2.7.13 only expects
3526 # a bytestring, but since unicode_literals turns
3527 # every string into a unicode string, it fails.
3529 title_bytes
= title
.encode('utf-8')
3530 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3531 buf
.value
= title_bytes
3533 libc
.prctl(15, buf
, 0, 0, 0)
3534 except AttributeError:
3535 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s*; no-op when absent or s is None."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Strip *end* from the end of *s*; no-op when absent or s is None.

    Fix: the original `s[:-len(end)]` with an empty *end* evaluated to
    `s[:0]` and wiped the whole string (since `s.endswith('')` is always
    True); an empty suffix is now an explicit no-op.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    for q in ('"', "'", ):
        if s[0] == q and s[-1] == q:
            return s[1:-1]
    return s
3555 def url_basename(url
):
3556 path
= compat_urlparse
.urlparse(url
).path
3557 return path
.strip('/').split('/')[-1]
3561 return re
.match(r
'https?://[^?#&]+/', url
).group()
3564 def urljoin(base
, path
):
3565 if isinstance(path
, bytes):
3566 path
= path
.decode('utf-8')
3567 if not isinstance(path
, compat_str
) or not path
:
3569 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
3571 if isinstance(base
, bytes):
3572 base
= base
.decode('utf-8')
3573 if not isinstance(base
, compat_str
) or not re
.match(
3574 r
'^(?:https?:)?//', base
):
3576 return compat_urlparse
.urljoin(base
, path
)
3579 class HEADRequest(compat_urllib_request
.Request
):
3580 def get_method(self
):
3584 class PUTRequest(compat_urllib_request
.Request
):
3585 def get_method(self
):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to an int (optionally read attribute *get_attr* first,
    then multiply by *invscale* and floor-divide by *scale*); *default*
    when the value is missing or not numeric."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Stringify *v* unless it is None, in which case return *default*."""
    if v is None:
        return default
    return compat_str(v)
3607 def str_to_int(int_str
):
3608 """ A more relaxed version of int_or_none """
3609 if isinstance(int_str
, compat_integer_types
):
3611 elif isinstance(int_str
, compat_str
):
3612 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
3613 return int_or_none(int_str
)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to a float with optional re-scaling; *default* on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def bool_or_none(v, default=None):
    """Return *v* only when it is a genuine bool; otherwise *default*."""
    if isinstance(v, bool):
        return v
    return default
3629 def strip_or_none(v
, default
=None):
3630 return v
.strip() if isinstance(v
, compat_str
) else default
def url_or_none(url):
    """Return *url* only when it looks like an absolute or protocol-relative URL."""
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None
3640 def parse_duration(s
):
3641 if not isinstance(s
, compat_basestring
):
3646 days
, hours
, mins
, secs
, ms
= [None] * 5
3647 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3649 days
, hours
, mins
, secs
, ms
= m
.groups()
3654 [0-9]+\s*y(?:ears?)?\s*
3657 [0-9]+\s*m(?:onths?)?\s*
3660 [0-9]+\s*w(?:eeks?)?\s*
3663 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3667 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3670 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3673 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3676 days
, hours
, mins
, secs
, ms
= m
.groups()
3678 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3680 hours
, mins
= m
.groups()
3686 duration
+= float(secs
)
3688 duration
+= float(mins
) * 60
3690 duration
+= float(hours
) * 60 * 60
3692 duration
+= float(days
) * 24 * 60 * 60
3694 duration
+= float(ms
)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file's real extension.

    When *expected_real_ext* is given and the actual extension differs,
    *ext* is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the extension of *filename* for *ext*.

    When *expected_real_ext* is given and the actual extension differs,
    *ext* is appended to the full filename instead of replacing.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        base = name
    else:
        base = filename
    return '{0}.{1}'.format(base, ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Fix: the original used a mutable default argument (args=[]); the
    default is now None and normalized inside (backward compatible).
    """
    try:
        subprocess.Popen(
            [exe] + (args or []),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not runnable.
        return False
    return exe
3723 def get_exe_version(exe
, args
=['--version'],
3724 version_re
=None, unrecognized
='present'):
3725 """ Returns the version of the specified executable,
3726 or False if the executable is not present """
3728 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3729 # SIGTTOU if youtube-dl is run in the background.
3730 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3731 out
, _
= subprocess
.Popen(
3732 [encodeArgument(exe
)] + args
,
3733 stdin
=subprocess
.PIPE
,
3734 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate()
3737 if isinstance(out
, bytes): # Python 2.x
3738 out
= out
.decode('ascii', 'ignore')
3739 return detect_exe_version(out
, version_re
, unrecognized
)
3742 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3743 assert isinstance(output
, compat_str
)
3744 if version_re
is None:
3745 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3746 m
= re
.search(version_re
, output
)
3753 class PagedList(object):
3755 # This is only useful for tests
3756 return len(self
.getslice())
3759 class OnDemandPagedList(PagedList
):
3760 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
3761 self
._pagefunc
= pagefunc
3762 self
._pagesize
= pagesize
3763 self
._use
_cache
= use_cache
3767 def getslice(self
, start
=0, end
=None):
3769 for pagenum
in itertools
.count(start
// self
._pagesize
):
3770 firstid
= pagenum
* self
._pagesize
3771 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
3772 if start
>= nextfirstid
:
3777 page_results
= self
._cache
.get(pagenum
)
3778 if page_results
is None:
3779 page_results
= list(self
._pagefunc
(pagenum
))
3781 self
._cache
[pagenum
] = page_results
3784 start
% self
._pagesize
3785 if firstid
<= start
< nextfirstid
3789 ((end
- 1) % self
._pagesize
) + 1
3790 if (end
is not None and firstid
<= end
<= nextfirstid
)
3793 if startv
!= 0 or endv
is not None:
3794 page_results
= page_results
[startv
:endv
]
3795 res
.extend(page_results
)
3797 # A little optimization - if current page is not "full", ie. does
3798 # not contain page_size videos then we can assume that this page
3799 # is the last one - there are no more ids on further pages -
3800 # i.e. no need to query again.
3801 if len(page_results
) + startv
< self
._pagesize
:
3804 # If we got the whole page, but the next page is not interesting,
3805 # break out early as well
3806 if end
== nextfirstid
:
3811 class InAdvancePagedList(PagedList
):
3812 def __init__(self
, pagefunc
, pagecount
, pagesize
):
3813 self
._pagefunc
= pagefunc
3814 self
._pagecount
= pagecount
3815 self
._pagesize
= pagesize
3817 def getslice(self
, start
=0, end
=None):
3819 start_page
= start
// self
._pagesize
3821 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
3822 skip_elems
= start
- start_page
* self
._pagesize
3823 only_more
= None if end
is None else end
- start
3824 for pagenum
in range(start_page
, end_page
):
3825 page
= list(self
._pagefunc
(pagenum
))
3827 page
= page
[skip_elems
:]
3829 if only_more
is not None:
3830 if len(page
) < only_more
:
3831 only_more
-= len(page
)
3833 page
= page
[:only_more
]
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
3856 def escape_rfc3986(s
):
3857 """Escape non-ASCII characters as suggested by RFC 3986"""
3858 if sys
.version_info
< (3, 0) and isinstance(s
, compat_str
):
3859 s
= s
.encode('utf-8')
3860 return compat_urllib_parse
.quote(s
, b
"%/;:@&=+$,!~*'()?#[]")
3863 def escape_url(url
):
3864 """Escape URL as suggested by RFC 3986"""
3865 url_parsed
= compat_urllib_parse_urlparse(url
)
3866 return url_parsed
._replace
(
3867 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
3868 path
=escape_rfc3986(url_parsed
.path
),
3869 params
=escape_rfc3986(url_parsed
.params
),
3870 query
=escape_rfc3986(url_parsed
.query
),
3871 fragment
=escape_rfc3986(url_parsed
.fragment
)
3875 def read_batch_urls(batch_fd
):
3877 if not isinstance(url
, compat_str
):
3878 url
= url
.decode('utf-8', 'replace')
3879 BOM_UTF8
= '\xef\xbb\xbf'
3880 if url
.startswith(BOM_UTF8
):
3881 url
= url
[len(BOM_UTF8
):]
3883 if url
.startswith(('#', ';', ']')):
3887 with contextlib
.closing(batch_fd
) as fd
:
3888 return [url
for url
in map(fixup
, fd
) if url
]
3891 def urlencode_postdata(*args
, **kargs
):
3892 return compat_urllib_parse_urlencode(*args
, **kargs
).encode('ascii')
3895 def update_url_query(url
, query
):
3898 parsed_url
= compat_urlparse
.urlparse(url
)
3899 qs
= compat_parse_qs(parsed_url
.query
)
3901 return compat_urlparse
.urlunparse(parsed_url
._replace
(
3902 query
=compat_urllib_parse_urlencode(qs
, True)))
3905 def update_Request(req
, url
=None, data
=None, headers
={}, query
={}):
3906 req_headers
= req
.headers
.copy()
3907 req_headers
.update(headers
)
3908 req_data
= data
or req
.data
3909 req_url
= update_url_query(url
or req
.get_full_url(), query
)
3910 req_get_method
= req
.get_method()
3911 if req_get_method
== 'HEAD':
3912 req_type
= HEADRequest
3913 elif req_get_method
== 'PUT':
3914 req_type
= PUTRequest
3916 req_type
= compat_urllib_request
.Request
3918 req_url
, data
=req_data
, headers
=req_headers
,
3919 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
3920 if hasattr(req
, 'timeout'):
3921 new_req
.timeout
= req
.timeout
3925 def _multipart_encode_impl(data
, boundary
):
3926 content_type
= 'multipart/form-data; boundary=%s' % boundary
3929 for k
, v
in data
.items():
3930 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
3931 if isinstance(k
, compat_str
):
3932 k
= k
.encode('utf-8')
3933 if isinstance(v
, compat_str
):
3934 v
= v
.encode('utf-8')
3935 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3936 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3937 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
3938 if boundary
.encode('ascii') in content
:
3939 raise ValueError('Boundary overlaps with data')
3942 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
3944 return out
, content_type
3947 def multipart_encode(data
, boundary
=None):
3949 Encode a dict to RFC 7578-compliant form-data
3952 A dict where keys and values can be either Unicode or bytes-like
3955 If specified a Unicode object, it's used as the boundary. Otherwise
3956 a random boundary is generated.
3958 Reference: https://tools.ietf.org/html/rfc7578
3960 has_specified_boundary
= boundary
is not None
3963 if boundary
is None:
3964 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
3967 out
, content_type
= _multipart_encode_impl(data
, boundary
)
3970 if has_specified_boundary
:
3974 return out
, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Fetch the first usable value for one key or a sequence of candidate keys.

    With skip_false_values (the default), falsy values ('', 0, [], ...) are
    skipped as well as missing/None entries.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d or d[key] is None or skip_false_values and not d[key]:
            continue
        return d[key]
    return default
def try_get(src, getter, expected_type=None):
    """Apply one or several getter callables to *src*, swallowing the usual
    lookup errors; optionally require the result to be of *expected_type*."""
    if not isinstance(getter, (list, tuple)):
        getter = [getter]
    for get in getter:
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(v, expected_type):
            return v
4000 def merge_dicts(*dicts
):
4002 for a_dict
in dicts
:
4003 for k
, v
in a_dict
.items():
4007 or (isinstance(v
, compat_str
) and v
4008 and isinstance(merged
[k
], compat_str
)
4009 and not merged
[k
])):
4014 def encode_compat_str(string
, encoding
=preferredencoding(), errors
='strict'):
4015 return string
if isinstance(string
, compat_str
) else compat_str(string
, encoding
, errors
)
4027 TV_PARENTAL_GUIDELINES
= {
4037 def parse_age_limit(s
):
4039 return s
if 0 <= s
<= 21 else None
4040 if not isinstance(s
, compat_basestring
):
4042 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
4044 return int(m
.group('age'))
4046 return US_RATINGS
[s
]
4047 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
4049 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
4053 def strip_jsonp(code
):
4056 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
4057 (?:\s*&&\s*(?P=func_name))?
4058 \s*\(\s*(?P<callback_data>.*)\);?
4059 \s*?(?://[^\n]*)*$''',
4060 r
'\g<callback_data>', code
)
4063 def js_to_json(code
):
4064 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
4065 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
4067 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
4068 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
4073 if v
in ('true', 'false', 'null'):
4075 elif v
.startswith('/*') or v
.startswith('//') or v
== ',':
4078 if v
[0] in ("'", '"'):
4079 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
4084 }.get(m
.group(0), m
.group(0)), v
[1:-1])
4086 for regex
, base
in INTEGER_TABLE
:
4087 im
= re
.match(regex
, v
)
4089 i
= int(im
.group(1), base
)
4090 return '"%d":' % i
if v
.endswith(':') else '%d' % i
4094 return re
.sub(r
'''(?sx)
4095 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4096 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4097 {comment}|,(?={skip}[\]}}])|
4098 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4099 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4101 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
4104 def qualities(quality_ids
):
4105 """ Get a numeric quality value out of a list of possible values """
4108 return quality_ids
.index(qid
)
4114 DEFAULT_OUTTMPL
= '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ellipses = '...'
    return s if len(s) <= length else s[:length - len(ellipses)] + ellipses
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(int(piece) for piece in re.split(r'[-.]', v))
def is_outdated_version(version, limit, assume_new=True):
    """Compare dotted version strings; an empty/unparsable *version*
    yields `not assume_new` instead of raising."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updateable when running from a zip bundle or a frozen executable
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Return the text form of an exception, safe on both Python 2 and 3."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
4161 def mimetype2ext(mt
):
4167 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
4168 # it's the most popular one
4169 'audio/mpeg': 'mp3',
4174 _
, _
, res
= mt
.rpartition('/')
4175 res
= res
.split(';')[0].strip().lower()
4179 'smptett+xml': 'tt',
4183 'x-mp4-fragmented': 'mp4',
4184 'x-ms-sami': 'sami',
4187 'x-mpegurl': 'm3u8',
4188 'vnd.apple.mpegurl': 'm3u8',
4192 'vnd.ms-sstr+xml': 'ism',
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into {'vcodec': ..., 'acodec': ...}.

    See http://tools.ietf.org/html/rfc6381. Returns {} when nothing can
    be determined from *codecs_str*.
    """
    if not codecs_str:
        return {}
    # NOTE: the previous filter/map chain shadowed the builtin `str`
    # inside its lambda; a comprehension avoids that.
    split_codecs = [
        c.strip() for c in codecs_str.strip().strip(',').split(',') if c.strip()]
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # nothing recognized: a bare "video, audio" pair is taken on faith
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess the file extension for a response, preferring the
    Content-Disposition filename over the Content-Type mimetype."""
    header = url_handle.headers.get

    disposition = header('Content-Disposition')
    if disposition:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = m and determine_ext(m.group('filename'), default_ext=None)
        if ext:
            return ext

    return mimetype2ext(header('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI embedding *data* as base64."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer age limit, or no restriction on the content: allow
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # (BOM, encoding) pairs; the UTF-32 BOMs are tested before their
    # UTF-16 prefixes so they are not misidentified
    boms = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    text = None
    for bom, encoding in boms:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if text is None:
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    if ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # widest cell per column decides that column's width
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*rows)]
    # left-justify every column but the last, one space between columns
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    """Evaluate one filter expression (e.g. "duration > 60",
    "uploader = 'x'", "!is_live") against the dict *dct*."""
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # key <op>[?] value — value may be a (filesize-suffixed) number, a
    # quoted string, or a bare word
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None
                or m.group('strval') is not None
                # If the original field is a string and matching comparisonvalue is
                # a number we should respect the origin of the original field
                # and process comparison value as a string (see
                # https://github.com/ytdl-org/youtube-dl/issues/11082).
                or actual_value is not None and m.group('intval') is not None
                and isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # un-escape the quote character inside quoted values
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # not a plain int: try it as a filesize ("500k", "1.2MiB")
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # "op?" means a missing field passes the filter
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in parts)
def match_filter_func(filter_str):
    """Build a --match-filter callable: returns None when the info dict
    passes *filter_str*, otherwise a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds, or None."""
    if not time_expr:
        return None

    # plain offset, optionally suffixed with "s"
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # clock format HH:MM:SS(.fff) — a ":fff" frame part is read as ".fff"
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
4411 def dfxp2srt(dfxp_data):
4413 @param dfxp_data A
bytes-like
object containing DFXP data
4414 @returns A
unicode object containing converted SRT data
4416 LEGACY_NAMESPACES = (
4417 (b'http://www.w3.org/ns/ttml', [
4418 b'http://www.w3.org/2004/11/ttaf1',
4419 b'http://www.w3.org/2006/04/ttaf1',
4420 b'http://www.w3.org/2006/10/ttaf1',
4422 (b'http://www.w3.org/ns/ttml#styling', [
4423 b'http://www.w3.org/ns/ttml#style',
4427 SUPPORTED_STYLING = [
4436 _x = functools.partial(xpath_with_ns, ns_map={
4437 'xml': 'http://www.w3.org/XML/1998/namespace',
4438 'ttml': 'http://www.w3.org/ns/ttml',
4439 'tts': 'http://www.w3.org/ns/ttml#styling',
4445 class TTMLPElementParser(object):
4447 _unclosed_elements = []
4448 _applied_styles = []
4450 def start(self, tag, attrib):
4451 if tag in (_x('ttml:br'), 'br'):
4454 unclosed_elements = []
4456 element_style_id = attrib.get('style')
4458 style.update(default_style)
4459 if element_style_id:
4460 style.update(styles.get(element_style_id, {}))
4461 for prop in SUPPORTED_STYLING:
4462 prop_val = attrib.get(_x('tts:' + prop))
4464 style[prop] = prop_val
4467 for k, v in sorted(style.items()):
4468 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4471 font += ' color="%s"' % v
4472 elif k == 'fontSize':
4473 font += ' size="%s"' % v
4474 elif k == 'fontFamily':
4475 font += ' face="%s"' % v
4476 elif k == 'fontWeight' and v == 'bold':
4478 unclosed_elements.append('b')
4479 elif k == 'fontStyle' and v == 'italic':
4481 unclosed_elements.append('i')
4482 elif k == 'textDecoration' and v == 'underline':
4484 unclosed_elements.append('u')
4486 self._out += '<font' + font + '>'
4487 unclosed_elements.append('font')
4489 if self._applied_styles:
4490 applied_style.update(self._applied_styles[-1])
4491 applied_style.update(style)
4492 self._applied_styles.append(applied_style)
4493 self._unclosed_elements.append(unclosed_elements)
4496 if tag not in (_x('ttml:br'), 'br'):
4497 unclosed_elements = self._unclosed_elements.pop()
4498 for element in reversed(unclosed_elements):
4499 self._out += '</%s>' % element
4500 if unclosed_elements and self._applied_styles:
4501 self._applied_styles.pop()
4503 def data(self, data):
4507 return self._out.strip()
4509 def parse_node(node):
4510 target = TTMLPElementParser()
4511 parser = xml.etree.ElementTree.XMLParser(target=target)
4512 parser.feed(xml.etree.ElementTree.tostring(node))
4513 return parser.close()
4515 for k, v in LEGACY_NAMESPACES:
4517 dfxp_data = dfxp_data.replace(ns, k)
4519 dfxp = compat_etree_fromstring(dfxp_data)
4521 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4524 raise ValueError('Invalid dfxp/TTML subtitle')
4528 for style in dfxp.findall(_x('.//ttml:style')):
4529 style_id = style.get('id') or style.get(_x('xml:id'))
4532 parent_style_id = style.get('style')
4534 if parent_style_id not in styles:
4537 styles[style_id] = styles[parent_style_id].copy()
4538 for prop in SUPPORTED_STYLING:
4539 prop_val = style.get(_x('tts:' + prop))
4541 styles.setdefault(style_id, {})[prop] = prop_val
4547 for p in ('body', 'div'):
4548 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4551 style = styles.get(ele.get('style'))
4554 default_style.update(style)
4556 for para, index in zip(paras, itertools.count(1)):
4557 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4558 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4559 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4560 if begin_time is None:
4565 end_time = begin_time + dur
4566 out.append('%d\n%s --> %s\n%s\n\n' % (
4568 srt_subtitles_timecode(begin_time),
4569 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Map an option from *params* onto a [flag, value] argv fragment."""
    value = params.get(param)
    if value:
        value = compat_str(value)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Map a boolean option onto argv.

    With *separator* a single "flag<sep>value" token is emitted,
    otherwise flag and value as two tokens. A missing param yields [].
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals *expected_value*."""
    matches = params.get(param) == expected_value
    return [command_option] if matches else []
def cli_configuration_args(params, param, default=[]):
    """Fetch an extra-args list from *params*, or *default* when unset.

    NOTE(review): the mutable [] default is shared between calls —
    callers must not mutate the returned default; confirm before changing.
    """
    extra = params.get(param)
    if extra is None:
        return default
    assert isinstance(extra, list)
    return extra
4605 class ISO639Utils(object):
4606 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4665 'iw': 'heb', # Replaced by he in 1989 revision
4675 'in': 'ind', # Replaced by id in 1989 revision
4790 'ji': 'yid', # Replaced by yi in 1989 revision
4798 def short2long(cls, code):
4799 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4800 return cls._lang_map.get(code[:2])
4803 def long2short(cls, code):
4804 """Convert language code from ISO 639-2/T to ISO 639-1"""
4805 for short_name, long_name in cls._lang_map.items():
4806 if long_name == code:
4810 class ISO3166Utils(object):
4811 # From http://data.okfn.org/data/core/country-list
4813 'AF': 'Afghanistan',
4814 'AX': 'Ć
land Islands',
4817 'AS': 'American Samoa',
4822 'AG': 'Antigua and Barbuda',
4839 'BO': 'Bolivia, Plurinational State of',
4840 'BQ': 'Bonaire, Sint Eustatius and Saba',
4841 'BA': 'Bosnia and Herzegovina',
4843 'BV': 'Bouvet Island',
4845 'IO': 'British Indian Ocean Territory',
4846 'BN': 'Brunei Darussalam',
4848 'BF': 'Burkina Faso',
4854 'KY': 'Cayman Islands',
4855 'CF': 'Central African Republic',
4859 'CX': 'Christmas Island',
4860 'CC': 'Cocos (Keeling) Islands',
4864 'CD': 'Congo, the Democratic Republic of the',
4865 'CK': 'Cook Islands',
4867 'CI': 'CĆ“te d\'Ivoire',
4872 'CZ': 'Czech Republic',
4876 'DO': 'Dominican Republic',
4879 'SV': 'El Salvador',
4880 'GQ': 'Equatorial Guinea',
4884 'FK': 'Falkland Islands (Malvinas)',
4885 'FO': 'Faroe Islands',
4889 'GF': 'French Guiana',
4890 'PF': 'French Polynesia',
4891 'TF': 'French Southern Territories',
4906 'GW': 'Guinea-Bissau',
4909 'HM': 'Heard Island and McDonald Islands',
4910 'VA': 'Holy See (Vatican City State)',
4917 'IR': 'Iran, Islamic Republic of',
4920 'IM': 'Isle of Man',
4930 'KP': 'Korea, Democratic People\'s Republic of',
4931 'KR': 'Korea, Republic of',
4934 'LA': 'Lao People\'s Democratic Republic',
4940 'LI': 'Liechtenstein',
4944 'MK': 'Macedonia, the Former Yugoslav Republic of',
4951 'MH': 'Marshall Islands',
4957 'FM': 'Micronesia, Federated States of',
4958 'MD': 'Moldova, Republic of',
4969 'NL': 'Netherlands',
4970 'NC': 'New Caledonia',
4971 'NZ': 'New Zealand',
4976 'NF': 'Norfolk Island',
4977 'MP': 'Northern Mariana Islands',
4982 'PS': 'Palestine, State of',
4984 'PG': 'Papua New Guinea',
4987 'PH': 'Philippines',
4991 'PR': 'Puerto Rico',
4995 'RU': 'Russian Federation',
4997 'BL': 'Saint BarthƩlemy',
4998 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4999 'KN': 'Saint Kitts and Nevis',
5000 'LC': 'Saint Lucia',
5001 'MF': 'Saint Martin (French part)',
5002 'PM': 'Saint Pierre and Miquelon',
5003 'VC': 'Saint Vincent and the Grenadines',
5006 'ST': 'Sao Tome and Principe',
5007 'SA': 'Saudi Arabia',
5011 'SL': 'Sierra Leone',
5013 'SX': 'Sint Maarten (Dutch part)',
5016 'SB': 'Solomon Islands',
5018 'ZA': 'South Africa',
5019 'GS': 'South Georgia and the South Sandwich Islands',
5020 'SS': 'South Sudan',
5025 'SJ': 'Svalbard and Jan Mayen',
5028 'CH': 'Switzerland',
5029 'SY': 'Syrian Arab Republic',
5030 'TW': 'Taiwan, Province of China',
5032 'TZ': 'Tanzania, United Republic of',
5034 'TL': 'Timor-Leste',
5038 'TT': 'Trinidad and Tobago',
5041 'TM': 'Turkmenistan',
5042 'TC': 'Turks and Caicos Islands',
5046 'AE': 'United Arab Emirates',
5047 'GB': 'United Kingdom',
5048 'US': 'United States',
5049 'UM': 'United States Minor Outlying Islands',
5053 'VE': 'Venezuela, Bolivarian Republic of',
5055 'VG': 'Virgin Islands, British',
5056 'VI': 'Virgin Islands, U.S.',
5057 'WF': 'Wallis and Futuna',
5058 'EH': 'Western Sahara',
5065 def short2full(cls, code):
5066 """Convert an ISO 3166-2 country code to the corresponding full name"""
5067 return cls._country_map.get(code.upper())
5070 class GeoUtils(object):
5071 # Major IPv4 address blocks per country
5073 'AD': '46.172.224.0/19',
5074 'AE': '94.200.0.0/13',
5075 'AF': '149.54.0.0/17',
5076 'AG': '209.59.64.0/18',
5077 'AI': '204.14.248.0/21',
5078 'AL': '46.99.0.0/16',
5079 'AM': '46.70.0.0/15',
5080 'AO': '105.168.0.0/13',
5081 'AP': '182.50.184.0/21',
5082 'AQ': '23.154.160.0/24',
5083 'AR': '181.0.0.0/12',
5084 'AS': '202.70.112.0/20',
5085 'AT': '77.116.0.0/14',
5086 'AU': '1.128.0.0/11',
5087 'AW': '181.41.0.0/18',
5088 'AX': '185.217.4.0/22',
5089 'AZ': '5.197.0.0/16',
5090 'BA': '31.176.128.0/17',
5091 'BB': '65.48.128.0/17',
5092 'BD': '114.130.0.0/16',
5094 'BF': '102.178.0.0/15',
5095 'BG': '95.42.0.0/15',
5096 'BH': '37.131.0.0/17',
5097 'BI': '154.117.192.0/18',
5098 'BJ': '137.255.0.0/16',
5099 'BL': '185.212.72.0/23',
5100 'BM': '196.12.64.0/18',
5101 'BN': '156.31.0.0/16',
5102 'BO': '161.56.0.0/16',
5103 'BQ': '161.0.80.0/20',
5104 'BR': '191.128.0.0/12',
5105 'BS': '24.51.64.0/18',
5106 'BT': '119.2.96.0/19',
5107 'BW': '168.167.0.0/16',
5108 'BY': '178.120.0.0/13',
5109 'BZ': '179.42.192.0/18',
5110 'CA': '99.224.0.0/11',
5111 'CD': '41.243.0.0/16',
5112 'CF': '197.242.176.0/21',
5113 'CG': '160.113.0.0/16',
5114 'CH': '85.0.0.0/13',
5115 'CI': '102.136.0.0/14',
5116 'CK': '202.65.32.0/19',
5117 'CL': '152.172.0.0/14',
5118 'CM': '102.244.0.0/14',
5119 'CN': '36.128.0.0/10',
5120 'CO': '181.240.0.0/12',
5121 'CR': '201.192.0.0/12',
5122 'CU': '152.206.0.0/15',
5123 'CV': '165.90.96.0/19',
5124 'CW': '190.88.128.0/17',
5125 'CY': '31.153.0.0/16',
5126 'CZ': '88.100.0.0/14',
5128 'DJ': '197.241.0.0/17',
5129 'DK': '87.48.0.0/12',
5130 'DM': '192.243.48.0/20',
5131 'DO': '152.166.0.0/15',
5132 'DZ': '41.96.0.0/12',
5133 'EC': '186.68.0.0/15',
5134 'EE': '90.190.0.0/15',
5135 'EG': '156.160.0.0/11',
5136 'ER': '196.200.96.0/20',
5137 'ES': '88.0.0.0/11',
5138 'ET': '196.188.0.0/14',
5139 'EU': '2.16.0.0/13',
5140 'FI': '91.152.0.0/13',
5141 'FJ': '144.120.0.0/16',
5142 'FK': '80.73.208.0/21',
5143 'FM': '119.252.112.0/20',
5144 'FO': '88.85.32.0/19',
5146 'GA': '41.158.0.0/15',
5148 'GD': '74.122.88.0/21',
5149 'GE': '31.146.0.0/16',
5150 'GF': '161.22.64.0/18',
5151 'GG': '62.68.160.0/19',
5152 'GH': '154.160.0.0/12',
5153 'GI': '95.164.0.0/16',
5154 'GL': '88.83.0.0/19',
5155 'GM': '160.182.0.0/15',
5156 'GN': '197.149.192.0/18',
5157 'GP': '104.250.0.0/19',
5158 'GQ': '105.235.224.0/20',
5159 'GR': '94.64.0.0/13',
5160 'GT': '168.234.0.0/16',
5161 'GU': '168.123.0.0/16',
5162 'GW': '197.214.80.0/20',
5163 'GY': '181.41.64.0/18',
5164 'HK': '113.252.0.0/14',
5165 'HN': '181.210.0.0/16',
5166 'HR': '93.136.0.0/13',
5167 'HT': '148.102.128.0/17',
5168 'HU': '84.0.0.0/14',
5169 'ID': '39.192.0.0/10',
5170 'IE': '87.32.0.0/12',
5171 'IL': '79.176.0.0/13',
5172 'IM': '5.62.80.0/20',
5173 'IN': '117.192.0.0/10',
5174 'IO': '203.83.48.0/21',
5175 'IQ': '37.236.0.0/14',
5176 'IR': '2.176.0.0/12',
5177 'IS': '82.221.0.0/16',
5178 'IT': '79.0.0.0/10',
5179 'JE': '87.244.64.0/18',
5180 'JM': '72.27.0.0/17',
5181 'JO': '176.29.0.0/16',
5182 'JP': '133.0.0.0/8',
5183 'KE': '105.48.0.0/12',
5184 'KG': '158.181.128.0/17',
5185 'KH': '36.37.128.0/17',
5186 'KI': '103.25.140.0/22',
5187 'KM': '197.255.224.0/20',
5188 'KN': '198.167.192.0/19',
5189 'KP': '175.45.176.0/22',
5190 'KR': '175.192.0.0/10',
5191 'KW': '37.36.0.0/14',
5192 'KY': '64.96.0.0/15',
5193 'KZ': '2.72.0.0/13',
5194 'LA': '115.84.64.0/18',
5195 'LB': '178.135.0.0/16',
5196 'LC': '24.92.144.0/20',
5197 'LI': '82.117.0.0/19',
5198 'LK': '112.134.0.0/15',
5199 'LR': '102.183.0.0/16',
5200 'LS': '129.232.0.0/17',
5201 'LT': '78.56.0.0/13',
5202 'LU': '188.42.0.0/16',
5203 'LV': '46.109.0.0/16',
5204 'LY': '41.252.0.0/14',
5205 'MA': '105.128.0.0/11',
5206 'MC': '88.209.64.0/18',
5207 'MD': '37.246.0.0/16',
5208 'ME': '178.175.0.0/17',
5209 'MF': '74.112.232.0/21',
5210 'MG': '154.126.0.0/17',
5211 'MH': '117.103.88.0/21',
5212 'MK': '77.28.0.0/15',
5213 'ML': '154.118.128.0/18',
5214 'MM': '37.111.0.0/17',
5215 'MN': '49.0.128.0/17',
5216 'MO': '60.246.0.0/16',
5217 'MP': '202.88.64.0/20',
5218 'MQ': '109.203.224.0/19',
5219 'MR': '41.188.64.0/18',
5220 'MS': '208.90.112.0/22',
5221 'MT': '46.11.0.0/16',
5222 'MU': '105.16.0.0/12',
5223 'MV': '27.114.128.0/18',
5224 'MW': '102.70.0.0/15',
5225 'MX': '187.192.0.0/11',
5226 'MY': '175.136.0.0/13',
5227 'MZ': '197.218.0.0/15',
5228 'NA': '41.182.0.0/16',
5229 'NC': '101.101.0.0/18',
5230 'NE': '197.214.0.0/18',
5231 'NF': '203.17.240.0/22',
5232 'NG': '105.112.0.0/12',
5233 'NI': '186.76.0.0/15',
5234 'NL': '145.96.0.0/11',
5235 'NO': '84.208.0.0/13',
5236 'NP': '36.252.0.0/15',
5237 'NR': '203.98.224.0/19',
5238 'NU': '49.156.48.0/22',
5239 'NZ': '49.224.0.0/14',
5240 'OM': '5.36.0.0/15',
5241 'PA': '186.72.0.0/15',
5242 'PE': '186.160.0.0/14',
5243 'PF': '123.50.64.0/18',
5244 'PG': '124.240.192.0/19',
5245 'PH': '49.144.0.0/13',
5246 'PK': '39.32.0.0/11',
5247 'PL': '83.0.0.0/11',
5248 'PM': '70.36.0.0/20',
5249 'PR': '66.50.0.0/16',
5250 'PS': '188.161.0.0/16',
5251 'PT': '85.240.0.0/13',
5252 'PW': '202.124.224.0/20',
5253 'PY': '181.120.0.0/14',
5254 'QA': '37.210.0.0/15',
5255 'RE': '102.35.0.0/16',
5256 'RO': '79.112.0.0/13',
5257 'RS': '93.86.0.0/15',
5258 'RU': '5.136.0.0/13',
5259 'RW': '41.186.0.0/16',
5260 'SA': '188.48.0.0/13',
5261 'SB': '202.1.160.0/19',
5262 'SC': '154.192.0.0/11',
5263 'SD': '102.120.0.0/13',
5264 'SE': '78.64.0.0/12',
5265 'SG': '8.128.0.0/10',
5266 'SI': '188.196.0.0/14',
5267 'SK': '78.98.0.0/15',
5268 'SL': '102.143.0.0/17',
5269 'SM': '89.186.32.0/19',
5270 'SN': '41.82.0.0/15',
5271 'SO': '154.115.192.0/18',
5272 'SR': '186.179.128.0/17',
5273 'SS': '105.235.208.0/21',
5274 'ST': '197.159.160.0/19',
5275 'SV': '168.243.0.0/16',
5276 'SX': '190.102.0.0/20',
5278 'SZ': '41.84.224.0/19',
5279 'TC': '65.255.48.0/20',
5280 'TD': '154.68.128.0/19',
5281 'TG': '196.168.0.0/14',
5282 'TH': '171.96.0.0/13',
5283 'TJ': '85.9.128.0/18',
5284 'TK': '27.96.24.0/21',
5285 'TL': '180.189.160.0/20',
5286 'TM': '95.85.96.0/19',
5287 'TN': '197.0.0.0/11',
5288 'TO': '175.176.144.0/21',
5289 'TR': '78.160.0.0/11',
5290 'TT': '186.44.0.0/15',
5291 'TV': '202.2.96.0/19',
5292 'TW': '120.96.0.0/11',
5293 'TZ': '156.156.0.0/14',
5294 'UA': '37.52.0.0/14',
5295 'UG': '102.80.0.0/13',
5297 'UY': '167.56.0.0/13',
5298 'UZ': '84.54.64.0/18',
5299 'VA': '212.77.0.0/19',
5300 'VC': '207.191.240.0/21',
5301 'VE': '186.88.0.0/13',
5302 'VG': '66.81.192.0/20',
5303 'VI': '146.226.0.0/16',
5304 'VN': '14.160.0.0/11',
5305 'VU': '202.80.32.0/20',
5306 'WF': '117.20.32.0/21',
5307 'WS': '202.4.32.0/19',
5308 'YE': '134.35.0.0/16',
5309 'YT': '41.242.116.0/22',
5310 'ZA': '41.0.0.0/11',
5311 'ZM': '102.144.0.0/13',
5312 'ZW': '102.177.192.0/18',
5316 def random_ipv4(cls, code_or_block):
5317 if len(code_or_block) == 2:
5318 block = cls._country_ip_map.get(code_or_block.upper())
5322 block = code_or_block
5323 addr, preflen = block.split('/')
5324 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5325 addr_max = addr_min | (0xffffffff >> int(preflen))
5326 return compat_str(socket.inet_ntoa(
5327 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets each request override the configured proxy
    via a 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # a per-request proxy (set by the caller) beats the handler-wide one
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            # hand SOCKS proxies over via a header instead of urllib
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5355 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5356 # released into Public Domain
5357 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    # emit 32 bits at a time, most significant chunk first
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    # left-pad to a multiple of 4 so we can unpack 32 bits at a time
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    """Encrypt *data* with OHDave's RSA variant (http://www.ohdave.com/rsa/).

    data is a bytes-like object; exponent/modulus are the RSA parameters
    e and N (both integers). Returns the ciphertext as a lowercase hex
    string. Limitation: supports one block encryption only.
    """
    # the payload is interpreted little-endian, hence the reversal
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # PKCS#1 v1.5 (RFC 8017, EME-PKCS1-v1_5) requires the PS filler
    # octets to be NONZERO: a zero byte marks the end of the padding, so
    # the previous randint(0, 254) could truncate the message on decrypt.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Render non-negative *num* in base *n*, using *table* as digits."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Undo the common P.A.C.K.E.R.-style JS obfuscation."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # map each base-n token back to its original symbol (or itself)
    symbol_table = {}
    for index in reversed(range(count)):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Shift every character of *s* found in *alphabet* by *shift*
    positions (wrapping around); other characters pass through."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(ch):
        if ch not in alphabet:
            return ch
        return alphabet[(alphabet.index(ch) + shift) % size]

    return ''.join(map(rotate, s))
def rot47(s):
    # ROT47: caesar-shift the printable ASCII range '!'..'~' by 47
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('K=v,K2="v,2",...') into a dict."""
    pattern = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    parsed = {}
    for key, raw in re.findall(pattern, attrib):
        # quoted values may contain commas; strip the quotes themselves
        parsed[key] = raw[1:-1] if raw.startswith('"') else raw
    return parsed
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's >>> operator)."""
    if val >= 0:
        return val >> n
    # negative values are reinterpreted as their 32-bit unsigned form
    return (val + 0x100000000) >> n
5500 # Based on png2str() written by @gdkchan and improved by @yokrysty
5501 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5502 def decode_png(png_data
):
5503 # Reference: https://www.w3.org/TR/PNG/
5504 header
= png_data
[8:]
5506 if png_data
[:8] != b
'\x89PNG\x0d\x0a\x1a\x0a' or header
[4:8] != b
'IHDR':
5507 raise IOError('Not a valid PNG file.')
5509 int_map
= {1: '>B', 2: '>H', 4: '>I'}
5510 unpack_integer
= lambda x
: compat_struct_unpack(int_map
[len(x
)], x
)[0]
5515 length
= unpack_integer(header
[:4])
5518 chunk_type
= header
[:4]
5521 chunk_data
= header
[:length
]
5522 header
= header
[length
:]
5524 header
= header
[4:] # Skip CRC
5532 ihdr
= chunks
[0]['data']
5534 width
= unpack_integer(ihdr
[:4])
5535 height
= unpack_integer(ihdr
[4:8])
5539 for chunk
in chunks
:
5540 if chunk
['type'] == b
'IDAT':
5541 idat
+= chunk
['data']
5544 raise IOError('Unable to read PNG data.')
5546 decompressed_data
= bytearray(zlib
.decompress(idat
))
5551 def _get_pixel(idx
):
5556 for y
in range(height
):
5557 basePos
= y
* (1 + stride
)
5558 filter_type
= decompressed_data
[basePos
]
5562 pixels
.append(current_row
)
5564 for x
in range(stride
):
5565 color
= decompressed_data
[1 + basePos
+ x
]
5566 basex
= y
* stride
+ x
5571 left
= _get_pixel(basex
- 3)
5573 up
= _get_pixel(basex
- stride
)
5575 if filter_type
== 1: # Sub
5576 color
= (color
+ left
) & 0xff
5577 elif filter_type
== 2: # Up
5578 color
= (color
+ up
) & 0xff
5579 elif filter_type
== 3: # Average
5580 color
= (color
+ ((left
+ up
) >> 1)) & 0xff
5581 elif filter_type
== 4: # Paeth
5587 c
= _get_pixel(basex
- stride
- 3)
5595 if pa
<= pb
and pa
<= pc
:
5596 color
= (color
+ a
) & 0xff
5598 color
= (color
+ b
) & 0xff
5600 color
= (color
+ c
) & 0xff
5602 current_row
.append(color
)
5604 return width
, height
, pixels
5607 def write_xattr(path
, key
, value
):
5608 # This mess below finds the best xattr tool for the job
5610 # try the pyxattr module...
5613 if hasattr(xattr
, 'set'): # pyxattr
5614 # Unicode arguments are not supported in python-pyxattr until
5616 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5617 pyxattr_required_version
= '0.5.0'
5618 if version_tuple(xattr
.__version
__) < version_tuple(pyxattr_required_version
):
5619 # TODO: fallback to CLI tools
5620 raise XAttrUnavailableError(
5621 'python-pyxattr is detected but is too old. '
5622 'youtube-dl requires %s or above while your version is %s. '
5623 'Falling back to other xattr implementations' % (
5624 pyxattr_required_version
, xattr
.__version
__))
5626 setxattr
= xattr
.set
5628 setxattr
= xattr
.setxattr
5631 setxattr(path
, key
, value
)
5632 except EnvironmentError as e
:
5633 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5636 if compat_os_name
== 'nt':
5637 # Write xattrs to NTFS Alternate Data Streams:
5638 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5639 assert ':' not in key
5640 assert os
.path
.exists(path
)
5642 ads_fn
= path
+ ':' + key
5644 with open(ads_fn
, 'wb') as f
:
5646 except EnvironmentError as e
:
5647 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5649 user_has_setfattr
= check_executable('setfattr', ['--version'])
5650 user_has_xattr
= check_executable('xattr', ['-h'])
5652 if user_has_setfattr
or user_has_xattr
:
5654 value
= value
.decode('utf-8')
5655 if user_has_setfattr
:
5656 executable
= 'setfattr'
5657 opts
= ['-n', key
, '-v', value
]
5658 elif user_has_xattr
:
5659 executable
= 'xattr'
5660 opts
= ['-w', key
, value
]
5662 cmd
= ([encodeFilename(executable
, True)]
5663 + [encodeArgument(o
) for o
in opts
]
5664 + [encodeFilename(path
, True)])
5667 p
= subprocess
.Popen(
5668 cmd
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
, stdin
=subprocess
.PIPE
)
5669 except EnvironmentError as e
:
5670 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5671 stdout
, stderr
= p
.communicate()
5672 stderr
= stderr
.decode('utf-8', 'replace')
5673 if p
.returncode
!= 0:
5674 raise XAttrMetadataError(p
.returncode
, stderr
)
5677 # On Unix, and can't find pyxattr, setfattr, or xattr.
5678 if sys
.platform
.startswith('linux'):
5679 raise XAttrUnavailableError(
5680 "Couldn't find a tool to set the xattrs. "
5681 "Install either the python 'pyxattr' or 'xattr' "
5682 "modules, or the GNU 'attr' package "
5683 "(which contains the 'setfattr' tool).")
5685 raise XAttrUnavailableError(
5686 "Couldn't find a tool to set the xattrs. "
5687 "Install either the python 'xattr' module, "
5688 "or the 'xattr' binary.")
5691 def random_birthday(year_field
, month_field
, day_field
):
5692 start_date
= datetime
.date(1950, 1, 1)
5693 end_date
= datetime
.date(1995, 12, 31)
5694 offset
= random
.randint(0, (end_date
- start_date
).days
)
5695 random_date
= start_date
+ datetime
.timedelta(offset
)
5697 year_field
: str(random_date
.year
),
5698 month_field
: str(random_date
.month
),
5699 day_field
: str(random_date
.day
),