4 from __future__
import unicode_literals
34 import xml
.etree
.ElementTree
38 compat_HTMLParseError
,
43 compat_ctypes_WINFUNCTYPE
,
44 compat_etree_fromstring
,
47 compat_html_entities_html5
,
58 compat_urllib_parse_urlencode
,
59 compat_urllib_parse_urlparse
,
60 compat_urllib_parse_unquote_plus
,
61 compat_urllib_request
,
def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    known_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme not in known_schemes:
            known_schemes.append(socks_scheme)
81 # This is not clearly defined otherwise
82 compiled_regex_type
= type(re
.compile(''))
85 def random_user_agent():
86 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1665 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1669 'User-Agent': random_user_agent(),
1670 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1671 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1672 'Accept-Encoding': 'gzip, deflate',
1673 'Accept-Language': 'en-us,en;q=0.5',
1678 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
1682 NO_DEFAULT
= object()
1684 ENGLISH_MONTH_NAMES
= [
1685 'January', 'February', 'March', 'April', 'May', 'June',
1686 'July', 'August', 'September', 'October', 'November', 'December']
1689 'en': ENGLISH_MONTH_NAMES
,
1691 'janvier', 'fƩvrier', 'mars', 'avril', 'mai', 'juin',
1692 'juillet', 'aoƻt', 'septembre', 'octobre', 'novembre', 'dƩcembre'],
1695 KNOWN_EXTENSIONS
= (
1696 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1697 'flv', 'f4v', 'f4a', 'f4b',
1698 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1699 'mkv', 'mka', 'mk3d',
1702 'asf', 'wmv', 'wma',
1708 'f4f', 'f4m', 'm3u8', 'smil')
1710 # needed for sanitizing filenames in restricted mode
1711 ACCENT_CHARS
= dict(zip('ĆĆĆĆĆĆ
ĆĆĆĆĆĆĆĆĆĆĆĆĆĆĆĆĆÅĆÅĆĆĆĆÅ°ĆĆĆĆ Ć”Ć¢Ć£Ć¤Ć„Ć¦Ć§ĆØĆ©ĆŖƫƬĆĆ®ĆÆĆ°Ć±Ć²Ć³Ć“ĆµĆ¶ÅĆøÅĆ¹ĆŗĆ»Ć¼Å±Ć½Ć¾Ćæ',
1712 itertools
.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
1713 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1733 '%Y/%m/%d %H:%M:%S',
1735 '%Y-%m-%d %H:%M:%S',
1736 '%Y-%m-%d %H:%M:%S.%f',
1739 '%Y-%m-%dT%H:%M:%SZ',
1740 '%Y-%m-%dT%H:%M:%S.%fZ',
1741 '%Y-%m-%dT%H:%M:%S.%f0Z',
1742 '%Y-%m-%dT%H:%M:%S',
1743 '%Y-%m-%dT%H:%M:%S.%f',
1745 '%b %d %Y at %H:%M',
1746 '%b %d %Y at %H:%M:%S',
1747 '%B %d %Y at %H:%M',
1748 '%B %d %Y at %H:%M:%S',
1751 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1752 DATE_FORMATS_DAY_FIRST
.extend([
1758 '%d/%m/%Y %H:%M:%S',
1761 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1762 DATE_FORMATS_MONTH_FIRST
.extend([
1767 '%m/%d/%Y %H:%M:%S',
1770 PACKED_CODES_RE
= r
"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
1771 JSON_LD_RE
= r
'(?is)<script[^>]+type=(["\']?
)application
/ld\
+json\
1[^
>]*>(?P
<json_ld
>.+?
)</script
>'
1774 def preferredencoding():
1775 """Get preferred encoding.
1777 Returns the best encoding scheme for the system, based on
1778 locale.getpreferredencoding() and some further tweaks.
1781 pref = locale.getpreferredencoding()
1789 def write_json_file(obj, fn):
1790 """ Encode obj as JSON and write it to fn, atomically if possible """
1792 fn = encodeFilename(fn)
1793 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1794 encoding = get_filesystem_encoding()
1795 # os.path.basename returns a bytes object, but NamedTemporaryFile
1796 # will fail if the filename contains non ascii characters unless we
1797 # use a unicode object
1798 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1799 # the same for os.path.dirname
1800 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1802 path_basename = os.path.basename
1803 path_dirname = os.path.dirname
1807 'prefix
': path_basename(fn) + '.',
1808 'dir': path_dirname(fn),
1812 # In Python 2.x, json.dump expects a bytestream.
1813 # In Python 3.x, it writes to a character stream
1814 if sys.version_info < (3, 0):
1819 'encoding
': 'utf
-8',
1822 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1827 if sys.platform == 'win32
':
1828 # Need to remove existing file on Windows, else os.rename raises
1829 # WindowsError or FileExistsError.
1834 os.rename(tf.name, fn)
1843 if sys.version_info >= (2, 7):
1844 def find_xpath_attr(node, xpath, key, val=None):
1845 """ Find the xpath xpath[@key=val] """
1846 assert re.match(r'^
[a
-zA
-Z_
-]+$
', key)
1847 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
1848 return node.find(expr)
1850 def find_xpath_attr(node, xpath, key, val=None):
1851 for f in node.findall(compat_xpath(xpath)):
1852 if key not in f.attrib:
1854 if val is None or f.attrib.get(key) == val:
1858 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1859 # the namespace parameter
1862 def xpath_with_ns(path
, ns_map
):
1863 components
= [c
.split(':') for c
in path
.split('/')]
1865 for c
in components
:
1867 replaced
.append(c
[0])
1870 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
1871 return '/'.join(replaced
)
1874 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1875 def _find_xpath(xpath
):
1876 return node
.find(compat_xpath(xpath
))
1878 if isinstance(xpath
, (str, compat_str
)):
1879 n
= _find_xpath(xpath
)
1887 if default
is not NO_DEFAULT
:
1890 name
= xpath
if name
is None else name
1891 raise ExtractorError('Could not find XML element %s' % name
)
1897 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1898 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1899 if n
is None or n
== default
:
1902 if default
is not NO_DEFAULT
:
1905 name
= xpath
if name
is None else name
1906 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1912 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1913 n
= find_xpath_attr(node
, xpath
, key
)
1915 if default
is not NO_DEFAULT
:
1918 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1919 raise ExtractorError('Could not find XML attribute %s' % name
)
1922 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Delegate to the generic attribute matcher, looking for attribute id="...".
    content = get_element_by_attribute('id', id, html)
    return content
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose attribute matches value, or None."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if matches:
        return matches[0]
    return None
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match the class name as a whole word anywhere inside the (possibly
    # multi-valued) class attribute; the pattern itself must not be escaped.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute(
        'class', class_value_re, html, escape_value=False)
1948 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1949 """Return the content of the tag with the specified attribute in the passed HTML document"""
1951 value = re.escape(value) if escape_value else value
1954 for m in re.finditer(r'''(?xs)
1956 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1958 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1962 ''' % (re.escape(attribute), value), html):
1963 res = m.group('content
')
1965 if res.startswith('"') or res.startswith("'"):
1968 retlist.append(unescapeHTML(res))
1973 class HTMLAttributeParser(compat_HTMLParser):
1974 """Trivial HTML parser to gather the attributes for a single element"""
1977 compat_HTMLParser.__init__(self)
1979 def handle_starttag(self, tag, attrs):
1980 self.attrs = dict(attrs)
1983 def extract_attributes(html_element):
1984 """Given a string for an HTML element such as
1986 a="foo" B="bar" c="&98;az" d=boz
1987 empty= noval entity="&"
1990 Decode and return a dictionary of attributes.
1992 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
1993 'empty
': '', 'noval
': None, 'entity
': '&',
1994 'sq
': '"', 'dq': '\''
1996 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
1997 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
1999 parser = HTMLAttributeParser()
2001 parser.feed(html_element)
2003 # Older Python may throw HTMLParseError in case of malformed HTML
2004 except compat_HTMLParseError:
2009 def clean_html(html):
2010 """Clean an HTML snippet into a readable string"""
2012 if html is None: # Convenience for sanitizing descriptions etc.
2016 html = html.replace('\n', ' ')
2017 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2018 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2020 html = re.sub('<.*?>', '', html)
2021 # Replace html entities
2022 html = unescapeHTML(html)
2026 def sanitize_open(filename, open_mode):
2027 """Try to open the given filename, and slightly tweak it if this fails.
2029 Attempts to open the given filename. If this fails, it tries to change
2030 the filename slightly, step by step, until it's either able to open it
2031 or it fails and raises a final exception, like the standard open()
2034 It returns the tuple (stream, definitive_file_name).
2038 if sys.platform == 'win32':
2040 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2041 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2042 stream = open(encodeFilename(filename), open_mode)
2043 return (stream, filename)
2044 except (IOError, OSError) as err:
2045 if err.errno in (errno.EACCES,):
2048 # In case of error, try to remove win32 forbidden chars
2049 alt_filename = sanitize_path(filename)
2050 if alt_filename == filename:
2053 # An exception here should be caught in the caller
2054 stream = open(encodeFilename(alt_filename), open_mode)
2055 return (stream, alt_filename)
2058 def timeconvert(timestr):
2059 """Convert RFC 2822 defined time string into system timestamp"""
2061 timetuple = email.utils.parsedate_tz(timestr)
2062 if timetuple is not None:
2063 timestamp = email.utils.mktime_tz(timetuple)
2067 def sanitize_filename(s, restricted=False, is_id=False):
2068 """Sanitizes a string so it could be used as part of a filename.
2069 If restricted is set, use a stricter subset of allowed characters.
2070 Set is_id if this is not an arbitrary string, but an ID that should be kept
2073 def replace_insane(char):
2074 if restricted and char in ACCENT_CHARS:
2075 return ACCENT_CHARS[char]
2076 if char == '?' or ord(char) < 32 or ord(char) == 127:
2079 return '' if restricted else '\''
2081 return '_
-' if restricted else ' -'
2082 elif char in '\\/|
*<>':
2084 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2086 if restricted
and ord(char
) > 127:
2091 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2092 result
= ''.join(map(replace_insane
, s
))
2094 while '__' in result
:
2095 result
= result
.replace('__', '_')
2096 result
= result
.strip('_')
2097 # Common case of "Foreign band name - English song title"
2098 if restricted
and result
.startswith('-_'):
2100 if result
.startswith('-'):
2101 result
= '_' + result
[len('-'):]
2102 result
= result
.lstrip('.')
2108 def sanitize_path(s
):
2109 """Sanitizes and normalizes path on Windows"""
2110 if sys
.platform
!= 'win32':
2112 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2113 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2114 drive_or_unc
, _
= os
.path
.splitunc(s
)
2115 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2119 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2120 for path_part
in norm_path
]
2122 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2123 return os
.path
.join(*sanitized_path
)
2126 def sanitize_url(url
):
2127 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
2128 # the number of unwanted failures due to missing protocol
2129 if url
.startswith('//'):
2130 return 'http:%s' % url
2131 # Fix some common typos seen so far
2133 # https://github.com/ytdl-org/youtube-dl/issues/15649
2134 (r
'^httpss://', r
'https://'),
2135 # https://bx1.be/lives/direct-tv/
2136 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
2138 for mistake
, fixup
in COMMON_TYPOS
:
2139 if re
.match(mistake
, url
):
2140 return re
.sub(mistake
, fixup
, url
)
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request, normalizing the URL through sanitize_url() first."""
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
2149 """Expand shell variables and ~"""
2150 return os
.path
.expandvars(compat_expanduser(s
))
2153 def orderedSet(iterable
):
2154 """ Remove all duplicates from the input iterable """
2162 def _htmlentity_transform(entity_with_semicolon
):
2163 """Transforms an HTML entity to a character."""
2164 entity
= entity_with_semicolon
[:-1]
2166 # Known non-numeric HTML entity
2167 if entity
in compat_html_entities
.name2codepoint
:
2168 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2170 # TODO: HTML5 allows entities without a semicolon. For example,
2171 # 'Éric' should be decoded as 'Ćric'.
2172 if entity_with_semicolon
in compat_html_entities_html5
:
2173 return compat_html_entities_html5
[entity_with_semicolon
]
2175 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2176 if mobj
is not None:
2177 numstr
= mobj
.group(1)
2178 if numstr
.startswith('x'):
2180 numstr
= '0%s' % numstr
2183 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2185 return compat_chr(int(numstr
, base
))
2189 # Unknown entity in name, return its literal representation
2190 return '&%s;' % entity
2193 def unescapeHTML(s
):
2196 assert type(s
) == compat_str
2199 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
2202 def get_subprocess_encoding():
2203 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2204 # For subprocess calls, encode with locale encoding
2205 # Refer to http://stackoverflow.com/a/9951851/35070
2206 encoding
= preferredencoding()
2208 encoding
= sys
.getfilesystemencoding()
2209 if encoding
is None:
2214 def encodeFilename(s
, for_subprocess
=False):
2216 @param s The name of the file
2219 assert type(s
) == compat_str
2221 # Python 3 has a Unicode API
2222 if sys
.version_info
>= (3, 0):
2225 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2226 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2227 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2228 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2231 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2232 if sys
.platform
.startswith('java'):
2235 return s
.encode(get_subprocess_encoding(), 'ignore')
2238 def decodeFilename(b
, for_subprocess
=False):
2240 if sys
.version_info
>= (3, 0):
2243 if not isinstance(b
, bytes):
2246 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a value for use as a subprocess argument (see encodeFilename)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode a subprocess argument; inverse of encodeArgument()."""
    return decodeFilename(b, for_subprocess=True)
2262 def decodeOption(optval
):
2265 if isinstance(optval
, bytes):
2266 optval
= optval
.decode(preferredencoding())
2268 assert isinstance(optval
, compat_str
)
2272 def formatSeconds(secs
):
2274 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
2276 return '%d:%02d' % (secs
// 60, secs
% 60)
2281 def make_HTTPS_handler(params
, **kwargs
):
2282 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2283 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2284 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2285 if opts_no_check_certificate
:
2286 context
.check_hostname
= False
2287 context
.verify_mode
= ssl
.CERT_NONE
2289 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2292 # (create_default_context present but HTTPSHandler has no context=)
2295 if sys
.version_info
< (3, 2):
2296 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2297 else: # Python < 3.4
2298 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2299 context
.verify_mode
= (ssl
.CERT_NONE
2300 if opts_no_check_certificate
2301 else ssl
.CERT_REQUIRED
)
2302 context
.set_default_verify_paths()
2303 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2306 def bug_reports_message():
2307 if ytdl_is_updateable():
2308 update_cmd
= 'type youtube-dl -U to update'
2310 update_cmd
= 'see https://yt-dl.org/update on how to update'
2311 msg
= '; please report this issue on https://yt-dl.org/bug .'
2312 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
2313 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors; all project exceptions derive from it."""
2322 class ExtractorError(YoutubeDLError
):
2323 """Error during info extraction."""
2325 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
2326 """ tb, if given, is the original traceback (so that it can be printed out).
2327 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
2330 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
2332 if video_id
is not None:
2333 msg
= video_id
+ ': ' + msg
2335 msg
+= ' (caused by %r)' % cause
2337 msg
+= bug_reports_message()
2338 super(ExtractorError
, self
).__init
__(msg
)
2341 self
.exc_info
= sys
.exc_info() # preserve original exception
2343 self
.video_id
= video_id
2345 def format_traceback(self
):
2346 if self
.traceback
is None:
2348 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL.

    Marked as `expected` so it is reported as a normal error rather than a bug.
    """

    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None):
        # expected=True: geo blocks are a site condition, not a youtube-dl bug
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        # Optional geo info supplied by the caller (presumably the list of
        # countries the video is restricted to — confirm against callers)
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Original (exc_type, exc_value, traceback) triple, or None
        self.exc_info = exc_info
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Number of bytes actually received
        self.downloaded = downloaded
        # Number of bytes the server announced
        self.expected = expected
2441 class XAttrMetadataError(YoutubeDLError
):
2442 def __init__(self
, code
=None, msg
='Unknown error'):
2443 super(XAttrMetadataError
, self
).__init
__(msg
)
2447 # Parsing code and msg
2448 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2449 or 'No space left' in self
.msg
or 'Disk quota excedded' in self
.msg
):
2450 self
.reason
= 'NO_SPACE'
2451 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2452 self
.reason
= 'VALUE_TOO_LONG'
2454 self
.reason
= 'NOT_SUPPORTED'
class XAttrUnavailableError(YoutubeDLError):
    """Raised when extended-attribute support is not available."""
2461 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2462 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2463 # expected HTTP responses to meet HTTP/1.0 or later (see also
2464 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2465 if sys
.version_info
< (3, 0):
2466 kwargs
['strict'] = True
2467 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2468 source_address
= ydl_handler
._params
.get('source_address')
2470 if source_address
is not None:
2471 # This is to workaround _create_connection() from socket where it will try all
2472 # address data from getaddrinfo() including IPv6. This filters the result from
2473 # getaddrinfo() based on the source_address value.
2474 # This is based on the cpython socket.create_connection() function.
2475 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2476 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2477 host
, port
= address
2479 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2480 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2481 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2482 if addrs
and not ip_addrs
:
2483 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2485 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2486 % (ip_version
, source_address
[0]))
2487 for res
in ip_addrs
:
2488 af
, socktype
, proto
, canonname
, sa
= res
2491 sock
= socket
.socket(af
, socktype
, proto
)
2492 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2493 sock
.settimeout(timeout
)
2494 sock
.bind(source_address
)
2496 err
= None # Explicitly break reference cycle
2498 except socket
.error
as _
:
2500 if sock
is not None:
2505 raise socket
.error('getaddrinfo returns an empty list')
2506 if hasattr(hc
, '_create_connection'):
2507 hc
._create
_connection
= _create_connection
2508 sa
= (source_address
, 0)
2509 if hasattr(hc
, 'source_address'): # Python 2.7+
2510 hc
.source_address
= sa
2512 def _hc_connect(self
, *args
, **kwargs
):
2513 sock
= _create_connection(
2514 (self
.host
, self
.port
), self
.timeout
, sa
)
2516 self
.sock
= ssl
.wrap_socket(
2517 sock
, self
.key_file
, self
.cert_file
,
2518 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2521 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Drop the internal 'Youtubedl-no-compression' marker header.

    If the marker is present, a filtered copy of the headers is returned
    with both the marker and any 'Accept-Encoding' header (matched
    case-insensitively) removed; otherwise the mapping is returned as-is.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
2536 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2537 """Handler for HTTP requests and responses.
2539 This class, when installed with an OpenerDirector, automatically adds
2540 the standard headers to every HTTP request and handles gzipped and
2541 deflated responses from web servers. If compression is to be avoided in
2542 a particular request, the original request in the program code only has
2543 to include the HTTP header "Youtubedl-no-compression", which will be
2544 removed before making the real request.
2546 Part of this code was copied from:
2548 http://techknack.net/python-urllib2-handlers/
2550 Andrew Rowls, the author of that code, agreed to release it to the
2554 def __init__(self
, params
, *args
, **kwargs
):
2555 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
2556 self
._params
= params
2558 def http_open(self
, req
):
2559 conn_class
= compat_http_client
.HTTPConnection
2561 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2563 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2564 del req
.headers
['Ytdl-socks-proxy']
2566 return self
.do_open(functools
.partial(
2567 _create_http_connection
, self
, conn_class
, False),
2573 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2575 return zlib
.decompress(data
)
2577 def http_request(self
, req
):
2578 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2579 # always respected by websites, some tend to give out URLs with non percent-encoded
2580 # non-ASCII characters (see telemb.py, ard.py [#3412])
2581 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2582 # To work around aforementioned issue we will replace request's original URL with
2583 # percent-encoded one
2584 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2585 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2586 url
= req
.get_full_url()
2587 url_escaped
= escape_url(url
)
2589 # Substitute URL if any change after escaping
2590 if url
!= url_escaped
:
2591 req
= update_Request(req
, url
=url_escaped
)
2593 for h
, v
in std_headers
.items():
2594 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2595 # The dict keys are capitalized because of this bug by urllib
2596 if h
.capitalize() not in req
.headers
:
2597 req
.add_header(h
, v
)
2599 req
.headers
= handle_youtubedl_headers(req
.headers
)
2601 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2602 # Python 2.6 is brain-dead when it comes to fragments
2603 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2604 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2608 def http_response(self
, req
, resp
):
2611 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2612 content
= resp
.read()
2613 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2615 uncompressed
= io
.BytesIO(gz
.read())
2616 except IOError as original_ioerror
:
2617 # There may be junk add the end of the file
2618 # See http://stackoverflow.com/q/4928560/35070 for details
2619 for i
in range(1, 1024):
2621 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2622 uncompressed
= io
.BytesIO(gz
.read())
2627 raise original_ioerror
2628 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2629 resp
.msg
= old_resp
.msg
2630 del resp
.headers
['Content-encoding']
2632 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2633 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2634 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2635 resp
.msg
= old_resp
.msg
2636 del resp
.headers
['Content-encoding']
2637 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2638 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2639 if 300 <= resp
.code
< 400:
2640 location
= resp
.headers
.get('Location')
2642 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2643 if sys
.version_info
>= (3, 0):
2644 location
= location
.encode('iso-8859-1').decode('utf-8')
2646 location
= location
.decode('utf-8')
2647 location_escaped
= escape_url(location
)
2648 if location
!= location_escaped
:
2649 del resp
.headers
['Location']
2650 if sys
.version_info
< (3, 0):
2651 location_escaped
= location_escaped
.encode('utf-8')
2652 resp
.headers
['Location'] = location_escaped
2655 https_request
= http_request
2656 https_response
= http_response
2659 def make_socks_conn_class(base_class
, socks_proxy
):
2660 assert issubclass(base_class
, (
2661 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2663 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2664 if url_components
.scheme
.lower() == 'socks5':
2665 socks_type
= ProxyType
.SOCKS5
2666 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2667 socks_type
= ProxyType
.SOCKS4
2668 elif url_components
.scheme
.lower() == 'socks4a':
2669 socks_type
= ProxyType
.SOCKS4A
2671 def unquote_if_non_empty(s
):
2674 return compat_urllib_parse_unquote_plus(s
)
2678 url_components
.hostname
, url_components
.port
or 1080,
2680 unquote_if_non_empty(url_components
.username
),
2681 unquote_if_non_empty(url_components
.password
),
2684 class SocksConnection(base_class
):
2686 self
.sock
= sockssocket()
2687 self
.sock
.setproxy(*proxy_args
)
2688 if type(self
.timeout
) in (int, float):
2689 self
.sock
.settimeout(self
.timeout
)
2690 self
.sock
.connect((self
.host
, self
.port
))
2692 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2693 if hasattr(self
, '_context'): # Python > 2.6
2694 self
.sock
= self
._context
.wrap_socket(
2695 self
.sock
, server_hostname
=self
.host
)
2697 self
.sock
= ssl
.wrap_socket(self
.sock
)
2699 return SocksConnection
2702 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
2703 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
2704 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
2705 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
2706 self
._params
= params
2708 def https_open(self
, req
):
2710 conn_class
= self
._https
_conn
_class
2712 if hasattr(self
, '_context'): # python > 2.6
2713 kwargs
['context'] = self
._context
2714 if hasattr(self
, '_check_hostname'): # python 3.x
2715 kwargs
['check_hostname'] = self
._check
_hostname
2717 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2719 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2720 del req
.headers
['Ytdl-socks-proxy']
2722 return self
.do_open(functools
.partial(
2723 _create_http_connection
, self
, conn_class
, True),
2727 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2728 _HTTPONLY_PREFIX
= '#HttpOnly_'
2730 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2731 # Store session cookies with `expires` set to 0 instead of an empty
2734 if cookie
.expires
is None:
2736 compat_cookiejar
.MozillaCookieJar
.save(self
, filename
, ignore_discard
, ignore_expires
)
2738 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2739 """Load cookies from a file."""
2740 if filename
is None:
2741 if self
.filename
is not None:
2742 filename
= self
.filename
2744 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2747 with open(filename
) as f
:
2749 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2750 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2751 cf
.write(compat_str(line
))
2753 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2754 # Session cookies are denoted by either `expires` field set to
2755 # an empty string or 0. MozillaCookieJar only recognizes the former
2756 # (see [1]). So we need force the latter to be recognized as session
2757 # cookies on our own.
2758 # Session cookies may be important for cookies-based authentication,
2759 # e.g. usually, when user does not check 'Remember me' check box while
2760 # logging in on a site, some important cookies are stored as session
2761 # cookies so that not recognizing them will result in failed login.
2762 # 1. https://bugs.python.org/issue17164
2764 # Treat `expires=0` cookies as session cookies
2765 if cookie
.expires
== 0:
2766 cookie
.expires
= None
2767 cookie
.discard
= True
2770 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
2771 def __init__(self
, cookiejar
=None):
2772 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
2774 def http_response(self
, request
, response
):
2775 # Python 2 will choke on next HTTP request in row if there are non-ASCII
2776 # characters in Set-Cookie HTTP header of last response (see
2777 # https://github.com/ytdl-org/youtube-dl/issues/6769).
2778 # In order to at least prevent crashing we will percent encode Set-Cookie
2779 # header before HTTPCookieProcessor starts processing it.
2780 # if sys.version_info < (3, 0) and response.headers:
2781 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
2782 # set_cookie = response.headers.get(set_cookie_header)
2784 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
2785 # if set_cookie != set_cookie_escaped:
2786 # del response.headers[set_cookie_header]
2787 # response.headers[set_cookie_header] = set_cookie_escaped
2788 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
2790 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
2791 https_response
= http_response
2794 def extract_timezone(date_str
):
2796 r
'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
2799 timezone
= datetime
.timedelta()
2801 date_str
= date_str
[:-len(m
.group('tz'))]
2802 if not m
.group('sign'):
2803 timezone
= datetime
.timedelta()
2805 sign
= 1 if m
.group('sign') == '+' else -1
2806 timezone
= datetime
.timedelta(
2807 hours
=sign
* int(m
.group('hours')),
2808 minutes
=sign
* int(m
.group('minutes')))
2809 return timezone
, date_str
2812 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
2813 """ Return a UNIX timestamp from the given date """
2815 if date_str
is None:
2818 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
2820 if timezone
is None:
2821 timezone
, date_str
= extract_timezone(date_str
)
2824 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
2825 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
2826 return calendar
.timegm(dt
.timetuple())
def date_formats(day_first=True):
    """Return the list of date-format strings to try, ordered by whether
    the day or the month is expected to come first in the input."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
2835 def unified_strdate(date_str
, day_first
=True):
2836 """Return a string with the date in the format YYYYMMDD"""
2838 if date_str
is None:
2842 date_str
= date_str
.replace(',', ' ')
2843 # Remove AM/PM + timezone
2844 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
2845 _
, date_str
= extract_timezone(date_str
)
2847 for expression
in date_formats(day_first
):
2849 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
2852 if upload_date
is None:
2853 timetuple
= email
.utils
.parsedate_tz(date_str
)
2856 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
2859 if upload_date
is not None:
2860 return compat_str(upload_date
)
2863 def unified_timestamp(date_str
, day_first
=True):
2864 if date_str
is None:
2867 date_str
= re
.sub(r
'[,|]', '', date_str
)
2869 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
2870 timezone
, date_str
= extract_timezone(date_str
)
2872 # Remove AM/PM + timezone
2873 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
2875 # Remove unrecognized timezones from ISO 8601 alike timestamps
2876 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
2878 date_str
= date_str
[:-len(m
.group('tz'))]
2880 # Python only supports microseconds, so remove nanoseconds
2881 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
2883 date_str
= m
.group(1)
2885 for expression
in date_formats(day_first
):
2887 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
2888 return calendar
.timegm(dt
.timetuple())
2891 timetuple
= email
.utils
.parsedate_tz(date_str
)
2893 return calendar
.timegm(timetuple
) + pm_delta
* 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a media file extension from a URL.

    Falls back to default_ext when the URL is None, has no dot, or the
    candidate extension does not look like one.
    """
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: media base name + language + format."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
2913 def date_from_str(date_str
):
2915 Return a datetime object from a string in the format YYYYMMDD or
2916 (now|today)[+-][0-9](day|week|month|year)(s)?"""
2917 today
= datetime
.date
.today()
2918 if date_str
in ('now', 'today'):
2920 if date_str
== 'yesterday':
2921 return today
- datetime
.timedelta(days
=1)
2922 match
= re
.match(r
'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
2923 if match
is not None:
2924 sign
= match
.group('sign')
2925 time
= int(match
.group('time'))
2928 unit
= match
.group('unit')
2929 # A bad approximation?
2933 elif unit
== 'year':
2937 delta
= datetime
.timedelta(**{unit
: time
})
2938 return today
+ delta
2939 return datetime
.datetime
.strptime(date_str
, '%Y%m%d').date()
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that do not look like 'YYYYMMDD' are returned unchanged.
    """
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
2952 class DateRange(object):
2953 """Represents a time interval between two dates"""
2955 def __init__(self
, start
=None, end
=None):
2956 """start and end must be strings in the format accepted by date"""
2957 if start
is not None:
2958 self
.start
= date_from_str(start
)
2960 self
.start
= datetime
.datetime
.min.date()
2962 self
.end
= date_from_str(end
)
2964 self
.end
= datetime
.datetime
.max.date()
2965 if self
.start
> self
.end
:
2966 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
2970 """Returns a range that only contains the given day"""
2971 return cls(day
, day
)
2973 def __contains__(self
, date
):
2974 """Check if the date is in the range"""
2975 if not isinstance(date
, datetime
.date
):
2976 date
= date_from_str(date
)
2977 return self
.start
<= date
<= self
.end
2980 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
2983 def platform_name():
2984 """ Returns the platform name as a compat_str """
2985 res
= platform
.platform()
2986 if isinstance(res
, bytes):
2987 res
= res
.decode(preferredencoding())
2989 assert isinstance(res
, compat_str
)
2993 def _windows_write_string(s
, out
):
2994 """ Returns True if the string was written using special methods,
2995 False if it has yet to be written out."""
2996 # Adapted from http://stackoverflow.com/a/3259271/35070
2999 import ctypes
.wintypes
3007 fileno
= out
.fileno()
3008 except AttributeError:
3009 # If the output stream doesn't have a fileno, it's virtual
3011 except io
.UnsupportedOperation
:
3012 # Some strange Windows pseudo files?
3014 if fileno
not in WIN_OUTPUT_IDS
:
3017 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3018 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3019 ('GetStdHandle', ctypes
.windll
.kernel32
))
3020 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3022 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3023 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3024 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3025 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3026 written
= ctypes
.wintypes
.DWORD(0)
3028 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3029 FILE_TYPE_CHAR
= 0x0002
3030 FILE_TYPE_REMOTE
= 0x8000
3031 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3032 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3033 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3034 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3035 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3037 def not_a_console(handle
):
3038 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3040 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3041 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3043 if not_a_console(h
):
3046 def next_nonbmp_pos(s
):
3048 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3049 except StopIteration:
3053 count
= min(next_nonbmp_pos(s
), 1024)
3055 ret
= WriteConsoleW(
3056 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3058 raise OSError('Failed to write string')
3059 if not count
: # We just wrote a non-BMP character
3060 assert written
.value
== 2
3063 assert written
.value
> 0
3064 s
= s
[written
.value
:]
3068 def write_string(s
, out
=None, encoding
=None):
3071 assert type(s
) == compat_str
3073 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3074 if _windows_write_string(s
, out
):
3077 if ('b' in getattr(out
, 'mode', '')
3078 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3079 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3081 elif hasattr(out
, 'buffer'):
3082 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3083 byt
= s
.encode(enc
, 'ignore')
3084 out
.buffer.write(byt
)
3090 def bytes_to_intlist(bs
):
3093 if isinstance(bs
[0], int): # Python 3
3096 return [ord(c
) for c
in bs
]
3099 def intlist_to_bytes(xs
):
3102 return compat_struct_pack('%dB' % len(xs
), *xs
)
3105 # Cross-platform file locking
3106 if sys
.platform
== 'win32':
3107 import ctypes
.wintypes
3110 class OVERLAPPED(ctypes
.Structure
):
3112 ('Internal', ctypes
.wintypes
.LPVOID
),
3113 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3114 ('Offset', ctypes
.wintypes
.DWORD
),
3115 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3116 ('hEvent', ctypes
.wintypes
.HANDLE
),
3119 kernel32
= ctypes
.windll
.kernel32
3120 LockFileEx
= kernel32
.LockFileEx
3121 LockFileEx
.argtypes
= [
3122 ctypes
.wintypes
.HANDLE
, # hFile
3123 ctypes
.wintypes
.DWORD
, # dwFlags
3124 ctypes
.wintypes
.DWORD
, # dwReserved
3125 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3126 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3127 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3129 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3130 UnlockFileEx
= kernel32
.UnlockFileEx
3131 UnlockFileEx
.argtypes
= [
3132 ctypes
.wintypes
.HANDLE
, # hFile
3133 ctypes
.wintypes
.DWORD
, # dwReserved
3134 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3135 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3136 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3138 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3139 whole_low
= 0xffffffff
3140 whole_high
= 0x7fffffff
3142 def _lock_file(f
, exclusive
):
3143 overlapped
= OVERLAPPED()
3144 overlapped
.Offset
= 0
3145 overlapped
.OffsetHigh
= 0
3146 overlapped
.hEvent
= 0
3147 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3148 handle
= msvcrt
.get_osfhandle(f
.fileno())
3149 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3150 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3151 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3153 def _unlock_file(f
):
3154 assert f
._lock
_file
_overlapped
_p
3155 handle
= msvcrt
.get_osfhandle(f
.fileno())
3156 if not UnlockFileEx(handle
, 0,
3157 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3158 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3161 # Some platforms, such as Jython, is missing fcntl
3165 def _lock_file(f
, exclusive
):
3166 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3168 def _unlock_file(f
):
3169 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3171 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3173 def _lock_file(f
, exclusive
):
3174 raise IOError(UNSUPPORTED_MSG
)
3176 def _unlock_file(f
):
3177 raise IOError(UNSUPPORTED_MSG
)
3180 class locked_file(object):
3181 def __init__(self
, filename
, mode
, encoding
=None):
3182 assert mode
in ['r', 'a', 'w']
3183 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3186 def __enter__(self
):
3187 exclusive
= self
.mode
!= 'r'
3189 _lock_file(self
.f
, exclusive
)
3195 def __exit__(self
, etype
, value
, traceback
):
3197 _unlock_file(self
.f
)
3204 def write(self
, *args
):
3205 return self
.f
.write(*args
)
3207 def read(self
, *args
):
3208 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when the
    interpreter reports none."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
def shell_quote(args):
    """Return args joined into a single shell-escaped command-line string."""
    fs_encoding = get_filesystem_encoding()

    def _as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(_as_text(a)) for a in args)
3227 def smuggle_url(url
, data
):
3228 """ Pass additional data in a URL for internal use. """
3230 url
, idata
= unsmuggle_url(url
, {})
3232 sdata
= compat_urllib_parse_urlencode(
3233 {'__youtubedl_smuggle': json
.dumps(data
)})
3234 return url
+ '#' + sdata
3237 def unsmuggle_url(smug_url
, default
=None):
3238 if '#__youtubedl_smuggle' not in smug_url
:
3239 return smug_url
, default
3240 url
, _
, sdata
= smug_url
.rpartition('#')
3241 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
3242 data
= json
.loads(jsond
)
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.00KiB'.

    Returns 'N/A' when the value is None; string inputs are coerced
    to float first.
    """
    if bytes is None:
        return 'N/A'
    # isinstance instead of `type(bytes) is str` so str subclasses work too
    if isinstance(bytes, str):
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number> <unit>' expression in s using unit_table
    (unit -> multiplier) and return the integer value, or None when s does
    not start with such an expression."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Accept a comma as the decimal separator as well
    number = float(m.group('num').replace(',', '.'))
    multiplier = unit_table[m.group('unit')]
    return int(number * multiplier)
3271 def parse_filesize(s
):
3275 # The lower-case forms are of course incorrect and unofficial,
3276 # but we support those too
3293 'megabytes': 1000 ** 2,
3294 'mebibytes': 1024 ** 2,
3300 'gigabytes': 1000 ** 3,
3301 'gibibytes': 1024 ** 3,
3307 'terabytes': 1000 ** 4,
3308 'tebibytes': 1024 ** 4,
3314 'petabytes': 1000 ** 5,
3315 'pebibytes': 1024 ** 5,
3321 'exabytes': 1000 ** 6,
3322 'exbibytes': 1024 ** 6,
3328 'zettabytes': 1000 ** 7,
3329 'zebibytes': 1024 ** 7,
3335 'yottabytes': 1000 ** 8,
3336 'yobibytes': 1024 ** 8,
3339 return lookup_unit_table(_UNIT_TABLE
, s
)
3348 if re
.match(r
'^[\d,.]+$', s
):
3349 return str_to_int(s
)
3360 return lookup_unit_table(_UNIT_TABLE
, s
)
3363 def parse_resolution(s
):
3367 mobj
= re
.search(r
'\b(?P<w>\d+)\s*[xXĆ]\s*(?P<h>\d+)\b', s
)
3370 'width': int(mobj
.group('w')),
3371 'height': int(mobj
.group('h')),
3374 mobj
= re
.search(r
'\b(\d+)[pPiI]\b', s
)
3376 return {'height': int(mobj
.group(1))}
3378 mobj
= re
.search(r
'\b([48])[kK]\b', s
)
3380 return {'height': int(mobj
.group(1)) * 540}
3385 def parse_bitrate(s
):
3386 if not isinstance(s
, compat_str
):
3388 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
3390 return int(mobj
.group(1))
3393 def month_by_name(name
, lang
='en'):
3394 """ Return the number of a month by (locale-independently) English name """
3396 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
3399 return month_names
.index(name
) + 1
def month_by_abbreviation(abbrev):
    """Return the 1-based month number for an English three-letter month
    abbreviation, or None when it is not recognized."""
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
3414 def fix_xml_ampersands(xml_str
):
3415 """Replace all the '&' by '&' in XML"""
3417 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
3422 def setproctitle(title
):
3423 assert isinstance(title
, compat_str
)
3425 # ctypes in Jython is not complete
3426 # http://bugs.jython.org/issue2148
3427 if sys
.platform
.startswith('java'):
3431 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3435 # LoadLibrary in Windows Python 2.7.13 only expects
3436 # a bytestring, but since unicode_literals turns
3437 # every string into a unicode string, it fails.
3439 title_bytes
= title
.encode('utf-8')
3440 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3441 buf
.value
= title_bytes
3443 libc
.prctl(15, buf
, 0, 0, 0)
3444 except AttributeError:
3445 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip a leading prefix from s when present; None passes through."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip a trailing suffix from s when present; None passes through."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Drop one matching pair of surrounding quotes (single or double)."""
    if s is None or len(s) < 2:
        return s
    for q in ('"', "'", ):
        if s[0] == q and s[-1] == q:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last non-empty path segment of url."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
3471 return re
.match(r
'https?://[^?#&]+/', url
).group()
3474 def urljoin(base
, path
):
3475 if isinstance(path
, bytes):
3476 path
= path
.decode('utf-8')
3477 if not isinstance(path
, compat_str
) or not path
:
3479 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
3481 if isinstance(base
, bytes):
3482 base
= base
.decode('utf-8')
3483 if not isinstance(base
, compat_str
) or not re
.match(
3484 r
'^(?:https?:)?//', base
):
3486 return compat_urlparse
.urljoin(base
, path
)
3489 class HEADRequest(compat_urllib_request
.Request
):
3490 def get_method(self
):
3494 class PUTRequest(compat_urllib_request
.Request
):
3495 def get_method(self
):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v (optionally an attribute of v) to int, scaled by
    invscale/scale; return default when coercion is impossible."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
3513 def str_or_none(v
, default
=None):
3514 return default
if v
is None else compat_str(v
)
def str_to_int(int_str):
    """A more relaxed version of int_or_none: strips commas, dots and
    plus signs before parsing; None passes through."""
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', '', int_str))
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float scaled by invscale/scale; return default when v
    is None or not convertible."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def bool_or_none(v, default=None):
    """Return v only when it is a genuine bool, otherwise default."""
    if isinstance(v, bool):
        return v
    return default
3538 def strip_or_none(v
, default
=None):
3539 return v
.strip() if isinstance(v
, compat_str
) else default
3542 def url_or_none(url
):
3543 if not url
or not isinstance(url
, compat_str
):
3546 return url
if re
.match(r
'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url
) else None
3549 def parse_duration(s
):
3550 if not isinstance(s
, compat_basestring
):
3555 days
, hours
, mins
, secs
, ms
= [None] * 5
3556 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3558 days
, hours
, mins
, secs
, ms
= m
.groups()
3563 [0-9]+\s*y(?:ears?)?\s*
3566 [0-9]+\s*m(?:onths?)?\s*
3569 [0-9]+\s*w(?:eeks?)?\s*
3572 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3576 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3579 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3582 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3585 days
, hours
, mins
, secs
, ms
= m
.groups()
3587 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3589 hours
, mins
= m
.groups()
3595 duration
+= float(secs
)
3597 duration
+= float(mins
) * 60
3599 duration
+= float(hours
) * 60 * 60
3601 duration
+= float(days
) * 24 * 60 * 60
3603 duration
+= float(ms
)
3607 def prepend_extension(filename
, ext
, expected_real_ext
=None):
3608 name
, real_ext
= os
.path
.splitext(filename
)
3610 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
3611 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
3612 else '{0}.{1}'.format(filename
, ext
))
3615 def replace_extension(filename
, ext
, expected_real_ext
=None):
3616 name
, real_ext
= os
.path
.splitext(filename
)
3617 return '{0}.{1}'.format(
3618 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
3622 def check_executable(exe
, args
=[]):
3623 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3624 args can be a list of arguments for a short output (like -version) """
3626 subprocess
.Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate()
3632 def get_exe_version(exe
, args
=['--version'],
3633 version_re
=None, unrecognized
='present'):
3634 """ Returns the version of the specified executable,
3635 or False if the executable is not present """
3637 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3638 # SIGTTOU if youtube-dl is run in the background.
3639 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3640 out
, _
= subprocess
.Popen(
3641 [encodeArgument(exe
)] + args
,
3642 stdin
=subprocess
.PIPE
,
3643 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate()
3646 if isinstance(out
, bytes): # Python 2.x
3647 out
= out
.decode('ascii', 'ignore')
3648 return detect_exe_version(out
, version_re
, unrecognized
)
3651 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3652 assert isinstance(output
, compat_str
)
3653 if version_re
is None:
3654 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3655 m
= re
.search(version_re
, output
)
3662 class PagedList(object):
3664 # This is only useful for tests
3665 return len(self
.getslice())
3668 class OnDemandPagedList(PagedList
):
3669 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
3670 self
._pagefunc
= pagefunc
3671 self
._pagesize
= pagesize
3672 self
._use
_cache
= use_cache
3676 def getslice(self
, start
=0, end
=None):
3678 for pagenum
in itertools
.count(start
// self
._pagesize
):
3679 firstid
= pagenum
* self
._pagesize
3680 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
3681 if start
>= nextfirstid
:
3686 page_results
= self
._cache
.get(pagenum
)
3687 if page_results
is None:
3688 page_results
= list(self
._pagefunc
(pagenum
))
3690 self
._cache
[pagenum
] = page_results
3693 start
% self
._pagesize
3694 if firstid
<= start
< nextfirstid
3698 ((end
- 1) % self
._pagesize
) + 1
3699 if (end
is not None and firstid
<= end
<= nextfirstid
)
3702 if startv
!= 0 or endv
is not None:
3703 page_results
= page_results
[startv
:endv
]
3704 res
.extend(page_results
)
3706 # A little optimization - if current page is not "full", ie. does
3707 # not contain page_size videos then we can assume that this page
3708 # is the last one - there are no more ids on further pages -
3709 # i.e. no need to query again.
3710 if len(page_results
) + startv
< self
._pagesize
:
3713 # If we got the whole page, but the next page is not interesting,
3714 # break out early as well
3715 if end
== nextfirstid
:
3720 class InAdvancePagedList(PagedList
):
3721 def __init__(self
, pagefunc
, pagecount
, pagesize
):
3722 self
._pagefunc
= pagefunc
3723 self
._pagecount
= pagecount
3724 self
._pagesize
= pagesize
3726 def getslice(self
, start
=0, end
=None):
3728 start_page
= start
// self
._pagesize
3730 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
3731 skip_elems
= start
- start_page
* self
._pagesize
3732 only_more
= None if end
is None else end
- start
3733 for pagenum
in range(start_page
, end_page
):
3734 page
= list(self
._pagefunc
(pagenum
))
3736 page
= page
[skip_elems
:]
3738 if only_more
is not None:
3739 if len(page
) < only_more
:
3740 only_more
-= len(page
)
3742 page
= page
[:only_more
]
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
3765 def escape_rfc3986(s
):
3766 """Escape non-ASCII characters as suggested by RFC 3986"""
3767 if sys
.version_info
< (3, 0) and isinstance(s
, compat_str
):
3768 s
= s
.encode('utf-8')
3769 return compat_urllib_parse
.quote(s
, b
"%/;:@&=+$,!~*'()?#[]")
3772 def escape_url(url
):
3773 """Escape URL as suggested by RFC 3986"""
3774 url_parsed
= compat_urllib_parse_urlparse(url
)
3775 return url_parsed
._replace
(
3776 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
3777 path
=escape_rfc3986(url_parsed
.path
),
3778 params
=escape_rfc3986(url_parsed
.params
),
3779 query
=escape_rfc3986(url_parsed
.query
),
3780 fragment
=escape_rfc3986(url_parsed
.fragment
)
3784 def read_batch_urls(batch_fd
):
3786 if not isinstance(url
, compat_str
):
3787 url
= url
.decode('utf-8', 'replace')
3788 BOM_UTF8
= '\xef\xbb\xbf'
3789 if url
.startswith(BOM_UTF8
):
3790 url
= url
[len(BOM_UTF8
):]
3792 if url
.startswith(('#', ';', ']')):
3796 with contextlib
.closing(batch_fd
) as fd
:
3797 return [url
for url
in map(fixup
, fd
) if url
]
3800 def urlencode_postdata(*args
, **kargs
):
3801 return compat_urllib_parse_urlencode(*args
, **kargs
).encode('ascii')
3804 def update_url_query(url
, query
):
3807 parsed_url
= compat_urlparse
.urlparse(url
)
3808 qs
= compat_parse_qs(parsed_url
.query
)
3810 return compat_urlparse
.urlunparse(parsed_url
._replace
(
3811 query
=compat_urllib_parse_urlencode(qs
, True)))
3814 def update_Request(req
, url
=None, data
=None, headers
={}, query
={}):
3815 req_headers
= req
.headers
.copy()
3816 req_headers
.update(headers
)
3817 req_data
= data
or req
.data
3818 req_url
= update_url_query(url
or req
.get_full_url(), query
)
3819 req_get_method
= req
.get_method()
3820 if req_get_method
== 'HEAD':
3821 req_type
= HEADRequest
3822 elif req_get_method
== 'PUT':
3823 req_type
= PUTRequest
3825 req_type
= compat_urllib_request
.Request
3827 req_url
, data
=req_data
, headers
=req_headers
,
3828 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
3829 if hasattr(req
, 'timeout'):
3830 new_req
.timeout
= req
.timeout
3834 def _multipart_encode_impl(data
, boundary
):
3835 content_type
= 'multipart/form-data; boundary=%s' % boundary
3838 for k
, v
in data
.items():
3839 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
3840 if isinstance(k
, compat_str
):
3841 k
= k
.encode('utf-8')
3842 if isinstance(v
, compat_str
):
3843 v
= v
.encode('utf-8')
3844 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3845 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3846 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
3847 if boundary
.encode('ascii') in content
:
3848 raise ValueError('Boundary overlaps with data')
3851 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
3853 return out
, content_type
3856 def multipart_encode(data
, boundary
=None):
3858 Encode a dict to RFC 7578-compliant form-data
3861 A dict where keys and values can be either Unicode or bytes-like
3864 If specified a Unicode object, it's used as the boundary. Otherwise
3865 a random boundary is generated.
3867 Reference: https://tools.ietf.org/html/rfc7578
3869 has_specified_boundary
= boundary
is not None
3872 if boundary
is None:
3873 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
3876 out
, content_type
= _multipart_encode_impl(data
, boundary
)
3879 if has_specified_boundary
:
3883 return out
, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up a key, or the first usable key of several, in d.

    When key_or_keys is a list/tuple, a candidate is skipped if missing,
    None, or (with skip_false_values) falsy; default is returned when no
    candidate qualifies.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d or d[key] is None or skip_false_values and not d[key]:
            continue
        return d[key]
    return default
def try_get(src, getter, expected_type=None):
    """Apply each getter callable to src and return the first result that
    neither raises a common access error nor fails the optional type check;
    return None otherwise."""
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for get in getters:
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            pass
        else:
            if expected_type is None or isinstance(v, expected_type):
                return v
3909 def merge_dicts(*dicts
):
3911 for a_dict
in dicts
:
3912 for k
, v
in a_dict
.items():
3916 or (isinstance(v
, compat_str
) and v
3917 and isinstance(merged
[k
], compat_str
)
3918 and not merged
[k
])):
3923 def encode_compat_str(string
, encoding
=preferredencoding(), errors
='strict'):
3924 return string
if isinstance(string
, compat_str
) else compat_str(string
, encoding
, errors
)
3936 TV_PARENTAL_GUIDELINES
= {
3946 def parse_age_limit(s
):
3948 return s
if 0 <= s
<= 21 else None
3949 if not isinstance(s
, compat_basestring
):
3951 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
3953 return int(m
.group('age'))
3955 return US_RATINGS
[s
]
3956 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
3958 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, leaving only the JSON payload."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
3972 def js_to_json(code
):
3973 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
3974 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
3976 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
3977 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
3982 if v
in ('true', 'false', 'null'):
3984 elif v
.startswith('/*') or v
.startswith('//') or v
== ',':
3987 if v
[0] in ("'", '"'):
3988 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
3993 }.get(m
.group(0), m
.group(0)), v
[1:-1])
3995 for regex
, base
in INTEGER_TABLE
:
3996 im
= re
.match(regex
, v
)
3998 i
= int(im
.group(1), base
)
3999 return '"%d":' % i
if v
.endswith(':') else '%d' % i
4003 return re
.sub(r
'''(?sx)
4004 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4005 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4006 {comment}|,(?={skip}[\]}}])|
4007 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4008 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4010 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
def qualities(quality_ids):
    """Get a numeric quality value out of a list of possible values."""
    def q(qid):
        # Unknown qualities rank below every known one.
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
4023 DEFAULT_OUTTMPL
= '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """Truncate overly long strings, appending an ellipsis; None passes
    through."""
    if s is None:
        return None
    ellipses = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ellipses)] + ellipses
def version_tuple(v):
    """Split a dot/dash-separated version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Check whether version is older than limit.

    Empty or unparsable input yields the pessimistic answer implied by
    assume_new.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
def ytdl_is_updateable():
    """Return True when youtube-dl runs from a zip bundle or a frozen
    binary, i.e. can be updated in place with -U."""
    from zipimport import zipimporter
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = (compat_shlex_quote(arg) for arg in args)
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Return a text (unicode) representation of an exception, py2-safe."""
    text = str(err)
    if sys.version_info[0] >= 3:
        return text
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return text.decode(preferredencoding())
def mimetype2ext(mt):
    """Map a MIME type string to a file extension (or None for None input)."""
    if mt is None:
        return None

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    # fall back to the subtype only, with any parameters stripped
    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }.get(res, res)  # unknown subtypes pass through unchanged
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs attribute (e.g. "avc1.64001f, mp4a.40.2")
    into a {'vcodec': ..., 'acodec': ...} dict; 'none' marks a missing track.

    Returns {} when nothing could be determined.
    See http://tools.ietf.org/html/rfc6381
    """
    if not codecs_str:
        return {}
    # NOTE: the lambda parameter was previously named `str`, shadowing the
    # builtin; also fixed the misspelled local `splited_codecs`.
    split_codecs = list(filter(None, map(
        lambda s: s.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        # the leading dotted component identifies the codec family
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # nothing recognized: with exactly two entries assume video+audio order
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a urllib response handle: the
    Content-Disposition filename first, then the Content-Type MIME type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI with a base64-encoded payload."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # no user limit set, or content available for everyone
        return False
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Order matters: utf-32 BOMs are prefixes of utf-16 BOMs.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, encoding in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if text is None:
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
def determine_protocol(info_dict):
    """Return the download protocol for an info dict: an explicit 'protocol'
    key wins, then URL-prefix heuristics, then extension, then URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # widest cell per column decides that column's width
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    """Evaluate one filter expression (e.g. "height >= 720", "uploader = foo",
    "!is_live") against dict `dct`. Raises ValueError on malformed input."""
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None
                or m.group('strval') is not None
                # If the original field is a string and matching comparisonvalue is
                # a number we should respect the origin of the original field
                # and process comparison value as a string (see
                # https://github.com/ytdl-org/youtube-dl/issues/11082).
                or actual_value is not None and m.group('intval') is not None
                and isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # unescape the quote character inside quoted values
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # not a plain integer: try a filesize suffix (e.g. 500k, 1.2MiB)
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # missing key only passes when the '?' (none-inclusive) marker was used
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None when the video passes,
    otherwise a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression into seconds (float), or None."""
    if not time_expr:
        return None

    offset_m = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_m:
        return float(offset_m.group('time_offset'))

    clock_m = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_m:
        hours, minutes, secs = clock_m.groups()
        # frame-style 'SS:FF' seconds are treated as a decimal fraction
        return 3600 * int(hours) + 60 * int(minutes) + float(secs.replace(':', '.'))
    return None
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Fold legacy TTAF namespaces into the current TTML ones before parsing.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # Styling attributes translated into SRT font/b/i/u markup.
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        # Streaming XML target that renders one <p> element to SRT markup.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # skip properties already applied by an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the node and feed it through the streaming parser above.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance iteratively until every parent is known.
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style on <body>/<div> becomes the document-wide default style.
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Render an option with a value as CLI args; [] when the param is unset."""
    value = params.get(param)
    if value:
        value = compat_str(value)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean option as CLI args; [] when the param is unset."""
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=None):
    """Fetch an "extra CLI args" list from params.

    Returns the stored list, or `default` (an empty list when not given).
    The previous signature used a mutable default argument (`default=[]`),
    which callers could accidentally share and mutate across calls; the
    `None` sentinel returns a fresh list instead while keeping the same
    observable result for existing callers.

    Raises AssertionError if the stored value is not a list.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return [] if default is None else default
    assert isinstance(ex_args, list)
    return ex_args
4514 class ISO639Utils(object):
4515 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4574 'iw': 'heb', # Replaced by he in 1989 revision
4584 'in': 'ind', # Replaced by id in 1989 revision
4699 'ji': 'yid', # Replaced by yi in 1989 revision
    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters participate in the lookup, so longer
        # inputs (e.g. 'en-US') still resolve by their primary subtag.
        return cls._lang_map.get(code[:2])
    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the map; returns None implicitly when not found.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
4719 class ISO3166Utils(object):
4720 # From http://data.okfn.org/data/core/country-list
4722 'AF': 'Afghanistan',
4723 'AX': 'Ć
land Islands',
4726 'AS': 'American Samoa',
4731 'AG': 'Antigua and Barbuda',
4748 'BO': 'Bolivia, Plurinational State of',
4749 'BQ': 'Bonaire, Sint Eustatius and Saba',
4750 'BA': 'Bosnia and Herzegovina',
4752 'BV': 'Bouvet Island',
4754 'IO': 'British Indian Ocean Territory',
4755 'BN': 'Brunei Darussalam',
4757 'BF': 'Burkina Faso',
4763 'KY': 'Cayman Islands',
4764 'CF': 'Central African Republic',
4768 'CX': 'Christmas Island',
4769 'CC': 'Cocos (Keeling) Islands',
4773 'CD': 'Congo, the Democratic Republic of the',
4774 'CK': 'Cook Islands',
4776 'CI': 'CĆ“te d\'Ivoire',
4781 'CZ': 'Czech Republic',
4785 'DO': 'Dominican Republic',
4788 'SV': 'El Salvador',
4789 'GQ': 'Equatorial Guinea',
4793 'FK': 'Falkland Islands (Malvinas)',
4794 'FO': 'Faroe Islands',
4798 'GF': 'French Guiana',
4799 'PF': 'French Polynesia',
4800 'TF': 'French Southern Territories',
4815 'GW': 'Guinea-Bissau',
4818 'HM': 'Heard Island and McDonald Islands',
4819 'VA': 'Holy See (Vatican City State)',
4826 'IR': 'Iran, Islamic Republic of',
4829 'IM': 'Isle of Man',
4839 'KP': 'Korea, Democratic People\'s Republic of',
4840 'KR': 'Korea, Republic of',
4843 'LA': 'Lao People\'s Democratic Republic',
4849 'LI': 'Liechtenstein',
4853 'MK': 'Macedonia, the Former Yugoslav Republic of',
4860 'MH': 'Marshall Islands',
4866 'FM': 'Micronesia, Federated States of',
4867 'MD': 'Moldova, Republic of',
4878 'NL': 'Netherlands',
4879 'NC': 'New Caledonia',
4880 'NZ': 'New Zealand',
4885 'NF': 'Norfolk Island',
4886 'MP': 'Northern Mariana Islands',
4891 'PS': 'Palestine, State of',
4893 'PG': 'Papua New Guinea',
4896 'PH': 'Philippines',
4900 'PR': 'Puerto Rico',
4904 'RU': 'Russian Federation',
4906 'BL': 'Saint BarthƩlemy',
4907 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4908 'KN': 'Saint Kitts and Nevis',
4909 'LC': 'Saint Lucia',
4910 'MF': 'Saint Martin (French part)',
4911 'PM': 'Saint Pierre and Miquelon',
4912 'VC': 'Saint Vincent and the Grenadines',
4915 'ST': 'Sao Tome and Principe',
4916 'SA': 'Saudi Arabia',
4920 'SL': 'Sierra Leone',
4922 'SX': 'Sint Maarten (Dutch part)',
4925 'SB': 'Solomon Islands',
4927 'ZA': 'South Africa',
4928 'GS': 'South Georgia and the South Sandwich Islands',
4929 'SS': 'South Sudan',
4934 'SJ': 'Svalbard and Jan Mayen',
4937 'CH': 'Switzerland',
4938 'SY': 'Syrian Arab Republic',
4939 'TW': 'Taiwan, Province of China',
4941 'TZ': 'Tanzania, United Republic of',
4943 'TL': 'Timor-Leste',
4947 'TT': 'Trinidad and Tobago',
4950 'TM': 'Turkmenistan',
4951 'TC': 'Turks and Caicos Islands',
4955 'AE': 'United Arab Emirates',
4956 'GB': 'United Kingdom',
4957 'US': 'United States',
4958 'UM': 'United States Minor Outlying Islands',
4962 'VE': 'Venezuela, Bolivarian Republic of',
4964 'VG': 'Virgin Islands, British',
4965 'VI': 'Virgin Islands, U.S.',
4966 'WF': 'Wallis and Futuna',
4967 'EH': 'Western Sahara',
    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive: map keys are upper-case two-letter codes.
        return cls._country_map.get(code.upper())
4979 class GeoUtils(object):
4980 # Major IPv4 address blocks per country
4982 'AD': '85.94.160.0/19',
4983 'AE': '94.200.0.0/13',
4984 'AF': '149.54.0.0/17',
4985 'AG': '209.59.64.0/18',
4986 'AI': '204.14.248.0/21',
4987 'AL': '46.99.0.0/16',
4988 'AM': '46.70.0.0/15',
4989 'AO': '105.168.0.0/13',
4990 'AP': '159.117.192.0/21',
4991 'AR': '181.0.0.0/12',
4992 'AS': '202.70.112.0/20',
4993 'AT': '84.112.0.0/13',
4994 'AU': '1.128.0.0/11',
4995 'AW': '181.41.0.0/18',
4996 'AZ': '5.191.0.0/16',
4997 'BA': '31.176.128.0/17',
4998 'BB': '65.48.128.0/17',
4999 'BD': '114.130.0.0/16',
5001 'BF': '129.45.128.0/17',
5002 'BG': '95.42.0.0/15',
5003 'BH': '37.131.0.0/17',
5004 'BI': '154.117.192.0/18',
5005 'BJ': '137.255.0.0/16',
5006 'BL': '192.131.134.0/24',
5007 'BM': '196.12.64.0/18',
5008 'BN': '156.31.0.0/16',
5009 'BO': '161.56.0.0/16',
5010 'BQ': '161.0.80.0/20',
5011 'BR': '152.240.0.0/12',
5012 'BS': '24.51.64.0/18',
5013 'BT': '119.2.96.0/19',
5014 'BW': '168.167.0.0/16',
5015 'BY': '178.120.0.0/13',
5016 'BZ': '179.42.192.0/18',
5017 'CA': '99.224.0.0/11',
5018 'CD': '41.243.0.0/16',
5019 'CF': '196.32.200.0/21',
5020 'CG': '197.214.128.0/17',
5021 'CH': '85.0.0.0/13',
5022 'CI': '154.232.0.0/14',
5023 'CK': '202.65.32.0/19',
5024 'CL': '152.172.0.0/14',
5025 'CM': '165.210.0.0/15',
5026 'CN': '36.128.0.0/10',
5027 'CO': '181.240.0.0/12',
5028 'CR': '201.192.0.0/12',
5029 'CU': '152.206.0.0/15',
5030 'CV': '165.90.96.0/19',
5031 'CW': '190.88.128.0/17',
5032 'CY': '46.198.0.0/15',
5033 'CZ': '88.100.0.0/14',
5035 'DJ': '197.241.0.0/17',
5036 'DK': '87.48.0.0/12',
5037 'DM': '192.243.48.0/20',
5038 'DO': '152.166.0.0/15',
5039 'DZ': '41.96.0.0/12',
5040 'EC': '186.68.0.0/15',
5041 'EE': '90.190.0.0/15',
5042 'EG': '156.160.0.0/11',
5043 'ER': '196.200.96.0/20',
5044 'ES': '88.0.0.0/11',
5045 'ET': '196.188.0.0/14',
5046 'EU': '2.16.0.0/13',
5047 'FI': '91.152.0.0/13',
5048 'FJ': '144.120.0.0/16',
5049 'FM': '119.252.112.0/20',
5050 'FO': '88.85.32.0/19',
5052 'GA': '41.158.0.0/15',
5054 'GD': '74.122.88.0/21',
5055 'GE': '31.146.0.0/16',
5056 'GF': '161.22.64.0/18',
5057 'GG': '62.68.160.0/19',
5058 'GH': '45.208.0.0/14',
5059 'GI': '85.115.128.0/19',
5060 'GL': '88.83.0.0/19',
5061 'GM': '160.182.0.0/15',
5062 'GN': '197.149.192.0/18',
5063 'GP': '104.250.0.0/19',
5064 'GQ': '105.235.224.0/20',
5065 'GR': '94.64.0.0/13',
5066 'GT': '168.234.0.0/16',
5067 'GU': '168.123.0.0/16',
5068 'GW': '197.214.80.0/20',
5069 'GY': '181.41.64.0/18',
5070 'HK': '113.252.0.0/14',
5071 'HN': '181.210.0.0/16',
5072 'HR': '93.136.0.0/13',
5073 'HT': '148.102.128.0/17',
5074 'HU': '84.0.0.0/14',
5075 'ID': '39.192.0.0/10',
5076 'IE': '87.32.0.0/12',
5077 'IL': '79.176.0.0/13',
5078 'IM': '5.62.80.0/20',
5079 'IN': '117.192.0.0/10',
5080 'IO': '203.83.48.0/21',
5081 'IQ': '37.236.0.0/14',
5082 'IR': '2.176.0.0/12',
5083 'IS': '82.221.0.0/16',
5084 'IT': '79.0.0.0/10',
5085 'JE': '87.244.64.0/18',
5086 'JM': '72.27.0.0/17',
5087 'JO': '176.29.0.0/16',
5088 'JP': '126.0.0.0/8',
5089 'KE': '105.48.0.0/12',
5090 'KG': '158.181.128.0/17',
5091 'KH': '36.37.128.0/17',
5092 'KI': '103.25.140.0/22',
5093 'KM': '197.255.224.0/20',
5094 'KN': '198.32.32.0/19',
5095 'KP': '175.45.176.0/22',
5096 'KR': '175.192.0.0/10',
5097 'KW': '37.36.0.0/14',
5098 'KY': '64.96.0.0/15',
5099 'KZ': '2.72.0.0/13',
5100 'LA': '115.84.64.0/18',
5101 'LB': '178.135.0.0/16',
5102 'LC': '192.147.231.0/24',
5103 'LI': '82.117.0.0/19',
5104 'LK': '112.134.0.0/15',
5105 'LR': '41.86.0.0/19',
5106 'LS': '129.232.0.0/17',
5107 'LT': '78.56.0.0/13',
5108 'LU': '188.42.0.0/16',
5109 'LV': '46.109.0.0/16',
5110 'LY': '41.252.0.0/14',
5111 'MA': '105.128.0.0/11',
5112 'MC': '88.209.64.0/18',
5113 'MD': '37.246.0.0/16',
5114 'ME': '178.175.0.0/17',
5115 'MF': '74.112.232.0/21',
5116 'MG': '154.126.0.0/17',
5117 'MH': '117.103.88.0/21',
5118 'MK': '77.28.0.0/15',
5119 'ML': '154.118.128.0/18',
5120 'MM': '37.111.0.0/17',
5121 'MN': '49.0.128.0/17',
5122 'MO': '60.246.0.0/16',
5123 'MP': '202.88.64.0/20',
5124 'MQ': '109.203.224.0/19',
5125 'MR': '41.188.64.0/18',
5126 'MS': '208.90.112.0/22',
5127 'MT': '46.11.0.0/16',
5128 'MU': '105.16.0.0/12',
5129 'MV': '27.114.128.0/18',
5130 'MW': '105.234.0.0/16',
5131 'MX': '187.192.0.0/11',
5132 'MY': '175.136.0.0/13',
5133 'MZ': '197.218.0.0/15',
5134 'NA': '41.182.0.0/16',
5135 'NC': '101.101.0.0/18',
5136 'NE': '197.214.0.0/18',
5137 'NF': '203.17.240.0/22',
5138 'NG': '105.112.0.0/12',
5139 'NI': '186.76.0.0/15',
5140 'NL': '145.96.0.0/11',
5141 'NO': '84.208.0.0/13',
5142 'NP': '36.252.0.0/15',
5143 'NR': '203.98.224.0/19',
5144 'NU': '49.156.48.0/22',
5145 'NZ': '49.224.0.0/14',
5146 'OM': '5.36.0.0/15',
5147 'PA': '186.72.0.0/15',
5148 'PE': '186.160.0.0/14',
5149 'PF': '123.50.64.0/18',
5150 'PG': '124.240.192.0/19',
5151 'PH': '49.144.0.0/13',
5152 'PK': '39.32.0.0/11',
5153 'PL': '83.0.0.0/11',
5154 'PM': '70.36.0.0/20',
5155 'PR': '66.50.0.0/16',
5156 'PS': '188.161.0.0/16',
5157 'PT': '85.240.0.0/13',
5158 'PW': '202.124.224.0/20',
5159 'PY': '181.120.0.0/14',
5160 'QA': '37.210.0.0/15',
5161 'RE': '139.26.0.0/16',
5162 'RO': '79.112.0.0/13',
5163 'RS': '178.220.0.0/14',
5164 'RU': '5.136.0.0/13',
5165 'RW': '105.178.0.0/15',
5166 'SA': '188.48.0.0/13',
5167 'SB': '202.1.160.0/19',
5168 'SC': '154.192.0.0/11',
5169 'SD': '154.96.0.0/13',
5170 'SE': '78.64.0.0/12',
5171 'SG': '152.56.0.0/14',
5172 'SI': '188.196.0.0/14',
5173 'SK': '78.98.0.0/15',
5174 'SL': '197.215.0.0/17',
5175 'SM': '89.186.32.0/19',
5176 'SN': '41.82.0.0/15',
5177 'SO': '197.220.64.0/19',
5178 'SR': '186.179.128.0/17',
5179 'SS': '105.235.208.0/21',
5180 'ST': '197.159.160.0/19',
5181 'SV': '168.243.0.0/16',
5182 'SX': '190.102.0.0/20',
5184 'SZ': '41.84.224.0/19',
5185 'TC': '65.255.48.0/20',
5186 'TD': '154.68.128.0/19',
5187 'TG': '196.168.0.0/14',
5188 'TH': '171.96.0.0/13',
5189 'TJ': '85.9.128.0/18',
5190 'TK': '27.96.24.0/21',
5191 'TL': '180.189.160.0/20',
5192 'TM': '95.85.96.0/19',
5193 'TN': '197.0.0.0/11',
5194 'TO': '175.176.144.0/21',
5195 'TR': '78.160.0.0/11',
5196 'TT': '186.44.0.0/15',
5197 'TV': '202.2.96.0/19',
5198 'TW': '120.96.0.0/11',
5199 'TZ': '156.156.0.0/14',
5200 'UA': '93.72.0.0/13',
5201 'UG': '154.224.0.0/13',
5203 'UY': '167.56.0.0/13',
5204 'UZ': '82.215.64.0/18',
5205 'VA': '212.77.0.0/19',
5206 'VC': '24.92.144.0/20',
5207 'VE': '186.88.0.0/13',
5208 'VG': '172.103.64.0/18',
5209 'VI': '146.226.0.0/16',
5210 'VN': '14.160.0.0/11',
5211 'VU': '202.80.32.0/20',
5212 'WF': '117.20.32.0/21',
5213 'WS': '202.4.32.0/19',
5214 'YE': '134.35.0.0/16',
5215 'YT': '41.242.116.0/22',
5216 'ZA': '41.0.0.0/11',
5217 'ZM': '165.56.0.0/13',
5218 'ZW': '41.85.192.0/19',
    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (as text) inside a country's known
        block (2-letter code looked up in _country_ip_map) or an explicit
        CIDR block; None for an unknown country code."""
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        # host bits may take any value within the block
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header,
    falling back to the handler-level proxies otherwise."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    # default arguments bind the current loop values so each
                    # scheme gets its own handler (late-binding closure trap)
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5261 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5262 # released into Public Domain
5263 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        # emit 32 bits at a time, most significant word first
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        # left-pad with zero bytes so the input splits into 32-bit words
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # little-endian interpretation: reverse before hex-decoding
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # PKCS#1 v1.5 (RFC 8017, section 7.2.1) requires the padding string PS to
    # consist of NON-ZERO octets: a zero byte would prematurely terminate the
    # padding during decryption. randint(1, 255) enforces that; the previous
    # randint(0, 254) could emit zero bytes (and never produced 255).
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer in base `n` using `table` as digits
    (defaults to 0-9a-zA-Z truncated to the base)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Decode P.A.C.K.E.R.-style obfuscated JavaScript back to source."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfucasted_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        # empty symbol slots map back to their own base-n representation
        symbol_table[base_n_count] = symbols[count] or base_n_count

    # substitute every word token with its entry from the symbol table
    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfucasted_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict; quoted values are unquoted."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's >>> operator)."""
    if val >= 0:
        return val >> n
    return (val + 0x100000000) >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a (truecolor, 8-bit) PNG into (width, height, pixels) where
    pixels is a list of rows of raw byte values.

    Raises IOError on an invalid PNG or missing image data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, data, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # image data may be split over several IDAT chunks
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3  # 3 bytes per pixel (RGB), no alpha handling
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # each scanline is prefixed by one filter-type byte
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # undo the per-scanline filter (PNG spec section 9)
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set extended attribute `key` to bytes `value` on file `path`, using
    the best available backend (pyxattr/xattr modules, NTFS ADS on Windows,
    or the setfattr/xattr CLI tools).

    Raises XAttrMetadataError on backend failure, XAttrUnavailableError when
    no usable backend exists.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # CLI tools take the value as a text argument
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'xattr' module, "
                    "or the 'xattr' binary.")
5584 def random_birthday(year_field
, month_field
, day_field
):
5585 start_date
= datetime
.date(1950, 1, 1)
5586 end_date
= datetime
.date(1995, 12, 31)
5587 offset
= random
.randint(0, (end_date
- start_date
).days
)
5588 random_date
= start_date
+ datetime
.timedelta(offset
)
5590 year_field
: str(random_date
.year
),
5591 month_field
: str(random_date
.month
),
5592 day_field
: str(random_date
.day
),