Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import socket
  28 import ssl
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_HTMLParser,
  38     compat_basestring,
  39     compat_chr,
  40     compat_etree_fromstring,
  41     compat_html_entities,
  42     compat_html_entities_html5,
  43     compat_http_client,
  44     compat_kwargs,
  45     compat_os_name,
  46     compat_parse_qs,
  47     compat_shlex_quote,
  48     compat_socket_create_connection,
  49     compat_str,
  50     compat_struct_pack,
  51     compat_struct_unpack,
  52     compat_urllib_error,
  53     compat_urllib_parse,
  54     compat_urllib_parse_urlencode,
  55     compat_urllib_parse_urlparse,
  56     compat_urllib_parse_unquote_plus,
  57     compat_urllib_request,
  58     compat_urlparse,
  59     compat_xpath,
  60 )
  61
  62 from .socks import (
  63     ProxyType,
  64     sockssocket,
  65 )
  66
  67
  68 def register_socks_protocols():
  69     # "Register" SOCKS protocols
  70     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  71     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  72     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  73         if scheme not in compat_urlparse.uses_netloc:
  74             compat_urlparse.uses_netloc.append(scheme)
  75
  76
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds each of these to a
# request that does not already carry the header
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel for "no default supplied", so that None stays usable as a real
# default value (see the xpath_* helpers)
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names in calendar order, keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# NOTE(review): presumably the media/manifest file extensions the program
# recognises; the consumers are not visible in this part of the file
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement; the list items in the
# chain are the multi-character replacements ('AE', 'OE', 'ss', 'ae', 'oe')
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 126
 127 DATE_FORMATS = (
 128     '%d %B %Y',
 129     '%d %b %Y',
 130     '%B %d %Y',
 131     '%B %dst %Y',
 132     '%B %dnd %Y',
 133     '%B %dth %Y',
 134     '%b %d %Y',
 135     '%b %dst %Y',
 136     '%b %dnd %Y',
 137     '%b %dth %Y',
 138     '%b %dst %Y %I:%M',
 139     '%b %dnd %Y %I:%M',
 140     '%b %dth %Y %I:%M',
 141     '%Y %m %d',
 142     '%Y-%m-%d',
 143     '%Y/%m/%d',
 144     '%Y/%m/%d %H:%M',
 145     '%Y/%m/%d %H:%M:%S',
 146     '%Y-%m-%d %H:%M',
 147     '%Y-%m-%d %H:%M:%S',
 148     '%Y-%m-%d %H:%M:%S.%f',
 149     '%d.%m.%Y %H:%M',
 150     '%d.%m.%Y %H.%M',
 151     '%Y-%m-%dT%H:%M:%SZ',
 152     '%Y-%m-%dT%H:%M:%S.%fZ',
 153     '%Y-%m-%dT%H:%M:%S.%f0Z',
 154     '%Y-%m-%dT%H:%M:%S',
 155     '%Y-%m-%dT%H:%M:%S.%f',
 156     '%Y-%m-%dT%H:%M',
 157     '%b %d %Y at %H:%M',
 158     '%b %d %Y at %H:%M:%S',
 159 )
 160
 161 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 162 DATE_FORMATS_DAY_FIRST.extend([
 163     '%d-%m-%Y',
 164     '%d.%m.%Y',
 165     '%d.%m.%y',
 166     '%d/%m/%Y',
 167     '%d/%m/%y',
 168     '%d/%m/%Y %H:%M:%S',
 169 ])
 170
 171 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 172 DATE_FORMATS_MONTH_FIRST.extend([
 173     '%m-%d-%Y',
 174     '%m.%d.%Y',
 175     '%m/%d/%Y',
 176     '%m/%d/%y',
 177     '%m/%d/%Y %H:%M:%S',
 178 ])
 179
 180 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 181
 182
 183 def preferredencoding():
 184     """Get preferred encoding.
 185
 186     Returns the best encoding scheme for the system, based on
 187     locale.getpreferredencoding() and some further tweaks.
 188     """
 189     try:
 190         pref = locale.getpreferredencoding()
 191         'TEST'.encode(pref)
 192     except Exception:
 193         pref = 'UTF-8'
 194
 195     return pref
 196
 197
 198 def write_json_file(obj, fn):
 199     """ Encode obj as JSON and write it to fn, atomically if possible """
 200
 201     fn = encodeFilename(fn)
 202     if sys.version_info < (3, 0) and sys.platform != 'win32':
 203         encoding = get_filesystem_encoding()
 204         # os.path.basename returns a bytes object, but NamedTemporaryFile
 205         # will fail if the filename contains non ascii characters unless we
 206         # use a unicode object
 207         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 208         # the same for os.path.dirname
 209         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 210     else:
 211         path_basename = os.path.basename
 212         path_dirname = os.path.dirname
 213
 214     args = {
 215         'suffix': '.tmp',
 216         'prefix': path_basename(fn) + '.',
 217         'dir': path_dirname(fn),
 218         'delete': False,
 219     }
 220
 221     # In Python 2.x, json.dump expects a bytestream.
 222     # In Python 3.x, it writes to a character stream
 223     if sys.version_info < (3, 0):
 224         args['mode'] = 'wb'
 225     else:
 226         args.update({
 227             'mode': 'w',
 228             'encoding': 'utf-8',
 229         })
 230
 231     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 232
 233     try:
 234         with tf:
 235             json.dump(obj, tf)
 236         if sys.platform == 'win32':
 237             # Need to remove existing file on Windows, else os.rename raises
 238             # WindowsError or FileExistsError.
 239             try:
 240                 os.unlink(fn)
 241             except OSError:
 242                 pass
 243         os.rename(tf.name, fn)
 244     except Exception:
 245         try:
 246             os.remove(tf.name)
 247         except OSError:
 248             pass
 249         raise
 250
 251
 252 if sys.version_info >= (2, 7):
 253     def find_xpath_attr(node, xpath, key, val=None):
 254         """ Find the xpath xpath[@key=val] """
 255         assert re.match(r'^[a-zA-Z_-]+$', key)
 256         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 257         return node.find(expr)
 258 else:
 259     def find_xpath_attr(node, xpath, key, val=None):
 260         for f in node.findall(compat_xpath(xpath)):
 261             if key not in f.attrib:
 262                 continue
 263             if val is None or f.attrib.get(key) == val:
 264                 return f
 265         return None
 266
 267 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 268 # the namespace parameter
 269
 270
 271 def xpath_with_ns(path, ns_map):
 272     components = [c.split(':') for c in path.split('/')]
 273     replaced = []
 274     for c in components:
 275         if len(c) == 1:
 276             replaced.append(c[0])
 277         else:
 278             ns, tag = c
 279             replaced.append('{%s}%s' % (ns_map[ns], tag))
 280     return '/'.join(replaced)
 281
 282
 283 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 284     def _find_xpath(xpath):
 285         return node.find(compat_xpath(xpath))
 286
 287     if isinstance(xpath, (str, compat_str)):
 288         n = _find_xpath(xpath)
 289     else:
 290         for xp in xpath:
 291             n = _find_xpath(xp)
 292             if n is not None:
 293                 break
 294
 295     if n is None:
 296         if default is not NO_DEFAULT:
 297             return default
 298         elif fatal:
 299             name = xpath if name is None else name
 300             raise ExtractorError('Could not find XML element %s' % name)
 301         else:
 302             return None
 303     return n
 304
 305
 306 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 307     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 308     if n is None or n == default:
 309         return n
 310     if n.text is None:
 311         if default is not NO_DEFAULT:
 312             return default
 313         elif fatal:
 314             name = xpath if name is None else name
 315             raise ExtractorError('Could not find XML element\'s text %s' % name)
 316         else:
 317             return None
 318     return n.text
 319
 320
 321 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 322     n = find_xpath_attr(node, xpath, key)
 323     if n is None:
 324         if default is not NO_DEFAULT:
 325             return default
 326         elif fatal:
 327             name = '%s[@%s]' % (xpath, key) if name is None else name
 328             raise ExtractorError('Could not find XML attribute %s' % name)
 329         else:
 330             return None
 331     return n.attrib[key]
 332
 333
 334 def get_element_by_id(id, html):
 335     """Return the content of the tag with the specified ID in the passed HTML document"""
 336     return get_element_by_attribute('id', id, html)
 337
 338
 339 def get_element_by_class(class_name, html):
 340     return get_element_by_attribute(
 341         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 342         html, escape_value=False)
 343
 344
 345 def get_element_by_attribute(attribute, value, html, escape_value=True):
 346     """Return the content of the tag with the specified attribute in the passed HTML document"""
 347
 348     value = re.escape(value) if escape_value else value
 349
 350     m = re.search(r'''(?xs)
 351         <([a-zA-Z0-9:._-]+)
 352          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 353          \s+%s=['"]?%s['"]?
 354          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 355         \s*>
 356         (?P<content>.*?)
 357         </\1>
 358     ''' % (re.escape(attribute), value), html)
 359
 360     if not m:
 361         return None
 362     res = m.group('content')
 363
 364     if res.startswith('"') or res.startswith("'"):
 365         res = res[1:-1]
 366
 367     return unescapeHTML(res)
 368
 369
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Attributes of the most recently parsed start tag; empty until one is seen
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Each start tag overwrites the previous result, so after feeding a
        # single element this holds that element's attributes
        self.attrs = dict(attrs)
 378
 379
 380 def extract_attributes(html_element):
 381     """Given a string for an HTML element such as
 382     <el
 383          a="foo" B="bar" c="&98;az" d=boz
 384          empty= noval entity="&amp;"
 385          sq='"' dq="'"
 386     >
 387     Decode and return a dictionary of attributes.
 388     {
 389         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 390         'empty': '', 'noval': None, 'entity': '&',
 391         'sq': '"', 'dq': '\''
 392     }.
 393     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 394     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 395     """
 396     parser = HTMLAttributeParser()
 397     parser.feed(html_element)
 398     parser.close()
 399     return parser.attrs
 400
 401
 402 def clean_html(html):
 403     """Clean an HTML snippet into a readable string"""
 404
 405     if html is None:  # Convenience for sanitizing descriptions etc.
 406         return html
 407
 408     # Newline vs <br />
 409     html = html.replace('\n', ' ')
 410     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 411     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 412     # Strip html tags
 413     html = re.sub('<.*?>', '', html)
 414     # Replace html entities
 415     html = unescapeHTML(html)
 416     return html.strip()
 417
 418
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails with an error other
    than EACCES, the name is passed through sanitize_path() and, when that
    changes it, opening is retried once with the sanitized name. Any other
    failure is raised like the standard open() function would.

    The filename '-' stands for standard output (its binary buffer where
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                # Switch stdout to binary mode so Windows does not translate newlines
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # Sanitizing changed nothing, so a retry would fail the same way
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
 449
 450
 451 def timeconvert(timestr):
 452     """Convert RFC 2822 defined time string into system timestamp"""
 453     timestamp = None
 454     timetuple = email.utils.parsedate_tz(timestr)
 455     if timetuple is not None:
 456         timestamp = email.utils.mktime_tz(timetuple)
 457     return timestamp
 458
 459
 460 def sanitize_filename(s, restricted=False, is_id=False):
 461     """Sanitizes a string so it could be used as part of a filename.
 462     If restricted is set, use a stricter subset of allowed characters.
 463     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 464     """
 465     def replace_insane(char):
 466         if restricted and char in ACCENT_CHARS:
 467             return ACCENT_CHARS[char]
 468         if char == '?' or ord(char) < 32 or ord(char) == 127:
 469             return ''
 470         elif char == '"':
 471             return '' if restricted else '\''
 472         elif char == ':':
 473             return '_-' if restricted else ' -'
 474         elif char in '\\/|*<>':
 475             return '_'
 476         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 477             return '_'
 478         if restricted and ord(char) > 127:
 479             return '_'
 480         return char
 481
 482     # Handle timestamps
 483     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 484     result = ''.join(map(replace_insane, s))
 485     if not is_id:
 486         while '__' in result:
 487             result = result.replace('__', '_')
 488         result = result.strip('_')
 489         # Common case of "Foreign band name - English song title"
 490         if restricted and result.startswith('-_'):
 491             result = result[2:]
 492         if result.startswith('-'):
 493             result = '_' + result[len('-'):]
 494         result = result.lstrip('.')
 495         if not result:
 496             result = '_'
 497     return result
 498
 499
 500 def sanitize_path(s):
 501     """Sanitizes and normalizes path on Windows"""
 502     if sys.platform != 'win32':
 503         return s
 504     drive_or_unc, _ = os.path.splitdrive(s)
 505     if sys.version_info < (2, 7) and not drive_or_unc:
 506         drive_or_unc, _ = os.path.splitunc(s)
 507     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 508     if drive_or_unc:
 509         norm_path.pop(0)
 510     sanitized_path = [
 511         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 512         for path_part in norm_path]
 513     if drive_or_unc:
 514         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 515     return os.path.join(*sanitized_path)
 516
 517
 518 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 519 # unwanted failures due to missing protocol
 520 def sanitize_url(url):
 521     return 'http:%s' % url if url.startswith('//') else url
 522
 523
 524 def sanitized_Request(url, *args, **kwargs):
 525     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 526
 527
 528 def orderedSet(iterable):
 529     """ Remove all duplicates from the input iterable """
 530     res = []
 531     for el in iterable:
 532         if el not in res:
 533             res.append(el)
 534     return res
 535
 536
 537 def _htmlentity_transform(entity_with_semicolon):
 538     """Transforms an HTML entity to a character."""
 539     entity = entity_with_semicolon[:-1]
 540
 541     # Known non-numeric HTML entity
 542     if entity in compat_html_entities.name2codepoint:
 543         return compat_chr(compat_html_entities.name2codepoint[entity])
 544
 545     # TODO: HTML5 allows entities without a semicolon. For example,
 546     # '&Eacuteric' should be decoded as 'Éric'.
 547     if entity_with_semicolon in compat_html_entities_html5:
 548         return compat_html_entities_html5[entity_with_semicolon]
 549
 550     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 551     if mobj is not None:
 552         numstr = mobj.group(1)
 553         if numstr.startswith('x'):
 554             base = 16
 555             numstr = '0%s' % numstr
 556         else:
 557             base = 10
 558         # See https://github.com/rg3/youtube-dl/issues/7518
 559         try:
 560             return compat_chr(int(numstr, base))
 561         except ValueError:
 562             pass
 563
 564     # Unknown entity in name, return its literal representation
 565     return '&%s;' % entity
 566
 567
 568 def unescapeHTML(s):
 569     if s is None:
 570         return None
 571     assert type(s) == compat_str
 572
 573     return re.sub(
 574         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 575
 576
 577 def get_subprocess_encoding():
 578     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 579         # For subprocess calls, encode with locale encoding
 580         # Refer to http://stackoverflow.com/a/9951851/35070
 581         encoding = preferredencoding()
 582     else:
 583         encoding = sys.getfilesystemencoding()
 584     if encoding is None:
 585         encoding = 'utf-8'
 586     return encoding
 587
 588
 589 def encodeFilename(s, for_subprocess=False):
 590     """
 591     @param s The name of the file
 592     """
 593
 594     assert type(s) == compat_str
 595
 596     # Python 3 has a Unicode API
 597     if sys.version_info >= (3, 0):
 598         return s
 599
 600     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 601     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 602     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 603     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 604         return s
 605
 606     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 607     if sys.platform.startswith('java'):
 608         return s
 609
 610     return s.encode(get_subprocess_encoding(), 'ignore')
 611
 612
 613 def decodeFilename(b, for_subprocess=False):
 614
 615     if sys.version_info >= (3, 0):
 616         return b
 617
 618     if not isinstance(b, bytes):
 619         return b
 620
 621     return b.decode(get_subprocess_encoding(), 'ignore')
 622
 623
 624 def encodeArgument(s):
 625     if not isinstance(s, compat_str):
 626         # Legacy code that uses byte strings
 627         # Uncomment the following line after fixing all post processors
 628         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 629         s = s.decode('ascii')
 630     return encodeFilename(s, True)
 631
 632
 633 def decodeArgument(b):
 634     return decodeFilename(b, True)
 635
 636
 637 def decodeOption(optval):
 638     if optval is None:
 639         return optval
 640     if isinstance(optval, bytes):
 641         optval = optval.decode(preferredencoding())
 642
 643     assert isinstance(optval, compat_str)
 644     return optval
 645
 646
 647 def formatSeconds(secs):
 648     if secs > 3600:
 649         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 650     elif secs > 60:
 651         return '%d:%02d' % (secs // 60, secs % 60)
 652     else:
 653         return '%d' % secs
 654
 655
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler whose SSL setup suits the running Python.

    params is the options mapping; its 'nocheckcertificate' entry (default
    False) turns off certificate verification. Extra keyword arguments are
    forwarded to the handler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname must be switched off before CERT_NONE can be set
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    # Reached when create_default_context is missing or unusable
    if sys.version_info < (3, 2):
        # No SSL context is passed on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 679
 680
 681 def bug_reports_message():
 682     if ytdl_is_updateable():
 683         update_cmd = 'type  youtube-dl -U  to update'
 684     else:
 685         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 686     msg = '; please report this issue on https://yt-dl.org/bug .'
 687     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 688     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 689     return msg
 690
 691
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """

        # Network errors and unavailable videos that are being handled right
        # now are never treated as bugs, whatever the caller passed
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors carry the "please report this issue" suffix
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
 719
 720
 721 class UnsupportedError(ExtractorError):
 722     def __init__(self, url):
 723         super(UnsupportedError, self).__init__(
 724             'Unsupported URL: %s' % url, expected=True)
 725         self.url = url
 726
 727
 728 class RegexNotFoundError(ExtractorError):
 729     """Error when a regex didn't match"""
 730     pass
 731
 732
 733 class DownloadError(Exception):
 734     """Download Error exception.
 735
 736     This exception may be thrown by FileDownloader objects if they are not
 737     configured to continue on errors. They will contain the appropriate
 738     error message.
 739     """
 740
 741     def __init__(self, msg, exc_info=None):
 742         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 743         super(DownloadError, self).__init__(msg)
 744         self.exc_info = exc_info
 745
 746
 747 class SameFileError(Exception):
 748     """Same File exception.
 749
 750     This exception will be thrown by FileDownloader objects if they detect
 751     multiple files would have to be downloaded to the same file on disk.
 752     """
 753     pass
 754
 755
 756 class PostProcessingError(Exception):
 757     """Post Processing exception.
 758
 759     This exception may be raised by PostProcessor's .run() method to
 760     indicate an error in the postprocessing task.
 761     """
 762
 763     def __init__(self, msg):
 764         self.msg = msg
 765
 766
 767 class MaxDownloadsReached(Exception):
 768     """ --max-downloads limit has been reached. """
 769     pass
 770
 771
 772 class UnavailableVideoError(Exception):
 773     """Unavailable Format exception.
 774
 775     This exception will be thrown when a video is requested
 776     in a format that is not available for that video.
 777     """
 778     pass
 779
 780
 781 class ContentTooShortError(Exception):
 782     """Content Too Short exception.
 783
 784     This exception may be raised by FileDownloader objects when a file they
 785     download is too small for what the server announced first, indicating
 786     the connection was probably interrupted.
 787     """
 788
 789     def __init__(self, downloaded, expected):
 790         # Both in bytes
 791         self.downloaded = downloaded
 792         self.expected = expected
 793
 794
 795 class XAttrMetadataError(Exception):
 796     def __init__(self, code=None, msg='Unknown error'):
 797         super(XAttrMetadataError, self).__init__(msg)
 798         self.code = code
 799         self.msg = msg
 800
 801         # Parsing code and msg
 802         if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 803                 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 804             self.reason = 'NO_SPACE'
 805         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 806             self.reason = 'VALUE_TOO_LONG'
 807         else:
 808             self.reason = 'NOT_SUPPORTED'
 809
 810
 811 class XAttrUnavailableError(Exception):
 812     pass
 813
 814
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the configured source address.

    ydl_handler supplies the options through its _params mapping (only
    'source_address' is read here). args and kwargs are forwarded to
    http_class. is_https selects whether the Python 2.6 fallback connect
    wraps the socket with SSL.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0: only the local address is fixed, not the port
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: replace connect() on this instance
            # with one that opens the socket from the requested address
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
 840
 841
 842 def handle_youtubedl_headers(headers):
 843     filtered_headers = headers
 844
 845     if 'Youtubedl-no-compression' in filtered_headers:
 846         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 847         del filtered_headers['Youtubedl-no-compression']
 848
 849     return filtered_headers
 850
 851
 852 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 853     """Handler for HTTP requests and responses.
 854
 855     This class, when installed with an OpenerDirector, automatically adds
 856     the standard headers to every HTTP request and handles gzipped and
 857     deflated responses from web servers. If compression is to be avoided in
 858     a particular request, the original request in the program code only has
 859     to include the HTTP header "Youtubedl-no-compression", which will be
 860     removed before making the real request.
 861
 862     Part of this code was copied from:
 863
 864     http://techknack.net/python-urllib2-handlers/
 865
 866     Andrew Rowls, the author of that code, agreed to release it to the
 867     public domain.
 868     """
 869
    def __init__(self, params, *args, **kwargs):
        """Set up the base HTTPHandler and keep params for later use.

        params is the options mapping; _create_http_connection reads its
        'source_address' entry through self._params.
        """
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params
 873
    def http_open(self, req):
        """Open req, going through a SOCKS connection class when requested.

        A 'Ytdl-socks-proxy' request header selects the proxy; it is removed
        from the request before the connection is opened.
        """
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        # do_open() calls the partial as the connection factory; False is the
        # is_https argument of _create_http_connection
        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)
 885
 886     @staticmethod
 887     def deflate(data):
 888         try:
 889             return zlib.decompress(data, -zlib.MAX_WBITS)
 890         except zlib.error:
 891             return zlib.decompress(data)
 892
 893     @staticmethod
 894     def addinfourl_wrapper(stream, headers, url, code):
 895         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 896             return compat_urllib_request.addinfourl(stream, headers, url, code)
 897         ret = compat_urllib_request.addinfourl(stream, headers, url)
 898         ret.code = code
 899         return ret
 900
    def http_request(self, req):
        """Prepare an outgoing request and return it (possibly a new object).

        The URL is percent-escaped, missing standard headers are added and
        the internal 'Youtubedl-no-compression' marker is applied.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Add every standard header the request does not set itself
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Drops Accept-Encoding (and the marker) when compression is to be avoided
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req
 931
    def http_response(self, req, resp):
        """Post-process a response: undo compression and fix redirect targets.

        A 'gzip' or 'deflate' Content-encoding is decoded and the response is
        replaced by a new addinfourl object wrapping the decoded bytes (the
        Content-encoding header is removed). For 3xx responses, a Location
        header containing characters that escape_url() would change is
        rewritten in percent-encoded form.

        Returns the (possibly replaced) response object.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; the first length
                # that decompresses cleanly wins
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the very first failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    # Python 2 headers hold byte strings, so encode before storing
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp
 978
    # HTTPS requests and responses go through exactly the same processing
    # as their plain-HTTP counterparts defined above.
    https_request = http_request
    https_response = http_response
 981
 982
 983 def make_socks_conn_class(base_class, socks_proxy):
 984     assert issubclass(base_class, (
 985         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
 986
 987     url_components = compat_urlparse.urlparse(socks_proxy)
 988     if url_components.scheme.lower() == 'socks5':
 989         socks_type = ProxyType.SOCKS5
 990     elif url_components.scheme.lower() in ('socks', 'socks4'):
 991         socks_type = ProxyType.SOCKS4
 992     elif url_components.scheme.lower() == 'socks4a':
 993         socks_type = ProxyType.SOCKS4A
 994
 995     def unquote_if_non_empty(s):
 996         if not s:
 997             return s
 998         return compat_urllib_parse_unquote_plus(s)
 999
1000     proxy_args = (
1001         socks_type,
1002         url_components.hostname, url_components.port or 1080,
1003         True,  # Remote DNS
1004         unquote_if_non_empty(url_components.username),
1005         unquote_if_non_empty(url_components.password),
1006     )
1007
1008     class SocksConnection(base_class):
1009         def connect(self):
1010             self.sock = sockssocket()
1011             self.sock.setproxy(*proxy_args)
1012             if type(self.timeout) in (int, float):
1013                 self.sock.settimeout(self.timeout)
1014             self.sock.connect((self.host, self.port))
1015
1016             if isinstance(self, compat_http_client.HTTPSConnection):
1017                 if hasattr(self, '_context'):  # Python > 2.6
1018                     self.sock = self._context.wrap_socket(
1019                         self.sock, server_hostname=self.host)
1020                 else:
1021                     self.sock = ssl.wrap_socket(self.sock)
1022
1023     return SocksConnection
1024
1025
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with a pluggable connection class and SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Initialise the base handler, then remember `params` and the
        connection class (HTTPSConnection when none is given)."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection

    def https_open(self, req):
        """Open `req` over HTTPS, optionally through a SOCKS proxy."""
        connection_cls = self._https_conn_class

        # Forward only the SSL settings this Python version's handler defines:
        # '_context' exists on python > 2.6, '_check_hostname' on python 3.x
        open_kwargs = {}
        for option, attribute in (
                ('context', '_context'),
                ('check_hostname', '_check_hostname')):
            if hasattr(self, attribute):
                open_kwargs[option] = getattr(self, attribute)

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_cls = make_socks_conn_class(connection_cls, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_cls, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1049
1050
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that is also applied to HTTPS traffic.

    The request/response handling itself is delegated unchanged to
    HTTPCookieProcessor; the Set-Cookie escaping workaround described in
    http_response() is currently commented out.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Hand the response to HTTPCookieProcessor and return its result."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP hooks for HTTPS
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1073
1074
def extract_timezone(date_str):
    """Split a trailing timezone designator off a date string.

    Recognises a final 'Z' or a final '+HH:MM' / '-HHMM' style offset
    (optionally preceded by one space) after at least 8 other characters.

    Returns a (timezone, date_str) tuple: the offset as a timedelta (zero
    when there is none or for 'Z') and the string without the designator.
    """
    match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if match is None:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(match.group('tz'))]
    if not match.group('sign'):
        # A bare 'Z' suffix carries no offset
        return datetime.timedelta(), remainder

    direction = -1 if match.group('sign') == '-' else 1
    offset = datetime.timedelta(
        hours=direction * int(match.group('hours')),
        minutes=direction * int(match.group('minutes')))
    return offset, remainder
1091
1092
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp from the given date

    `delimiter` is the separator between the date and the time part.
    `timezone`, when given, is the timedelta offset to subtract; otherwise
    the offset is taken from the string via extract_timezone().
    Returns None for None input or when the string does not parse.
    """
    if date_str is None:
        return None

    # Drop every ".digits" run (e.g. fractional seconds) before parsing
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_time = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
1110
1111
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST when `day_first` is truthy, otherwise
    DATE_FORMATS_MONTH_FIRST."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1114
1115
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when `date_str` is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    date_str = extract_timezone(date_str)[1]

    result = None
    # Every format is tried; the last one that parses determines the result
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                result = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1142
1143
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    Returns None when `date_str` is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    # NOTE(review): any "PM" in the string adds 12 hours, including for
    # 12:xx PM - confirm this is intended
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for fmt in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(date_str, fmt)
            shifted = parsed - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(shifted.timetuple())
        except ValueError:
            continue

    # Fall back to RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(date_str)
    if not timetuple:
        return None
    return calendar.timegm(timetuple) + pm_delta * 3600
1165
1166
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL.

    The text after the last '.' of the part before the first '?' is returned
    when it is purely alphanumeric, or (minus trailing slashes) when it is
    listed in KNOWN_EXTENSIONS. Otherwise, or for a None URL, `default_ext`
    is returned.
    """
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1178
1179
def subtitles_filename(filename, sub_lang, sub_format):
    """Return `filename` with its last extension replaced by
    '<sub_lang>.<sub_format>' (appended when there is no extension)."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1182
1183
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months count as 30 days and years as 365 days.
    Raises ValueError if the string matches none of these forms."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount

    # unit -> (timedelta keyword, multiplier); a bad approximation for
    # months and years?
    keyword, factor = {
        'day': ('days', 1),
        'week': ('weeks', 1),
        'month': ('days', 30),
        'year': ('days', 365),
    }[match.group('unit')]
    return today + datetime.timedelta(**{keyword: amount * factor})
1211
1212
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
1221
1222
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the earliest/latest
        representable date.

        Raises ValueError when start lies after end."""
        earliest = datetime.datetime.min.date()
        latest = datetime.datetime.max.date()
        self.start = earliest if start is None else date_from_str(start)
        self.end = latest if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1252
1253
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1262
1263
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The string is written with the Win32 WriteConsoleW call when `out` is
    file descriptor 1 or 2 and the matching standard handle is a console.
    Raises OSError when WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> id passed to GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A usable console handle is a character device (ignoring the
        # "remote" flag) for which GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written as 2 units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part of the chunk was actually written
            s = s[written.value:]
    return True
1337
1338
def write_string(s, out=None, encoding=None):
    """Write the text `s` to `out` (sys.stderr by default) and flush it.

    On win32, when no explicit encoding is requested, the console API is
    tried first. Otherwise the text is encoded (ignoring unencodable
    characters) for binary-mode streams, on Python 2, and for streams that
    expose a `buffer`; other streams receive the text as is.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    console_candidate = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if console_candidate and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    expects_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if expects_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1359
1360
def bytes_to_intlist(bs):
    """Convert a byte sequence into a list of integer byte values.

    Empty (or otherwise falsy) input yields an empty list.
    """
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are 1-character strings (Python 2 str)
        return [ord(item) for item in bs]
    return list(bs)  # Python 3
1368
1369
def intlist_to_bytes(xs):
    """Pack a sequence of integers (one unsigned byte each) into a byte
    string; empty input yields b''."""
    return compat_struct_pack('%dB' % len(xs), *xs) if xs else b''
1374
1375
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using LockFileEx on
# win32, fcntl.flock where fcntl is importable, and stubs that raise IOError
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout handed to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock file object `f` with LockFileEx; raise OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is the LOCKFILE_EXCLUSIVE_LOCK flag; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """flock `f` exclusively or shared, depending on `exclusive`."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on `f`."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1449
1450
class locked_file(object):
    """Context manager around a file that is locked while in use.

    The file is opened in the constructor; entering the context takes a
    shared lock for mode 'r' and an exclusive lock for 'a'/'w'. Leaving the
    context unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the file never stays open when
            # locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1480
1481
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when it is None."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1485
1486
def shell_quote(args):
    """Quote every argument with pipes.quote and join them with spaces."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join([pipes.quote(as_text(arg)) for arg in args])
1496
1497
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled in `url` is merged into `data` (the dict is
    updated in place) and the result is appended to the URL as a
    '#__youtubedl_smuggle=...' fragment holding its JSON form. """
    url, already_smuggled = unsmuggle_url(url, {})
    data.update(already_smuggled)
    payload = json.dumps(data)
    fragment = compat_urllib_parse_urlencode({'__youtubedl_smuggle': payload})
    return '#'.join((url, fragment))
1506
1507
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    URLs without a '#__youtubedl_smuggle' marker are returned unchanged
    together with `default`.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1515
1516
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    `bytes` may be a number, a numeric string, or None (rendered as 'N/A').
    The value is scaled by the largest power of 1024 not exceeding it and
    printed with two decimals, e.g. 1536 -> '1.50KiB'. Values below one
    byte are shown in 'B' and values beyond the table stay in 'YiB'.

    Raises ValueError for negative values (math.log domain error).
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the table: a negative exponent (value < 1/1024) would
        # index from the end and print 'YiB'; one past the end would raise
        # IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1529
1530
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of `s` using `unit_table`.

    `unit_table` maps unit strings to multipliers. The number may use ','
    or '.' as decimal separator. Returns int(number * multiplier), or None
    when `s` does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    match = re.match(pattern, s)
    if match is None:
        return None
    number = float(match.group('num').replace(',', '.'))
    return int(number * unit_table[match.group('unit')])
1540
1541
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns None for None input or when lookup_unit_table() finds no
    number followed by a known unit.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    # (symbol letter, decimal unit name prefix, binary unit name prefix),
    # ordered by increasing power
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower = letter.lower()
        _UNIT_TABLE.update((
            (letter + 'iB', binary),
            (letter + 'B', decimal),
            (lower + 'B', binary),
            (letter + 'b', decimal),
            (lower + 'b', decimal),
            (decimal_name + 'bytes', decimal),
            (binary_name + 'bytes', binary),
        ))

    return lookup_unit_table(_UNIT_TABLE, s)
1611
1612
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an int.

    Strings made only of digits, ',' and '.' go through str_to_int(); the
    rest is looked up with the k/m/kk multipliers below. Returns None for
    None input.
    """
    if s is None:
        return None

    s = s.strip()

    if not re.match(r'^[\d,.]+$', s):
        thousand = 1000
        million = 1000 ** 2
        _UNIT_TABLE = {
            'k': thousand,
            'K': thousand,
            'm': million,
            'M': million,
            'kk': million,
            'KK': million,
        }
        return lookup_unit_table(_UNIT_TABLE, s)

    return str_to_int(s)
1632
1633
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month by its full name in `lang`

    Languages missing from MONTH_NAMES use the English names; unknown
    month names yield None. """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    else:
        return position + 1
1643
1644
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, i.e. the first three letters of the month name;
        None if there is no such month """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
1653
1654
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity by '&amp;'

    The five predefined XML entities and numeric references of up to four
    digits are left untouched."""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1661
1662
def setproctitle(title):
    """Best-effort: set the process name via libc's prctl.

    Silently does nothing on Jython, when 'libc.so.6' cannot be loaded, or
    when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes allocates one extra byte for
    # the terminating NUL. A buffer of exactly len(title_bytes) bytes would
    # hand prctl an unterminated string.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1682
1683
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` is returned unchanged
    when it is None or does not begin with `start`."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1686
1687
def remove_end(s, end):
    """Return `s` without the suffix `end`; `s` is returned unchanged when
    it is None, does not finish with `end`, or `end` is empty."""
    if s is None or not s.endswith(end):
        return s
    # An empty suffix must be a no-op: s[:-0] is s[:0], i.e. ''
    return s[:-len(end)] if end else s
1690
1691
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing `s`.

    None, strings shorter than two characters and strings without a
    matching pair are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1699
1700
def url_basename(url):
    """Return the last '/'-separated segment of the URL's path, ignoring
    leading and trailing slashes."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1704
1705
def base_url(url):
    """Return the http(s) URL up to and including the last '/' that comes
    before any '?', '#' or '&'.

    Raises AttributeError when `url` does not match that shape."""
    match = re.match(r'https?://[^?#&]+/', url)
    return match.group()
1708
1709
def urljoin(base, path):
    """Join `path` onto `base`, returning None for unusable input.

    A `path` that is already an http(s) or protocol-relative URL is
    returned as is. None is returned when `path` is empty or not text, or
    when `base` is not an http(s)/protocol-relative text URL.
    """
    absolute_url = r'^(?:https?:)?//'
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(absolute_url, path):
        return path
    if not (isinstance(base, compat_str) and re.match(absolute_url, base)):
        return None
    return compat_urlparse.urljoin(base, path)
1718
1719
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always reported as HEAD."""

    def get_method(self):
        return 'HEAD'
1723
1724
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always reported as PUT."""

    def get_method(self):
        return 'PUT'
1728
1729
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to an int, returning `default` when that is impossible.

    With `get_attr`, the named attribute of `v` is converted instead (a
    missing attribute counts as None). The integer is multiplied by
    `invscale` and floor-divided by `scale`. None, '' and values int()
    cannot convert all yield `default`.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: unsupported type such as a list or dict;
        # OverflowError: infinite float
        return default
1742
1743
def str_or_none(v, default=None):
    """Return compat_str(v), or `default` when `v` is None."""
    if v is None:
        return default
    return compat_str(v)
1746
1747
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Every ',', '.' and '+' is removed before the conversion; None is passed
    through. Raises ValueError when the remainder is not an integer. """
    if int_str is None:
        return None
    separators = re.compile(r'[,\.\+]')
    return int(separators.sub('', int_str))
1754
1755
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to a float, returning `default` when that is impossible.

    The float is multiplied by `invscale` and divided by `scale`. None and
    values float() cannot convert yield `default`.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: unsupported type such as a list or dict
        return default
1763
1764
def strip_or_none(v):
    """Return v.strip(), or None when `v` is None."""
    if v is None:
        return None
    return v.strip()
1767
1768
def parse_duration(s):
    """Parse a textual duration and return it in seconds.

    Three notations are tried in order: colon separated
    ('[[[DD:]HH:]MM:]SS[.mmm]'), unit suffixed ('1h 2m 3.5s', optionally
    prefixed by 'PT'/'T'), and a single hours or minutes figure
    ('1.5 hours', '10 min'). Returns None for non-string input or when no
    notation matches.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    notations = (
        r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$',
        r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''',
        r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$',
    )
    for notation in notations:
        match = re.match(notation, s)
        if match:
            break
    else:
        return None

    # Components missing from the matched notation count as absent
    parts = match.groupdict()
    days = parts.get('days')
    hours = parts.get('hours')
    mins = parts.get('mins')
    secs = parts.get('secs')
    ms = parts.get('ms')

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # `ms` keeps its leading dot, so it is the fractional second part
        duration += float(ms)
    return duration
1815
1816
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of filename's existing extension.

    If expected_real_ext is given and differs from the actual extension
    (compared without the leading dot), ext is appended to the full
    filename instead.
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(stem, ext, current_ext)
1823
1824
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace filename's extension with ext.

    If expected_real_ext is given and differs from the actual extension
    (compared without the leading dot), nothing is removed and ext is
    appended to the full filename instead.
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        kept = filename
    else:
        kept = stem
    return '{0}.{1}'.format(kept, ext)
1830
1831
def check_executable(exe, args=[]):
    """Return exe if it can be launched, otherwise False.

    The executable is started with args (ideally something that produces
    short output, like -version) and its output is discarded. Failure to
    launch it, signalled by OSError, means it is not available.
    """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1840
1841
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """Run exe with args and extract a version string from its output.

    Returns False if the executable cannot be launched; otherwise whatever
    detect_exe_version() makes of the combined stdout/stderr output.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    # The pipe delivers raw bytes; detect_exe_version() wants text.
    if isinstance(output, bytes):
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1859
1860
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's textual output.

    version_re must contain one capturing group holding the version; by
    default the word following 'version' is used. When nothing matches,
    unrecognized is returned.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1870
1871
class PagedList(object):
    """Base class for paged sequences; subclasses supply getslice()."""

    def __len__(self):
        # This is only useful for tests
        all_entries = self.getslice()
        return len(all_entries)
1876
1877
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily until a short page is seen.

    pagefunc(pagenum) returns an iterable with the entries of the given
    zero-based page; pagesize is the number of entries on a full page.
    With use_cache, fetched pages are remembered and never requested twice.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        end=None means "up to the last entry". A range with end <= start
        is empty and returns [] without fetching any page.
        """
        res = []
        if end is not None and end <= start:
            # Without this guard an end lying on the boundary of the start
            # page selected the whole page and kept on fetching later pages.
            return res

        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry, if the range ends here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1928
1929
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages is known up front.

    pagefunc(pagenum) returns an iterable with the entries of the given
    zero-based page, pagecount is the number of pages and pagesize the
    number of entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        end=None means "up to the last page". A range with end <= start is
        empty and returns [] without fetching any page. Pages at or beyond
        pagecount are never requested.
        """
        res = []
        if end is not None and end <= start:
            return res
        start_page = start // self._pagesize
        # Never ask for pages past the declared page count, even if end
        # points further.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page has leading entries to drop
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1957
1958
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape in s by its character."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns (text, consumed_length); only the text matters
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
1965
1966
def lowercase_escape(s):
    """Replace every literal \\uXXXX escape in s by its character."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns (text, consumed_length); only the text matters
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
1973
1974
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    running_python2 = sys.version_info < (3, 0)
    if running_python2 and isinstance(s, compat_str):
        # On Python 2 the text is handed to quote() as UTF-8 bytes
        s = s.encode('utf-8')
    # Characters that are left unescaped
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1980
1981
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    escaped_parts = parts._replace(
        # Host names are IDNA-encoded rather than percent-escaped
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped_parts.geturl()
1992
1993
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line, and close it.

    Byte lines are decoded as UTF-8. A leading byte order mark and
    surrounding whitespace are removed. Empty lines and lines starting with
    '#', ';' or ']' are skipped.

    Returns the list of remaining URLs.
    """
    # A UTF-8 BOM appears as U+FEFF once properly decoded, or as the three
    # characters of the first entry when the bytes were decoded as Latin-1.
    boms = ('\xef\xbb\xbf', '\ufeff')
    comment_prefixes = ('#', ';', ']')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in boms:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(comment_prefixes):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2008
2009
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded_text = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded_text.encode('ascii')
2012
2013
def update_url_query(url, query):
    """Return url with its query string updated from the query mapping.

    Parameters already present in url are kept unless query overrides them.
    A false-ish query leaves url untouched.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    # doseq=True: list values are emitted as repeated parameters
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2022
2023
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with selected parts replaced.

    headers are merged over the existing ones, data and url replace the
    originals when truthy, and query is merged into the URL's query string.
    The HTTP method (HEAD, PUT or the default) and any timeout attribute
    are carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    # Pick the request class that reproduces the original HTTP method
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), compat_urllib_request.Request)
    new_req = request_class(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2042
2043
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple, in d.

    For a list or tuple of keys, the value of the first key that is present
    and not None (and, with skip_false_values, not false-ish) is returned.
    A single key behaves like d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2052
2053
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None if the lookup fails or has the wrong type.

    AttributeError, KeyError, TypeError and IndexError raised by getter are
    swallowed. When expected_type is given, a result that is not an
    instance of it is discarded.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(value, expected_type):
        return value
    return None
2062
2063
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as text, decoding it with encoding if it is not text yet."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2066
2067
# US rating labels mapped to the age limit parse_age_limit() reports for them.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2075
2076
# TV Parental Guidelines labels mapped to the age limit parse_age_limit()
# reports for them; consulted after US_RATINGS.
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2085
2086
def parse_age_limit(s):
    """Turn an age rating into a numeric age limit, or None if unknown.

    Plain ints between 0 and 21 are accepted as they are. Strings may be a
    one or two digit age with an optional trailing '+', or a label listed
    in US_RATINGS or TV_PARENTAL_GUIDELINES.
    """
    # Exact type check: bool is a subclass of int and must not be accepted
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    numeric_age = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric_age:
        return int(numeric_age.group('age'))
    return US_RATINGS[s] if s in US_RATINGS else TV_PARENTAL_GUIDELINES.get(s)
2098
2099
def strip_jsonp(code):
    """Remove a JSONP callback wrapper, leaving only the wrapped payload.

    Input that does not look like 'callback(...)', optionally followed by
    ';' and // comments, is returned unchanged.
    """
    jsonp_re = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
2103
2104
def js_to_json(code):
    """Convert a JavaScript object/array literal into JSON text.

    Handles single-quoted strings, unquoted identifiers used as keys or
    values, /* */ and // comments, trailing commas before a closing bracket
    and hexadecimal/octal integer literals. Everything else is passed
    through untouched.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, base) pairs for integer literals JSON cannot express
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        """Return the JSON replacement for one matched token."""
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            # Comments and trailing commas have no JSON counterpart
            return ""

        if v[0] in ("'", '"'):
            # String literal: strip the quotes and normalise the escapes
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            # Only unquoted tokens may be integer literals; a quoted string
            # such as "0x10" must stay a string.
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2144
2145
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The position in quality_ids is the quality; unknown ids rank lowest
        try:
            position = quality_ids.index(qid)
        except ValueError:
            return -1
        else:
            return position
    return q
2154
2155
# %-style template with named fields (title, id, ext); presumably the default
# output filename template - its use is not visible in this part of the file.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2157
2158
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Reserve room for the ellipses so the result is length characters long
    kept_chars = length - len(ELLIPSES)
    return s[:kept_chars] + ELLIPSES
2167
2168
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints.

    Raises ValueError if a component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2171
2172
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either string cannot be parsed by
    version_tuple(), the answer is derived from assume_new: the version
    is considered outdated only if assume_new is false.
    """
    verdict_when_unknown = not assume_new
    if not version:
        return verdict_when_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return verdict_when_unknown
2180
2181
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded through zipimport, or when the
    # interpreter carries a sys.frozen attribute.
    module_loader = globals().get('__loader__')
    if isinstance(module_loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
2187
2188
def args_to_str(args):
    """Return a short, shell-quoted string representation of a command."""
    quoted_args = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
2192
2193
def error_to_compat_str(err):
    """Return the message of err as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2201
2202
def mimetype2ext(mt):
    """Map a MIME type (optionally with parameters) to a file extension.

    Parameters after ';' are ignored and the type is compared
    case-insensitively. A few full types have dedicated extensions; for all
    others the subtype is looked up in a table and, if it is not listed,
    returned as is. None is passed through.
    """
    if mt is None:
        return None

    # Normalise first so that e.g. 'Audio/MP4; codecs="mp4a.40.2"' is
    # recognised by the full-type table, and so that a '/' inside a
    # parameter value cannot be mistaken for the type/subtype separator.
    mime_type = mt.split(';')[0].strip().lower()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mime_type)
    if ext is not None:
        return ext

    _, _, res = mime_type.rpartition('/')
    res = res.strip()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }.get(res, res)
2239
2240
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into video and audio codec.

    Returns a dict with 'vcodec' and 'acodec' keys. When at least one codec
    is recognised, a missing counterpart is reported as 'none'. When
    nothing is recognised but exactly two codecs are listed, they are taken
    to be the video and audio codec in that order. In every other case an
    empty dict is returned. A warning is written to stderr for each
    unrecognised codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    stripped_parts = (
        part.strip() for part in codecs_str.strip().strip(',').split(','))
    splited_codecs = [part for part in stripped_parts if part]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # The part before the first dot identifies the codec family
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(splited_codecs) == 2:
        # Nothing recognised: fall back on the conventional video, audio order
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    return {}
2275
2276
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the response headers of url_handle.

    The filename of an 'attachment' Content-Disposition header is preferred;
    otherwise the extension is derived from the Content-Type header.
    """
    response_headers = url_handle.headers

    disposition = response_headers.get('Content-Disposition')
    if disposition:
        found = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if found:
            ext = determine_ext(found.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(response_headers.get('Content-Type'))
2289
2290
def encode_data_uri(data, mime_type):
    """Return a base64 data: URI carrying the bytes data as mime_type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2293
2294
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Nothing is blocked when no limit is set or when the content is
    # available for everyone.
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2303
2304
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Ordered so that the 4-byte UTF-32 marks are tested before the UTF-16
    # marks they begin with.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is taken to be UTF-8
    encoding, payload_start = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload_start = enc, len(bom)
            break
    text = first_bytes[payload_start:].decode(encoding, 'replace')

    # Truthy (a match object) if the first non-blank character is '<'
    return re.match(r'^\s*<', text)
2323
2324
def determine_protocol(info_dict):
    """Return the protocol name for the media described by info_dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from info_dict['url']: from its rtmp/mms/rtsp prefix, from an m3u8 or
    f4m extension, and finally from the URL scheme.
    """
    explicit_protocol = info_dict.get('protocol')
    if explicit_protocol is not None:
        return explicit_protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2345
2346
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of the widest cell in every column
    column_widths = [
        max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Every column but the last is left-aligned and padded by one extra space
    padded_specs = ['%-' + compat_str(width + 1) + 's' for width in column_widths[:-1]]
    row_format = ' '.join(padded_specs) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2353
2354
2355 def _match_one(filter_part, dct):
2356     COMPARISON_OPERATORS = {
2357         '<': operator.lt,
2358         '<=': operator.le,
2359         '>': operator.gt,
2360         '>=': operator.ge,
2361         '=': operator.eq,
2362         '!=': operator.ne,
2363     }
2364     operator_rex = re.compile(r'''(?x)\s*
2365         (?P<key>[a-z_]+)
2366         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2367         (?:
2368             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2369             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2370         )
2371         \s*$
2372         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2373     m = operator_rex.search(filter_part)
2374     if m:
2375         op = COMPARISON_OPERATORS[m.group('op')]
2376         actual_value = dct.get(m.group('key'))
2377         if (m.group('strval') is not None or
2378             # If the original field is a string and matching comparisonvalue is
2379             # a number we should respect the origin of the original field
2380             # and process comparison value as a string (see
2381             # https://github.com/rg3/youtube-dl/issues/11082).
2382             actual_value is not None and m.group('intval') is not None and
2383                 isinstance(actual_value, compat_str)):
2384             if m.group('op') not in ('=', '!='):
2385                 raise ValueError(
2386                     'Operator %s does not support string values!' % m.group('op'))
2387             comparison_value = m.group('strval') or m.group('intval')
2388         else:
2389             try:
2390                 comparison_value = int(m.group('intval'))
2391             except ValueError:
2392                 comparison_value = parse_filesize(m.group('intval'))
2393                 if comparison_value is None:
2394                     comparison_value = parse_filesize(m.group('intval') + 'B')
2395                 if comparison_value is None:
2396                     raise ValueError(
2397                         'Invalid integer value %r in filter part %r' % (
2398                             m.group('intval'), filter_part))
2399         if actual_value is None:
2400             return m.group('none_inclusive')
2401         return op(actual_value, comparison_value)
2402
2403     UNARY_OPERATORS = {
2404         '': lambda v: v is not None,
2405         '!': lambda v: v is None,
2406     }
2407     operator_rex = re.compile(r'''(?x)\s*
2408         (?P<op>%s)\s*(?P<key>[a-z_]+)
2409         \s*$
2410         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2411     m = operator_rex.search(filter_part)
2412     if m:
2413         op = UNARY_OPERATORS[m.group('op')]
2414         actual_value = dct.get(m.group('key'))
2415         return op(actual_value)
2416
2417     raise ValueError('Invalid filter part %r' % filter_part)
2418
2419
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The '&'-separated parts must all pass; evaluation stops at the first
    # part that fails.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2425
2426
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function yields None when the info dict passes the filter
    and a human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2435
2436
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into a number of seconds.

    Accepts an offset such as '12.5' or '12.5s' and a clock value
    'H:MM:SS' with an optional fraction introduced by '.' or ':'.
    Returns None for empty or unrecognised input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ':' before the fraction is read as a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2448
2449
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'.

    The value is rounded to the nearest millisecond before being split into
    fields. Truncating each field separately loses a millisecond to float
    representation error (1.23 used to give ',229') and cannot carry a
    rounded-up fraction into the seconds field.
    """
    total_ms = int(round(seconds * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2452
2453
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle text into SRT text.

    Every <p> element with a usable begin time and either an end time or a
    duration becomes one SRT cue, numbered by its position among all <p>
    elements. Raises ValueError if the document has no <p> elements.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        """Parser target that collects the text of a <p>, mapping <br> to newlines."""
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-serialise the element and feed it through the text collector
        collector = TTMLPElementParser()
        node_parser = xml.etree.ElementTree.XMLParser(target=collector)
        node_parser.feed(xml.etree.ElementTree.tostring(node))
        return node_parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Try the known namespaces in turn, then fall back on un-namespaced <p>
    for namespaced_query in ('.//ttml:p', './/ttaf1:p', './/ttaf1_0604:p'):
        paras = dfxp.findall(_x(namespaced_query))
        if paras:
            break
    else:
        paras = dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2507
2508
def cli_option(params, command_option, param):
    """Return [command_option, value] for params[param], or [] if it is None.

    Truthy values are converted to text; false-ish values other than None
    are passed on unchanged.
    """
    value = params.get(param)
    if value is None:
        return []
    if value:
        value = compat_str(value)
    return [command_option, value]
2514
2515
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean params[param] as a command line option.

    Yields [option, value], or a single 'option<separator>value' item when
    separator is given. The parameter must be present and a bool.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered_value = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered_value]
    return [command_option, rendered_value]
2522
2523
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2527
2528
def cli_configuration_args(params, param, default=[]):
    """Return the argument list stored under params[param], or default.

    A value that is present must be a list.
    """
    ex_args = params.get(param)
    if ex_args is not None:
        assert isinstance(ex_args, list)
        return ex_args
    return default
2535
2536
2537 class ISO639Utils(object):
2538     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2539     _lang_map = {
2540         'aa': 'aar',
2541         'ab': 'abk',
2542         'ae': 'ave',
2543         'af': 'afr',
2544         'ak': 'aka',
2545         'am': 'amh',
2546         'an': 'arg',
2547         'ar': 'ara',
2548         'as': 'asm',
2549         'av': 'ava',
2550         'ay': 'aym',
2551         'az': 'aze',
2552         'ba': 'bak',
2553         'be': 'bel',
2554         'bg': 'bul',
2555         'bh': 'bih',
2556         'bi': 'bis',
2557         'bm': 'bam',
2558         'bn': 'ben',
2559         'bo': 'bod',
2560         'br': 'bre',
2561         'bs': 'bos',
2562         'ca': 'cat',
2563         'ce': 'che',
2564         'ch': 'cha',
2565         'co': 'cos',
2566         'cr': 'cre',
2567         'cs': 'ces',
2568         'cu': 'chu',
2569         'cv': 'chv',
2570         'cy': 'cym',
2571         'da': 'dan',
2572         'de': 'deu',
2573         'dv': 'div',
2574         'dz': 'dzo',
2575         'ee': 'ewe',
2576         'el': 'ell',
2577         'en': 'eng',
2578         'eo': 'epo',
2579         'es': 'spa',
2580         'et': 'est',
2581         'eu': 'eus',
2582         'fa': 'fas',
2583         'ff': 'ful',
2584         'fi': 'fin',
2585         'fj': 'fij',
2586         'fo': 'fao',
2587         'fr': 'fra',
2588         'fy': 'fry',
2589         'ga': 'gle',
2590         'gd': 'gla',
2591         'gl': 'glg',
2592         'gn': 'grn',
2593         'gu': 'guj',
2594         'gv': 'glv',
2595         'ha': 'hau',
2596         'he': 'heb',
2597         'hi': 'hin',
2598         'ho': 'hmo',
2599         'hr': 'hrv',
2600         'ht': 'hat',
2601         'hu': 'hun',
2602         'hy': 'hye',
2603         'hz': 'her',
2604         'ia': 'ina',
2605         'id': 'ind',
2606         'ie': 'ile',
2607         'ig': 'ibo',
2608         'ii': 'iii',
2609         'ik': 'ipk',
2610         'io': 'ido',
2611         'is': 'isl',
2612         'it': 'ita',
2613         'iu': 'iku',
2614         'ja': 'jpn',
2615         'jv': 'jav',
2616         'ka': 'kat',
2617         'kg': 'kon',
2618         'ki': 'kik',
2619         'kj': 'kua',
2620         'kk': 'kaz',
2621         'kl': 'kal',
2622         'km': 'khm',
2623         'kn': 'kan',
2624         'ko': 'kor',
2625         'kr': 'kau',
2626         'ks': 'kas',
2627         'ku': 'kur',
2628         'kv': 'kom',
2629         'kw': 'cor',
2630         'ky': 'kir',
2631         'la': 'lat',
2632         'lb': 'ltz',
2633         'lg': 'lug',
2634         'li': 'lim',
2635         'ln': 'lin',
2636         'lo': 'lao',
2637         'lt': 'lit',
2638         'lu': 'lub',
2639         'lv': 'lav',
2640         'mg': 'mlg',
2641         'mh': 'mah',
2642         'mi': 'mri',
2643         'mk': 'mkd',
2644         'ml': 'mal',
2645         'mn': 'mon',
2646         'mr': 'mar',
2647         'ms': 'msa',
2648         'mt': 'mlt',
2649         'my': 'mya',
2650         'na': 'nau',
2651         'nb': 'nob',
2652         'nd': 'nde',
2653         'ne': 'nep',
2654         'ng': 'ndo',
2655         'nl': 'nld',
2656         'nn': 'nno',
2657         'no': 'nor',
2658         'nr': 'nbl',
2659         'nv': 'nav',
2660         'ny': 'nya',
2661         'oc': 'oci',
2662         'oj': 'oji',
2663         'om': 'orm',
2664         'or': 'ori',
2665         'os': 'oss',
2666         'pa': 'pan',
2667         'pi': 'pli',
2668         'pl': 'pol',
2669         'ps': 'pus',
2670         'pt': 'por',
2671         'qu': 'que',
2672         'rm': 'roh',
2673         'rn': 'run',
2674         'ro': 'ron',
2675         'ru': 'rus',
2676         'rw': 'kin',
2677         'sa': 'san',
2678         'sc': 'srd',
2679         'sd': 'snd',
2680         'se': 'sme',
2681         'sg': 'sag',
2682         'si': 'sin',
2683         'sk': 'slk',
2684         'sl': 'slv',
2685         'sm': 'smo',
2686         'sn': 'sna',
2687         'so': 'som',
2688         'sq': 'sqi',
2689         'sr': 'srp',
2690         'ss': 'ssw',
2691         'st': 'sot',
2692         'su': 'sun',
2693         'sv': 'swe',
2694         'sw': 'swa',
2695         'ta': 'tam',
2696         'te': 'tel',
2697         'tg': 'tgk',
2698         'th': 'tha',
2699         'ti': 'tir',
2700         'tk': 'tuk',
2701         'tl': 'tgl',
2702         'tn': 'tsn',
2703         'to': 'ton',
2704         'tr': 'tur',
2705         'ts': 'tso',
2706         'tt': 'tat',
2707         'tw': 'twi',
2708         'ty': 'tah',
2709         'ug': 'uig',
2710         'uk': 'ukr',
2711         'ur': 'urd',
2712         'uz': 'uzb',
2713         've': 'ven',
2714         'vi': 'vie',
2715         'vo': 'vol',
2716         'wa': 'wln',
2717         'wo': 'wol',
2718         'xh': 'xho',
2719         'yi': 'yid',
2720         'yo': 'yor',
2721         'za': 'zha',
2722         'zh': 'zho',
2723         'zu': 'zul',
2724     }
2725
2726     @classmethod
2727     def short2long(cls, code):
2728         """Convert language code from ISO 639-1 to ISO 639-2/T"""
2729         return cls._lang_map.get(code[:2])
2730
2731     @classmethod
2732     def long2short(cls, code):
2733         """Convert language code from ISO 639-2/T to ISO 639-1"""
2734         for short_name, long_name in cls._lang_map.items():
2735             if long_name == code:
2736                 return short_name
2737
2738
class ISO3166Utils(object):
    """Helpers for translating two-letter country codes into country names."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter country code.

        The code (ISO 3166-1 alpha-2) is upper-cased before the lookup, so
        matching is case-insensitive. Returns None for unknown codes.
        """
        country_code = code.upper()
        return cls._country_map.get(country_code)
2997
2998
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden on a per-request basis.

    A request may carry a 'Ytdl-request-proxy' header. When present, its
    value replaces the proxy that would otherwise be used and the header is
    removed from the request.
    """

    def __init__(self, proxies=None):
        # Install default http/https openers that route through proxy_open
        # with the '__noproxy__' marker. They are set before the base class
        # constructor runs, which presumably replaces them for schemes that
        # have a proxy configured - confirm against ProxyHandler.__init__.
        def _make_default_opener(scheme):
            def _default_open(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)
            return _default_open

        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme, _make_default_opener(scheme))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Open req through proxy, honouring a per-request override.

        Returns None when no proxy is to be used ('__noproxy__') or when the
        proxy is a SOCKS one; otherwise delegates to ProxyHandler.proxy_open.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Only record the SOCKS proxy on the request here; wrapping the
            # socket with SOCKS is done by youtube-dl's http/https handlers
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3022
3023
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt data with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The payload is the integer formed by the bytes of data in reverse order
    reversed_hex = binascii.hexlify(data[::-1])
    return '%x' % pow(int(reversed_hex, 16), exponent, modulus)
3039
3040
def encode_base_n(num, n, table=None):
    """Convert the integer num to its base-n string representation.

    Digits are taken from table or, when no table is given, from the first
    n characters of 0-9a-zA-Z. A negative num is rendered as '-' followed
    by the encoding of its absolute value.

    Raises ValueError if n exceeds the number of available digits, or if a
    non-zero num is to be encoded in a base smaller than 2.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    if n < 2:
        # No positional encoding exists for such a base; with n == 1 the
        # digit loop below would never terminate
        raise ValueError('base %d is too small' % n)

    if num < 0:
        # Floor division keeps a negative number negative forever
        # (-1 // n == -1), so encode the magnitude and prepend the sign
        return '-' + encode_base_n(-num, n, table)

    ret = ''
    while num:
        num, digit = divmod(num, n)
        ret = table[digit] + ret
    return ret
3057
3058
def decode_packed_codes(code):
    """Expand the packed code found in code by PACKED_CODES_RE.

    The match must provide four groups: the packed code, a numeric base, a
    symbol count and a '|'-separated symbol list. Every word of the packed
    code is replaced by the symbol whose index, written in the given base,
    equals that word; an empty symbol stands for the word itself.
    """
    packed_code, radix, remaining, words = re.search(PACKED_CODES_RE, code).groups()
    radix = int(radix)
    remaining = int(remaining)
    words = words.split('|')

    # Map each base-n index to its symbol, walking from the last index to 0
    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        replacements[token] = words[remaining] or token

    def _substitute(match):
        return replacements[match.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, packed_code)
3075
3076
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=value attribute list into a dict.

    Values may be bare or double-quoted; the surrounding quotes of a quoted
    value are stripped. When a key occurs more than once the last value wins.
    """
    attributes = {}
    for mobj in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        value = mobj.group('val')
        if value[:1] == '"':
            value = value[1:-1]
        attributes[mobj.group('key')] = value
    return attributes
3084
3085
def urshift(val, n):
    """Shift val right by n bits; a negative val is first offset by 2**32."""
    if val < 0:
        val += 0x100000000
    return val >> n
3088
3089
3090 # Based on png2str() written by @gdkchan and improved by @yokrysty
3091 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG data into rows of unfiltered byte values.

    Returns a (width, height, pixels) tuple where pixels is a list of
    height rows, each a list of width * 3 integers.

    The decoder works with 3 bytes per pixel (stride = width * 3); only the
    width and height fields of IHDR are read.

    Raises IOError if the PNG signature or the leading IHDR chunk is
    missing, or if there is no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    # Bytes 12-15 hold the type of the first chunk, right after the 8 byte
    # signature and the 4 byte chunk length
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or png_data[12:16] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}

    def unpack_integer(x):
        return compat_struct_unpack(int_map[len(x)], x)[0]

    # Each chunk is laid out as: length (4), type (4), data (length), CRC (4)
    chunks = []
    pos = 8
    total_size = len(png_data)
    while pos < total_size:
        length = unpack_integer(png_data[pos:pos + 4])
        chunk_type = png_data[pos + 4:pos + 8]
        data_start = pos + 8
        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': png_data[data_start:data_start + length]
        })
        pos = data_start + length + 4  # Skip CRC

    ihdr = chunks[0]['data']
    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''.join([
        chunk['data'] for chunk in chunks if chunk['type'] == b'IDAT'])
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _paeth_predictor(a, b, c):
        # Choose whichever of left (a), up (b) and upper-left (c) is
        # closest to a + b - c, preferring a, then b, on ties
        p = a + b - c
        pa = abs(p - a)
        pb = abs(p - b)
        pc = abs(p - c)
        if pa <= pb and pa <= pc:
            return a
        if pb <= pc:
            return b
        return c

    for y in range(height):
        # Every scanline is preceded by one filter type byte
        row_start = y * (1 + stride)
        filter_type = decompressed_data[row_start]

        previous_row = pixels[y - 1] if y > 0 else None
        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + row_start + x]

            # Neighbours are the same channel of the pixel to the left
            # (3 bytes back) and of the pixel above; 0 when outside the image
            has_left = x > 2
            left = current_row[x - 3] if has_left else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[x - 3] if has_left and y > 0 else 0
                color = (color + _paeth_predictor(left, up, upper_left)) & 0xff

            current_row.append(color)

    return width, height, pixels
3195
3196
def write_xattr(path, key, value):
    """Set the extended attribute key to value on the file at path.

    The Python xattr module is tried first, in either flavour: one that
    exposes set() (treated as pyxattr) or one that exposes setxattr(). If
    it cannot be imported, NTFS Alternate Data Streams are used on Windows
    and the setfattr or xattr command line tools elsewhere.

    value is handed over unmodified on the Python module and Windows paths
    and is decoded as UTF-8 before being passed to a command line tool.

    Raises XAttrUnavailableError when no usable implementation is found (or
    pyxattr is older than 0.5.0) and XAttrMetadataError when setting the
    attribute fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        value = value.decode('utf-8')
        # setfattr takes precedence when both tools are available
        if user_has_setfattr:
            executable = 'setfattr'
            opts = ['-n', key, '-v', value]
        else:
            executable = 'xattr'
            opts = ['-w', key, value]

        cmd = [encodeFilename(executable, True)]
        cmd.extend([encodeArgument(o) for o in opts])
        cmd.append(encodeFilename(path, True))

        try:
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        stdout, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)