Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import email.header
  15 import errno
  16 import functools
  17 import gzip
  18 import io
  19 import itertools
  20 import json
  21 import locale
  22 import math
  23 import operator
  24 import os
  25 import platform
  26 import random
  27 import re
  28 import socket
  29 import ssl
  30 import subprocess
  31 import sys
  32 import tempfile
  33 import traceback
  34 import xml.etree.ElementTree
  35 import zlib
  36
  37 from .compat import (
  38     compat_HTMLParseError,
  39     compat_HTMLParser,
  40     compat_basestring,
  41     compat_chr,
  42     compat_cookiejar,
  43     compat_ctypes_WINFUNCTYPE,
  44     compat_etree_fromstring,
  45     compat_expanduser,
  46     compat_html_entities,
  47     compat_html_entities_html5,
  48     compat_http_client,
  49     compat_kwargs,
  50     compat_os_name,
  51     compat_parse_qs,
  52     compat_shlex_quote,
  53     compat_str,
  54     compat_struct_pack,
  55     compat_struct_unpack,
  56     compat_urllib_error,
  57     compat_urllib_parse,
  58     compat_urllib_parse_urlencode,
  59     compat_urllib_parse_urlparse,
  60     compat_urllib_parse_unquote_plus,
  61     compat_urllib_request,
  62     compat_urlparse,
  63     compat_xpath,
  64 )
  65
  66 from .socks import (
  67     ProxyType,
  68     sockssocket,
  69 )
  70
  71
  72 def register_socks_protocols():
  73     # "Register" SOCKS protocols
  74     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  75     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  76     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  77         if scheme not in compat_urlparse.uses_netloc:
  78             compat_urlparse.uses_netloc.append(scheme)
  79
  80
# This is not clearly defined otherwise: the type of a compiled regular
# expression object, obtained here by compiling an empty pattern
compiled_regex_type = type(re.compile(''))
  83
# HTTP header names mapped to default values; the User-Agent imitates
# desktop Firefox 59 on Linux.
# NOTE(review): presumably attached to outgoing HTTP requests -- the consumer
# is outside this part of the file, confirm before changing any value
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
  96
  97
# Sentinel meaning "no default supplied". The xpath_* helpers in this file
# compare against it by identity, so None stays usable as a real default value
NO_DEFAULT = object()
  99
# Full English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code, each list in calendar order; the French
# names are lower case
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 110
# Media and manifest file extensions (without the leading dot), grouped by
# container family one family per line.
# NOTE(review): presumably used to recognise a file extension in a URL or
# filename -- the consumer is outside this part of the file
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
 125
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The two sequences are
# paired by position, so they must stay the same length and order; the
# multi-character replacements ('AE', 'OE', 'TH', 'ss', ...) are wrapped in
# lists so that itertools.chain yields each of them as a single item
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 130
# Date/time layouts written with strftime()/strptime() directives. This base
# tuple is shared by both derived lists below.
# NOTE(review): presumably tried one after another when parsing date strings
# -- the parsing code is outside this part of the file
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
)

# Base layouts plus the numeric layouts in which the day precedes the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Base layouts plus the numeric layouts in which the month precedes the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 185
# Matches the tail of a call of the form }('<payload>',<int>,<int>,'<a|b|c>'.split('|')
# and captures, in order: the payload, the two integers and the '|'-joined list
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element (case-insensitive,
# dot matches newlines) and captures its body in the named group json_ld
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 188
 189
 190 def preferredencoding():
 191     """Get preferred encoding.
 192
 193     Returns the best encoding scheme for the system, based on
 194     locale.getpreferredencoding() and some further tweaks.
 195     """
 196     try:
 197         pref = locale.getpreferredencoding()
 198         'TEST'.encode(pref)
 199     except Exception:
 200         pref = 'UTF-8'
 201
 202     return pref
 203
 204
 205 def write_json_file(obj, fn):
 206     """ Encode obj as JSON and write it to fn, atomically if possible """
 207
 208     fn = encodeFilename(fn)
 209     if sys.version_info < (3, 0) and sys.platform != 'win32':
 210         encoding = get_filesystem_encoding()
 211         # os.path.basename returns a bytes object, but NamedTemporaryFile
 212         # will fail if the filename contains non ascii characters unless we
 213         # use a unicode object
 214         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 215         # the same for os.path.dirname
 216         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 217     else:
 218         path_basename = os.path.basename
 219         path_dirname = os.path.dirname
 220
 221     args = {
 222         'suffix': '.tmp',
 223         'prefix': path_basename(fn) + '.',
 224         'dir': path_dirname(fn),
 225         'delete': False,
 226     }
 227
 228     # In Python 2.x, json.dump expects a bytestream.
 229     # In Python 3.x, it writes to a character stream
 230     if sys.version_info < (3, 0):
 231         args['mode'] = 'wb'
 232     else:
 233         args.update({
 234             'mode': 'w',
 235             'encoding': 'utf-8',
 236         })
 237
 238     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 239
 240     try:
 241         with tf:
 242             json.dump(obj, tf)
 243         if sys.platform == 'win32':
 244             # Need to remove existing file on Windows, else os.rename raises
 245             # WindowsError or FileExistsError.
 246             try:
 247                 os.unlink(fn)
 248             except OSError:
 249                 pass
 250         os.rename(tf.name, fn)
 251     except Exception:
 252         try:
 253             os.remove(tf.name)
 254         except OSError:
 255             pass
 256         raise
 257
 258
 259 if sys.version_info >= (2, 7):
 260     def find_xpath_attr(node, xpath, key, val=None):
 261         """ Find the xpath xpath[@key=val] """
 262         assert re.match(r'^[a-zA-Z_-]+$', key)
 263         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 264         return node.find(expr)
 265 else:
 266     def find_xpath_attr(node, xpath, key, val=None):
 267         for f in node.findall(compat_xpath(xpath)):
 268             if key not in f.attrib:
 269                 continue
 270             if val is None or f.attrib.get(key) == val:
 271                 return f
 272         return None
 273
 274 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 275 # the namespace parameter
 276
 277
 278 def xpath_with_ns(path, ns_map):
 279     components = [c.split(':') for c in path.split('/')]
 280     replaced = []
 281     for c in components:
 282         if len(c) == 1:
 283             replaced.append(c[0])
 284         else:
 285             ns, tag = c
 286             replaced.append('{%s}%s' % (ns_map[ns], tag))
 287     return '/'.join(replaced)
 288
 289
 290 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 291     def _find_xpath(xpath):
 292         return node.find(compat_xpath(xpath))
 293
 294     if isinstance(xpath, (str, compat_str)):
 295         n = _find_xpath(xpath)
 296     else:
 297         for xp in xpath:
 298             n = _find_xpath(xp)
 299             if n is not None:
 300                 break
 301
 302     if n is None:
 303         if default is not NO_DEFAULT:
 304             return default
 305         elif fatal:
 306             name = xpath if name is None else name
 307             raise ExtractorError('Could not find XML element %s' % name)
 308         else:
 309             return None
 310     return n
 311
 312
 313 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 314     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 315     if n is None or n == default:
 316         return n
 317     if n.text is None:
 318         if default is not NO_DEFAULT:
 319             return default
 320         elif fatal:
 321             name = xpath if name is None else name
 322             raise ExtractorError('Could not find XML element\'s text %s' % name)
 323         else:
 324             return None
 325     return n.text
 326
 327
 328 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 329     n = find_xpath_attr(node, xpath, key)
 330     if n is None:
 331         if default is not NO_DEFAULT:
 332             return default
 333         elif fatal:
 334             name = '%s[@%s]' % (xpath, key) if name is None else name
 335             raise ExtractorError('Could not find XML attribute %s' % name)
 336         else:
 337             return None
 338     return n.attrib[key]
 339
 340
 341 def get_element_by_id(id, html):
 342     """Return the content of the tag with the specified ID in the passed HTML document"""
 343     return get_element_by_attribute('id', id, html)
 344
 345
 346 def get_element_by_class(class_name, html):
 347     """Return the content of the first tag with the specified class in the passed HTML document"""
 348     retval = get_elements_by_class(class_name, html)
 349     return retval[0] if retval else None
 350
 351
 352 def get_element_by_attribute(attribute, value, html, escape_value=True):
 353     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 354     return retval[0] if retval else None
 355
 356
 357 def get_elements_by_class(class_name, html):
 358     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 359     return get_elements_by_attribute(
 360         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 361         html, escape_value=False)
 362
 363
 364 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 365     """Return the content of the tag with the specified attribute in the passed HTML document"""
 366
 367     value = re.escape(value) if escape_value else value
 368
 369     retlist = []
 370     for m in re.finditer(r'''(?xs)
 371         <([a-zA-Z0-9:._-]+)
 372          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 373          \s+%s=['"]?%s['"]?
 374          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 375         \s*>
 376         (?P<content>.*?)
 377         </\1>
 378     ''' % (re.escape(attribute), value), html):
 379         res = m.group('content')
 380
 381         if res.startswith('"') or res.startswith("'"):
 382             res = res[1:-1]
 383
 384         retlist.append(unescapeHTML(res))
 385
 386     return retlist
 387
 388
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Attributes of the most recently parsed start tag; stays empty until
        # a start tag has been seen
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs; every start tag replaces the
        # previously stored attributes
        self.attrs = dict(attrs)
 397
 398
 399 def extract_attributes(html_element):
 400     """Given a string for an HTML element such as
 401     <el
 402          a="foo" B="bar" c="&98;az" d=boz
 403          empty= noval entity="&amp;"
 404          sq='"' dq="'"
 405     >
 406     Decode and return a dictionary of attributes.
 407     {
 408         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 409         'empty': '', 'noval': None, 'entity': '&',
 410         'sq': '"', 'dq': '\''
 411     }.
 412     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 413     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 414     """
 415     parser = HTMLAttributeParser()
 416     try:
 417         parser.feed(html_element)
 418         parser.close()
 419     # Older Python may throw HTMLParseError in case of malformed HTML
 420     except compat_HTMLParseError:
 421         pass
 422     return parser.attrs
 423
 424
 425 def clean_html(html):
 426     """Clean an HTML snippet into a readable string"""
 427
 428     if html is None:  # Convenience for sanitizing descriptions etc.
 429         return html
 430
 431     # Newline vs <br />
 432     html = html.replace('\n', ' ')
 433     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 434     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 435     # Strip html tags
 436     html = re.sub('<.*?>', '', html)
 437     # Replace html entities
 438     html = unescapeHTML(html)
 439     return html.strip()
 440
 441
 442 def sanitize_open(filename, open_mode):
 443     """Try to open the given filename, and slightly tweak it if this fails.
 444
 445     Attempts to open the given filename. If this fails, it tries to change
 446     the filename slightly, step by step, until it's either able to open it
 447     or it fails and raises a final exception, like the standard open()
 448     function.
 449
 450     It returns the tuple (stream, definitive_file_name).
 451     """
 452     try:
 453         if filename == '-':
 454             if sys.platform == 'win32':
 455                 import msvcrt
 456                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 457             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 458         stream = open(encodeFilename(filename), open_mode)
 459         return (stream, filename)
 460     except (IOError, OSError) as err:
 461         if err.errno in (errno.EACCES,):
 462             raise
 463
 464         # In case of error, try to remove win32 forbidden chars
 465         alt_filename = sanitize_path(filename)
 466         if alt_filename == filename:
 467             raise
 468         else:
 469             # An exception here should be caught in the caller
 470             stream = open(encodeFilename(alt_filename), open_mode)
 471             return (stream, alt_filename)
 472
 473
 474 def timeconvert(timestr):
 475     """Convert RFC 2822 defined time string into system timestamp"""
 476     timestamp = None
 477     timetuple = email.utils.parsedate_tz(timestr)
 478     if timetuple is not None:
 479         timestamp = email.utils.mktime_tz(timetuple)
 480     return timestamp
 481
 482
 483 def sanitize_filename(s, restricted=False, is_id=False):
 484     """Sanitizes a string so it could be used as part of a filename.
 485     If restricted is set, use a stricter subset of allowed characters.
 486     Set is_id if this is not an arbitrary string, but an ID that should be kept
 487     if possible.
 488     """
 489     def replace_insane(char):
 490         if restricted and char in ACCENT_CHARS:
 491             return ACCENT_CHARS[char]
 492         if char == '?' or ord(char) < 32 or ord(char) == 127:
 493             return ''
 494         elif char == '"':
 495             return '' if restricted else '\''
 496         elif char == ':':
 497             return '_-' if restricted else ' -'
 498         elif char in '\\/|*<>':
 499             return '_'
 500         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 501             return '_'
 502         if restricted and ord(char) > 127:
 503             return '_'
 504         return char
 505
 506     # Handle timestamps
 507     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 508     result = ''.join(map(replace_insane, s))
 509     if not is_id:
 510         while '__' in result:
 511             result = result.replace('__', '_')
 512         result = result.strip('_')
 513         # Common case of "Foreign band name - English song title"
 514         if restricted and result.startswith('-_'):
 515             result = result[2:]
 516         if result.startswith('-'):
 517             result = '_' + result[len('-'):]
 518         result = result.lstrip('.')
 519         if not result:
 520             result = '_'
 521     return result
 522
 523
 524 def sanitize_path(s):
 525     """Sanitizes and normalizes path on Windows"""
 526     if sys.platform != 'win32':
 527         return s
 528     drive_or_unc, _ = os.path.splitdrive(s)
 529     if sys.version_info < (2, 7) and not drive_or_unc:
 530         drive_or_unc, _ = os.path.splitunc(s)
 531     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 532     if drive_or_unc:
 533         norm_path.pop(0)
 534     sanitized_path = [
 535         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 536         for path_part in norm_path]
 537     if drive_or_unc:
 538         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 539     return os.path.join(*sanitized_path)
 540
 541
 542 def sanitize_url(url):
 543     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 544     # the number of unwanted failures due to missing protocol
 545     if url.startswith('//'):
 546         return 'http:%s' % url
 547     # Fix some common typos seen so far
 548     COMMON_TYPOS = (
 549         # https://github.com/ytdl-org/youtube-dl/issues/15649
 550         (r'^httpss://', r'https://'),
 551         # https://bx1.be/lives/direct-tv/
 552         (r'^rmtp([es]?)://', r'rtmp\1://'),
 553     )
 554     for mistake, fixup in COMMON_TYPOS:
 555         if re.match(mistake, url):
 556             return re.sub(mistake, fixup, url)
 557     return url
 558
 559
 560 def sanitized_Request(url, *args, **kwargs):
 561     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 562
 563
 564 def expand_path(s):
 565     """Expand shell variables and ~"""
 566     return os.path.expandvars(compat_expanduser(s))
 567
 568
 569 def orderedSet(iterable):
 570     """ Remove all duplicates from the input iterable """
 571     res = []
 572     for el in iterable:
 573         if el not in res:
 574             res.append(el)
 575     return res
 576
 577
 578 def _htmlentity_transform(entity_with_semicolon):
 579     """Transforms an HTML entity to a character."""
 580     entity = entity_with_semicolon[:-1]
 581
 582     # Known non-numeric HTML entity
 583     if entity in compat_html_entities.name2codepoint:
 584         return compat_chr(compat_html_entities.name2codepoint[entity])
 585
 586     # TODO: HTML5 allows entities without a semicolon. For example,
 587     # '&Eacuteric' should be decoded as 'Éric'.
 588     if entity_with_semicolon in compat_html_entities_html5:
 589         return compat_html_entities_html5[entity_with_semicolon]
 590
 591     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 592     if mobj is not None:
 593         numstr = mobj.group(1)
 594         if numstr.startswith('x'):
 595             base = 16
 596             numstr = '0%s' % numstr
 597         else:
 598             base = 10
 599         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 600         try:
 601             return compat_chr(int(numstr, base))
 602         except ValueError:
 603             pass
 604
 605     # Unknown entity in name, return its literal representation
 606     return '&%s;' % entity
 607
 608
 609 def unescapeHTML(s):
 610     if s is None:
 611         return None
 612     assert type(s) == compat_str
 613
 614     return re.sub(
 615         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 616
 617
 618 def get_subprocess_encoding():
 619     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 620         # For subprocess calls, encode with locale encoding
 621         # Refer to http://stackoverflow.com/a/9951851/35070
 622         encoding = preferredencoding()
 623     else:
 624         encoding = sys.getfilesystemencoding()
 625     if encoding is None:
 626         encoding = 'utf-8'
 627     return encoding
 628
 629
 630 def encodeFilename(s, for_subprocess=False):
 631     """
 632     @param s The name of the file
 633     """
 634
 635     assert type(s) == compat_str
 636
 637     # Python 3 has a Unicode API
 638     if sys.version_info >= (3, 0):
 639         return s
 640
 641     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 642     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 643     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 644     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 645         return s
 646
 647     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 648     if sys.platform.startswith('java'):
 649         return s
 650
 651     return s.encode(get_subprocess_encoding(), 'ignore')
 652
 653
 654 def decodeFilename(b, for_subprocess=False):
 655
 656     if sys.version_info >= (3, 0):
 657         return b
 658
 659     if not isinstance(b, bytes):
 660         return b
 661
 662     return b.decode(get_subprocess_encoding(), 'ignore')
 663
 664
 665 def encodeArgument(s):
 666     if not isinstance(s, compat_str):
 667         # Legacy code that uses byte strings
 668         # Uncomment the following line after fixing all post processors
 669         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 670         s = s.decode('ascii')
 671     return encodeFilename(s, True)
 672
 673
 674 def decodeArgument(b):
 675     return decodeFilename(b, True)
 676
 677
 678 def decodeOption(optval):
 679     if optval is None:
 680         return optval
 681     if isinstance(optval, bytes):
 682         optval = optval.decode(preferredencoding())
 683
 684     assert isinstance(optval, compat_str)
 685     return optval
 686
 687
 688 def formatSeconds(secs):
 689     if secs > 3600:
 690         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 691     elif secs > 60:
 692         return '%d:%02d' % (secs // 60, secs % 60)
 693     else:
 694         return '%d' % secs
 695
 696
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler configured according to params.

    params is a mapping; a true 'nocheckcertificate' entry (default False)
    turns certificate verification off. Additional keyword arguments are
    passed through to YoutubeDLHTTPSHandler unchanged.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname has to be switched off before verify_mode can
            # be set to CERT_NONE
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    # Reached only without a usable default context (see above)
    if sys.version_info < (3, 2):
        # No SSL context is passed on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 720
 721
 722 def bug_reports_message():
 723     if ytdl_is_updateable():
 724         update_cmd = 'type  youtube-dl -U  to update'
 725     else:
 726         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 727     msg = '; please report this issue on https://yt-dl.org/bug .'
 728     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 729     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 730     return msg
 731
 732
 733 class YoutubeDLError(Exception):
 734     """Base exception for YoutubeDL errors."""
 735     pass
 736
 737
 738 class ExtractorError(YoutubeDLError):
 739     """Error during info extraction."""
 740
 741     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 742         """ tb, if given, is the original traceback (so that it can be printed out).
 743         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 744         """
 745
 746         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 747             expected = True
 748         if video_id is not None:
 749             msg = video_id + ': ' + msg
 750         if cause:
 751             msg += ' (caused by %r)' % cause
 752         if not expected:
 753             msg += bug_reports_message()
 754         super(ExtractorError, self).__init__(msg)
 755
 756         self.traceback = tb
 757         self.exc_info = sys.exc_info()  # preserve original exception
 758         self.cause = cause
 759         self.video_id = video_id
 760
 761     def format_traceback(self):
 762         if self.traceback is None:
 763             return None
 764         return ''.join(traceback.format_tb(self.traceback))
 765
 766
 767 class UnsupportedError(ExtractorError):
 768     def __init__(self, url):
 769         super(UnsupportedError, self).__init__(
 770             'Unsupported URL: %s' % url, expected=True)
 771         self.url = url
 772
 773
 774 class RegexNotFoundError(ExtractorError):
 775     """Error when a regex didn't match"""
 776     pass
 777
 778
 779 class GeoRestrictedError(ExtractorError):
 780     """Geographic restriction Error exception.
 781
 782     This exception may be thrown when a video is not available from your
 783     geographic location due to geographic restrictions imposed by a website.
 784     """
 785     def __init__(self, msg, countries=None):
 786         super(GeoRestrictedError, self).__init__(msg, expected=True)
 787         self.msg = msg
 788         self.countries = countries
 789
 790
 791 class DownloadError(YoutubeDLError):
 792     """Download Error exception.
 793
 794     This exception may be thrown by FileDownloader objects if they are not
 795     configured to continue on errors. They will contain the appropriate
 796     error message.
 797     """
 798
 799     def __init__(self, msg, exc_info=None):
 800         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 801         super(DownloadError, self).__init__(msg)
 802         self.exc_info = exc_info
 803
 804
 805 class SameFileError(YoutubeDLError):
 806     """Same File exception.
 807
 808     This exception will be thrown by FileDownloader objects if they detect
 809     multiple files would have to be downloaded to the same file on disk.
 810     """
 811     pass
 812
 813
 814 class PostProcessingError(YoutubeDLError):
 815     """Post Processing exception.
 816
 817     This exception may be raised by PostProcessor's .run() method to
 818     indicate an error in the postprocessing task.
 819     """
 820
 821     def __init__(self, msg):
 822         super(PostProcessingError, self).__init__(msg)
 823         self.msg = msg
 824
 825
 826 class MaxDownloadsReached(YoutubeDLError):
 827     """ --max-downloads limit has been reached. """
 828     pass
 829
 830
 831 class UnavailableVideoError(YoutubeDLError):
 832     """Unavailable Format exception.
 833
 834     This exception will be thrown when a video is requested
 835     in a format that is not available for that video.
 836     """
 837     pass
 838
 839
 840 class ContentTooShortError(YoutubeDLError):
 841     """Content Too Short exception.
 842
 843     This exception may be raised by FileDownloader objects when a file they
 844     download is too small for what the server announced first, indicating
 845     the connection was probably interrupted.
 846     """
 847
 848     def __init__(self, downloaded, expected):
 849         super(ContentTooShortError, self).__init__(
 850             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
 851         )
 852         # Both in bytes
 853         self.downloaded = downloaded
 854         self.expected = expected
 855
 856
 857 class XAttrMetadataError(YoutubeDLError):
 858     def __init__(self, code=None, msg='Unknown error'):
 859         super(XAttrMetadataError, self).__init__(msg)
 860         self.code = code
 861         self.msg = msg
 862
 863         # Parsing code and msg
 864         if (self.code in (errno.ENOSPC, errno.EDQUOT)
 865                 or 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 866             self.reason = 'NO_SPACE'
 867         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 868             self.reason = 'VALUE_TOO_LONG'
 869         else:
 870             self.reason = 'NOT_SUPPORTED'
 871
 872
 873 class XAttrUnavailableError(YoutubeDLError):
 874     pass
 875
 876
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate `http_class`, optionally pinned to a configured source address.

    ydl_handler -- object whose `_params` mapping is queried for the
                   'source_address' option
    http_class  -- connection class; called with *args and **kwargs
    is_https    -- when true, the Python 2.6 fallback wraps the socket with SSL

    Returns the connection instance. When a source address is configured the
    instance is patched so that only remote addresses of the matching IP
    family are tried and the socket is bound to the source address.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NOTE: the `source_address` parameter shadows the outer variable;
            # it is indexed with [0] and passed to bind(), i.e. it is expected
            # to be an (ip, port) tuple such as `sa` below.
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source IP selects IPv4, anything else IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try every candidate in turn; the first successful connect wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    # Remember the failure and release the half-open socket
                    err = _
                    if sock is not None:
                        sock.close()
            # No candidate connected: re-raise the last error, if there was one
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        # Only patch connection objects that expose this hook
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        # Port 0 is passed to bind() together with the configured address
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace connect() entirely
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
 940
 941
 942 def handle_youtubedl_headers(headers):
 943     filtered_headers = headers
 944
 945     if 'Youtubedl-no-compression' in filtered_headers:
 946         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 947         del filtered_headers['Youtubedl-no-compression']
 948
 949     return filtered_headers
 950
 951
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Initialize the base handler and keep `params` on `self._params`."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open `req` over plain HTTP, via a SOCKS proxy if one is requested.

        The proxy is requested through the internal 'Ytdl-socks-proxy'
        header, which is removed from the request before it is sent.
        """
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        # is_https=False: this handler method serves plain HTTP
        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first, then zlib-wrapped."""
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Pre-process an outgoing request and return the request to send.

        Percent-encodes the URL if needed, adds missing standard headers and
        applies the internal youtube-dl headers.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response: decompress the body and fix redirect URLs.

        gzip and deflate bodies are decompressed and the response is replaced
        by a new `addinfourl` object without the Content-encoding header. For
        3xx responses a Location header with unescaped characters is
        replaced by its percent-encoded form.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if every attempt
                # fails, the for/else re-raises the original error
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    # Undo the iso-8859-1 decoding and reinterpret the bytes as UTF-8
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic goes through the same request/response processing
    https_request = http_request
    https_response = http_response
1073
1074
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of `base_class` that connects through a SOCKS proxy.

    base_class  -- HTTPConnection or HTTPSConnection (sub)class
    socks_proxy -- proxy URL; the scheme ('socks', 'socks4', 'socks4a' or
                   'socks5') selects the protocol, the port defaults to 1080
                   and percent-encoded credentials are decoded
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    parsed = compat_urlparse.urlparse(socks_proxy)
    scheme = parsed.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def _decode_credential(value):
        # Empty/missing credentials are passed through unchanged
        return compat_urllib_parse_unquote_plus(value) if value else value

    proxy_args = (
        socks_type,
        parsed.hostname,
        parsed.port or 1080,
        True,  # Remote DNS
        _decode_credential(parsed.username),
        _decode_credential(parsed.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # Plain HTTP is done here; HTTPS still needs the TLS layer
            if not isinstance(self, compat_http_client.HTTPSConnection):
                return
            if hasattr(self, '_context'):  # Python > 2.6
                self.sock = self._context.wrap_socket(
                    self.sock, server_hostname=self.host)
            else:
                self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1116
1117
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with a configurable connection class and SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open `req` over HTTPS, via a SOCKS proxy if one is requested."""
        open_kwargs = {}
        connection_class = self._https_conn_class

        # Forward whichever SSL related attributes the base handler provides
        # ('_context': python > 2.6, '_check_hostname': python 3.x)
        for attr, key in (('_context', 'context'), ('_check_hostname', 'check_hostname')):
            if hasattr(self, attr):
                open_kwargs[key] = getattr(self, attr)

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1141
1142
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """MozillaCookieJar that understands '#HttpOnly_' lines and `expires=0`.

    Lines prefixed with '#HttpOnly_' are loaded as regular cookies, and
    session cookies are written with, and recognized by, `expires` set to 0.
    """

    _HTTPONLY_PREFIX = '#HttpOnly_'

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file (same arguments as MozillaCookieJar.save)."""
        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0
        compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename -- path of the cookie file; defaults to `self.filename`.
                    ValueError is raised when neither is set.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        cf = io.StringIO()
        # Decode explicitly as UTF-8 so that the result does not depend on
        # the platform default encoding (or on an implicit ASCII decode of
        # byte strings on Python 2)
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                # Strip the HttpOnly marker so the line parses as a cookie
                if line.startswith(self._HTTPONLY_PREFIX):
                    line = line[len(self._HTTPONLY_PREFIX):]
                cf.write(compat_str(line))
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1184
1185
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that also processes HTTPS traffic.

    Requests and responses are currently passed straight to the base class;
    the Set-Cookie escaping workaround in http_response() is commented out.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate response processing to HTTPCookieProcessor.http_response."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is disabled (left here as a comment only)
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Use the same cookie handling for HTTPS as for HTTP
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1208
1209
def extract_timezone(date_str):
    """Split a trailing 'Z' or numeric UTC offset off a date string.

    Returns a tuple (offset, rest) where `offset` is a datetime.timedelta
    (zero for 'Z' or when nothing was recognized) and `rest` is `date_str`
    without the recognized timezone part.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # Plain 'Z' designator: UTC
        return datetime.timedelta(), remainder

    factor = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=factor * int(tz_match.group('hours')),
        minutes=factor * int(tz_match.group('minutes')))
    return offset, remainder
1226
1227
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    Fractional seconds are dropped. When `timezone` (a timedelta) is not
    given it is extracted from the string itself. None is returned for a
    None input or when the string does not match the expected layout.
    """
    if date_str is None:
        return None

    cleaned = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_moment = datetime.datetime.strptime(cleaned, layout) - timezone
        return calendar.timegm(utc_moment.timetuple())
    except ValueError:
        return None
1245
1246
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if `day_first` is true, else DATE_FORMATS_MONTH_FIRST."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1249
1250
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    None is returned for a None input or when the date cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas, then remove AM/PM + timezone
    cleaned = re.sub(
        r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses determines the result
    for fmt in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(cleaned, fmt)
            result = parsed.strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to the email (RFC 2822 style) date parser
        fields = email.utils.parsedate_tz(cleaned)
        if fields:
            try:
                result = datetime.datetime(*fields[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1277
1278
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    date_str  -- the date string, or None
    day_first -- selects which list of known formats is tried (see
                 date_formats())

    Returns an int timestamp, or None for a None input or when the string
    cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # Hours to add afterwards when the string carries a PM marker
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The UTC offset was already stripped from date_str by
        # extract_timezone(), so it has to be applied here as well, just like
        # in the strptime path above. Computed by hand (instead of
        # timedelta.total_seconds()) to keep the result an int.
        tz_seconds = timezone.days * 86400 + timezone.seconds
        return calendar.timegm(timetuple) + pm_delta * 3600 - tz_seconds
1310
1311
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, returning `default_ext` on failure.

    The query string is ignored. A purely alphanumeric suffix after the last
    dot is returned as is; otherwise the suffix (minus trailing slashes) is
    accepted only if it is listed in KNOWN_EXTENSIONS.
    """
    if url is None or '.' not in url:
        return default_ext

    without_query = url.split('?', 1)[0]
    candidate = without_query.rsplit('.', 1)[-1]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1323
1324
def subtitles_filename(filename, sub_lang, sub_format):
    """Return `filename` with its last extension replaced by '<sub_lang>.<sub_format>'.

    A filename without any dot simply gets the two parts appended.
    """
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1327
1328
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD, one of
    'now', 'today' or 'yesterday', or a relative expression
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    # A bad approximation?
    days_per_unit = {'day': 1, 'week': 7, 'month': 30, 'year': 365}
    span = datetime.timedelta(days=amount * days_per_unit[relative.group('unit')])
    return today + span
1356
1357
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if parts is None else '-'.join(parts.groups())
1366
1367
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a missing bound defaults to the earliest/latest representable date"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        first, last = self.start.isoformat(), self.end.isoformat()
        return '%s - %s' % (first, last)
1397
1398
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte string is decoded with the preferred encoding first
    name = name.decode(preferredencoding()) if isinstance(name, bytes) else name

    assert isinstance(name, compat_str)
    return name
1407
1408
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s   -- the text to write
    out -- the output stream; only streams whose fileno() is 1 or 2 and
           that are attached to a console are handled here

    Raises OSError when WriteConsoleW reports a failure.
    """
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle (-11 and -12 are the
    # Win32 STD_OUTPUT_HANDLE and STD_ERROR_HANDLE constants)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A console is a character device (ignoring the REMOTE flag) for
        # which GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at once, stopping in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
1482
1483
def write_string(s, out=None, encoding=None):
    """Write the text `s` to `out` (default: sys.stderr) and flush it.

    Streams with 'b' in their mode get bytes, text streams exposing a
    `buffer` get bytes written to that buffer, anything else gets the text
    itself. Characters that cannot be encoded are dropped.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1504
1505
def bytes_to_intlist(bs):
    """Convert a byte string (or sequence of ints) to a list of ints.

    Empty or None input yields an empty list."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and 1-character strings on Python 2
    return list(bs) if isinstance(bs[0], int) else [ord(ch) for ch in bs]
1513
1514
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255) into a byte string; empty input gives b''."""
    return compat_struct_pack('%dB' % len(xs), *xs) if xs else b''
1519
1520
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) with one of three
# implementations: LockFileEx/UnlockFileEx on Windows, fcntl.flock elsewhere,
# or stubs raising IOError when fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes declaration of the OVERLAPPED structure passed to
    # LockFileEx/UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte count to lock, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object `f`; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file() needs it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 requests an exclusive lock (LOCKFILE_EXCLUSIVE_LOCK in
        # the LockFileEx documentation), 0x0 a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release a lock taken by _lock_file(); raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on `f`."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock() lock on `f`."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        # Stubs: locking always fails with IOError on such platforms
        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1594
1595
class locked_file(object):
    """Context manager around a text file that is locked while in use.

    The file is opened on construction. Entering the context takes a shared
    lock for mode 'r' and an exclusive one for 'a'/'w'; leaving it unlocks
    and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file if it cannot be locked
            self.f.close()
            raise
        else:
            return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1625
1626
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1630
1631
def shell_quote(args):
    """Return the arguments shell-quoted and joined with single spaces.

    Byte string arguments are decoded with the filesystem encoding first."""
    encoding = get_filesystem_encoding()
    # We may get a filename encoded with 'encodeFilename'
    quoted = [
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args]
    return ' '.join(quoted)
1641
1642
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled in `url` is merged into `data` (in place, taking
    precedence) and the result is appended, JSON-encoded, as URL fragment.
    """
    plain_url, previous = unsmuggle_url(url, {})
    data.update(previous)
    payload = json.dumps(data)
    fragment = compat_urllib_parse_urlencode({'__youtubedl_smuggle': payload})
    return plain_url + '#' + fragment
1651
1652
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    URLs without smuggled data are returned unchanged together with `default`."""
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _sep, fragment = smug_url.rpartition('#')
        payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(payload)
    return smug_url, default
1660
1661
def format_bytes(bytes):
    """Format a byte count as a human readable string with binary suffixes.

    bytes -- the size as a number, as a numeric `str`, or None

    Returns 'N/A' for None, otherwise the value scaled to the largest
    fitting unit with two decimals, e.g. '1.50KiB'. Values below one byte
    are reported in 'B', values beyond the largest unit in 'YiB', and
    negative values keep their sign.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)

    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # abs(): math.log() is undefined for negative numbers
        exponent = int(math.log(abs(bytes), 1024.0))
        # Clamp to the suffix table: a negative exponent (tiny fractions)
        # would silently index from the end of the list, and a too large one
        # would raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1674
1675
def lookup_unit_table(unit_table, s):
    """Parse a leading "<number><unit>" in ``s`` and scale it.

    ``unit_table`` maps unit strings to multipliers. The number may use
    either ',' or '.' as its decimal separator. Returns the scaled value
    truncated to int, or None when ``s`` does not start with such a pair.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1685
1686
def parse_filesize(s):
    """Parse a textual file size such as '1.5 MiB' into a number of bytes.

    Returns None for None input or when no known unit follows the number.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1, 'bytes': 1}
    # (prefix letter, decimal unit name, binary unit name), smallest first
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        _UNIT_TABLE.update({
            letter + 'iB': binary,
            letter + 'B': decimal,
            lower + 'B': binary,
            letter + 'b': decimal,
            lower + 'b': decimal,
            decimal_name + 'bytes': decimal,
            binary_name + 'bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
1756
1757
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an int.

    Returns None for None input or when the text is neither purely numeric
    nor a number followed by a known suffix.
    """
    if s is None:
        return None

    text = s.strip()

    # Digits with optional separators need no suffix handling
    if re.match(r'^[\d,.]+$', text) is not None:
        return str_to_int(text)

    multiplier_by_suffix = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(multiplier_by_suffix, text)
1777
1778
def parse_resolution(s):
    """Extract resolution information from free text.

    Recognises, in this order, '<w>x<h>' (also 'X' or '×'), '<h>p'/'<h>i'
    and '4k'/'8k'. Returns a dict with 'width' and/or 'height', or an
    empty dict when nothing is recognised or ``s`` is None.
    """
    if s is None:
        return {}

    dims = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if dims:
        width, height = dims.group('w', 'h')
        return {
            'width': int(width),
            'height': int(height),
        }

    lines = re.search(r'\b(\d+)[pPiI]\b', s)
    if lines:
        return {'height': int(lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        # 4k -> 2160 lines, 8k -> 4320 lines
        return {'height': int(kilo.group(1)) * 540}

    return {}
1799
1800
def parse_bitrate(s):
    """Return the integer that precedes 'kbps' in ``s``.

    Returns None when ``s`` is not a text string or has no such number.
    """
    if not isinstance(s, compat_str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
1807
1808
def month_by_name(name, lang='en'):
    """ Return the 1-based number of the month called ``name``.

    Names are taken from MONTH_NAMES[lang], falling back to the English
    list when ``lang`` is not present. Returns None for unknown names. """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
1818
1819
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, i.e. the first three letters of the English name.
        Returns None for unknown abbreviations. """

    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
1828
1829
def fix_xml_ampersands(xml_str):
    """Replace with '&amp;' every '&' that does not already start one of
    the entities amp/lt/gt/apos/quot or a short numeric reference."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1836
1837
def setproctitle(title):
    """Set the process name through libc's ``prctl``, best effort.

    ``title`` must be a text string. Nothing happens (and no error is
    raised) on Jython, when 'libc.so.6' cannot be loaded, or when the
    loaded library has no ``prctl``.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # item, so the name handed to prctl is always NUL-terminated. A buffer
    # sized exactly len(title_bytes) would leave no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1862
1863
def remove_start(s, start):
    """Return ``s`` without its leading ``start``.

    ``s`` is returned unchanged when it is None or lacks that prefix.
    """
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1866
1867
def remove_end(s, end):
    """Return ``s`` without its trailing ``end``.

    ``s`` is returned unchanged when it is None, when ``end`` is empty, or
    when ``s`` does not end with ``end``.
    """
    # An empty suffix must be special-cased: s[:-0] would be the empty string
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1870
1871
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing ``s``.

    None, values shorter than two items and unquoted values are returned
    unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1879
1880
def url_basename(url):
    """Return the last '/'-separated segment of the URL's path component,
    ignoring leading and trailing slashes."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
1884
1885
def base_url(url):
    """Return the http(s) URL truncated after the last '/' that precedes
    any '?', '#' or '&'.

    Raises AttributeError when ``url`` does not match that shape.
    """
    found = re.match(r'https?://[^?#&]+/', url)
    return found.group()
1888
1889
def urljoin(base, path):
    """Join ``path`` onto ``base``, tolerating bad input.

    Byte strings are decoded as UTF-8. Returns None when ``path`` is empty
    or not text, or when ``base`` is not an http(s) or scheme-relative URL.
    A ``path`` that is already absolute or scheme-relative is returned
    as-is.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    base_is_usable = isinstance(base, compat_str) and re.match(
        r'^(?:https?:)?//', base)
    if not base_is_usable:
        return None
    return compat_urlparse.urljoin(base, path)
1903
1904
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always reported as HEAD."""

    def get_method(self):
        return 'HEAD'
1908
1909
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always reported as PUT."""

    def get_method(self):
        return 'PUT'
1913
1914
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert ``v`` to int, returning ``default`` when that is impossible.

    When ``get_attr`` is given, the attribute of that name is read from
    ``v`` first (a missing attribute counts as None). None and the empty
    string yield ``default``. The result is ``int(v) * invscale // scale``.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
1927
1928
def str_or_none(v, default=None):
    """Return ``v`` converted with ``compat_str``, or ``default`` when ``v`` is None."""
    if v is None:
        return default
    return compat_str(v)
1931
1932
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Thousands separators (',' and '.') and '+' signs are dropped from text
    before conversion. Values that are already numeric are converted
    directly. Returns None for None and for anything that cannot be
    converted, instead of raising.
    """
    if int_str is None:
        return None
    try:
        int_str = re.sub(r'[,\.\+]', '', int_str)
    except TypeError:
        # Not text: the value may already be a number, try it as it is
        pass
    try:
        return int(int_str)
    except (TypeError, ValueError):
        return None
1939
1940
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return ``float(v) * invscale / scale``, or ``default`` when ``v`` is
    None or cannot be converted to float."""
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
1948
1949
def bool_or_none(v, default=None):
    """Return ``v`` when it is a bool, otherwise ``default``."""
    if isinstance(v, bool):
        return v
    return default
1952
1953
def strip_or_none(v, default=None):
    """Return ``v.strip()`` for text strings, otherwise ``default``."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
1956
1957
def url_or_none(url):
    """Return the stripped ``url`` when it starts with '<scheme>://' or '//'.

    Returns None for empty or non-text values and for anything else.
    """
    if not (url and isinstance(url, compat_str)):
        return None
    candidate = url.strip()
    if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', candidate):
        return candidate
    return None
1963
1964
def parse_duration(s):
    """Parse a textual duration and return it in seconds.

    Three notations are tried in turn: colon-separated fields
    ([[[days:]hours:]mins:]secs[.fraction]), components carrying unit
    suffixes (optionally in the 'P...T...' form), and a lone fractional
    hours or minutes value. Returns None when ``s`` is not a string or
    matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    patterns = (
        # Colon-separated fields
        r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$',
        # Unit-suffixed components; years, months and weeks are accepted
        # but not captured, so they do not contribute to the result
        r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''',
        # A single, possibly fractional, hours or minutes value
        r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$',
    )

    for pattern in patterns:
        m = re.match(pattern, s)
        if m:
            parts = m.groupdict()
            break
    else:
        return None

    # Each component is multiplied factor by factor and summed in this
    # fixed order so the floating point result stays reproducible
    weights = (
        ('secs', ()),
        ('mins', (60,)),
        ('hours', (60, 60)),
        ('days', (24, 60, 60)),
        ('ms', ()),
    )
    duration = 0
    for name, factors in weights:
        value = parts.get(name)
        if not value:
            continue
        amount = float(value)
        for factor in factors:
            amount = amount * factor
        duration += amount
    return duration
2021
2022
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ``ext`` in front of the filename's real extension.

    When ``expected_real_ext`` is given and the real extension differs from
    it, ``ext`` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
2029
2030
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's real extension with ``ext``.

    When ``expected_real_ext`` is given and the real extension differs from
    it, ``ext`` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
2036
2037
def check_executable(exe, args=[]):
    """ Checks whether the given binary can be started, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when starting the process raises OSError. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
2046
2047
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    The version is extracted from the combined stdout/stderr output by
    detect_exe_version(). """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
2065
2066
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first group of ``version_re`` found in ``output``.

    Without ``version_re`` a generic 'version <token>' pattern is used.
    ``unrecognized`` is returned when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2076
2077
class PagedList(object):
    """Base for paged lists; ``__len__`` relies on a ``getslice`` method
    that this class itself does not define."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
2082
2083
class OnDemandPagedList(PagedList):
    """Paged list that calls ``pagefunc`` only for the pages a slice needs,
    optionally remembering the pages it has already fetched."""

    def __init__(self, pagefunc, pagesize, use_cache=True):
        # pagefunc(pagenum) is called with a 0-based page number and its
        # result is turned into a list
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        With ``end=None`` entries are collected until a page that is not
        full is reached.
        """
        res = []
        # An empty range needs no page at all. Without this guard no page
        # would ever be recognised as the last one of the range, so every
        # page would be fetched and returned.
        if end is not None and end <= start:
            return res
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of the first wanted entry inside this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry, when the range ends
            # inside this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
2134
2135
class InAdvancePagedList(PagedList):
    """Paged list for sources whose number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc(pagenum) is called with a 0-based page number and its
        # result is turned into a list
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        With ``end=None`` every page from the one containing ``start`` up
        to the last page is read.
        """
        res = []
        start_page = start // self._pagesize
        # Never request pages beyond the known page count, even when
        # ``end`` points past the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first page read needs its leading entries dropped
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
2163
2164
def uppercase_escape(s):
    """Decode every '\\UXXXXXXXX' escape sequence found in ``s``."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2171
2172
def lowercase_escape(s):
    """Decode every '\\uXXXX' escape sequence found in ``s``."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2179
2180
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Only on Python 2 is text first encoded to UTF-8 bytes
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_bytes:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2186
2187
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded, every other component is percent-escaped
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped.geturl()
2198
2199
def read_batch_urls(batch_fd):
    """Read URLs, one per line, from ``batch_fd`` and close it.

    Lines are decoded as UTF-8 when they are not text, a leading byte
    order mark is dropped, surrounding whitespace is stripped, and empty
    lines as well as lines starting with '#', ';' or ']' are skipped.
    Returns the remaining lines as a list.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM decodes to the single character U+FEFF. The
        # three-character form is kept for input where the BOM bytes were
        # presumably decoded one character each.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2214
2215
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2218
2219
def update_url_query(url, query):
    """Return ``url`` with its query string updated from the ``query`` mapping.

    Keys in ``query`` replace existing parameters of the same name. An
    empty ``query`` returns ``url`` untouched.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2228
2229
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from ``req`` with selected parts replaced.

    ``headers`` are merged over the original headers, ``data`` and ``url``
    replace the originals when truthy, and ``query`` is merged into the
    URL's query string. HEAD and PUT requests keep their method; a
    ``timeout`` attribute is carried over when present.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    body = data or req.data
    target = update_url_query(url or req.get_full_url(), query)
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), compat_urllib_request.Request)
    new_req = request_class(
        target, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2248
2249
def _multipart_encode_impl(data, boundary):
    """Encode ``data`` as multipart/form-data using the given boundary.

    Returns ``(body_bytes, content_type)``. Raises ValueError when the
    boundary occurs inside one of the encoded fields.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')

    chunks = []
    for k, v in data.items():
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(b'--' + boundary_bytes + b'\r\n')
        chunks.append(content)

    chunks.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(chunks), content_type
2270
2271
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated, and regenerated for as long as it
        clashes with the data.

    Returns a (body, content_type) tuple. A specified boundary that
    clashes with the data raises ValueError.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-supplied boundary is used as-is; a clash propagates
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary occurs in the data: draw another one
            continue
2300
2301
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable of several keys, in ``d``.

    For a list/tuple of keys the first value that is present, not None and
    (unless ``skip_false_values`` is off) truthy is returned. A single key
    behaves like ``d.get(key, default)``.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
2310
2311
def try_get(src, getter, expected_type=None):
    """Apply one getter, or several in turn, to ``src`` and return the
    first acceptable result.

    A getter that raises AttributeError, KeyError, TypeError or IndexError
    is skipped, as is a result that is not an instance of
    ``expected_type`` (when given). Returns None if no getter succeeds.
    """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for candidate in getters:
        try:
            value = candidate(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2323
2324
def merge_dicts(*dicts):
    """Merge dicts, giving precedence to the earlier ones.

    None values are ignored. A key already present is only overwritten
    when its current value is an empty text string and the new value is a
    non-empty text string.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if k in merged:
                current = merged[k]
                fills_empty_text = (
                    isinstance(v, compat_str) and v
                    and isinstance(current, compat_str)
                    and not current)
                if not fills_empty_text:
                    continue
            merged[k] = v
    return merged
2337
2338
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return ``string`` as text, converting it with ``compat_str`` (using
    ``encoding`` and ``errors``) when it is not text already."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2341
2342
# US film rating labels mapped to the age limit that parse_age_limit()
# returns for them
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# 'TV-*' rating labels mapped to the age limit that parse_age_limit()
# returns for them
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2360
2361
def parse_age_limit(s):
    """Convert an age rating to a numeric age limit.

    Accepts an int between 0 and 21, text such as '18' or '18+', a key of
    US_RATINGS, or a TV rating written 'TV-14', 'TV_14' or 'TV14'. Returns
    None for anything else.
    """
    if type(s) is int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    # Keys look like 'TV-XX'; only the part after 'TV-' goes into the pattern
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, s)
    if tv:
        return TV_PARENTAL_GUIDELINES['TV-' + tv.group(1)]
    return None
2376
2377
def strip_jsonp(code):
    """Strip a JSONP wrapper such as 'callback(...);' and return its payload.

    Text that does not look like a JSONP call is returned unchanged.
    """
    wrapper = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return wrapper.sub(r'\g<callback_data>', code)
2386
2387
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Quoted strings are re-quoted with double quotes, bare identifiers and
    numeric keys are quoted, comments and trailing commas are dropped, and
    unquoted hexadecimal/octal integers are rewritten in decimal. The
    content of quoted strings is never reinterpreted as a number.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )
    # JS string escapes that need rewriting; anything else is kept as-is
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_escape(m):
        token = m.group(0)
        return STRING_ESCAPES.get(token, token)

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            v = re.sub(r'(?s)\\.|"', fix_escape, v[1:-1])
        else:
            # Only unquoted tokens may be integer literals; a quoted string
            # such as "010" has to stay a string
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2427
2428
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in
    ``quality_ids``, or to -1 when the id is not in the list. """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
2437
2438
# %-style template with the named fields 'title', 'id' and 'ext'
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2440
2441
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than ``length`` are cut so that, including the trailing
    '...', they are ``length`` long. None is passed through. """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
2450
2451
def version_tuple(v):
    """Split a version string on '.' and '-' and return its parts as a
    tuple of ints. Raises ValueError for non-numeric parts."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2454
2455
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether ``version`` is older than ``limit``.

    When ``version`` is empty or either string cannot be parsed, the
    answer is ``not assume_new``.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
2463
2464
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    ``sys`` has a ``frozen`` attribute. """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
2470
2471
def args_to_str(args):
    """Return ``args`` as one space-separated, shell-quoted string."""
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2475
2476
def error_to_compat_str(err):
    """Return ``str(err)`` as text on both Python 2 and 3."""
    text = str(err)
    if sys.version_info[0] >= 3:
        return text
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return text.decode(preferredencoding())
2484
2485
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    A few full types are special-cased. Otherwise the subtype (lower-cased,
    without parameters) is looked up in a table and returned unchanged
    when it is not listed there. Returns None for None.
    """
    if mt is None:
        return None

    full_type_map = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    if mt in full_type_map:
        return full_type_map[mt]

    subtype = mt.rpartition('/')[2]
    subtype = subtype.split(';')[0].strip().lower()

    subtype_map = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }
    return subtype_map.get(subtype, subtype)
2521
2522
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video and audio codec.

    Returns a dict with 'vcodec' and 'acodec'; a side that is absent is
    reported as 'none'. Unknown codec names produce a warning on stderr.
    When no codec is recognised, two entries are taken as (video, audio)
    and a single entry as audio only. Returns an empty dict for empty
    input or when nothing can be derived.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    stripped = (part.strip() for part in codecs_str.strip().strip(',').split(','))
    splited_codecs = [part for part in stripped if part]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Nothing was recognised, so vcodec/acodec are both still None
        # here; fall back to the position of the entries instead
        if len(splited_codecs) == 2:
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            return {
                'vcodec': 'none',
                'acodec': splited_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
2557
2558
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response's headers.

    The filename of an 'attachment' Content-Disposition header is tried
    first; otherwise the Content-Type header decides.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = determine_ext(m.group('filename'), default_ext=None) if m else None
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
2571
2572
def encode_data_uri(data, mime_type):
    """Return a base64 'data:' URI for the bytes ``data`` with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2575
2576
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no age limit is set or when the content has no
    limit of its own. """

    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2585
2586
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    A leading byte order mark selects the encoding (UTF-8 otherwise). The
    result is a match object when the decoded text starts, after optional
    whitespace, with '<', and None otherwise. """

    # Longer marks first: the UTF-32-LE mark starts with the UTF-16-LE one
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, skip = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, skip = enc, len(bom)
            break
    s = first_bytes[skip:].decode(encoding, 'replace')

    return re.match(r'^\s*<', s)
2605
2606
def determine_protocol(info_dict):
    """Return the protocol to use for ``info_dict``.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the 'url' entry: an 'rtmp'/'mms'/'rtsp' prefix, then an
    'm3u8'/'f4m' extension, and finally the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2627
2628
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column except the last is left-aligned and padded to the width
    of its longest cell plus one; rows are joined with newlines. """
    table = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in table)
2635
2636
2637 def _match_one(filter_part, dct):
2638     COMPARISON_OPERATORS = {
2639         '<': operator.lt,
2640         '<=': operator.le,
2641         '>': operator.gt,
2642         '>=': operator.ge,
2643         '=': operator.eq,
2644         '!=': operator.ne,
2645     }
2646     operator_rex = re.compile(r'''(?x)\s*
2647         (?P<key>[a-z_]+)
2648         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2649         (?:
2650             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2651             (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
2652             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2653         )
2654         \s*$
2655         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2656     m = operator_rex.search(filter_part)
2657     if m:
2658         op = COMPARISON_OPERATORS[m.group('op')]
2659         actual_value = dct.get(m.group('key'))
2660         if (m.group('quotedstrval') is not None
2661             or m.group('strval') is not None
2662             # If the original field is a string and matching comparisonvalue is
2663             # a number we should respect the origin of the original field
2664             # and process comparison value as a string (see
2665             # https://github.com/ytdl-org/youtube-dl/issues/11082).
2666             or actual_value is not None and m.group('intval') is not None
2667                 and isinstance(actual_value, compat_str)):
2668             if m.group('op') not in ('=', '!='):
2669                 raise ValueError(
2670                     'Operator %s does not support string values!' % m.group('op'))
2671             comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2672             quote = m.group('quote')
2673             if quote is not None:
2674                 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
2675         else:
2676             try:
2677                 comparison_value = int(m.group('intval'))
2678             except ValueError:
2679                 comparison_value = parse_filesize(m.group('intval'))
2680                 if comparison_value is None:
2681                     comparison_value = parse_filesize(m.group('intval') + 'B')
2682                 if comparison_value is None:
2683                     raise ValueError(
2684                         'Invalid integer value %r in filter part %r' % (
2685                             m.group('intval'), filter_part))
2686         if actual_value is None:
2687             return m.group('none_inclusive')
2688         return op(actual_value, comparison_value)
2689
2690     UNARY_OPERATORS = {
2691         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
2692         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
2693     }
2694     operator_rex = re.compile(r'''(?x)\s*
2695         (?P<op>%s)\s*(?P<key>[a-z_]+)
2696         \s*$
2697         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2698     m = operator_rex.search(filter_part)
2699     if m:
2700         op = UNARY_OPERATORS[m.group('op')]
2701         actual_value = dct.get(m.group('key'))
2702         return op(actual_value)
2703
2704     raise ValueError('Invalid filter part %r' % filter_part)
2705
2706
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The filter is an '&'-separated conjunction; stop at the first part
    # that does not hold.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2712
2713
def match_filter_func(filter_str):
    """ Wrap `filter_str` into a callable taking an info dict.

    The callable returns None when the info dict passes the filter, and a
    human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            # Name the video by its title, then its id, then a generic label
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2722
2723
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP/TTML time expression to seconds.

    Accepts an offset in seconds ("12", "12.5", "12.5s") or a clock time
    ("HH:MM:SS", optionally followed by a fraction introduced by '.' or
    ':'). Returns a float, or None for empty or unrecognised input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A fraction written as "SS:fff" is read as "SS.fff"
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2735
2736
def srt_subtitles_timecode(seconds):
    """ Format a time offset in seconds as an SRT timecode "HH:MM:SS,mmm".

    The value is rounded to the nearest millisecond before it is split
    into fields. Formatting the float fields directly with %d truncates,
    so binary rounding noise turned e.g. 5.84 into "00:00:05,839".

    `seconds` is expected to be a non-negative int or float.
    """
    total_millis = int(round(seconds * 1000))
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2739
2740
def dfxp2srt(dfxp_data):
    '''
    Convert a DFXP/TTML subtitle document to SRT.

    Each <p> element with a usable begin time and an end time (or a
    duration) becomes one SRT cue. Supported TTML styling is rendered
    as <b>/<i>/<u>/<font> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> element
    '''
    # Older namespace URIs are rewritten to the current ones (as raw bytes,
    # before parsing) so that a single set of XPath expressions is enough.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> {property: value}, filled in by the resolution loop below
    styles = {}
    # style applied to every paragraph, taken from <body>/<div>
    default_style = {}

    class TTMLPElementParser(object):
        # Parser target that turns one <p> subtree into SRT markup.
        _out = ''
        # NOTE(review): the two lists below are class attributes, so they
        # are shared by every parser instance created during one
        # dfxp2srt() call. An entry left on _applied_styles by one
        # paragraph is therefore still visible to the next one. Left
        # unchanged because the emitted markup depends on it.
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence, lowest to highest: default style, referenced
                # style, inline tts:* attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close whatever this element opened, innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialise the paragraph and feed it through the target above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A <style> may reference a parent that is
    # declared later in the document, so passes are repeated until every
    # referenced parent has been resolved.
    while True:
        unresolved = False
        resolved_before = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    unresolved = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # Finished when nothing is pending. Also stop when a whole pass
        # resolved no new style: the remaining parents (missing id,
        # reference cycle, or a parent without any supported property)
        # can never appear in `styles`, and looping again would never end.
        if not unresolved or len(styles) == resolved_before:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Paragraphs without usable timing are dropped
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2903
2904
def cli_option(params, command_option, param):
    """ Build the argv fragment for an option that takes a value.

    Returns [] when `param` is missing from `params` or set to None,
    otherwise [command_option, <value as text>].

    Every non-None value is converted to text. Converting only truthy
    values let 0 or False through unconverted, putting a non-string
    into the argument list.
    """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, compat_str(value)]
2910
2911
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the argv fragment for a boolean option.

    Returns [] when `param` is missing from `params` or set to None.
    Otherwise the option is followed by `true_value` or `false_value`,
    either as a separate argument or, when `separator` is given, joined
    into a single argument.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if not separator:
        return [command_option, rendered]
    return [command_option + separator + rendered]
2920
2921
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] when params[param] equals `expected_value`, else [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2925
2926
def cli_configuration_args(params, param, default=[]):
    """ Return the list of extra arguments stored under `param`.

    Falls back to `default` when the entry is missing or None. A list
    default is returned as a copy, so a caller that extends the result
    cannot modify the shared default list seen by later calls.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2933
2934
class ISO639Utils(object):
    """ Conversions between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    # Two-letter codes marked above as replaced in the 1989 revision. They
    # share their three-letter code with the current two-letter code.
    _replaced_codes = frozenset(('iw', 'in', 'ji'))

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so a longer tag
        # such as 'en-US' resolves through its 'en' prefix
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None for an unknown code. Where a replaced and a current
        two-letter code map to the same three-letter code, the current
        one is returned regardless of dict iteration order.
        """
        replaced_match = None
        for short_name, long_name in cls._lang_map.items():
            if long_name != code:
                continue
            if short_name not in cls._replaced_codes:
                return short_name
            replaced_match = short_name
        return replaced_match
3138
3139
class ISO3166Utils(object):
    """ Lookup of country names by two-letter ISO 3166-1 alpha-2 country code """

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter ISO 3166-1 alpha-2 country code to the corresponding full name

        The lookup is case-insensitive; None is returned for an unknown code.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
3398
3399
3400 class GeoUtils(object):
3401     # Major IPv4 address blocks per country
3402     _country_ip_map = {
3403         'AD': '85.94.160.0/19',
3404         'AE': '94.200.0.0/13',
3405         'AF': '149.54.0.0/17',
3406         'AG': '209.59.64.0/18',
3407         'AI': '204.14.248.0/21',
3408         'AL': '46.99.0.0/16',
3409         'AM': '46.70.0.0/15',
3410         'AO': '105.168.0.0/13',
3411         'AP': '159.117.192.0/21',
3412         'AR': '181.0.0.0/12',
3413         'AS': '202.70.112.0/20',
3414         'AT': '84.112.0.0/13',
3415         'AU': '1.128.0.0/11',
3416         'AW': '181.41.0.0/18',
3417         'AZ': '5.191.0.0/16',
3418         'BA': '31.176.128.0/17',
3419         'BB': '65.48.128.0/17',
3420         'BD': '114.130.0.0/16',
3421         'BE': '57.0.0.0/8',
3422         'BF': '129.45.128.0/17',
3423         'BG': '95.42.0.0/15',
3424         'BH': '37.131.0.0/17',
3425         'BI': '154.117.192.0/18',
3426         'BJ': '137.255.0.0/16',
3427         'BL': '192.131.134.0/24',
3428         'BM': '196.12.64.0/18',
3429         'BN': '156.31.0.0/16',
3430         'BO': '161.56.0.0/16',
3431         'BQ': '161.0.80.0/20',
3432         'BR': '152.240.0.0/12',
3433         'BS': '24.51.64.0/18',
3434         'BT': '119.2.96.0/19',
3435         'BW': '168.167.0.0/16',
3436         'BY': '178.120.0.0/13',
3437         'BZ': '179.42.192.0/18',
3438         'CA': '99.224.0.0/11',
3439         'CD': '41.243.0.0/16',
3440         'CF': '196.32.200.0/21',
3441         'CG': '197.214.128.0/17',
3442         'CH': '85.0.0.0/13',
3443         'CI': '154.232.0.0/14',
3444         'CK': '202.65.32.0/19',
3445         'CL': '152.172.0.0/14',
3446         'CM': '165.210.0.0/15',
3447         'CN': '36.128.0.0/10',
3448         'CO': '181.240.0.0/12',
3449         'CR': '201.192.0.0/12',
3450         'CU': '152.206.0.0/15',
3451         'CV': '165.90.96.0/19',
3452         'CW': '190.88.128.0/17',
3453         'CY': '46.198.0.0/15',
3454         'CZ': '88.100.0.0/14',
3455         'DE': '53.0.0.0/8',
3456         'DJ': '197.241.0.0/17',
3457         'DK': '87.48.0.0/12',
3458         'DM': '192.243.48.0/20',
3459         'DO': '152.166.0.0/15',
3460         'DZ': '41.96.0.0/12',
3461         'EC': '186.68.0.0/15',
3462         'EE': '90.190.0.0/15',
3463         'EG': '156.160.0.0/11',
3464         'ER': '196.200.96.0/20',
3465         'ES': '88.0.0.0/11',
3466         'ET': '196.188.0.0/14',
3467         'EU': '2.16.0.0/13',
3468         'FI': '91.152.0.0/13',
3469         'FJ': '144.120.0.0/16',
3470         'FM': '119.252.112.0/20',
3471         'FO': '88.85.32.0/19',
3472         'FR': '90.0.0.0/9',
3473         'GA': '41.158.0.0/15',
3474         'GB': '25.0.0.0/8',
3475         'GD': '74.122.88.0/21',
3476         'GE': '31.146.0.0/16',
3477         'GF': '161.22.64.0/18',
3478         'GG': '62.68.160.0/19',
3479         'GH': '45.208.0.0/14',
3480         'GI': '85.115.128.0/19',
3481         'GL': '88.83.0.0/19',
3482         'GM': '160.182.0.0/15',
3483         'GN': '197.149.192.0/18',
3484         'GP': '104.250.0.0/19',
3485         'GQ': '105.235.224.0/20',
3486         'GR': '94.64.0.0/13',
3487         'GT': '168.234.0.0/16',
3488         'GU': '168.123.0.0/16',
3489         'GW': '197.214.80.0/20',
3490         'GY': '181.41.64.0/18',
3491         'HK': '113.252.0.0/14',
3492         'HN': '181.210.0.0/16',
3493         'HR': '93.136.0.0/13',
3494         'HT': '148.102.128.0/17',
3495         'HU': '84.0.0.0/14',
3496         'ID': '39.192.0.0/10',
3497         'IE': '87.32.0.0/12',
3498         'IL': '79.176.0.0/13',
3499         'IM': '5.62.80.0/20',
3500         'IN': '117.192.0.0/10',
3501         'IO': '203.83.48.0/21',
3502         'IQ': '37.236.0.0/14',
3503         'IR': '2.176.0.0/12',
3504         'IS': '82.221.0.0/16',
3505         'IT': '79.0.0.0/10',
3506         'JE': '87.244.64.0/18',
3507         'JM': '72.27.0.0/17',
3508         'JO': '176.29.0.0/16',
3509         'JP': '126.0.0.0/8',
3510         'KE': '105.48.0.0/12',
3511         'KG': '158.181.128.0/17',
3512         'KH': '36.37.128.0/17',
3513         'KI': '103.25.140.0/22',
3514         'KM': '197.255.224.0/20',
3515         'KN': '198.32.32.0/19',
3516         'KP': '175.45.176.0/22',
3517         'KR': '175.192.0.0/10',
3518         'KW': '37.36.0.0/14',
3519         'KY': '64.96.0.0/15',
3520         'KZ': '2.72.0.0/13',
3521         'LA': '115.84.64.0/18',
3522         'LB': '178.135.0.0/16',
3523         'LC': '192.147.231.0/24',
3524         'LI': '82.117.0.0/19',
3525         'LK': '112.134.0.0/15',
3526         'LR': '41.86.0.0/19',
3527         'LS': '129.232.0.0/17',
3528         'LT': '78.56.0.0/13',
3529         'LU': '188.42.0.0/16',
3530         'LV': '46.109.0.0/16',
3531         'LY': '41.252.0.0/14',
3532         'MA': '105.128.0.0/11',
3533         'MC': '88.209.64.0/18',
3534         'MD': '37.246.0.0/16',
3535         'ME': '178.175.0.0/17',
3536         'MF': '74.112.232.0/21',
3537         'MG': '154.126.0.0/17',
3538         'MH': '117.103.88.0/21',
3539         'MK': '77.28.0.0/15',
3540         'ML': '154.118.128.0/18',
3541         'MM': '37.111.0.0/17',
3542         'MN': '49.0.128.0/17',
3543         'MO': '60.246.0.0/16',
3544         'MP': '202.88.64.0/20',
3545         'MQ': '109.203.224.0/19',
3546         'MR': '41.188.64.0/18',
3547         'MS': '208.90.112.0/22',
3548         'MT': '46.11.0.0/16',
3549         'MU': '105.16.0.0/12',
3550         'MV': '27.114.128.0/18',
3551         'MW': '105.234.0.0/16',
3552         'MX': '187.192.0.0/11',
3553         'MY': '175.136.0.0/13',
3554         'MZ': '197.218.0.0/15',
3555         'NA': '41.182.0.0/16',
3556         'NC': '101.101.0.0/18',
3557         'NE': '197.214.0.0/18',
3558         'NF': '203.17.240.0/22',
3559         'NG': '105.112.0.0/12',
3560         'NI': '186.76.0.0/15',
3561         'NL': '145.96.0.0/11',
3562         'NO': '84.208.0.0/13',
3563         'NP': '36.252.0.0/15',
3564         'NR': '203.98.224.0/19',
3565         'NU': '49.156.48.0/22',
3566         'NZ': '49.224.0.0/14',
3567         'OM': '5.36.0.0/15',
3568         'PA': '186.72.0.0/15',
3569         'PE': '186.160.0.0/14',
3570         'PF': '123.50.64.0/18',
3571         'PG': '124.240.192.0/19',
3572         'PH': '49.144.0.0/13',
3573         'PK': '39.32.0.0/11',
3574         'PL': '83.0.0.0/11',
3575         'PM': '70.36.0.0/20',
3576         'PR': '66.50.0.0/16',
3577         'PS': '188.161.0.0/16',
3578         'PT': '85.240.0.0/13',
3579         'PW': '202.124.224.0/20',
3580         'PY': '181.120.0.0/14',
3581         'QA': '37.210.0.0/15',
3582         'RE': '139.26.0.0/16',
3583         'RO': '79.112.0.0/13',
3584         'RS': '178.220.0.0/14',
3585         'RU': '5.136.0.0/13',
3586         'RW': '105.178.0.0/15',
3587         'SA': '188.48.0.0/13',
3588         'SB': '202.1.160.0/19',
3589         'SC': '154.192.0.0/11',
3590         'SD': '154.96.0.0/13',
3591         'SE': '78.64.0.0/12',
3592         'SG': '152.56.0.0/14',
3593         'SI': '188.196.0.0/14',
3594         'SK': '78.98.0.0/15',
3595         'SL': '197.215.0.0/17',
3596         'SM': '89.186.32.0/19',
3597         'SN': '41.82.0.0/15',
3598         'SO': '197.220.64.0/19',
3599         'SR': '186.179.128.0/17',
3600         'SS': '105.235.208.0/21',
3601         'ST': '197.159.160.0/19',
3602         'SV': '168.243.0.0/16',
3603         'SX': '190.102.0.0/20',
3604         'SY': '5.0.0.0/16',
3605         'SZ': '41.84.224.0/19',
3606         'TC': '65.255.48.0/20',
3607         'TD': '154.68.128.0/19',
3608         'TG': '196.168.0.0/14',
3609         'TH': '171.96.0.0/13',
3610         'TJ': '85.9.128.0/18',
3611         'TK': '27.96.24.0/21',
3612         'TL': '180.189.160.0/20',
3613         'TM': '95.85.96.0/19',
3614         'TN': '197.0.0.0/11',
3615         'TO': '175.176.144.0/21',
3616         'TR': '78.160.0.0/11',
3617         'TT': '186.44.0.0/15',
3618         'TV': '202.2.96.0/19',
3619         'TW': '120.96.0.0/11',
3620         'TZ': '156.156.0.0/14',
3621         'UA': '93.72.0.0/13',
3622         'UG': '154.224.0.0/13',
3623         'US': '3.0.0.0/8',
3624         'UY': '167.56.0.0/13',
3625         'UZ': '82.215.64.0/18',
3626         'VA': '212.77.0.0/19',
3627         'VC': '24.92.144.0/20',
3628         'VE': '186.88.0.0/13',
3629         'VG': '172.103.64.0/18',
3630         'VI': '146.226.0.0/16',
3631         'VN': '14.160.0.0/11',
3632         'VU': '202.80.32.0/20',
3633         'WF': '117.20.32.0/21',
3634         'WS': '202.4.32.0/19',
3635         'YE': '134.35.0.0/16',
3636         'YT': '41.242.116.0/22',
3637         'ZA': '41.0.0.0/11',
3638         'ZM': '165.56.0.0/13',
3639         'ZW': '41.85.192.0/19',
3640     }
3641
3642     @classmethod
3643     def random_ipv4(cls, code_or_block):
3644         if len(code_or_block) == 2:
3645             block = cls._country_ip_map.get(code_or_block.upper())
3646             if not block:
3647                 return None
3648         else:
3649             block = code_or_block
3650         addr, preflen = block.split('/')
3651         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3652         addr_max = addr_min | (0xffffffff >> int(preflen))
3653         return compat_str(socket.inet_ntoa(
3654             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
3655
3656
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden by a request header.

    A request carrying a 'Ytdl-request-proxy' header uses that value as its
    proxy instead of the one configured on the handler.
    """

    def __init__(self, proxies=None):
        # Install default http/https openers that pass the '__noproxy__'
        # marker; the base class constructor is invoked afterwards.
        for scheme in ('http', 'https'):
            opener = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, opener)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Pick the proxy for `req`; return None when no HTTP proxying applies."""
        header_proxy = req.headers.get('Ytdl-request-proxy')
        if header_proxy is not None:
            # The per-request header wins and is removed from the request
            proxy = header_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # youtube-dl's http/https handlers wrap the socket with socks;
            # only record the proxy on the request here
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3680
3681
3682 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3683 # released into Public Domain
3684 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3685
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Leading zero bytes are removed; values that are not positive produce a
    single zero byte.  If optional blocksize is given and greater than zero,
    the front of the byte string is padded with binary zeros so that the
    length is a multiple of blocksize.
    """
    n = int(n)
    # Collect 32-bit words, least significant first
    words = []
    while n > 0:
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    # Most significant word first, without leading zero bytes; an empty
    # result (n was not positive) is represented by one zero byte
    s = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    # add back pad bytes up to the next multiple of blocksize
    if blocksize > 0:
        remainder = len(s) % blocksize
        if remainder:
            s = b'\000' * (blocksize - remainder) + s
    return s
3714
3715
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad with zero bytes so the input splits into whole 32-bit words
    shortfall = -len(s) % 4
    if shortfall:
        s = b'\000' * shortfall + s
    acc = 0
    for offset in range(0, len(s), 4):
        word = compat_struct_unpack('>I', s[offset:offset + 4])[0]
        # The low 32 bits of the shifted accumulator are zero, so OR-ing the
        # next word in is the same as adding it
        acc = (acc << 32) | word
    return acc
3731
3732
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    The input bytes are reversed before being read as one big integer,
    which is then raised to `exponent` modulo `modulus`.

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
3748
3749
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is laid out as [0, 2] + padding + [0] + data, where the
    padding consists of non-zero pseudo-random octets.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves fewer than 11 octets of
                               overhead within the target length
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding octets must be non-zero (1..255): the first zero octet after
    # the [0, 2] prefix marks the end of the padding, so a zero inside the
    # padding would truncate it and corrupt the recovered message.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
3763
3764
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer `num` as a base-`n` string.

    `table` supplies the digit symbols, where table[i] is the symbol for
    digit value i.  When it is omitted or empty, the first `n` characters
    of 0-9, a-z, A-Z are used.

    Raises ValueError if `n` exceeds the table length, and, for a non-zero
    `num`, if `num` is negative or `n` is smaller than 2.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # A negative number never reaches zero under floor division and a base
    # of 1 never shrinks the number; reject both instead of looping forever
    # (a base of 0 would otherwise fail with a stray ZeroDivisionError).
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small' % n)

    # Digits are produced least significant first
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
3781
3782
def decode_packed_codes(code):
    """Expand packed code by substituting its symbols back in.

    PACKED_CODES_RE is searched in `code` and must yield four groups: the
    obfuscated code, the numeric base, the symbol count and the
    '|'-separated symbol list.  Every word of the obfuscated code is then
    replaced by the symbol stored under it; an empty symbol stands for the
    word itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Walk the indices from count - 1 down to 0, keyed by their base-n form
    symbol_table = {}
    for index in reversed(range(count)):
        word = encode_base_n(index, base)
        symbol_table[word] = symbols[index] or word

    def _lookup(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _lookup, obfuscated_code)
3799
3800
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=value attribute list into a dict.

    Values may be bare or double-quoted; the surrounding quotes of a quoted
    value are dropped.  When a key occurs more than once the last
    occurrence wins.
    """
    attributes = {}
    matches = re.finditer(
        r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    for match in matches:
        value = match.group('val')
        if value.startswith('"'):
            value = value[1:-1]
        attributes[match.group('key')] = value
    return attributes
3808
3809
def urshift(val, n):
    """Shift `val` right by `n` bits, adding 2**32 to negative values first."""
    if val < 0:
        val += 0x100000000
    return val >> n
3812
3813
3814 # Based on png2str() written by @gdkchan and improved by @yokrysty
3815 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG bytes into (width, height, pixels).

    `pixels` is a list with one entry per scanline; each entry is a flat
    list of width * 3 unfiltered byte values.  Every scanline is read as
    width * 3 bytes with a 3-byte filter distance, i.e. 3 bytes per pixel
    are assumed; colour type and bit depth are not inspected.

    Raises IOError if the PNG signature or the leading IHDR chunk is
    missing, or if no IDAT data is found.
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature = png_data[:8]
    body = png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or body[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        # Big-endian unsigned integer of 1, 2 or 4 bytes
        return compat_struct_unpack(formats_by_size[len(raw)], raw)[0]

    # Each chunk is: 4-byte length, 4-byte type, data, 4-byte CRC (skipped)
    chunks = []
    pos = 0
    end = len(body)
    while pos < end:
        size = read_uint(body[pos:pos + 4])
        chunk_type = body[pos + 4:pos + 8]
        payload = body[pos + 8:pos + 8 + size]
        chunks.append((chunk_type, payload))
        pos += 12 + size

    ihdr = chunks[0][1]
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    idat = b''.join(
        payload for chunk_type, payload in chunks if chunk_type == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    raw = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline is prefixed with one filter type byte
        row_start = y * (1 + stride)
        filter_type = raw[row_start]

        row = []
        pixels.append(row)
        above = pixels[y - 1] if y > 0 else None

        for x in range(stride):
            color = raw[row_start + 1 + x]

            # Neighbouring values default to 0 outside the image
            left = row[x - 3] if x > 2 else 0
            up = above[x] if above is not None else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = 0
                if x > 2 and above is not None:
                    upper_left = above[x - 3]

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                # Use the neighbour closest to the estimate; ties prefer
                # left, then up
                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
                color = (color + predictor) & 0xff

            row.append(color)

    return width, height, pixels
3919
3920
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file `path` to `value`.

    The `xattr` Python module is tried first (either the pyxattr or the
    xattr flavour).  If it cannot be imported, the value is written to an
    NTFS Alternate Data Stream on Windows, or handed to the `setfattr` or
    `xattr` command line tool elsewhere.  `value` is a byte string.

    Raises XAttrMetadataError if writing the attribute fails and
    XAttrUnavailableError if no usable implementation is found.
    """
    try:
        # try the pyxattr module...
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        has_setfattr = check_executable('setfattr', ['--version'])
        has_xattr = check_executable('xattr', ['-h'])

        if not (has_setfattr or has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        # The command line tools take the value as a text argument
        value = value.decode('utf-8')
        if has_setfattr:
            executable = 'setfattr'
            opts = ['-n', key, '-v', value]
        else:
            executable = 'xattr'
            opts = ['-w', key, value]

        cmd = [encodeFilename(executable, True)]
        cmd.extend([encodeArgument(o) for o in opts])
        cmd.append(encodeFilename(path, True))

        try:
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        stdout, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)
4003
4004
def random_birthday(year_field, month_field, day_field):
    """Return a random date between 1950-01-01 and 1995-12-31 as a dict.

    The three arguments name the keys under which the year, month and day
    are stored; each value is the number formatted with str().
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))

    result = {}
    result[year_field] = str(birthday.year)
    result[month_field] = str(birthday.month)
    result[day_field] = str(birthday.day)
    return result