Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import email.header
  15 import errno
  16 import functools
  17 import gzip
  18 import io
  19 import itertools
  20 import json
  21 import locale
  22 import math
  23 import operator
  24 import os
  25 import platform
  26 import random
  27 import re
  28 import socket
  29 import ssl
  30 import subprocess
  31 import sys
  32 import tempfile
  33 import traceback
  34 import xml.etree.ElementTree
  35 import zlib
  36
  37 from .compat import (
  38     compat_HTMLParseError,
  39     compat_HTMLParser,
  40     compat_basestring,
  41     compat_chr,
  42     compat_ctypes_WINFUNCTYPE,
  43     compat_etree_fromstring,
  44     compat_expanduser,
  45     compat_html_entities,
  46     compat_html_entities_html5,
  47     compat_http_client,
  48     compat_kwargs,
  49     compat_os_name,
  50     compat_parse_qs,
  51     compat_shlex_quote,
  52     compat_socket_create_connection,
  53     compat_str,
  54     compat_struct_pack,
  55     compat_struct_unpack,
  56     compat_urllib_error,
  57     compat_urllib_parse,
  58     compat_urllib_parse_urlencode,
  59     compat_urllib_parse_urlparse,
  60     compat_urllib_parse_unquote_plus,
  61     compat_urllib_request,
  62     compat_urlparse,
  63     compat_xpath,
  64 )
  65
  66 from .socks import (
  67     ProxyType,
  68     sockssocket,
  69 )
  70
  71
  72 def register_socks_protocols():
  73     # "Register" SOCKS protocols
  74     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  75     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  76     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  77         if scheme not in compat_urlparse.uses_netloc:
  78             compat_urlparse.uses_netloc.append(scheme)
  79
  80
# This is not clearly defined otherwise
# (the type is taken from an actual compiled pattern object)
compiled_regex_type = type(re.compile(''))

# HTTP request header names mapped to their values
# NOTE(review): presumably the "standard headers" that YoutubeDLHandler's
# docstring says are added to every request - confirm against the handler
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel used as the default of `default=` parameters, so that an
# explicitly passed None can be told apart from "no default supplied"
NO_DEFAULT = object()
  99
# Full English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code, each list ordered January..December
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# Media file extensions (without the leading dot)
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The two sequences are
# paired position by position; multi-letter replacements ('AE', 'OE', 'ss',
# 'ae', 'oe') are wrapped in one-element lists so that itertools.chain yields
# them as single items instead of splitting them into characters.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 130
 131 DATE_FORMATS = (
 132     '%d %B %Y',
 133     '%d %b %Y',
 134     '%B %d %Y',
 135     '%B %dst %Y',
 136     '%B %dnd %Y',
 137     '%B %dth %Y',
 138     '%b %d %Y',
 139     '%b %dst %Y',
 140     '%b %dnd %Y',
 141     '%b %dth %Y',
 142     '%b %dst %Y %I:%M',
 143     '%b %dnd %Y %I:%M',
 144     '%b %dth %Y %I:%M',
 145     '%Y %m %d',
 146     '%Y-%m-%d',
 147     '%Y/%m/%d',
 148     '%Y/%m/%d %H:%M',
 149     '%Y/%m/%d %H:%M:%S',
 150     '%Y-%m-%d %H:%M',
 151     '%Y-%m-%d %H:%M:%S',
 152     '%Y-%m-%d %H:%M:%S.%f',
 153     '%d.%m.%Y %H:%M',
 154     '%d.%m.%Y %H.%M',
 155     '%Y-%m-%dT%H:%M:%SZ',
 156     '%Y-%m-%dT%H:%M:%S.%fZ',
 157     '%Y-%m-%dT%H:%M:%S.%f0Z',
 158     '%Y-%m-%dT%H:%M:%S',
 159     '%Y-%m-%dT%H:%M:%S.%f',
 160     '%Y-%m-%dT%H:%M',
 161     '%b %d %Y at %H:%M',
 162     '%b %d %Y at %H:%M:%S',
 163     '%B %d %Y at %H:%M',
 164     '%B %d %Y at %H:%M:%S',
 165 )
 166
# The common formats plus numeric notations that put the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# The common formats plus numeric notations that put the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Captures four groups: a quoted payload, two integers and a quoted string
# that is followed by .split('|')
# NOTE(review): looks like the argument list of a packed JavaScript
# function - confirm against the code that consumes this pattern
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 187
 188
 189 def preferredencoding():
 190     """Get preferred encoding.
 191
 192     Returns the best encoding scheme for the system, based on
 193     locale.getpreferredencoding() and some further tweaks.
 194     """
 195     try:
 196         pref = locale.getpreferredencoding()
 197         'TEST'.encode(pref)
 198     except Exception:
 199         pref = 'UTF-8'
 200
 201     return pref
 202
 203
 204 def write_json_file(obj, fn):
 205     """ Encode obj as JSON and write it to fn, atomically if possible """
 206
 207     fn = encodeFilename(fn)
 208     if sys.version_info < (3, 0) and sys.platform != 'win32':
 209         encoding = get_filesystem_encoding()
 210         # os.path.basename returns a bytes object, but NamedTemporaryFile
 211         # will fail if the filename contains non ascii characters unless we
 212         # use a unicode object
 213         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 214         # the same for os.path.dirname
 215         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 216     else:
 217         path_basename = os.path.basename
 218         path_dirname = os.path.dirname
 219
 220     args = {
 221         'suffix': '.tmp',
 222         'prefix': path_basename(fn) + '.',
 223         'dir': path_dirname(fn),
 224         'delete': False,
 225     }
 226
 227     # In Python 2.x, json.dump expects a bytestream.
 228     # In Python 3.x, it writes to a character stream
 229     if sys.version_info < (3, 0):
 230         args['mode'] = 'wb'
 231     else:
 232         args.update({
 233             'mode': 'w',
 234             'encoding': 'utf-8',
 235         })
 236
 237     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 238
 239     try:
 240         with tf:
 241             json.dump(obj, tf)
 242         if sys.platform == 'win32':
 243             # Need to remove existing file on Windows, else os.rename raises
 244             # WindowsError or FileExistsError.
 245             try:
 246                 os.unlink(fn)
 247             except OSError:
 248                 pass
 249         os.rename(tf.name, fn)
 250     except Exception:
 251         try:
 252             os.remove(tf.name)
 253         except OSError:
 254             pass
 255         raise
 256
 257
 258 if sys.version_info >= (2, 7):
 259     def find_xpath_attr(node, xpath, key, val=None):
 260         """ Find the xpath xpath[@key=val] """
 261         assert re.match(r'^[a-zA-Z_-]+$', key)
 262         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 263         return node.find(expr)
 264 else:
 265     def find_xpath_attr(node, xpath, key, val=None):
 266         for f in node.findall(compat_xpath(xpath)):
 267             if key not in f.attrib:
 268                 continue
 269             if val is None or f.attrib.get(key) == val:
 270                 return f
 271         return None
 272
 273 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 274 # the namespace parameter
 275
 276
 277 def xpath_with_ns(path, ns_map):
 278     components = [c.split(':') for c in path.split('/')]
 279     replaced = []
 280     for c in components:
 281         if len(c) == 1:
 282             replaced.append(c[0])
 283         else:
 284             ns, tag = c
 285             replaced.append('{%s}%s' % (ns_map[ns], tag))
 286     return '/'.join(replaced)
 287
 288
 289 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 290     def _find_xpath(xpath):
 291         return node.find(compat_xpath(xpath))
 292
 293     if isinstance(xpath, (str, compat_str)):
 294         n = _find_xpath(xpath)
 295     else:
 296         for xp in xpath:
 297             n = _find_xpath(xp)
 298             if n is not None:
 299                 break
 300
 301     if n is None:
 302         if default is not NO_DEFAULT:
 303             return default
 304         elif fatal:
 305             name = xpath if name is None else name
 306             raise ExtractorError('Could not find XML element %s' % name)
 307         else:
 308             return None
 309     return n
 310
 311
 312 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 313     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 314     if n is None or n == default:
 315         return n
 316     if n.text is None:
 317         if default is not NO_DEFAULT:
 318             return default
 319         elif fatal:
 320             name = xpath if name is None else name
 321             raise ExtractorError('Could not find XML element\'s text %s' % name)
 322         else:
 323             return None
 324     return n.text
 325
 326
 327 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 328     n = find_xpath_attr(node, xpath, key)
 329     if n is None:
 330         if default is not NO_DEFAULT:
 331             return default
 332         elif fatal:
 333             name = '%s[@%s]' % (xpath, key) if name is None else name
 334             raise ExtractorError('Could not find XML attribute %s' % name)
 335         else:
 336             return None
 337     return n.attrib[key]
 338
 339
 340 def get_element_by_id(id, html):
 341     """Return the content of the tag with the specified ID in the passed HTML document"""
 342     return get_element_by_attribute('id', id, html)
 343
 344
 345 def get_element_by_class(class_name, html):
 346     """Return the content of the first tag with the specified class in the passed HTML document"""
 347     retval = get_elements_by_class(class_name, html)
 348     return retval[0] if retval else None
 349
 350
 351 def get_element_by_attribute(attribute, value, html, escape_value=True):
 352     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 353     return retval[0] if retval else None
 354
 355
 356 def get_elements_by_class(class_name, html):
 357     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 358     return get_elements_by_attribute(
 359         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 360         html, escape_value=False)
 361
 362
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list

    @param attribute     Attribute name; always regex-escaped
    @param value         Attribute value to look for
    @param html          The HTML document to search
    @param escape_value  If true, value is matched literally; otherwise it is
                         inserted into the pattern as a regular expression
    @return              List of the HTML-unescaped contents of the matching
                         tags (empty if nothing matches)
    """

    value = re.escape(value) if escape_value else value

    retlist = []
    # The pattern matches an opening tag having the attribute among any
    # number of other attributes, then lazily captures everything up to the
    # first closing tag with the same name (\1)
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        # Content that starts with a quote loses its first and last character
        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist
 386
 387
 388 class HTMLAttributeParser(compat_HTMLParser):
 389     """Trivial HTML parser to gather the attributes for a single element"""
 390     def __init__(self):
 391         self.attrs = {}
 392         compat_HTMLParser.__init__(self)
 393
 394     def handle_starttag(self, tag, attrs):
 395         self.attrs = dict(attrs)
 396
 397
 398 def extract_attributes(html_element):
 399     """Given a string for an HTML element such as
 400     <el
 401          a="foo" B="bar" c="&98;az" d=boz
 402          empty= noval entity="&amp;"
 403          sq='"' dq="'"
 404     >
 405     Decode and return a dictionary of attributes.
 406     {
 407         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 408         'empty': '', 'noval': None, 'entity': '&',
 409         'sq': '"', 'dq': '\''
 410     }.
 411     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 412     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 413     """
 414     parser = HTMLAttributeParser()
 415     try:
 416         parser.feed(html_element)
 417         parser.close()
 418     # Older Python may throw HTMLParseError in case of malformed HTML
 419     except compat_HTMLParseError:
 420         pass
 421     return parser.attrs
 422
 423
 424 def clean_html(html):
 425     """Clean an HTML snippet into a readable string"""
 426
 427     if html is None:  # Convenience for sanitizing descriptions etc.
 428         return html
 429
 430     # Newline vs <br />
 431     html = html.replace('\n', ' ')
 432     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 433     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 434     # Strip html tags
 435     html = re.sub('<.*?>', '', html)
 436     # Replace html entities
 437     html = unescapeHTML(html)
 438     return html.strip()
 439
 440
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename '-' selects standard output instead of a file.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode on Windows
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # Use the underlying binary buffer when the stream exposes one
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming, so give up at once
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # Sanitizing changed nothing, so retrying would fail the same way
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
 471
 472
 473 def timeconvert(timestr):
 474     """Convert RFC 2822 defined time string into system timestamp"""
 475     timestamp = None
 476     timetuple = email.utils.parsedate_tz(timestr)
 477     if timetuple is not None:
 478         timestamp = email.utils.mktime_tz(timetuple)
 479     return timestamp
 480
 481
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        # Map one character to its replacement; the order of the checks matters
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        # Question marks and ASCII control characters are dropped
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        # Restricted mode: any remaining non-ASCII character becomes '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    # (digit groups joined by colons, e.g. 12:34:56, get '_' instead of ':')
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Cosmetic clean-up, skipped for IDs so they stay as close to the
        # original as possible
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        # Replace a leading dash with an underscore
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        # Remove leading dots
        result = result.lstrip('.')
        # Never return an empty name
        if not result:
            result = '_'
    return result
 521
 522
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform s is returned unchanged.
    """
    if sys.platform != 'win32':
        return s
    # Split off the drive (or UNC prefix) so it is not sanitized itself
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the first split component when a drive/UNC prefix was removed
        # NOTE(review): presumably the empty string left by the leading
        # separator - confirm
        norm_path.pop(0)
    # In every component except '.' and '..', replace each of the characters
    # / < > : " | \ ? * and a trailing whitespace character or dot with '#'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
 539
 540
 541 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 542 # unwanted failures due to missing protocol
 543 def sanitize_url(url):
 544     return 'http:%s' % url if url.startswith('//') else url
 545
 546
 547 def sanitized_Request(url, *args, **kwargs):
 548     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 549
 550
 551 def expand_path(s):
 552     """Expand shell variables and ~"""
 553     return os.path.expandvars(compat_expanduser(s))
 554
 555
 556 def orderedSet(iterable):
 557     """ Remove all duplicates from the input iterable """
 558     res = []
 559     for el in iterable:
 560         if el not in res:
 561             res.append(el)
 562     return res
 563
 564
 565 def _htmlentity_transform(entity_with_semicolon):
 566     """Transforms an HTML entity to a character."""
 567     entity = entity_with_semicolon[:-1]
 568
 569     # Known non-numeric HTML entity
 570     if entity in compat_html_entities.name2codepoint:
 571         return compat_chr(compat_html_entities.name2codepoint[entity])
 572
 573     # TODO: HTML5 allows entities without a semicolon. For example,
 574     # '&Eacuteric' should be decoded as 'Éric'.
 575     if entity_with_semicolon in compat_html_entities_html5:
 576         return compat_html_entities_html5[entity_with_semicolon]
 577
 578     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 579     if mobj is not None:
 580         numstr = mobj.group(1)
 581         if numstr.startswith('x'):
 582             base = 16
 583             numstr = '0%s' % numstr
 584         else:
 585             base = 10
 586         # See https://github.com/rg3/youtube-dl/issues/7518
 587         try:
 588             return compat_chr(int(numstr, base))
 589         except ValueError:
 590             pass
 591
 592     # Unknown entity in name, return its literal representation
 593     return '&%s;' % entity
 594
 595
 596 def unescapeHTML(s):
 597     if s is None:
 598         return None
 599     assert type(s) == compat_str
 600
 601     return re.sub(
 602         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 603
 604
 605 def get_subprocess_encoding():
 606     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 607         # For subprocess calls, encode with locale encoding
 608         # Refer to http://stackoverflow.com/a/9951851/35070
 609         encoding = preferredencoding()
 610     else:
 611         encoding = sys.getfilesystemencoding()
 612     if encoding is None:
 613         encoding = 'utf-8'
 614     return encoding
 615
 616
 617 def encodeFilename(s, for_subprocess=False):
 618     """
 619     @param s The name of the file
 620     """
 621
 622     assert type(s) == compat_str
 623
 624     # Python 3 has a Unicode API
 625     if sys.version_info >= (3, 0):
 626         return s
 627
 628     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 629     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 630     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 631     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 632         return s
 633
 634     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 635     if sys.platform.startswith('java'):
 636         return s
 637
 638     return s.encode(get_subprocess_encoding(), 'ignore')
 639
 640
 641 def decodeFilename(b, for_subprocess=False):
 642
 643     if sys.version_info >= (3, 0):
 644         return b
 645
 646     if not isinstance(b, bytes):
 647         return b
 648
 649     return b.decode(get_subprocess_encoding(), 'ignore')
 650
 651
 652 def encodeArgument(s):
 653     if not isinstance(s, compat_str):
 654         # Legacy code that uses byte strings
 655         # Uncomment the following line after fixing all post processors
 656         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 657         s = s.decode('ascii')
 658     return encodeFilename(s, True)
 659
 660
 661 def decodeArgument(b):
 662     return decodeFilename(b, True)
 663
 664
 665 def decodeOption(optval):
 666     if optval is None:
 667         return optval
 668     if isinstance(optval, bytes):
 669         optval = optval.decode(preferredencoding())
 670
 671     assert isinstance(optval, compat_str)
 672     return optval
 673
 674
 675 def formatSeconds(secs):
 676     if secs > 3600:
 677         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 678     elif secs > 60:
 679         return '%d:%02d' % (secs // 60, secs % 60)
 680     else:
 681         return '%d' % secs
 682
 683
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version

    @param params  Options dict; 'nocheckcertificate' (default False) turns
                   certificate verification off
    @param kwargs  Passed through to YoutubeDLHTTPSHandler
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname is switched off before verify_mode is relaxed
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No SSL context is passed to the handler on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        # NOTE(review): this path pins TLSv1 and never sets check_hostname -
        # confirm that this is still acceptable for the supported versions
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 707
 708
 709 def bug_reports_message():
 710     if ytdl_is_updateable():
 711         update_cmd = 'type  youtube-dl -U  to update'
 712     else:
 713         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 714     msg = '; please report this issue on https://yt-dl.org/bug .'
 715     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 716     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 717     return msg
 718
 719
 720 class YoutubeDLError(Exception):
 721     """Base exception for YoutubeDL errors."""
 722     pass
 723
 724
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message as ' (caused by %r)'.
        video_id, if given, is prepended to the message.
        """

        # Errors raised while a network error, a timeout or an unavailable
        # video error is being handled always count as expected
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        # Unexpected errors additionally ask the user to file a bug report
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none"""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
 752
 753
 754 class UnsupportedError(ExtractorError):
 755     def __init__(self, url):
 756         super(UnsupportedError, self).__init__(
 757             'Unsupported URL: %s' % url, expected=True)
 758         self.url = url
 759
 760
 761 class RegexNotFoundError(ExtractorError):
 762     """Error when a regex didn't match"""
 763     pass
 764
 765
 766 class GeoRestrictedError(ExtractorError):
 767     """Geographic restriction Error exception.
 768
 769     This exception may be thrown when a video is not available from your
 770     geographic location due to geographic restrictions imposed by a website.
 771     """
 772     def __init__(self, msg, countries=None):
 773         super(GeoRestrictedError, self).__init__(msg, expected=True)
 774         self.msg = msg
 775         self.countries = countries
 776
 777
 778 class DownloadError(YoutubeDLError):
 779     """Download Error exception.
 780
 781     This exception may be thrown by FileDownloader objects if they are not
 782     configured to continue on errors. They will contain the appropriate
 783     error message.
 784     """
 785
 786     def __init__(self, msg, exc_info=None):
 787         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 788         super(DownloadError, self).__init__(msg)
 789         self.exc_info = exc_info
 790
 791
 792 class SameFileError(YoutubeDLError):
 793     """Same File exception.
 794
 795     This exception will be thrown by FileDownloader objects if they detect
 796     multiple files would have to be downloaded to the same file on disk.
 797     """
 798     pass
 799
 800
 801 class PostProcessingError(YoutubeDLError):
 802     """Post Processing exception.
 803
 804     This exception may be raised by PostProcessor's .run() method to
 805     indicate an error in the postprocessing task.
 806     """
 807
 808     def __init__(self, msg):
 809         super(PostProcessingError, self).__init__(msg)
 810         self.msg = msg
 811
 812
 813 class MaxDownloadsReached(YoutubeDLError):
 814     """ --max-downloads limit has been reached. """
 815     pass
 816
 817
 818 class UnavailableVideoError(YoutubeDLError):
 819     """Unavailable Format exception.
 820
 821     This exception will be thrown when a video is requested
 822     in a format that is not available for that video.
 823     """
 824     pass
 825
 826
 827 class ContentTooShortError(YoutubeDLError):
 828     """Content Too Short exception.
 829
 830     This exception may be raised by FileDownloader objects when a file they
 831     download is too small for what the server announced first, indicating
 832     the connection was probably interrupted.
 833     """
 834
 835     def __init__(self, downloaded, expected):
 836         super(ContentTooShortError, self).__init__(
 837             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
 838         )
 839         # Both in bytes
 840         self.downloaded = downloaded
 841         self.expected = expected
 842
 843
 844 class XAttrMetadataError(YoutubeDLError):
 845     def __init__(self, code=None, msg='Unknown error'):
 846         super(XAttrMetadataError, self).__init__(msg)
 847         self.code = code
 848         self.msg = msg
 849
 850         # Parsing code and msg
 851         if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 852                 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 853             self.reason = 'NO_SPACE'
 854         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 855             self.reason = 'VALUE_TOO_LONG'
 856         else:
 857             self.reason = 'NOT_SUPPORTED'
 858
 859
 860 class XAttrUnavailableError(YoutubeDLError):
 861     pass
 862
 863
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the configured source address

    @param ydl_handler  Handler whose _params dict may hold 'source_address'
    @param http_class   The connection class to instantiate
    @param is_https     Whether the connection is to be wrapped in TLS
                        (only used by the Python 2.6 fallback)
    @param args, kwargs Passed through to http_class
    @return             The connection object
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 in the (address, port) pair
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace connect() by a version
            # that creates the socket bound to sa itself
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
 889
 890
 891 def handle_youtubedl_headers(headers):
 892     filtered_headers = headers
 893
 894     if 'Youtubedl-no-compression' in filtered_headers:
 895         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 896         del filtered_headers['Youtubedl-no-compression']
 897
 898     return filtered_headers
 899
 900
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params on the handler; other arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, using a SOCKS connection class when requested.

        The internal 'Ytdl-socks-proxy' request header selects the proxy
        and is removed from the request before it is opened.
        """
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, else as a zlib stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Escape the URL, add missing standard headers and return req."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Decode gzip/deflate bodies and percent-encode redirect targets.

        Returns resp itself, or a new addinfourl response wrapping the
        decompressed body with the original headers, URL, code and msg.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts decodes, re-raise the first error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing as HTTP
    https_request = http_request
    https_response = http_response
1022
1023
def make_socks_conn_class(base_class, socks_proxy):
    """Build a subclass of base_class that connects through a SOCKS proxy.

    base_class must be (a subclass of) HTTPConnection or HTTPSConnection.
    socks_proxy is the proxy URL; its scheme selects the protocol
    ('socks5', 'socks4a', or 'socks'/'socks4'), the port defaults to 1080
    and any username/password are URL-unquoted before use.

    Raises ValueError for an unsupported scheme. Previously such a URL
    surfaced as an UnboundLocalError because socks_type was never assigned.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %s' % url_components.scheme)

    def unquote_if_non_empty(s):
        # Leave None/'' untouched so missing credentials stay missing
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only plain numeric timeouts are forwarded to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1065
1066
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that honours source address and SOCKS proxy settings."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Store params and the connection class (HTTPSConnection if unset)."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req over HTTPS, through a SOCKS proxy when requested.

        The internal 'Ytdl-socks-proxy' request header selects the proxy
        and is removed from the request before it is opened.
        """
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward whichever SSL settings this handler actually carries
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
1090
1091
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles HTTPS requests and responses.

    As written, every method simply delegates to HTTPCookieProcessor; the
    Set-Cookie escaping workaround in http_response is commented out.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response unchanged."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Use the same cookie handling for HTTPS as for HTTP
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1114
1115
def extract_timezone(date_str):
    """Split a trailing 'Z' or numeric UTC offset off date_str.

    Returns (offset, remaining_string) where offset is a timedelta. When no
    timezone suffix is recognised the offset is zero and the string is
    returned unchanged; a 'Z' suffix is stripped and also yields zero.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    if not tz_match.group('sign'):
        # Plain 'Z' suffix: UTC
        return datetime.timedelta(), remainder

    direction = 1 if tz_match.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1132
1133
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter separates the date part from the time part. timezone is a
    timedelta offset; when it is None the offset is extracted from the
    string itself. Returns None for None input or an unparsable string.
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped before parsing
    cleaned = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_moment = datetime.datetime.strptime(cleaned, layout) - timezone
        return calendar.timegm(utc_moment.timetuple())
    except ValueError:
        return None
1151
1152
def date_formats(day_first=True):
    """Return the day-first format list, or the month-first one if not day_first."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1155
1156
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas become spaces, then AM/PM (+ timezone name) and any numeric
    # timezone suffix are dropped
    cleaned = date_str.replace(',', ' ')
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style parsing
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1183
1184
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a loosely formatted date string.

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    text = re.sub(r'[,|]', '', date_str)

    # NOTE(review): any 'PM' in the string adds 12 hours, so a '12:xx PM'
    # time presumably lands on the following day - confirm intended.
    pm_delta = 12 if re.search(r'(?i)PM', text) else 0
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_tail = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', text)
    if tz_tail:
        text = text[:-len(tz_tail.group('tz'))]

    pm_shift = datetime.timedelta(hours=pm_delta)
    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - timezone + pm_shift
            return calendar.timegm(moment.timetuple())
        except ValueError:
            pass

    # Fall back to RFC 2822 style parsing
    fallback = email.utils.parsedate_tz(text)
    if fallback:
        return calendar.timegm(fallback) + pm_delta * 3600
    return None
1211
1212
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension of url, falling back to default_ext.

    The candidate is the text after the last '.' of the part before '?'.
    """
    if url is None:
        return default_ext

    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1224
1225
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
1228
1229
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'yesterday' is accepted as well. Months count as 30 days and years as
    365 days. Raises ValueError for anything else that is not YYYYMMDD.
    """
    today = datetime.date.today()
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    if date_str in ('now', 'today'):
        return today

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount

    unit = relative.group('unit')
    # A bad approximation?
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'

    return today + datetime.timedelta(**{unit + 's': amount})
1257
1258
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1267
1268
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing start means the earliest representable date and a
        missing end the latest one. Raises ValueError if start is after
        end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1298
1299
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1308
1309
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1/2 to the values passed to GetStdHandle
    # (-11 and -12 are STD_OUTPUT_HANDLE and STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, non-character files, or handles for
        # which GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before the next
        # non-BMP character; a leading non-BMP character is written alone
        # with a length of 2.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1383
1384
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    s must be a compat_str. On win32, when no encoding is given, the
    Windows-specific writer is tried first. Otherwise the text is encoded
    for byte-oriented streams (ignoring unencodable characters) or written
    as-is.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        payload = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(payload)
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        payload = s.encode(target_encoding, 'ignore')
        out.buffer.write(payload)
    else:
        out.write(s)
    out.flush()
1405
1406
def bytes_to_intlist(bs):
    """Return the items of bs as a list of integer code values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are characters rather than ints: convert each with ord()
        return [ord(item) for item in bs]
    return list(bs)  # Python 3
1414
1415
def intlist_to_bytes(xs):
    """Pack a list of byte values into a byte string (b'' when empty)."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1420
1421
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for this platform:
# LockFileEx/UnlockFileEx on win32, fcntl.flock elsewhere, and stubs that
# raise IOError when fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high parts of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1495
1496
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened on construction. Entering takes a shared lock for
    mode 'r' and an exclusive lock otherwise; leaving unlocks and closes
    the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1526
1527
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1531
1532
def shell_quote(args):
    """Return args as one space-separated, shell-quoted string."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join([compat_shlex_quote(as_text(arg)) for arg in args])
1542
1543
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into a '#__youtubedl_smuggle=...' fragment. Data
    already smuggled in url is merged into data (in place) and takes
    precedence over the new values.
    """
    plain_url, previous = unsmuggle_url(url, {})
    data.update(previous)
    payload = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return plain_url + '#' + payload
1552
1553
def unsmuggle_url(smug_url, default=None):
    """Return (url, data) recovered from a URL built by smuggle_url.

    URLs without the smuggle marker are returned unchanged together with
    default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _sep, encoded = smug_url.rpartition('#')
        payload = compat_parse_qs(encoded)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(payload)
    return smug_url, default
1561
1562
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    bytes may be None (returns 'N/A'), a number, or a numeric string.
    The unit is chosen from the magnitude and clamped to the available
    suffixes, so fractions below one byte are shown in 'B', negative
    sizes keep their sign and oversized values are expressed in 'YiB'
    instead of raising.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    magnitude = abs(bytes)
    if magnitude < 1:
        # Also covers 0, for which the logarithm is undefined
        exponent = 0
    else:
        exponent = min(int(math.log(magnitude, 1024.0)), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1575
1576
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' at the start of s using unit_table.

    unit_table maps unit strings to multipliers. The number may use ','
    or '.' as the decimal separator. Returns the product truncated to
    int, or None when s does not start with a number and a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1586
1587
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns None when s is None; otherwise returns whatever
    lookup_unit_table yields for the unit table below.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # Note the table is case-sensitive: e.g. 'KB' maps to 1000 but 'kB'
    # maps to 1024.
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1657
1658
def parse_count(s):
    """Parse a count such as '1,234' or '1.5k' into an int.

    Returns None when s is None or when it is neither purely numeric nor
    a number followed by a known multiplier suffix.
    """
    if s is None:
        return None

    text = s.strip()

    # Digits with separators only: no unit to look up
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(multipliers, text)
1678
1679
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) name

    Names are looked up in MONTH_NAMES[lang], falling back to the 'en'
    list for unknown languages. Returns None if the name is not found.
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
1689
1690
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, i.e. the first three letters of the month name.
        Returns None if the abbreviation is not found. """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
1699
1700
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    short numeric character reference are left alone.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1707
1708
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl().

    title must be a compat_str. The function returns silently on Jython,
    when 'libc.so.6' cannot be loaded, or when the loaded libc exposes no
    prctl symbol.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # item, so the string handed to prctl() is always NUL-terminated. Sizing
    # the buffer to exactly len(title_bytes) left no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1733
1734
def remove_start(s, start):
    """Return s without the prefix start; s is returned as-is otherwise."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1737
1738
def remove_end(s, end):
    """Return s without the suffix end; s is returned as-is otherwise.

    None is passed through. An empty end leaves s untouched: slicing with
    s[:-0] would otherwise return an empty string.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1741
1742
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around s.

    None, strings shorter than two characters and strings that are not
    wrapped in a matching pair are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'", ):
        return s[1:-1]
    return s
1750
1751
def url_basename(url):
    """ Return the last segment of the URL's path, ignoring leading and
    trailing slashes """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1755
1756
def base_url(url):
    """ Return the http(s) URL up to and including the last '/' that
    precedes any '?', '#' or '&'.

    Raises AttributeError when url does not match that shape. """
    mobj = re.match(r'https?://[^?#&]+/', url)
    return mobj.group()
1759
1760
def urljoin(base, path):
    """ Join path onto base.

    Bytes arguments are decoded as UTF-8. Returns None when path is empty
    or not a string, or when base is not an http(s) or protocol-relative
    URL; a path that already is such a URL is returned as is. """
    absolute_url_re = r'^(?:https?:)?//'

    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(absolute_url_re, path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if isinstance(base, compat_str) and re.match(absolute_url_re, base):
        return compat_urlparse.urljoin(base, path)
    return None
1774
1775
class HEADRequest(compat_urllib_request.Request):
    """ Request whose get_method() always reports 'HEAD' """

    def get_method(self):
        return 'HEAD'
1779
1780
class PUTRequest(compat_urllib_request.Request):
    """ Request whose get_method() always reports 'PUT' """

    def get_method(self):
        return 'PUT'
1784
1785
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to int, returning default when that is not possible.

    v: value to convert; None and '' yield default
    scale: the converted value is floor-divided by this
    default: returned when v is missing or cannot be converted
    get_attr: if set, the attribute of that name is read from v first
        (a missing attribute counts as None)
    invscale: the converted value is multiplied by this before scaling
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: objects int() does not accept (lists, dicts, ...)
        # OverflowError: float infinity
        return default
1798
1799
def str_or_none(v, default=None):
    """ Return v converted with compat_str, or default when v is None """
    if v is None:
        return default
    return compat_str(v)
1802
1803
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and plus signs before converting
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1810
1811
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to float, returning default when that is not possible.

    The converted value is multiplied by invscale and divided by scale.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: objects float() does not accept (lists, dicts, ...)
        return default
1819
1820
def bool_or_none(v, default=None):
    """ Return v if it is a bool, default otherwise """
    if isinstance(v, bool):
        return v
    return default
1823
1824
def strip_or_none(v):
    """ Return v.strip(), or None when v is None """
    if v is None:
        return None
    return v.strip()
1827
1828
def parse_duration(s):
    """ Parse a duration string and return its length in seconds.

    Three notations are tried in turn: colon-separated
    ([[[days:]hours:]mins:]secs[.fraction]), unit-suffixed (for example
    '1h 2m 3.5s' or the ISO 8601-like 'PT1H2M3S') and a single decimal
    amount of hours or minutes (for example '1.5 hours').
    Returns None for non-string input or when nothing matches. """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    mobj = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if mobj is None:
        # Years, months and weeks are accepted by the pattern but are not
        # captured, so they do not contribute to the result
        mobj = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)

    if mobj is not None:
        # Both patterns above define their groups in this order
        days, hours, mins, secs, ms = mobj.groups()
    else:
        mobj = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
        if mobj is None:
            return None
        days, secs, ms = None, None, None
        hours, mins = mobj.groups()

    # 'ms' holds the fractional part of the seconds including its leading
    # dot (e.g. '.5'), so it is added as seconds, not milliseconds
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
1885
1886
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext in front of filename's existing extension.

    If expected_real_ext is given and does not equal the existing
    extension, ext is appended to the whole filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1893
1894
def replace_extension(filename, ext, expected_real_ext=None):
    """ Replace filename's extension with ext.

    If expected_real_ext is given and does not equal the existing
    extension, ext is appended to the whole filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1900
1901
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    cmd = [exe] + args
    try:
        # The output is captured only to keep it off the terminal
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1910
1911
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    # The captured output may be a byte string; undecodable bytes are dropped
    if isinstance(out, bytes):
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1929
1930
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extract a version from a program's output.

    Returns the first group of the first version_re match in output
    (a generic 'version X' pattern by default), or unrecognized when
    nothing matches. """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    return mobj.group(1) if mobj else unrecognized
1940
1941
class PagedList(object):
    """ Base for paged lists; relies on a getslice() method that this
    class itself does not define """

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1946
1947
class OnDemandPagedList(PagedList):
    """ Paged list whose pages are requested from pagefunc(pagenum) only
    when a slice needs them; fetched pages are kept in a per-instance
    cache when use_cache is true """

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list.
        With end=None, pages are read until one comes back short. """
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            page_first = pagenum * self._pagesize
            next_page_first = pagenum * self._pagesize + self._pagesize
            if start >= next_page_first:
                continue

            entries = self._cache.get(pagenum) if self._use_cache else None
            if entries is None:
                entries = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = entries

            # Offset of the first wanted entry within this page
            if page_first <= start < next_page_first:
                lo = start % self._pagesize
            else:
                lo = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and page_first <= end <= next_page_first:
                hi = ((end - 1) % self._pagesize) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lo < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_page_first:
                break
        return collected
1998
1999
class InAdvancePagedList(PagedList):
    """ Paged list for which the number of pages (pagecount) is known
    up front """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list;
        with end=None every page from the starting one up to pagecount
        is read """
        collected = []
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the front of the first page
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
2027
2028
def uppercase_escape(s):
    """ Decode every \\UXXXXXXXX escape sequence found in s """
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(mobj):
        decoded, _ = decode(mobj.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', _unescape, s)
2035
2036
def lowercase_escape(s):
    """ Decode every \\uXXXX escape sequence found in s """
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(mobj):
        decoded, _ = decode(mobj.group(0))
        return decoded

    return re.sub(r'\\u[0-9a-fA-F]{4}', _unescape, s)
2043
2044
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 a unicode string is handed to quote() as UTF-8 bytes
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    # Characters in this set are passed through unquoted
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
2050
2051
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded; every other component goes through
    # escape_rfc3986
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=ascii_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
2062
2063
def read_batch_urls(batch_fd):
    """ Read URLs, one per line, from the file-like object batch_fd.

    Byte lines are decoded as UTF-8, a leading byte order mark and
    surrounding whitespace are removed, and empty lines as well as lines
    starting with '#', ';' or ']' are skipped. batch_fd is closed
    afterwards. Returns the list of remaining URLs. """
    # '\ufeff' is the BOM after a proper decode; '\xef\xbb\xbf' is how the
    # UTF-8 BOM bytes look when they were decoded byte-for-byte instead.
    # str.strip() removes neither, so both are handled explicitly.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2078
2079
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments with compat_urllib_parse_urlencode and
    return the result as ASCII bytes """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2082
2083
def update_url_query(url, query):
    """ Return url with the parameters from the query mapping merged into
    its query string; an empty query leaves url untouched """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    # Entries from query replace existing parameters of the same name
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2092
2093
def update_Request(req, url=None, data=None, headers={}, query={}):
    """ Build a new request from req with some parts overridden.

    url and data replace the originals when given, headers are merged
    over the original headers and query is merged into the URL's query
    string. HEAD and PUT requests keep their method; a timeout attribute
    on req is carried over. """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = compat_urllib_request.Request

    updated = request_class(
        new_url, data=new_data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        updated.timeout = req.timeout
    return updated
2112
2113
def _multipart_encode_impl(data, boundary):
    """ Encode the data dict as multipart/form-data using boundary.

    Returns (body_bytes, content_type). Raises ValueError when the
    boundary occurs inside one of the encoded parts. """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes + b'\r\n'

    chunks = []
    for k, v in data.items():
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(delimiter)
        chunks.append(content)

    chunks.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(chunks), content_type
2134
2135
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If a Unicode object is given, it is used as the boundary and a
        ValueError is raised when it clashes with the data. Otherwise
        random boundaries are generated until one does not clash.

    Returns a tuple (encoded_bytes, content_type).

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # Caller-chosen boundary: a clash propagates as ValueError
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary occurred in the data; draw another one
            continue
2164
2165
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Look up one key, or the first usable of several keys, in d.

    With a list or tuple of keys, the first key whose value is not None
    (and, unless skip_false_values is false, not falsy) wins; default is
    returned when none qualifies. A single key is a plain d.get(). """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2174
2175
def try_get(src, getter, expected_type=None):
    """ Apply getter (a callable, or a list/tuple of callables tried in
    order) to src and return the first result that is obtained without an
    AttributeError, KeyError, TypeError or IndexError and that is an
    instance of expected_type (if given). Returns None otherwise. """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for get in getters:
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2187
2188
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Return string as a compat_str, decoding it with encoding/errors
    when it is not one already """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2191
2192
# Rating label -> age limit; consulted by parse_age_limit() for strings
# that are not plain numbers
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2200
2201
# TV rating label -> age limit; parse_age_limit() falls back to this table
# when a string is neither a number nor a key of US_RATINGS
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2210
2211
def parse_age_limit(s):
    """ Turn an age rating into a numeric age limit.

    Integers are accepted as is when within 0..21. Strings may be a one-
    or two-digit number with an optional trailing '+', or a key of
    US_RATINGS or TV_PARENTAL_GUIDELINES. Anything else gives None. """
    # Exact type check: only objects whose type is precisely int take
    # this branch
    if type(s) == int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj is not None:
        return int(mobj.group('age'))
    try:
        return US_RATINGS[s]
    except KeyError:
        return TV_PARENTAL_GUIDELINES.get(s)
2223
2224
def strip_jsonp(code):
    """ Return the argument text of a JSONP call such as 'callback({...});'.
    Input that does not look like such a call is returned unchanged. """
    jsonp_re = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_re.sub(r'\g<callback_data>', code)
2233
2234
def js_to_json(code):
    """ Rewrite a JavaScript object literal so that it can be parsed as
    JSON: strings and bare identifiers become double-quoted strings,
    comments and trailing commas are dropped, and hexadecimal/octal
    integers are converted to decimal. """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )
    # How escape sequences (and bare double quotes) inside a string literal
    # are rewritten; anything not listed is kept as is
    ESCAPE_MAP = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }
    TOKEN_RE = r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE)

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        # Comments and commas directly before a closing bracket vanish
        if token == ',' or token.startswith(('/*', '//')):
            return ""

        if token[0] in ("'", '"'):
            token = re.sub(
                r'(?s)\\.|"',
                lambda em: ESCAPE_MAP.get(em.group(0), em.group(0)),
                token[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, token)
            if im:
                i = int(im.group(1), base)
                # A token that swallowed its ':' is an object key
                return '"%d":' % i if token.endswith(':') else '%d' % i

        return '"%s"' % token

    return re.sub(TOKEN_RE, fix_kv, code)
2274
2275
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The position within quality_ids is the quality; unknown ids get -1
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2284
2285
# %-style template with the named fields title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2287
2288
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    # Leave room for the ellipses so the result is exactly `length` long
    return s[:length - len(ELLIPSES)] + ELLIPSES
2297
2298
def version_tuple(v):
    """ Split a version string on '.' and '-' into a tuple of ints """
    return tuple(map(int, re.split(r'[-.]', v)))
2301
2302
def is_outdated_version(version, limit, assume_new=True):
    """ Return whether version is older than limit.

    When version is empty or one of the two strings cannot be parsed by
    version_tuple(), the answer is `not assume_new`. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
2310
2311
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded through a zipimporter or when sys
    # has a 'frozen' attribute
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
2317
2318
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2322
2323
def error_to_compat_str(err):
    """ Return str(err) as a text string """
    err_str = str(err)
    if sys.version_info[0] >= 3:
        return err_str
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return err_str.decode(preferredencoding())
2331
2332
def mimetype2ext(mt):
    """ Map a MIME type to a file extension.

    A few full types are looked up first; otherwise the subtype (the text
    after the last '/', without parameters, lowercased) is translated
    through a table and returned unchanged when the table has no entry.
    Returns None for None. """
    if mt is None:
        return None

    FULL_TYPE_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }

    ext = FULL_TYPE_MAP.get(mt)
    if ext is not None:
        return ext

    subtype = mt.rpartition('/')[2].split(';')[0].strip().lower()
    return SUBTYPE_MAP.get(subtype, subtype)
2368
2369
def parse_codecs(codecs_str):
    """ Split a comma-separated codecs string into video and audio codec.

    Returns a dict with 'vcodec' and 'acodec':
    * if at least one entry is a recognised codec, the first recognised
      video and audio codecs are reported and a missing one is 'none';
    * if nothing is recognised and there are exactly two entries, they
      are taken as video and audio codec, in that order;
    * otherwise (empty input, or unrecognised entries that cannot be
      told apart) an empty dict is returned.
    A warning is written to stderr for every unrecognised entry. """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = [
        part.strip()
        for part in codecs_str.strip().strip(',').split(',')
        if part.strip()]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # The codec family is the part before the first '.'
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    # Nothing was recognised, so vcodec and acodec are both still None and
    # must not be returned; fall back on the position of the entries.
    if len(splited_codecs) == 2:
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    return {}
2404
2405
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the response headers of url_handle:
    the filename of an attachment Content-Disposition is preferred, with
    the Content-Type as fallback """
    getheader = url_handle.headers.get

    content_disposition = getheader('Content-Disposition')
    mobj = content_disposition and re.match(
        r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
    if mobj:
        ext = determine_ext(mobj.group('filename'), default_ext=None)
        if ext:
            return ext

    return mimetype2ext(getheader('Content-Type'))
2418
2419
def encode_data_uri(data, mime_type):
    """ Return a base64 'data:' URI carrying the bytes data with the given
    MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2422
2423
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2432
2433
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The UTF-32 LE mark starts with the UTF-16 LE one, so it has to be
    # tested first
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is taken to be UTF-8
    encoding, offset = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = enc, len(bom)
            break
    s = first_bytes[offset:].decode(encoding, 'replace')

    # The result is the match object (or None), not a strict bool
    return re.match(r'^\s*<', s)
2452
2453
def determine_protocol(info_dict):
    """ Return the protocol of a format: its explicit 'protocol' entry if
    set, otherwise a guess from the start of the URL, from the URL's
    extension (m3u8, f4m) or, finally, the URL scheme """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for streaming_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_prefix):
            return streaming_prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2474
2475
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of every column: its longest cell once converted to a string
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Every column but the last is left-aligned and padded to its width
    # plus one; the last one is emitted as is
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join([row_format % tuple(row) for row in rows])
2482
2483
2484 def _match_one(filter_part, dct):
2485     COMPARISON_OPERATORS = {
2486         '<': operator.lt,
2487         '<=': operator.le,
2488         '>': operator.gt,
2489         '>=': operator.ge,
2490         '=': operator.eq,
2491         '!=': operator.ne,
2492     }
2493     operator_rex = re.compile(r'''(?x)\s*
2494         (?P<key>[a-z_]+)
2495         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2496         (?:
2497             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2498             (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
2499             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2500         )
2501         \s*$
2502         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2503     m = operator_rex.search(filter_part)
2504     if m:
2505         op = COMPARISON_OPERATORS[m.group('op')]
2506         actual_value = dct.get(m.group('key'))
2507         if (m.group('quotedstrval') is not None or
2508             m.group('strval') is not None or
2509             # If the original field is a string and matching comparisonvalue is
2510             # a number we should respect the origin of the original field
2511             # and process comparison value as a string (see
2512             # https://github.com/rg3/youtube-dl/issues/11082).
2513             actual_value is not None and m.group('intval') is not None and
2514                 isinstance(actual_value, compat_str)):
2515             if m.group('op') not in ('=', '!='):
2516                 raise ValueError(
2517                     'Operator %s does not support string values!' % m.group('op'))
2518             comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2519             quote = m.group('quote')
2520             if quote is not None:
2521                 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
2522         else:
2523             try:
2524                 comparison_value = int(m.group('intval'))
2525             except ValueError:
2526                 comparison_value = parse_filesize(m.group('intval'))
2527                 if comparison_value is None:
2528                     comparison_value = parse_filesize(m.group('intval') + 'B')
2529                 if comparison_value is None:
2530                     raise ValueError(
2531                         'Invalid integer value %r in filter part %r' % (
2532                             m.group('intval'), filter_part))
2533         if actual_value is None:
2534             return m.group('none_inclusive')
2535         return op(actual_value, comparison_value)
2536
2537     UNARY_OPERATORS = {
2538         '': lambda v: v is not None,
2539         '!': lambda v: v is None,
2540     }
2541     operator_rex = re.compile(r'''(?x)\s*
2542         (?P<op>%s)\s*(?P<key>[a-z_]+)
2543         \s*$
2544         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2545     m = operator_rex.search(filter_part)
2546     if m:
2547         op = UNARY_OPERATORS[m.group('op')]
2548         actual_value = dct.get(m.group('key'))
2549         return op(actual_value)
2550
2551     raise ValueError('Invalid filter part %r' % filter_part)
2552
2553
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The parts separated by '&' must all pass; evaluation stops at the
    # first one that does not
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2559
2560
def match_filter_func(filter_str):
    """ Return a function that checks an info dict against filter_str: it
    returns None when the dict passes and a message explaining the skip
    otherwise """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2569
2570
def parse_dfxp_time_expr(time_expr):
    """ Convert a time expression to seconds.

    Accepts an offset in seconds with an optional trailing 's', or a
    clock time H:MM:SS whose fraction is introduced by '.' or ':'.
    Returns None for empty or unrecognised input. """
    if not time_expr:
        return None

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2582
2583
def srt_subtitles_timecode(seconds):
    """ Format a number of seconds as an SRT timecode HH:MM:SS,mmm """
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d drops the fractional part of each component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2586
2587
2588 def dfxp2srt(dfxp_data):
2589     '''
2590     @param dfxp_data A bytes-like object containing DFXP data
2591     @returns A unicode object containing converted SRT data
2592     '''
2593     LEGACY_NAMESPACES = (
2594         (b'http://www.w3.org/ns/ttml', [
2595             b'http://www.w3.org/2004/11/ttaf1',
2596             b'http://www.w3.org/2006/04/ttaf1',
2597             b'http://www.w3.org/2006/10/ttaf1',
2598         ]),
2599         (b'http://www.w3.org/ns/ttml#styling', [
2600             b'http://www.w3.org/ns/ttml#style',
2601         ]),
2602     )
2603
2604     SUPPORTED_STYLING = [
2605         'color',
2606         'fontFamily',
2607         'fontSize',
2608         'fontStyle',
2609         'fontWeight',
2610         'textDecoration'
2611     ]
2612
2613     _x = functools.partial(xpath_with_ns, ns_map={
2614         'ttml': 'http://www.w3.org/ns/ttml',
2615         'tts': 'http://www.w3.org/ns/ttml#styling',
2616     })
2617
2618     styles = {}
2619     default_style = {}
2620
2621     class TTMLPElementParser(object):
2622         _out = ''
2623         _unclosed_elements = []
2624         _applied_styles = []
2625
2626         def start(self, tag, attrib):
2627             if tag in (_x('ttml:br'), 'br'):
2628                 self._out += '\n'
2629             else:
2630                 unclosed_elements = []
2631                 style = {}
2632                 element_style_id = attrib.get('style')
2633                 if default_style:
2634                     style.update(default_style)
2635                 if element_style_id:
2636                     style.update(styles.get(element_style_id, {}))
2637                 for prop in SUPPORTED_STYLING:
2638                     prop_val = attrib.get(_x('tts:' + prop))
2639                     if prop_val:
2640                         style[prop] = prop_val
2641                 if style:
2642                     font = ''
2643                     for k, v in sorted(style.items()):
2644                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
2645                             continue
2646                         if k == 'color':
2647                             font += ' color="%s"' % v
2648                         elif k == 'fontSize':
2649                             font += ' size="%s"' % v
2650                         elif k == 'fontFamily':
2651                             font += ' face="%s"' % v
2652                         elif k == 'fontWeight' and v == 'bold':
2653                             self._out += '<b>'
2654                             unclosed_elements.append('b')
2655                         elif k == 'fontStyle' and v == 'italic':
2656                             self._out += '<i>'
2657                             unclosed_elements.append('i')
2658                         elif k == 'textDecoration' and v == 'underline':
2659                             self._out += '<u>'
2660                             unclosed_elements.append('u')
2661                     if font:
2662                         self._out += '<font' + font + '>'
2663                         unclosed_elements.append('font')
2664                     applied_style = {}
2665                     if self._applied_styles:
2666                         applied_style.update(self._applied_styles[-1])
2667                     applied_style.update(style)
2668                     self._applied_styles.append(applied_style)
2669                 self._unclosed_elements.append(unclosed_elements)
2670
2671         def end(self, tag):
2672             if tag not in (_x('ttml:br'), 'br'):
2673                 unclosed_elements = self._unclosed_elements.pop()
2674                 for element in reversed(unclosed_elements):
2675                     self._out += '</%s>' % element
2676                 if unclosed_elements and self._applied_styles:
2677                     self._applied_styles.pop()
2678
2679         def data(self, data):
2680             self._out += data
2681
2682         def close(self):
2683             return self._out.strip()
2684
2685     def parse_node(node):
2686         target = TTMLPElementParser()
2687         parser = xml.etree.ElementTree.XMLParser(target=target)
2688         parser.feed(xml.etree.ElementTree.tostring(node))
2689         return parser.close()
2690
2691     for k, v in LEGACY_NAMESPACES:
2692         for ns in v:
2693             dfxp_data = dfxp_data.replace(ns, k)
2694
2695     dfxp = compat_etree_fromstring(dfxp_data)
2696     out = []
2697     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
2698
2699     if not paras:
2700         raise ValueError('Invalid dfxp/TTML subtitle')
2701
2702     repeat = False
2703     while True:
2704         for style in dfxp.findall(_x('.//ttml:style')):
2705             style_id = style.get('id')
2706             parent_style_id = style.get('style')
2707             if parent_style_id:
2708                 if parent_style_id not in styles:
2709                     repeat = True
2710                     continue
2711                 styles[style_id] = styles[parent_style_id].copy()
2712             for prop in SUPPORTED_STYLING:
2713                 prop_val = style.get(_x('tts:' + prop))
2714                 if prop_val:
2715                     styles.setdefault(style_id, {})[prop] = prop_val
2716         if repeat:
2717             repeat = False
2718         else:
2719             break
2720
2721     for p in ('body', 'div'):
2722         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2723         if ele is None:
2724             continue
2725         style = styles.get(ele.get('style'))
2726         if not style:
2727             continue
2728         default_style.update(style)
2729
2730     for para, index in zip(paras, itertools.count(1)):
2731         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2732         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2733         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2734         if begin_time is None:
2735             continue
2736         if not end_time:
2737             if not dur:
2738                 continue
2739             end_time = begin_time + dur
2740         out.append('%d\n%s --> %s\n%s\n\n' % (
2741             index,
2742             srt_subtitles_timecode(begin_time),
2743             srt_subtitles_timecode(end_time),
2744             parse_node(para)))
2745
2746     return ''.join(out)
2747
2748
def cli_option(params, command_option, param):
    """Build the argument list for an option that takes a value

    Looks up ``param`` in ``params``. A missing (None) value yields an empty
    list; otherwise the result is ``[command_option, value]``, where a truthy
    value is converted with compat_str and a falsy non-None value is passed
    through unchanged.
    """
    value = params.get(param)
    if value is None:
        return []
    if value:
        value = compat_str(value)
    return [command_option, value]
2754
2755
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argument list for a boolean option

    Returns an empty list when ``params[param]`` is missing (None). Otherwise
    the boolean is rendered as ``true_value`` or ``false_value`` and returned
    either as a single ``option<separator>value`` item (when ``separator`` is
    given) or as the two items ``[option, value]``.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if not separator:
        return [command_option, rendered]
    return [command_option + separator + rendered]
2764
2765
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Build the argument list for a flag that takes no value

    The flag is emitted only when ``params[param]`` equals ``expected_value``;
    otherwise an empty list is returned.
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2769
2770
def cli_configuration_args(params, param, default=[]):
    """Return the extra command-line arguments configured under ``param``

    When ``params[param]`` is missing (None), ``default`` is used instead.
    A list default is returned as a fresh copy, so a caller that mutates the
    result can corrupt neither the shared ``default=[]`` object nor a list
    passed in by the caller. A configured value must be a list and is
    returned as is.
    """
    ex_args = params.get(param)
    if ex_args is None:
        # The signature's ``[]`` is a single object shared by every call;
        # returning it directly would let one caller's mutation leak into
        # all later calls.
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2777
2778
class ISO639Utils(object):
    """Conversions between two-letter and three-letter language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps two-letter codes (keys) to three-letter codes (values)
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are used for the lookup;
        None is returned when they are not in the table.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None when no table entry has code as its value.
        """
        return next(
            (short_name
             for short_name, long_name in cls._lang_map.items()
             if long_name == code),
            None)
2979
2980
class ISO3166Utils(object):
    """Lookup of full country names by two-letter country code"""

    # From http://data.okfn.org/data/core/country-list
    # Maps upper-case two-letter country codes (keys) to country names
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code to the corresponding full name

        The code is upper-cased before the lookup, so matching is
        case-insensitive. Returns None for codes missing from the table.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
3239
3240
class GeoUtils(object):
    """Helpers for picking an IPv4 address from a per-country block"""

    # Major IPv4 address blocks per country
    # Maps upper-case two-letter codes (keys) to one CIDR block each
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code):
        """Return a random dotted-quad address from the block listed for code

        The code is upper-cased before the lookup. Returns None when the
        table has no block for it.
        """
        cidr = cls._country_ip_map.get(code.upper())
        if not cidr:
            return None
        network, prefix_len = cidr.split('/')
        # Lowest address of the block as a 32-bit big-endian integer
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting every host bit gives the highest address of the block
        highest = lowest | (0xffffffff >> int(prefix_len))
        picked = random.randint(lowest, highest)
        return compat_str(socket.inet_ntoa(compat_struct_pack('!L', picked)))
3493
3494
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler whose proxy can be overridden per request

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to
    use for that request only; the header is removed before the request is
    processed further.
    """

    def __init__(self, proxies=None):
        # Install default http/https handlers first; the base class
        # constructor is called afterwards with the given proxies.
        for scheme in ('http', 'https'):
            setattr(
                self, scheme + '_open',
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Pick the proxy for req, preferring its 'Ytdl-request-proxy' header

        Returns None when no proxy is to be used or when the proxy is a
        SOCKS one (recorded in the 'Ytdl-socks-proxy' header instead);
        otherwise defers to the base class implementation.
        """
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers take care of wrapping the
            # socket with SOCKS
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3518
3519
3520 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3521 # released into Public Domain
3522 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3523
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Values that are not positive produce a single zero byte. If optional
    blocksize is given and greater than zero, the front of the byte string
    is padded with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # Emit 32-bit big-endian words, least significant word first
    words = []
    while n > 0:
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    packed = b''.join(reversed(words))
    # Drop leading zero bytes; nothing is left only when no word was packed
    packed = packed.lstrip(b'\000') or b'\000'
    if blocksize > 0:
        remainder = len(packed) % blocksize
        if remainder:
            packed = (blocksize - remainder) * b'\000' + packed
    return packed
3552
3553
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes(). An empty string
    converts to 0.
    """
    # Left-pad with zero bytes up to a whole number of 32-bit words
    missing = -len(s) % 4
    if missing:
        s = b'\000' * missing + s
    total = 0
    for offset in range(0, len(s), 4):
        word = compat_struct_unpack('>I', s[offset:offset + 4])[0]
        total = (total << 32) + word
    return total
3569
3570
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as one hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return '%x' % ciphertext
3586
3587
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is laid out as 00 || 02 || PS || 00 || data, where PS is a
    string of random non-zero bytes that fills the block up to length.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves room for less than 8 bytes
                               of padding (len(data) > length - 11)
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding bytes must never be zero: the first zero byte following the
    # 00 02 prefix is what marks the start of the data when unpadding
    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + padding + [0] + data
3601
3602
def encode_base_n(num, n, table=None):
    """
    Convert the integer num to its string representation in base n

    @param {int} num      number to convert; a negative number is encoded
                          as '-' followed by the digits of its absolute value
    @param {int} n        base, must be at least 2
    @param {str} table    digit characters, table[i] being the digit of
                          value i; defaults to the first n characters of
                          0-9, a-z, A-Z
    @returns {str}        num written in base n
    @raises ValueError    if n is smaller than 2 or if table provides less
                          than n digits
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    # A base below 2 can never consume the number in the loop below
    if n < 2:
        raise ValueError('base %d is not supported, it must be at least 2' % n)
    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Work on the absolute value: floor division of a negative number
    # settles on -1 and would never reach 0
    remaining = abs(num)
    digits = []
    while remaining:
        remaining, digit = divmod(remaining, n)
        digits.append(table[digit])
    if num < 0:
        digits.append('-')
    # Digits were collected least significant first
    return ''.join(reversed(digits))
3619
3620
def decode_packed_codes(code):
    """
    Unpack code that was obfuscated by replacing its words with base-n indexes

    The first match of PACKED_CODES_RE in code provides four groups: the
    obfuscated code, the base, the number of symbols and the '|' separated
    symbols.  Every word of the obfuscated code that is the base-n encoding
    of an index below that number is replaced with the symbol at that index;
    an empty symbol means the word stands for itself.
    (Presumably this targets the output of Dean Edwards' JavaScript packer;
    confirm against PACKED_CODES_RE.)

    @param {str} code     text containing the packed code
    @returns {str}        the unpacked code
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    # Words without an entry in the table are not packed symbols: leave them
    # untouched instead of failing with a KeyError
    return re.sub(
        r'\b(\w+)\b',
        lambda mobj: symbol_table.get(mobj.group(0), mobj.group(0)),
        obfuscated_code)
3637
3638
def parse_m3u8_attributes(attrib):
    """
    Parse a comma separated KEY=VALUE attribute list into a dict

    Values may be enclosed in double quotes, in which case they may contain
    commas; the surrounding quotes are not part of the returned value.  When
    a key occurs more than once, its last value is kept.

    @param {str} attrib   attribute list
    @returns {dict}       attribute name -> value, both strings
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return dict(
        (name, raw[1:-1] if raw.startswith('"') else raw)
        for name, raw in pairs)
3646
3647
def urshift(val, n):
    """
    Shift val right by n bits without sign extension

    A negative val is first mapped to its unsigned 32-bit equivalent by
    adding 2 ** 32.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
3650
3651
3652 # Based on png2str() written by @gdkchan and improved by @yokrysty
3653 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """
    Decode a PNG image into rows of unfiltered byte values

    Every scanline is read as one filter byte followed by width * 3 bytes,
    i.e. 3 bytes per pixel.  Apart from width and height, nothing in the
    IHDR chunk is looked at, and chunk CRCs are not verified.

    @param {bytes} png_data   content of the PNG file
    @returns                  (width, height, pixels), pixels being a list
                              of height rows of width * 3 integers each
    @raises IOError           if the PNG signature or the leading IHDR
                              chunk is missing, or if there is no IDAT data
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature = png_data[:8]
    remaining = png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        # Big-endian unsigned integer made of 1, 2 or 4 bytes
        return compat_struct_unpack(formats_by_size[len(raw)], raw)[0]

    # A chunk is laid out as: 4-byte length, 4-byte type, data, 4-byte CRC
    chunks = []
    while remaining:
        size = read_uint(remaining[:4])
        chunks.append((remaining[4:8], remaining[8:8 + size]))
        remaining = remaining[8 + size + 4:]  # also skips the CRC

    ihdr = chunks[0][1]
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    # The compressed image is the concatenation of all IDAT chunks
    idat = b''.join(data for kind, data in chunks if kind == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    raw = bytearray(zlib.decompress(idat))

    bpp = 3  # bytes per pixel
    stride = width * bpp
    pixels = []

    for row_index in range(height):
        # Each scanline is prefixed with one byte holding its filter type
        row_start = row_index * (stride + 1)
        filter_type = raw[row_start]

        previous_row = pixels[row_index - 1] if row_index > 0 else None
        row = []
        pixels.append(row)

        for col in range(stride):
            value = raw[row_start + 1 + col]
            # Neighbouring bytes that are already decoded; 0 outside the image
            left = row[col - bpp] if col >= bpp else 0
            up = previous_row[col] if row_index > 0 else 0

            if filter_type == 1:  # Sub
                predictor = left
            elif filter_type == 2:  # Up
                predictor = up
            elif filter_type == 3:  # Average
                predictor = (left + up) >> 1
            elif filter_type == 4:  # Paeth
                upper_left = 0
                if col >= bpp and row_index > 0:
                    upper_left = previous_row[col - bpp]

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                # Pick the neighbour closest to the estimate, ties being
                # resolved in the order left, up, upper left
                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
            else:  # any other filter type: the byte is stored as is
                predictor = 0

            row.append((value + predictor) & 0xff)

    return width, height, pixels
3757
3758
def write_xattr(path, key, value):
    """
    Set the extended attribute key to value on the file at path

    Backends are tried in this order:
      1. the Python module importable as 'xattr': its 'set' function when it
         has one (pyxattr), its 'setxattr' function otherwise
      2. if that import fails and compat_os_name is 'nt': an NTFS Alternate
         Data Stream named '<path>:<key>'
      3. if that import fails elsewhere: the 'setfattr' executable, or else
         the 'xattr' executable

    value is handed over unchanged to backend 1, written to a file opened in
    binary mode by backend 2 and decoded as UTF-8 by backend 3, so it is
    presumably expected to be a byte string (verify against callers).

    Raises XAttrMetadataError when the selected backend fails to write the
    attribute, and XAttrUnavailableError when pyxattr is older than 0.5.0 or
    when none of the backends is available.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        # Two different modules install themselves as 'xattr'; only pyxattr
        # provides a 'set' function
        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE(review): despite the wording of the message, nothing
                # in this function falls back after this raise; unless
                # XAttrUnavailableError derives from ImportError (it is
                # defined elsewhere, confirm), it propagates to the caller
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            # Report OS level failures with the module's own exception type
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No 'xattr' Python module: use what the platform offers instead
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            # NOTE(review): assert statements are removed when Python runs
            # with -O, so these two checks are not guaranteed to be enforced
            assert ':' not in key
            assert os.path.exists(path)

            # The stream is addressed as '<file name>:<stream name>'
            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            # Probe for the command line tools
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The value is passed as a command line argument, so it is
                # turned into text first
                value = value.decode('utf-8')
                # setfattr takes precedence when both tools are present
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                # Command line: executable, its options, then the target file
                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                # A non-zero exit status is reported together with whatever
                # the tool printed on stderr
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                # Only the installation advice differs between the two
                # messages below
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
3841
3842
def random_birthday(year_field, month_field, day_field):
    """
    Generate a random birthday between 1950-01-01 and 1995-12-31

    The date is drawn from the calendar, so the day always exists in the
    returned month and year (no 31st of February).

    @param year_field     key to store the year under
    @param month_field    key to store the month under
    @param day_field      key to store the day under
    @returns {dict}       the three keys mapped to the year, month and day
                          as decimal strings without zero padding
    """
    first_day = datetime.date(1950, 1, 1)
    last_day = datetime.date(1995, 12, 31)
    # Pick a day offset inside the range instead of drawing year, month and
    # day independently, which could produce impossible dates
    offset = random.randint(0, (last_day - first_day).days)
    birthday = first_day + datetime.timedelta(days=offset)
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }