Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import socket
  28 import ssl
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_HTMLParser,
  38     compat_basestring,
  39     compat_chr,
  40     compat_etree_fromstring,
  41     compat_html_entities,
  42     compat_html_entities_html5,
  43     compat_http_client,
  44     compat_kwargs,
  45     compat_parse_qs,
  46     compat_shlex_quote,
  47     compat_socket_create_connection,
  48     compat_str,
  49     compat_struct_pack,
  50     compat_urllib_error,
  51     compat_urllib_parse,
  52     compat_urllib_parse_urlencode,
  53     compat_urllib_parse_urlparse,
  54     compat_urllib_parse_unquote_plus,
  55     compat_urllib_request,
  56     compat_urlparse,
  57     compat_xpath,
  58 )
  59
  60 from .socks import (
  61     ProxyType,
  62     sockssocket,
  63 )
  64
  65
  66 def register_socks_protocols():
  67     # "Register" SOCKS protocols
  68     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  69     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  70     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  71         if scheme not in compat_urlparse.uses_netloc:
  72             compat_urlparse.uses_netloc.append(scheme)
  73
  74
  75 # This is not clearly defined otherwise
  76 compiled_regex_type = type(re.compile(''))
  77
  78 std_headers = {
  79     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
  80     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  81     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  82     'Accept-Encoding': 'gzip, deflate',
  83     'Accept-Language': 'en-us,en;q=0.5',
  84 }
  85
  86
  87 NO_DEFAULT = object()
  88
  89 ENGLISH_MONTH_NAMES = [
  90     'January', 'February', 'March', 'April', 'May', 'June',
  91     'July', 'August', 'September', 'October', 'November', 'December']
  92
  93 KNOWN_EXTENSIONS = (
  94     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
  95     'flv', 'f4v', 'f4a', 'f4b',
  96     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
  97     'mkv', 'mka', 'mk3d',
  98     'avi', 'divx',
  99     'mov',
 100     'asf', 'wmv', 'wma',
 101     '3gp', '3g2',
 102     'mp3',
 103     'flac',
 104     'ape',
 105     'wav',
 106     'f4f', 'f4m', 'm3u8', 'smil')
 107
 108 # needed for sanitizing filenames in restricted mode
 109 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 110                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
 111                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 112
 113
 114 def preferredencoding():
 115     """Get preferred encoding.
 116
 117     Returns the best encoding scheme for the system, based on
 118     locale.getpreferredencoding() and some further tweaks.
 119     """
 120     try:
 121         pref = locale.getpreferredencoding()
 122         'TEST'.encode(pref)
 123     except Exception:
 124         pref = 'UTF-8'
 125
 126     return pref
 127
 128
 129 def write_json_file(obj, fn):
 130     """ Encode obj as JSON and write it to fn, atomically if possible """
 131
 132     fn = encodeFilename(fn)
 133     if sys.version_info < (3, 0) and sys.platform != 'win32':
 134         encoding = get_filesystem_encoding()
 135         # os.path.basename returns a bytes object, but NamedTemporaryFile
 136         # will fail if the filename contains non ascii characters unless we
 137         # use a unicode object
 138         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 139         # the same for os.path.dirname
 140         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 141     else:
 142         path_basename = os.path.basename
 143         path_dirname = os.path.dirname
 144
 145     args = {
 146         'suffix': '.tmp',
 147         'prefix': path_basename(fn) + '.',
 148         'dir': path_dirname(fn),
 149         'delete': False,
 150     }
 151
 152     # In Python 2.x, json.dump expects a bytestream.
 153     # In Python 3.x, it writes to a character stream
 154     if sys.version_info < (3, 0):
 155         args['mode'] = 'wb'
 156     else:
 157         args.update({
 158             'mode': 'w',
 159             'encoding': 'utf-8',
 160         })
 161
 162     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 163
 164     try:
 165         with tf:
 166             json.dump(obj, tf)
 167         if sys.platform == 'win32':
 168             # Need to remove existing file on Windows, else os.rename raises
 169             # WindowsError or FileExistsError.
 170             try:
 171                 os.unlink(fn)
 172             except OSError:
 173                 pass
 174         os.rename(tf.name, fn)
 175     except Exception:
 176         try:
 177             os.remove(tf.name)
 178         except OSError:
 179             pass
 180         raise
 181
 182
 183 if sys.version_info >= (2, 7):
 184     def find_xpath_attr(node, xpath, key, val=None):
 185         """ Find the xpath xpath[@key=val] """
 186         assert re.match(r'^[a-zA-Z_-]+$', key)
 187         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 188         return node.find(expr)
 189 else:
 190     def find_xpath_attr(node, xpath, key, val=None):
 191         for f in node.findall(compat_xpath(xpath)):
 192             if key not in f.attrib:
 193                 continue
 194             if val is None or f.attrib.get(key) == val:
 195                 return f
 196         return None
 197
 198 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 199 # the namespace parameter
 200
 201
 202 def xpath_with_ns(path, ns_map):
 203     components = [c.split(':') for c in path.split('/')]
 204     replaced = []
 205     for c in components:
 206         if len(c) == 1:
 207             replaced.append(c[0])
 208         else:
 209             ns, tag = c
 210             replaced.append('{%s}%s' % (ns_map[ns], tag))
 211     return '/'.join(replaced)
 212
 213
 214 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 215     def _find_xpath(xpath):
 216         return node.find(compat_xpath(xpath))
 217
 218     if isinstance(xpath, (str, compat_str)):
 219         n = _find_xpath(xpath)
 220     else:
 221         for xp in xpath:
 222             n = _find_xpath(xp)
 223             if n is not None:
 224                 break
 225
 226     if n is None:
 227         if default is not NO_DEFAULT:
 228             return default
 229         elif fatal:
 230             name = xpath if name is None else name
 231             raise ExtractorError('Could not find XML element %s' % name)
 232         else:
 233             return None
 234     return n
 235
 236
 237 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 238     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 239     if n is None or n == default:
 240         return n
 241     if n.text is None:
 242         if default is not NO_DEFAULT:
 243             return default
 244         elif fatal:
 245             name = xpath if name is None else name
 246             raise ExtractorError('Could not find XML element\'s text %s' % name)
 247         else:
 248             return None
 249     return n.text
 250
 251
 252 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 253     n = find_xpath_attr(node, xpath, key)
 254     if n is None:
 255         if default is not NO_DEFAULT:
 256             return default
 257         elif fatal:
 258             name = '%s[@%s]' % (xpath, key) if name is None else name
 259             raise ExtractorError('Could not find XML attribute %s' % name)
 260         else:
 261             return None
 262     return n.attrib[key]
 263
 264
 265 def get_element_by_id(id, html):
 266     """Return the content of the tag with the specified ID in the passed HTML document"""
 267     return get_element_by_attribute('id', id, html)
 268
 269
 270 def get_element_by_attribute(attribute, value, html):
 271     """Return the content of the tag with the specified attribute in the passed HTML document"""
 272
 273     m = re.search(r'''(?xs)
 274         <([a-zA-Z0-9:._-]+)
 275          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 276          \s+%s=['"]?%s['"]?
 277          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 278         \s*>
 279         (?P<content>.*?)
 280         </\1>
 281     ''' % (re.escape(attribute), re.escape(value)), html)
 282
 283     if not m:
 284         return None
 285     res = m.group('content')
 286
 287     if res.startswith('"') or res.startswith("'"):
 288         res = res[1:-1]
 289
 290     return unescapeHTML(res)
 291
 292
 293 class HTMLAttributeParser(compat_HTMLParser):
 294     """Trivial HTML parser to gather the attributes for a single element"""
 295     def __init__(self):
 296         self.attrs = {}
 297         compat_HTMLParser.__init__(self)
 298
 299     def handle_starttag(self, tag, attrs):
 300         self.attrs = dict(attrs)
 301
 302
 303 def extract_attributes(html_element):
 304     """Given a string for an HTML element such as
 305     <el
 306          a="foo" B="bar" c="&98;az" d=boz
 307          empty= noval entity="&amp;"
 308          sq='"' dq="'"
 309     >
 310     Decode and return a dictionary of attributes.
 311     {
 312         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 313         'empty': '', 'noval': None, 'entity': '&',
 314         'sq': '"', 'dq': '\''
 315     }.
 316     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 317     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 318     """
 319     parser = HTMLAttributeParser()
 320     parser.feed(html_element)
 321     parser.close()
 322     return parser.attrs
 323
 324
 325 def clean_html(html):
 326     """Clean an HTML snippet into a readable string"""
 327
 328     if html is None:  # Convenience for sanitizing descriptions etc.
 329         return html
 330
 331     # Newline vs <br />
 332     html = html.replace('\n', ' ')
 333     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 334     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 335     # Strip html tags
 336     html = re.sub('<.*?>', '', html)
 337     # Replace html entities
 338     html = unescapeHTML(html)
 339     return html.strip()
 340
 341
 342 def sanitize_open(filename, open_mode):
 343     """Try to open the given filename, and slightly tweak it if this fails.
 344
 345     Attempts to open the given filename. If this fails, it tries to change
 346     the filename slightly, step by step, until it's either able to open it
 347     or it fails and raises a final exception, like the standard open()
 348     function.
 349
 350     It returns the tuple (stream, definitive_file_name).
 351     """
 352     try:
 353         if filename == '-':
 354             if sys.platform == 'win32':
 355                 import msvcrt
 356                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 357             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 358         stream = open(encodeFilename(filename), open_mode)
 359         return (stream, filename)
 360     except (IOError, OSError) as err:
 361         if err.errno in (errno.EACCES,):
 362             raise
 363
 364         # In case of error, try to remove win32 forbidden chars
 365         alt_filename = sanitize_path(filename)
 366         if alt_filename == filename:
 367             raise
 368         else:
 369             # An exception here should be caught in the caller
 370             stream = open(encodeFilename(alt_filename), open_mode)
 371             return (stream, alt_filename)
 372
 373
 374 def timeconvert(timestr):
 375     """Convert RFC 2822 defined time string into system timestamp"""
 376     timestamp = None
 377     timetuple = email.utils.parsedate_tz(timestr)
 378     if timetuple is not None:
 379         timestamp = email.utils.mktime_tz(timetuple)
 380     return timestamp
 381
 382
 383 def sanitize_filename(s, restricted=False, is_id=False):
 384     """Sanitizes a string so it could be used as part of a filename.
 385     If restricted is set, use a stricter subset of allowed characters.
 386     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 387     """
 388     def replace_insane(char):
 389         if restricted and char in ACCENT_CHARS:
 390             return ACCENT_CHARS[char]
 391         if char == '?' or ord(char) < 32 or ord(char) == 127:
 392             return ''
 393         elif char == '"':
 394             return '' if restricted else '\''
 395         elif char == ':':
 396             return '_-' if restricted else ' -'
 397         elif char in '\\/|*<>':
 398             return '_'
 399         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 400             return '_'
 401         if restricted and ord(char) > 127:
 402             return '_'
 403         return char
 404
 405     # Handle timestamps
 406     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 407     result = ''.join(map(replace_insane, s))
 408     if not is_id:
 409         while '__' in result:
 410             result = result.replace('__', '_')
 411         result = result.strip('_')
 412         # Common case of "Foreign band name - English song title"
 413         if restricted and result.startswith('-_'):
 414             result = result[2:]
 415         if result.startswith('-'):
 416             result = '_' + result[len('-'):]
 417         result = result.lstrip('.')
 418         if not result:
 419             result = '_'
 420     return result
 421
 422
 423 def sanitize_path(s):
 424     """Sanitizes and normalizes path on Windows"""
 425     if sys.platform != 'win32':
 426         return s
 427     drive_or_unc, _ = os.path.splitdrive(s)
 428     if sys.version_info < (2, 7) and not drive_or_unc:
 429         drive_or_unc, _ = os.path.splitunc(s)
 430     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 431     if drive_or_unc:
 432         norm_path.pop(0)
 433     sanitized_path = [
 434         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 435         for path_part in norm_path]
 436     if drive_or_unc:
 437         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 438     return os.path.join(*sanitized_path)
 439
 440
 441 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 442 # unwanted failures due to missing protocol
 443 def sanitize_url(url):
 444     return 'http:%s' % url if url.startswith('//') else url
 445
 446
 447 def sanitized_Request(url, *args, **kwargs):
 448     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 449
 450
 451 def orderedSet(iterable):
 452     """ Remove all duplicates from the input iterable """
 453     res = []
 454     for el in iterable:
 455         if el not in res:
 456             res.append(el)
 457     return res
 458
 459
 460 def _htmlentity_transform(entity_with_semicolon):
 461     """Transforms an HTML entity to a character."""
 462     entity = entity_with_semicolon[:-1]
 463
 464     # Known non-numeric HTML entity
 465     if entity in compat_html_entities.name2codepoint:
 466         return compat_chr(compat_html_entities.name2codepoint[entity])
 467
 468     # TODO: HTML5 allows entities without a semicolon. For example,
 469     # '&Eacuteric' should be decoded as 'Éric'.
 470     if entity_with_semicolon in compat_html_entities_html5:
 471         return compat_html_entities_html5[entity_with_semicolon]
 472
 473     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 474     if mobj is not None:
 475         numstr = mobj.group(1)
 476         if numstr.startswith('x'):
 477             base = 16
 478             numstr = '0%s' % numstr
 479         else:
 480             base = 10
 481         # See https://github.com/rg3/youtube-dl/issues/7518
 482         try:
 483             return compat_chr(int(numstr, base))
 484         except ValueError:
 485             pass
 486
 487     # Unknown entity in name, return its literal representation
 488     return '&%s;' % entity
 489
 490
 491 def unescapeHTML(s):
 492     if s is None:
 493         return None
 494     assert type(s) == compat_str
 495
 496     return re.sub(
 497         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 498
 499
 500 def get_subprocess_encoding():
 501     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 502         # For subprocess calls, encode with locale encoding
 503         # Refer to http://stackoverflow.com/a/9951851/35070
 504         encoding = preferredencoding()
 505     else:
 506         encoding = sys.getfilesystemencoding()
 507     if encoding is None:
 508         encoding = 'utf-8'
 509     return encoding
 510
 511
 512 def encodeFilename(s, for_subprocess=False):
 513     """
 514     @param s The name of the file
 515     """
 516
 517     assert type(s) == compat_str
 518
 519     # Python 3 has a Unicode API
 520     if sys.version_info >= (3, 0):
 521         return s
 522
 523     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 524     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 525     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 526     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 527         return s
 528
 529     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 530     if sys.platform.startswith('java'):
 531         return s
 532
 533     return s.encode(get_subprocess_encoding(), 'ignore')
 534
 535
 536 def decodeFilename(b, for_subprocess=False):
 537
 538     if sys.version_info >= (3, 0):
 539         return b
 540
 541     if not isinstance(b, bytes):
 542         return b
 543
 544     return b.decode(get_subprocess_encoding(), 'ignore')
 545
 546
 547 def encodeArgument(s):
 548     if not isinstance(s, compat_str):
 549         # Legacy code that uses byte strings
 550         # Uncomment the following line after fixing all post processors
 551         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 552         s = s.decode('ascii')
 553     return encodeFilename(s, True)
 554
 555
 556 def decodeArgument(b):
 557     return decodeFilename(b, True)
 558
 559
 560 def decodeOption(optval):
 561     if optval is None:
 562         return optval
 563     if isinstance(optval, bytes):
 564         optval = optval.decode(preferredencoding())
 565
 566     assert isinstance(optval, compat_str)
 567     return optval
 568
 569
 570 def formatSeconds(secs):
 571     if secs > 3600:
 572         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 573     elif secs > 60:
 574         return '%d:%02d' % (secs // 60, secs % 60)
 575     else:
 576         return '%d' % secs
 577
 578
 579 def make_HTTPS_handler(params, **kwargs):
 580     opts_no_check_certificate = params.get('nocheckcertificate', False)
 581     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 582         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 583         if opts_no_check_certificate:
 584             context.check_hostname = False
 585             context.verify_mode = ssl.CERT_NONE
 586         try:
 587             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 588         except TypeError:
 589             # Python 2.7.8
 590             # (create_default_context present but HTTPSHandler has no context=)
 591             pass
 592
 593     if sys.version_info < (3, 2):
 594         return YoutubeDLHTTPSHandler(params, **kwargs)
 595     else:  # Python < 3.4
 596         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 597         context.verify_mode = (ssl.CERT_NONE
 598                                if opts_no_check_certificate
 599                                else ssl.CERT_REQUIRED)
 600         context.set_default_verify_paths()
 601         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 602
 603
 604 def bug_reports_message():
 605     if ytdl_is_updateable():
 606         update_cmd = 'type  youtube-dl -U  to update'
 607     else:
 608         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 609     msg = '; please report this issue on https://yt-dl.org/bug .'
 610     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 611     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 612     return msg
 613
 614
 615 class ExtractorError(Exception):
 616     """Error during info extraction."""
 617
 618     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 619         """ tb, if given, is the original traceback (so that it can be printed out).
 620         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 621         """
 622
 623         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 624             expected = True
 625         if video_id is not None:
 626             msg = video_id + ': ' + msg
 627         if cause:
 628             msg += ' (caused by %r)' % cause
 629         if not expected:
 630             msg += bug_reports_message()
 631         super(ExtractorError, self).__init__(msg)
 632
 633         self.traceback = tb
 634         self.exc_info = sys.exc_info()  # preserve original exception
 635         self.cause = cause
 636         self.video_id = video_id
 637
 638     def format_traceback(self):
 639         if self.traceback is None:
 640             return None
 641         return ''.join(traceback.format_tb(self.traceback))
 642
 643
 644 class UnsupportedError(ExtractorError):
 645     def __init__(self, url):
 646         super(UnsupportedError, self).__init__(
 647             'Unsupported URL: %s' % url, expected=True)
 648         self.url = url
 649
 650
 651 class RegexNotFoundError(ExtractorError):
 652     """Error when a regex didn't match"""
 653     pass
 654
 655
 656 class DownloadError(Exception):
 657     """Download Error exception.
 658
 659     This exception may be thrown by FileDownloader objects if they are not
 660     configured to continue on errors. They will contain the appropriate
 661     error message.
 662     """
 663
 664     def __init__(self, msg, exc_info=None):
 665         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 666         super(DownloadError, self).__init__(msg)
 667         self.exc_info = exc_info
 668
 669
 670 class SameFileError(Exception):
 671     """Same File exception.
 672
 673     This exception will be thrown by FileDownloader objects if they detect
 674     multiple files would have to be downloaded to the same file on disk.
 675     """
 676     pass
 677
 678
 679 class PostProcessingError(Exception):
 680     """Post Processing exception.
 681
 682     This exception may be raised by PostProcessor's .run() method to
 683     indicate an error in the postprocessing task.
 684     """
 685
 686     def __init__(self, msg):
 687         self.msg = msg
 688
 689
 690 class MaxDownloadsReached(Exception):
 691     """ --max-downloads limit has been reached. """
 692     pass
 693
 694
 695 class UnavailableVideoError(Exception):
 696     """Unavailable Format exception.
 697
 698     This exception will be thrown when a video is requested
 699     in a format that is not available for that video.
 700     """
 701     pass
 702
 703
 704 class ContentTooShortError(Exception):
 705     """Content Too Short exception.
 706
 707     This exception may be raised by FileDownloader objects when a file they
 708     download is too small for what the server announced first, indicating
 709     the connection was probably interrupted.
 710     """
 711
 712     def __init__(self, downloaded, expected):
 713         # Both in bytes
 714         self.downloaded = downloaded
 715         self.expected = expected
 716
 717
 718 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 719     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 720     # expected HTTP responses to meet HTTP/1.0 or later (see also
 721     # https://github.com/rg3/youtube-dl/issues/6727)
 722     if sys.version_info < (3, 0):
 723         kwargs[b'strict'] = True
 724     hc = http_class(*args, **kwargs)
 725     source_address = ydl_handler._params.get('source_address')
 726     if source_address is not None:
 727         sa = (source_address, 0)
 728         if hasattr(hc, 'source_address'):  # Python 2.7+
 729             hc.source_address = sa
 730         else:  # Python 2.6
 731             def _hc_connect(self, *args, **kwargs):
 732                 sock = compat_socket_create_connection(
 733                     (self.host, self.port), self.timeout, sa)
 734                 if is_https:
 735                     self.sock = ssl.wrap_socket(
 736                         sock, self.key_file, self.cert_file,
 737                         ssl_version=ssl.PROTOCOL_TLSv1)
 738                 else:
 739                     self.sock = sock
 740             hc.connect = functools.partial(_hc_connect, hc)
 741
 742     return hc
 743
 744
 745 def handle_youtubedl_headers(headers):
 746     filtered_headers = headers
 747
 748     if 'Youtubedl-no-compression' in filtered_headers:
 749         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 750         del filtered_headers['Youtubedl-no-compression']
 751
 752     return filtered_headers
 753
 754
 755 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 756     """Handler for HTTP requests and responses.
 757
 758     This class, when installed with an OpenerDirector, automatically adds
 759     the standard headers to every HTTP request and handles gzipped and
 760     deflated responses from web servers. If compression is to be avoided in
 761     a particular request, the original request in the program code only has
 762     to include the HTTP header "Youtubedl-no-compression", which will be
 763     removed before making the real request.
 764
 765     Part of this code was copied from:
 766
 767     http://techknack.net/python-urllib2-handlers/
 768
 769     Andrew Rowls, the author of that code, agreed to release it to the
 770     public domain.
 771     """
 772
 773     def __init__(self, params, *args, **kwargs):
 774         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 775         self._params = params
 776
 777     def http_open(self, req):
 778         conn_class = compat_http_client.HTTPConnection
 779
 780         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 781         if socks_proxy:
 782             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 783             del req.headers['Ytdl-socks-proxy']
 784
 785         return self.do_open(functools.partial(
 786             _create_http_connection, self, conn_class, False),
 787             req)
 788
 789     @staticmethod
 790     def deflate(data):
 791         try:
 792             return zlib.decompress(data, -zlib.MAX_WBITS)
 793         except zlib.error:
 794             return zlib.decompress(data)
 795
 796     @staticmethod
 797     def addinfourl_wrapper(stream, headers, url, code):
 798         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 799             return compat_urllib_request.addinfourl(stream, headers, url, code)
 800         ret = compat_urllib_request.addinfourl(stream, headers, url)
 801         ret.code = code
 802         return ret
 803
 804     def http_request(self, req):
 805         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 806         # always respected by websites, some tend to give out URLs with non percent-encoded
 807         # non-ASCII characters (see telemb.py, ard.py [#3412])
 808         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 809         # To work around aforementioned issue we will replace request's original URL with
 810         # percent-encoded one
 811         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 812         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 813         url = req.get_full_url()
 814         url_escaped = escape_url(url)
 815
 816         # Substitute URL if any change after escaping
 817         if url != url_escaped:
 818             req = update_Request(req, url=url_escaped)
 819
 820         for h, v in std_headers.items():
 821             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 822             # The dict keys are capitalized because of this bug by urllib
 823             if h.capitalize() not in req.headers:
 824                 req.add_header(h, v)
 825
 826         req.headers = handle_youtubedl_headers(req.headers)
 827
 828         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 829             # Python 2.6 is brain-dead when it comes to fragments
 830             req._Request__original = req._Request__original.partition('#')[0]
 831             req._Request__r_type = req._Request__r_type.partition('#')[0]
 832
 833         return req
 834
    def http_response(self, req, resp):
        """Post-process a response before it is handed to the caller.

        Undoes gzip and deflate content encodings (wrapping the decoded body
        with self.addinfourl_wrapper) and percent-encodes the Location header
        of 3xx responses.  Returns the response object to use from here on.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with the last 1..1023 bytes cut off; the first
                # truncation that decompresses wins.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            # The body is decoded now, so the header no longer applies
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                # so on Python 3 the value is turned back into bytes first and
                # then decoded as UTF-8; on Python 2 it is decoded directly.
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                # Only rewrite the header when escaping changed something
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp
 881
 882     https_request = http_request
 883     https_response = http_response
 884
 885
 886 def make_socks_conn_class(base_class, socks_proxy):
 887     assert issubclass(base_class, (
 888         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
 889
 890     url_components = compat_urlparse.urlparse(socks_proxy)
 891     if url_components.scheme.lower() == 'socks5':
 892         socks_type = ProxyType.SOCKS5
 893     elif url_components.scheme.lower() in ('socks', 'socks4'):
 894         socks_type = ProxyType.SOCKS4
 895     elif url_components.scheme.lower() == 'socks4a':
 896         socks_type = ProxyType.SOCKS4A
 897
 898     def unquote_if_non_empty(s):
 899         if not s:
 900             return s
 901         return compat_urllib_parse_unquote_plus(s)
 902
 903     proxy_args = (
 904         socks_type,
 905         url_components.hostname, url_components.port or 1080,
 906         True,  # Remote DNS
 907         unquote_if_non_empty(url_components.username),
 908         unquote_if_non_empty(url_components.password),
 909     )
 910
 911     class SocksConnection(base_class):
 912         def connect(self):
 913             self.sock = sockssocket()
 914             self.sock.setproxy(*proxy_args)
 915             if type(self.timeout) in (int, float):
 916                 self.sock.settimeout(self.timeout)
 917             self.sock.connect((self.host, self.port))
 918
 919             if isinstance(self, compat_http_client.HTTPSConnection):
 920                 if hasattr(self, '_context'):  # Python > 2.6
 921                     self.sock = self._context.wrap_socket(
 922                         self.sock, server_hostname=self.host)
 923                 else:
 924                     self.sock = ssl.wrap_socket(self.sock)
 925
 926     return SocksConnection
 927
 928
 929 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 930     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 931         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 932         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 933         self._params = params
 934
 935     def https_open(self, req):
 936         kwargs = {}
 937         conn_class = self._https_conn_class
 938
 939         if hasattr(self, '_context'):  # python > 2.6
 940             kwargs['context'] = self._context
 941         if hasattr(self, '_check_hostname'):  # python 3.x
 942             kwargs['check_hostname'] = self._check_hostname
 943
 944         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 945         if socks_proxy:
 946             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 947             del req.headers['Ytdl-socks-proxy']
 948
 949         return self.do_open(functools.partial(
 950             _create_http_connection, self, conn_class, True),
 951             req, **kwargs)
 952
 953
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that also registers its request/response hooks
    for https."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # The workaround kept below (percent-encoding the Set-Cookie headers
        # before HTTPCookieProcessor processes them) is currently DISABLED,
        # so this method only delegates to the base class.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the same hooks for https traffic
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
 976
 977
 978 def parse_iso8601(date_str, delimiter='T', timezone=None):
 979     """ Return a UNIX timestamp from the given date """
 980
 981     if date_str is None:
 982         return None
 983
 984     date_str = re.sub(r'\.[0-9]+', '', date_str)
 985
 986     if timezone is None:
 987         m = re.search(
 988             r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 989             date_str)
 990         if not m:
 991             timezone = datetime.timedelta()
 992         else:
 993             date_str = date_str[:-len(m.group(0))]
 994             if not m.group('sign'):
 995                 timezone = datetime.timedelta()
 996             else:
 997                 sign = 1 if m.group('sign') == '+' else -1
 998                 timezone = datetime.timedelta(
 999                     hours=sign * int(m.group('hours')),
1000                     minutes=sign * int(m.group('minutes')))
1001     try:
1002         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1003         dt = datetime.datetime.strptime(date_str, date_format) - timezone
1004         return calendar.timegm(dt.timetuple())
1005     except ValueError:
1006         pass
1007
1008
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    date_str is tried against a list of known date formats; day_first
    selects whether ambiguous numeric dates are read as day/month or
    month/day.  If no format matches, email.utils.parsedate_tz is used as
    a fallback.  Returns None when date_str is None or cannot be parsed.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    # so a trailing offset is stripped, except for plain D-M-YYYY dates
    # whose "-YYYY" tail would otherwise be mistaken for an offset
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        # "3rd" / "23rd" were not recognised before
        '%b %drd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d.%m.%y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # Every format is tried; if several match, the last one wins
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to the RFC 2822 style parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
1077
1078
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url.

    The text after the last dot of the part before '?' is returned when it
    is purely alphanumeric, or when it is a known extension followed by
    slashes; otherwise (and for a None url) default_ext is returned.
    """
    if url is None:
        return default_ext
    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    without_slashes = candidate.rstrip('/')
    if without_slashes in KNOWN_EXTENSIONS:
        return without_slashes
    return default_ext
1090
1091
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by
    '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1094
1095
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' are accepted as well.  Months and years
    are approximated as 30 and 365 days.  A string in none of these forms
    raises ValueError (from strptime)."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: amount})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1123
1124
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that do not consist of exactly eight digits are returned
    unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1133
1134
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; None leaves that side of the range open.

        Raises ValueError if start lies after end."""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed like start/end
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
1164
1165
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte string is decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1174
1175
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the target stream.  The special path
    (WriteConsoleW) is only taken when out has file descriptor 1 or 2 and
    the matching handle passes the console checks below; in every other
    case nothing is written and False is returned.  Raises OSError if
    WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (presumably STD_OUTPUT_HANDLE / STD_ERROR_HANDLE - confirm against the
    # Win32 documentation)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the count reported by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for a missing/invalid handle, a handle whose file type
        # (ignoring the REMOTE bit) is not CHAR, or one for which
        # GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop right before the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of 2
        # is passed for it
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
1249
1250
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32, when no encoding is given and out has a fileno, the console
    writer is tried first.  Otherwise s is encoded (ignoring unencodable
    characters) for binary streams and Python 2, written to out.buffer when
    the stream has one, or written as text as a last resort.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_writer = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_writer and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        charset = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(charset, 'ignore'))
    else:
        out.write(s)
    out.flush()
1271
1272
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values.

    An empty input gives an empty list."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and 1-character strings on Python 2
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(item) for item in bs]
1280
1281
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one unsigned byte
    ('B') per item; an empty sequence gives b''."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1286
1287
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) with an implementation
# picked at import time: LockFileEx/UnlockFileEx on win32, fcntl.flock
# elsewhere, and stubs that raise IOError when fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed to LockFileEx/UnlockFileEx below
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high parts of the byte count passed to the lock and unlock calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f via LockFileEx; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same
        # structure again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 is used for exclusive locks, 0x0 otherwise (presumably
        # LOCKFILE_EXCLUSIVE_LOCK - confirm against the Win32 documentation)
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Lock f with flock: LOCK_EX if exclusive, else LOCK_SH."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock lock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Stub used without fcntl: always raises IOError."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Stub used without fcntl: always raises IOError."""
            raise IOError(UNSUPPORTED_MSG)
1361
1362
class locked_file(object):
    """Context manager that opens a file and holds a lock on it while the
    with block runs; iteration, read and write are forwarded to the file."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers take a shared lock, writers an exclusive one
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leave the file open when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1392
1393
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1397
1398
def shell_quote(args):
    """Return the arguments in args quoted and joined by single spaces.

    Byte strings are decoded with the filesystem encoding first.  Quoting
    is delegated to compat_shlex_quote (imported at the top of this file)
    rather than pipes.quote: the pipes module is deprecated and was removed
    from the standard library in Python 3.13.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
1408
1409
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialised as JSON, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse_urlencode(payload)
    return url + '#' + fragment
1416
1417
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    A URL without smuggled data is returned unchanged together with
    default."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The payload lives after the last '#'
    real_url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return real_url, json.loads(encoded)
1425
1426
def format_bytes(bytes):
    """Format a byte count as a string with two decimals and a binary unit
    suffix (B, KiB, MiB, ... YiB).

    bytes may be a number or a numeric str; None gives 'N/A'.  The unit is
    clamped to the available suffixes, so values of 1024 YiB and more are
    shown in YiB and positive values below one byte are shown in B.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Unclamped, huge values raised IndexError and tiny fractions gave a
        # negative index that selected the wrong suffix
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1439
1440
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of s and return the number times
    unit_table[unit] as an int.

    The number may use ',' or '.' as decimal separator.  Returns None when
    s does not start with a number followed by one of the units.
    """
    unit_alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % unit_alternatives, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1450
1451
def parse_filesize(s):
    """Parse a file size such as '5 MiB' or '1,5GB' into a number of bytes.

    Returns None for a None input or when s cannot be parsed.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    # For every prefix: XiB and xB are powers of 1024, XB and Xb powers of
    # 1000 (entries are added in the order KiB, KB, kB, Kb, MiB, ...)
    for power, prefix in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** power
        _UNIT_TABLE[prefix + 'B'] = 1000 ** power
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[prefix + 'b'] = 1000 ** power

    return lookup_unit_table(_UNIT_TABLE, s)
1496
1497
def parse_count(s):
    """Parse a count such as '1,000' or '1.5k' into an int.

    Returns None for a None input or when s cannot be parsed.
    """
    if s is None:
        return None

    text = s.strip()

    # Only digits and separators: no unit suffix to interpret
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(multipliers, text)
1517
1518
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Unknown names give None. """
    if name not in ENGLISH_MONTH_NAMES:
        return None
    return ENGLISH_MONTH_NAMES.index(name) + 1
1526
1527
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations

        The abbreviation is the first three letters of the month name;
        unknown abbreviations give None. """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1536
1537
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the entities amp, lt, gt, apos,
    quot or a numeric character reference are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1544
1545
def setproctitle(title):
    """Set the process name to title via libc's prctl, best effort.

    title must be a compat_str.  Nothing happens on Jython, when libc.so.6
    cannot be loaded or when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # item for the terminating NUL.  A buffer of exactly len(title_bytes)
    # filled through .value has no terminator, so prctl could read past it.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME (see prctl(2))
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1565
1566
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged when it
    is None or does not begin with start."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1569
1570
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged when it is
    None or does not finish with end."""
    if s is not None and s.endswith(end):
        # Slice by absolute position: s[:-len(end)] is s[:0] == '' for an
        # empty end and would erase the whole string
        return s[:len(s) - len(end)]
    return s
1573
1574
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s.

    None, strings shorter than two characters and strings without matching
    outer quotes are returned unchanged."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1582
1583
def url_basename(url):
    """Return the last segment of the path of url, ignoring leading and
    trailing slashes."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
1587
1588
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1592
1593
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default when that is not possible.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None).  None and '' give default.  The
    result is int(v) * invscale // scale.  Values that int() rejects
    (ValueError, TypeError, OverflowError) give default as well.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: e.g. a list or dict; OverflowError: float infinity
        return default
1606
1607
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1610
1611
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before the conversion; None is
    passed through. """
    if int_str is None:
        return None
    digits_only = re.sub(r'[,\.\+]', '', int_str)
    return int(digits_only)
1618
1619
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, returning default when that is not possible.

    None gives default.  The result is float(v) * invscale / scale.
    Values that float() rejects (ValueError, TypeError) give default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: v is not a string or a number, e.g. a list or dict
        return default
1627
1628
def parse_duration(s):
    """Parse a duration string and return the number of seconds.

    Three notations are tried in turn: colon separated
    ([[[DD:]HH:]MM:]SS[.ms]), unit suffixed (e.g. '1h 2m 3s', optionally
    prefixed by PT) and a single fractional amount of hours or minutes.
    Returns None if s is not a string or matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    found = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if not found:
        found = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
    if found:
        # Both patterns above define the same five groups in the same order
        days, hours, mins, secs, ms = found.groups()
    else:
        found = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
        if not found:
            return None
        hours, mins = found.groups()

    # Sum up whatever components were found
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its leading dot, so it is a fraction of a second
        duration += float(ms)
    return duration
1675
1676
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the extension of filename.

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
1683
1684
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename by ext.

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        stem = name
    else:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
1690
1691
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    False is returned when starting the binary raises OSError. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Wait for the process and discard its output
        process.communicate()
    except OSError:
        return False
    return exe
1700
1701
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The executable is run with args, stderr merged into stdout, and the
    output is handed to detect_exe_version together with version_re and
    unrecognized. """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1715
1716
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of a program.

    version_re must contain one capturing group; by default the token
    following the word 'version' is taken.  unrecognized is returned when
    the pattern is not found in output.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1726
1727
class PagedList(object):
    """Base class for paged lists; relies on a getslice() method that this
    class itself does not define."""

    def __len__(self):
        # This is only useful for tests
        # (it fetches the complete slice just to count its entries)
        return len(self.getslice())
1732
1733
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages only when a slice needs them."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        # pagefunc is called with a page number and must return an iterable
        # holding the entries of that page
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            # page number -> list of entries already fetched
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (not including) end as
        a list; end=None means up to the last entry."""
        res = []
        # Begin with the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of start inside this page (0 for every later page)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of end inside this page, or None if end lies beyond it
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1784
1785
class InAdvancePagedList(PagedList):
    """ Paged list for sources whose number of pages is known beforehand.

    pagefunc(pagenum) must return an iterable with the entries of the given
    page, pagecount is the total number of pages and pagesize the number of
    entries per page. """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries between indices start and end as a list.

        Without end, all pages from the one containing start up to
        pagecount are fetched. """
        pagesize = self._pagesize
        first_page = start // pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // pagesize + 1
            remaining = end - start
        # Number of leading entries to drop from the first fetched page
        to_skip = start - first_page * pagesize

        collected = []
        for pagenum in range(first_page, stop_page):
            entries = list(self._pagefunc(pagenum))
            if to_skip:
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested range
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1813
1814
def uppercase_escape(s):
    """ Decode the backslash-U escapes (eight hex digits) contained in s.

    Everything else in s is left untouched. """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns (decoded text, consumed length)
        text, _ = decode(match.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
1821
1822
def lowercase_escape(s):
    """ Decode the backslash-u escapes (four hex digits) contained in s.

    Everything else in s is left untouched. """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns (decoded text, consumed length)
        text, _ = decode(match.group(0))
        return text

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
1829
1830
def escape_rfc3986(s):
    """Percent-encode s, keeping '%' and the RFC 3986 reserved characters"""
    # On Python 2 text has to be turned into UTF-8 bytes before quoting
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1836
1837
def escape_url(url):
    """Escape the components of url as suggested by RFC 3986

    The host part is converted with the 'idna' codec, all other components
    go through escape_rfc3986()."""
    parts = compat_urllib_parse_urlparse(url)
    idna_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=idna_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1848
1849
def read_batch_urls(batch_fd):
    """ Read the URLs listed in a batch file object.

    batch_fd is iterated line by line and closed afterwards. Byte lines are
    decoded as UTF-8 (invalid sequences are replaced), a leading byte order
    mark is removed and surrounding whitespace is stripped. Empty lines and
    lines starting with '#', ';' or ']' are skipped.
    Returns the list of the remaining URLs. """
    # A UTF-8 BOM is the single character '\ufeff' once decoded correctly,
    # or the three characters '\xef\xbb\xbf' if the bytes were read with a
    # single-byte encoding. str.strip() removes neither of them.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1864
1865
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments and return the result as ASCII bytes.

    All arguments are forwarded to compat_urllib_parse_urlencode(). """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1868
1869
def update_url_query(url, query):
    """ Return url with the parameters from query merged into its query string.

    Parameters already present in url are replaced by same-named entries of
    query. An empty query returns url unchanged. """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    # doseq=True: the parsed values are lists and must be expanded again
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
1878
1879
def update_Request(req, url=None, data=None, headers={}, query={}):
    """ Build a new request based on req with some parts replaced.

    url and data override the values of req when they are truthy, headers
    are merged over the headers of req and query is merged into the URL via
    update_url_query(). A HEAD request stays a HEADRequest; a timeout
    attribute on req is carried over. """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    if req.get_method() == 'HEAD':
        request_cls = HEADRequest
    else:
        request_cls = compat_urllib_request.Request
    clone = request_cls(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
1892
1893
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Look up one key or the first usable key out of several in d.

    With a list or tuple of keys, the value of the first key that is present
    and not None is returned; falsy values are skipped as well unless
    skip_false_values is False. default is returned if no key qualifies.
    A single key is simply looked up with d.get(key, default). """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1902
1903
def try_get(src, getter, expected_type=None):
    """ Apply getter to src and return the result, or None on failure.

    AttributeError, KeyError, TypeError and IndexError raised by getter are
    swallowed. If expected_type is given, a result that is not an instance
    of it yields None as well. """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(value, expected_type):
        return value
    return None
1912
1913
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Return string as compat_str.

    A compat_str instance is returned unchanged; anything else is passed to
    compat_str() together with encoding and errors. """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1916
1917
# Age limits associated with US rating labels. parse_age_limit() falls back
# to this table for values that are not plain numbers.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1925
1926
def parse_age_limit(s):
    """ Convert an age limit description into a number.

    One or two digits, optionally followed by '+', are returned as int.
    Any other string is looked up in US_RATINGS, which gives None for
    unknown labels. None is passed through. """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
1932
1933
def strip_jsonp(code):
    """ Remove a JSONP wrapper "callback(...);" and return what is inside.

    A trailing // comment after the call is dropped too. Input that does not
    look like a JSONP call is returned unchanged. """
    jsonp_call = re.compile(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_call.sub(r'\1', code)
1937
1938
def js_to_json(code):
    """ Convert a JavaScript object literal into JSON text.

    The following tokens of code are rewritten, everything else is kept:
    - single or double quoted strings become double quoted JSON strings
    - /* ... */ comments and trailing commas before ']' or '}' are removed
    - bare identifiers are quoted (true, false and null are kept as is)
    - hexadecimal and octal integers are converted to decimal; if such a
      number is used as a key (followed by ':') it is quoted as well
    - decimal integers used as keys are quoted
    Returns the converted text. """
    INTEGER_TABLE = (
        (r'^0[xX][0-9a-fA-F]+', 16),
        (r'^0+[0-7]+', 8),
    )
    # Replacements applied inside string literals; escapes that are not
    # listed here are already valid in JSON and stay as they are
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            v = re.sub(
                r'(?s)\\.|"',
                lambda esc: STRING_ESCAPES.get(esc.group(0), esc.group(0)),
                v[1:-1])
            # A string literal stays a string, even if its content looks
            # like a hexadecimal or octal number ("0x10", '0123abc')
            return '"%s"' % v

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(0), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        # Identifier or decimal key
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
1976
1977
def qualities(quality_ids):
    """ Build a function that ranks quality ids by their position in quality_ids.

    The returned function gives the index of the id in quality_ids, or -1
    for ids that are not listed. """
    def quality_rank(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return quality_rank
1986
1987
# %-style template with named placeholders for the title, id and ext fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1989
1990
def limit_length(s, length):
    """ Shorten s to at most length characters, marking the cut with '...'

    Strings that already fit, as well as None, are returned unchanged. """
    ELLIPSES = '...'
    if s is None or len(s) <= length:
        return s
    kept = length - len(ELLIPSES)
    return s[:kept] + ELLIPSES
1999
2000
def version_tuple(v):
    """ Split a version string at '.' and '-' into a tuple of ints.

    Raises ValueError if a component is not an integer. """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2003
2004
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    Both are compared as version_tuple() results. If version is empty or
    one of the two cannot be parsed, the answer is the opposite of
    assume_new. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
2012
2013
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute. """
    from zipimport import zipimporter

    module_loader = globals().get('__loader__')
    if isinstance(module_loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
2019
2020
def args_to_str(args):
    """ Join a subprocess argument list into one short, shell-quoted string. """
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2024
2025
def error_to_compat_str(err):
    """ Return the message of err as text on both Python 2 and 3. """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 str() gives a byte string, which has to be decoded with
    # the proper encoding rather than ascii
    return message.decode(preferredencoding())
2033
2034
def mimetype2ext(mt):
    """ Map a MIME type to a file extension.

    mt may carry parameters, as a Content-Type header does
    ('text/vtt; charset=utf-8'); they are ignored, and the type is compared
    case-insensitively. Known types are translated through the tables
    below, for anything else the subtype (the part after the last '/') is
    returned. None is passed through. """
    if mt is None:
        return None

    # Drop parameters such as "; charset=..." and normalise the case: media
    # type and subtype names are case-insensitive
    mt = mt.split(';')[0].strip().lower()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
2061
2062
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the headers of url_handle.

    The filename of an 'attachment' Content-Disposition header is tried
    first; if it gives no extension, the Content-Type header is mapped with
    mimetype2ext(). """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = None
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
2075
2076
def encode_data_uri(data, mime_type):
    """ Build a base64 'data:' URI for the bytes in data with the given MIME type. """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2079
2080
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no age limit is set (age_limit is None) or when
    the content has no limit (content_limit is None). """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2089
2090
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to a leading byte order mark (UTF-8
    without one); the result is the match object for optional whitespace
    followed by '<', or None. """

    # The UTF-32 marks have to be tested before the UTF-16 ones, since a
    # UTF-16 mark is a prefix of the UTF-32 little endian mark
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, offset = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = enc, len(bom)
            break
    text = first_bytes[offset:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
2109
2110
def determine_protocol(info_dict):
    """ Work out the download protocol for info_dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from info_dict['url']: from its prefix (rtmp, mms, rtsp), from its
    extension (m3u8, f4m) or finally from the URL scheme. """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
2131
2132
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    header_row comes first. All columns but the last are left-aligned and
    padded to the width of their longest cell plus one; rows are joined
    with newlines. """
    rows = [header_row] + data
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    column_specs = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(column_specs) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2139
2140
def _match_one(filter_part, dct):
    """ Evaluate a single filter expression against the dictionary dct.

    Two forms are understood:
    - "key OP value" with OP one of <, <=, >, >=, =, !=. A '?' directly
      after the operator decides what happens if dct has no value for key
      (see below). value is either a number, optionally with a size suffix,
      or a string of letters and digits; strings only work with = and !=.
    - "key" or "!key", which test whether dct has a value for key that is
      not None (or, with '!', has none).

    Returns the result of the comparison. If dct has no value for the key
    of a comparison, the matched '?' part is returned instead, i.e. a
    truthy string when '?' was given and None otherwise.
    Raises ValueError for expressions that cannot be parsed, for string
    values used with an ordering operator and for invalid numbers. """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try it as a file size, first as
                # written and then with a 'B' appended
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # No value to compare: the outcome depends on the optional '?'
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Not a comparison, so try the presence tests "key" and "!key"
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2198
2199
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    filter_str consists of parts separated by '&'; every part has to match
    dct, and evaluation stops at the first part that does not. """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2205
2206
def match_filter_func(filter_str):
    """ Create a filter function for info dicts from filter_str.

    The returned function gives None for an info dict that passes
    match_str(), and otherwise a message explaining why it is skipped. """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2215
2216
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP time expression into seconds (float).

    Accepted are a plain number with an optional 's' suffix and the clock
    form H:MM:SS, whose seconds may carry a fraction separated by '.' or
    ':'. Returns None for empty or unrecognised input. """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2228
2229
def srt_subtitles_timecode(seconds):
    """ Format a time in seconds as an SRT timecode 'HH:MM:SS,mmm'.

    seconds may be an int or a float and is rounded to the nearest
    millisecond. """
    # Work on whole milliseconds: formatting the float parts separately
    # truncates them, which loses a millisecond for values such as 1.001
    # that are stored slightly below their decimal representation
    total_milliseconds = int(round(seconds * 1000))
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, milliseconds)
2232
2233
def dfxp2srt(dfxp_data):
    """ Convert DFXP/TTML subtitles into SRT text.

    dfxp_data is the subtitle document as text. Every <p> element with a
    usable begin time and either an end time or a duration becomes one SRT
    entry; other paragraphs are skipped but still counted in the numbering.
    Raises ValueError if the document contains no <p> element at all. """
    # Expands "prefix:tag" into the namespaced tag names used by ElementTree
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target that collects the text of a paragraph and turns
        # <br> elements into line breaks; all other markup is dropped
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialise the element again and feed it through the target above
        # to obtain its plain text
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Try the known namespaces one after another, then no namespace at all
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # No usable end time: derive it from the duration if there is one
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2287
2288
def cli_option(params, command_option, param):
    """ Build the command line arguments for an option that takes a value.

    Returns [command_option, value] with the value stored under param in
    params, or an empty list if that value is missing or None. """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2292
2293
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the command line arguments for a boolean option.

    The value stored under param in params has to be a bool; it is rendered
    as true_value or false_value. With a separator a single argument
    "<option><separator><value>" is returned, otherwise the option and the
    value as two arguments. """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2300
2301
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Build the command line arguments for an option without a value.

    Returns [command_option] if the value stored under param in params
    equals expected_value, and an empty list otherwise. """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2305
2306
def cli_configuration_args(params, param, default=[]):
    """ Return the list of extra arguments stored under param in params.

    default is returned if there is no such entry (or it is None); an entry
    that is present has to be a list. """
    ex_args = params.get(param)
    if ex_args is not None:
        assert isinstance(ex_args, list)
        return ex_args
    return default
2313
2314
class ISO639Utils(object):
    """ Conversion between two-letter and three-letter language codes. """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps the two-letter code to the three-letter code
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at; None is
        returned for unknown codes."""
        short_code = code[:2]
        return cls._lang_map.get(short_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None for unknown codes."""
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
2515
2516
2517 class ISO3166Utils(object):
2518     # From http://data.okfn.org/data/core/country-list
2519     _country_map = {
2520         'AF': 'Afghanistan',
2521         'AX': 'Åland Islands',
2522         'AL': 'Albania',
2523         'DZ': 'Algeria',
2524         'AS': 'American Samoa',
2525         'AD': 'Andorra',
2526         'AO': 'Angola',
2527         'AI': 'Anguilla',
2528         'AQ': 'Antarctica',
2529         'AG': 'Antigua and Barbuda',
2530         'AR': 'Argentina',
2531         'AM': 'Armenia',
2532         'AW': 'Aruba',
2533         'AU': 'Australia',
2534         'AT': 'Austria',
2535         'AZ': 'Azerbaijan',
2536         'BS': 'Bahamas',
2537         'BH': 'Bahrain',
2538         'BD': 'Bangladesh',
2539         'BB': 'Barbados',
2540         'BY': 'Belarus',
2541         'BE': 'Belgium',
2542         'BZ': 'Belize',
2543         'BJ': 'Benin',
2544         'BM': 'Bermuda',
2545         'BT': 'Bhutan',
2546         'BO': 'Bolivia, Plurinational State of',
2547         'BQ': 'Bonaire, Sint Eustatius and Saba',
2548         'BA': 'Bosnia and Herzegovina',
2549         'BW': 'Botswana',
2550         'BV': 'Bouvet Island',
2551         'BR': 'Brazil',
2552         'IO': 'British Indian Ocean Territory',
2553         'BN': 'Brunei Darussalam',
2554         'BG': 'Bulgaria',
2555         'BF': 'Burkina Faso',
2556         'BI': 'Burundi',
2557         'KH': 'Cambodia',
2558         'CM': 'Cameroon',
2559         'CA': 'Canada',
2560         'CV': 'Cape Verde',
2561         'KY': 'Cayman Islands',
2562         'CF': 'Central African Republic',
2563         'TD': 'Chad',
2564         'CL': 'Chile',
2565         'CN': 'China',
2566         'CX': 'Christmas Island',
2567         'CC': 'Cocos (Keeling) Islands',
2568         'CO': 'Colombia',
2569         'KM': 'Comoros',
2570         'CG': 'Congo',
2571         'CD': 'Congo, the Democratic Republic of the',
2572         'CK': 'Cook Islands',
2573         'CR': 'Costa Rica',
2574         'CI': 'Côte d\'Ivoire',
2575         'HR': 'Croatia',
2576         'CU': 'Cuba',
2577         'CW': 'Curaçao',
2578         'CY': 'Cyprus',
2579         'CZ': 'Czech Republic',
2580         'DK': 'Denmark',
2581         'DJ': 'Djibouti',
2582         'DM': 'Dominica',
2583         'DO': 'Dominican Republic',
2584         'EC': 'Ecuador',
2585         'EG': 'Egypt',
2586         'SV': 'El Salvador',
2587         'GQ': 'Equatorial Guinea',
2588         'ER': 'Eritrea',
2589         'EE': 'Estonia',
2590         'ET': 'Ethiopia',
2591         'FK': 'Falkland Islands (Malvinas)',
2592         'FO': 'Faroe Islands',
2593         'FJ': 'Fiji',
2594         'FI': 'Finland',
2595         'FR': 'France',
2596         'GF': 'French Guiana',
2597         'PF': 'French Polynesia',
2598         'TF': 'French Southern Territories',
2599         'GA': 'Gabon',
2600         'GM': 'Gambia',
2601         'GE': 'Georgia',
2602         'DE': 'Germany',
2603         'GH': 'Ghana',
2604         'GI': 'Gibraltar',
2605         'GR': 'Greece',
2606         'GL': 'Greenland',
2607         'GD': 'Grenada',
2608         'GP': 'Guadeloupe',
2609         'GU': 'Guam',
2610         'GT': 'Guatemala',
2611         'GG': 'Guernsey',
2612         'GN': 'Guinea',
2613         'GW': 'Guinea-Bissau',
2614         'GY': 'Guyana',
2615         'HT': 'Haiti',
2616         'HM': 'Heard Island and McDonald Islands',
2617         'VA': 'Holy See (Vatican City State)',
2618         'HN': 'Honduras',
2619         'HK': 'Hong Kong',
2620         'HU': 'Hungary',
2621         'IS': 'Iceland',
2622         'IN': 'India',
2623         'ID': 'Indonesia',
2624         'IR': 'Iran, Islamic Republic of',
2625         'IQ': 'Iraq',
2626         'IE': 'Ireland',
2627         'IM': 'Isle of Man',
2628         'IL': 'Israel',
2629         'IT': 'Italy',
2630         'JM': 'Jamaica',
2631         'JP': 'Japan',
2632         'JE': 'Jersey',
2633         'JO': 'Jordan',
2634         'KZ': 'Kazakhstan',
2635         'KE': 'Kenya',
2636         'KI': 'Kiribati',
2637         'KP': 'Korea, Democratic People\'s Republic of',
2638         'KR': 'Korea, Republic of',
2639         'KW': 'Kuwait',
2640         'KG': 'Kyrgyzstan',
2641         'LA': 'Lao People\'s Democratic Republic',
2642         'LV': 'Latvia',
2643         'LB': 'Lebanon',
2644         'LS': 'Lesotho',
2645         'LR': 'Liberia',
2646         'LY': 'Libya',
2647         'LI': 'Liechtenstein',
2648         'LT': 'Lithuania',
2649         'LU': 'Luxembourg',
2650         'MO': 'Macao',
2651         'MK': 'Macedonia, the Former Yugoslav Republic of',
2652         'MG': 'Madagascar',
2653         'MW': 'Malawi',
2654         'MY': 'Malaysia',
2655         'MV': 'Maldives',
2656         'ML': 'Mali',
2657         'MT': 'Malta',
2658         'MH': 'Marshall Islands',
2659         'MQ': 'Martinique',
2660         'MR': 'Mauritania',
2661         'MU': 'Mauritius',
2662         'YT': 'Mayotte',
2663         'MX': 'Mexico',
2664         'FM': 'Micronesia, Federated States of',
2665         'MD': 'Moldova, Republic of',
2666         'MC': 'Monaco',
2667         'MN': 'Mongolia',
2668         'ME': 'Montenegro',
2669         'MS': 'Montserrat',
2670         'MA': 'Morocco',
2671         'MZ': 'Mozambique',
2672         'MM': 'Myanmar',
2673         'NA': 'Namibia',
2674         'NR': 'Nauru',
2675         'NP': 'Nepal',
2676         'NL': 'Netherlands',
2677         'NC': 'New Caledonia',
2678         'NZ': 'New Zealand',
2679         'NI': 'Nicaragua',
2680         'NE': 'Niger',
2681         'NG': 'Nigeria',
2682         'NU': 'Niue',
2683         'NF': 'Norfolk Island',
2684         'MP': 'Northern Mariana Islands',
2685         'NO': 'Norway',
2686         'OM': 'Oman',
2687         'PK': 'Pakistan',
2688         'PW': 'Palau',
2689         'PS': 'Palestine, State of',
2690         'PA': 'Panama',
2691         'PG': 'Papua New Guinea',
2692         'PY': 'Paraguay',
2693         'PE': 'Peru',
2694         'PH': 'Philippines',
2695         'PN': 'Pitcairn',
2696         'PL': 'Poland',
2697         'PT': 'Portugal',
2698         'PR': 'Puerto Rico',
2699         'QA': 'Qatar',
2700         'RE': 'Réunion',
2701         'RO': 'Romania',
2702         'RU': 'Russian Federation',
2703         'RW': 'Rwanda',
2704         'BL': 'Saint Barthélemy',
2705         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2706         'KN': 'Saint Kitts and Nevis',
2707         'LC': 'Saint Lucia',
2708         'MF': 'Saint Martin (French part)',
2709         'PM': 'Saint Pierre and Miquelon',
2710         'VC': 'Saint Vincent and the Grenadines',
2711         'WS': 'Samoa',
2712         'SM': 'San Marino',
2713         'ST': 'Sao Tome and Principe',
2714         'SA': 'Saudi Arabia',
2715         'SN': 'Senegal',
2716         'RS': 'Serbia',
2717         'SC': 'Seychelles',
2718         'SL': 'Sierra Leone',
2719         'SG': 'Singapore',
2720         'SX': 'Sint Maarten (Dutch part)',
2721         'SK': 'Slovakia',
2722         'SI': 'Slovenia',
2723         'SB': 'Solomon Islands',
2724         'SO': 'Somalia',
2725         'ZA': 'South Africa',
2726         'GS': 'South Georgia and the South Sandwich Islands',
2727         'SS': 'South Sudan',
2728         'ES': 'Spain',
2729         'LK': 'Sri Lanka',
2730         'SD': 'Sudan',
2731         'SR': 'Suriname',
2732         'SJ': 'Svalbard and Jan Mayen',
2733         'SZ': 'Swaziland',
2734         'SE': 'Sweden',
2735         'CH': 'Switzerland',
2736         'SY': 'Syrian Arab Republic',
2737         'TW': 'Taiwan, Province of China',
2738         'TJ': 'Tajikistan',
2739         'TZ': 'Tanzania, United Republic of',
2740         'TH': 'Thailand',
2741         'TL': 'Timor-Leste',
2742         'TG': 'Togo',
2743         'TK': 'Tokelau',
2744         'TO': 'Tonga',
2745         'TT': 'Trinidad and Tobago',
2746         'TN': 'Tunisia',
2747         'TR': 'Turkey',
2748         'TM': 'Turkmenistan',
2749         'TC': 'Turks and Caicos Islands',
2750         'TV': 'Tuvalu',
2751         'UG': 'Uganda',
2752         'UA': 'Ukraine',
2753         'AE': 'United Arab Emirates',
2754         'GB': 'United Kingdom',
2755         'US': 'United States',
2756         'UM': 'United States Minor Outlying Islands',
2757         'UY': 'Uruguay',
2758         'UZ': 'Uzbekistan',
2759         'VU': 'Vanuatu',
2760         'VE': 'Venezuela, Bolivarian Republic of',
2761         'VN': 'Viet Nam',
2762         'VG': 'Virgin Islands, British',
2763         'VI': 'Virgin Islands, U.S.',
2764         'WF': 'Wallis and Futuna',
2765         'EH': 'Western Sahara',
2766         'YE': 'Yemen',
2767         'ZM': 'Zambia',
2768         'ZW': 'Zimbabwe',
2769     }
2770
2771     @classmethod
2772     def short2full(cls, code):
2773         """Convert an ISO 3166-2 country code to the corresponding full name"""
2774         return cls._country_map.get(code.upper())
2775
2776
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets an individual request choose its own proxy.

    A request may carry a 'Ytdl-request-proxy' header; when present, its
    value replaces the proxy that would otherwise be used for that request.
    """

    def __init__(self, proxies=None):
        # Install default http/https openers that go through proxy_open()
        # with the '__noproxy__' sentinel, then let the base class
        # initializer run with the supplied proxies.
        for scheme in ('http', 'https'):
            opener_name = '%s_open' % scheme
            setattr(
                self, opener_name,
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Route req through the proxy selected for it.

        Returns None (no proxying done here) when the effective proxy is
        the '__noproxy__' sentinel or a SOCKS URL; otherwise returns
        whatever the base class proxy_open() returns.
        """
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            # The override header is consumed here and removed from the
            # request headers.
            del req.headers['Ytdl-request-proxy']
            proxy = per_request_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # youtube-dl's http/https handlers do wrapping the socket with
            # socks; the proxy URL is only recorded on the request here.
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2800
2801
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA scheme.
    See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The byte order of the input is reversed before it is read as one
    # big hexadecimal number.
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '%x' % cipher_number
2817
2818
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer as a string in base n.

    Input:
        num: integer to encode; must not be negative
        n: the base; must be at least 2 when num is non-zero
        table: optional string of digit characters, indexed by digit
            value; defaults to the first n characters of
            0-9, a-z, A-Z (so at most base 62 without a custom table)
    Output: string of digits, most significant digit first

    Raises ValueError when n exceeds the table length, when num is
    negative, or when n is below 2 for a non-zero num.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Floor division never brings a negative number to zero (it settles
    # at -1), so the digit loop below would never terminate.
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    # With base 1, num // n == num, which would also loop forever.
    if n < 2:
        raise ValueError('base %d cannot encode a non-zero number' % n)

    # Digits are produced least significant first, then reversed.
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
2835
2836
def decode_packed_codes(code):
    """Expand JavaScript packed as }('payload',base,count,'a|b|c'.split('|').

    Every word in the payload that is the base-`base` encoding of an index
    below `count` is replaced by the symbol at that index. An empty symbol
    means the word stands for itself. Words with no entry in the symbol
    table are left unchanged.

    Raises AttributeError when `code` contains no packed payload, because
    the regular expression search then returns None.
    """
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Never index past the symbol list, even if the declared count is
    # larger than the number of symbols actually supplied.
    symbol_table = {}
    for index in range(min(count, len(symbols))):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    def _replace(match):
        # Fall back to the word itself instead of raising KeyError: the
        # pattern can match words that were never encoded (for instance
        # non-ASCII words, since \w is Unicode-aware for str patterns).
        word = match.group(0)
        return symbol_table.get(word, word)

    return re.sub(r'\b(\w+)\b', _replace, obfuscated_code)
2855
2856
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list such as 'A=1,B="x,y"' into a dict.

    Keys consist of upper-case letters, digits and '-'. A value is either
    a non-empty double-quoted string (returned without the quotes, and
    allowed to contain commas) or a run of characters containing neither
    '"' nor ','. When a key occurs more than once, the last value wins.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    attributes = {}
    for match in re.finditer(attribute_re, attrib):
        name, raw_value = match.group('key', 'val')
        # Quoted values are stored without their surrounding quotes.
        attributes[name] = raw_value[1:-1] if raw_value.startswith('"') else raw_value
    return attributes