Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import ctypes
   5 import datetime
   6 import email.utils
   7 import errno
   8 import gzip
   9 import itertools
  10 import io
  11 import json
  12 import locale
  13 import math
  14 import os
  15 import pipes
  16 import platform
  17 import re
  18 import ssl
  19 import socket
  20 import struct
  21 import subprocess
  22 import sys
  23 import traceback
  24 import zlib
  25
  26 try:
  27     import urllib.request as compat_urllib_request
  28 except ImportError: # Python 2
  29     import urllib2 as compat_urllib_request
  30
  31 try:
  32     import urllib.error as compat_urllib_error
  33 except ImportError: # Python 2
  34     import urllib2 as compat_urllib_error
  35
  36 try:
  37     import urllib.parse as compat_urllib_parse
  38 except ImportError: # Python 2
  39     import urllib as compat_urllib_parse
  40
  41 try:
  42     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  43 except ImportError: # Python 2
  44     from urlparse import urlparse as compat_urllib_parse_urlparse
  45
  46 try:
  47     import urllib.parse as compat_urlparse
  48 except ImportError: # Python 2
  49     import urlparse as compat_urlparse
  50
  51 try:
  52     import http.cookiejar as compat_cookiejar
  53 except ImportError: # Python 2
  54     import cookielib as compat_cookiejar
  55
  56 try:
  57     import html.entities as compat_html_entities
  58 except ImportError: # Python 2
  59     import htmlentitydefs as compat_html_entities
  60
  61 try:
  62     import html.parser as compat_html_parser
  63 except ImportError: # Python 2
  64     import HTMLParser as compat_html_parser
  65
  66 try:
  67     import http.client as compat_http_client
  68 except ImportError: # Python 2
  69     import httplib as compat_http_client
  70
  71 try:
  72     from urllib.error import HTTPError as compat_HTTPError
  73 except ImportError:  # Python 2
  74     from urllib2 import HTTPError as compat_HTTPError
  75
  76 try:
  77     from urllib.request import urlretrieve as compat_urlretrieve
  78 except ImportError:  # Python 2
  79     from urllib import urlretrieve as compat_urlretrieve
  80
  81
  82 try:
  83     from subprocess import DEVNULL
  84     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  85 except ImportError:
  86     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  87
  88 try:
  89     from urllib.parse import parse_qs as compat_parse_qs
  90 except ImportError: # Python 2
  91     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  92     # Python 2's version is apparently totally broken
  93     def _unquote(string, encoding='utf-8', errors='replace'):
  94         if string == '':
  95             return string
  96         res = string.split('%')
  97         if len(res) == 1:
  98             return string
  99         if encoding is None:
 100             encoding = 'utf-8'
 101         if errors is None:
 102             errors = 'replace'
 103         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
 104         pct_sequence = b''
 105         string = res[0]
 106         for item in res[1:]:
 107             try:
 108                 if not item:
 109                     raise ValueError
 110                 pct_sequence += item[:2].decode('hex')
 111                 rest = item[2:]
 112                 if not rest:
 113                     # This segment was just a single percent-encoded character.
 114                     # May be part of a sequence of code units, so delay decoding.
 115                     # (Stored in pct_sequence).
 116                     continue
 117             except ValueError:
 118                 rest = '%' + item
 119             # Encountered non-percent-encoded characters. Flush the current
 120             # pct_sequence.
 121             string += pct_sequence.decode(encoding, errors) + rest
 122             pct_sequence = b''
 123         if pct_sequence:
 124             # Flush the final pct_sequence
 125             string += pct_sequence.decode(encoding, errors)
 126         return string
 127
 128     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 129                 encoding='utf-8', errors='replace'):
 130         qs, _coerce_result = qs, unicode
 131         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 132         r = []
 133         for name_value in pairs:
 134             if not name_value and not strict_parsing:
 135                 continue
 136             nv = name_value.split('=', 1)
 137             if len(nv) != 2:
 138                 if strict_parsing:
 139                     raise ValueError("bad query field: %r" % (name_value,))
 140                 # Handle case of a control-name with no equal sign
 141                 if keep_blank_values:
 142                     nv.append('')
 143                 else:
 144                     continue
 145             if len(nv[1]) or keep_blank_values:
 146                 name = nv[0].replace('+', ' ')
 147                 name = _unquote(name, encoding=encoding, errors=errors)
 148                 name = _coerce_result(name)
 149                 value = nv[1].replace('+', ' ')
 150                 value = _unquote(value, encoding=encoding, errors=errors)
 151                 value = _coerce_result(value)
 152                 r.append((name, value))
 153         return r
 154
 155     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 156                 encoding='utf-8', errors='replace'):
 157         parsed_result = {}
 158         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 159                         encoding=encoding, errors=errors)
 160         for name, value in pairs:
 161             if name in parsed_result:
 162                 parsed_result[name].append(value)
 163             else:
 164                 parsed_result[name] = [value]
 165         return parsed_result
 166
 167 try:
 168     compat_str = unicode # Python 2
 169 except NameError:
 170     compat_str = str
 171
 172 try:
 173     compat_chr = unichr # Python 2
 174 except NameError:
 175     compat_chr = chr
 176
 177 def compat_ord(c):
 178     if type(c) is int: return c
 179     else: return ord(c)
 180
# Type of a compiled regular expression object, obtained by compiling an
# empty pattern. This is not clearly defined otherwise.
compiled_regex_type = type(re.compile(''))
 183
# Default HTTP headers. YoutubeDLHandler.http_request() adds each of these to
# every request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 191
 192 def preferredencoding():
 193     """Get preferred encoding.
 194
 195     Returns the best encoding scheme for the system, based on
 196     locale.getpreferredencoding() and some further tweaks.
 197     """
 198     try:
 199         pref = locale.getpreferredencoding()
 200         u'TEST'.encode(pref)
 201     except:
 202         pref = 'UTF-8'
 203
 204     return pref
 205
 206 if sys.version_info < (3,0):
 207     def compat_print(s):
 208         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 209 else:
 210     def compat_print(s):
 211         assert type(s) == type(u'')
 212         print(s)
 213
 214 # In Python 2.x, json.dump expects a bytestream.
 215 # In Python 3.x, it writes to a character stream
 216 if sys.version_info < (3,0):
 217     def write_json_file(obj, fn):
 218         with open(fn, 'wb') as f:
 219             json.dump(obj, f)
 220 else:
 221     def write_json_file(obj, fn):
 222         with open(fn, 'w', encoding='utf-8') as f:
 223             json.dump(obj, f)
 224
 225 if sys.version_info >= (2,7):
 226     def find_xpath_attr(node, xpath, key, val):
 227         """ Find the xpath xpath[@key=val] """
 228         assert re.match(r'^[a-zA-Z]+$', key)
 229         assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
 230         expr = xpath + u"[@%s='%s']" % (key, val)
 231         return node.find(expr)
 232 else:
 233     def find_xpath_attr(node, xpath, key, val):
 234         for f in node.findall(xpath):
 235             if f.attrib.get(key) == val:
 236                 return f
 237         return None
 238
 239 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 240 # the namespace parameter
 241 def xpath_with_ns(path, ns_map):
 242     components = [c.split(':') for c in path.split('/')]
 243     replaced = []
 244     for c in components:
 245         if len(c) == 1:
 246             replaced.append(c[0])
 247         else:
 248             ns, tag = c
 249             replaced.append('{%s}%s' % (ns_map[ns], tag))
 250     return '/'.join(replaced)
 251
 252 def htmlentity_transform(matchobj):
 253     """Transforms an HTML entity to a character.
 254
 255     This function receives a match object and is intended to be used with
 256     the re.sub() function.
 257     """
 258     entity = matchobj.group(1)
 259
 260     # Known non-numeric HTML entity
 261     if entity in compat_html_entities.name2codepoint:
 262         return compat_chr(compat_html_entities.name2codepoint[entity])
 263
 264     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 265     if mobj is not None:
 266         numstr = mobj.group(1)
 267         if numstr.startswith(u'x'):
 268             base = 16
 269             numstr = u'0%s' % numstr
 270         else:
 271             base = 10
 272         return compat_chr(int(numstr, base))
 273
 274     # Unknown entity in name, return its literal representation
 275     return (u'&%s;' % entity)
 276
# Replace the html parser module's module-level `locatestarttagend` pattern
# with the one below (a backported bugfix, per the trailing note).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 278 class BaseHTMLParser(compat_html_parser.HTMLParser):
 279     def __init(self):
 280         compat_html_parser.HTMLParser.__init__(self)
 281         self.html = None
 282
 283     def loads(self, html):
 284         self.html = html
 285         self.feed(html)
 286         self.close()
 287
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    While parsing, self.result grows to [tag, start_pos, end_pos], where both
    positions are the (line, column) tuples reported by getpos().
    get_result() uses them to cut the tag's content out of the original text.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None
        # True between the matching start tag and its closing tag
        self.started = False
        # Open-tag counter per tag name, maintained only while started
        self.depth = {}
        # True until the position right after the matching start tag is stored
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error unless it is fatal.

        Raises HTMLParseError after more than 10 errors, or when the error
        occurs inside the element being captured. Otherwise parsing restarts
        on the text after the current line.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Begin capturing at the matching tag and count open tags while capturing."""
        attrs = dict(attrs)
        if self.started:
            # A nested tag follows the matching start tag: record the start
            # position now if it has not been recorded yet.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Stop capturing once the captured tag's open count drops back to zero."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser callback only serves to record the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end positions.

        Returns None when no matching tag was found or when the start or end
        position was never recorded.
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() line numbers are 1-based, columns are 0-based
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end column has to be
            # shifted by the characters already cut from the front.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, parse_endtag is overridden so that the
# literal text </scr'+'ipt> is skipped (the index just past it is returned)
# instead of being handed to the stock HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 353
 354 def get_element_by_id(id, html):
 355     """Return the content of the tag with the specified ID in the passed HTML document"""
 356     return get_element_by_attribute("id", id, html)
 357
 358 def get_element_by_attribute(attribute, value, html):
 359     """Return the content of the tag with the specified attribute in the passed HTML document"""
 360     parser = AttrParser(attribute, value)
 361     try:
 362         parser.loads(html)
 363     except compat_html_parser.HTMLParseError:
 364         pass
 365     return parser.get_result()
 366
 367 class MetaParser(BaseHTMLParser):
 368     """
 369     Modified HTMLParser that isolates a meta tag with the specified name
 370     attribute.
 371     """
 372     def __init__(self, name):
 373         BaseHTMLParser.__init__(self)
 374         self.name = name
 375         self.content = None
 376         self.result = None
 377
 378     def handle_starttag(self, tag, attrs):
 379         if tag != 'meta':
 380             return
 381         attrs = dict(attrs)
 382         if attrs.get('name') == self.name:
 383             self.result = attrs.get('content')
 384
 385     def get_result(self):
 386         return self.result
 387
 388 def get_meta_content(name, html):
 389     """
 390     Return the content attribute from the meta tag with the given name attribute.
 391     """
 392     parser = MetaParser(name)
 393     try:
 394         parser.loads(html)
 395     except compat_html_parser.HTMLParseError:
 396         pass
 397     return parser.get_result()
 398
 399
 400 def clean_html(html):
 401     """Clean an HTML snippet into a readable string"""
 402     # Newline vs <br />
 403     html = html.replace('\n', ' ')
 404     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 405     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 406     # Strip html tags
 407     html = re.sub('<.*?>', '', html)
 408     # Replace html entities
 409     html = unescapeHTML(html)
 410     return html.strip()
 411
 412
 413 def sanitize_open(filename, open_mode):
 414     """Try to open the given filename, and slightly tweak it if this fails.
 415
 416     Attempts to open the given filename. If this fails, it tries to change
 417     the filename slightly, step by step, until it's either able to open it
 418     or it fails and raises a final exception, like the standard open()
 419     function.
 420
 421     It returns the tuple (stream, definitive_file_name).
 422     """
 423     try:
 424         if filename == u'-':
 425             if sys.platform == 'win32':
 426                 import msvcrt
 427                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 428             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 429         stream = open(encodeFilename(filename), open_mode)
 430         return (stream, filename)
 431     except (IOError, OSError) as err:
 432         if err.errno in (errno.EACCES,):
 433             raise
 434
 435         # In case of error, try to remove win32 forbidden chars
 436         alt_filename = os.path.join(
 437                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 438                         for path_part in os.path.split(filename)
 439                        )
 440         if alt_filename == filename:
 441             raise
 442         else:
 443             # An exception here should be caught in the caller
 444             stream = open(encodeFilename(filename), open_mode)
 445             return (stream, alt_filename)
 446
 447
 448 def timeconvert(timestr):
 449     """Convert RFC 2822 defined time string into system timestamp"""
 450     timestamp = None
 451     timetuple = email.utils.parsedate_tz(timestr)
 452     if timetuple is not None:
 453         timestamp = email.utils.mktime_tz(timetuple)
 454     return timestamp
 455
 456 def sanitize_filename(s, restricted=False, is_id=False):
 457     """Sanitizes a string so it could be used as part of a filename.
 458     If restricted is set, use a stricter subset of allowed characters.
 459     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 460     """
 461     def replace_insane(char):
 462         if char == '?' or ord(char) < 32 or ord(char) == 127:
 463             return ''
 464         elif char == '"':
 465             return '' if restricted else '\''
 466         elif char == ':':
 467             return '_-' if restricted else ' -'
 468         elif char in '\\/|*<>':
 469             return '_'
 470         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 471             return '_'
 472         if restricted and ord(char) > 127:
 473             return '_'
 474         return char
 475
 476     result = u''.join(map(replace_insane, s))
 477     if not is_id:
 478         while '__' in result:
 479             result = result.replace('__', '_')
 480         result = result.strip('_')
 481         # Common case of "Foreign band name - English song title"
 482         if restricted and result.startswith('-_'):
 483             result = result[2:]
 484         if not result:
 485             result = '_'
 486     return result
 487
 488 def orderedSet(iterable):
 489     """ Remove all duplicates from the input iterable """
 490     res = []
 491     for el in iterable:
 492         if el not in res:
 493             res.append(el)
 494     return res
 495
 496 def unescapeHTML(s):
 497     """
 498     @param s a string
 499     """
 500     assert type(s) == type(u'')
 501
 502     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 503     return result
 504
 505
 506 def encodeFilename(s, for_subprocess=False):
 507     """
 508     @param s The name of the file
 509     """
 510
 511     assert type(s) == compat_str
 512
 513     # Python 3 has a Unicode API
 514     if sys.version_info >= (3, 0):
 515         return s
 516
 517     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 518         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 519         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 520         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 521         if not for_subprocess:
 522             return s
 523         else:
 524             # For subprocess calls, encode with locale encoding
 525             # Refer to http://stackoverflow.com/a/9951851/35070
 526             encoding = preferredencoding()
 527     else:
 528         encoding = sys.getfilesystemencoding()
 529     if encoding is None:
 530         encoding = 'utf-8'
 531     return s.encode(encoding, 'ignore')
 532
 533
 534 def decodeOption(optval):
 535     if optval is None:
 536         return optval
 537     if isinstance(optval, bytes):
 538         optval = optval.decode(preferredencoding())
 539
 540     assert isinstance(optval, compat_str)
 541     return optval
 542
 543 def formatSeconds(secs):
 544     if secs > 3600:
 545         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 546     elif secs > 60:
 547         return '%d:%02d' % (secs // 60, secs % 60)
 548     else:
 549         return '%d' % secs
 550
 551
 552 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 553     if sys.version_info < (3, 2):
 554         import httplib
 555
 556         class HTTPSConnectionV3(httplib.HTTPSConnection):
 557             def __init__(self, *args, **kwargs):
 558                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 559
 560             def connect(self):
 561                 sock = socket.create_connection((self.host, self.port), self.timeout)
 562                 if getattr(self, '_tunnel_host', False):
 563                     self.sock = sock
 564                     self._tunnel()
 565                 try:
 566                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 567                 except ssl.SSLError:
 568                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 569
 570         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 571             def https_open(self, req):
 572                 return self.do_open(HTTPSConnectionV3, req)
 573         return HTTPSHandlerV3(**kwargs)
 574     else:
 575         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 576         context.verify_mode = (ssl.CERT_NONE
 577                                if opts_no_check_certificate
 578                                else ssl.CERT_REQUIRED)
 579         context.set_default_verify_paths()
 580         try:
 581             context.load_default_certs()
 582         except AttributeError:
 583             pass  # Python < 3.4
 584         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 585
 586 class ExtractorError(Exception):
 587     """Error during info extraction."""
 588     def __init__(self, msg, tb=None, expected=False, cause=None):
 589         """ tb, if given, is the original traceback (so that it can be printed out).
 590         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 591         """
 592
 593         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 594             expected = True
 595         if not expected:
 596             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 597         super(ExtractorError, self).__init__(msg)
 598
 599         self.traceback = tb
 600         self.exc_info = sys.exc_info()  # preserve original exception
 601         self.cause = cause
 602
 603     def format_traceback(self):
 604         if self.traceback is None:
 605             return None
 606         return u''.join(traceback.format_tb(self.traceback))
 607
 608
 609 class RegexNotFoundError(ExtractorError):
 610     """Error when a regex didn't match"""
 611     pass
 612
 613
 614 class DownloadError(Exception):
 615     """Download Error exception.
 616
 617     This exception may be thrown by FileDownloader objects if they are not
 618     configured to continue on errors. They will contain the appropriate
 619     error message.
 620     """
 621     def __init__(self, msg, exc_info=None):
 622         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 623         super(DownloadError, self).__init__(msg)
 624         self.exc_info = exc_info
 625
 626
 627 class SameFileError(Exception):
 628     """Same File exception.
 629
 630     This exception will be thrown by FileDownloader objects if they detect
 631     multiple files would have to be downloaded to the same file on disk.
 632     """
 633     pass
 634
 635
 636 class PostProcessingError(Exception):
 637     """Post Processing exception.
 638
 639     This exception may be raised by PostProcessor's .run() method to
 640     indicate an error in the postprocessing task.
 641     """
 642     def __init__(self, msg):
 643         self.msg = msg
 644
 645 class MaxDownloadsReached(Exception):
 646     """ --max-downloads limit has been reached. """
 647     pass
 648
 649
 650 class UnavailableVideoError(Exception):
 651     """Unavailable Format exception.
 652
 653     This exception will be thrown when a video is requested
 654     in a format that is not available for that video.
 655     """
 656     pass
 657
 658
 659 class ContentTooShortError(Exception):
 660     """Content Too Short exception.
 661
 662     This exception may be raised by FileDownloader objects when a file they
 663     download is too small for what the server announced first, indicating
 664     the connection was probably interrupted.
 665     """
 666     # Both in bytes
 667     downloaded = None
 668     expected = None
 669
 670     def __init__(self, downloaded, expected):
 671         self.downloaded = downloaded
 672         self.expected = expected
 673
 674 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 675     """Handler for HTTP requests and responses.
 676
 677     This class, when installed with an OpenerDirector, automatically adds
 678     the standard headers to every HTTP request and handles gzipped and
 679     deflated responses from web servers. If compression is to be avoided in
 680     a particular request, the original request in the program code only has
 681     to include the HTTP header "Youtubedl-No-Compression", which will be
 682     removed before making the real request.
 683
 684     Part of this code was copied from:
 685
 686     http://techknack.net/python-urllib2-handlers/
 687
 688     Andrew Rowls, the author of that code, agreed to release it to the
 689     public domain.
 690     """
 691
 692     @staticmethod
 693     def deflate(data):
 694         try:
 695             return zlib.decompress(data, -zlib.MAX_WBITS)
 696         except zlib.error:
 697             return zlib.decompress(data)
 698
 699     @staticmethod
 700     def addinfourl_wrapper(stream, headers, url, code):
 701         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 702             return compat_urllib_request.addinfourl(stream, headers, url, code)
 703         ret = compat_urllib_request.addinfourl(stream, headers, url)
 704         ret.code = code
 705         return ret
 706
 707     def http_request(self, req):
 708         for h,v in std_headers.items():
 709             if h in req.headers:
 710                 del req.headers[h]
 711             req.add_header(h, v)
 712         if 'Youtubedl-no-compression' in req.headers:
 713             if 'Accept-encoding' in req.headers:
 714                 del req.headers['Accept-encoding']
 715             del req.headers['Youtubedl-no-compression']
 716         if 'Youtubedl-user-agent' in req.headers:
 717             if 'User-agent' in req.headers:
 718                 del req.headers['User-agent']
 719             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 720             del req.headers['Youtubedl-user-agent']
 721         return req
 722
 723     def http_response(self, req, resp):
 724         old_resp = resp
 725         # gzip
 726         if resp.headers.get('Content-encoding', '') == 'gzip':
 727             content = resp.read()
 728             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 729             try:
 730                 uncompressed = io.BytesIO(gz.read())
 731             except IOError as original_ioerror:
 732                 # There may be junk add the end of the file
 733                 # See http://stackoverflow.com/q/4928560/35070 for details
 734                 for i in range(1, 1024):
 735                     try:
 736                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 737                         uncompressed = io.BytesIO(gz.read())
 738                     except IOError:
 739                         continue
 740                     break
 741                 else:
 742                     raise original_ioerror
 743             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 744             resp.msg = old_resp.msg
 745         # deflate
 746         if resp.headers.get('Content-encoding', '') == 'deflate':
 747             gz = io.BytesIO(self.deflate(resp.read()))
 748             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 749             resp.msg = old_resp.msg
 750         return resp
 751
 752     https_request = http_request
 753     https_response = http_response
 754
 755
 756 def unified_strdate(date_str):
 757     """Return a string with the date in the format YYYYMMDD"""
 758     upload_date = None
 759     #Replace commas
 760     date_str = date_str.replace(',', ' ')
 761     # %z (UTC offset) is only supported in python>=3.2
 762     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 763     format_expressions = [
 764         '%d %B %Y',
 765         '%d %b %Y',
 766         '%B %d %Y',
 767         '%b %d %Y',
 768         '%Y-%m-%d',
 769         '%d/%m/%Y',
 770         '%Y/%m/%d %H:%M:%S',
 771         '%Y-%m-%d %H:%M:%S',
 772         '%d.%m.%Y %H:%M',
 773         '%Y-%m-%dT%H:%M:%SZ',
 774         '%Y-%m-%dT%H:%M:%S.%fZ',
 775         '%Y-%m-%dT%H:%M:%S.%f0Z',
 776         '%Y-%m-%dT%H:%M:%S',
 777         '%Y-%m-%dT%H:%M',
 778     ]
 779     for expression in format_expressions:
 780         try:
 781             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 782         except ValueError:
 783             pass
 784     if upload_date is None:
 785         timetuple = email.utils.parsedate_tz(date_str)
 786         if timetuple:
 787             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 788     return upload_date
 789
 790 def determine_ext(url, default_ext=u'unknown_video'):
 791     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 792     if re.match(r'^[A-Za-z0-9]+$', guess):
 793         return guess
 794     else:
 795         return default_ext
 796
 797 def subtitles_filename(filename, sub_lang, sub_format):
 798     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 799
 800 def date_from_str(date_str):
 801     """
 802     Return a datetime object from a string in the format YYYYMMDD or
 803     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 804     today = datetime.date.today()
 805     if date_str == 'now'or date_str == 'today':
 806         return today
 807     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 808     if match is not None:
 809         sign = match.group('sign')
 810         time = int(match.group('time'))
 811         if sign == '-':
 812             time = -time
 813         unit = match.group('unit')
 814         #A bad aproximation?
 815         if unit == 'month':
 816             unit = 'day'
 817             time *= 30
 818         elif unit == 'year':
 819             unit = 'day'
 820             time *= 365
 821         unit += 's'
 822         delta = datetime.timedelta(**{unit: time})
 823         return today + delta
 824     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 825
 826 def hyphenate_date(date_str):
 827     """
 828     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 829     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 830     if match is not None:
 831         return '-'.join(match.groups())
 832     else:
 833         return date_str
 834
 835 class DateRange(object):
 836     """Represents a time interval between two dates"""
 837     def __init__(self, start=None, end=None):
 838         """start and end must be strings in the format accepted by date"""
 839         if start is not None:
 840             self.start = date_from_str(start)
 841         else:
 842             self.start = datetime.datetime.min.date()
 843         if end is not None:
 844             self.end = date_from_str(end)
 845         else:
 846             self.end = datetime.datetime.max.date()
 847         if self.start > self.end:
 848             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 849     @classmethod
 850     def day(cls, day):
 851         """Returns a range that only contains the given day"""
 852         return cls(day,day)
 853     def __contains__(self, date):
 854         """Check if the date is in the range"""
 855         if not isinstance(date, datetime.date):
 856             date = date_from_str(date)
 857         return self.start <= date <= self.end
 858     def __str__(self):
 859         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 860
 861
 862 def platform_name():
 863     """ Returns the platform name as a compat_str """
 864     res = platform.platform()
 865     if isinstance(res, bytes):
 866         res = res.decode(preferredencoding())
 867
 868     assert isinstance(res, compat_str)
 869     return res
 870
 871
 872 def write_string(s, out=None):
 873     if out is None:
 874         out = sys.stderr
 875     assert type(s) == compat_str
 876
 877     if ('b' in getattr(out, 'mode', '') or
 878             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 879         s = s.encode(preferredencoding(), 'ignore')
 880     try:
 881         out.write(s)
 882     except UnicodeEncodeError:
 883         # In Windows shells, this can fail even when the codec is just charmap!?
 884         # See https://wiki.python.org/moin/PrintFails#Issue
 885         if sys.platform == 'win32' and hasattr(out, 'encoding'):
 886             s = s.encode(out.encoding, 'ignore').decode(out.encoding)
 887             out.write(s)
 888         else:
 889             raise
 890
 891     out.flush()
 892
 893
 894 def bytes_to_intlist(bs):
 895     if not bs:
 896         return []
 897     if isinstance(bs[0], int):  # Python 3
 898         return list(bs)
 899     else:
 900         return [ord(c) for c in bs]
 901
 902
 903 def intlist_to_bytes(xs):
 904     if not xs:
 905         return b''
 906     if isinstance(chr(0), bytes):  # Python 2
 907         return ''.join([chr(x) for x in xs])
 908     else:
 909         return bytes(xs)
 910
 911
 912 def get_cachedir(params={}):
 913     cache_root = os.environ.get('XDG_CACHE_HOME',
 914                                 os.path.expanduser('~/.cache'))
 915     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 916
 917
# Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) -- lock the open file f (shared unless exclusive)
#   _unlock_file(f)          -- release a lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the OVERLAPPED argument taken by LockFileEx and
    # UnlockFileEx (see argtypes below)
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORD of the byte count handed to (Un)LockFileEx; together
    # with the zero offset set in _lock_file they describe the locked range.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f via LockFileEx, starting at offset 0.

        Raises OSError when LockFileEx reports failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can hand the very
        # same structure to UnlockFileEx.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock and 0x0 otherwise
        # NOTE(review): 0x2 is presumably LOCKFILE_EXCLUSIVE_LOCK - confirm
        # against the LockFileEx documentation.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock that _lock_file took on f.

        Raises OSError when UnlockFileEx reports failure."""
        # Only valid after _lock_file has attached the OVERLAPPED pointer
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with fcntl.lockf: LOCK_EX when exclusive,
        LOCK_SH otherwise."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock held on f (fcntl.LOCK_UN)."""
        fcntl.lockf(f, fcntl.LOCK_UN)
 981
 982
 983 class locked_file(object):
 984     def __init__(self, filename, mode, encoding=None):
 985         assert mode in ['r', 'a', 'w']
 986         self.f = io.open(filename, mode, encoding=encoding)
 987         self.mode = mode
 988
 989     def __enter__(self):
 990         exclusive = self.mode != 'r'
 991         try:
 992             _lock_file(self.f, exclusive)
 993         except IOError:
 994             self.f.close()
 995             raise
 996         return self
 997
 998     def __exit__(self, etype, value, traceback):
 999         try:
1000             _unlock_file(self.f)
1001         finally:
1002             self.f.close()
1003
1004     def __iter__(self):
1005         return iter(self.f)
1006
1007     def write(self, *args):
1008         return self.f.write(*args)
1009
1010     def read(self, *args):
1011         return self.f.read(*args)
1012
1013
def shell_quote(args):
    """Return a shell-escaped command line built from the sequence args.

    Byte string arguments (e.g. file names produced by encodeFilename) are
    decoded with the file system encoding, falling back to UTF-8 when that
    is unknown. Every argument is quoted and the results are joined with
    single spaces.
    """
    # pipes.quote is an undocumented alias and the pipes module was removed
    # in Python 3.13; shlex.quote is the supported spelling where available.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1025
1026
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but the first element for which pred is
        false is yielded too before the generator stops """
    for item in seq:
        yield item
        if pred(item):
            continue
        return
1034
1035
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialised as JSON, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'. """

    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1042
1043
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    A URL that carries no smuggled data is returned as (smug_url, default). """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
1051
1052
def format_bytes(bytes):
    """Format a byte count as a human readable string using binary units.

    bytes -- a number, a numeric string, or None

    Returns u'N/A' for None, otherwise the value with two decimals and a
    suffix from B up to YiB, e.g. u'1.50KiB'. Values below one byte are
    shown in B, values beyond the YiB range are shown in YiB, and negative
    values keep their sign.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, (str, type(u''))):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    magnitude = abs(bytes)
    if magnitude < 1.0:
        # Covers zero (log undefined) and fractions, whose negative logarithm
        # would otherwise index the suffix list from the end.
        exponent = 0
    else:
        # Clamp so that huge values use the largest unit instead of failing
        exponent = min(int(math.log(magnitude, 1024.0)), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1065
1066
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' as digit group separators."""
    without_separators = re.sub(r'[,\.]', u'', int_str)
    return int(without_separators)
1070
1071
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is consulted first; if it is unset,
    empty or not an integer, the output of 'stty size' is used instead.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS, fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort: stty may be missing or stdin may not be a terminal.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        pass
    return None
1086
1087
def month_by_name(name):
    """ Return the number (1-12) of the month with the given English name,
    independently of the locale, or None if name is not a month name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1098
1099
def fix_xml_ampersands(xml_str):
    """Replace every '&' in XML by '&amp;', except where it already starts
    one of the entities amp, lt, gt, apos, quot or a numeric character
    reference of at most four digits"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
1106
1107
def setproctitle(title):
    """Set the name of the current process to title (best effort).

    title must be a compat_str. Nothing happens when libc.so.6 cannot be
    loaded or when it offers no prctl function.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer from the encoded bytes, not from the number of
    # characters: non-ASCII titles need more than one byte per character.
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME, see prctl(2)
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1121
1122
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged when it
    does not begin with start."""
    return s[len(start):] if s.startswith(start) else s
1127
1128
def url_basename(url):
    """Return the last component of the path of url, ignoring leading and
    trailing slashes (empty string when the path has no component)."""
    parsed = compat_urlparse.urlparse(url)
    trimmed = parsed.path.strip(u'/')
    return trimmed.rpartition(u'/')[2]
1132
1133
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is HEAD"""

    def get_method(self):
        """Return the HTTP method name used for this request"""
        return 'HEAD'
1137
1138
def int_or_none(v, scale=1):
    """Return int(v) floor-divided by scale, or None when v is None."""
    if v is None:
        return v
    return int(v) // scale
1141
1142
def parse_duration(s):
    """Convert a duration such as '1:02:03', '5:07', '2m30s' or '90' into
    a number of seconds.

    Returns None when s is None or does not have a recognised form.
    """
    if s is None:
        return None

    parsed = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if parsed is None:
        return None
    hours, mins, secs = parsed.group('hours', 'mins', 'secs')
    total = int(secs)
    if mins:
        total += 60 * int(mins)
        # Hours can only be present together with minutes
        if hours:
            total += 60 * 60 * int(hours)
    return total
1157
1158
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename, e.g.
    ('abc.ext', 'temp') gives 'abc.temp.ext'."""
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1162
1163
def check_executable(exe, args=[]):
    """ Try to run the given binary and return its name if it could be
    started, False otherwise.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Wait for the process to end and discard whatever it printed
        proc.communicate()
    except OSError:
        return False
    return exe
1172
1173
class PagedList(object):
    """List-like view on entries that are fetched one page at a time.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum. A page holding fewer than pagesize entries is
    treated as the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries from index start (inclusive) up to index end
        (exclusive, None meaning "until the last entry") as a list.

        Only the pages overlapping the requested range are fetched.
        """
        if end is not None and end <= start:
            # Empty range. Without this guard a range starting exactly on a
            # page boundary (e.g. 0..0) would be read as "the whole page"
            # and the loop would go on to fetch every remaining page.
            return []

        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry within this page, or
            # None when the range extends beyond this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1219
1220
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (eight hexadecimal digits)
    in s by the character with that code point."""
    def decode_escape(m):
        return compat_chr(int(m.group(1), 16))

    return re.sub(r'\\U([0-9a-fA-F]{8})', decode_escape, s)
1225
# struct_pack / struct_unpack: like struct.pack / struct.unpack, but they
# also accept a text format specification on interpreters whose struct
# module insists on a byte string.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # The struct module copes with text specifications itself
    struct_pack = struct.pack
    struct_unpack = struct.unpack