Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import email.utils
   6 import errno
   7 import gzip
   8 import io
   9 import json
  10 import locale
  11 import os
  12 import pipes
  13 import platform
  14 import re
  15 import socket
  16 import sys
  17 import traceback
  18 import zlib
  19
  20 try:
  21     import urllib.request as compat_urllib_request
  22 except ImportError: # Python 2
  23     import urllib2 as compat_urllib_request
  24
  25 try:
  26     import urllib.error as compat_urllib_error
  27 except ImportError: # Python 2
  28     import urllib2 as compat_urllib_error
  29
  30 try:
  31     import urllib.parse as compat_urllib_parse
  32 except ImportError: # Python 2
  33     import urllib as compat_urllib_parse
  34
  35 try:
  36     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  37 except ImportError: # Python 2
  38     from urlparse import urlparse as compat_urllib_parse_urlparse
  39
  40 try:
  41     import urllib.parse as compat_urlparse
  42 except ImportError: # Python 2
  43     import urlparse as compat_urlparse
  44
  45 try:
  46     import http.cookiejar as compat_cookiejar
  47 except ImportError: # Python 2
  48     import cookielib as compat_cookiejar
  49
  50 try:
  51     import html.entities as compat_html_entities
  52 except ImportError: # Python 2
  53     import htmlentitydefs as compat_html_entities
  54
  55 try:
  56     import html.parser as compat_html_parser
  57 except ImportError: # Python 2
  58     import HTMLParser as compat_html_parser
  59
  60 try:
  61     import http.client as compat_http_client
  62 except ImportError: # Python 2
  63     import httplib as compat_http_client
  64
  65 try:
  66     from urllib.error import HTTPError as compat_HTTPError
  67 except ImportError:  # Python 2
  68     from urllib2 import HTTPError as compat_HTTPError
  69
  70 try:
  71     from urllib.request import urlretrieve as compat_urlretrieve
  72 except ImportError:  # Python 2
  73     from urllib import urlretrieve as compat_urlretrieve
  74
  75
  76 try:
  77     from subprocess import DEVNULL
  78     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  79 except ImportError:
  80     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  81
  82 try:
  83     from urllib.parse import parse_qs as compat_parse_qs
  84 except ImportError: # Python 2
  85     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  86     # Python 2's version is apparently totally broken
  87     def _unquote(string, encoding='utf-8', errors='replace'):
  88         if string == '':
  89             return string
  90         res = string.split('%')
  91         if len(res) == 1:
  92             return string
  93         if encoding is None:
  94             encoding = 'utf-8'
  95         if errors is None:
  96             errors = 'replace'
  97         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  98         pct_sequence = b''
  99         string = res[0]
 100         for item in res[1:]:
 101             try:
 102                 if not item:
 103                     raise ValueError
 104                 pct_sequence += item[:2].decode('hex')
 105                 rest = item[2:]
 106                 if not rest:
 107                     # This segment was just a single percent-encoded character.
 108                     # May be part of a sequence of code units, so delay decoding.
 109                     # (Stored in pct_sequence).
 110                     continue
 111             except ValueError:
 112                 rest = '%' + item
 113             # Encountered non-percent-encoded characters. Flush the current
 114             # pct_sequence.
 115             string += pct_sequence.decode(encoding, errors) + rest
 116             pct_sequence = b''
 117         if pct_sequence:
 118             # Flush the final pct_sequence
 119             string += pct_sequence.decode(encoding, errors)
 120         return string
 121
 122     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 123                 encoding='utf-8', errors='replace'):
 124         qs, _coerce_result = qs, unicode
 125         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 126         r = []
 127         for name_value in pairs:
 128             if not name_value and not strict_parsing:
 129                 continue
 130             nv = name_value.split('=', 1)
 131             if len(nv) != 2:
 132                 if strict_parsing:
 133                     raise ValueError("bad query field: %r" % (name_value,))
 134                 # Handle case of a control-name with no equal sign
 135                 if keep_blank_values:
 136                     nv.append('')
 137                 else:
 138                     continue
 139             if len(nv[1]) or keep_blank_values:
 140                 name = nv[0].replace('+', ' ')
 141                 name = _unquote(name, encoding=encoding, errors=errors)
 142                 name = _coerce_result(name)
 143                 value = nv[1].replace('+', ' ')
 144                 value = _unquote(value, encoding=encoding, errors=errors)
 145                 value = _coerce_result(value)
 146                 r.append((name, value))
 147         return r
 148
 149     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 150                 encoding='utf-8', errors='replace'):
 151         parsed_result = {}
 152         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 153                         encoding=encoding, errors=errors)
 154         for name, value in pairs:
 155             if name in parsed_result:
 156                 parsed_result[name].append(value)
 157             else:
 158                 parsed_result[name] = [value]
 159         return parsed_result
 160
 161 try:
 162     compat_str = unicode # Python 2
 163 except NameError:
 164     compat_str = str
 165
 166 try:
 167     compat_chr = unichr # Python 2
 168 except NameError:
 169     compat_chr = chr
 170
 171 def compat_ord(c):
 172     if type(c) is int: return c
 173     else: return ord(c)
 174
# The type of a compiled regular expression is not clearly defined otherwise,
# so it is taken from an actual compiled (empty) pattern.
compiled_regex_type = type(re.compile(''))
 177
# Standard HTTP request headers. YoutubeDLHandler.http_request adds every
# entry to each outgoing request, replacing a header stored under the same key.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 185
 186 def preferredencoding():
 187     """Get preferred encoding.
 188
 189     Returns the best encoding scheme for the system, based on
 190     locale.getpreferredencoding() and some further tweaks.
 191     """
 192     try:
 193         pref = locale.getpreferredencoding()
 194         u'TEST'.encode(pref)
 195     except:
 196         pref = 'UTF-8'
 197
 198     return pref
 199
 200 if sys.version_info < (3,0):
 201     def compat_print(s):
 202         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 203 else:
 204     def compat_print(s):
 205         assert type(s) == type(u'')
 206         print(s)
 207
 208 # In Python 2.x, json.dump expects a bytestream.
 209 # In Python 3.x, it writes to a character stream
 210 if sys.version_info < (3,0):
 211     def write_json_file(obj, fn):
 212         with open(fn, 'wb') as f:
 213             json.dump(obj, f)
 214 else:
 215     def write_json_file(obj, fn):
 216         with open(fn, 'w', encoding='utf-8') as f:
 217             json.dump(obj, f)
 218
 219 if sys.version_info >= (2,7):
 220     def find_xpath_attr(node, xpath, key, val):
 221         """ Find the xpath xpath[@key=val] """
 222         assert re.match(r'^[a-zA-Z]+$', key)
 223         assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
 224         expr = xpath + u"[@%s='%s']" % (key, val)
 225         return node.find(expr)
 226 else:
 227     def find_xpath_attr(node, xpath, key, val):
 228         for f in node.findall(xpath):
 229             if f.attrib.get(key) == val:
 230                 return f
 231         return None
 232
 233 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 234 # the namespace parameter
 235 def xpath_with_ns(path, ns_map):
 236     components = [c.split(':') for c in path.split('/')]
 237     replaced = []
 238     for c in components:
 239         if len(c) == 1:
 240             replaced.append(c[0])
 241         else:
 242             ns, tag = c
 243             replaced.append('{%s}%s' % (ns_map[ns], tag))
 244     return '/'.join(replaced)
 245
 246 def htmlentity_transform(matchobj):
 247     """Transforms an HTML entity to a character.
 248
 249     This function receives a match object and is intended to be used with
 250     the re.sub() function.
 251     """
 252     entity = matchobj.group(1)
 253
 254     # Known non-numeric HTML entity
 255     if entity in compat_html_entities.name2codepoint:
 256         return compat_chr(compat_html_entities.name2codepoint[entity])
 257
 258     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 259     if mobj is not None:
 260         numstr = mobj.group(1)
 261         if numstr.startswith(u'x'):
 262             base = 16
 263             numstr = u'0%s' % numstr
 264         else:
 265             base = 10
 266         return compat_chr(int(numstr, base))
 267
 268     # Unknown entity in name, return its literal representation
 269     return (u'&%s;' % entity)
 270
# Replace the parser module's locatestarttagend pattern with this one.
# NOTE(review): presumably copied from a newer stdlib release, as the
# "backport bugfix" remark suggests - confirm against upstream.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 272 class BaseHTMLParser(compat_html_parser.HTMLParser):
 273     def __init(self):
 274         compat_html_parser.HTMLParser.__init__(self)
 275         self.html = None
 276
 277     def loads(self, html):
 278         self.html = html
 279         self.feed(html)
 280         self.close()
 281
 282 class AttrParser(BaseHTMLParser):
 283     """Modified HTMLParser that isolates a tag with the specified attribute"""
 284     def __init__(self, attribute, value):
 285         self.attribute = attribute
 286         self.value = value
 287         self.result = None
 288         self.started = False
 289         self.depth = {}
 290         self.watch_startpos = False
 291         self.error_count = 0
 292         BaseHTMLParser.__init__(self)
 293
 294     def error(self, message):
 295         if self.error_count > 10 or self.started:
 296             raise compat_html_parser.HTMLParseError(message, self.getpos())
 297         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 298         self.error_count += 1
 299         self.goahead(1)
 300
 301     def handle_starttag(self, tag, attrs):
 302         attrs = dict(attrs)
 303         if self.started:
 304             self.find_startpos(None)
 305         if self.attribute in attrs and attrs[self.attribute] == self.value:
 306             self.result = [tag]
 307             self.started = True
 308             self.watch_startpos = True
 309         if self.started:
 310             if not tag in self.depth: self.depth[tag] = 0
 311             self.depth[tag] += 1
 312
 313     def handle_endtag(self, tag):
 314         if self.started:
 315             if tag in self.depth: self.depth[tag] -= 1
 316             if self.depth[self.result[0]] == 0:
 317                 self.started = False
 318                 self.result.append(self.getpos())
 319
 320     def find_startpos(self, x):
 321         """Needed to put the start position of the result (self.result[1])
 322         after the opening tag with the requested id"""
 323         if self.watch_startpos:
 324             self.watch_startpos = False
 325             self.result.append(self.getpos())
 326     handle_entityref = handle_charref = handle_data = handle_comment = \
 327     handle_decl = handle_pi = unknown_decl = find_startpos
 328
 329     def get_result(self):
 330         if self.result is None:
 331             return None
 332         if len(self.result) != 3:
 333             return None
 334         lines = self.html.split('\n')
 335         lines = lines[self.result[1][0]-1:self.result[2][0]]
 336         lines[0] = lines[0][self.result[1][1]:]
 337         if len(lines) == 1:
 338             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 339         lines[-1] = lines[-1][:self.result[2][1]]
 340         return '\n'.join(lines).strip()
 341 # Hack for https://github.com/rg3/youtube-dl/issues/662
 342 if sys.version_info < (2, 7, 3):
 343     AttrParser.parse_endtag = (lambda self, i:
 344         i + len("</scr'+'ipt>")
 345         if self.rawdata[i:].startswith("</scr'+'ipt>")
 346         else compat_html_parser.HTMLParser.parse_endtag(self, i))
 347
 348 def get_element_by_id(id, html):
 349     """Return the content of the tag with the specified ID in the passed HTML document"""
 350     return get_element_by_attribute("id", id, html)
 351
 352 def get_element_by_attribute(attribute, value, html):
 353     """Return the content of the tag with the specified attribute in the passed HTML document"""
 354     parser = AttrParser(attribute, value)
 355     try:
 356         parser.loads(html)
 357     except compat_html_parser.HTMLParseError:
 358         pass
 359     return parser.get_result()
 360
 361 class MetaParser(BaseHTMLParser):
 362     """
 363     Modified HTMLParser that isolates a meta tag with the specified name
 364     attribute.
 365     """
 366     def __init__(self, name):
 367         BaseHTMLParser.__init__(self)
 368         self.name = name
 369         self.content = None
 370         self.result = None
 371
 372     def handle_starttag(self, tag, attrs):
 373         if tag != 'meta':
 374             return
 375         attrs = dict(attrs)
 376         if attrs.get('name') == self.name:
 377             self.result = attrs.get('content')
 378
 379     def get_result(self):
 380         return self.result
 381
 382 def get_meta_content(name, html):
 383     """
 384     Return the content attribute from the meta tag with the given name attribute.
 385     """
 386     parser = MetaParser(name)
 387     try:
 388         parser.loads(html)
 389     except compat_html_parser.HTMLParseError:
 390         pass
 391     return parser.get_result()
 392
 393
 394 def clean_html(html):
 395     """Clean an HTML snippet into a readable string"""
 396     # Newline vs <br />
 397     html = html.replace('\n', ' ')
 398     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 399     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 400     # Strip html tags
 401     html = re.sub('<.*?>', '', html)
 402     # Replace html entities
 403     html = unescapeHTML(html)
 404     return html.strip()
 405
 406
 407 def sanitize_open(filename, open_mode):
 408     """Try to open the given filename, and slightly tweak it if this fails.
 409
 410     Attempts to open the given filename. If this fails, it tries to change
 411     the filename slightly, step by step, until it's either able to open it
 412     or it fails and raises a final exception, like the standard open()
 413     function.
 414
 415     It returns the tuple (stream, definitive_file_name).
 416     """
 417     try:
 418         if filename == u'-':
 419             if sys.platform == 'win32':
 420                 import msvcrt
 421                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 422             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 423         stream = open(encodeFilename(filename), open_mode)
 424         return (stream, filename)
 425     except (IOError, OSError) as err:
 426         if err.errno in (errno.EACCES,):
 427             raise
 428
 429         # In case of error, try to remove win32 forbidden chars
 430         alt_filename = os.path.join(
 431                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 432                         for path_part in os.path.split(filename)
 433                        )
 434         if alt_filename == filename:
 435             raise
 436         else:
 437             # An exception here should be caught in the caller
 438             stream = open(encodeFilename(filename), open_mode)
 439             return (stream, alt_filename)
 440
 441
 442 def timeconvert(timestr):
 443     """Convert RFC 2822 defined time string into system timestamp"""
 444     timestamp = None
 445     timetuple = email.utils.parsedate_tz(timestr)
 446     if timetuple is not None:
 447         timestamp = email.utils.mktime_tz(timetuple)
 448     return timestamp
 449
 450 def sanitize_filename(s, restricted=False, is_id=False):
 451     """Sanitizes a string so it could be used as part of a filename.
 452     If restricted is set, use a stricter subset of allowed characters.
 453     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 454     """
 455     def replace_insane(char):
 456         if char == '?' or ord(char) < 32 or ord(char) == 127:
 457             return ''
 458         elif char == '"':
 459             return '' if restricted else '\''
 460         elif char == ':':
 461             return '_-' if restricted else ' -'
 462         elif char in '\\/|*<>':
 463             return '_'
 464         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 465             return '_'
 466         if restricted and ord(char) > 127:
 467             return '_'
 468         return char
 469
 470     result = u''.join(map(replace_insane, s))
 471     if not is_id:
 472         while '__' in result:
 473             result = result.replace('__', '_')
 474         result = result.strip('_')
 475         # Common case of "Foreign band name - English song title"
 476         if restricted and result.startswith('-_'):
 477             result = result[2:]
 478         if not result:
 479             result = '_'
 480     return result
 481
 482 def orderedSet(iterable):
 483     """ Remove all duplicates from the input iterable """
 484     res = []
 485     for el in iterable:
 486         if el not in res:
 487             res.append(el)
 488     return res
 489
 490 def unescapeHTML(s):
 491     """
 492     @param s a string
 493     """
 494     assert type(s) == type(u'')
 495
 496     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 497     return result
 498
 499 def encodeFilename(s):
 500     """
 501     @param s The name of the file
 502     """
 503
 504     assert type(s) == type(u'')
 505
 506     # Python 3 has a Unicode API
 507     if sys.version_info >= (3, 0):
 508         return s
 509
 510     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 511         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 512         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 513         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 514         return s
 515     else:
 516         encoding = sys.getfilesystemencoding()
 517         if encoding is None:
 518             encoding = 'utf-8'
 519         return s.encode(encoding, 'ignore')
 520
 521 def decodeOption(optval):
 522     if optval is None:
 523         return optval
 524     if isinstance(optval, bytes):
 525         optval = optval.decode(preferredencoding())
 526
 527     assert isinstance(optval, compat_str)
 528     return optval
 529
 530 def formatSeconds(secs):
 531     if secs > 3600:
 532         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 533     elif secs > 60:
 534         return '%d:%02d' % (secs // 60, secs % 60)
 535     else:
 536         return '%d' % secs
 537
 538 def make_HTTPS_handler(opts):
 539     if sys.version_info < (3,2):
 540         # Python's 2.x handler is very simplistic
 541         return compat_urllib_request.HTTPSHandler()
 542     else:
 543         import ssl
 544         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 545         context.set_default_verify_paths()
 546
 547         context.verify_mode = (ssl.CERT_NONE
 548                                if opts.no_check_certificate
 549                                else ssl.CERT_REQUIRED)
 550         return compat_urllib_request.HTTPSHandler(context=context)
 551
 552 class ExtractorError(Exception):
 553     """Error during info extraction."""
 554     def __init__(self, msg, tb=None, expected=False, cause=None):
 555         """ tb, if given, is the original traceback (so that it can be printed out).
 556         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 557         """
 558
 559         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 560             expected = True
 561         if not expected:
 562             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 563         super(ExtractorError, self).__init__(msg)
 564
 565         self.traceback = tb
 566         self.exc_info = sys.exc_info()  # preserve original exception
 567         self.cause = cause
 568
 569     def format_traceback(self):
 570         if self.traceback is None:
 571             return None
 572         return u''.join(traceback.format_tb(self.traceback))
 573
 574
 575 class DownloadError(Exception):
 576     """Download Error exception.
 577
 578     This exception may be thrown by FileDownloader objects if they are not
 579     configured to continue on errors. They will contain the appropriate
 580     error message.
 581     """
 582     def __init__(self, msg, exc_info=None):
 583         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 584         super(DownloadError, self).__init__(msg)
 585         self.exc_info = exc_info
 586
 587
 588 class SameFileError(Exception):
 589     """Same File exception.
 590
 591     This exception will be thrown by FileDownloader objects if they detect
 592     multiple files would have to be downloaded to the same file on disk.
 593     """
 594     pass
 595
 596
 597 class PostProcessingError(Exception):
 598     """Post Processing exception.
 599
 600     This exception may be raised by PostProcessor's .run() method to
 601     indicate an error in the postprocessing task.
 602     """
 603     def __init__(self, msg):
 604         self.msg = msg
 605
 606 class MaxDownloadsReached(Exception):
 607     """ --max-downloads limit has been reached. """
 608     pass
 609
 610
 611 class UnavailableVideoError(Exception):
 612     """Unavailable Format exception.
 613
 614     This exception will be thrown when a video is requested
 615     in a format that is not available for that video.
 616     """
 617     pass
 618
 619
 620 class ContentTooShortError(Exception):
 621     """Content Too Short exception.
 622
 623     This exception may be raised by FileDownloader objects when a file they
 624     download is too small for what the server announced first, indicating
 625     the connection was probably interrupted.
 626     """
 627     # Both in bytes
 628     downloaded = None
 629     expected = None
 630
 631     def __init__(self, downloaded, expected):
 632         self.downloaded = downloaded
 633         self.expected = expected
 634
 635 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 636     """Handler for HTTP requests and responses.
 637
 638     This class, when installed with an OpenerDirector, automatically adds
 639     the standard headers to every HTTP request and handles gzipped and
 640     deflated responses from web servers. If compression is to be avoided in
 641     a particular request, the original request in the program code only has
 642     to include the HTTP header "Youtubedl-No-Compression", which will be
 643     removed before making the real request.
 644
 645     Part of this code was copied from:
 646
 647     http://techknack.net/python-urllib2-handlers/
 648
 649     Andrew Rowls, the author of that code, agreed to release it to the
 650     public domain.
 651     """
 652
 653     @staticmethod
 654     def deflate(data):
 655         try:
 656             return zlib.decompress(data, -zlib.MAX_WBITS)
 657         except zlib.error:
 658             return zlib.decompress(data)
 659
 660     @staticmethod
 661     def addinfourl_wrapper(stream, headers, url, code):
 662         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 663             return compat_urllib_request.addinfourl(stream, headers, url, code)
 664         ret = compat_urllib_request.addinfourl(stream, headers, url)
 665         ret.code = code
 666         return ret
 667
 668     def http_request(self, req):
 669         for h,v in std_headers.items():
 670             if h in req.headers:
 671                 del req.headers[h]
 672             req.add_header(h, v)
 673         if 'Youtubedl-no-compression' in req.headers:
 674             if 'Accept-encoding' in req.headers:
 675                 del req.headers['Accept-encoding']
 676             del req.headers['Youtubedl-no-compression']
 677         if 'Youtubedl-user-agent' in req.headers:
 678             if 'User-agent' in req.headers:
 679                 del req.headers['User-agent']
 680             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 681             del req.headers['Youtubedl-user-agent']
 682         return req
 683
 684     def http_response(self, req, resp):
 685         old_resp = resp
 686         # gzip
 687         if resp.headers.get('Content-encoding', '') == 'gzip':
 688             content = resp.read()
 689             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 690             try:
 691                 uncompressed = io.BytesIO(gz.read())
 692             except IOError as original_ioerror:
 693                 # There may be junk add the end of the file
 694                 # See http://stackoverflow.com/q/4928560/35070 for details
 695                 for i in range(1, 1024):
 696                     try:
 697                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 698                         uncompressed = io.BytesIO(gz.read())
 699                     except IOError:
 700                         continue
 701                     break
 702                 else:
 703                     raise original_ioerror
 704             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 705             resp.msg = old_resp.msg
 706         # deflate
 707         if resp.headers.get('Content-encoding', '') == 'deflate':
 708             gz = io.BytesIO(self.deflate(resp.read()))
 709             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 710             resp.msg = old_resp.msg
 711         return resp
 712
 713     https_request = http_request
 714     https_response = http_response
 715
 716 def unified_strdate(date_str):
 717     """Return a string with the date in the format YYYYMMDD"""
 718     upload_date = None
 719     #Replace commas
 720     date_str = date_str.replace(',',' ')
 721     # %z (UTC offset) is only supported in python>=3.2
 722     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
 723     format_expressions = [
 724         '%d %B %Y',
 725         '%B %d %Y',
 726         '%b %d %Y',
 727         '%Y-%m-%d',
 728         '%d/%m/%Y',
 729         '%Y/%m/%d %H:%M:%S',
 730         '%d.%m.%Y %H:%M',
 731         '%Y-%m-%dT%H:%M:%SZ',
 732         '%Y-%m-%dT%H:%M:%S',
 733     ]
 734     for expression in format_expressions:
 735         try:
 736             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 737         except:
 738             pass
 739     return upload_date
 740
 741 def determine_ext(url, default_ext=u'unknown_video'):
 742     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 743     if re.match(r'^[A-Za-z0-9]+$', guess):
 744         return guess
 745     else:
 746         return default_ext
 747
 748 def subtitles_filename(filename, sub_lang, sub_format):
 749     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 750
 751 def date_from_str(date_str):
 752     """
 753     Return a datetime object from a string in the format YYYYMMDD or
 754     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 755     today = datetime.date.today()
 756     if date_str == 'now'or date_str == 'today':
 757         return today
 758     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 759     if match is not None:
 760         sign = match.group('sign')
 761         time = int(match.group('time'))
 762         if sign == '-':
 763             time = -time
 764         unit = match.group('unit')
 765         #A bad aproximation?
 766         if unit == 'month':
 767             unit = 'day'
 768             time *= 30
 769         elif unit == 'year':
 770             unit = 'day'
 771             time *= 365
 772         unit += 's'
 773         delta = datetime.timedelta(**{unit: time})
 774         return today + delta
 775     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 776
 777 class DateRange(object):
 778     """Represents a time interval between two dates"""
 779     def __init__(self, start=None, end=None):
 780         """start and end must be strings in the format accepted by date"""
 781         if start is not None:
 782             self.start = date_from_str(start)
 783         else:
 784             self.start = datetime.datetime.min.date()
 785         if end is not None:
 786             self.end = date_from_str(end)
 787         else:
 788             self.end = datetime.datetime.max.date()
 789         if self.start > self.end:
 790             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 791     @classmethod
 792     def day(cls, day):
 793         """Returns a range that only contains the given day"""
 794         return cls(day,day)
 795     def __contains__(self, date):
 796         """Check if the date is in the range"""
 797         if not isinstance(date, datetime.date):
 798             date = date_from_str(date)
 799         return self.start <= date <= self.end
 800     def __str__(self):
 801         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 802
 803
 804 def platform_name():
 805     """ Returns the platform name as a compat_str """
 806     res = platform.platform()
 807     if isinstance(res, bytes):
 808         res = res.decode(preferredencoding())
 809
 810     assert isinstance(res, compat_str)
 811     return res
 812
 813
 814 def write_string(s, out=None):
 815     if out is None:
 816         out = sys.stderr
 817     assert type(s) == type(u'')
 818
 819     if ('b' in getattr(out, 'mode', '') or
 820             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 821         s = s.encode(preferredencoding(), 'ignore')
 822     out.write(s)
 823     out.flush()
 824
 825
 826 def bytes_to_intlist(bs):
 827     if not bs:
 828         return []
 829     if isinstance(bs[0], int):  # Python 3
 830         return list(bs)
 831     else:
 832         return [ord(c) for c in bs]
 833
 834
 835 def intlist_to_bytes(xs):
 836     if not xs:
 837         return b''
 838     if isinstance(chr(0), bytes):  # Python 2
 839         return ''.join([chr(x) for x in xs])
 840     else:
 841         return bytes(xs)
 842
 843
 844 def get_cachedir(params={}):
 845     cache_root = os.environ.get('XDG_CACHE_HOME',
 846                                 os.path.expanduser('~/.cache'))
 847     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 848
 849
 850 # Cross-platform file locking
 851 if sys.platform == 'win32':
 852     import ctypes.wintypes
 853     import msvcrt
 854
 855     class OVERLAPPED(ctypes.Structure):
 856         _fields_ = [
 857             ('Internal', ctypes.wintypes.LPVOID),
 858             ('InternalHigh', ctypes.wintypes.LPVOID),
 859             ('Offset', ctypes.wintypes.DWORD),
 860             ('OffsetHigh', ctypes.wintypes.DWORD),
 861             ('hEvent', ctypes.wintypes.HANDLE),
 862         ]
 863
 864     kernel32 = ctypes.windll.kernel32
 865     LockFileEx = kernel32.LockFileEx
 866     LockFileEx.argtypes = [
 867         ctypes.wintypes.HANDLE,     # hFile
 868         ctypes.wintypes.DWORD,      # dwFlags
 869         ctypes.wintypes.DWORD,      # dwReserved
 870         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
 871         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
 872         ctypes.POINTER(OVERLAPPED)  # Overlapped
 873     ]
 874     LockFileEx.restype = ctypes.wintypes.BOOL
 875     UnlockFileEx = kernel32.UnlockFileEx
 876     UnlockFileEx.argtypes = [
 877         ctypes.wintypes.HANDLE,     # hFile
 878         ctypes.wintypes.DWORD,      # dwReserved
 879         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
 880         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
 881         ctypes.POINTER(OVERLAPPED)  # Overlapped
 882     ]
 883     UnlockFileEx.restype = ctypes.wintypes.BOOL
 884     whole_low = 0xffffffff
 885     whole_high = 0x7fffffff
 886
 887     def _lock_file(f, exclusive):
 888         overlapped = OVERLAPPED()
 889         overlapped.Offset = 0
 890         overlapped.OffsetHigh = 0
 891         overlapped.hEvent = 0
 892         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
 893         handle = msvcrt.get_osfhandle(f.fileno())
 894         if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
 895                           whole_low, whole_high, f._lock_file_overlapped_p):
 896             raise OSError('Locking file failed: %r' % ctypes.FormatError())
 897
 898     def _unlock_file(f):
 899         assert f._lock_file_overlapped_p
 900         handle = msvcrt.get_osfhandle(f.fileno())
 901         if not UnlockFileEx(handle, 0,
 902                             whole_low, whole_high, f._lock_file_overlapped_p):
 903             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
 904
 905 else:
 906     import fcntl
 907
 908     def _lock_file(f, exclusive):
 909         fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
 910
 911     def _unlock_file(f):
 912         fcntl.lockf(f, fcntl.LOCK_UN)
 913
 914
 915 class locked_file(object):
 916     def __init__(self, filename, mode, encoding=None):
 917         assert mode in ['r', 'a', 'w']
 918         self.f = io.open(filename, mode, encoding=encoding)
 919         self.mode = mode
 920
 921     def __enter__(self):
 922         exclusive = self.mode != 'r'
 923         try:
 924             _lock_file(self.f, exclusive)
 925         except IOError:
 926             self.f.close()
 927             raise
 928         return self
 929
 930     def __exit__(self, etype, value, traceback):
 931         try:
 932             _unlock_file(self.f)
 933         finally:
 934             self.f.close()
 935
 936     def __iter__(self):
 937         return iter(self.f)
 938
 939     def write(self, *args):
 940         return self.f.write(*args)
 941
 942     def read(self, *args):
 943         return self.f.read(*args)
 944
 945
 946 def shell_quote(args):
 947     return ' '.join(map(pipes.quote, args))
 948
 949
 950 def takewhile_inclusive(pred, seq):
 951     """ Like itertools.takewhile, but include the latest evaluated element
 952         (the first element so that Not pred(e)) """
 953     for e in seq:
 954         yield e
 955         if not pred(e):
 956             return
 957
 958
 959 def smuggle_url(url, data):
 960     """ Pass additional data in a URL for internal use. """
 961
 962     sdata = compat_urllib_parse.urlencode(
 963         {u'__youtubedl_smuggle': json.dumps(data)})
 964     return url + u'#' + sdata
 965
 966
 967 def unsmuggle_url(smug_url):
 968     if not '#__youtubedl_smuggle' in smug_url:
 969         return smug_url, None
 970     url, _, sdata = smug_url.rpartition(u'#')
 971     jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
 972     data = json.loads(jsond)
 973     return url, data