Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import email.utils
   6 import errno
   7 import gzip
   8 import io
   9 import json
  10 import locale
  11 import os
  12 import platform
  13 import re
  14 import socket
  15 import sys
  16 import traceback
  17 import zlib
  18
  19 try:
  20     import urllib.request as compat_urllib_request
  21 except ImportError: # Python 2
  22     import urllib2 as compat_urllib_request
  23
  24 try:
  25     import urllib.error as compat_urllib_error
  26 except ImportError: # Python 2
  27     import urllib2 as compat_urllib_error
  28
  29 try:
  30     import urllib.parse as compat_urllib_parse
  31 except ImportError: # Python 2
  32     import urllib as compat_urllib_parse
  33
  34 try:
  35     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  36 except ImportError: # Python 2
  37     from urlparse import urlparse as compat_urllib_parse_urlparse
  38
  39 try:
  40     import urllib.parse as compat_urlparse
  41 except ImportError: # Python 2
  42     import urlparse as compat_urlparse
  43
  44 try:
  45     import http.cookiejar as compat_cookiejar
  46 except ImportError: # Python 2
  47     import cookielib as compat_cookiejar
  48
  49 try:
  50     import html.entities as compat_html_entities
  51 except ImportError: # Python 2
  52     import htmlentitydefs as compat_html_entities
  53
  54 try:
  55     import html.parser as compat_html_parser
  56 except ImportError: # Python 2
  57     import HTMLParser as compat_html_parser
  58
  59 try:
  60     import http.client as compat_http_client
  61 except ImportError: # Python 2
  62     import httplib as compat_http_client
  63
  64 try:
  65     from urllib.error import HTTPError as compat_HTTPError
  66 except ImportError:  # Python 2
  67     from urllib2 import HTTPError as compat_HTTPError
  68
  69 try:
  70     from subprocess import DEVNULL
  71     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  72 except ImportError:
  73     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  74
  75 try:
  76     from urllib.parse import parse_qs as compat_parse_qs
  77 except ImportError: # Python 2
  78     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  79     # Python 2's version is apparently totally broken
  80     def _unquote(string, encoding='utf-8', errors='replace'):
  81         if string == '':
  82             return string
  83         res = string.split('%')
  84         if len(res) == 1:
  85             return string
  86         if encoding is None:
  87             encoding = 'utf-8'
  88         if errors is None:
  89             errors = 'replace'
  90         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  91         pct_sequence = b''
  92         string = res[0]
  93         for item in res[1:]:
  94             try:
  95                 if not item:
  96                     raise ValueError
  97                 pct_sequence += item[:2].decode('hex')
  98                 rest = item[2:]
  99                 if not rest:
 100                     # This segment was just a single percent-encoded character.
 101                     # May be part of a sequence of code units, so delay decoding.
 102                     # (Stored in pct_sequence).
 103                     continue
 104             except ValueError:
 105                 rest = '%' + item
 106             # Encountered non-percent-encoded characters. Flush the current
 107             # pct_sequence.
 108             string += pct_sequence.decode(encoding, errors) + rest
 109             pct_sequence = b''
 110         if pct_sequence:
 111             # Flush the final pct_sequence
 112             string += pct_sequence.decode(encoding, errors)
 113         return string
 114
 115     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 116                 encoding='utf-8', errors='replace'):
 117         qs, _coerce_result = qs, unicode
 118         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 119         r = []
 120         for name_value in pairs:
 121             if not name_value and not strict_parsing:
 122                 continue
 123             nv = name_value.split('=', 1)
 124             if len(nv) != 2:
 125                 if strict_parsing:
 126                     raise ValueError("bad query field: %r" % (name_value,))
 127                 # Handle case of a control-name with no equal sign
 128                 if keep_blank_values:
 129                     nv.append('')
 130                 else:
 131                     continue
 132             if len(nv[1]) or keep_blank_values:
 133                 name = nv[0].replace('+', ' ')
 134                 name = _unquote(name, encoding=encoding, errors=errors)
 135                 name = _coerce_result(name)
 136                 value = nv[1].replace('+', ' ')
 137                 value = _unquote(value, encoding=encoding, errors=errors)
 138                 value = _coerce_result(value)
 139                 r.append((name, value))
 140         return r
 141
 142     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 143                 encoding='utf-8', errors='replace'):
 144         parsed_result = {}
 145         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 146                         encoding=encoding, errors=errors)
 147         for name, value in pairs:
 148             if name in parsed_result:
 149                 parsed_result[name].append(value)
 150             else:
 151                 parsed_result[name] = [value]
 152         return parsed_result
 153
# Text string type: ``unicode`` where that builtin exists (Python 2),
# otherwise ``str``.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: ``unichr`` where that builtin
# exists (Python 2), otherwise ``chr``.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 163
 164 def compat_ord(c):
 165     if type(c) is int: return c
 166     else: return ord(c)
 167
# This is not clearly defined otherwise: take the compiled-pattern type from
# a throwaway compiled regex.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds each of these to
# every outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 178
 179 def preferredencoding():
 180     """Get preferred encoding.
 181
 182     Returns the best encoding scheme for the system, based on
 183     locale.getpreferredencoding() and some further tweaks.
 184     """
 185     try:
 186         pref = locale.getpreferredencoding()
 187         u'TEST'.encode(pref)
 188     except:
 189         pref = 'UTF-8'
 190
 191     return pref
 192
 193 if sys.version_info < (3,0):
 194     def compat_print(s):
 195         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 196 else:
 197     def compat_print(s):
 198         assert type(s) == type(u'')
 199         print(s)
 200
 201 # In Python 2.x, json.dump expects a bytestream.
 202 # In Python 3.x, it writes to a character stream
 203 if sys.version_info < (3,0):
 204     def write_json_file(obj, fn):
 205         with open(fn, 'wb') as f:
 206             json.dump(obj, f)
 207 else:
 208     def write_json_file(obj, fn):
 209         with open(fn, 'w', encoding='utf-8') as f:
 210             json.dump(obj, f)
 211
 212 if sys.version_info >= (2,7):
 213     def find_xpath_attr(node, xpath, key, val):
 214         """ Find the xpath xpath[@key=val] """
 215         assert re.match(r'^[a-zA-Z]+$', key)
 216         assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
 217         expr = xpath + u"[@%s='%s']" % (key, val)
 218         return node.find(expr)
 219 else:
 220     def find_xpath_attr(node, xpath, key, val):
 221         for f in node.findall(xpath):
 222             if f.attrib.get(key) == val:
 223                 return f
 224         return None
 225
 226 def htmlentity_transform(matchobj):
 227     """Transforms an HTML entity to a character.
 228
 229     This function receives a match object and is intended to be used with
 230     the re.sub() function.
 231     """
 232     entity = matchobj.group(1)
 233
 234     # Known non-numeric HTML entity
 235     if entity in compat_html_entities.name2codepoint:
 236         return compat_chr(compat_html_entities.name2codepoint[entity])
 237
 238     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 239     if mobj is not None:
 240         numstr = mobj.group(1)
 241         if numstr.startswith(u'x'):
 242             base = 16
 243             numstr = u'0%s' % numstr
 244         else:
 245             base = 10
 246         return compat_chr(int(numstr, base))
 247
 248     # Unknown entity in name, return its literal representation
 249     return (u'&%s;' % entity)
 250
# Replace the start-tag matching regex used by the HTML parser module.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    Feed a document with loads(), then call get_result() to obtain the text
    between the opening tag whose `attribute` equals `value` and its
    matching closing tag.
    """
    def __init__(self, attribute, value):
        """Remember the attribute name/value to look for and reset state."""
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_pos, end_pos]; positions come from getpos()
        self.result = None
        # True while inside the matched element
        self.started = False
        # Per-tag-name count of currently open tags inside the matched element
        self.depth = {}
        self.html = None
        # True until the position right after the matched opening tag is stored
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Handle a parse error reported with `message`.

        Raises HTMLParseError once more than 10 errors were seen or when the
        target element has already been entered; otherwise restarts parsing
        on the remaining lines of the document.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document `html`."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        """Start tracking when the wanted attribute/value pair is seen and
        count every tag opened while tracking."""
        attrs = dict(attrs)
        if self.started:
            # A nested tag is the first event after the matched opening tag
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Decrement the open count for `tag`; once the matched element's own
        tag count reaches zero, stop tracking and store the end position."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event also just records the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None when no complete element was captured."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # The line component of a position is used as 1-based, the offset as a column
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column must be shifted
            # by the prefix that was just cut off
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
 317 # Hack for https://github.com/rg3/youtube-dl/issues/662
 318 if sys.version_info < (2, 7, 3):
 319     AttrParser.parse_endtag = (lambda self, i:
 320         i + len("</scr'+'ipt>")
 321         if self.rawdata[i:].startswith("</scr'+'ipt>")
 322         else compat_html_parser.HTMLParser.parse_endtag(self, i))
 323
 324 def get_element_by_id(id, html):
 325     """Return the content of the tag with the specified ID in the passed HTML document"""
 326     return get_element_by_attribute("id", id, html)
 327
 328 def get_element_by_attribute(attribute, value, html):
 329     """Return the content of the tag with the specified attribute in the passed HTML document"""
 330     parser = AttrParser(attribute, value)
 331     try:
 332         parser.loads(html)
 333     except compat_html_parser.HTMLParseError:
 334         pass
 335     return parser.get_result()
 336
 337
 338 def clean_html(html):
 339     """Clean an HTML snippet into a readable string"""
 340     # Newline vs <br />
 341     html = html.replace('\n', ' ')
 342     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 343     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 344     # Strip html tags
 345     html = re.sub('<.*?>', '', html)
 346     # Replace html entities
 347     html = unescapeHTML(html)
 348     return html.strip()
 349
 350
 351 def sanitize_open(filename, open_mode):
 352     """Try to open the given filename, and slightly tweak it if this fails.
 353
 354     Attempts to open the given filename. If this fails, it tries to change
 355     the filename slightly, step by step, until it's either able to open it
 356     or it fails and raises a final exception, like the standard open()
 357     function.
 358
 359     It returns the tuple (stream, definitive_file_name).
 360     """
 361     try:
 362         if filename == u'-':
 363             if sys.platform == 'win32':
 364                 import msvcrt
 365                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 366             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 367         stream = open(encodeFilename(filename), open_mode)
 368         return (stream, filename)
 369     except (IOError, OSError) as err:
 370         if err.errno in (errno.EACCES,):
 371             raise
 372
 373         # In case of error, try to remove win32 forbidden chars
 374         alt_filename = os.path.join(
 375                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 376                         for path_part in os.path.split(filename)
 377                        )
 378         if alt_filename == filename:
 379             raise
 380         else:
 381             # An exception here should be caught in the caller
 382             stream = open(encodeFilename(filename), open_mode)
 383             return (stream, alt_filename)
 384
 385
 386 def timeconvert(timestr):
 387     """Convert RFC 2822 defined time string into system timestamp"""
 388     timestamp = None
 389     timetuple = email.utils.parsedate_tz(timestr)
 390     if timetuple is not None:
 391         timestamp = email.utils.mktime_tz(timetuple)
 392     return timestamp
 393
 394 def sanitize_filename(s, restricted=False, is_id=False):
 395     """Sanitizes a string so it could be used as part of a filename.
 396     If restricted is set, use a stricter subset of allowed characters.
 397     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 398     """
 399     def replace_insane(char):
 400         if char == '?' or ord(char) < 32 or ord(char) == 127:
 401             return ''
 402         elif char == '"':
 403             return '' if restricted else '\''
 404         elif char == ':':
 405             return '_-' if restricted else ' -'
 406         elif char in '\\/|*<>':
 407             return '_'
 408         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 409             return '_'
 410         if restricted and ord(char) > 127:
 411             return '_'
 412         return char
 413
 414     result = u''.join(map(replace_insane, s))
 415     if not is_id:
 416         while '__' in result:
 417             result = result.replace('__', '_')
 418         result = result.strip('_')
 419         # Common case of "Foreign band name - English song title"
 420         if restricted and result.startswith('-_'):
 421             result = result[2:]
 422         if not result:
 423             result = '_'
 424     return result
 425
 426 def orderedSet(iterable):
 427     """ Remove all duplicates from the input iterable """
 428     res = []
 429     for el in iterable:
 430         if el not in res:
 431             res.append(el)
 432     return res
 433
 434 def unescapeHTML(s):
 435     """
 436     @param s a string
 437     """
 438     assert type(s) == type(u'')
 439
 440     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 441     return result
 442
 443 def encodeFilename(s):
 444     """
 445     @param s The name of the file
 446     """
 447
 448     assert type(s) == type(u'')
 449
 450     # Python 3 has a Unicode API
 451     if sys.version_info >= (3, 0):
 452         return s
 453
 454     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 455         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 456         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 457         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 458         return s
 459     else:
 460         encoding = sys.getfilesystemencoding()
 461         if encoding is None:
 462             encoding = 'utf-8'
 463         return s.encode(encoding, 'ignore')
 464
 465 def decodeOption(optval):
 466     if optval is None:
 467         return optval
 468     if isinstance(optval, bytes):
 469         optval = optval.decode(preferredencoding())
 470
 471     assert isinstance(optval, compat_str)
 472     return optval
 473
 474 def formatSeconds(secs):
 475     if secs > 3600:
 476         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 477     elif secs > 60:
 478         return '%d:%02d' % (secs // 60, secs % 60)
 479     else:
 480         return '%d' % secs
 481
 482 def make_HTTPS_handler(opts):
 483     if sys.version_info < (3,2):
 484         # Python's 2.x handler is very simplistic
 485         return compat_urllib_request.HTTPSHandler()
 486     else:
 487         import ssl
 488         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 489         context.set_default_verify_paths()
 490
 491         context.verify_mode = (ssl.CERT_NONE
 492                                if opts.no_check_certificate
 493                                else ssl.CERT_REQUIRED)
 494         return compat_urllib_request.HTTPSHandler(context=context)
 495
 496 class ExtractorError(Exception):
 497     """Error during info extraction."""
 498     def __init__(self, msg, tb=None, expected=False, cause=None):
 499         """ tb, if given, is the original traceback (so that it can be printed out).
 500         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 501         """
 502
 503         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 504             expected = True
 505         if not expected:
 506             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 507         super(ExtractorError, self).__init__(msg)
 508
 509         self.traceback = tb
 510         self.exc_info = sys.exc_info()  # preserve original exception
 511         self.cause = cause
 512
 513     def format_traceback(self):
 514         if self.traceback is None:
 515             return None
 516         return u''.join(traceback.format_tb(self.traceback))
 517
 518
 519 class DownloadError(Exception):
 520     """Download Error exception.
 521
 522     This exception may be thrown by FileDownloader objects if they are not
 523     configured to continue on errors. They will contain the appropriate
 524     error message.
 525     """
 526     def __init__(self, msg, exc_info=None):
 527         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 528         super(DownloadError, self).__init__(msg)
 529         self.exc_info = exc_info
 530
 531
 532 class SameFileError(Exception):
 533     """Same File exception.
 534
 535     This exception will be thrown by FileDownloader objects if they detect
 536     multiple files would have to be downloaded to the same file on disk.
 537     """
 538     pass
 539
 540
 541 class PostProcessingError(Exception):
 542     """Post Processing exception.
 543
 544     This exception may be raised by PostProcessor's .run() method to
 545     indicate an error in the postprocessing task.
 546     """
 547     def __init__(self, msg):
 548         self.msg = msg
 549
 550 class MaxDownloadsReached(Exception):
 551     """ --max-downloads limit has been reached. """
 552     pass
 553
 554
 555 class UnavailableVideoError(Exception):
 556     """Unavailable Format exception.
 557
 558     This exception will be thrown when a video is requested
 559     in a format that is not available for that video.
 560     """
 561     pass
 562
 563
 564 class ContentTooShortError(Exception):
 565     """Content Too Short exception.
 566
 567     This exception may be raised by FileDownloader objects when a file they
 568     download is too small for what the server announced first, indicating
 569     the connection was probably interrupted.
 570     """
 571     # Both in bytes
 572     downloaded = None
 573     expected = None
 574
 575     def __init__(self, downloaded, expected):
 576         self.downloaded = downloaded
 577         self.expected = expected
 578
 579 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 580     """Handler for HTTP requests and responses.
 581
 582     This class, when installed with an OpenerDirector, automatically adds
 583     the standard headers to every HTTP request and handles gzipped and
 584     deflated responses from web servers. If compression is to be avoided in
 585     a particular request, the original request in the program code only has
 586     to include the HTTP header "Youtubedl-No-Compression", which will be
 587     removed before making the real request.
 588
 589     Part of this code was copied from:
 590
 591     http://techknack.net/python-urllib2-handlers/
 592
 593     Andrew Rowls, the author of that code, agreed to release it to the
 594     public domain.
 595     """
 596
 597     @staticmethod
 598     def deflate(data):
 599         try:
 600             return zlib.decompress(data, -zlib.MAX_WBITS)
 601         except zlib.error:
 602             return zlib.decompress(data)
 603
 604     @staticmethod
 605     def addinfourl_wrapper(stream, headers, url, code):
 606         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 607             return compat_urllib_request.addinfourl(stream, headers, url, code)
 608         ret = compat_urllib_request.addinfourl(stream, headers, url)
 609         ret.code = code
 610         return ret
 611
 612     def http_request(self, req):
 613         for h,v in std_headers.items():
 614             if h in req.headers:
 615                 del req.headers[h]
 616             req.add_header(h, v)
 617         if 'Youtubedl-no-compression' in req.headers:
 618             if 'Accept-encoding' in req.headers:
 619                 del req.headers['Accept-encoding']
 620             del req.headers['Youtubedl-no-compression']
 621         if 'Youtubedl-user-agent' in req.headers:
 622             if 'User-agent' in req.headers:
 623                 del req.headers['User-agent']
 624             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 625             del req.headers['Youtubedl-user-agent']
 626         return req
 627
 628     def http_response(self, req, resp):
 629         old_resp = resp
 630         # gzip
 631         if resp.headers.get('Content-encoding', '') == 'gzip':
 632             content = resp.read()
 633             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 634             try:
 635                 uncompressed = io.BytesIO(gz.read())
 636             except IOError as original_ioerror:
 637                 # There may be junk add the end of the file
 638                 # See http://stackoverflow.com/q/4928560/35070 for details
 639                 for i in range(1, 1024):
 640                     try:
 641                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 642                         uncompressed = io.BytesIO(gz.read())
 643                     except IOError:
 644                         continue
 645                     break
 646                 else:
 647                     raise original_ioerror
 648             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 649             resp.msg = old_resp.msg
 650         # deflate
 651         if resp.headers.get('Content-encoding', '') == 'deflate':
 652             gz = io.BytesIO(self.deflate(resp.read()))
 653             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 654             resp.msg = old_resp.msg
 655         return resp
 656
 657     https_request = http_request
 658     https_response = http_response
 659
 660 def unified_strdate(date_str):
 661     """Return a string with the date in the format YYYYMMDD"""
 662     upload_date = None
 663     #Replace commas
 664     date_str = date_str.replace(',',' ')
 665     # %z (UTC offset) is only supported in python>=3.2
 666     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
 667     format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
 668     for expression in format_expressions:
 669         try:
 670             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 671         except:
 672             pass
 673     return upload_date
 674
 675 def determine_ext(url, default_ext=u'unknown_video'):
 676     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 677     if re.match(r'^[A-Za-z0-9]+$', guess):
 678         return guess
 679     else:
 680         return default_ext
 681
 682 def subtitles_filename(filename, sub_lang, sub_format):
 683     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 684
 685 def date_from_str(date_str):
 686     """
 687     Return a datetime object from a string in the format YYYYMMDD or
 688     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 689     today = datetime.date.today()
 690     if date_str == 'now'or date_str == 'today':
 691         return today
 692     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 693     if match is not None:
 694         sign = match.group('sign')
 695         time = int(match.group('time'))
 696         if sign == '-':
 697             time = -time
 698         unit = match.group('unit')
 699         #A bad aproximation?
 700         if unit == 'month':
 701             unit = 'day'
 702             time *= 30
 703         elif unit == 'year':
 704             unit = 'day'
 705             time *= 365
 706         unit += 's'
 707         delta = datetime.timedelta(**{unit: time})
 708         return today + delta
 709     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 710
 711 class DateRange(object):
 712     """Represents a time interval between two dates"""
 713     def __init__(self, start=None, end=None):
 714         """start and end must be strings in the format accepted by date"""
 715         if start is not None:
 716             self.start = date_from_str(start)
 717         else:
 718             self.start = datetime.datetime.min.date()
 719         if end is not None:
 720             self.end = date_from_str(end)
 721         else:
 722             self.end = datetime.datetime.max.date()
 723         if self.start > self.end:
 724             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 725     @classmethod
 726     def day(cls, day):
 727         """Returns a range that only contains the given day"""
 728         return cls(day,day)
 729     def __contains__(self, date):
 730         """Check if the date is in the range"""
 731         if not isinstance(date, datetime.date):
 732             date = date_from_str(date)
 733         return self.start <= date <= self.end
 734     def __str__(self):
 735         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 736
 737
 738 def platform_name():
 739     """ Returns the platform name as a compat_str """
 740     res = platform.platform()
 741     if isinstance(res, bytes):
 742         res = res.decode(preferredencoding())
 743
 744     assert isinstance(res, compat_str)
 745     return res
 746
 747
 748 def bytes_to_intlist(bs):
 749     if not bs:
 750         return []
 751     if isinstance(bs[0], int):  # Python 3
 752         return list(bs)
 753     else:
 754         return [ord(c) for c in bs]
 755
 756
 757 def intlist_to_bytes(xs):
 758     if not xs:
 759         return b''
 760     if isinstance(chr(0), bytes):  # Python 2
 761         return ''.join([chr(x) for x in xs])
 762     else:
 763         return bytes(xs)