Merge pull request #1 from e7appew/python3

[youtubedl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 562031fe110a9a6962c6e5efed2bb311f3059979..b3b687a314681de17547104f7f99e883a312c322 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -47,6 +47,7 @@ from .compat import (
      compat_socket_create_connection,
      compat_str,
      compat_struct_pack,
+    compat_struct_unpack,
      compat_urllib_error,
      compat_urllib_parse,
      compat_urllib_parse_urlencode,
@@ -110,6 +111,50 @@ ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙ
                          itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                          'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
  
+DATE_FORMATS = (
+    '%d %B %Y',
+    '%d %b %Y',
+    '%B %d %Y',
+    '%b %d %Y',
+    '%b %dst %Y %I:%M',
+    '%b %dnd %Y %I:%M',
+    '%b %dth %Y %I:%M',
+    '%Y %m %d',
+    '%Y-%m-%d',
+    '%Y/%m/%d',
+    '%Y/%m/%d %H:%M',
+    '%Y/%m/%d %H:%M:%S',
+    '%Y-%m-%d %H:%M:%S',
+    '%Y-%m-%d %H:%M:%S.%f',
+    '%d.%m.%Y %H:%M',
+    '%d.%m.%Y %H.%M',
+    '%Y-%m-%dT%H:%M:%SZ',
+    '%Y-%m-%dT%H:%M:%S.%fZ',
+    '%Y-%m-%dT%H:%M:%S.%f0Z',
+    '%Y-%m-%dT%H:%M:%S',
+    '%Y-%m-%dT%H:%M:%S.%f',
+    '%Y-%m-%dT%H:%M',
+)
+
+DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_DAY_FIRST.extend([
+    '%d-%m-%Y',
+    '%d.%m.%Y',
+    '%d.%m.%y',
+    '%d/%m/%Y',
+    '%d/%m/%y',
+    '%d/%m/%Y %H:%M:%S',
+])
+
+DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_MONTH_FIRST.extend([
+    '%m-%d-%Y',
+    '%m.%d.%Y',
+    '%m/%d/%Y',
+    '%m/%d/%y',
+    '%m/%d/%Y %H:%M:%S',
+])
+
  
  def preferredencoding():
      """Get preferred encoding.
@@ -267,9 +312,17 @@ def get_element_by_id(id, html):
      return get_element_by_attribute('id', id, html)
  
  
-def get_element_by_attribute(attribute, value, html):
+def get_element_by_class(class_name, html):
+    return get_element_by_attribute(
+        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
+        html, escape_value=False)
+
+
+def get_element_by_attribute(attribute, value, html, escape_value=True):
      """Return the content of the tag with the specified attribute in the passed HTML document"""
  
+    value = re.escape(value) if escape_value else value
+
      m = re.search(r'''(?xs)
          <([a-zA-Z0-9:._-]+)
           (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
@@ -278,7 +331,7 @@ def get_element_by_attribute(attribute, value, html):
          \s*>
          (?P<content>.*?)
          </\1>
-    ''' % (re.escape(attribute), re.escape(value)), html)
+    ''' % (re.escape(attribute), value), html)
  
      if not m:
          return None
@@ -975,6 +1028,24 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
      https_response = http_response
  
  
+def extract_timezone(date_str):
+    m = re.search(
+        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+        date_str)
+    if not m:
+        timezone = datetime.timedelta()
+    else:
+        date_str = date_str[:-len(m.group('tz'))]
+        if not m.group('sign'):
+            timezone = datetime.timedelta()
+        else:
+            sign = 1 if m.group('sign') == '+' else -1
+            timezone = datetime.timedelta(
+                hours=sign * int(m.group('hours')),
+                minutes=sign * int(m.group('minutes')))
+    return timezone, date_str
+
+
  def parse_iso8601(date_str, delimiter='T', timezone=None):
      """ Return a UNIX timestamp from the given date """
  
@@ -984,20 +1055,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
      date_str = re.sub(r'\.[0-9]+', '', date_str)
  
      if timezone is None:
-        m = re.search(
-            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
-            date_str)
-        if not m:
-            timezone = datetime.timedelta()
-        else:
-            date_str = date_str[:-len(m.group(0))]
-            if not m.group('sign'):
-                timezone = datetime.timedelta()
-            else:
-                sign = 1 if m.group('sign') == '+' else -1
-                timezone = datetime.timedelta(
-                    hours=sign * int(m.group('hours')),
-                    minutes=sign * int(m.group('minutes')))
+        timezone, date_str = extract_timezone(date_str)
+
      try:
          date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
          dt = datetime.datetime.strptime(date_str, date_format) - timezone
@@ -1006,6 +1065,10 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
          pass
  
  
+def date_formats(day_first=True):
+    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
+
+
  def unified_strdate(date_str, day_first=True):
      """Return a string with the date in the format YYYYMMDD"""
  
@@ -1014,53 +1077,11 @@ def unified_strdate(date_str, day_first=True):
      upload_date = None
      # Replace commas
      date_str = date_str.replace(',', ' ')
-    # %z (UTC offset) is only supported in python>=3.2
-    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
-        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
      # Remove AM/PM + timezone
      date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+    _, date_str = extract_timezone(date_str)
  
-    format_expressions = [
-        '%d %B %Y',
-        '%d %b %Y',
-        '%B %d %Y',
-        '%b %d %Y',
-        '%b %dst %Y %I:%M',
-        '%b %dnd %Y %I:%M',
-        '%b %dth %Y %I:%M',
-        '%Y %m %d',
-        '%Y-%m-%d',
-        '%Y/%m/%d',
-        '%Y/%m/%d %H:%M:%S',
-        '%Y-%m-%d %H:%M:%S',
-        '%Y-%m-%d %H:%M:%S.%f',
-        '%d.%m.%Y %H:%M',
-        '%d.%m.%Y %H.%M',
-        '%Y-%m-%dT%H:%M:%SZ',
-        '%Y-%m-%dT%H:%M:%S.%fZ',
-        '%Y-%m-%dT%H:%M:%S.%f0Z',
-        '%Y-%m-%dT%H:%M:%S',
-        '%Y-%m-%dT%H:%M:%S.%f',
-        '%Y-%m-%dT%H:%M',
-    ]
-    if day_first:
-        format_expressions.extend([
-            '%d-%m-%Y',
-            '%d.%m.%Y',
-            '%d.%m.%y',
-            '%d/%m/%Y',
-            '%d/%m/%y',
-            '%d/%m/%Y %H:%M:%S',
-        ])
-    else:
-        format_expressions.extend([
-            '%m-%d-%Y',
-            '%m.%d.%Y',
-            '%m/%d/%Y',
-            '%m/%d/%y',
-            '%m/%d/%Y %H:%M:%S',
-        ])
-    for expression in format_expressions:
+    for expression in date_formats(day_first):
          try:
              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
          except ValueError:
@@ -1076,6 +1097,29 @@ def unified_strdate(date_str, day_first=True):
          return compat_str(upload_date)
  
  
+def unified_timestamp(date_str, day_first=True):
+    if date_str is None:
+        return None
+
+    date_str = date_str.replace(',', ' ')
+
+    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
+    timezone, date_str = extract_timezone(date_str)
+
+    # Remove AM/PM + timezone
+    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+
+    for expression in date_formats(day_first):
+        try:
+            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
+            return calendar.timegm(dt.timetuple())
+        except ValueError:
+            pass
+    timetuple = email.utils.parsedate_tz(date_str)
+    if timetuple:
+        return calendar.timegm(timetuple) + pm_delta * 3600
+
+
  def determine_ext(url, default_ext='unknown_video'):
      if url is None:
          return default_ext
@@ -1410,6 +1454,8 @@ def shell_quote(args):
  def smuggle_url(url, data):
      """ Pass additional data in a URL for internal use. """
  
+    url, idata = unsmuggle_url(url, {})
+    data.update(idata)
      sdata = compat_urllib_parse_urlencode(
          {'__youtubedl_smuggle': json.dumps(data)})
      return url + '#' + sdata
@@ -1591,6 +1637,11 @@ class HEADRequest(compat_urllib_request.Request):
          return 'HEAD'
  
  
+class PUTRequest(compat_urllib_request.Request):
+    def get_method(self):
+        return 'PUT'
+
+
  def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
      if get_attr:
          if v is not None:
@@ -1626,6 +1677,10 @@ def float_or_none(v, scale=1, invscale=1, default=None):
          return default
  
  
+def strip_or_none(v):
+    return None if v is None else v.strip()
+
+
  def parse_duration(s):
      if not isinstance(s, compat_basestring):
          return None
@@ -1882,7 +1937,13 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
      req_headers.update(headers)
      req_data = data or req.data
      req_url = update_url_query(url or req.get_full_url(), query)
-    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+    req_get_method = req.get_method()
+    if req_get_method == 'HEAD':
+        req_type = HEADRequest
+    elif req_get_method == 'PUT':
+        req_type = PUTRequest
+    else:
+        req_type = compat_urllib_request.Request
      new_req = req_type(
          req_url, data=req_data, headers=req_headers,
          origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
@@ -1924,11 +1985,27 @@ US_RATINGS = {
  }
  
  
+TV_PARENTAL_GUIDELINES = {
+    'TV-Y': 0,
+    'TV-Y7': 7,
+    'TV-G': 0,
+    'TV-PG': 0,
+    'TV-14': 14,
+    'TV-MA': 17,
+}
+
+
  def parse_age_limit(s):
-    if s is None:
+    if type(s) == int:
+        return s if 0 <= s <= 21 else None
+    if not isinstance(s, compat_basestring):
          return None
      m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
-    return int(m.group('age')) if m else US_RATINGS.get(s)
+    if m:
+        return int(m.group('age'))
+    if s in US_RATINGS:
+        return US_RATINGS[s]
+    return TV_PARENTAL_GUIDELINES.get(s)
  
  
  def strip_jsonp(code):
@@ -2046,6 +2123,7 @@ def mimetype2ext(mt):
          return ext
  
      _, _, res = mt.rpartition('/')
+    res = res.lower()
  
      return {
          '3gpp': '3gp',
@@ -2057,9 +2135,53 @@ def mimetype2ext(mt):
          'x-flv': 'flv',
          'x-mp4-fragmented': 'mp4',
          'x-ms-wmv': 'wmv',
+        'mpegurl': 'm3u8',
+        'x-mpegurl': 'm3u8',
+        'vnd.apple.mpegurl': 'm3u8',
+        'dash+xml': 'mpd',
+        'f4m': 'f4m',
+        'f4m+xml': 'f4m',
+        'hds+xml': 'f4m',
+        'vnd.ms-sstr+xml': 'ism',
      }.get(res, res)
  
  
+def parse_codecs(codecs_str):
+    # http://tools.ietf.org/html/rfc6381
+    if not codecs_str:
+        return {}
+    splited_codecs = list(filter(None, map(
+        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
+    vcodec, acodec = None, None
+    for full_codec in splited_codecs:
+        codec = full_codec.split('.')[0]
+        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
+            if not vcodec:
+                vcodec = full_codec
+        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
+            if not acodec:
+                acodec = full_codec
+        else:
+            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
+    if not vcodec and not acodec:
+        if len(splited_codecs) == 2:
+            return {
+                'vcodec': vcodec,
+                'acodec': acodec,
+            }
+        elif len(splited_codecs) == 1:
+            return {
+                'vcodec': 'none',
+                'acodec': vcodec,
+            }
+    else:
+        return {
+            'vcodec': vcodec or 'none',
+            'acodec': acodec or 'none',
+        }
+    return {}
+
+
  def urlhandle_detect_ext(url_handle):
      getheader = url_handle.headers.get
  
@@ -2288,6 +2410,8 @@ def dfxp2srt(dfxp_data):
  
  def cli_option(params, command_option, param):
      param = params.get(param)
+    if param:
+        param = compat_str(param)
      return [command_option, param] if param is not None else []
  
  
@@ -2861,3 +2985,114 @@ def parse_m3u8_attributes(attrib):
              val = val[1:-1]
          info[key] = val
      return info
+
+
+def urshift(val, n):
+    return val >> n if val >= 0 else (val + 0x100000000) >> n
+
+
+# Based on png2str() written by @gdkchan and improved by @yokrysty
+# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
+def decode_png(png_data):
+    # Reference: https://www.w3.org/TR/PNG/
+    header = png_data[8:]
+
+    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
+        raise IOError('Not a valid PNG file.')
+
+    int_map = {1: '>B', 2: '>H', 4: '>I'}
+    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
+
+    chunks = []
+
+    while header:
+        length = unpack_integer(header[:4])
+        header = header[4:]
+
+        chunk_type = header[:4]
+        header = header[4:]
+
+        chunk_data = header[:length]
+        header = header[length:]
+
+        header = header[4:]  # Skip CRC
+
+        chunks.append({
+            'type': chunk_type,
+            'length': length,
+            'data': chunk_data
+        })
+
+    ihdr = chunks[0]['data']
+
+    width = unpack_integer(ihdr[:4])
+    height = unpack_integer(ihdr[4:8])
+
+    idat = b''
+
+    for chunk in chunks:
+        if chunk['type'] == b'IDAT':
+            idat += chunk['data']
+
+    if not idat:
+        raise IOError('Unable to read PNG data.')
+
+    decompressed_data = bytearray(zlib.decompress(idat))
+
+    stride = width * 3
+    pixels = []
+
+    def _get_pixel(idx):
+        x = idx % stride
+        y = idx // stride
+        return pixels[y][x]
+
+    for y in range(height):
+        basePos = y * (1 + stride)
+        filter_type = decompressed_data[basePos]
+
+        current_row = []
+
+        pixels.append(current_row)
+
+        for x in range(stride):
+            color = decompressed_data[1 + basePos + x]
+            basex = y * stride + x
+            left = 0
+            up = 0
+
+            if x > 2:
+                left = _get_pixel(basex - 3)
+            if y > 0:
+                up = _get_pixel(basex - stride)
+
+            if filter_type == 1:  # Sub
+                color = (color + left) & 0xff
+            elif filter_type == 2:  # Up
+                color = (color + up) & 0xff
+            elif filter_type == 3:  # Average
+                color = (color + ((left + up) >> 1)) & 0xff
+            elif filter_type == 4:  # Paeth
+                a = left
+                b = up
+                c = 0
+
+                if x > 2 and y > 0:
+                    c = _get_pixel(basex - stride - 3)
+
+                p = a + b - c
+
+                pa = abs(p - a)
+                pb = abs(p - b)
+                pc = abs(p - c)
+
+                if pa <= pb and pa <= pc:
+                    color = (color + a) & 0xff
+                elif pb <= pc:
+                    color = (color + b) & 0xff
+                else:
+                    color = (color + c) & 0xff
+
+            current_row.append(color)
+
+    return width, height, pixels