Merge tag 'upstream/2013.08.29'

[youtubedl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 66ae41e319e39d7c75597a31034315f996f00797..201802cee6e56cbfffeed573c7fc42592a33fdab 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1,19 +1,20 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-
  
+import datetime
+import email.utils
  import errno
  import gzip
  import io
  import json
  import locale
  import os
+import platform
  import re
+import socket
  import sys
  import traceback
  import zlib
-import email.utils
-import socket
-import datetime
  
  try:
      import urllib.request as compat_urllib_request
@@ -35,6 +36,11 @@ try:
  except ImportError: # Python 2
      from urlparse import urlparse as compat_urllib_parse_urlparse
  
+try:
+    import urllib.parse as compat_urlparse
+except ImportError: # Python 2
+    import urlparse as compat_urlparse
+
  try:
      import http.cookiejar as compat_cookiejar
  except ImportError: # Python 2
@@ -55,6 +61,11 @@ try:
  except ImportError: # Python 2
      import httplib as compat_http_client
  
+try:
+    from urllib.error import HTTPError as compat_HTTPError
+except ImportError:  # Python 2
+    from urllib2 import HTTPError as compat_HTTPError
+
  try:
      from subprocess import DEVNULL
      compat_subprocess_get_DEVNULL = lambda: DEVNULL
@@ -198,6 +209,20 @@ else:
          with open(fn, 'w', encoding='utf-8') as f:
              json.dump(obj, f)
  
+if sys.version_info >= (2,7):  # Python >= 2.7: hand the [@key='val'] predicate straight to node.find()
+    def find_xpath_attr(node, xpath, key, val):
+        """ Find the xpath xpath[@key=val] """
+        assert re.match(r'^[a-zA-Z]+$', key)  # key is spliced into the expression unescaped, so only ASCII letters are allowed (asserts are skipped under python -O)
+        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)  # val likewise: only letters, digits, '@' and whitespace; may be empty
+        expr = xpath + u"[@%s='%s']" % (key, val)  # e.g. xpath + u"[@id='foo']"
+        return node.find(expr)  # whatever node.find() yields; node is presumably an ElementTree element -- TODO confirm with callers
+else:  # older interpreters: filter the findall() results by hand instead
+    def find_xpath_attr(node, xpath, key, val):
+        for f in node.findall(xpath):
+            if f.attrib.get(key) == val:  # plain equality on the attribute; key/val are not restricted in this branch
+                return f  # first element whose attribute matches
+        return None  # no element under xpath carries key == val
+
+
  def htmlentity_transform(matchobj):
      """Transforms an HTML entity to a character.
  
@@ -470,15 +495,20 @@ def make_HTTPS_handler(opts):
  
  class ExtractorError(Exception):
      """Error during info extraction."""
-    def __init__(self, msg, tb=None):
-        """ tb, if given, is the original traceback (so that it can be printed out). """
-
-        if not sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
-            msg = msg + u'; please report this issue on GitHub.'
+    def __init__(self, msg, tb=None, expected=False, cause=None):
+        """ tb, if given, is the original traceback (so that it can be printed out).
+        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
+        """
+
+        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
+            expected = True
+        if not expected:
+            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
          super(ExtractorError, self).__init__(msg)
  
          self.traceback = tb
          self.exc_info = sys.exc_info()  # preserve original exception
+        self.cause = cause
  
      def format_traceback(self):
          if self.traceback is None:
@@ -599,8 +629,23 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
          old_resp = resp
          # gzip
          if resp.headers.get('Content-encoding', '') == 'gzip':
-            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
-            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
+            content = resp.read()
+            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
+            try:
+                uncompressed = io.BytesIO(gz.read())
+            except IOError as original_ioerror:
+                # There may be junk at the end of the file
+                # See http://stackoverflow.com/q/4928560/35070 for details
+                for i in range(1, 1024):
+                    try:
+                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
+                        uncompressed = io.BytesIO(gz.read())
+                    except IOError:
+                        continue
+                    break
+                else:
+                    raise original_ioerror
+            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
              resp.msg = old_resp.msg
          # deflate
          if resp.headers.get('Content-encoding', '') == 'deflate':
@@ -619,7 +664,7 @@ def unified_strdate(date_str):
      date_str = date_str.replace(',',' ')
      # %z (UTC offset) is only supported in python>=3.2
      date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
-    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
+    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
      for expression in format_expressions:
          try:
              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
@@ -627,6 +672,16 @@ def unified_strdate(date_str):
              pass
      return upload_date
  
+def determine_ext(url, default_ext=u'unknown_video'):  # Guess a file extension from url; fall back to default_ext when the guess is implausible
+    guess = url.partition(u'?')[0].rpartition(u'.')[2]  # text after the last '.' in the part before the first '?' (the whole part if it has no '.')
+    if re.match(r'^[A-Za-z0-9]+$', guess):  # accept only a non-empty run of ASCII letters and digits
+        return guess
+    else:
+        return default_ext  # guess was empty or contained other characters (e.g. '/')
+
+
+def subtitles_filename(filename, sub_lang, sub_format):  # Build a subtitle name: filename minus its last '.suffix', then '.<sub_lang>.<sub_format>'
+    return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format  # rsplit('.', 1)[0] is the whole filename when it contains no '.'
+
  def date_from_str(date_str):
      """
      Return a datetime object from a string in the format YYYYMMDD or
@@ -678,3 +733,31 @@ class DateRange(object):
          return self.start <= date <= self.end
      def __str__(self):
          return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
+
+
+def platform_name():
+    """ Returns the platform name as a compat_str """
+    res = platform.platform()  # platform description string from the stdlib
+    if isinstance(res, bytes):  # result may be a byte string (presumably on Python 2, where str is bytes)
+        res = res.decode(preferredencoding())  # preferredencoding() is not defined in the visible hunks; presumably returns a codec name -- TODO confirm
+
+    assert isinstance(res, compat_str)  # compat_str is not defined in the visible hunks; presumably the text type (unicode / str)
+    return res
+
+
+
+def bytes_to_intlist(bs):  # Convert a byte string into a list of integer values, one per element
+    if not bs:  # falsy input (e.g. an empty byte string) yields an empty list
+        return []
+    if isinstance(bs[0], int):  # Python 3: indexing bytes already yields ints
+        return list(bs)
+    else:
+        return [ord(c) for c in bs]  # otherwise elements are single characters; map each through ord()
+
+
+
+def intlist_to_bytes(xs):  # Pack a sequence of integer byte values into a byte string
+    if not xs:  # falsy input (e.g. an empty list) yields b''
+        return b''
+    if isinstance(chr(0), bytes):  # Python 2: chr() returns a byte string there (str is bytes)
+        return ''.join([chr(x) for x in xs])
+    else:
+        return bytes(xs)  # Python 3: bytes() accepts an iterable of ints (each must be in range(256))