X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/c7b4d76a372777e3af76ecf9966a8ab9952e52f4..3cdde5eb40ad4ca4809bafa3bb7bf12462f486b7:/youtube_dl/utils.py?ds=sidebyside diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 616948e..201802c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,19 +1,20 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import datetime +import email.utils import errno import gzip import io import json import locale import os +import platform import re +import socket import sys import traceback import zlib -import email.utils -import json -import datetime try: import urllib.request as compat_urllib_request @@ -35,6 +36,11 @@ try: except ImportError: # Python 2 from urlparse import urlparse as compat_urllib_parse_urlparse +try: + import urllib.parse as compat_urlparse +except ImportError: # Python 2 + import urlparse as compat_urlparse + try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 @@ -55,6 +61,11 @@ try: except ImportError: # Python 2 import httplib as compat_http_client +try: + from urllib.error import HTTPError as compat_HTTPError +except ImportError: # Python 2 + from urllib2 import HTTPError as compat_HTTPError + try: from subprocess import DEVNULL compat_subprocess_get_DEVNULL = lambda: DEVNULL @@ -150,6 +161,13 @@ try: except NameError: compat_chr = chr +def compat_ord(c): + if type(c) is int: return c + else: return ord(c) + +# This is not clearly defined otherwise +compiled_regex_type = type(re.compile('')) + std_headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', @@ -191,6 +209,20 @@ else: with open(fn, 'w', encoding='utf-8') as f: json.dump(obj, f) +if sys.version_info >= (2,7): + def find_xpath_attr(node, xpath, key, val): + """ Find the xpath xpath[@key=val] """ + assert re.match(r'^[a-zA-Z]+$', key) + assert re.match(r'^[a-zA-Z0-9@\s]*$', val) + expr = xpath + u"[@%s='%s']" % (key, val) + return node.find(expr) +else: + def find_xpath_attr(node, xpath, key, val): + for f in node.findall(xpath): + if f.attrib.get(key) == val: + return f + return None + def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. @@ -463,11 +495,20 @@ def make_HTTPS_handler(opts): class ExtractorError(Exception): """Error during info extraction.""" - def __init__(self, msg, tb=None): - """ tb, if given, is the original traceback (so that it can be printed out). """ + def __init__(self, msg, tb=None, expected=False, cause=None): + """ tb, if given, is the original traceback (so that it can be printed out). + If expected is set, this is a normal error message and most likely not a bug in youtube-dl. + """ + + if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): + expected = True + if not expected: + msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.' super(ExtractorError, self).__init__(msg) + self.traceback = tb self.exc_info = sys.exc_info() # preserve original exception + self.cause = cause def format_traceback(self): if self.traceback is None: @@ -588,8 +629,23 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): old_resp = resp # gzip if resp.headers.get('Content-encoding', '') == 'gzip': - gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r') - resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + content = resp.read() + gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') + try: + uncompressed = io.BytesIO(gz.read()) + except IOError as original_ioerror: + # There may be junk add the end of the file + # See http://stackoverflow.com/q/4928560/35070 for details + for i in range(1, 1024): + try: + gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') + uncompressed = io.BytesIO(gz.read()) + except IOError: + continue + break + else: + raise original_ioerror + resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg # deflate if resp.headers.get('Content-encoding', '') == 'deflate': @@ -608,7 +664,7 @@ def unified_strdate(date_str): date_str = date_str.replace(',',' ') # %z (UTC offset) is only supported in python>=3.2 date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) - format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S'] + format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] for expression in format_expressions: try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') @@ -616,6 +672,16 @@ def unified_strdate(date_str): pass return upload_date +def determine_ext(url, default_ext=u'unknown_video'): + guess = url.partition(u'?')[0].rpartition(u'.')[2] + if re.match(r'^[A-Za-z0-9]+$', guess): + return guess + else: + return default_ext + +def subtitles_filename(filename, sub_lang, sub_format): + return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format + def date_from_str(date_str): """ Return a datetime object from a string in the format YYYYMMDD or @@ -667,3 +733,31 @@ class DateRange(object): return self.start <= date <= self.end def __str__(self): return '%s - %s' % ( self.start.isoformat(), self.end.isoformat()) + + +def platform_name(): + """ Returns the platform name as a compat_str """ + res = platform.platform() + if isinstance(res, bytes): + res = res.decode(preferredencoding()) + + assert isinstance(res, compat_str) + return res + + +def bytes_to_intlist(bs): + if not bs: + return [] + if isinstance(bs[0], int): # Python 3 + return list(bs) + else: + return [ord(c) for c in bs] + + +def intlist_to_bytes(xs): + if not xs: + return b'' + if isinstance(chr(0), bytes): # Python 2 + return ''.join([chr(x) for x in xs]) + else: + return bytes(xs)