X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/15b1d10671b48df598afd70e17ba21e9e64ac766..555f0744b172953b2591db57ff5c6bccb4f378d6:/youtube_dl/utils.py?ds=inline diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4d3cbac..1f3bfef 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -10,12 +10,14 @@ import ctypes import datetime import email.utils import errno +import functools import gzip import itertools import io import json import locale import math +import operator import os import pipes import platform @@ -31,10 +33,13 @@ import xml.etree.ElementTree import zlib from .compat import ( + compat_basestring, compat_chr, compat_getenv, compat_html_entities, + compat_http_client, compat_parse_qs, + compat_socket_create_connection, compat_str, compat_urllib_error, compat_urllib_parse, @@ -49,7 +54,7 @@ from .compat import ( compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -57,6 +62,11 @@ std_headers = { } +ENGLISH_MONTH_NAMES = [ + 'January', 'February', 'March', 'April', 'May', 'June', + 'July', 'August', 'September', 'October', 'November', 'December'] + + def preferredencoding(): """Get preferred encoding. @@ -137,7 +147,7 @@ else: def find_xpath_attr(node, xpath, key, val): # Here comes the crazy part: In 2.6, if the xpath is a unicode, # .//node does not match if a node is a direct child of . ! - if isinstance(xpath, unicode): + if isinstance(xpath, compat_str): xpath = xpath.encode('ascii') for f in node.findall(xpath): @@ -166,7 +176,7 @@ def xpath_text(node, xpath, name=None, fatal=False): xpath = xpath.encode('ascii') n = node.find(xpath) - if n is None: + if n is None or n.text is None: if fatal: name = xpath if name is None else name raise ExtractorError('Could not find XML element %s' % name) @@ -205,6 +215,10 @@ def get_element_by_attribute(attribute, value, html): def clean_html(html): """Clean an HTML snippet into a readable string""" + + if html is None: # Convenience for sanitizing descriptions etc. + return html + # Newline vs
html = html.replace('\n', ' ') html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) @@ -280,6 +294,8 @@ def sanitize_filename(s, restricted=False, is_id=False): return '_' return char + # Handle timestamps + s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) result = ''.join(map(replace_insane, s)) if not is_id: while '__' in result: @@ -288,6 +304,8 @@ def sanitize_filename(s, restricted=False, is_id=False): # Common case of "Foreign band name - English song title" if restricted and result.startswith('-_'): result = result[2:] + if result.startswith('-'): + result = '_' + result[len('-'):] if not result: result = '_' return result @@ -363,7 +381,7 @@ def encodeArgument(s): if not isinstance(s, compat_str): # Legacy code that uses byte strings # Uncomment the following line after fixing all post processors - #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) + # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) s = s.decode('ascii') return encodeFilename(s, True) @@ -387,45 +405,29 @@ def formatSeconds(secs): return '%d' % secs -def make_HTTPS_handler(opts_no_check_certificate, **kwargs): - if sys.version_info < (3, 2): - import httplib - - class HTTPSConnectionV3(httplib.HTTPSConnection): - def __init__(self, *args, **kwargs): - httplib.HTTPSConnection.__init__(self, *args, **kwargs) - - def connect(self): - sock = socket.create_connection((self.host, self.port), self.timeout) - if getattr(self, '_tunnel_host', False): - self.sock = sock - self._tunnel() - try: - self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1) - except ssl.SSLError: - self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23) - - class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler): - def https_open(self, req): - return self.do_open(HTTPSConnectionV3, req) - return HTTPSHandlerV3(**kwargs) - elif hasattr(ssl, 'create_default_context'): # Python >= 3.4 - context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) - context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3 +def make_HTTPS_handler(params, **kwargs): + opts_no_check_certificate = params.get('nocheckcertificate', False) + if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9 + context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) if opts_no_check_certificate: + context.check_hostname = False context.verify_mode = ssl.CERT_NONE - return compat_urllib_request.HTTPSHandler(context=context, **kwargs) + try: + return YoutubeDLHTTPSHandler(params, context=context, **kwargs) + except TypeError: + # Python 2.7.8 + # (create_default_context present but HTTPSHandler has no context=) + pass + + if sys.version_info < (3, 2): + return YoutubeDLHTTPSHandler(params, **kwargs) else: # Python < 3.4 - context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) + context = ssl.SSLContext(ssl.PROTOCOL_TLSv1) context.verify_mode = (ssl.CERT_NONE if opts_no_check_certificate else ssl.CERT_REQUIRED) context.set_default_verify_paths() - try: - context.load_default_certs() - except AttributeError: - pass # Python < 3.4 - return compat_urllib_request.HTTPSHandler(context=context, **kwargs) + return YoutubeDLHTTPSHandler(params, context=context, **kwargs) class ExtractorError(Exception): @@ -463,6 +465,13 @@ class ExtractorError(Exception): return ''.join(traceback.format_tb(self.traceback)) +class UnsupportedError(ExtractorError): + def __init__(self, url): + super(UnsupportedError, self).__init__( + 'Unsupported URL: %s' % url, expected=True) + self.url = url + + class RegexNotFoundError(ExtractorError): """Error when a regex didn't match""" pass @@ -532,6 +541,28 @@ class ContentTooShortError(Exception): self.expected = expected +def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): + hc = http_class(*args, **kwargs) + source_address = ydl_handler._params.get('source_address') + if source_address is not None: + sa = (source_address, 0) + if hasattr(hc, 'source_address'): # Python 2.7+ + hc.source_address = sa + else: # Python 2.6 + def _hc_connect(self, *args, **kwargs): + sock = compat_socket_create_connection( + (self.host, self.port), self.timeout, sa) + if is_https: + self.sock = ssl.wrap_socket( + sock, self.key_file, self.cert_file, + ssl_version=ssl.PROTOCOL_TLSv1) + else: + self.sock = sock + hc.connect = functools.partial(_hc_connect, hc) + + return hc + + class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. @@ -550,6 +581,15 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): public domain. """ + def __init__(self, params, *args, **kwargs): + compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs) + self._params = params + + def http_open(self, req): + return self.do_open(functools.partial( + _create_http_connection, self, compat_http_client.HTTPConnection, False), + req) + @staticmethod def deflate(data): try: @@ -567,17 +607,14 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): def http_request(self, req): for h, v in std_headers.items(): - if h not in req.headers: + # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 + # The dict keys are capitalized because of this bug by urllib + if h.capitalize() not in req.headers: req.add_header(h, v) if 'Youtubedl-no-compression' in req.headers: if 'Accept-encoding' in req.headers: del req.headers['Accept-encoding'] del req.headers['Youtubedl-no-compression'] - if 'Youtubedl-user-agent' in req.headers: - if 'User-agent' in req.headers: - del req.headers['User-agent'] - req.headers['User-agent'] = req.headers['Youtubedl-user-agent'] - del req.headers['Youtubedl-user-agent'] if sys.version_info < (2, 7) and '#' in req.get_full_url(): # Python 2.6 is brain-dead when it comes to fragments @@ -619,42 +656,62 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): https_response = http_response -def parse_iso8601(date_str, delimiter='T'): +class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): + def __init__(self, params, https_conn_class=None, *args, **kwargs): + compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs) + self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection + self._params = params + + def https_open(self, req): + kwargs = {} + if hasattr(self, '_context'): # python > 2.6 + kwargs['context'] = self._context + if hasattr(self, '_check_hostname'): # python 3.x + kwargs['check_hostname'] = self._check_hostname + return self.do_open(functools.partial( + _create_http_connection, self, self._https_conn_class, True), + req, **kwargs) + + +def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ if date_str is None: return None - m = re.search( - r'(\.[0-9]+)?(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', - date_str) - if not m: - timezone = datetime.timedelta() - else: - date_str = date_str[:-len(m.group(0))] - if not m.group('sign'): + if timezone is None: + m = re.search( + r'(\.[0-9]+)?(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', + date_str) + if not m: timezone = datetime.timedelta() else: - sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( - hours=sign * int(m.group('hours')), - minutes=sign * int(m.group('minutes'))) + date_str = date_str[:-len(m.group(0))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) dt = datetime.datetime.strptime(date_str, date_format) - timezone return calendar.timegm(dt.timetuple()) -def unified_strdate(date_str): +def unified_strdate(date_str, day_first=True): """Return a string with the date in the format YYYYMMDD""" if date_str is None: return None - upload_date = None # Replace commas date_str = date_str.replace(',', ' ') # %z (UTC offset) is only supported in python>=3.2 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + format_expressions = [ '%d %B %Y', '%d %b %Y', @@ -663,13 +720,10 @@ def unified_strdate(date_str): '%b %dst %Y %I:%M%p', '%b %dnd %Y %I:%M%p', '%b %dth %Y %I:%M%p', + '%Y %m %d', '%Y-%m-%d', '%Y/%m/%d', - '%d.%m.%Y', - '%d/%m/%Y', - '%d/%m/%y', '%Y/%m/%d %H:%M:%S', - '%d/%m/%Y %H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', '%d.%m.%Y %H:%M', @@ -681,6 +735,20 @@ def unified_strdate(date_str): '%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M', ] + if day_first: + format_expressions.extend([ + '%d.%m.%Y', + '%d/%m/%Y', + '%d/%m/%y', + '%d/%m/%Y %H:%M:%S', + ]) + else: + format_expressions.extend([ + '%m.%d.%Y', + '%m/%d/%Y', + '%m/%d/%y', + '%m/%d/%Y %H:%M:%S', + ]) for expression in format_expressions: try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') @@ -712,8 +780,10 @@ def date_from_str(date_str): Return a datetime object from a string in the format YYYYMMDD or (now|today)[+-][0-9](day|week|month|year)(s)?""" today = datetime.date.today() - if date_str == 'now'or date_str == 'today': + if date_str in ('now', 'today'): return today + if date_str == 'yesterday': + return today - datetime.timedelta(days=1) match = re.match('(now|today)(?P[+-])(?P