X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/782e271dfe61ed8d5dbf100286ad876068ea585c..74a9312fb6565d0d3d227b6c959436678ce1dbe2:/youtube_dl/utils.py diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 201ed25..b3b687a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,187 +1,161 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import base64 +import binascii +import calendar +import codecs +import contextlib +import ctypes import datetime import email.utils import errno +import functools import gzip import io +import itertools import json import locale +import math +import operator import os +import pipes import platform import re import socket +import ssl +import subprocess import sys +import tempfile import traceback +import xml.etree.ElementTree import zlib -try: - import urllib.request as compat_urllib_request -except ImportError: # Python 2 - import urllib2 as compat_urllib_request - -try: - import urllib.error as compat_urllib_error -except ImportError: # Python 2 - import urllib2 as compat_urllib_error - -try: - import urllib.parse as compat_urllib_parse -except ImportError: # Python 2 - import urllib as compat_urllib_parse - -try: - from urllib.parse import urlparse as compat_urllib_parse_urlparse -except ImportError: # Python 2 - from urlparse import urlparse as compat_urllib_parse_urlparse - -try: - import urllib.parse as compat_urlparse -except ImportError: # Python 2 - import urlparse as compat_urlparse - -try: - import http.cookiejar as compat_cookiejar -except ImportError: # Python 2 - import cookielib as compat_cookiejar - -try: - import html.entities as compat_html_entities -except ImportError: # Python 2 - import htmlentitydefs as compat_html_entities - -try: - import html.parser as compat_html_parser -except ImportError: # Python 2 - import HTMLParser as compat_html_parser - -try: - import http.client as compat_http_client -except ImportError: # Python 2 - import httplib as compat_http_client - -try: - from urllib.error import HTTPError as compat_HTTPError -except ImportError: # Python 2 - from urllib2 import HTTPError as compat_HTTPError - -try: - from urllib.request import urlretrieve as compat_urlretrieve -except ImportError: # Python 2 - from urllib import urlretrieve as compat_urlretrieve - - -try: - from subprocess import DEVNULL - compat_subprocess_get_DEVNULL = lambda: DEVNULL -except ImportError: - compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') - -try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. - # Python 2's version is apparently totally broken - def _unquote(string, encoding='utf-8', errors='replace'): - if string == '': - return string - res = string.split('%') - if len(res) == 1: - return string - if encoding is None: - encoding = 'utf-8' - if errors is None: - errors = 'replace' - # pct_sequence: contiguous sequence of percent-encoded bytes, decoded - pct_sequence = b'' - string = res[0] - for item in res[1:]: - try: - if not item: - raise ValueError - pct_sequence += item[:2].decode('hex') - rest = item[2:] - if not rest: - # This segment was just a single percent-encoded character. - # May be part of a sequence of code units, so delay decoding. - # (Stored in pct_sequence). - continue - except ValueError: - rest = '%' + item - # Encountered non-percent-encoded characters. Flush the current - # pct_sequence. - string += pct_sequence.decode(encoding, errors) + rest - pct_sequence = b'' - if pct_sequence: - # Flush the final pct_sequence - string += pct_sequence.decode(encoding, errors) - return string - - def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - qs, _coerce_result = qs, unicode - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: - raise ValueError("bad query field: %r" % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = _unquote(name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = _unquote(value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) - return r - - def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - parsed_result = {} - pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, - encoding=encoding, errors=errors) - for name, value in pairs: - if name in parsed_result: - parsed_result[name].append(value) - else: - parsed_result[name] = [value] - return parsed_result - -try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str +from .compat import ( + compat_HTMLParser, + compat_basestring, + compat_chr, + compat_etree_fromstring, + compat_html_entities, + compat_html_entities_html5, + compat_http_client, + compat_kwargs, + compat_parse_qs, + compat_shlex_quote, + compat_socket_create_connection, + compat_str, + compat_struct_pack, + compat_struct_unpack, + compat_urllib_error, + compat_urllib_parse, + compat_urllib_parse_urlencode, + compat_urllib_parse_urlparse, + compat_urllib_parse_unquote_plus, + compat_urllib_request, + compat_urlparse, + compat_xpath, +) + +from .socks import ( + ProxyType, + sockssocket, +) + + +def register_socks_protocols(): + # "Register" SOCKS protocols + # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 + # URLs with protocols not in urlparse.uses_netloc are not handled correctly + for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): + if scheme not in compat_urlparse.uses_netloc: + compat_urlparse.uses_netloc.append(scheme) -try: - compat_chr = unichr # Python 2 -except NameError: - compat_chr = chr - -def compat_ord(c): - if type(c) is int: return c - else: return ord(c) # This is not clearly defined otherwise compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-us,en;q=0.5', } + +NO_DEFAULT = object() + +ENGLISH_MONTH_NAMES = [ + 'January', 'February', 'March', 'April', 'May', 'June', + 'July', 'August', 'September', 'October', 'November', 'December'] + +KNOWN_EXTENSIONS = ( + 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', + 'flv', 'f4v', 'f4a', 'f4b', + 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', + 'mkv', 'mka', 'mk3d', + 'avi', 'divx', + 'mov', + 'asf', 'wmv', 'wma', + '3gp', '3g2', + 'mp3', + 'flac', + 'ape', + 'wav', + 'f4f', 'f4m', 'm3u8', 'smil') + +# needed for sanitizing filenames in restricted mode +ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy'))) + +DATE_FORMATS = ( + '%d %B %Y', + '%d %b %Y', + '%B %d %Y', + '%b %d %Y', + '%b %dst %Y %I:%M', + '%b %dnd %Y %I:%M', + '%b %dth %Y %I:%M', + '%Y %m %d', + '%Y-%m-%d', + '%Y/%m/%d', + '%Y/%m/%d %H:%M', + '%Y/%m/%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S.%f', + '%d.%m.%Y %H:%M', + '%d.%m.%Y %H.%M', + '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S.%fZ', + '%Y-%m-%dT%H:%M:%S.%f0Z', + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S.%f', + '%Y-%m-%dT%H:%M', +) + +DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) +DATE_FORMATS_DAY_FIRST.extend([ + '%d-%m-%Y', + '%d.%m.%Y', + '%d.%m.%y', + '%d/%m/%Y', + '%d/%m/%y', + '%d/%m/%Y %H:%M:%S', +]) + +DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) +DATE_FORMATS_MONTH_FIRST.extend([ + '%m-%d-%Y', + '%m.%d.%Y', + '%m/%d/%Y', + '%m/%d/%y', + '%m/%d/%Y %H:%M:%S', +]) + + def preferredencoding(): """Get preferred encoding. @@ -190,195 +164,223 @@ def preferredencoding(): """ try: pref = locale.getpreferredencoding() - u'TEST'.encode(pref) - except: + 'TEST'.encode(pref) + except Exception: pref = 'UTF-8' return pref -if sys.version_info < (3,0): - def compat_print(s): - print(s.encode(preferredencoding(), 'xmlcharrefreplace')) -else: - def compat_print(s): - assert type(s) == type(u'') - print(s) - -# In Python 2.x, json.dump expects a bytestream. -# In Python 3.x, it writes to a character stream -if sys.version_info < (3,0): - def write_json_file(obj, fn): - with open(fn, 'wb') as f: - json.dump(obj, f) -else: - def write_json_file(obj, fn): - with open(fn, 'w', encoding='utf-8') as f: - json.dump(obj, f) -if sys.version_info >= (2,7): - def find_xpath_attr(node, xpath, key, val): +def write_json_file(obj, fn): + """ Encode obj as JSON and write it to fn, atomically if possible """ + + fn = encodeFilename(fn) + if sys.version_info < (3, 0) and sys.platform != 'win32': + encoding = get_filesystem_encoding() + # os.path.basename returns a bytes object, but NamedTemporaryFile + # will fail if the filename contains non ascii characters unless we + # use a unicode object + path_basename = lambda f: os.path.basename(fn).decode(encoding) + # the same for os.path.dirname + path_dirname = lambda f: os.path.dirname(fn).decode(encoding) + else: + path_basename = os.path.basename + path_dirname = os.path.dirname + + args = { + 'suffix': '.tmp', + 'prefix': path_basename(fn) + '.', + 'dir': path_dirname(fn), + 'delete': False, + } + + # In Python 2.x, json.dump expects a bytestream. + # In Python 3.x, it writes to a character stream + if sys.version_info < (3, 0): + args['mode'] = 'wb' + else: + args.update({ + 'mode': 'w', + 'encoding': 'utf-8', + }) + + tf = tempfile.NamedTemporaryFile(**compat_kwargs(args)) + + try: + with tf: + json.dump(obj, tf) + if sys.platform == 'win32': + # Need to remove existing file on Windows, else os.rename raises + # WindowsError or FileExistsError. + try: + os.unlink(fn) + except OSError: + pass + os.rename(tf.name, fn) + except Exception: + try: + os.remove(tf.name) + except OSError: + pass + raise + + +if sys.version_info >= (2, 7): + def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ - assert re.match(r'^[a-zA-Z]+$', key) - assert re.match(r'^[a-zA-Z0-9@\s]*$', val) - expr = xpath + u"[@%s='%s']" % (key, val) + assert re.match(r'^[a-zA-Z_-]+$', key) + expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) return node.find(expr) else: - def find_xpath_attr(node, xpath, key, val): - for f in node.findall(xpath): - if f.attrib.get(key) == val: + def find_xpath_attr(node, xpath, key, val=None): + for f in node.findall(compat_xpath(xpath)): + if key not in f.attrib: + continue + if val is None or f.attrib.get(key) == val: return f return None -def htmlentity_transform(matchobj): - """Transforms an HTML entity to a character. - - This function receives a match object and is intended to be used with - the re.sub() function. - """ - entity = matchobj.group(1) +# On python2.6 the xml.etree.ElementTree.Element methods don't support +# the namespace parameter - # Known non-numeric HTML entity - if entity in compat_html_entities.name2codepoint: - return compat_chr(compat_html_entities.name2codepoint[entity]) - mobj = re.match(u'(?u)#(x?\\d+)', entity) - if mobj is not None: - numstr = mobj.group(1) - if numstr.startswith(u'x'): - base = 16 - numstr = u'0%s' % numstr +def xpath_with_ns(path, ns_map): + components = [c.split(':') for c in path.split('/')] + replaced = [] + for c in components: + if len(c) == 1: + replaced.append(c[0]) else: - base = 10 - return compat_chr(int(numstr, base)) + ns, tag = c + replaced.append('{%s}%s' % (ns_map[ns], tag)) + return '/'.join(replaced) - # Unknown entity in name, return its literal representation - return (u'&%s;' % entity) - -compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class BaseHTMLParser(compat_html_parser.HTMLParser): - def __init(self): - compat_html_parser.HTMLParser.__init__(self) - self.html = None - - def loads(self, html): - self.html = html - self.feed(html) - self.close() - -class AttrParser(BaseHTMLParser): - """Modified HTMLParser that isolates a tag with the specified attribute""" - def __init__(self, attribute, value): - self.attribute = attribute - self.value = value - self.result = None - self.started = False - self.depth = {} - self.watch_startpos = False - self.error_count = 0 - BaseHTMLParser.__init__(self) - - def error(self, message): - if self.error_count > 10 or self.started: - raise compat_html_parser.HTMLParseError(message, self.getpos()) - self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line - self.error_count += 1 - self.goahead(1) - def handle_starttag(self, tag, attrs): - attrs = dict(attrs) - if self.started: - self.find_startpos(None) - if self.attribute in attrs and attrs[self.attribute] == self.value: - self.result = [tag] - self.started = True - self.watch_startpos = True - if self.started: - if not tag in self.depth: self.depth[tag] = 0 - self.depth[tag] += 1 - - def handle_endtag(self, tag): - if self.started: - if tag in self.depth: self.depth[tag] -= 1 - if self.depth[self.result[0]] == 0: - self.started = False - self.result.append(self.getpos()) - - def find_startpos(self, x): - """Needed to put the start position of the result (self.result[1]) - after the opening tag with the requested id""" - if self.watch_startpos: - self.watch_startpos = False - self.result.append(self.getpos()) - handle_entityref = handle_charref = handle_data = handle_comment = \ - handle_decl = handle_pi = unknown_decl = find_startpos - - def get_result(self): - if self.result is None: +def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): + def _find_xpath(xpath): + return node.find(compat_xpath(xpath)) + + if isinstance(xpath, (str, compat_str)): + n = _find_xpath(xpath) + else: + for xp in xpath: + n = _find_xpath(xp) + if n is not None: + break + + if n is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = xpath if name is None else name + raise ExtractorError('Could not find XML element %s' % name) + else: + return None + return n + + +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): + n = xpath_element(node, xpath, name, fatal=fatal, default=default) + if n is None or n == default: + return n + if n.text is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = xpath if name is None else name + raise ExtractorError('Could not find XML element\'s text %s' % name) + else: return None - if len(self.result) != 3: + return n.text + + +def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): + n = find_xpath_attr(node, xpath, key) + if n is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = '%s[@%s]' % (xpath, key) if name is None else name + raise ExtractorError('Could not find XML attribute %s' % name) + else: return None - lines = self.html.split('\n') - lines = lines[self.result[1][0]-1:self.result[2][0]] - lines[0] = lines[0][self.result[1][1]:] - if len(lines) == 1: - lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] - lines[-1] = lines[-1][:self.result[2][1]] - return '\n'.join(lines).strip() -# Hack for https://github.com/rg3/youtube-dl/issues/662 -if sys.version_info < (2, 7, 3): - AttrParser.parse_endtag = (lambda self, i: - i + len("") - if self.rawdata[i:].startswith("") - else compat_html_parser.HTMLParser.parse_endtag(self, i)) + return n.attrib[key] + def get_element_by_id(id, html): """Return the content of the tag with the specified ID in the passed HTML document""" - return get_element_by_attribute("id", id, html) + return get_element_by_attribute('id', id, html) + + +def get_element_by_class(class_name, html): + return get_element_by_attribute( + 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + html, escape_value=False) + -def get_element_by_attribute(attribute, value, html): +def get_element_by_attribute(attribute, value, html, escape_value=True): """Return the content of the tag with the specified attribute in the passed HTML document""" - parser = AttrParser(attribute, value) - try: - parser.loads(html) - except compat_html_parser.HTMLParseError: - pass - return parser.get_result() -class MetaParser(BaseHTMLParser): - """ - Modified HTMLParser that isolates a meta tag with the specified name - attribute. - """ - def __init__(self, name): - BaseHTMLParser.__init__(self) - self.name = name - self.content = None - self.result = None + value = re.escape(value) if escape_value else value - def handle_starttag(self, tag, attrs): - if tag != 'meta': - return - attrs = dict(attrs) - if attrs.get('name') == self.name: - self.result = attrs.get('content') + m = re.search(r'''(?xs) + <([a-zA-Z0-9:._-]+) + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? + \s+%s=['"]?%s['"]? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? + \s*> + (?P.*?) + + ''' % (re.escape(attribute), value), html) - def get_result(self): - return self.result + if not m: + return None + res = m.group('content') -def get_meta_content(name, html): - """ - Return the content attribute from the meta tag with the given name attribute. + if res.startswith('"') or res.startswith("'"): + res = res[1:-1] + + return unescapeHTML(res) + + +class HTMLAttributeParser(compat_HTMLParser): + """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): + self.attrs = {} + compat_HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + + +def extract_attributes(html_element): + """Given a string for an HTML element such as + + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, + but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. """ - parser = MetaParser(name) - try: - parser.loads(html) - except compat_html_parser.HTMLParseError: - pass - return parser.get_result() + parser = HTMLAttributeParser() + parser.feed(html_element) + parser.close() + return parser.attrs def clean_html(html): """Clean an HTML snippet into a readable string""" + + if html is None: # Convenience for sanitizing descriptions etc. + return html + # Newline vs
html = html.replace('\n', ' ') html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) @@ -401,7 +403,7 @@ def sanitize_open(filename, open_mode): It returns the tuple (stream, definitive_file_name). """ try: - if filename == u'-': + if filename == '-': if sys.platform == 'win32': import msvcrt msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) @@ -413,15 +415,12 @@ def sanitize_open(filename, open_mode): raise # In case of error, try to remove win32 forbidden chars - alt_filename = os.path.join( - re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part) - for path_part in os.path.split(filename) - ) + alt_filename = sanitize_path(filename) if alt_filename == filename: raise else: # An exception here should be caught in the caller - stream = open(encodeFilename(filename), open_mode) + stream = open(encodeFilename(alt_filename), open_mode) return (stream, alt_filename) @@ -433,12 +432,15 @@ def timeconvert(timestr): timestamp = email.utils.mktime_tz(timetuple) return timestamp + def sanitize_filename(s, restricted=False, is_id=False): """Sanitizes a string so it could be used as part of a filename. If restricted is set, use a stricter subset of allowed characters. Set is_id if this is not an arbitrary string, but an ID that should be kept if possible """ def replace_insane(char): + if restricted and char in ACCENT_CHARS: + return ACCENT_CHARS[char] if char == '?' or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': @@ -453,7 +455,9 @@ def sanitize_filename(s, restricted=False, is_id=False): return '_' return char - result = u''.join(map(replace_insane, s)) + # Handle timestamps + s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) + result = ''.join(map(replace_insane, s)) if not is_id: while '__' in result: result = result.replace('__', '_') @@ -461,10 +465,42 @@ def sanitize_filename(s, restricted=False, is_id=False): # Common case of "Foreign band name - English song title" if restricted and result.startswith('-_'): result = result[2:] + if result.startswith('-'): + result = '_' + result[len('-'):] + result = result.lstrip('.') if not result: result = '_' return result + +def sanitize_path(s): + """Sanitizes and normalizes path on Windows""" + if sys.platform != 'win32': + return s + drive_or_unc, _ = os.path.splitdrive(s) + if sys.version_info < (2, 7) and not drive_or_unc: + drive_or_unc, _ = os.path.splitunc(s) + norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) + if drive_or_unc: + norm_path.pop(0) + sanitized_path = [ + path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) + for path_part in norm_path] + if drive_or_unc: + sanitized_path.insert(0, drive_or_unc + os.path.sep) + return os.path.join(*sanitized_path) + + +# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of +# unwanted failures due to missing protocol +def sanitize_url(url): + return 'http:%s' % url if url.startswith('//') else url + + +def sanitized_Request(url, *args, **kwargs): + return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -473,36 +509,106 @@ def orderedSet(iterable): res.append(el) return res + +def _htmlentity_transform(entity_with_semicolon): + """Transforms an HTML entity to a character.""" + entity = entity_with_semicolon[:-1] + + # Known non-numeric HTML entity + if entity in compat_html_entities.name2codepoint: + return compat_chr(compat_html_entities.name2codepoint[entity]) + + # TODO: HTML5 allows entities without a semicolon. For example, + # 'Éric' should be decoded as 'Éric'. + if entity_with_semicolon in compat_html_entities_html5: + return compat_html_entities_html5[entity_with_semicolon] + + mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) + if mobj is not None: + numstr = mobj.group(1) + if numstr.startswith('x'): + base = 16 + numstr = '0%s' % numstr + else: + base = 10 + # See https://github.com/rg3/youtube-dl/issues/7518 + try: + return compat_chr(int(numstr, base)) + except ValueError: + pass + + # Unknown entity in name, return its literal representation + return '&%s;' % entity + + def unescapeHTML(s): - """ - @param s a string - """ - assert type(s) == type(u'') + if s is None: + return None + assert type(s) == compat_str - result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s) - return result + return re.sub( + r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) + + +def get_subprocess_encoding(): + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + # For subprocess calls, encode with locale encoding + # Refer to http://stackoverflow.com/a/9951851/35070 + encoding = preferredencoding() + else: + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + return encoding -def encodeFilename(s): + +def encodeFilename(s, for_subprocess=False): """ @param s The name of the file """ - assert type(s) == type(u'') + assert type(s) == compat_str # Python 3 has a Unicode API if sys.version_info >= (3, 0): return s - if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - # Pass u'' directly to use Unicode APIs on Windows 2000 and up - # (Detecting Windows NT 4 is tricky because 'major >= 4' would - # match Windows 9x series as well. Besides, NT 4 is obsolete.) + # Pass '' directly to use Unicode APIs on Windows 2000 and up + # (Detecting Windows NT 4 is tricky because 'major >= 4' would + # match Windows 9x series as well. Besides, NT 4 is obsolete.) + if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: return s - else: - encoding = sys.getfilesystemencoding() - if encoding is None: - encoding = 'utf-8' - return s.encode(encoding, 'ignore') + + # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible + if sys.platform.startswith('java'): + return s + + return s.encode(get_subprocess_encoding(), 'ignore') + + +def decodeFilename(b, for_subprocess=False): + + if sys.version_info >= (3, 0): + return b + + if not isinstance(b, bytes): + return b + + return b.decode(get_subprocess_encoding(), 'ignore') + + +def encodeArgument(s): + if not isinstance(s, compat_str): + # Legacy code that uses byte strings + # Uncomment the following line after fixing all post processors + # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) + s = s.decode('ascii') + return encodeFilename(s, True) + + +def decodeArgument(b): + return decodeFilename(b, True) + def decodeOption(optval): if optval is None: @@ -513,6 +619,7 @@ def decodeOption(optval): assert isinstance(optval, compat_str) return optval + def formatSeconds(secs): if secs > 3600: return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60) @@ -521,41 +628,82 @@ def formatSeconds(secs): else: return '%d' % secs -def make_HTTPS_handler(opts): - if sys.version_info < (3,2): - # Python's 2.x handler is very simplistic - return compat_urllib_request.HTTPSHandler() - else: - import ssl - context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) - context.set_default_verify_paths() - + +def make_HTTPS_handler(params, **kwargs): + opts_no_check_certificate = params.get('nocheckcertificate', False) + if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9 + context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) + if opts_no_check_certificate: + context.check_hostname = False + context.verify_mode = ssl.CERT_NONE + try: + return YoutubeDLHTTPSHandler(params, context=context, **kwargs) + except TypeError: + # Python 2.7.8 + # (create_default_context present but HTTPSHandler has no context=) + pass + + if sys.version_info < (3, 2): + return YoutubeDLHTTPSHandler(params, **kwargs) + else: # Python < 3.4 + context = ssl.SSLContext(ssl.PROTOCOL_TLSv1) context.verify_mode = (ssl.CERT_NONE - if opts.no_check_certificate + if opts_no_check_certificate else ssl.CERT_REQUIRED) - return compat_urllib_request.HTTPSHandler(context=context) + context.set_default_verify_paths() + return YoutubeDLHTTPSHandler(params, context=context, **kwargs) + + +def bug_reports_message(): + if ytdl_is_updateable(): + update_cmd = 'type youtube-dl -U to update' + else: + update_cmd = 'see https://yt-dl.org/update on how to update' + msg = '; please report this issue on https://yt-dl.org/bug .' + msg += ' Make sure you are using the latest version; %s.' % update_cmd + msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.' + return msg + class ExtractorError(Exception): """Error during info extraction.""" - def __init__(self, msg, tb=None, expected=False, cause=None): + + def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): """ tb, if given, is the original traceback (so that it can be printed out). If expected is set, this is a normal error message and most likely not a bug in youtube-dl. """ if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): expected = True + if video_id is not None: + msg = video_id + ': ' + msg + if cause: + msg += ' (caused by %r)' % cause if not expected: - msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.' + msg += bug_reports_message() super(ExtractorError, self).__init__(msg) self.traceback = tb self.exc_info = sys.exc_info() # preserve original exception self.cause = cause + self.video_id = video_id def format_traceback(self): if self.traceback is None: return None - return u''.join(traceback.format_tb(self.traceback)) + return ''.join(traceback.format_tb(self.traceback)) + + +class UnsupportedError(ExtractorError): + def __init__(self, url): + super(UnsupportedError, self).__init__( + 'Unsupported URL: %s' % url, expected=True) + self.url = url + + +class RegexNotFoundError(ExtractorError): + """Error when a regex didn't match""" + pass class DownloadError(Exception): @@ -565,6 +713,7 @@ class DownloadError(Exception): configured to continue on errors. They will contain the appropriate error message. """ + def __init__(self, msg, exc_info=None): """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ super(DownloadError, self).__init__(msg) @@ -586,9 +735,11 @@ class PostProcessingError(Exception): This exception may be raised by PostProcessor's .run() method to indicate an error in the postprocessing task. """ + def __init__(self, msg): self.msg = msg + class MaxDownloadsReached(Exception): """ --max-downloads limit has been reached. """ pass @@ -610,14 +761,50 @@ class ContentTooShortError(Exception): download is too small for what the server announced first, indicating the connection was probably interrupted. """ - # Both in bytes - downloaded = None - expected = None def __init__(self, downloaded, expected): + # Both in bytes self.downloaded = downloaded self.expected = expected + +def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): + # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting + # expected HTTP responses to meet HTTP/1.0 or later (see also + # https://github.com/rg3/youtube-dl/issues/6727) + if sys.version_info < (3, 0): + kwargs[b'strict'] = True + hc = http_class(*args, **kwargs) + source_address = ydl_handler._params.get('source_address') + if source_address is not None: + sa = (source_address, 0) + if hasattr(hc, 'source_address'): # Python 2.7+ + hc.source_address = sa + else: # Python 2.6 + def _hc_connect(self, *args, **kwargs): + sock = compat_socket_create_connection( + (self.host, self.port), self.timeout, sa) + if is_https: + self.sock = ssl.wrap_socket( + sock, self.key_file, self.cert_file, + ssl_version=ssl.PROTOCOL_TLSv1) + else: + self.sock = sock + hc.connect = functools.partial(_hc_connect, hc) + + return hc + + +def handle_youtubedl_headers(headers): + filtered_headers = headers + + if 'Youtubedl-no-compression' in filtered_headers: + filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') + del filtered_headers['Youtubedl-no-compression'] + + return filtered_headers + + class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. @@ -625,7 +812,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): the standard headers to every HTTP request and handles gzipped and deflated responses from web servers. If compression is to be avoided in a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-No-Compression", which will be + to include the HTTP header "Youtubedl-no-compression", which will be removed before making the real request. Part of this code was copied from: @@ -636,6 +823,22 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): public domain. """ + def __init__(self, params, *args, **kwargs): + compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs) + self._params = params + + def http_open(self, req): + conn_class = compat_http_client.HTTPConnection + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + + return self.do_open(functools.partial( + _create_http_connection, self, conn_class, False), + req) + @staticmethod def deflate(data): try: @@ -652,19 +855,34 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): return ret def http_request(self, req): - for h,v in std_headers.items(): - if h in req.headers: - del req.headers[h] - req.add_header(h, v) - if 'Youtubedl-no-compression' in req.headers: - if 'Accept-encoding' in req.headers: - del req.headers['Accept-encoding'] - del req.headers['Youtubedl-no-compression'] - if 'Youtubedl-user-agent' in req.headers: - if 'User-agent' in req.headers: - del req.headers['User-agent'] - req.headers['User-agent'] = req.headers['Youtubedl-user-agent'] - del req.headers['Youtubedl-user-agent'] + # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not + # always respected by websites, some tend to give out URLs with non percent-encoded + # non-ASCII characters (see telemb.py, ard.py [#3412]) + # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) + # To work around aforementioned issue we will replace request's original URL with + # percent-encoded one + # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) + # the code of this workaround has been moved here from YoutubeDL.urlopen() + url = req.get_full_url() + url_escaped = escape_url(url) + + # Substitute URL if any change after escaping + if url != url_escaped: + req = update_Request(req, url=url_escaped) + + for h, v in std_headers.items(): + # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 + # The dict keys are capitalized because of this bug by urllib + if h.capitalize() not in req.headers: + req.add_header(h, v) + + req.headers = handle_youtubedl_headers(req.headers) + + if sys.version_info < (2, 7) and '#' in req.get_full_url(): + # Python 2.6 is brain-dead when it comes to fragments + req._Request__original = req._Request__original.partition('#')[0] + req._Request__r_type = req._Request__r_type.partition('#')[0] + return req def http_response(self, req, resp): @@ -689,57 +907,245 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): raise original_ioerror resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + del resp.headers['Content-encoding'] # deflate if resp.headers.get('Content-encoding', '') == 'deflate': gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + del resp.headers['Content-encoding'] + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see + # https://github.com/rg3/youtube-dl/issues/6457). + if 300 <= resp.code < 400: + location = resp.headers.get('Location') + if location: + # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 + if sys.version_info >= (3, 0): + location = location.encode('iso-8859-1').decode('utf-8') + else: + location = location.decode('utf-8') + location_escaped = escape_url(location) + if location != location_escaped: + del resp.headers['Location'] + if sys.version_info < (3, 0): + location_escaped = location_escaped.encode('utf-8') + resp.headers['Location'] = location_escaped return resp https_request = http_request https_response = http_response -def unified_strdate(date_str): + +def make_socks_conn_class(base_class, socks_proxy): + assert issubclass(base_class, ( + compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) + + url_components = compat_urlparse.urlparse(socks_proxy) + if url_components.scheme.lower() == 'socks5': + socks_type = ProxyType.SOCKS5 + elif url_components.scheme.lower() in ('socks', 'socks4'): + socks_type = ProxyType.SOCKS4 + elif url_components.scheme.lower() == 'socks4a': + socks_type = ProxyType.SOCKS4A + + def unquote_if_non_empty(s): + if not s: + return s + return compat_urllib_parse_unquote_plus(s) + + proxy_args = ( + socks_type, + url_components.hostname, url_components.port or 1080, + True, # Remote DNS + unquote_if_non_empty(url_components.username), + unquote_if_non_empty(url_components.password), + ) + + class SocksConnection(base_class): + def connect(self): + self.sock = sockssocket() + self.sock.setproxy(*proxy_args) + if type(self.timeout) in (int, float): + self.sock.settimeout(self.timeout) + self.sock.connect((self.host, self.port)) + + if isinstance(self, compat_http_client.HTTPSConnection): + if hasattr(self, '_context'): # Python > 2.6 + self.sock = self._context.wrap_socket( + self.sock, server_hostname=self.host) + else: + self.sock = ssl.wrap_socket(self.sock) + + return SocksConnection + + +class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): + def __init__(self, params, https_conn_class=None, *args, **kwargs): + compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs) + self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection + self._params = params + + def https_open(self, req): + kwargs = {} + conn_class = self._https_conn_class + + if hasattr(self, '_context'): # python > 2.6 + kwargs['context'] = self._context + if hasattr(self, '_check_hostname'): # python 3.x + kwargs['check_hostname'] = self._check_hostname + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + + return self.do_open(functools.partial( + _create_http_connection, self, conn_class, True), + req, **kwargs) + + +class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): + def __init__(self, cookiejar=None): + compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) + + def http_response(self, request, response): + # Python 2 will choke on next HTTP request in row if there are non-ASCII + # characters in Set-Cookie HTTP header of last response (see + # https://github.com/rg3/youtube-dl/issues/6769). + # In order to at least prevent crashing we will percent encode Set-Cookie + # header before HTTPCookieProcessor starts processing it. + # if sys.version_info < (3, 0) and response.headers: + # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'): + # set_cookie = response.headers.get(set_cookie_header) + # if set_cookie: + # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ") + # if set_cookie != set_cookie_escaped: + # del response.headers[set_cookie_header] + # response.headers[set_cookie_header] = set_cookie_escaped + return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response) + + https_request = compat_urllib_request.HTTPCookieProcessor.http_request + https_response = http_response + + +def extract_timezone(date_str): + m = re.search( + r'^.{8,}?(?PZ$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', + date_str) + if not m: + timezone = datetime.timedelta() + else: + date_str = date_str[:-len(m.group('tz'))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + return timezone, date_str + + +def parse_iso8601(date_str, delimiter='T', timezone=None): + """ Return a UNIX timestamp from the given date """ + + if date_str is None: + return None + + date_str = re.sub(r'\.[0-9]+', '', date_str) + + if timezone is None: + timezone, date_str = extract_timezone(date_str) + + try: + date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) + dt = datetime.datetime.strptime(date_str, date_format) - timezone + return calendar.timegm(dt.timetuple()) + except ValueError: + pass + + +def date_formats(day_first=True): + return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST + + +def unified_strdate(date_str, day_first=True): """Return a string with the date in the format YYYYMMDD""" + + if date_str is None: + return None upload_date = None - #Replace commas - date_str = date_str.replace(',',' ') - # %z (UTC offset) is only supported in python>=3.2 - date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) - format_expressions = [ - '%d %B %Y', - '%B %d %Y', - '%b %d %Y', - '%Y-%m-%d', - '%d/%m/%Y', - '%Y/%m/%d %H:%M:%S', - '%d.%m.%Y %H:%M', - '%Y-%m-%dT%H:%M:%SZ', - ] - for expression in format_expressions: + # Replace commas + date_str = date_str.replace(',', ' ') + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + _, date_str = extract_timezone(date_str) + + for expression in date_formats(day_first): try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') - except: + except ValueError: + pass + if upload_date is None: + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + try: + upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') + except ValueError: + pass + if upload_date is not None: + return compat_str(upload_date) + + +def unified_timestamp(date_str, day_first=True): + if date_str is None: + return None + + date_str = date_str.replace(',', ' ') + + pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 + timezone, date_str = extract_timezone(date_str) + + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + + for expression in date_formats(day_first): + try: + dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) + return calendar.timegm(dt.timetuple()) + except ValueError: pass - return upload_date + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + return calendar.timegm(timetuple) + pm_delta * 3600 -def determine_ext(url, default_ext=u'unknown_video'): - guess = url.partition(u'?')[0].rpartition(u'.')[2] + +def determine_ext(url, default_ext='unknown_video'): + if url is None: + return default_ext + guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): return guess + # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download + elif guess.rstrip('/') in KNOWN_EXTENSIONS: + return guess.rstrip('/') else: return default_ext + def subtitles_filename(filename, sub_lang, sub_format): - return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format + return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format + def date_from_str(date_str): """ Return a datetime object from a string in the format YYYYMMDD or (now|today)[+-][0-9](day|week|month|year)(s)?""" today = datetime.date.today() - if date_str == 'now'or date_str == 'today': + if date_str in ('now', 'today'): return today + if date_str == 'yesterday': + return today - datetime.timedelta(days=1) match = re.match('(now|today)(?P[+-])(?P