X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/1169b048fab1ef2e5cbad85340928b8c69dd2758..a8b640ed81fd63ade1773f4aa2f7808bc80175dd:/youtube_dl/utils.py diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 922e17e..2864e51 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,354 +1,1738 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import calendar +import codecs +import contextlib +import ctypes +import datetime +import email.utils +import errno +import getpass import gzip -import htmlentitydefs -import HTMLParser +import itertools +import io +import json import locale +import math import os +import pipes +import platform import re +import ssl +import socket +import struct +import subprocess import sys +import tempfile +import traceback +import xml.etree.ElementTree import zlib -import urllib2 -import email.utils -import json try: - import cStringIO as StringIO + import urllib.request as compat_urllib_request +except ImportError: # Python 2 + import urllib2 as compat_urllib_request + +try: + import urllib.error as compat_urllib_error +except ImportError: # Python 2 + import urllib2 as compat_urllib_error + +try: + import urllib.parse as compat_urllib_parse +except ImportError: # Python 2 + import urllib as compat_urllib_parse + +try: + from urllib.parse import urlparse as compat_urllib_parse_urlparse +except ImportError: # Python 2 + from urlparse import urlparse as compat_urllib_parse_urlparse + +try: + import urllib.parse as compat_urlparse +except ImportError: # Python 2 + import urlparse as compat_urlparse + +try: + import http.cookiejar as compat_cookiejar +except ImportError: # Python 2 + import cookielib as compat_cookiejar + +try: + import html.entities as compat_html_entities +except ImportError: # Python 2 + import htmlentitydefs as compat_html_entities + +try: + import html.parser as compat_html_parser +except ImportError: # Python 2 + import HTMLParser as compat_html_parser + +try: + import http.client as compat_http_client +except ImportError: # Python 2 + import httplib as compat_http_client + +try: + from urllib.error import HTTPError as compat_HTTPError +except ImportError: # Python 2 + from urllib2 import HTTPError as compat_HTTPError + +try: + from urllib.request import urlretrieve as compat_urlretrieve +except ImportError: # Python 2 + from urllib import urlretrieve as compat_urlretrieve + + +try: + from subprocess import DEVNULL + compat_subprocess_get_DEVNULL = lambda: DEVNULL except ImportError: - import StringIO + compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') + +try: + from urllib.parse import unquote as compat_urllib_parse_unquote +except ImportError: + def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): + if string == '': + return string + res = string.split('%') + if len(res) == 1: + return string + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'replace' + # pct_sequence: contiguous sequence of percent-encoded bytes, decoded + pct_sequence = b'' + string = res[0] + for item in res[1:]: + try: + if not item: + raise ValueError + pct_sequence += item[:2].decode('hex') + rest = item[2:] + if not rest: + # This segment was just a single percent-encoded character. + # May be part of a sequence of code units, so delay decoding. + # (Stored in pct_sequence). + continue + except ValueError: + rest = '%' + item + # Encountered non-percent-encoded characters. Flush the current + # pct_sequence. + string += pct_sequence.decode(encoding, errors) + rest + pct_sequence = b'' + if pct_sequence: + # Flush the final pct_sequence + string += pct_sequence.decode(encoding, errors) + return string + + +try: + from urllib.parse import parse_qs as compat_parse_qs +except ImportError: # Python 2 + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + qs, _coerce_result = qs, unicode + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value and not strict_parsing: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + if strict_parsing: + raise ValueError("bad query field: %r" % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = compat_urllib_parse_unquote( + name, encoding=encoding, errors=errors) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = compat_urllib_parse_unquote( + value, encoding=encoding, errors=errors) + value = _coerce_result(value) + r.append((name, value)) + return r + + def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + parsed_result = {} + pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, + encoding=encoding, errors=errors) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result + +try: + compat_str = unicode # Python 2 +except NameError: + compat_str = str + +try: + compat_chr = unichr # Python 2 +except NameError: + compat_chr = chr + +try: + from xml.etree.ElementTree import ParseError as compat_xml_parse_error +except ImportError: # Python 2.6 + from xml.parsers.expat import ExpatError as compat_xml_parse_error + +try: + from shlex import quote as shlex_quote +except ImportError: # Python < 3.3 + def shlex_quote(s): + return "'" + s.replace("'", "'\"'\"'") + "'" + + +def compat_ord(c): + if type(c) is int: return c + else: return ord(c) + + +if sys.version_info >= (3, 0): + compat_getenv = os.getenv + compat_expanduser = os.path.expanduser +else: + # Environment variables should be decoded with filesystem encoding. + # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) + + def compat_getenv(key, default=None): + env = os.getenv(key, default) + if env: + env = env.decode(get_filesystem_encoding()) + return env + + # HACK: The default implementations of os.path.expanduser from cpython do not decode + # environment variables with filesystem encoding. We will work around this by + # providing adjusted implementations. + # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib + # for different platforms with correct environment variables decoding. + + if os.name == 'posix': + def compat_expanduser(path): + """Expand ~ and ~user constructions. If user or $HOME is unknown, + do nothing.""" + if not path.startswith('~'): + return path + i = path.find('/', 1) + if i < 0: + i = len(path) + if i == 1: + if 'HOME' not in os.environ: + import pwd + userhome = pwd.getpwuid(os.getuid()).pw_dir + else: + userhome = compat_getenv('HOME') + else: + import pwd + try: + pwent = pwd.getpwnam(path[1:i]) + except KeyError: + return path + userhome = pwent.pw_dir + userhome = userhome.rstrip('/') + return (userhome + path[i:]) or '/' + elif os.name == 'nt' or os.name == 'ce': + def compat_expanduser(path): + """Expand ~ and ~user constructs. + + If user or $HOME is unknown, do nothing.""" + if path[:1] != '~': + return path + i, n = 1, len(path) + while i < n and path[i] not in '/\\': + i = i + 1 + + if 'HOME' in os.environ: + userhome = compat_getenv('HOME') + elif 'USERPROFILE' in os.environ: + userhome = compat_getenv('USERPROFILE') + elif not 'HOMEPATH' in os.environ: + return path + else: + try: + drive = compat_getenv('HOMEDRIVE') + except KeyError: + drive = '' + userhome = os.path.join(drive, compat_getenv('HOMEPATH')) + + if i != 1: #~user + userhome = os.path.join(os.path.dirname(userhome), path[1:i]) + + return userhome + path[i:] + else: + compat_expanduser = os.path.expanduser + + +# This is not clearly defined otherwise +compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1', - 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate', - 'Accept-Language': 'en-us,en;q=0.5', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)', + 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate', + 'Accept-Language': 'en-us,en;q=0.5', } def preferredencoding(): - """Get preferred encoding. - - Returns the best encoding scheme for the system, based on - locale.getpreferredencoding() and some further tweaks. - """ - def yield_preferredencoding(): - try: - pref = locale.getpreferredencoding() - u'TEST'.encode(pref) - except: - pref = 'UTF-8' - while True: - yield pref - return yield_preferredencoding().next() - - -def htmlentity_transform(matchobj): - """Transforms an HTML entity to a Unicode character. - - This function receives a match object and is intended to be used with - the re.sub() function. - """ - entity = matchobj.group(1) - - # Known non-numeric HTML entity - if entity in htmlentitydefs.name2codepoint: - return unichr(htmlentitydefs.name2codepoint[entity]) - - # Unicode character - mobj = re.match(ur'(?u)#(x?\d+)', entity) - if mobj is not None: - numstr = mobj.group(1) - if numstr.startswith(u'x'): - base = 16 - numstr = u'0%s' % numstr - else: - base = 10 - return unichr(long(numstr, base)) - - # Unknown entity in name, return its literal representation - return (u'&%s;' % entity) - -HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class IDParser(HTMLParser.HTMLParser): - """Modified HTMLParser that isolates a tag with the specified id""" - def __init__(self, id): - self.id = id - self.result = None - self.started = False - self.depth = {} - self.html = None - self.watch_startpos = False - self.error_count = 0 - HTMLParser.HTMLParser.__init__(self) - - def error(self, message): - print >> sys.stderr, self.getpos() - if self.error_count > 10 or self.started: - raise HTMLParser.HTMLParseError(message, self.getpos()) - self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line - self.error_count += 1 - self.goahead(1) - - def loads(self, html): - self.html = html - self.feed(html) - self.close() - - def handle_starttag(self, tag, attrs): - attrs = dict(attrs) - if self.started: - self.find_startpos(None) - if 'id' in attrs and attrs['id'] == self.id: - self.result = [tag] - self.started = True - self.watch_startpos = True - if self.started: - if not tag in self.depth: self.depth[tag] = 0 - self.depth[tag] += 1 - - def handle_endtag(self, tag): - if self.started: - if tag in self.depth: self.depth[tag] -= 1 - if self.depth[self.result[0]] == 0: - self.started = False - self.result.append(self.getpos()) - - def find_startpos(self, x): - """Needed to put the start position of the result (self.result[1]) - after the opening tag with the requested id""" - if self.watch_startpos: - self.watch_startpos = False - self.result.append(self.getpos()) - handle_entityref = handle_charref = handle_data = handle_comment = \ - handle_decl = handle_pi = unknown_decl = find_startpos - - def get_result(self): - if self.result == None: return None - if len(self.result) != 3: return None - lines = self.html.split('\n') - lines = lines[self.result[1][0]-1:self.result[2][0]] - lines[0] = lines[0][self.result[1][1]:] - if len(lines) == 1: - lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] - lines[-1] = lines[-1][:self.result[2][1]] - return '\n'.join(lines).strip() + """Get preferred encoding. + + Returns the best encoding scheme for the system, based on + locale.getpreferredencoding() and some further tweaks. + """ + try: + pref = locale.getpreferredencoding() + u'TEST'.encode(pref) + except: + pref = 'UTF-8' + + return pref + +if sys.version_info < (3,0): + def compat_print(s): + print(s.encode(preferredencoding(), 'xmlcharrefreplace')) +else: + def compat_print(s): + assert type(s) == type(u'') + print(s) + + +def write_json_file(obj, fn): + """ Encode obj as JSON and write it to fn, atomically """ + + args = { + 'suffix': '.tmp', + 'prefix': os.path.basename(fn) + '.', + 'dir': os.path.dirname(fn), + 'delete': False, + } + + # In Python 2.x, json.dump expects a bytestream. + # In Python 3.x, it writes to a character stream + if sys.version_info < (3, 0): + args['mode'] = 'wb' + else: + args.update({ + 'mode': 'w', + 'encoding': 'utf-8', + }) + + tf = tempfile.NamedTemporaryFile(**args) + + try: + with tf: + json.dump(obj, tf) + os.rename(tf.name, fn) + except: + try: + os.remove(tf.name) + except OSError: + pass + raise + + +if sys.version_info >= (2, 7): + def find_xpath_attr(node, xpath, key, val): + """ Find the xpath xpath[@key=val] """ + assert re.match(r'^[a-zA-Z-]+$', key) + assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) + expr = xpath + u"[@%s='%s']" % (key, val) + return node.find(expr) +else: + def find_xpath_attr(node, xpath, key, val): + # Here comes the crazy part: In 2.6, if the xpath is a unicode, + # .//node does not match if a node is a direct child of . ! + if isinstance(xpath, unicode): + xpath = xpath.encode('ascii') + + for f in node.findall(xpath): + if f.attrib.get(key) == val: + return f + return None + +# On python2.6 the xml.etree.ElementTree.Element methods don't support +# the namespace parameter +def xpath_with_ns(path, ns_map): + components = [c.split(':') for c in path.split('/')] + replaced = [] + for c in components: + if len(c) == 1: + replaced.append(c[0]) + else: + ns, tag = c + replaced.append('{%s}%s' % (ns_map[ns], tag)) + return '/'.join(replaced) + + +def xpath_text(node, xpath, name=None, fatal=False): + if sys.version_info < (2, 7): # Crazy 2.6 + xpath = xpath.encode('ascii') + + n = node.find(xpath) + if n is None: + if fatal: + name = xpath if name is None else name + raise ExtractorError('Could not find XML element %s' % name) + else: + return None + return n.text + + +compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix +class BaseHTMLParser(compat_html_parser.HTMLParser): + def __init(self): + compat_html_parser.HTMLParser.__init__(self) + self.html = None + + def loads(self, html): + self.html = html + self.feed(html) + self.close() + +class AttrParser(BaseHTMLParser): + """Modified HTMLParser that isolates a tag with the specified attribute""" + def __init__(self, attribute, value): + self.attribute = attribute + self.value = value + self.result = None + self.started = False + self.depth = {} + self.watch_startpos = False + self.error_count = 0 + BaseHTMLParser.__init__(self) + + def error(self, message): + if self.error_count > 10 or self.started: + raise compat_html_parser.HTMLParseError(message, self.getpos()) + self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line + self.error_count += 1 + self.goahead(1) + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + if self.started: + self.find_startpos(None) + if self.attribute in attrs and attrs[self.attribute] == self.value: + self.result = [tag] + self.started = True + self.watch_startpos = True + if self.started: + if not tag in self.depth: self.depth[tag] = 0 + self.depth[tag] += 1 + + def handle_endtag(self, tag): + if self.started: + if tag in self.depth: self.depth[tag] -= 1 + if self.depth[self.result[0]] == 0: + self.started = False + self.result.append(self.getpos()) + + def find_startpos(self, x): + """Needed to put the start position of the result (self.result[1]) + after the opening tag with the requested id""" + if self.watch_startpos: + self.watch_startpos = False + self.result.append(self.getpos()) + handle_entityref = handle_charref = handle_data = handle_comment = \ + handle_decl = handle_pi = unknown_decl = find_startpos + + def get_result(self): + if self.result is None: + return None + if len(self.result) != 3: + return None + lines = self.html.split('\n') + lines = lines[self.result[1][0]-1:self.result[2][0]] + lines[0] = lines[0][self.result[1][1]:] + if len(lines) == 1: + lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] + lines[-1] = lines[-1][:self.result[2][1]] + return '\n'.join(lines).strip() +# Hack for https://github.com/rg3/youtube-dl/issues/662 +if sys.version_info < (2, 7, 3): + AttrParser.parse_endtag = (lambda self, i: + i + len("") + if self.rawdata[i:].startswith("") + else compat_html_parser.HTMLParser.parse_endtag(self, i)) def get_element_by_id(id, html): - """Return the content of the tag with the specified id in the passed HTML document""" - parser = IDParser(id) - try: - parser.loads(html) - except HTMLParser.HTMLParseError: - pass - return parser.get_result() + """Return the content of the tag with the specified ID in the passed HTML document""" + return get_element_by_attribute("id", id, html) + +def get_element_by_attribute(attribute, value, html): + """Return the content of the tag with the specified attribute in the passed HTML document""" + parser = AttrParser(attribute, value) + try: + parser.loads(html) + except compat_html_parser.HTMLParseError: + pass + return parser.get_result() + +class MetaParser(BaseHTMLParser): + """ + Modified HTMLParser that isolates a meta tag with the specified name + attribute. + """ + def __init__(self, name): + BaseHTMLParser.__init__(self) + self.name = name + self.content = None + self.result = None + + def handle_starttag(self, tag, attrs): + if tag != 'meta': + return + attrs = dict(attrs) + if attrs.get('name') == self.name: + self.result = attrs.get('content') + + def get_result(self): + return self.result + +def get_meta_content(name, html): + """ + Return the content attribute from the meta tag with the given name attribute. + """ + parser = MetaParser(name) + try: + parser.loads(html) + except compat_html_parser.HTMLParseError: + pass + return parser.get_result() def clean_html(html): - """Clean an HTML snippet into a readable string""" - # Newline vs
- html = html.replace('\n', ' ') - html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html) - # Strip html tags - html = re.sub('<.*?>', '', html) - # Replace html entities - html = unescapeHTML(html) - return html + """Clean an HTML snippet into a readable string""" + # Newline vs
+ html = html.replace('\n', ' ') + html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) + # Strip html tags + html = re.sub('<.*?>', '', html) + # Replace html entities + html = unescapeHTML(html) + return html.strip() def sanitize_open(filename, open_mode): - """Try to open the given filename, and slightly tweak it if this fails. - - Attempts to open the given filename. If this fails, it tries to change - the filename slightly, step by step, until it's either able to open it - or it fails and raises a final exception, like the standard open() - function. - - It returns the tuple (stream, definitive_file_name). - """ - try: - if filename == u'-': - if sys.platform == 'win32': - import msvcrt - msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) - return (sys.stdout, filename) - stream = open(encodeFilename(filename), open_mode) - return (stream, filename) - except (IOError, OSError), err: - # In case of error, try to remove win32 forbidden chars - filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename) - - # An exception here should be caught in the caller - stream = open(encodeFilename(filename), open_mode) - return (stream, filename) + """Try to open the given filename, and slightly tweak it if this fails. + + Attempts to open the given filename. If this fails, it tries to change + the filename slightly, step by step, until it's either able to open it + or it fails and raises a final exception, like the standard open() + function. + + It returns the tuple (stream, definitive_file_name). + """ + try: + if filename == u'-': + if sys.platform == 'win32': + import msvcrt + msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) + return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) + stream = open(encodeFilename(filename), open_mode) + return (stream, filename) + except (IOError, OSError) as err: + if err.errno in (errno.EACCES,): + raise + + # In case of error, try to remove win32 forbidden chars + alt_filename = os.path.join( + re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part) + for path_part in os.path.split(filename) + ) + if alt_filename == filename: + raise + else: + # An exception here should be caught in the caller + stream = open(encodeFilename(filename), open_mode) + return (stream, alt_filename) def timeconvert(timestr): - """Convert RFC 2822 defined time string into system timestamp""" - timestamp = None - timetuple = email.utils.parsedate_tz(timestr) - if timetuple is not None: - timestamp = email.utils.mktime_tz(timetuple) - return timestamp - -def sanitize_filename(s): - """Sanitizes a string so it could be used as part of a filename.""" - def replace_insane(char): - if char in u' .\\/|?*<>:"' or ord(char) < 32: - return '_' - return char - return u''.join(map(replace_insane, s)).strip('_') + """Convert RFC 2822 defined time string into system timestamp""" + timestamp = None + timetuple = email.utils.parsedate_tz(timestr) + if timetuple is not None: + timestamp = email.utils.mktime_tz(timetuple) + return timestamp + +def sanitize_filename(s, restricted=False, is_id=False): + """Sanitizes a string so it could be used as part of a filename. + If restricted is set, use a stricter subset of allowed characters. + Set is_id if this is not an arbitrary string, but an ID that should be kept if possible + """ + def replace_insane(char): + if char == '?' or ord(char) < 32 or ord(char) == 127: + return '' + elif char == '"': + return '' if restricted else '\'' + elif char == ':': + return '_-' if restricted else ' -' + elif char in '\\/|*<>': + return '_' + if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()): + return '_' + if restricted and ord(char) > 127: + return '_' + return char + + result = u''.join(map(replace_insane, s)) + if not is_id: + while '__' in result: + result = result.replace('__', '_') + result = result.strip('_') + # Common case of "Foreign band name - English song title" + if restricted and result.startswith('-_'): + result = result[2:] + if not result: + result = '_' + return result def orderedSet(iterable): - """ Remove all duplicates from the input iterable """ - res = [] - for el in iterable: - if el not in res: - res.append(el) - return res + """ Remove all duplicates from the input iterable """ + res = [] + for el in iterable: + if el not in res: + res.append(el) + return res + + +def _htmlentity_transform(entity): + """Transforms an HTML entity to a character.""" + # Known non-numeric HTML entity + if entity in compat_html_entities.name2codepoint: + return compat_chr(compat_html_entities.name2codepoint[entity]) + + mobj = re.match(r'#(x?[0-9]+)', entity) + if mobj is not None: + numstr = mobj.group(1) + if numstr.startswith(u'x'): + base = 16 + numstr = u'0%s' % numstr + else: + base = 10 + return compat_chr(int(numstr, base)) + + # Unknown entity in name, return its literal representation + return (u'&%s;' % entity) + def unescapeHTML(s): - """ - @param s a string (of type unicode) - """ - assert type(s) == type(u'') + if s is None: + return None + assert type(s) == compat_str + + return re.sub( + r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s) + + +def encodeFilename(s, for_subprocess=False): + """ + @param s The name of the file + """ + + assert type(s) == compat_str + + # Python 3 has a Unicode API + if sys.version_info >= (3, 0): + return s + + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + # Pass u'' directly to use Unicode APIs on Windows 2000 and up + # (Detecting Windows NT 4 is tricky because 'major >= 4' would + # match Windows 9x series as well. Besides, NT 4 is obsolete.) + if not for_subprocess: + return s + else: + # For subprocess calls, encode with locale encoding + # Refer to http://stackoverflow.com/a/9951851/35070 + encoding = preferredencoding() + else: + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + return s.encode(encoding, 'ignore') + + +def encodeArgument(s): + if not isinstance(s, compat_str): + # Legacy code that uses byte strings + # Uncomment the following line after fixing all post processors + #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) + s = s.decode('ascii') + return encodeFilename(s, True) + + +def decodeOption(optval): + if optval is None: + return optval + if isinstance(optval, bytes): + optval = optval.decode(preferredencoding()) + + assert isinstance(optval, compat_str) + return optval + +def formatSeconds(secs): + if secs > 3600: + return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60) + elif secs > 60: + return '%d:%02d' % (secs // 60, secs % 60) + else: + return '%d' % secs - result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s) - return result -def encodeFilename(s): - """ - @param s The name of the file (of type unicode) - """ +def make_HTTPS_handler(opts_no_check_certificate, **kwargs): + if sys.version_info < (3, 2): + import httplib - assert type(s) == type(u'') + class HTTPSConnectionV3(httplib.HTTPSConnection): + def __init__(self, *args, **kwargs): + httplib.HTTPSConnection.__init__(self, *args, **kwargs) + + def connect(self): + sock = socket.create_connection((self.host, self.port), self.timeout) + if getattr(self, '_tunnel_host', False): + self.sock = sock + self._tunnel() + try: + self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1) + except ssl.SSLError: + self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23) + + class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler): + def https_open(self, req): + return self.do_open(HTTPSConnectionV3, req) + return HTTPSHandlerV3(**kwargs) + elif hasattr(ssl, 'create_default_context'): # Python >= 3.4 + context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3 + if opts_no_check_certificate: + context.verify_mode = ssl.CERT_NONE + return compat_urllib_request.HTTPSHandler(context=context, **kwargs) + else: # Python < 3.4 + context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) + context.verify_mode = (ssl.CERT_NONE + if opts_no_check_certificate + else ssl.CERT_REQUIRED) + context.set_default_verify_paths() + try: + context.load_default_certs() + except AttributeError: + pass # Python < 3.4 + return compat_urllib_request.HTTPSHandler(context=context, **kwargs) + +class ExtractorError(Exception): + """Error during info extraction.""" + def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): + """ tb, if given, is the original traceback (so that it can be printed out). + If expected is set, this is a normal error message and most likely not a bug in youtube-dl. + """ + + if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): + expected = True + if video_id is not None: + msg = video_id + ': ' + msg + if cause: + msg += u' (caused by %r)' % cause + if not expected: + msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.' + super(ExtractorError, self).__init__(msg) + + self.traceback = tb + self.exc_info = sys.exc_info() # preserve original exception + self.cause = cause + self.video_id = video_id + + def format_traceback(self): + if self.traceback is None: + return None + return u''.join(traceback.format_tb(self.traceback)) + + +class RegexNotFoundError(ExtractorError): + """Error when a regex didn't match""" + pass - if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - # Pass u'' directly to use Unicode APIs on Windows 2000 and up - # (Detecting Windows NT 4 is tricky because 'major >= 4' would - # match Windows 9x series as well. Besides, NT 4 is obsolete.) - return s - else: - return s.encode(sys.getfilesystemencoding(), 'ignore') class DownloadError(Exception): - """Download Error exception. + """Download Error exception. - This exception may be thrown by FileDownloader objects if they are not - configured to continue on errors. They will contain the appropriate - error message. - """ - pass + This exception may be thrown by FileDownloader objects if they are not + configured to continue on errors. They will contain the appropriate + error message. + """ + def __init__(self, msg, exc_info=None): + """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ + super(DownloadError, self).__init__(msg) + self.exc_info = exc_info class SameFileError(Exception): - """Same File exception. + """Same File exception. - This exception will be thrown by FileDownloader objects if they detect - multiple files would have to be downloaded to the same file on disk. - """ - pass + This exception will be thrown by FileDownloader objects if they detect + multiple files would have to be downloaded to the same file on disk. + """ + pass class PostProcessingError(Exception): - """Post Processing exception. + """Post Processing exception. - This exception may be raised by PostProcessor's .run() method to - indicate an error in the postprocessing task. - """ - pass + This exception may be raised by PostProcessor's .run() method to + indicate an error in the postprocessing task. + """ + def __init__(self, msg): + self.msg = msg class MaxDownloadsReached(Exception): - """ --max-downloads limit has been reached. """ - pass + """ --max-downloads limit has been reached. """ + pass class UnavailableVideoError(Exception): - """Unavailable Format exception. + """Unavailable Format exception. - This exception will be thrown when a video is requested - in a format that is not available for that video. - """ - pass + This exception will be thrown when a video is requested + in a format that is not available for that video. + """ + pass class ContentTooShortError(Exception): - """Content Too Short exception. - - This exception may be raised by FileDownloader objects when a file they - download is too small for what the server announced first, indicating - the connection was probably interrupted. - """ - # Both in bytes - downloaded = None - expected = None - - def __init__(self, downloaded, expected): - self.downloaded = downloaded - self.expected = expected - - -class Trouble(Exception): - """Trouble helper exception - - This is an exception to be handled with - FileDownloader.trouble - """ - -class YoutubeDLHandler(urllib2.HTTPHandler): - """Handler for HTTP requests and responses. - - This class, when installed with an OpenerDirector, automatically adds - the standard headers to every HTTP request and handles gzipped and - deflated responses from web servers. If compression is to be avoided in - a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-No-Compression", which will be - removed before making the real request. - - Part of this code was copied from: - - http://techknack.net/python-urllib2-handlers/ - - Andrew Rowls, the author of that code, agreed to release it to the - public domain. - """ - - @staticmethod - def deflate(data): - try: - return zlib.decompress(data, -zlib.MAX_WBITS) - except zlib.error: - return zlib.decompress(data) - - @staticmethod - def addinfourl_wrapper(stream, headers, url, code): - if hasattr(urllib2.addinfourl, 'getcode'): - return urllib2.addinfourl(stream, headers, url, code) - ret = urllib2.addinfourl(stream, headers, url) - ret.code = code - return ret - - def http_request(self, req): - for h in std_headers: - if h in req.headers: - del req.headers[h] - req.add_header(h, std_headers[h]) - if 'Youtubedl-no-compression' in req.headers: - if 'Accept-encoding' in req.headers: - del req.headers['Accept-encoding'] - del req.headers['Youtubedl-no-compression'] - return req - - def http_response(self, req, resp): - old_resp = resp - # gzip - if resp.headers.get('Content-encoding', '') == 'gzip': - gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r') - resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - # deflate - if resp.headers.get('Content-encoding', '') == 'deflate': - gz = StringIO.StringIO(self.deflate(resp.read())) - resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - return resp + """Content Too Short exception. + + This exception may be raised by FileDownloader objects when a file they + download is too small for what the server announced first, indicating + the connection was probably interrupted. + """ + # Both in bytes + downloaded = None + expected = None + + def __init__(self, downloaded, expected): + self.downloaded = downloaded + self.expected = expected + +class YoutubeDLHandler(compat_urllib_request.HTTPHandler): + """Handler for HTTP requests and responses. + + This class, when installed with an OpenerDirector, automatically adds + the standard headers to every HTTP request and handles gzipped and + deflated responses from web servers. If compression is to be avoided in + a particular request, the original request in the program code only has + to include the HTTP header "Youtubedl-No-Compression", which will be + removed before making the real request. + + Part of this code was copied from: + + http://techknack.net/python-urllib2-handlers/ + + Andrew Rowls, the author of that code, agreed to release it to the + public domain. + """ + + @staticmethod + def deflate(data): + try: + return zlib.decompress(data, -zlib.MAX_WBITS) + except zlib.error: + return zlib.decompress(data) + + @staticmethod + def addinfourl_wrapper(stream, headers, url, code): + if hasattr(compat_urllib_request.addinfourl, 'getcode'): + return compat_urllib_request.addinfourl(stream, headers, url, code) + ret = compat_urllib_request.addinfourl(stream, headers, url) + ret.code = code + return ret + + def http_request(self, req): + for h, v in std_headers.items(): + if h not in req.headers: + req.add_header(h, v) + if 'Youtubedl-no-compression' in req.headers: + if 'Accept-encoding' in req.headers: + del req.headers['Accept-encoding'] + del req.headers['Youtubedl-no-compression'] + if 'Youtubedl-user-agent' in req.headers: + if 'User-agent' in req.headers: + del req.headers['User-agent'] + req.headers['User-agent'] = req.headers['Youtubedl-user-agent'] + del req.headers['Youtubedl-user-agent'] + + if sys.version_info < (2, 7) and '#' in req.get_full_url(): + # Python 2.6 is brain-dead when it comes to fragments + req._Request__original = req._Request__original.partition('#')[0] + req._Request__r_type = req._Request__r_type.partition('#')[0] + + return req + + def http_response(self, req, resp): + old_resp = resp + # gzip + if resp.headers.get('Content-encoding', '') == 'gzip': + content = resp.read() + gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') + try: + uncompressed = io.BytesIO(gz.read()) + except IOError as original_ioerror: + # There may be junk add the end of the file + # See http://stackoverflow.com/q/4928560/35070 for details + for i in range(1, 1024): + try: + gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') + uncompressed = io.BytesIO(gz.read()) + except IOError: + continue + break + else: + raise original_ioerror + resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + # deflate + if resp.headers.get('Content-encoding', '') == 'deflate': + gz = io.BytesIO(self.deflate(resp.read())) + resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + return resp + + https_request = http_request + https_response = http_response + + +def parse_iso8601(date_str, delimiter='T'): + """ Return a UNIX timestamp from the given date """ + + if date_str is None: + return None + + m = re.search( + r'(\.[0-9]+)?(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', + date_str) + if not m: + timezone = datetime.timedelta() + else: + date_str = date_str[:-len(m.group(0))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) + dt = datetime.datetime.strptime(date_str, date_format) - timezone + return calendar.timegm(dt.timetuple()) + + +def unified_strdate(date_str): + """Return a string with the date in the format YYYYMMDD""" + + if date_str is None: + return None + + upload_date = None + #Replace commas + date_str = date_str.replace(',', ' ') + # %z (UTC offset) is only supported in python>=3.2 + date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) + format_expressions = [ + '%d %B %Y', + '%d %b %Y', + '%B %d %Y', + '%b %d %Y', + '%b %dst %Y %I:%M%p', + '%b %dnd %Y %I:%M%p', + '%b %dth %Y %I:%M%p', + '%Y-%m-%d', + '%Y/%m/%d', + '%d.%m.%Y', + '%d/%m/%Y', + '%d/%m/%y', + '%Y/%m/%d %H:%M:%S', + '%d/%m/%Y %H:%M:%S', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S.%f', + '%d.%m.%Y %H:%M', + '%d.%m.%Y %H.%M', + '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S.%fZ', + '%Y-%m-%dT%H:%M:%S.%f0Z', + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S.%f', + '%Y-%m-%dT%H:%M', + ] + for expression in format_expressions: + try: + upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') + except ValueError: + pass + if upload_date is None: + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') + return upload_date + +def determine_ext(url, default_ext=u'unknown_video'): + if url is None: + return default_ext + guess = url.partition(u'?')[0].rpartition(u'.')[2] + if re.match(r'^[A-Za-z0-9]+$', guess): + return guess + else: + return default_ext + +def subtitles_filename(filename, sub_lang, sub_format): + return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format + +def date_from_str(date_str): + """ + Return a datetime object from a string in the format YYYYMMDD or + (now|today)[+-][0-9](day|week|month|year)(s)?""" + today = datetime.date.today() + if date_str == 'now'or date_str == 'today': + return today + match = re.match('(now|today)(?P[+-])(?P