X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/d4ff594119bc679aa175947eb59a97bee8f966f4..19fba829d2a7c21a394804be7ea2595b540f4ebd:/youtube_dl/utils.py diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8f5463f..942f76d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -17,6 +17,7 @@ import io import json import locale import math +import operator import os import pipes import platform @@ -34,9 +35,9 @@ import zlib from .compat import ( compat_basestring, compat_chr, - compat_getenv, compat_html_entities, compat_http_client, + compat_kwargs, compat_parse_qs, compat_socket_create_connection, compat_str, @@ -53,7 +54,7 @@ from .compat import ( compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -61,6 +62,13 @@ std_headers = { } +NO_DEFAULT = object() + +ENGLISH_MONTH_NAMES = [ + 'January', 'February', 'March', 'April', 'May', 'June', + 'July', 'August', 'September', 'October', 'November', 'December'] + + def preferredencoding(): """Get preferred encoding. @@ -70,7 +78,7 @@ def preferredencoding(): try: pref = locale.getpreferredencoding() 'TEST'.encode(pref) - except: + except Exception: pref = 'UTF-8' return pref @@ -109,7 +117,7 @@ def write_json_file(obj, fn): 'encoding': 'utf-8', }) - tf = tempfile.NamedTemporaryFile(**args) + tf = tempfile.NamedTemporaryFile(**compat_kwargs(args)) try: with tf: @@ -122,7 +130,7 @@ def write_json_file(obj, fn): except OSError: pass os.rename(tf.name, fn) - except: + except Exception: try: os.remove(tf.name) except OSError: @@ -165,13 +173,15 @@ def xpath_with_ns(path, ns_map): return '/'.join(replaced) -def xpath_text(node, xpath, name=None, fatal=False): +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): if sys.version_info < (2, 7): # Crazy 2.6 xpath = xpath.encode('ascii') n = node.find(xpath) if n is None or n.text is None: - if fatal: + if default is not NO_DEFAULT: + return default + elif fatal: name = xpath if name is None else name raise ExtractorError('Could not find XML element %s' % name) else: @@ -247,15 +257,12 @@ def sanitize_open(filename, open_mode): raise # In case of error, try to remove win32 forbidden chars - alt_filename = os.path.join( - re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part) - for path_part in os.path.split(filename) - ) + alt_filename = sanitize_path(filename) if alt_filename == filename: raise else: # An exception here should be caught in the caller - stream = open(encodeFilename(filename), open_mode) + stream = open(encodeFilename(alt_filename), open_mode) return (stream, alt_filename) @@ -298,11 +305,32 @@ def sanitize_filename(s, restricted=False, is_id=False): # Common case of "Foreign band name - English song title" if restricted and result.startswith('-_'): result = result[2:] + if result.startswith('-'): + result = '_' + result[len('-'):] + result = result.lstrip('.') if not result: result = '_' return result +def sanitize_path(s): + """Sanitizes and normalizes path on Windows""" + if sys.platform != 'win32': + return s + drive_or_unc, _ = os.path.splitdrive(s) + if sys.version_info < (2, 7) and not drive_or_unc: + drive_or_unc, _ = os.path.splitunc(s) + norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) + if drive_or_unc: + norm_path.pop(0) + sanitized_path = [ + path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) + for path_part in norm_path] + if drive_or_unc: + sanitized_path.insert(0, drive_or_unc + os.path.sep) + return os.path.join(*sanitized_path) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -318,7 +346,7 @@ def _htmlentity_transform(entity): if entity in compat_html_entities.name2codepoint: return compat_chr(compat_html_entities.name2codepoint[entity]) - mobj = re.match(r'#(x?[0-9]+)', entity) + mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) if mobj is not None: numstr = mobj.group(1) if numstr.startswith('x'): @@ -341,6 +369,18 @@ def unescapeHTML(s): r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s) +def get_subprocess_encoding(): + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + # For subprocess calls, encode with locale encoding + # Refer to http://stackoverflow.com/a/9951851/35070 + encoding = preferredencoding() + else: + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + return encoding + + def encodeFilename(s, for_subprocess=False): """ @param s The name of the file @@ -352,21 +392,24 @@ def encodeFilename(s, for_subprocess=False): if sys.version_info >= (3, 0): return s - if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - # Pass '' directly to use Unicode APIs on Windows 2000 and up - # (Detecting Windows NT 4 is tricky because 'major >= 4' would - # match Windows 9x series as well. Besides, NT 4 is obsolete.) - if not for_subprocess: - return s - else: - # For subprocess calls, encode with locale encoding - # Refer to http://stackoverflow.com/a/9951851/35070 - encoding = preferredencoding() - else: - encoding = sys.getfilesystemencoding() - if encoding is None: - encoding = 'utf-8' - return s.encode(encoding, 'ignore') + # Pass '' directly to use Unicode APIs on Windows 2000 and up + # (Detecting Windows NT 4 is tricky because 'major >= 4' would + # match Windows 9x series as well. Besides, NT 4 is obsolete.) + if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + return s + + return s.encode(get_subprocess_encoding(), 'ignore') + + +def decodeFilename(b, for_subprocess=False): + + if sys.version_info >= (3, 0): + return b + + if not isinstance(b, bytes): + return b + + return b.decode(get_subprocess_encoding(), 'ignore') def encodeArgument(s): @@ -378,6 +421,10 @@ def encodeArgument(s): return encodeFilename(s, True) +def decodeArgument(b): + return decodeFilename(b, True) + + def decodeOption(optval): if optval is None: return optval @@ -422,6 +469,17 @@ def make_HTTPS_handler(params, **kwargs): return YoutubeDLHTTPSHandler(params, context=context, **kwargs) +def bug_reports_message(): + if ytdl_is_updateable(): + update_cmd = 'type youtube-dl -U to update' + else: + update_cmd = 'see https://yt-dl.org/update on how to update' + msg = '; please report this issue on https://yt-dl.org/bug .' + msg += ' Make sure you are using the latest version; %s.' % update_cmd + msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.' + return msg + + class ExtractorError(Exception): """Error during info extraction.""" @@ -437,13 +495,7 @@ class ExtractorError(Exception): if cause: msg += ' (caused by %r)' % cause if not expected: - if ytdl_is_updateable(): - update_cmd = 'type youtube-dl -U to update' - else: - update_cmd = 'see https://yt-dl.org/update on how to update' - msg += '; please report this issue on https://yt-dl.org/bug .' - msg += ' Make sure you are using the latest version; %s.' % update_cmd - msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.' + msg += bug_reports_message() super(ExtractorError, self).__init__(msg) self.traceback = tb @@ -665,26 +717,27 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): req, **kwargs) -def parse_iso8601(date_str, delimiter='T'): +def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ if date_str is None: return None - m = re.search( - r'(\.[0-9]+)?(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', - date_str) - if not m: - timezone = datetime.timedelta() - else: - date_str = date_str[:-len(m.group(0))] - if not m.group('sign'): + if timezone is None: + m = re.search( + r'(\.[0-9]+)?(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', + date_str) + if not m: timezone = datetime.timedelta() else: - sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( - hours=sign * int(m.group('hours')), - minutes=sign * int(m.group('minutes'))) + date_str = date_str[:-len(m.group(0))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) dt = datetime.datetime.strptime(date_str, date_format) - timezone return calendar.timegm(dt.timetuple()) @@ -699,7 +752,8 @@ def unified_strdate(date_str, day_first=True): # Replace commas date_str = date_str.replace(',', ' ') # %z (UTC offset) is only supported in python>=3.2 - date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) + if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str): + date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) # Remove AM/PM + timezone date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) @@ -728,6 +782,7 @@ def unified_strdate(date_str, day_first=True): ] if day_first: format_expressions.extend([ + '%d-%m-%Y', '%d.%m.%Y', '%d/%m/%Y', '%d/%m/%y', @@ -735,6 +790,7 @@ def unified_strdate(date_str, day_first=True): ]) else: format_expressions.extend([ + '%m-%d-%Y', '%m.%d.%Y', '%m/%d/%Y', '%m/%d/%y', @@ -893,8 +949,8 @@ def _windows_write_string(s, out): def not_a_console(handle): if handle == INVALID_HANDLE_VALUE or handle is None: return True - return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR - or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) + return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or + GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) if not_a_console(h): return False @@ -1070,15 +1126,6 @@ def shell_quote(args): return ' '.join(quoted_args) -def takewhile_inclusive(pred, seq): - """ Like itertools.takewhile, but include the latest evaluated element - (the first element so that Not pred(e)) """ - for e in seq: - yield e - if not pred(e): - return - - def smuggle_url(url, data): """ Pass additional data in a URL for internal use. """ @@ -1164,30 +1211,21 @@ def parse_filesize(s): return int(float(num_str) * mult) -def get_term_width(): - columns = compat_getenv('COLUMNS', None) - if columns: - return int(columns) +def month_by_name(name): + """ Return the number of a month by (locale-independently) English name """ try: - sp = subprocess.Popen( - ['stty', 'size'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = sp.communicate() - return int(out.split()[1]) - except: - pass - return None + return ENGLISH_MONTH_NAMES.index(name) + 1 + except ValueError: + return None -def month_by_name(name): - """ Return the number of a month by (locale-independently) English name """ +def month_by_abbreviation(abbrev): + """ Return the number of a month by (locale-independently) English + abbreviations """ - ENGLISH_NAMES = [ - 'January', 'February', 'March', 'April', 'May', 'June', - 'July', 'August', 'September', 'October', 'November', 'December'] try: - return ENGLISH_NAMES.index(name) + 1 + return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1 except ValueError: return None @@ -1274,6 +1312,7 @@ def parse_duration(s): (?P[0-9.]+)\s*(?:mins?|minutes?)\s*| (?P[0-9.]+)\s*(?:hours?)| + \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*| (?: (?: (?:(?P[0-9]+)\s*(?:[:d]|days?)\s*)? @@ -1292,10 +1331,14 @@ def parse_duration(s): return float_or_none(m.group('only_hours'), invscale=60 * 60) if m.group('secs'): res += int(m.group('secs')) + if m.group('mins_reversed'): + res += int(m.group('mins_reversed')) * 60 if m.group('mins'): res += int(m.group('mins')) * 60 if m.group('hours'): res += int(m.group('hours')) * 60 * 60 + if m.group('hours_reversed'): + res += int(m.group('hours_reversed')) * 60 * 60 if m.group('days'): res += int(m.group('days')) * 24 * 60 * 60 if m.group('ms'): @@ -1303,9 +1346,19 @@ def parse_duration(s): return res -def prepend_extension(filename, ext): +def prepend_extension(filename, ext, expected_real_ext=None): + name, real_ext = os.path.splitext(filename) + return ( + '{0}.{1}{2}'.format(name, ext, real_ext) + if not expected_real_ext or real_ext[1:] == expected_real_ext + else '{0}.{1}'.format(filename, ext)) + + +def replace_extension(filename, ext, expected_real_ext=None): name, real_ext = os.path.splitext(filename) - return '{0}.{1}{2}'.format(name, ext, real_ext) + return '{0}.{1}'.format( + name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename, + ext) def check_executable(exe, args=[]): @@ -1324,7 +1377,7 @@ def get_exe_version(exe, args=['--version'], or False if the executable is not present """ try: out, _ = subprocess.Popen( - [exe] + args, + [encodeArgument(exe)] + args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() except OSError: return False @@ -1430,6 +1483,14 @@ def uppercase_escape(s): s) +def lowercase_escape(s): + unicode_escape = codecs.getdecoder('unicode_escape') + return re.sub( + r'\\u[0-9a-fA-F]{4}', + lambda m: unicode_escape(m.group(0))[0], + s) + + def escape_rfc3986(s): """Escape non-ASCII characters as suggested by RFC 3986""" if sys.version_info < (3, 0) and isinstance(s, compat_str): @@ -1546,11 +1607,11 @@ def js_to_json(code): return '"%s"' % v res = re.sub(r'''(?x) - "(?:[^"\\]*(?:\\\\|\\")?)*"| - '(?:[^'\\]*(?:\\\\|\\')?)*'| + "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"| + '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'| [a-zA-Z_][.a-zA-Z_0-9]* ''', fix_kv, code) - res = re.sub(r',(\s*\])', lambda m: m.group(1), res) + res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res) return res @@ -1602,6 +1663,16 @@ def args_to_str(args): return ' '.join(shlex_quote(a) for a in args) +def mimetype2ext(mt): + _, _, res = mt.rpartition('/') + + return { + 'x-ms-wmv': 'wmv', + 'x-mp4-fragmented': 'mp4', + 'ttml+xml': 'ttml', + }.get(res, res) + + def urlhandle_detect_ext(url_handle): try: url_handle.headers @@ -1617,7 +1688,7 @@ def urlhandle_detect_ext(url_handle): if e: return e - return getheader('Content-Type').split("/")[1] + return mimetype2ext(getheader('Content-Type')) def age_restricted(content_limit, age_limit): @@ -1678,3 +1749,621 @@ def render_table(header_row, data): max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)] format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s' return '\n'.join(format_str % tuple(row) for row in table) + + +def _match_one(filter_part, dct): + COMPARISON_OPERATORS = { + '<': operator.lt, + '<=': operator.le, + '>': operator.gt, + '>=': operator.ge, + '=': operator.eq, + '!=': operator.ne, + } + operator_rex = re.compile(r'''(?x)\s* + (?P[a-z_]+) + \s*(?P%s)(?P\s*\?)?\s* + (?: + (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)| + (?P(?![0-9.])[a-z0-9A-Z]*) + ) + \s*$ + ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))) + m = operator_rex.search(filter_part) + if m: + op = COMPARISON_OPERATORS[m.group('op')] + if m.group('strval') is not None: + if m.group('op') not in ('=', '!='): + raise ValueError( + 'Operator %s does not support string values!' % m.group('op')) + comparison_value = m.group('strval') + else: + try: + comparison_value = int(m.group('intval')) + except ValueError: + comparison_value = parse_filesize(m.group('intval')) + if comparison_value is None: + comparison_value = parse_filesize(m.group('intval') + 'B') + if comparison_value is None: + raise ValueError( + 'Invalid integer value %r in filter part %r' % ( + m.group('intval'), filter_part)) + actual_value = dct.get(m.group('key')) + if actual_value is None: + return m.group('none_inclusive') + return op(actual_value, comparison_value) + + UNARY_OPERATORS = { + '': lambda v: v is not None, + '!': lambda v: v is None, + } + operator_rex = re.compile(r'''(?x)\s* + (?P%s)\s*(?P[a-z_]+) + \s*$ + ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys()))) + m = operator_rex.search(filter_part) + if m: + op = UNARY_OPERATORS[m.group('op')] + actual_value = dct.get(m.group('key')) + return op(actual_value) + + raise ValueError('Invalid filter part %r' % filter_part) + + +def match_str(filter_str, dct): + """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """ + + return all( + _match_one(filter_part, dct) for filter_part in filter_str.split('&')) + + +def match_filter_func(filter_str): + def _match_func(info_dict): + if match_str(filter_str, info_dict): + return None + else: + video_title = info_dict.get('title', info_dict.get('id', 'video')) + return '%s does not pass filter %s, skipping ..' % (video_title, filter_str) + return _match_func + + +def parse_dfxp_time_expr(time_expr): + if not time_expr: + return 0.0 + + mobj = re.match(r'^(?P\d+(?:\.\d+)?)s?$', time_expr) + if mobj: + return float(mobj.group('time_offset')) + + mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr) + if mobj: + return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3)) + + +def srt_subtitles_timecode(seconds): + return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) + + +def dfxp2srt(dfxp_data): + _x = functools.partial(xpath_with_ns, ns_map={ + 'ttml': 'http://www.w3.org/ns/ttml', + 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', + }) + + def parse_node(node): + str_or_empty = functools.partial(str_or_none, default='') + + out = str_or_empty(node.text) + + for child in node: + if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): + out += '\n' + str_or_empty(child.tail) + elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'): + out += str_or_empty(parse_node(child)) + else: + out += str_or_empty(xml.etree.ElementTree.tostring(child)) + + return out + + dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) + out = [] + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') + + if not paras: + raise ValueError('Invalid dfxp/TTML subtitle') + + for para, index in zip(paras, itertools.count(1)): + begin_time = parse_dfxp_time_expr(para.attrib['begin']) + end_time = parse_dfxp_time_expr(para.attrib.get('end')) + if not end_time: + end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur']) + out.append('%d\n%s --> %s\n%s\n\n' % ( + index, + srt_subtitles_timecode(begin_time), + srt_subtitles_timecode(end_time), + parse_node(para))) + + return ''.join(out) + + +class ISO639Utils(object): + # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt + _lang_map = { + 'aa': 'aar', + 'ab': 'abk', + 'ae': 'ave', + 'af': 'afr', + 'ak': 'aka', + 'am': 'amh', + 'an': 'arg', + 'ar': 'ara', + 'as': 'asm', + 'av': 'ava', + 'ay': 'aym', + 'az': 'aze', + 'ba': 'bak', + 'be': 'bel', + 'bg': 'bul', + 'bh': 'bih', + 'bi': 'bis', + 'bm': 'bam', + 'bn': 'ben', + 'bo': 'bod', + 'br': 'bre', + 'bs': 'bos', + 'ca': 'cat', + 'ce': 'che', + 'ch': 'cha', + 'co': 'cos', + 'cr': 'cre', + 'cs': 'ces', + 'cu': 'chu', + 'cv': 'chv', + 'cy': 'cym', + 'da': 'dan', + 'de': 'deu', + 'dv': 'div', + 'dz': 'dzo', + 'ee': 'ewe', + 'el': 'ell', + 'en': 'eng', + 'eo': 'epo', + 'es': 'spa', + 'et': 'est', + 'eu': 'eus', + 'fa': 'fas', + 'ff': 'ful', + 'fi': 'fin', + 'fj': 'fij', + 'fo': 'fao', + 'fr': 'fra', + 'fy': 'fry', + 'ga': 'gle', + 'gd': 'gla', + 'gl': 'glg', + 'gn': 'grn', + 'gu': 'guj', + 'gv': 'glv', + 'ha': 'hau', + 'he': 'heb', + 'hi': 'hin', + 'ho': 'hmo', + 'hr': 'hrv', + 'ht': 'hat', + 'hu': 'hun', + 'hy': 'hye', + 'hz': 'her', + 'ia': 'ina', + 'id': 'ind', + 'ie': 'ile', + 'ig': 'ibo', + 'ii': 'iii', + 'ik': 'ipk', + 'io': 'ido', + 'is': 'isl', + 'it': 'ita', + 'iu': 'iku', + 'ja': 'jpn', + 'jv': 'jav', + 'ka': 'kat', + 'kg': 'kon', + 'ki': 'kik', + 'kj': 'kua', + 'kk': 'kaz', + 'kl': 'kal', + 'km': 'khm', + 'kn': 'kan', + 'ko': 'kor', + 'kr': 'kau', + 'ks': 'kas', + 'ku': 'kur', + 'kv': 'kom', + 'kw': 'cor', + 'ky': 'kir', + 'la': 'lat', + 'lb': 'ltz', + 'lg': 'lug', + 'li': 'lim', + 'ln': 'lin', + 'lo': 'lao', + 'lt': 'lit', + 'lu': 'lub', + 'lv': 'lav', + 'mg': 'mlg', + 'mh': 'mah', + 'mi': 'mri', + 'mk': 'mkd', + 'ml': 'mal', + 'mn': 'mon', + 'mr': 'mar', + 'ms': 'msa', + 'mt': 'mlt', + 'my': 'mya', + 'na': 'nau', + 'nb': 'nob', + 'nd': 'nde', + 'ne': 'nep', + 'ng': 'ndo', + 'nl': 'nld', + 'nn': 'nno', + 'no': 'nor', + 'nr': 'nbl', + 'nv': 'nav', + 'ny': 'nya', + 'oc': 'oci', + 'oj': 'oji', + 'om': 'orm', + 'or': 'ori', + 'os': 'oss', + 'pa': 'pan', + 'pi': 'pli', + 'pl': 'pol', + 'ps': 'pus', + 'pt': 'por', + 'qu': 'que', + 'rm': 'roh', + 'rn': 'run', + 'ro': 'ron', + 'ru': 'rus', + 'rw': 'kin', + 'sa': 'san', + 'sc': 'srd', + 'sd': 'snd', + 'se': 'sme', + 'sg': 'sag', + 'si': 'sin', + 'sk': 'slk', + 'sl': 'slv', + 'sm': 'smo', + 'sn': 'sna', + 'so': 'som', + 'sq': 'sqi', + 'sr': 'srp', + 'ss': 'ssw', + 'st': 'sot', + 'su': 'sun', + 'sv': 'swe', + 'sw': 'swa', + 'ta': 'tam', + 'te': 'tel', + 'tg': 'tgk', + 'th': 'tha', + 'ti': 'tir', + 'tk': 'tuk', + 'tl': 'tgl', + 'tn': 'tsn', + 'to': 'ton', + 'tr': 'tur', + 'ts': 'tso', + 'tt': 'tat', + 'tw': 'twi', + 'ty': 'tah', + 'ug': 'uig', + 'uk': 'ukr', + 'ur': 'urd', + 'uz': 'uzb', + 've': 'ven', + 'vi': 'vie', + 'vo': 'vol', + 'wa': 'wln', + 'wo': 'wol', + 'xh': 'xho', + 'yi': 'yid', + 'yo': 'yor', + 'za': 'zha', + 'zh': 'zho', + 'zu': 'zul', + } + + @classmethod + def short2long(cls, code): + """Convert language code from ISO 639-1 to ISO 639-2/T""" + return cls._lang_map.get(code[:2]) + + @classmethod + def long2short(cls, code): + """Convert language code from ISO 639-2/T to ISO 639-1""" + for short_name, long_name in cls._lang_map.items(): + if long_name == code: + return short_name + + +class ISO3166Utils(object): + # From http://data.okfn.org/data/core/country-list + _country_map = { + 'AF': 'Afghanistan', + 'AX': 'Åland Islands', + 'AL': 'Albania', + 'DZ': 'Algeria', + 'AS': 'American Samoa', + 'AD': 'Andorra', + 'AO': 'Angola', + 'AI': 'Anguilla', + 'AQ': 'Antarctica', + 'AG': 'Antigua and Barbuda', + 'AR': 'Argentina', + 'AM': 'Armenia', + 'AW': 'Aruba', + 'AU': 'Australia', + 'AT': 'Austria', + 'AZ': 'Azerbaijan', + 'BS': 'Bahamas', + 'BH': 'Bahrain', + 'BD': 'Bangladesh', + 'BB': 'Barbados', + 'BY': 'Belarus', + 'BE': 'Belgium', + 'BZ': 'Belize', + 'BJ': 'Benin', + 'BM': 'Bermuda', + 'BT': 'Bhutan', + 'BO': 'Bolivia, Plurinational State of', + 'BQ': 'Bonaire, Sint Eustatius and Saba', + 'BA': 'Bosnia and Herzegovina', + 'BW': 'Botswana', + 'BV': 'Bouvet Island', + 'BR': 'Brazil', + 'IO': 'British Indian Ocean Territory', + 'BN': 'Brunei Darussalam', + 'BG': 'Bulgaria', + 'BF': 'Burkina Faso', + 'BI': 'Burundi', + 'KH': 'Cambodia', + 'CM': 'Cameroon', + 'CA': 'Canada', + 'CV': 'Cape Verde', + 'KY': 'Cayman Islands', + 'CF': 'Central African Republic', + 'TD': 'Chad', + 'CL': 'Chile', + 'CN': 'China', + 'CX': 'Christmas Island', + 'CC': 'Cocos (Keeling) Islands', + 'CO': 'Colombia', + 'KM': 'Comoros', + 'CG': 'Congo', + 'CD': 'Congo, the Democratic Republic of the', + 'CK': 'Cook Islands', + 'CR': 'Costa Rica', + 'CI': 'Côte d\'Ivoire', + 'HR': 'Croatia', + 'CU': 'Cuba', + 'CW': 'Curaçao', + 'CY': 'Cyprus', + 'CZ': 'Czech Republic', + 'DK': 'Denmark', + 'DJ': 'Djibouti', + 'DM': 'Dominica', + 'DO': 'Dominican Republic', + 'EC': 'Ecuador', + 'EG': 'Egypt', + 'SV': 'El Salvador', + 'GQ': 'Equatorial Guinea', + 'ER': 'Eritrea', + 'EE': 'Estonia', + 'ET': 'Ethiopia', + 'FK': 'Falkland Islands (Malvinas)', + 'FO': 'Faroe Islands', + 'FJ': 'Fiji', + 'FI': 'Finland', + 'FR': 'France', + 'GF': 'French Guiana', + 'PF': 'French Polynesia', + 'TF': 'French Southern Territories', + 'GA': 'Gabon', + 'GM': 'Gambia', + 'GE': 'Georgia', + 'DE': 'Germany', + 'GH': 'Ghana', + 'GI': 'Gibraltar', + 'GR': 'Greece', + 'GL': 'Greenland', + 'GD': 'Grenada', + 'GP': 'Guadeloupe', + 'GU': 'Guam', + 'GT': 'Guatemala', + 'GG': 'Guernsey', + 'GN': 'Guinea', + 'GW': 'Guinea-Bissau', + 'GY': 'Guyana', + 'HT': 'Haiti', + 'HM': 'Heard Island and McDonald Islands', + 'VA': 'Holy See (Vatican City State)', + 'HN': 'Honduras', + 'HK': 'Hong Kong', + 'HU': 'Hungary', + 'IS': 'Iceland', + 'IN': 'India', + 'ID': 'Indonesia', + 'IR': 'Iran, Islamic Republic of', + 'IQ': 'Iraq', + 'IE': 'Ireland', + 'IM': 'Isle of Man', + 'IL': 'Israel', + 'IT': 'Italy', + 'JM': 'Jamaica', + 'JP': 'Japan', + 'JE': 'Jersey', + 'JO': 'Jordan', + 'KZ': 'Kazakhstan', + 'KE': 'Kenya', + 'KI': 'Kiribati', + 'KP': 'Korea, Democratic People\'s Republic of', + 'KR': 'Korea, Republic of', + 'KW': 'Kuwait', + 'KG': 'Kyrgyzstan', + 'LA': 'Lao People\'s Democratic Republic', + 'LV': 'Latvia', + 'LB': 'Lebanon', + 'LS': 'Lesotho', + 'LR': 'Liberia', + 'LY': 'Libya', + 'LI': 'Liechtenstein', + 'LT': 'Lithuania', + 'LU': 'Luxembourg', + 'MO': 'Macao', + 'MK': 'Macedonia, the Former Yugoslav Republic of', + 'MG': 'Madagascar', + 'MW': 'Malawi', + 'MY': 'Malaysia', + 'MV': 'Maldives', + 'ML': 'Mali', + 'MT': 'Malta', + 'MH': 'Marshall Islands', + 'MQ': 'Martinique', + 'MR': 'Mauritania', + 'MU': 'Mauritius', + 'YT': 'Mayotte', + 'MX': 'Mexico', + 'FM': 'Micronesia, Federated States of', + 'MD': 'Moldova, Republic of', + 'MC': 'Monaco', + 'MN': 'Mongolia', + 'ME': 'Montenegro', + 'MS': 'Montserrat', + 'MA': 'Morocco', + 'MZ': 'Mozambique', + 'MM': 'Myanmar', + 'NA': 'Namibia', + 'NR': 'Nauru', + 'NP': 'Nepal', + 'NL': 'Netherlands', + 'NC': 'New Caledonia', + 'NZ': 'New Zealand', + 'NI': 'Nicaragua', + 'NE': 'Niger', + 'NG': 'Nigeria', + 'NU': 'Niue', + 'NF': 'Norfolk Island', + 'MP': 'Northern Mariana Islands', + 'NO': 'Norway', + 'OM': 'Oman', + 'PK': 'Pakistan', + 'PW': 'Palau', + 'PS': 'Palestine, State of', + 'PA': 'Panama', + 'PG': 'Papua New Guinea', + 'PY': 'Paraguay', + 'PE': 'Peru', + 'PH': 'Philippines', + 'PN': 'Pitcairn', + 'PL': 'Poland', + 'PT': 'Portugal', + 'PR': 'Puerto Rico', + 'QA': 'Qatar', + 'RE': 'Réunion', + 'RO': 'Romania', + 'RU': 'Russian Federation', + 'RW': 'Rwanda', + 'BL': 'Saint Barthélemy', + 'SH': 'Saint Helena, Ascension and Tristan da Cunha', + 'KN': 'Saint Kitts and Nevis', + 'LC': 'Saint Lucia', + 'MF': 'Saint Martin (French part)', + 'PM': 'Saint Pierre and Miquelon', + 'VC': 'Saint Vincent and the Grenadines', + 'WS': 'Samoa', + 'SM': 'San Marino', + 'ST': 'Sao Tome and Principe', + 'SA': 'Saudi Arabia', + 'SN': 'Senegal', + 'RS': 'Serbia', + 'SC': 'Seychelles', + 'SL': 'Sierra Leone', + 'SG': 'Singapore', + 'SX': 'Sint Maarten (Dutch part)', + 'SK': 'Slovakia', + 'SI': 'Slovenia', + 'SB': 'Solomon Islands', + 'SO': 'Somalia', + 'ZA': 'South Africa', + 'GS': 'South Georgia and the South Sandwich Islands', + 'SS': 'South Sudan', + 'ES': 'Spain', + 'LK': 'Sri Lanka', + 'SD': 'Sudan', + 'SR': 'Suriname', + 'SJ': 'Svalbard and Jan Mayen', + 'SZ': 'Swaziland', + 'SE': 'Sweden', + 'CH': 'Switzerland', + 'SY': 'Syrian Arab Republic', + 'TW': 'Taiwan, Province of China', + 'TJ': 'Tajikistan', + 'TZ': 'Tanzania, United Republic of', + 'TH': 'Thailand', + 'TL': 'Timor-Leste', + 'TG': 'Togo', + 'TK': 'Tokelau', + 'TO': 'Tonga', + 'TT': 'Trinidad and Tobago', + 'TN': 'Tunisia', + 'TR': 'Turkey', + 'TM': 'Turkmenistan', + 'TC': 'Turks and Caicos Islands', + 'TV': 'Tuvalu', + 'UG': 'Uganda', + 'UA': 'Ukraine', + 'AE': 'United Arab Emirates', + 'GB': 'United Kingdom', + 'US': 'United States', + 'UM': 'United States Minor Outlying Islands', + 'UY': 'Uruguay', + 'UZ': 'Uzbekistan', + 'VU': 'Vanuatu', + 'VE': 'Venezuela, Bolivarian Republic of', + 'VN': 'Viet Nam', + 'VG': 'Virgin Islands, British', + 'VI': 'Virgin Islands, U.S.', + 'WF': 'Wallis and Futuna', + 'EH': 'Western Sahara', + 'YE': 'Yemen', + 'ZM': 'Zambia', + 'ZW': 'Zimbabwe', + } + + @classmethod + def short2full(cls, code): + """Convert an ISO 3166-2 country code to the corresponding full name""" + return cls._country_map.get(code.upper()) + + +class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): + def __init__(self, proxies=None): + # Set default handlers + for type in ('http', 'https'): + setattr(self, '%s_open' % type, + lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: + meth(r, proxy, type)) + return compat_urllib_request.ProxyHandler.__init__(self, proxies) + + def proxy_open(self, req, proxy, type): + req_proxy = req.headers.get('Ytdl-request-proxy') + if req_proxy is not None: + proxy = req_proxy + del req.headers['Ytdl-request-proxy'] + + if proxy == '__noproxy__': + return None # No Proxy + return compat_urllib_request.ProxyHandler.proxy_open( + self, req, proxy, type)