]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
Update version in the changelog.
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import calendar
8 import codecs
9 import contextlib
10 import ctypes
11 import datetime
12 import email.utils
13 import errno
14 import functools
15 import gzip
16 import itertools
17 import io
18 import json
19 import locale
20 import math
21 import operator
22 import os
23 import pipes
24 import platform
25 import re
26 import ssl
27 import socket
28 import struct
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_basestring,
38 compat_chr,
39 compat_etree_fromstring,
40 compat_html_entities,
41 compat_http_client,
42 compat_kwargs,
43 compat_parse_qs,
44 compat_socket_create_connection,
45 compat_str,
46 compat_urllib_error,
47 compat_urllib_parse,
48 compat_urllib_parse_urlparse,
49 compat_urllib_request,
50 compat_urlparse,
51 shlex_quote,
52 )
53
54
# The type of a compiled regular expression is not exposed under a stable
# public name, so it is derived from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds each of them to an
# outgoing request that does not already carry a header of the same name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel meaning "no default was supplied". It is compared by identity, so
# None stays usable as an explicit default value.
NO_DEFAULT = object()

# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
72
73
def preferredencoding():
    """Return the encoding to use for text on this system.

    The value of locale.getpreferredencoding() is used when it names a codec
    that can actually encode text; if the lookup or the trial encoding fails
    for any reason, 'UTF-8' is returned instead.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken encoding name raises here.
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
87
88
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created in the same
    directory as fn and then renamed over fn.  If anything goes wrong the
    temporary file is removed (best effort) and the exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; a failure to delete it must
        # not mask the original error, which is re-raised below.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
141
142
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        Returns the first element below node that matches xpath and carries
        the attribute key (with value val, when val is given), or None.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val:
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        # Attribute predicates are checked by hand instead of in the xpath.
        for candidate in node.findall(xpath):
            if key in candidate.attrib and (
                    val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
164
165 # On python2.6 the xml.etree.ElementTree.Element methods don't support
166 # the namespace parameter
167
168
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI.  Steps without a colon are
    kept as they are.
    """
    def _expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(_expand(step) for step in path.split('/'))
179
180
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element below node matching xpath.

    xpath is either a single expression or an iterable of expressions that
    are tried in order until one matches.

    When nothing matches: default is returned if one was given; otherwise
    ExtractorError is raised if fatal is set (name, or xpath when name is
    None, is used in the message); otherwise None is returned.
    """
    def _find_xpath(xpath):
        if sys.version_info < (2, 7):  # Crazy 2.6
            xpath = xpath.encode('ascii')
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Start from "not found" so that an empty iterable of expressions is
        # treated as a failed lookup instead of raising UnboundLocalError.
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
204
205
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    If the element is missing, whatever xpath_element() produced (None or
    default) is returned.  If the element exists but has no text, default is
    returned when given, ExtractorError is raised when fatal is set, and
    None is returned otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
219
220
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath[@key].

    When no such element exists, default is returned if given, ExtractorError
    is raised if fatal is set, and None is returned otherwise.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
232
233
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the HTML document html"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
237
238
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in html whose attribute equals value.

    Returns None when no such tag is found.  The content is passed through
    unescapeHTML() before being returned.
    """

    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content that begins with a quote character loses its first and last
    # character.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
260
261
def clean_html(html):
    """Turn an HTML snippet into readable plain text.

    None is passed through unchanged, which is convenient when sanitizing
    optional fields such as descriptions.
    """

    if html is None:
        return html

    # Source newlines carry no meaning in HTML; real line breaks come from
    # <br> tags and from paragraph boundaries.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip the remaining html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Replace html entities
    return unescapeHTML(text).strip()
277
278
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a sanitized name if that fails.

    The name '-' selects standard output (its binary buffer when available).
    If opening fails for a reason other than a permission error, the name is
    passed through sanitize_path() and, when that changes it, the open is
    attempted again; an error from that second attempt propagates to the
    caller, like with the standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            stdout_stream = getattr(sys.stdout, 'buffer', sys.stdout)
            return (stdout_stream, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise

        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
309
310
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a UNIX timestamp (None if unparsable)"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
318
319
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible: only the per-character replacement is applied then,
    without the clean-up of underscores, dashes and leading dots.
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (
                char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps: keep "12:34:56" readable as "12_34_56"
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and drop them at both ends
    result = re.sub('_+', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[1:]
    result = result.lstrip('.')
    return result or '_'
356
357
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On any other platform s is returned unchanged.  On Windows the part after
    the drive/UNC prefix is normalized, and in every path component other
    than '.' and '..' each of the characters / < > : " | \\ ? * as well as a
    trailing whitespace character or dot is replaced with '#'.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Raw string: same pattern as before, but without relying on the invalid
    # "\s" escape of a normal string literal being passed through.
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
374
375
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, in first-seen order """
    unique = []
    for item in iterable:
        # Equality-based membership test, so unhashable items work too.
        if item in unique:
            continue
        unique.append(item)
    return unique
383
384
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'.  Named entities are looked up in
    compat_html_entities.name2codepoint; numeric references ('#123' or
    '#x7b') are converted from their code point.  Anything that cannot be
    converted is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # The number is not a valid code point (e.g. '&#2013266066;');
            # keep the reference as literal text instead of failing.
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
403
404
def unescapeHTML(s):
    """Replace every '&...;' entity reference in s; None is passed through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _replace_entity, s)
412
413
def get_subprocess_encoding():
    """Return the encoding used to encode and decode subprocess arguments."""
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if on_modern_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
424
425
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form expected by file system (or subprocess) APIs.

    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API.
    # On Python 2, text is also passed through unchanged to use the Unicode
    # APIs on Windows 2000 and up, except for subprocess calls.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.version_info >= (3, 0) or (
            not for_subprocess and sys.platform == 'win32' and
            sys.getwindowsversion()[0] >= 5):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
444
445
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string file name to text on Python 2.

    On Python 3, and for anything that is not a bytes object, b is returned
    unchanged.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
455
456
def encodeArgument(s):
    """Encode a command-line argument for use in a subprocess call."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
464
465
def decodeArgument(b):
    """Decode a subprocess argument; see decodeFilename()."""
    return decodeFilename(b, for_subprocess=True)
468
469
def decodeOption(optval):
    """Return the option value optval as text; None is passed through.

    Byte strings are decoded with the preferred encoding.
    """
    if optval is None:
        return optval

    text = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(text, compat_str)
    return text
478
479
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The hour form is used above 3600 seconds and the minute form above 60
    seconds; values up to and including 60 are printed as a bare number.
    """
    if secs > 3600:
        hours, remainder = divmod(secs, 3600)
        return '%d:%02d:%02d' % (hours, remainder // 60, secs % 60)
    if secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
487
488
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is turned off when params has a true
    'nocheckcertificate' entry.  Extra keyword arguments are forwarded to the
    handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
512
513
def bug_reports_message():
    """Return the text appended to unexpected errors, asking for a bug report."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
523
524
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """

        # Errors raised while handling a network or availability problem are
        # always treated as expected.
        exc_type_being_handled = sys.exc_info()[0]
        if exc_type_being_handled in (
                compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        tb = self.traceback
        return None if tb is None else ''.join(traceback.format_tb(tb))
552
553
class UnsupportedError(ExtractorError):
    """Expected error for a URL that is not supported; the URL is kept in .url"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
559
560
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
564
565
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors.  It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
578
579
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
587
588
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate an
    error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is kept on the instance for callers that read .msg
        self.msg = msg
598
599
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
603
604
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available for
    that video.
    """
612
613
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file is
    smaller than what the server announced first, which indicates that the
    connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are in bytes
        self.downloaded, self.expected = downloaded, expected
626
627
628 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
629 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
630 # expected HTTP responses to meet HTTP/1.0 or later (see also
631 # https://github.com/rg3/youtube-dl/issues/6727)
632 if sys.version_info < (3, 0):
633 kwargs[b'strict'] = True
634 hc = http_class(*args, **kwargs)
635 source_address = ydl_handler._params.get('source_address')
636 if source_address is not None:
637 sa = (source_address, 0)
638 if hasattr(hc, 'source_address'): # Python 2.7+
639 hc.source_address = sa
640 else: # Python 2.6
641 def _hc_connect(self, *args, **kwargs):
642 sock = compat_socket_create_connection(
643 (self.host, self.port), self.timeout, sa)
644 if is_https:
645 self.sock = ssl.wrap_socket(
646 sock, self.key_file, self.cert_file,
647 ssl_version=ssl.PROTOCOL_TLSv1)
648 else:
649 self.sock = sock
650 hc.connect = functools.partial(_hc_connect, hc)
651
652 return hc
653
654
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """params is stored as self._params; other arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, creating the connection through _create_http_connection()."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without a zlib header."""
        try:
            # Negative window bits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code.

        When addinfourl has no getcode attribute, the code is not passed to
        the constructor but set on the object afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Pre-process an outgoing request and return the request to send.

        The URL is percent-encoded (replacing the request object if that
        changes it), missing standard headers are added and the
        "Youtubedl-No-Compression" marker header is honoured and removed.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # The marker header asks for an uncompressed response: drop
            # Accept-encoding along with the marker itself.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response and return the response to hand on.

        Bodies with Content-encoding gzip or deflate are decompressed and
        wrapped in a new response object, and the Location header of 3xx
        responses is percent-encoded.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; if none of the
                # attempts succeeds, the first error is raised.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request and response processing
    https_request = http_request
    https_response = http_response
778
779
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections through _create_http_connection()."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """params is stored as self._params.

        https_conn_class is the connection class to instantiate; it defaults
        to compat_http_client.HTTPSConnection.  Remaining arguments are
        passed to HTTPSHandler.
        """
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, forwarding the handler's SSL settings where they exist."""
        forwarded = {}
        # '_context' exists on python > 2.6, '_check_hostname' on python 3.x
        for option, attribute in (
                ('context', '_context'),
                ('check_hostname', '_check_hostname')):
            if hasattr(self, attribute):
                forwarded[option] = getattr(self, attribute)

        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **forwarded)
795
796
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same processing to HTTP and HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Process the cookies of response and return the response.

        As the code stands, this simply delegates to HTTPCookieProcessor.
        """
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is commented out, i.e. currently disabled.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS requests and responses are handled exactly like HTTP ones
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
819
820
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter is the character between the date and the time part.
    timezone, if given, is a datetime.timedelta that is subtracted from the
    parsed time; otherwise a trailing 'Z' or '+HH:MM'/'-HHMM' style offset is
    read from date_str.  Fractional seconds are ignored.  None is returned
    for a None input or when the date can not be parsed.
    """

    if date_str is None:
        return None

    # Drop fractional seconds
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone = datetime.timedelta()
        tz_match = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if tz_match:
            date_str = date_str[:-len(tz_match.group(0))]
            # A plain 'Z' has no sign group and means UTC
            if tz_match.group('sign'):
                direction = 1 if tz_match.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=direction * int(tz_match.group('hours')),
                    minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        utc_time = datetime.datetime.strptime(date_str, date_format) - timezone
    except ValueError:
        return None
    return calendar.timegm(utc_time.timetuple())
850
851
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects how purely numeric dates such as 01/02/2014 are read:
    day/month/year when true, month/day/year otherwise.  None is returned for
    a None input or when no known format matches.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Placeholders for the two leading fields of ambiguous numeric dates
    first, second = ('%d', '%m') if day_first else ('%m', '%d')
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
        first + '-' + second + '-%Y',
        first + '.' + second + '.%Y',
        first + '/' + second + '/%Y',
        first + '/' + second + '/%y',
        first + '/' + second + '/%Y %H:%M:%S',
    ]

    upload_date = None
    # Every format is tried; when several match, the last one wins.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')

    return None if upload_date is None else compat_str(upload_date)
916
917
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, falling back to default_ext.

    The guess is the text after the last '.' in the part of url before the
    first '?'; it is only accepted when it is purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
926
927
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
930
931
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in one of the formats
    YYYYMMDD, now, today, yesterday or
    (now|today)[+-][0-9]+(day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days.  A string in none
    of these formats makes strptime raise ValueError."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string, so that \d reaches the regex engine without relying on an
    # invalid string escape sequence being passed through.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
959
960
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format; any other
    string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if parts is None else '-'.join(parts.groups())
969
970
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing start or end leaves the range open on that side.
        ValueError is raised when start is later than end.
        """
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)

        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)

        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1000
1001
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1010
1011
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle (per the Win32
    # documentation -11 is STD_OUTPUT_HANDLE and -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    # Only stdout (1) and stderr (2) are handled here
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters WriteConsoleW reports as written
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device
        # (ignoring the REMOTE flag) or GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    # Redirected output is left to the caller's regular write path
    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters, stopping before a non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; 2 units are
        # requested for it (the assert below expects exactly 2 written)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever the console actually accepted
            s = s[written.value:]
    return True
1085
1086
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32, when no encoding is forced and out has a fileno, the console
    API path (_windows_write_string) is tried first.  Otherwise the text is
    encoded for byte streams (undecodable characters are dropped) or written
    as-is to text streams.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1107
1108
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values"""
    if not bs:
        return []
    # Indexing yields ints on Python 3 but one-character strings on Python 2
    if not isinstance(bs[0], int):
        return [ord(ch) for ch in bs]
    return list(bs)
1116
1117
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one unsigned byte each"""
    if xs:
        return struct_pack('%dB' % len(xs), *xs)
    return b''
1122
1123
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f)
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes description of the structure passed as the last argument of
    # LockFileEx / UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high halves of the byte count passed to (Un)LockFileEx, starting
    # at offset 0 - i.e. the lock is meant to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the same
        # structure again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 requests an exclusive lock (LOCKFILE_EXCLUSIVE_LOCK in the
        # Win32 documentation), 0x0 a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with flock (LOCK_EX if exclusive, else LOCK_SH)."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1187
1188
class locked_file(object):
    """File wrapper that keeps the file locked for the duration of a with block.

    The file is opened in __init__.  Entering the context locks it through
    _lock_file (exclusively unless the mode is 'r'); leaving the context
    unlocks and closes it.
    """

    def __init__(self, filename, mode, encoding=None):
        """Open filename with io.open; mode must be 'r', 'a' or 'w'."""
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # subclass on Python 2; close the file in either case
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Always close the file, even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()."""
        return self.f.read(*args)
1218
1219
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1223
1224
def shell_quote(args):
    """Return the arguments joined by spaces, each quoted for a POSIX shell.

    Byte-string arguments are decoded with the filesystem encoding first.
    Quoting uses shlex_quote, like args_to_str does, instead of the
    deprecated pipes module.
    """
    def as_text(arg):
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            return arg.decode(get_filesystem_encoding())
        return arg

    return ' '.join(shlex_quote(as_text(a)) for a in args)
1234
1235
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1242
1243
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without a '#__youtubedl_smuggle' fragment are returned unchanged
    together with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, fragment = smug_url.rsplit('#', 1)
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(encoded)
    return smug_url, default
1251
1252
def format_bytes(bytes):
    """Format a byte count as a string with two decimals and a binary suffix.

    None yields 'N/A'; a str argument is converted with float() first.
    The suffix index is clamped to the available suffixes, so values below
    one byte are shown in 'B' and values of 1024 YiB and above in 'YiB'.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Without clamping, tiny values index the list from the end and
        # huge values raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1265
1266
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' or '3,2GB' into a number of bytes.

    Returns None for None and for strings that do not start with a number
    followed by a known unit.  'XiB' units are powers of 1024, 'XB' units
    powers of 1000; a comma is accepted as decimal separator.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    multipliers = {
        'B': 1,
        'b': 1,
    }
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        multipliers[prefix + 'iB'] = binary
        multipliers[prefix + 'B'] = decimal
        multipliers[prefix.lower() + 'B'] = binary
        multipliers[prefix + 'b'] = decimal

    unit_alternatives = '|'.join(re.escape(unit) for unit in multipliers)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % unit_alternatives, s)
    if not found:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * multipliers[found.group('unit')])
1319
1320
def month_by_name(name):
    """ Return the number (1-12) of a month by its (locale-independent)
    English name, or None if the name is unknown """

    if name not in ENGLISH_MONTH_NAMES:
        return None
    return ENGLISH_MONTH_NAMES.index(name) + 1
1328
1329
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month by its (locale-independent)
    three-letter English abbreviation, or None if it is unknown """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1338
1339
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML.

    Ampersands that already start one of the five predefined entities or a
    numeric character reference are left untouched."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1346
1347
def setproctitle(title):
    """Best-effort attempt to rename the current process via libc's prctl.

    title must be a compat_str.  Nothing happens (and nothing is raised)
    when libc.so.6 cannot be loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes allocates len + 1 bytes, so the
    # string handed to prctl is NUL-terminated
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1361
1362
def remove_start(s, start):
    """Return s without the prefix start; s is returned as-is if it lacks it"""
    return s[len(start):] if s.startswith(start) else s
1367
1368
def remove_end(s, end):
    """Return s without the suffix end; s is returned as-is if it lacks it"""
    # An empty suffix must be special-cased: s[:-0] would be the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1373
1374
def url_basename(url):
    """Return the last segment of the URL's path, ignoring surrounding slashes"""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1378
1379
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD"""

    def get_method(self):
        """Report the method regardless of whether the request carries data"""
        return "HEAD"
1383
1384
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, scaled by invscale // scale, or return default.

    If get_attr is given, the attribute of that name is read from v first.
    default is returned for None, the empty string and for values that
    int() cannot convert (wrong format or wrong type).
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric objects such as lists or dicts
        return default
1397
1398
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default when v is None"""
    if v is None:
        return default
    return compat_str(v)
1401
1402
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped before conversion. None is passed through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1409
1410
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, scaled by invscale / scale, or return default.

    default is returned for None and for values that float() cannot
    convert (wrong format or wrong type).
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric objects such as lists or dicts
        return default
1418
1419
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min 4 sec' or 'PT1H5S' into seconds.

    Returns None for non-string input and for strings that do not match.
    The result is a float when a minutes-only / hours-only form or a
    fractional seconds part is given, otherwise an int.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # The two "single unit with decimals" forms are converted directly
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Seconds contributed by one unit of each integer group
    unit_seconds = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    total = 0
    for group_name, factor in unit_seconds:
        value = m.group(group_name)
        if value:
            total += int(value) * factor
    # 'ms' holds the fractional part including its leading dot
    if m.group('ms'):
        total += float(m.group('ms'))
    return total
1464
1465
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the filename's real extension.

    When expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1472
1473
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's real extension by ext.

    When expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1479
1480
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary cannot be started. """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Wait for the process; its output is discarded
        proc.communicate()
    except OSError:
        return False
    return exe
1489
1490
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    The combined stdout/stderr output is handed to detect_exe_version. """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1504
1505
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re must contain one group capturing the version; by default the
    token following the word 'version' is used.  unrecognized is returned
    when the expression does not match.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1515
1516
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()"""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1521
1522
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one until a page comes back short"""

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) returns an iterable with the entries of that
        # page; pagesize is the number of entries on a full page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list
        (up to the last entry if end is None)."""
        res = []
        # Begin with the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            # Index range [firstid, nextfirstid) covered by this page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start within this page (0 on all later pages)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry if end falls on this
            # page, otherwise None (take the page up to its end)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1564
1565
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known up front"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc(pagenum) returns an iterable with the entries of that page
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list
        (up to the last page if end is None)."""
        # Page holding index start, and the offset of start inside it
        first_page, to_skip = divmod(start, self._pagesize)
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start

        collected = []
        for pagenum in range(first_page, stop_page):
            entries = list(self._pagefunc(pagenum))
            # Only the first fetched page needs its head trimmed
            if to_skip:
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1593
1594
def uppercase_escape(s):
    r"""Expand every \UXXXXXXXX escape sequence in s into its character"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, consumed_length)
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1601
1602
def lowercase_escape(s):
    r"""Expand every \uXXXX escape sequence in s into its character"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, consumed_length)
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
1609
1610
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986.

    Percent signs and the URL delimiter characters listed below are kept."""
    # On Python 2 the text has to be handed over as UTF-8 bytes
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_bytes:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1616
1617
def escape_url(url):
    """Escape URL as suggested by RFC 3986.

    Path, params, query and fragment are escaped; the other components are
    kept as parsed."""
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1627
# Define struct_pack / struct_unpack: probe whether struct accepts the format
# string used in this module (a text string, because of unicode_literals)
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that also accepts a text format string."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text format string."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # The stdlib functions work as-is
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1644
1645
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line, and close it.

    Lines may be text or UTF-8 bytes.  A leading byte order mark and
    surrounding whitespace are removed; empty lines and lines starting
    with '#', ';' or ']' are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The UTF-8 BOM shows up as '\xef\xbb\xbf' when the bytes were
        # decoded as latin-1 and as '\ufeff' when decoded as UTF-8
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1660
1661
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() and return ASCII bytes"""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1664
1665
def encode_dict(d, encoding='utf-8'):
    """Return a copy of d with every key and value encoded to bytes"""
    encoded = {}
    for key, value in d.items():
        raw_key = key.encode(encoding)
        encoded[raw_key] = value.encode(encoding)
    return encoded
1668
1669
# Age limit associated with each US rating label; parse_age_limit() falls
# back to this table for input that is not a plain number
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1677
1678
def parse_age_limit(s):
    """Parse an age limit given as '18', '18+' or a US rating label.

    Returns the age as int, or None for None and unrecognized input.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1684
1685
def strip_jsonp(code):
    """Return the argument of a JSONP call such as 'callback({...});'.

    Input that does not look like a single JSONP call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
1689
1690
def js_to_json(code):
    """Rewrite a JavaScript object literal so that it can be parsed as JSON.

    Quoted strings and bare identifiers are turned into double-quoted
    strings (true, false and null are kept), and commas directly in front
    of a closing bracket or brace are removed.
    """
    def fix_kv(m):
        # Called for every string literal or identifier found below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            # Double-quoted: drop the quotes and unescape \' which JSON
            # does not know
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            # Single-quoted: drop the quotes, keep \\ as it is, unescape \'
            # and escape bare double quotes
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        # Identifiers fall through and are simply wrapped in double quotes
        return '"%s"' % v

    # Alternatives: double-quoted string | single-quoted string | identifier
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Remove trailing commas in front of ] and }
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
1714
1715
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if the id is not listed. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1724
1725
# %-style template using the keys title, id and ext
# NOTE(review): presumably the default output filename template - confirm
# against the callers
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1727
1728
def limit_length(s, length):
    """ Add ellipses to overly long strings.

    Strings longer than length are cut so that the result, including the
    trailing '...', is length characters long. None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    kept = length - len(ELLIPSES)
    return s[:kept] + ELLIPSES
1737
1738
def version_tuple(v):
    """Split a version string at '.' and '-' into a tuple of ints"""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1741
1742
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or cannot be parsed, the answer is
    "not assume_new".
    """
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
1750
1751
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. whether this
    module was loaded by a zipimporter or the interpreter is frozen """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1757
1758
def args_to_str(args):
    """Get a short string representation for a subprocess command"""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
1762
1763
def mimetype2ext(mt):
    """Derive a file extension from a MIME type.

    The subtype is used as extension unless a special case is listed below.
    Parameters such as '; charset=utf-8' are ignored.  Returns None when mt
    is None (e.g. a missing Content-Type header).
    """
    if mt is None:
        return None

    # Drop MIME parameters before looking at the subtype
    bare_type = mt.split(';')[0].strip()
    _, _, res = bare_type.rpartition('/')

    return {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }.get(res, res)
1772
1773
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a response from its headers.

    The filename of an attachment Content-Disposition header is preferred;
    otherwise the extension is derived from the Content-Type header.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    disposition = getheader('Content-Disposition')
    if disposition:
        found = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if found:
            ext = determine_ext(found.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
1790
1791
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for the bytes data with the given MIME type"""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
1794
1795
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1804
1805
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) if the decoded text starts with '<'
    after optional whitespace, else None. """

    # The UTF-32 marks are listed before the UTF-16 marks they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        # No byte order mark: assume UTF-8
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
1824
1825
def determine_protocol(info_dict):
    """Return the protocol of a format dict.

    An explicit 'protocol' entry wins.  Otherwise the protocol is derived
    from the 'url' entry: its rtmp/mms/rtsp prefix, its m3u8/f4m extension
    or finally its URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
1846
1847
1848 def render_table(header_row, data):
1849 """ Render a list of rows, each as a list of values """
1850 table = [header_row] + data
1851 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1852 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1853 return '\n'.join(format_str % tuple(row) for row in table)
1854
1855
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against the dictionary dct.

    Two forms are understood: comparisons ("key OP value", where OP may be
    followed by '?' to let missing values pass) and presence checks ("key"
    / "!key").  Raises ValueError for anything else, for string values
    used with an ordering operator and for unparsable numbers.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # key, operator, optional '?', then either a number (optionally with a
    # size suffix) or an alphanumeric string that does not start like a number
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            # Plain integer first, then a file size, then a file size with
            # an implied trailing 'B' (e.g. '10M' -> '10MB')
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing values pass only if the operator was followed by '?';
            # the result is the matched text (truthy) or None, not a bool
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Presence checks: "key" is true if the value is set, "!key" if it is not
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1913
1914
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false.

    filter_str consists of parts joined by '&'; every part has to match. """

    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1920
1921
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function yields None if the dict passes the filter and a
    message explaining why the video is skipped otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
1930
1931
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds.

    Accepted are plain seconds with an optional 's' suffix ('12.5s') and
    clock times ('HH:MM:SS' with optional fraction).  Empty input yields
    0.0, anything else None.
    """
    if not time_expr:
        return 0.0

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds)
1943
1944
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'"""
    # %d truncates, so each component may be passed as a float
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
1947
1948
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle data (a text string) into SRT text.

    Paragraphs are looked up in the ttml and ttaf1 namespaces and without
    namespace.  Each paragraph needs a 'begin' attribute and either 'end'
    or 'dur'.  Raises ValueError if the document contains no paragraph.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        """Return the text of node: <br> becomes a newline, <span> is
        flattened, any other child element is kept as serialized markup."""
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
            else:
                serialized = xml.etree.ElementTree.tostring(child)
                # tostring() returns (ASCII-only) bytes by default; decode
                # them so that Python 3 does not embed the "b'...'" repr
                if isinstance(serialized, bytes):
                    serialized = serialized.decode('ascii')
                out += str_or_empty(serialized)

        return out

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        # Without a usable 'end' the end time is derived from 'dur'
        if not end_time:
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
1989
1990
def cli_option(params, command_option, param):
    """Return [command_option, value] for params[param], or [] if it is unset"""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
1994
1995
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean params[param] as a command line option.

    Returns [option, value], or a single 'option<separator>value' item
    when separator is given.  params[param] must be a bool.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2002
2003
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []"""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2007
2008
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra arguments stored in params[param].

    default is returned (as the very same object) when the entry is unset.
    """
    extra_args = params.get(param)
    if extra_args is not None:
        assert isinstance(extra_args, list)
        return extra_args
    return default
2015
2016
2017 class ISO639Utils(object):
2018 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2019 _lang_map = {
2020 'aa': 'aar',
2021 'ab': 'abk',
2022 'ae': 'ave',
2023 'af': 'afr',
2024 'ak': 'aka',
2025 'am': 'amh',
2026 'an': 'arg',
2027 'ar': 'ara',
2028 'as': 'asm',
2029 'av': 'ava',
2030 'ay': 'aym',
2031 'az': 'aze',
2032 'ba': 'bak',
2033 'be': 'bel',
2034 'bg': 'bul',
2035 'bh': 'bih',
2036 'bi': 'bis',
2037 'bm': 'bam',
2038 'bn': 'ben',
2039 'bo': 'bod',
2040 'br': 'bre',
2041 'bs': 'bos',
2042 'ca': 'cat',
2043 'ce': 'che',
2044 'ch': 'cha',
2045 'co': 'cos',
2046 'cr': 'cre',
2047 'cs': 'ces',
2048 'cu': 'chu',
2049 'cv': 'chv',
2050 'cy': 'cym',
2051 'da': 'dan',
2052 'de': 'deu',
2053 'dv': 'div',
2054 'dz': 'dzo',
2055 'ee': 'ewe',
2056 'el': 'ell',
2057 'en': 'eng',
2058 'eo': 'epo',
2059 'es': 'spa',
2060 'et': 'est',
2061 'eu': 'eus',
2062 'fa': 'fas',
2063 'ff': 'ful',
2064 'fi': 'fin',
2065 'fj': 'fij',
2066 'fo': 'fao',
2067 'fr': 'fra',
2068 'fy': 'fry',
2069 'ga': 'gle',
2070 'gd': 'gla',
2071 'gl': 'glg',
2072 'gn': 'grn',
2073 'gu': 'guj',
2074 'gv': 'glv',
2075 'ha': 'hau',
2076 'he': 'heb',
2077 'hi': 'hin',
2078 'ho': 'hmo',
2079 'hr': 'hrv',
2080 'ht': 'hat',
2081 'hu': 'hun',
2082 'hy': 'hye',
2083 'hz': 'her',
2084 'ia': 'ina',
2085 'id': 'ind',
2086 'ie': 'ile',
2087 'ig': 'ibo',
2088 'ii': 'iii',
2089 'ik': 'ipk',
2090 'io': 'ido',
2091 'is': 'isl',
2092 'it': 'ita',
2093 'iu': 'iku',
2094 'ja': 'jpn',
2095 'jv': 'jav',
2096 'ka': 'kat',
2097 'kg': 'kon',
2098 'ki': 'kik',
2099 'kj': 'kua',
2100 'kk': 'kaz',
2101 'kl': 'kal',
2102 'km': 'khm',
2103 'kn': 'kan',
2104 'ko': 'kor',
2105 'kr': 'kau',
2106 'ks': 'kas',
2107 'ku': 'kur',
2108 'kv': 'kom',
2109 'kw': 'cor',
2110 'ky': 'kir',
2111 'la': 'lat',
2112 'lb': 'ltz',
2113 'lg': 'lug',
2114 'li': 'lim',
2115 'ln': 'lin',
2116 'lo': 'lao',
2117 'lt': 'lit',
2118 'lu': 'lub',
2119 'lv': 'lav',
2120 'mg': 'mlg',
2121 'mh': 'mah',
2122 'mi': 'mri',
2123 'mk': 'mkd',
2124 'ml': 'mal',
2125 'mn': 'mon',
2126 'mr': 'mar',
2127 'ms': 'msa',
2128 'mt': 'mlt',
2129 'my': 'mya',
2130 'na': 'nau',
2131 'nb': 'nob',
2132 'nd': 'nde',
2133 'ne': 'nep',
2134 'ng': 'ndo',
2135 'nl': 'nld',
2136 'nn': 'nno',
2137 'no': 'nor',
2138 'nr': 'nbl',
2139 'nv': 'nav',
2140 'ny': 'nya',
2141 'oc': 'oci',
2142 'oj': 'oji',
2143 'om': 'orm',
2144 'or': 'ori',
2145 'os': 'oss',
2146 'pa': 'pan',
2147 'pi': 'pli',
2148 'pl': 'pol',
2149 'ps': 'pus',
2150 'pt': 'por',
2151 'qu': 'que',
2152 'rm': 'roh',
2153 'rn': 'run',
2154 'ro': 'ron',
2155 'ru': 'rus',
2156 'rw': 'kin',
2157 'sa': 'san',
2158 'sc': 'srd',
2159 'sd': 'snd',
2160 'se': 'sme',
2161 'sg': 'sag',
2162 'si': 'sin',
2163 'sk': 'slk',
2164 'sl': 'slv',
2165 'sm': 'smo',
2166 'sn': 'sna',
2167 'so': 'som',
2168 'sq': 'sqi',
2169 'sr': 'srp',
2170 'ss': 'ssw',
2171 'st': 'sot',
2172 'su': 'sun',
2173 'sv': 'swe',
2174 'sw': 'swa',
2175 'ta': 'tam',
2176 'te': 'tel',
2177 'tg': 'tgk',
2178 'th': 'tha',
2179 'ti': 'tir',
2180 'tk': 'tuk',
2181 'tl': 'tgl',
2182 'tn': 'tsn',
2183 'to': 'ton',
2184 'tr': 'tur',
2185 'ts': 'tso',
2186 'tt': 'tat',
2187 'tw': 'twi',
2188 'ty': 'tah',
2189 'ug': 'uig',
2190 'uk': 'ukr',
2191 'ur': 'urd',
2192 'uz': 'uzb',
2193 've': 'ven',
2194 'vi': 'vie',
2195 'vo': 'vol',
2196 'wa': 'wln',
2197 'wo': 'wol',
2198 'xh': 'xho',
2199 'yi': 'yid',
2200 'yo': 'yor',
2201 'za': 'zha',
2202 'zh': 'zho',
2203 'zu': 'zul',
2204 }
2205
@classmethod
def short2long(cls, code):
    """Map a two-letter language code to its three-letter counterpart.

    Only the first two characters of ``code`` are used for the lookup in
    ``cls._lang_map``, so a longer tag such as ``'en-US'`` is looked up
    as ``'en'``.  Returns ``None`` when that prefix is not a key of the
    map.
    """
    two_letter = code[:2]
    return cls._lang_map.get(two_letter)
2210
@classmethod
def long2short(cls, code):
    """Map a three-letter language code back to its two-letter key.

    Performs a reverse lookup over ``cls._lang_map``: the key of the
    first entry whose value equals ``code`` is returned, or ``None``
    when no entry matches.
    """
    matching_keys = (
        two_letter
        for two_letter, three_letter in cls._lang_map.items()
        if three_letter == code
    )
    # The explicit default reproduces the "not found" result of None.
    return next(matching_keys, None)
2217
2218
class ISO3166Utils(object):
    """Lookup helper from two-letter country codes to country names."""

    # From http://data.okfn.org/data/core/country-list
    # Keys are upper-case two-letter codes; values are the English names.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter country code.

        ``code`` is a two-letter (ISO 3166-1 alpha-2) country code; it is
        upper-cased before the lookup, so matching is case-insensitive.
        Returns ``None`` when the code is not in ``_country_map``.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
2477
2478
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler whose proxy can be chosen per request.

    A request may carry a ``Ytdl-request-proxy`` header; when present its
    value replaces the proxy that would otherwise be used for that
    request, and the header is removed before the request goes out.  The
    special value ``'__noproxy__'`` means "do not use a proxy".
    """

    def __init__(self, proxies=None):
        # Install fallback ``http_open``/``https_open`` handlers that go
        # through proxy_open() with the "no proxy" marker.  The base
        # initializer runs afterwards with ``proxies`` (presumably
        # replacing the fallback for schemes that have a configured
        # proxy -- see the ProxyHandler documentation).
        for scheme in ('http', 'https'):
            handler_name = '%s_open' % scheme
            # scheme and the bound method are captured as lambda defaults
            # so each handler keeps its own values.
            setattr(
                self, handler_name,
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Open ``req`` through ``proxy``, honouring a per-request override.

        If ``req.headers`` contains ``Ytdl-request-proxy``, that value is
        used instead of ``proxy`` and the header is deleted.  Returns
        ``None`` when the effective proxy is ``'__noproxy__'``; otherwise
        the result of the base class ``proxy_open``.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            # Strip the internal header so it is not sent with the request.
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy != '__noproxy__':
            return compat_urllib_request.ProxyHandler.proxy_open(
                self, req, proxy, type)
        return None  # No Proxy