]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
Imported Upstream version 2015.07.21
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_html_entities,
39 compat_http_client,
40 compat_kwargs,
41 compat_parse_qs,
42 compat_socket_create_connection,
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
49 shlex_quote,
50 )
51
52
# The class of compiled patterns has no portable public name, so take it
# from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default request headers; YoutubeDLHandler.http_request adds each of these
# to a request that does not already carry it.
std_headers = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept-Language', 'en-us,en;q=0.5'),
])


# Unique sentinel for "no default given" where None is a legitimate default.
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]
70
71
def preferredencoding():
    """Get preferred encoding.

    Asks the locale module for its preferred encoding and checks that the
    returned name can actually be used to encode text; if either step
    fails, 'UTF-8' is returned instead.
    """
    try:
        candidate = locale.getpreferredencoding()
        # Probe the codec: the locale may report a name Python cannot use.
        'TEST'.encode(candidate)
    except Exception:
        return 'UTF-8'
    return candidate
85
86
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    is_py2 = sys.version_info < (3, 0)

    if is_py2 and sys.platform != 'win32':
        fs_encoding = get_filesystem_encoding()

        # os.path.basename/dirname return bytes here, but NamedTemporaryFile
        # fails on non-ASCII names unless it is handed unicode objects.
        def path_basename(f):
            return os.path.basename(fn).decode(fs_encoding)

        def path_dirname(f):
            return os.path.dirname(fn).decode(fs_encoding)
    else:
        path_basename, path_dirname = os.path.basename, os.path.dirname

    # The temporary file lives next to the target so the final rename stays
    # on one filesystem.
    tmp_options = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # json.dump wants a byte stream on Python 2 and a text stream on Python 3.
    if is_py2:
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_options))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # os.rename refuses to overwrite on Windows (WindowsError or
            # FileExistsError), so drop any existing target first.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file, then re-raise.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
139
140
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] (Python 2.6 fallback) """
        # On 2.6 a unicode xpath makes './/node' miss direct children of '.',
        # so pass a byte string instead.
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        matching = (el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(matching, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are spliced into the expression, so restrict them to
        # characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
159
160 # On python2.6 the xml.etree.ElementTree.Element methods don't support
161 # the namespace parameter
162
163
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path to '{uri}tag' using ns_map.

    Steps without a prefix are kept as they are. A prefix missing from
    ns_map raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
174
175
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element matching xpath under node.

    When no element matches or it has no text: return default if one was
    given, else raise ExtractorError if fatal is set, else return None.
    name is used in the error message instead of the xpath.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text

    if default is not NO_DEFAULT:
        return default
    if not fatal:
        return None
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % name)
190
191
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the HTML document"""
    attribute = 'id'
    return get_element_by_attribute(attribute, id, html)
195
196
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag carrying attribute=value in the HTML document

    Returns None when no such tag is found. HTML entities in the content
    are unescaped.
    """
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content that begins with a quote loses its first and last character.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
218
219
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Source newlines carry no meaning in HTML; real line breaks come from
    # <br> tags and paragraph boundaries.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Drop every remaining tag
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Decode entities last, then trim surrounding whitespace
    return unescapeHTML(text).strip()
235
236
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a sanitized name if that fails.

    '-' stands for standard output (its binary buffer when available).
    If opening fails for a reason other than EACCES, the name is passed
    through sanitize_path and, when that changes it, opened again; an error
    from this second attempt propagates to the caller.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            stdout_stream = getattr(sys.stdout, 'buffer', sys.stdout)
            return (stdout_stream, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        # Renaming cannot help against a permission problem
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
267
268
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
276
277
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    def underscore_colons(match):
        return match.group(0).replace(':', '_')

    # Timestamps such as 12:34:56 keep their digits, joined by underscores
    s = re.sub(r'[0-9]+(?::[0-9]+)+', underscore_colons, s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs get the per-character replacement only
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result if result else '_'
314
315
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform the path is returned unchanged. On Windows the
    path is normalized and, in each component other than '.' and '..',
    every character Windows forbids in names and a trailing dot are
    replaced by '#'. A drive or UNC prefix is kept as is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    # splitdrive does not recognize UNC prefixes before Python 2.7
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component in front of the leading separator
        norm_path.pop(0)
    # Raw string with the same pattern value as before; the earlier non-raw
    # literal relied on the invalid escape sequence '\.'.
    forbidden = r'(?:[/<>:"\|\\?\*]|\.$)'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(forbidden, '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
332
333
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, keeping first-seen order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
341
342
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities are looked up
    in compat_html_entities.name2codepoint; numeric ones ('#123', '#x7b')
    are converted directly. Anything else, including a numeric reference
    whose value is not a valid code point, is returned literally as
    '&entity;'.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # int() accepts the '0x' prefix when base is 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Value outside the valid code point range: keep the literal
            # text instead of crashing on malformed markup.
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
361
362
def unescapeHTML(s):
    """Replace every '&...;' entity in s via _htmlentity_transform; None passes through"""
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', decode_entity, s)
370
371
def get_subprocess_encoding():
    """Return the encoding to use for arguments passed to subprocesses"""
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if on_modern_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        chosen = preferredencoding()
    else:
        chosen = sys.getfilesystemencoding()
    return 'utf-8' if chosen is None else chosen
382
383
def encodeFilename(s, for_subprocess=False):
    """
    Encode a file name for the file system or for a subprocess call.

    @param s The name of the file
    @param for_subprocess Set when the result is passed to a subprocess
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API. So do Windows 2000 and up, unless the name
    # is meant for a subprocess.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    keeps_unicode = (
        sys.version_info >= (3, 0) or
        (not for_subprocess and sys.platform == 'win32' and
         sys.getwindowsversion()[0] >= 5))
    if keeps_unicode:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
402
403
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string file name; other values are returned unchanged.

    On Python 3 nothing is decoded at all. for_subprocess is accepted but
    not used.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
413
414
def encodeArgument(s):
    """Encode a command-line argument for a subprocess call"""
    # Legacy code still passes byte strings, which are decoded as ASCII.
    # Once all post processors are fixed this should become:
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    text = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(text, True)
422
423
def decodeArgument(b):
    """Decode a subprocess argument; counterpart of encodeArgument"""
    for_subprocess = True
    return decodeFilename(b, for_subprocess)
426
427
def decodeOption(optval):
    """Return an option value as text, decoding bytes with the preferred encoding; None passes through"""
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
436
437
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used: hours appear from 3600 seconds
    on, minutes from 60 seconds on. The boundaries are inclusive so that
    exactly one hour renders as '1:00:00' (not '60:00') and exactly one
    minute as '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
445
446
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is switched off when params has a true
    'nocheckcertificate' entry. Extra keyword arguments are passed on to
    the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
470
471
def bug_reports_message():
    """Return the text appended to unexpected errors, asking the user to report a bug"""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    sentences = (
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    )
    return ''.join(sentences)
481
482
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is mentioned in the message; video_id, if given, is prefixed to it.
        """
        active_exc_info = sys.exc_info()

        # Errors raised while one of these is being handled count as expected
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if active_exc_info[0] in expected_types:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = active_exc_info  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none"""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
510
511
class UnsupportedError(ExtractorError):
    """Expected error for a URL that cannot be handled; the URL is kept in .url"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
517
518
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
522
523
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
536
537
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
545
546
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """ msg is the error message; it is also kept in the .msg attribute. """
        # Chain to Exception.__init__ like the other exception classes here,
        # rather than relying on BaseException.__new__ to record the args.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
556
557
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
561
562
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
570
571
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """ downloaded and expected are the received and the announced size, both in bytes. """
        # Pass both values to Exception so that .args stays (downloaded, expected)
        super(ContentTooShortError, self).__init__(downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected

    def __str__(self):
        # A readable message instead of the bare args tuple
        return 'Downloaded {0} bytes, expected {1} bytes'.format(
            self.downloaded, self.expected)
586
587
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an http_class connection, bound to the configured source address.

    The connection is built from args and kwargs. If the handler's params
    contain a 'source_address', the connection is made to originate from
    it: through the source_address attribute where the connection has one,
    otherwise by replacing its connect method.
    """
    connection = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return connection

    sa = (source_address, 0)
    if hasattr(connection, 'source_address'):  # Python 2.7+
        connection.source_address = sa
        return connection

    # Python 2.6: no source_address support, so connect by hand
    def _hc_connect(self, *args, **kwargs):
        sock = compat_socket_create_connection(
            (self.host, self.port), self.timeout, sa)
        if not is_https:
            self.sock = sock
            return
        self.sock = ssl.wrap_socket(
            sock, self.key_file, self.cert_file,
            ssl_version=ssl.PROTOCOL_TLSv1)

    connection.connect = functools.partial(_hc_connect, connection)
    return connection
608
609
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    Installed with an OpenerDirector, this class adds the standard headers
    to every HTTP request and transparently decodes gzipped and deflated
    responses. To avoid compression for one particular request, give that
    request the HTTP header "Youtubedl-No-Compression"; the header is
    removed before the real request is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        connection_factory = functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        # Try a raw deflate stream first, then one with a zlib header
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        # This addinfourl takes no status code argument: set it afterwards
        wrapped = addinfourl(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        for header, value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if header.capitalize() not in req.headers:
                req.add_header(header, value)

        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes cut off.
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request and response processing
    https_request = http_request
    https_response = http_response
701
702
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that creates its connections through _create_http_connection.

    https_conn_class is the connection class to use; it defaults to
    compat_http_client.HTTPSConnection.
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward the options the base handler stored, where it has them:
        # _context (python > 2.6) and _check_hostname (python 3.x).
        forwarded = (('context', '_context'), ('check_hostname', '_check_hostname'))
        kwargs = dict(
            (option, getattr(self, attribute))
            for option, attribute in forwarded
            if hasattr(self, attribute))
        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **kwargs)
718
719
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally
    followed by fractional seconds and by 'Z' or a UTC offset such as
    '+0100' or '-05:30'. timezone, if given, is a datetime.timedelta used
    as the UTC offset instead of one parsed from the string.

    Returns None if date_str is None or cannot be parsed.
    """

    if date_str is None:
        return None

    # strptime's %S takes no fraction, so drop fractional seconds wherever
    # they follow the seconds field (with or without a timezone after them).
    date_str = re.sub(r'(?<=:[0-9]{2})\.[0-9]+', '', date_str)

    if timezone is None:
        m = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                # A plain 'Z' means UTC
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
    except ValueError:
        # Not in the expected format: report "no date" instead of crashing
        return None
    return calendar.timegm(dt.timetuple())
744
745
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    date_str is tried against a list of known formats and, failing that,
    parsed as an RFC 2822 date. day_first selects day-before-month reading
    for ambiguous numeric dates such as 8/7/2009.

    Returns None if date_str is None or no format applies.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2, so cut the offset
    # off (unless the whole string is a D-M-YYYY date)
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # Every format is tried; if several apply, the last one wins.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            # parsedate_tz does not range-check its fields (day 32 gets
            # through), so datetime() may still reject the result.
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    return upload_date
809
810
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, ignoring anything after the first '?'.

    The text after the last '.' is returned when it is purely
    alphanumeric; otherwise, or when url is None, default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
819
820
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'"""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
823
824
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months count as 30 days and years as 365 days. A string in none of
    these formats raises ValueError."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: the pattern contains \d, an invalid escape sequence in an
    # ordinary string literal.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta takes plural keyword names ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: amount})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
852
853
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
862
863
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a missing bound is replaced by the earliest/latest representable date"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
893
894
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode a byte-string result with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
903
904
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    kernel32 = ctypes.windll.kernel32
    wintypes = ctypes.wintypes

    GetStdHandle = ctypes.WINFUNCTYPE(
        wintypes.HANDLE, wintypes.DWORD)(
        (b"GetStdHandle", kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE, wintypes.LPWSTR,
        wintypes.DWORD, ctypes.POINTER(wintypes.DWORD),
        wintypes.LPVOID)((b"WriteConsoleW", kernel32))
    written = wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(wintypes.DWORD, wintypes.DWORD)((b"GetFileType", kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE,
        ctypes.POINTER(wintypes.DWORD))(
        (b"GetConsoleMode", kernel32))
    INVALID_HANDLE_VALUE = wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
            return True
        return GetConsoleMode(handle, ctypes.byref(wintypes.DWORD())) == 0

    if not_a_console(h):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none
        for index, char in enumerate(text):
            if ord(char) > 0xffff:
                return index
        return len(text)

    remaining = s
    while remaining:
        # Write at most 1024 characters, stopping before a non-BMP character
        count = min(next_nonbmp_pos(remaining), 1024)

        # count == 0: the next character is non-BMP; 2 is passed as the length
        ret = WriteConsoleW(
            h, remaining, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            remaining = remaining[1:]
        else:
            assert written.value > 0
            remaining = remaining[written.value:]
    return True
978
979
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no encoding is given, _windows_write_string is tried
    first. Otherwise s is encoded, dropping characters that cannot be
    encoded, when out takes bytes or exposes a binary buffer, and written
    as text when it does neither.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if (sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno') and
            _windows_write_string(s, out)):
        return

    wants_bytes = (
        'b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        buffer_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(buffer_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1000
1001
def bytes_to_intlist(bs):
    """Return the items of the byte string bs as a list of integers"""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes already yields ints
        return list(bs)
    return [ord(char) for char in bs]
1009
1010
def intlist_to_bytes(xs):
    """Pack a sequence of byte values into a byte string; inverse of bytes_to_intlist"""
    if xs:
        return struct_pack('%dB' % len(xs), *xs)
    return b''
1015
1016
# Cross-platform file locking: _lock_file(f, exclusive) and _unlock_file(f)
if sys.platform != 'win32':
    import fcntl

    def _lock_file(f, exclusive):
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)

else:
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low and high halves of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because _unlock_file needs it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, flags, 0, whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0, whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1080
1081
class locked_file(object):
    """Context manager that opens a file and holds a lock on it.

    The file is opened immediately in __init__ (mode must be 'r', 'a' or
    'w').  Entering the context locks the file: shared for mode 'r',
    exclusive otherwise.  Leaving the context unlocks and closes it.
    Iteration, write() and read() are forwarded to the underlying file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError, which
            # is a different class from IOError on Python 2; catch both so
            # the file handle is never leaked when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking raised.
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1111
1112
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1116
1117
def shell_quote(args):
    """Return the arguments as one shell-quoted, space-separated string.

    Byte-string arguments are decoded with the file system encoding first.
    Quoting is done with shlex_quote, the helper this module already uses
    in args_to_str, instead of the deprecated pipes.quote.
    """
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(get_filesystem_encoding())
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1127
1128
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    The data is JSON-encoded, url-encoded under the key
    '__youtubedl_smuggle' and appended to the URL after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1135
1136
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    If the URL carries no '#__youtubedl_smuggle' part, (smug_url, default)
    is returned unchanged.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _sep, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(encoded)
    return smug_url, default
1144
1145
def format_bytes(bytes):
    """Format a byte count with a binary (1024-based) unit suffix.

    Returns 'N/A' for None.  A string argument is converted with float().
    The result has two decimals, e.g. '1.50KiB'.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the available suffixes: tiny fractional values produce a
        # negative exponent (which would index the list from its end) and
        # values of 1024 YiB or more would run past the last suffix.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1158
1159
def parse_filesize(s):
    """Parse a size such as '1.5MiB' or '3,2 kB' into a number of bytes.

    Returns None for None or when the string does not start with a number
    followed by a known unit.  A ',' in the number is treated as a decimal
    point.  The result is truncated to an int.
    """
    if s is None:
        return None

    # Unit table in a fixed order: plain bytes, then for every prefix the
    # four accepted spellings.  'XiB' and the lower-case-prefix 'xB' are
    # 1024-based, 'XB' and 'Xb' are 1000-based.  The lower-case spellings
    # are not official, but they are accepted as well.
    multipliers = [('B', 1), ('b', 1)]
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        multipliers.extend([
            (prefix + 'iB', binary),
            (prefix + 'B', decimal),
            (prefix.lower() + 'B', binary),
            (prefix + 'b', decimal),
        ])

    unit_alternatives = '|'.join(re.escape(unit) for unit, _ in multipliers)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % unit_alternatives, s)
    if m is None:
        return None

    factor = dict(multipliers)[m.group('unit')]
    number = float(m.group('num').replace(',', '.'))
    return int(number * factor)
1212
1213
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None if the name is not in ENGLISH_MONTH_NAMES.
    """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1221
1222
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations

    The abbreviation is the first three letters of the English month name;
    None is returned when nothing matches.
    """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1231
1232
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the entities matched by the
    lookahead below (named or numeric) are left untouched.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1239
1240
def setproctitle(title):
    """Try to set the process title through libc's prctl().

    Does nothing when libc.so.6 cannot be loaded or has no prctl symbol.
    title must be a text string; it is passed UTF-8 encoded.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes allocates len + 1 bytes, so the
    # string handed to prctl() is always NUL-terminated.  Sizing the buffer
    # to exactly len(title_bytes) leaves no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # Option 15 is presumably PR_SET_NAME; confirm against linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1254
1255
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it does not begin with start."""
    return s[len(start):] if s.startswith(start) else s
1260
1261
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged if it does not end with end."""
    # An empty suffix must be skipped explicitly: every string "ends with"
    # '' and s[:-0] is the empty string, not s.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1266
1267
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1271
1272
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports HEAD."""

    def get_method(self):
        return 'HEAD'
1276
1277
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default for None or ''.

    If get_attr is given and v is not None, the attribute of that name is
    read from v first (None if missing).  The converted value is multiplied
    by invscale and floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1285
1286
def str_or_none(v, default=None):
    """Return v converted to a text string, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1289
1290
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before conversion; None is
    passed through.
    """
    if int_str is None:
        return None
    cleaned = re.sub(r'[,\.\+]', '', int_str)
    return int(cleaned)
1297
1298
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, times invscale, divided by scale; default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1301
1302
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Accepts forms such as '1:02:03', '5 mins', '3 hours', '2h 30m',
    '10.5' or 'PT1H2M3S' (see the pattern below).  Returns None when s is
    not a string or does not match.  The "only minutes" and "only hours"
    forms return a float; the other forms return an int unless a fractional
    seconds part is present.
    """
    if not isinstance(s, compat_basestring):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if m is None:
        return None

    parts = m.groupdict()
    # A bare "N minutes" / "N hours" value may be fractional.
    if parts['only_mins']:
        return float(parts['only_mins']) * 60
    if parts['only_hours']:
        return float(parts['only_hours']) * (60 * 60)

    # Every remaining group is a whole number of its unit.
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    total = 0
    for group, factor in seconds_per_unit:
        if parts[group]:
            total += int(parts[group]) * factor
    if parts['ms']:
        # 'ms' is captured including its leading dot, i.e. a fraction of a second.
        total += float(parts['ms'])
    return total
1347
1348
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the filename's real extension.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1355
1356
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's real extension with ext.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1362
1363
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when starting the process raises OSError. """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1372
1373
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The executable is run with args; its combined stdout/stderr is handed
    to detect_exe_version together with version_re and unrecognized. """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1387
1388
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re must contain one capturing group; by default the token after
    the word "version" is taken.  Returns unrecognized when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    m = re.search(pattern, output)
    return m.group(1) if m else unrecognized
1398
1399
class PagedList(object):
    """Base class for paged lists; relies on a getslice() method."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1404
1405
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) for pages as they are needed.

    pagefunc returns an iterable with the entries of one page (0-based page
    number); pagesize is the number of entries in a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            page_first = pagenum * self._pagesize
            next_page_first = page_first + self._pagesize
            if start >= next_page_first:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page.
            if page_first <= start < next_page_first:
                lo = start % self._pagesize
            else:
                lo = 0

            # Offset just past the last wanted entry, if the slice ends in
            # this page.
            if end is not None and page_first <= end <= next_page_first:
                hi = ((end - 1) % self._pagesize) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if the current page is not "full",
            # i.e. holds fewer than pagesize entries, assume it is the last
            # one, so there is no need to query again.
            if len(entries) + lo < self._pagesize:
                break

            # If we got the whole page, but the next page is not
            # interesting, break out early as well
            if end == next_page_first:
                break
        return collected
1447
1448
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known beforehand.

    pagefunc returns an iterable with the entries of one page (0-based page
    number); pagecount is the number of pages and pagesize the number of
    entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        collected = []
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the first fetched page.
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice.
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1476
1477
def uppercase_escape(s):
    r"""Replace every literal \UXXXXXXXX escape in s with the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1484
1485
def lowercase_escape(s):
    r"""Replace every literal \uXXXX escape in s with the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
1492
1493
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # Python 2's quote() works on byte strings.
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1499
1500
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The path, params, query and fragment parts are escaped individually.
    """
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1510
# struct_pack / struct_unpack: struct.pack / struct.unpack, wrapped when this
# interpreter's struct module rejects text format strings.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(byte_spec, *args)

    def struct_unpack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(byte_spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1527
1528
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it.

    Each line is decoded as UTF-8 if it is not already text, stripped of a
    leading byte order mark and surrounding whitespace, and dropped if it is
    empty or starts with '#', ';' or ']'.  Returns the list of URLs.
    """
    # The UTF-8 BOM can show up in two shapes: as the single character
    # '\ufeff' when the bytes were decoded as UTF-8 (which is what the
    # decode() below does), or as the three characters '\xef\xbb\xbf' when
    # the bytes were decoded one character per byte.
    boms = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in boms:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1543
1544
def urlencode_postdata(*args, **kargs):
    """urlencode() the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1547
1548
# etree_iter(node): Element.iter where available, else a findall() fallback.
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1553
1554
def parse_xml(s):
    """Parse the XML text s into an ElementTree element, ignoring doctypes.

    On Python 2, byte-string element texts are decoded to unicode afterwards.
    """
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    kwargs = {}
    if sys.version_info >= (2, 7):
        # The parser keyword is only passed from Python 2.7 on.
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
1570
1571
# Rating label -> age limit, used by parse_age_limit() below.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Parse an age limit such as '18' or '18+', or a key of US_RATINGS.

    Returns the age as an int, or None for None and unrecognized input.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
1586
1587
def strip_jsonp(code):
    """Strip a JSONP wrapper "callback(...);" and return what is between the parentheses.

    Input that does not match the wrapper pattern is returned unchanged.
    """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1591
1592
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted strings
    (true, false and null are kept), and a comma directly before a closing
    ']' or '}' is removed.
    """
    # Rewrites applied inside a single-quoted string when it is re-quoted
    # with double quotes.
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: requote[esc.group(0)], token[1:-1])
        return '"%s"' % token

    # Matches double-quoted strings, single-quoted strings and identifiers.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before a closing bracket or brace.
    return re.sub(r',(\s*[\]}])', r'\1', res)
1616
1617
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if the id is not in the list.
    """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1626
1627
# %-style template using the named fields title, id and ext.
# NOTE(review): presumably the default output filename template - confirm
# against the code that uses it.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1629
1630
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that the result, including the
    trailing '...', is length characters long.  None is passed through.
    """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ellipses = '...'
    return s[:length - len(ellipses)] + ellipses
1639
1640
def version_tuple(v):
    """Split a version string on '.' and '-' and return its parts as a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
1643
1644
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    When version is empty or either version cannot be parsed, the answer is
    "not outdated" if assume_new is true, otherwise "outdated".
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
1652
1653
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute. """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
1659
1660
def args_to_str(args):
    """Return the arguments as one shell-quoted, space-separated string."""
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
1664
1665
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    The part after the last '/' is looked up in a small table of special
    cases; anything not listed there is returned as is.
    """
    subtype = mt.rpartition('/')[2]
    special_cases = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }
    if subtype in special_cases:
        return special_cases[subtype]
    return subtype
1674
1675
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of an open URL handle.

    The filename of an attachment Content-Disposition header is tried first;
    otherwise the extension is derived from the Content-Type header.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) if cd else None
    if m:
        e = determine_ext(m.group('filename'), default_ext=None)
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'))
1692
1693
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit set by the user, or content available for everyone.
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1702
1703
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object when the decoded text starts with optional
    whitespace followed by '<', otherwise None. """

    # Byte order marks and the encodings they announce.  The UTF-32 marks
    # are listed before the UTF-16 ones because the UTF-32-LE mark starts
    # with the UTF-16-LE mark.
    boms = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    text = None
    for mark, encoding in boms:
        if first_bytes.startswith(mark):
            text = first_bytes[len(mark):].decode(encoding, 'replace')
            break
    if text is None:
        # No byte order mark: treat the data as UTF-8.
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
1722
1723
def determine_protocol(info_dict):
    """Return the protocol for an info dict.

    An explicit 'protocol' entry wins.  Otherwise the protocol is derived
    from the 'url' entry: its rtmp/mms/rtsp prefix, then its m3u8/f4m
    extension, and finally its URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
1744
1745
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column except the last is left-aligned and padded to its widest
    cell; rows are joined with newlines. """
    table = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in table)
1752
1753
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against the dictionary dct.

    Two forms are understood: a comparison "key OP value" (OP being one of
    <, <=, >, >=, =, != and optionally followed by '?'), and a presence
    test "key" / "!key".  If the key of a comparison is missing, the '?'
    capture is returned (None when no '?' was given).  Raises ValueError
    for anything that cannot be parsed.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    comparison_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = comparison_rex.search(filter_part)
    if m:
        symbol = m.group('op')
        compare = COMPARISON_OPERATORS[symbol]
        if m.group('strval') is not None:
            # Strings can only be tested for (in)equality.
            if symbol not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % symbol)
            comparison_value = m.group('strval')
        else:
            raw_number = m.group('intval')
            try:
                comparison_value = int(raw_number)
            except ValueError:
                # Not a plain integer: try it as a file size, first as
                # written and then with a 'B' appended.
                comparison_value = parse_filesize(raw_number)
                if comparison_value is None:
                    comparison_value = parse_filesize(raw_number + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            raw_number, filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            return m.group('none_inclusive')
        return compare(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    presence_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = presence_rex.search(filter_part)
    if m:
        test = UNARY_OPERATORS[m.group('op')]
        return test(dct.get(m.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
1811
1812
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    filter_str is split on '&'; every part has to match. """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1818
1819
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function gives None when the dict passes the filter and a
    "skipping" message otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
1828
1829
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds (float).

    Understands a plain number of seconds with optional 's' suffix and the
    clock form H:MM:SS[.fraction].  An empty expression gives 0.0; anything
    else gives None.
    """
    if not time_expr:
        return 0.0

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds)

    return None
1841
1842
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates each component to its integer part.
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
1845
1846
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle text into SRT text.

    Paragraph elements are looked up in the ttml namespace, then in the
    ttaf1 namespace, then without namespace.  Raises ValueError when no
    paragraph is found.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        # Flatten a node to text: <br> becomes a newline, <span> is
        # flattened recursively, any other child is serialized as XML.
        str_or_empty = functools.partial(str_or_none, default='')

        pieces = [str_or_empty(node.text)]
        for child in node:
            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                pieces.append('\n' + str_or_empty(child.tail))
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                pieces.append(str_or_empty(parse_node(child)))
            else:
                pieces.append(str_or_empty(xml.etree.ElementTree.tostring(child)))
        return ''.join(pieces)

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No usable 'end' attribute: derive it from 'dur'.
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
1887
1888
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Two-letter code -> three-letter code, grouped by first letter.
    _lang_map = {
        'aa': 'aar', 'ab': 'abk', 'ae': 'ave', 'af': 'afr', 'ak': 'aka', 'am': 'amh',
        'an': 'arg', 'ar': 'ara', 'as': 'asm', 'av': 'ava', 'ay': 'aym', 'az': 'aze',
        'ba': 'bak', 'be': 'bel', 'bg': 'bul', 'bh': 'bih', 'bi': 'bis', 'bm': 'bam',
        'bn': 'ben', 'bo': 'bod', 'br': 'bre', 'bs': 'bos',
        'ca': 'cat', 'ce': 'che', 'ch': 'cha', 'co': 'cos', 'cr': 'cre', 'cs': 'ces',
        'cu': 'chu', 'cv': 'chv', 'cy': 'cym',
        'da': 'dan', 'de': 'deu', 'dv': 'div', 'dz': 'dzo',
        'ee': 'ewe', 'el': 'ell', 'en': 'eng', 'eo': 'epo', 'es': 'spa', 'et': 'est',
        'eu': 'eus',
        'fa': 'fas', 'ff': 'ful', 'fi': 'fin', 'fj': 'fij', 'fo': 'fao', 'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle', 'gd': 'gla', 'gl': 'glg', 'gn': 'grn', 'gu': 'guj', 'gv': 'glv',
        'ha': 'hau', 'he': 'heb', 'hi': 'hin', 'ho': 'hmo', 'hr': 'hrv', 'ht': 'hat',
        'hu': 'hun', 'hy': 'hye', 'hz': 'her',
        'ia': 'ina', 'id': 'ind', 'ie': 'ile', 'ig': 'ibo', 'ii': 'iii', 'ik': 'ipk',
        'io': 'ido', 'is': 'isl', 'it': 'ita', 'iu': 'iku',
        'ja': 'jpn', 'jv': 'jav',
        'ka': 'kat', 'kg': 'kon', 'ki': 'kik', 'kj': 'kua', 'kk': 'kaz', 'kl': 'kal',
        'km': 'khm', 'kn': 'kan', 'ko': 'kor', 'kr': 'kau', 'ks': 'kas', 'ku': 'kur',
        'kv': 'kom', 'kw': 'cor', 'ky': 'kir',
        'la': 'lat', 'lb': 'ltz', 'lg': 'lug', 'li': 'lim', 'ln': 'lin', 'lo': 'lao',
        'lt': 'lit', 'lu': 'lub', 'lv': 'lav',
        'mg': 'mlg', 'mh': 'mah', 'mi': 'mri', 'mk': 'mkd', 'ml': 'mal', 'mn': 'mon',
        'mr': 'mar', 'ms': 'msa', 'mt': 'mlt', 'my': 'mya',
        'na': 'nau', 'nb': 'nob', 'nd': 'nde', 'ne': 'nep', 'ng': 'ndo', 'nl': 'nld',
        'nn': 'nno', 'no': 'nor', 'nr': 'nbl', 'nv': 'nav', 'ny': 'nya',
        'oc': 'oci', 'oj': 'oji', 'om': 'orm', 'or': 'ori', 'os': 'oss',
        'pa': 'pan', 'pi': 'pli', 'pl': 'pol', 'ps': 'pus', 'pt': 'por',
        'qu': 'que',
        'rm': 'roh', 'rn': 'run', 'ro': 'ron', 'ru': 'rus', 'rw': 'kin',
        'sa': 'san', 'sc': 'srd', 'sd': 'snd', 'se': 'sme', 'sg': 'sag', 'si': 'sin',
        'sk': 'slk', 'sl': 'slv', 'sm': 'smo', 'sn': 'sna', 'so': 'som', 'sq': 'sqi',
        'sr': 'srp', 'ss': 'ssw', 'st': 'sot', 'su': 'sun', 'sv': 'swe', 'sw': 'swa',
        'ta': 'tam', 'te': 'tel', 'tg': 'tgk', 'th': 'tha', 'ti': 'tir', 'tk': 'tuk',
        'tl': 'tgl', 'tn': 'tsn', 'to': 'ton', 'tr': 'tur', 'ts': 'tso', 'tt': 'tat',
        'tw': 'twi', 'ty': 'tah',
        'ug': 'uig', 'uk': 'ukr', 'ur': 'urd', 'uz': 'uzb',
        've': 'ven', 'vi': 'vie', 'vo': 'vol',
        'wa': 'wln', 'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid', 'yo': 'yor',
        'za': 'zha', 'zh': 'zho', 'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked up; None is
        returned for unknown codes."""
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None for unknown codes."""
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
2090
2091 class ISO3166Utils(object):
2092 # From http://data.okfn.org/data/core/country-list
2093 _country_map = {
2094 'AF': 'Afghanistan',
2095 'AX': 'Åland Islands',
2096 'AL': 'Albania',
2097 'DZ': 'Algeria',
2098 'AS': 'American Samoa',
2099 'AD': 'Andorra',
2100 'AO': 'Angola',
2101 'AI': 'Anguilla',
2102 'AQ': 'Antarctica',
2103 'AG': 'Antigua and Barbuda',
2104 'AR': 'Argentina',
2105 'AM': 'Armenia',
2106 'AW': 'Aruba',
2107 'AU': 'Australia',
2108 'AT': 'Austria',
2109 'AZ': 'Azerbaijan',
2110 'BS': 'Bahamas',
2111 'BH': 'Bahrain',
2112 'BD': 'Bangladesh',
2113 'BB': 'Barbados',
2114 'BY': 'Belarus',
2115 'BE': 'Belgium',
2116 'BZ': 'Belize',
2117 'BJ': 'Benin',
2118 'BM': 'Bermuda',
2119 'BT': 'Bhutan',
2120 'BO': 'Bolivia, Plurinational State of',
2121 'BQ': 'Bonaire, Sint Eustatius and Saba',
2122 'BA': 'Bosnia and Herzegovina',
2123 'BW': 'Botswana',
2124 'BV': 'Bouvet Island',
2125 'BR': 'Brazil',
2126 'IO': 'British Indian Ocean Territory',
2127 'BN': 'Brunei Darussalam',
2128 'BG': 'Bulgaria',
2129 'BF': 'Burkina Faso',
2130 'BI': 'Burundi',
2131 'KH': 'Cambodia',
2132 'CM': 'Cameroon',
2133 'CA': 'Canada',
2134 'CV': 'Cape Verde',
2135 'KY': 'Cayman Islands',
2136 'CF': 'Central African Republic',
2137 'TD': 'Chad',
2138 'CL': 'Chile',
2139 'CN': 'China',
2140 'CX': 'Christmas Island',
2141 'CC': 'Cocos (Keeling) Islands',
2142 'CO': 'Colombia',
2143 'KM': 'Comoros',
2144 'CG': 'Congo',
2145 'CD': 'Congo, the Democratic Republic of the',
2146 'CK': 'Cook Islands',
2147 'CR': 'Costa Rica',
2148 'CI': 'Côte d\'Ivoire',
2149 'HR': 'Croatia',
2150 'CU': 'Cuba',
2151 'CW': 'Curaçao',
2152 'CY': 'Cyprus',
2153 'CZ': 'Czech Republic',
2154 'DK': 'Denmark',
2155 'DJ': 'Djibouti',
2156 'DM': 'Dominica',
2157 'DO': 'Dominican Republic',
2158 'EC': 'Ecuador',
2159 'EG': 'Egypt',
2160 'SV': 'El Salvador',
2161 'GQ': 'Equatorial Guinea',
2162 'ER': 'Eritrea',
2163 'EE': 'Estonia',
2164 'ET': 'Ethiopia',
2165 'FK': 'Falkland Islands (Malvinas)',
2166 'FO': 'Faroe Islands',
2167 'FJ': 'Fiji',
2168 'FI': 'Finland',
2169 'FR': 'France',
2170 'GF': 'French Guiana',
2171 'PF': 'French Polynesia',
2172 'TF': 'French Southern Territories',
2173 'GA': 'Gabon',
2174 'GM': 'Gambia',
2175 'GE': 'Georgia',
2176 'DE': 'Germany',
2177 'GH': 'Ghana',
2178 'GI': 'Gibraltar',
2179 'GR': 'Greece',
2180 'GL': 'Greenland',
2181 'GD': 'Grenada',
2182 'GP': 'Guadeloupe',
2183 'GU': 'Guam',
2184 'GT': 'Guatemala',
2185 'GG': 'Guernsey',
2186 'GN': 'Guinea',
2187 'GW': 'Guinea-Bissau',
2188 'GY': 'Guyana',
2189 'HT': 'Haiti',
2190 'HM': 'Heard Island and McDonald Islands',
2191 'VA': 'Holy See (Vatican City State)',
2192 'HN': 'Honduras',
2193 'HK': 'Hong Kong',
2194 'HU': 'Hungary',
2195 'IS': 'Iceland',
2196 'IN': 'India',
2197 'ID': 'Indonesia',
2198 'IR': 'Iran, Islamic Republic of',
2199 'IQ': 'Iraq',
2200 'IE': 'Ireland',
2201 'IM': 'Isle of Man',
2202 'IL': 'Israel',
2203 'IT': 'Italy',
2204 'JM': 'Jamaica',
2205 'JP': 'Japan',
2206 'JE': 'Jersey',
2207 'JO': 'Jordan',
2208 'KZ': 'Kazakhstan',
2209 'KE': 'Kenya',
2210 'KI': 'Kiribati',
2211 'KP': 'Korea, Democratic People\'s Republic of',
2212 'KR': 'Korea, Republic of',
2213 'KW': 'Kuwait',
2214 'KG': 'Kyrgyzstan',
2215 'LA': 'Lao People\'s Democratic Republic',
2216 'LV': 'Latvia',
2217 'LB': 'Lebanon',
2218 'LS': 'Lesotho',
2219 'LR': 'Liberia',
2220 'LY': 'Libya',
2221 'LI': 'Liechtenstein',
2222 'LT': 'Lithuania',
2223 'LU': 'Luxembourg',
2224 'MO': 'Macao',
2225 'MK': 'Macedonia, the Former Yugoslav Republic of',
2226 'MG': 'Madagascar',
2227 'MW': 'Malawi',
2228 'MY': 'Malaysia',
2229 'MV': 'Maldives',
2230 'ML': 'Mali',
2231 'MT': 'Malta',
2232 'MH': 'Marshall Islands',
2233 'MQ': 'Martinique',
2234 'MR': 'Mauritania',
2235 'MU': 'Mauritius',
2236 'YT': 'Mayotte',
2237 'MX': 'Mexico',
2238 'FM': 'Micronesia, Federated States of',
2239 'MD': 'Moldova, Republic of',
2240 'MC': 'Monaco',
2241 'MN': 'Mongolia',
2242 'ME': 'Montenegro',
2243 'MS': 'Montserrat',
2244 'MA': 'Morocco',
2245 'MZ': 'Mozambique',
2246 'MM': 'Myanmar',
2247 'NA': 'Namibia',
2248 'NR': 'Nauru',
2249 'NP': 'Nepal',
2250 'NL': 'Netherlands',
2251 'NC': 'New Caledonia',
2252 'NZ': 'New Zealand',
2253 'NI': 'Nicaragua',
2254 'NE': 'Niger',
2255 'NG': 'Nigeria',
2256 'NU': 'Niue',
2257 'NF': 'Norfolk Island',
2258 'MP': 'Northern Mariana Islands',
2259 'NO': 'Norway',
2260 'OM': 'Oman',
2261 'PK': 'Pakistan',
2262 'PW': 'Palau',
2263 'PS': 'Palestine, State of',
2264 'PA': 'Panama',
2265 'PG': 'Papua New Guinea',
2266 'PY': 'Paraguay',
2267 'PE': 'Peru',
2268 'PH': 'Philippines',
2269 'PN': 'Pitcairn',
2270 'PL': 'Poland',
2271 'PT': 'Portugal',
2272 'PR': 'Puerto Rico',
2273 'QA': 'Qatar',
2274 'RE': 'Réunion',
2275 'RO': 'Romania',
2276 'RU': 'Russian Federation',
2277 'RW': 'Rwanda',
2278 'BL': 'Saint Barthélemy',
2279 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2280 'KN': 'Saint Kitts and Nevis',
2281 'LC': 'Saint Lucia',
2282 'MF': 'Saint Martin (French part)',
2283 'PM': 'Saint Pierre and Miquelon',
2284 'VC': 'Saint Vincent and the Grenadines',
2285 'WS': 'Samoa',
2286 'SM': 'San Marino',
2287 'ST': 'Sao Tome and Principe',
2288 'SA': 'Saudi Arabia',
2289 'SN': 'Senegal',
2290 'RS': 'Serbia',
2291 'SC': 'Seychelles',
2292 'SL': 'Sierra Leone',
2293 'SG': 'Singapore',
2294 'SX': 'Sint Maarten (Dutch part)',
2295 'SK': 'Slovakia',
2296 'SI': 'Slovenia',
2297 'SB': 'Solomon Islands',
2298 'SO': 'Somalia',
2299 'ZA': 'South Africa',
2300 'GS': 'South Georgia and the South Sandwich Islands',
2301 'SS': 'South Sudan',
2302 'ES': 'Spain',
2303 'LK': 'Sri Lanka',
2304 'SD': 'Sudan',
2305 'SR': 'Suriname',
2306 'SJ': 'Svalbard and Jan Mayen',
2307 'SZ': 'Swaziland',
2308 'SE': 'Sweden',
2309 'CH': 'Switzerland',
2310 'SY': 'Syrian Arab Republic',
2311 'TW': 'Taiwan, Province of China',
2312 'TJ': 'Tajikistan',
2313 'TZ': 'Tanzania, United Republic of',
2314 'TH': 'Thailand',
2315 'TL': 'Timor-Leste',
2316 'TG': 'Togo',
2317 'TK': 'Tokelau',
2318 'TO': 'Tonga',
2319 'TT': 'Trinidad and Tobago',
2320 'TN': 'Tunisia',
2321 'TR': 'Turkey',
2322 'TM': 'Turkmenistan',
2323 'TC': 'Turks and Caicos Islands',
2324 'TV': 'Tuvalu',
2325 'UG': 'Uganda',
2326 'UA': 'Ukraine',
2327 'AE': 'United Arab Emirates',
2328 'GB': 'United Kingdom',
2329 'US': 'United States',
2330 'UM': 'United States Minor Outlying Islands',
2331 'UY': 'Uruguay',
2332 'UZ': 'Uzbekistan',
2333 'VU': 'Vanuatu',
2334 'VE': 'Venezuela, Bolivarian Republic of',
2335 'VN': 'Viet Nam',
2336 'VG': 'Virgin Islands, British',
2337 'VI': 'Virgin Islands, U.S.',
2338 'WF': 'Wallis and Futuna',
2339 'EH': 'Western Sahara',
2340 'YE': 'Yemen',
2341 'ZM': 'Zambia',
2342 'ZW': 'Zimbabwe',
2343 }
2344
@classmethod
def short2full(cls, code):
    """Return the full country name for a two-letter country code.

    The code is upper-cased before the lookup in ``cls._country_map``
    (whose keys look like ISO 3166-1 alpha-2 codes, e.g. 'DE'), so the
    match is case-insensitive. Returns None when the code is not a key
    of the map.
    """
    normalized = code.upper()
    return cls._country_map.get(normalized)
2349
2350
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets an individual request pick its own proxy.

    A request may carry a 'Ytdl-request-proxy' header; when present, its
    value is used as the proxy for that request and the header is removed
    from the request. The special proxy value '__noproxy__' means that no
    proxy is applied.
    """

    def __init__(self, proxies=None):
        """Install default http/https openers, then run the base initializer.

        proxies is passed unchanged to ProxyHandler.__init__.
        """
        # Default openers are set before the base-class initializer runs;
        # they forward to proxy_open with the '__noproxy__' marker.
        for scheme in ('http', 'https'):
            default_opener = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_opener)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Open req through proxy, honouring a per-request override.

        If req has a 'Ytdl-request-proxy' header, its value replaces proxy
        and the header is deleted from req.headers. Returns None when the
        effective proxy is '__noproxy__'; otherwise returns whatever the
        base class's proxy_open returns.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            # Consume the header: it is removed from the request once read.
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy != '__noproxy__':
            return compat_urllib_request.ProxyHandler.proxy_open(
                self, req, proxy, type)
        return None  # No Proxy