# youtube_dl/utils.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import email.header
15 import errno
16 import functools
17 import gzip
18 import io
19 import itertools
20 import json
21 import locale
22 import math
23 import operator
24 import os
25 import platform
26 import random
27 import re
28 import socket
29 import ssl
30 import subprocess
31 import sys
32 import tempfile
33 import traceback
34 import xml.etree.ElementTree
35 import zlib
36
37 from .compat import (
38 compat_HTMLParseError,
39 compat_HTMLParser,
40 compat_basestring,
41 compat_chr,
42 compat_etree_fromstring,
43 compat_expanduser,
44 compat_html_entities,
45 compat_html_entities_html5,
46 compat_http_client,
47 compat_kwargs,
48 compat_os_name,
49 compat_parse_qs,
50 compat_shlex_quote,
51 compat_socket_create_connection,
52 compat_str,
53 compat_struct_pack,
54 compat_struct_unpack,
55 compat_urllib_error,
56 compat_urllib_parse,
57 compat_urllib_parse_urlencode,
58 compat_urllib_parse_urlparse,
59 compat_urllib_parse_unquote_plus,
60 compat_urllib_request,
61 compat_urlparse,
62 compat_xpath,
63 )
64
65 from .socks import (
66 ProxyType,
67 sockssocket,
68 )
69
70
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc.

    In Python < 2.6.5 urlsplit() mishandles schemes missing from
    urlparse.uses_netloc (https://bugs.python.org/issue7904), so the
    SOCKS schemes are appended to that list once.
    """
    known = compat_urlparse.uses_netloc
    missing = [scheme for scheme in ('socks', 'socks4', 'socks4a', 'socks5')
               if scheme not in known]
    known.extend(missing)
78
79
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers merged into every outgoing request
# (see YoutubeDLHandler.http_request below).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name.
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel distinguishing "caller supplied no default" from "default is None"
# (see xpath_element()/xpath_text()/xpath_attr()).
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code; used when parsing textual dates.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# Media file extensions known to youtube-dl.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# strptime() layouts tried, in order, by the date-parsing helpers
# elsewhere in this module.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

# Additional layouts for inputs that write the day before the month.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# ... and for inputs that write the month first.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the trailing argument list of "packed" (P.A.C.K.E.R.-style)
# JavaScript — presumably consumed by a decoder elsewhere; confirm usage.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
184
185
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        enc = locale.getpreferredencoding()
        # Probe the encoding: an unusable name falls through to UTF-8.
        'TEST'.encode(enc)
    except Exception:
        return 'UTF-8'
    return enc
199
200
def write_json_file(obj, fn):
    """Encode obj as JSON and write it to filename fn, atomically if possible.

    The data is first written to a temporary file created next to fn and
    then renamed over it, so an interrupted write never leaves a truncated
    fn behind. On failure the temporary file is removed (best effort) and
    the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        # (Fixed: these lambdas previously ignored their argument and
        # closed over fn, which only worked because they were always
        # called with fn itself.)
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best effort clean-up of the temporary file, then re-raise.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
253
254
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Only plain attribute names can be inlined into the predicate.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Python 2.6 ElementTree lacks attribute predicates in .find(),
        # so scan every match manually; returns None when nothing matches.
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
269
270 # On python2.6 the xml.etree.ElementTree.Element methods don't support
271 # the namespace parameter
272
273
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form using ns_map."""
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
284
285
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a string, or an iterable of
    candidate xpaths tried in order). Returns `default` when supplied and
    nothing matches, raises ExtractorError when fatal, else returns None."""
    def _find(xp):
        return node.find(compat_xpath(xp))

    if isinstance(xpath, (str, compat_str)):
        n = _find(xpath)
    else:
        for candidate in xpath:
            n = _find(candidate)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element %s' % (xpath if name is None else name))
    return None
307
308
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but returns the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    # Element found, but it carries no text.
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
322
323
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the element matched by xpath, honouring
    default/fatal the same way xpath_element() does."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML attribute %s'
            % ('%s[@%s]' % (xpath, key) if name is None else name))
    return None
335
336
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is just an attribute lookup on 'id'.
    return get_element_by_attribute('id', id, html)
340
341
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying the given CSS class,
    or None when no tag matches."""
    matches = get_elements_by_class(class_name, html)
    return matches[0] if matches else None
346
347
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose attribute equals value,
    or None when no tag matches."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    return matches[0] if matches else None
351
352
def get_elements_by_class(class_name, html):
    """Return the content of all tags carrying the given CSS class as a list."""
    # Match the class as a whole word anywhere inside the attribute value;
    # the pattern is already a regex, so downstream escaping is disabled.
    class_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_re, html, escape_value=False)
358
359
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # With escape_value=False the caller supplies a ready-made regex for value.
    value = re.escape(value) if escape_value else value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        # Strip a leading quote pair left around the content by sloppy markup.
        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist
383
384
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Attribute dict of the last start tag seen (empty before any tag).
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a list of (name, value) pairs.
        self.attrs = dict(attrs)
393
394
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML;
    # whatever was parsed before the error is still returned.
    except compat_HTMLParseError:
        pass
    return parser.attrs
419
420
def clean_html(html):
    """Clean an HTML snippet into a readable plain-text string."""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Original newlines are insignificant in HTML; <br> and paragraph
    # boundaries become the real line breaks.
    cleaned = html.replace('\n', ' ')
    cleaned = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', cleaned)
    cleaned = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', cleaned)
    # Drop every remaining tag, then decode HTML entities.
    cleaned = re.sub('<.*?>', '', cleaned)
    return unescapeHTML(cleaned).strip()
436
437
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode so written data is not mangled.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming; re-raise as-is.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
468
469
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a Unix timestamp, or None."""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a valid RFC 2822 date.
        return None
    return email.utils.mktime_tz(parsed)
477
478
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Keep timestamps like 12:34:56 readable: join digit groups with '_'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse runs of underscores introduced above.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[1:]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
518
519
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # On all other platforms the path is returned untouched.
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitdrive() does not recognise UNC paths before Python 2.7.
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters invalid in Windows path components, plus a trailing
    # space/dot, with '#'; '.' and '..' steps are kept intact.
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
536
537
538 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
539 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prefix scheme-relative URLs ('//host/path') with 'http:'."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
542
543
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after passing the URL through sanitize_url()."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
546
547
def expand_path(s):
    """Expand shell variables and ~"""
    # Environment variables first, then the user's home directory.
    return os.path.expandvars(compat_expanduser(s))
551
552
def orderedSet(iterable):
    """Return a list of the iterable's elements with duplicates removed,
    preserving first-seen order.

    Hashable elements are tracked in a set for O(n) overall behaviour
    (the original implementation scanned the result list per element,
    i.e. O(n^2)); unhashable elements fall back to the list scan so any
    input previously accepted still works.
    """
    res = []
    seen = set()
    for el in iterable:
        try:
            duplicate = el in seen
            if not duplicate:
                seen.add(el)
        except TypeError:
            # Unhashable element: the slow membership test is still correct.
            duplicate = el in res
        if not duplicate:
            res.append(el)
    return res
560
561
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal '#160' or hexadecimal '#xA0'.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
591
592
def unescapeHTML(s):
    """Replace HTML entities ('&amp;', '&#38;', ...) in s with their characters."""
    if s is None:
        return None
    assert type(s) == compat_str

    # Grab '&' plus everything up to and including the next ';'.
    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
600
601
def get_subprocess_encoding():
    """Return the encoding to use for subprocess arguments and filenames."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    # getfilesystemencoding() may return None on some Python 2 setups.
    if encoding is None:
        encoding = 'utf-8'
    return encoding
612
613
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Encode even on Windows (subprocesses need bytes on Python 2)
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    # Python 2 on POSIX (or when targeting a subprocess): encode to bytes.
    return s.encode(get_subprocess_encoding(), 'ignore')
636
637
def decodeFilename(b, for_subprocess=False):
    """Decode a byte filename back to text on Python 2; pass anything
    else (Python 3 values, already-text names) through unchanged."""
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
647
648
def encodeArgument(s):
    """Encode a subprocess argument the same way as a filename."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
656
657
def decodeArgument(b):
    """Decode a subprocess argument, mirroring encodeArgument()."""
    return decodeFilename(b, for_subprocess=True)
660
661
def decodeOption(optval):
    """Decode a command-line option value to text; None passes through."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
670
671
def formatSeconds(secs):
    """Format a duration in seconds as '[H:]MM:SS'-style text.

    Boundary fix: the original used strict '>' comparisons, so exactly
    3600 seconds rendered as '60:00' and exactly 60 seconds as '60';
    '>=' puts the boundary values into the correct bucket ('1:00:00'
    and '1:00' respectively).
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    if secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
679
680
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate' option,
    picking the best TLS setup the running interpreter supports."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname must be disabled before switching off verification.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
704
705
def bug_reports_message():
    """Return the boilerplate suffix asking users to report unexpected errors."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd
    )
715
716
class YoutubeDLError(Exception):
    """Root of the youtube-dl exception hierarchy; catch this to handle
    any custom error raised by this module."""
720
721
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network and timeout problems are always "expected" (not our bug).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the bug-report boilerplate appended.
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback as text, or None when none was given.
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
749
750
class UnsupportedError(ExtractorError):
    """Raised for URLs youtube-dl cannot handle; always an 'expected' error."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        # Keep the offending URL for programmatic inspection.
        self.url = url
756
757
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match."""
761
762
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        # Always "expected": geo blocks are not youtube-dl bugs.
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.msg = msg
        # Optional collection of country codes, stored as passed by the caller.
        self.countries = countries
773
774
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Preserved so callers can log or re-raise the original failure.
        self.exc_info = exc_info
787
788
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
796
797
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        # Raw message kept separately from the formatted Exception args.
        self.msg = msg
808
809
class MaxDownloadsReached(YoutubeDLError):
    """Raised once the --max-downloads limit has been reached."""
813
814
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
822
823
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """downloaded and expected are byte counts."""
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
839
840
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended-attribute metadata fails.

    self.reason classifies the failure for callers:
    'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # Bug fix: the message check previously only looked for the
        # misspelling 'Disk quota excedded', which never matches the real
        # kernel error text 'Disk quota exceeded'. Both spellings are now
        # accepted to stay compatible with anything relying on the old one.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or
                'Disk quota exceeded' in self.msg or
                'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
855
856
class XAttrUnavailableError(YoutubeDLError):
    """Raised when extended-attribute support is unavailable (per its name;
    raisers are outside this file)."""
859
860
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, applying the configured source_address.

    Used (via functools.partial) as the connection factory passed to
    urllib's do_open().
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: replace connect() with a version
            # that binds the socket to the requested address itself.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
886
887
def handle_youtubedl_headers(headers):
    """Apply and strip internal 'Youtubedl-*' pseudo-headers.

    'Youtubedl-no-compression' removes any Accept-Encoding header
    (case-insensitively) along with itself; without it the headers are
    returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    return dict(
        (k, v) for k, v in headers.items()
        if k.lower() != 'accept-encoding' and k != 'Youtubedl-no-compression')
896
897
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # YoutubeDL options dict; read here via _create_http_connection
        # (e.g. 'source_address').
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Route through a SOCKS proxy when the internal pseudo-header is set.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Try raw deflate first (no zlib header), then zlib-wrapped data;
        # servers are inconsistent about which variant they send.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Fill in the standard headers unless the caller already set them.
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry while trimming up to 1023 bytes of trailing garbage.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS uses the same request/response processing.
    https_request = http_request
    https_response = http_response
1019
1020
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from base_class that tunnels its socket
    through the SOCKS proxy described by the socks_proxy URL."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): any other scheme leaves socks_type unbound and raises
    # NameError below — presumably callers only pass socks* schemes; confirm.

    def unquote_if_non_empty(s):
        # Username/password may be percent-encoded in the proxy URL.
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            # Open the proxied socket first, then let the base class use it.
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1062
1063
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPSHandler supporting a custom connection class and per-request
    SOCKS proxying requested via the internal Ytdl-socks-proxy header."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_class = self._https_conn_class
        conn_kwargs = {}

        # Forward the SSL context / hostname checking when the stdlib
        # handler exposes them
        if hasattr(self, '_context'):  # python > 2.6
            conn_kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            conn_kwargs['check_hostname'] = self._check_hostname

        # Internal header smuggled in to request SOCKS proxying; it must
        # never reach the wire
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(
            functools.partial(_create_http_connection, self, conn_class, True),
            req, **conn_kwargs)
1087
1088
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that mirrors the HTTP hooks onto HTTPS as well."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are
        # non-ASCII characters in the Set-Cookie HTTP header of the last
        # response (see https://github.com/rg3/youtube-dl/issues/6769).
        # A percent-encoding workaround for Set-Cookie/Set-Cookie2 used to
        # live here; it is currently disabled upstream:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1111
1112
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (utc_offset, remainder): utc_offset is a datetime.timedelta
    (zero for 'Z' or when no offset is present) and remainder is the
    string with the timezone part removed.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m is None:
        # No recognizable suffix: assume UTC, keep the string intact
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # Bare 'Z' means UTC
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
1129
1130
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Fractional seconds are not covered by the strptime format below
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)) - timezone
    except ValueError:
        # Unparsable: mirror the original behavior of returning None
        return None
    return calendar.timegm(dt.timetuple())
1148
1149
def date_formats(day_first=True):
    """Return the strptime format list matching the expected day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1152
1153
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD, or None."""
    if date_str is None:
        return None

    # Commas never carry meaning in the supported formats
    cleaned = date_str.replace(',', ' ')
    # Strip AM/PM markers plus an optional trailing timezone abbreviation
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    result = None
    # NB: deliberately no break - when several formats match, the last
    # successful one wins (preserves historical behavior)
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            pass

    if result is None:
        # Fall back to RFC 2822-style parsing
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                result = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is not None:
        return compat_str(result)
1180
1181
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for a free-form date string, or None."""
    if date_str is None:
        return None

    cleaned = re.sub(r'[,|]', '', date_str)

    # 12-hour clock: shift by 12 hours when a PM marker is present
    pm_delta = 12 if re.search(r'(?i)PM', cleaned) else 0
    timezone, cleaned = extract_timezone(cleaned)

    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', cleaned)
    if m:
        cleaned = cleaned[:-len(m.group('tz'))]

    for fmt in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(cleaned, fmt) - timezone + datetime.timedelta(hours=pm_delta)
        except ValueError:
            continue
        return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822-style parsing
    timetuple = email.utils.parsedate_tz(cleaned)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
1208
1209
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to *default_ext*."""
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1221
1222
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name for *filename* from language and format."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1225
1226
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # timedelta has no month/year units; approximate them in days
    if unit == 'month':
        unit = 'day'
        amount *= 30
    elif unit == 'year':
        unit = 'day'
        amount *= 365
    return today + datetime.timedelta(**{unit + 's': amount})
1254
1255
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Anything else is passed through untouched
        return date_str
    return '-'.join(match.groups())
1264
1265
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1295
1296
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Some Python 2 setups hand back bytes here; normalize to text
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1305
1306
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # Rationale: the Windows console cannot reliably print Unicode via the
    # normal byte stream; WriteConsoleW accepts UTF-16 text directly.

    import ctypes
    import ctypes.wintypes

    # Map stream fileno -> GetStdHandle id (stdout = -11, stderr = -12)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A redirected stream (file/pipe) is not a character device, or
        # GetConsoleMode fails on it; in that case fall back to normal I/O
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (these need a UTF-16 surrogate pair), or len(s) when none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write BMP characters in chunks of up to 1024
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            # A non-BMP character occupies two UTF-16 code units
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1380
1381
def write_string(s, out=None, encoding=None):
    """Write the unicode string *s* to *out* (stderr by default), coping
    with Windows consoles and byte-oriented streams."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows, prefer the console API, which handles Unicode natively
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a byte buffer: encode ourselves
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1402
1403
def bytes_to_intlist(bs):
    """Convert a bytes (or Python 2 str) object into a list of byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    # Python 2: indexing a str yields 1-character strings
    return [ord(c) for c in bs]
1411
1412
def intlist_to_bytes(xs):
    """Pack a list of byte values back into a bytes object."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
1417
1418
# Cross-platform file locking: defines _lock_file(f, exclusive) and
# _unlock_file(f) for the current platform (Win32 LockFileEx, POSIX flock,
# or no-op stubs that raise IOError where neither is available).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure required by
        # LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest possible byte range, i.e. the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for _unlock_file
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1492
1493
class locked_file(object):
    """File wrapper holding an advisory lock for the duration of a ``with``
    block: shared for reads, exclusive for writes/appends."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers may share the lock; writers/appenders need exclusivity
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Never leave the descriptor open when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1523
1524
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1528
1529
def shell_quote(args):
    """Return a shell-ready command line string for the argument list."""
    encoding = get_filesystem_encoding()

    def to_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(compat_shlex_quote(to_text(a)) for a in args)
1539
1540
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with anything already smuggled into the URL
    url, existing_data = unsmuggle_url(url, {})
    data.update(existing_data)
    smuggled = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, smuggled)
1549
1550
def unsmuggle_url(smug_url, default=None):
    """Extract data hidden by smuggle_url(); returns (url, data)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, smuggled = smug_url.rpartition('#')
    payload = compat_parse_qs(smuggled)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1558
1559
def format_bytes(bytes):
    """Render a byte count as a human-readable string, e.g. '1.00MiB'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # math.log(0) is undefined; zero bytes stays in the 'B' bucket
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1572
1573
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number> <unit>' prefix of *s* using *unit_table*.

    Returns the value in base units as an int, or None on no match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Normalize a decimal comma to a dot before converting
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1583
1584
def parse_filesize(s):
    """Parse a human-readable file size ('5.4 MiB', '200KB') into bytes,
    or None when *s* is None or unparsable."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # Convention in this table: 'xB' (lowercase letter + B) is treated as
    # binary (1024^n), while 'Xb'/'xb' are decimal (1000^n).
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1654
1655
def parse_count(s):
    """Parse view/like counts such as '1.2M' or '12,345' into an int."""
    if s is None:
        return None

    s = s.strip()

    # Pure numeric, possibly with thousands separators
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    suffix_multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(suffix_multipliers, s)
1675
1676
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    # index() would raise ValueError exactly when the name is absent
    if name not in month_names:
        return None
    return month_names.index(name) + 1
1686
1687
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [s[:3] for s in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1696
1697
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in *xml_str* by '&amp;', leaving existing
    entities and character references alone."""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1704
1705
def setproctitle(title):
    """Set the process name shown by tools like ps via glibc prctl();
    silently does nothing on platforms where that is unavailable."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1730
1731
def remove_start(s, start):
    """Strip the prefix *start* from *s* when present (None-safe)."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1734
1735
def remove_end(s, end):
    """Strip the suffix *end* from *s* when present (None-safe)."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
1738
1739
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    # Only strip when both ends carry the same quote character
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1747
1748
def url_basename(url):
    """Return the last path component of *url* (ignoring query/fragment)."""
    path = compat_urlparse.urlparse(url).path
    # Text after the last '/', ignoring any trailing slashes
    return path.rstrip('/').rpartition('/')[2]
1752
1753
def base_url(url):
    """Return the scheme://host/path/ part of *url*, up to and including
    the last '/' before any '?', '#' or '&'."""
    m = re.match(r'https?://[^?#&]+/', url)
    return m.group(0)
1756
1757
def urljoin(base, path):
    """Resolve *path* against *base*; returns None when either is unusable."""
    def as_text(s):
        return s.decode('utf-8') if isinstance(s, bytes) else s

    path = as_text(path)
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:https?:)?//', path):
        # Already absolute (or protocol-relative)
        return path
    base = as_text(base)
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
1771
1772
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP verb is always HEAD."""

    def get_method(self):
        return 'HEAD'
1776
1777
class PUTRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP verb is always PUT."""

    def get_method(self):
        return 'PUT'
1781
1782
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to an int scaled by invscale/scale, or return *default*.

    When get_attr is given, v is first replaced by getattr(v, get_attr, None).
    None and the empty string map to *default*.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric objects (e.g. lists, bytes) that
        # previously escaped this "or none" helper as an exception
        return default
1795
1796
def str_or_none(v, default=None):
    """Coerce *v* to text unless it is None."""
    if v is None:
        return default
    return compat_str(v)
1799
1800
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    if isinstance(int_str, int):
        # Already numeric - previously this crashed with a TypeError
        # inside re.sub; pass it through unchanged
        return int_str
    # Drop thousands separators and stray '+' signs before converting
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
1807
1808
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to a float scaled by invscale/scale, or return *default*."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric objects (e.g. lists, dicts) that
        # previously escaped this "or none" helper as an exception
        return default
1816
1817
def bool_or_none(v, default=None):
    """Return *v* only when it is a genuine bool, else *default*."""
    if isinstance(v, bool):
        return v
    return default
1820
1821
def strip_or_none(v):
    """None-safe str.strip()."""
    if v is None:
        return None
    return v.strip()
1824
1825
def parse_duration(s):
    """Parse a free-form duration string (e.g. '1:23:45', '2h 3min',
    'PT1H2M3S', '2.5 hours') into a float number of seconds, or None."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # First attempt: colon-separated [[[DD:]HH:]MM:]SS[.ms][Z]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Second attempt: unit-suffixed forms ('1d 2h 3min 4.5s') and
        # ISO 8601-style 'PT#H#M#S'
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Final attempt: fractional 'X hours' / 'X mins' forms
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    # Sum whichever components matched; 'ms' still carries its leading dot,
    # so adding float(ms) contributes the fractional seconds
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
1872
1873
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension of *filename*.

    When expected_real_ext is given and the actual extension differs,
    *ext* is appended to the whole name instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        # Unexpected extension: tack ext on the end rather than inserting
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1880
1881
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the extension of *filename* for *ext*.

    When expected_real_ext is given and the actual extension differs,
    *ext* is appended instead of replacing."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
1887
1888
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        # Not found / not runnable
        return False
    return exe
1897
1898
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1916
1917
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version number from an executable's output, returning
    *unrecognized* when no version can be found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1927
1928
class PagedList(object):
    """Base class for lazily-paged sequences; subclasses implement
    getslice(start, end)."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1933
1934
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily through pagefunc(pagenum),
    optionally caching each fetched page."""

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc       # pagenum -> iterable of entries
        self._pagesize = pagesize       # number of entries per full page
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}            # pagenum -> list of entries

    def getslice(self, start=0, end=None):
        """Return the entries with indices [start, end) as a list."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # Page lies entirely before the requested range
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # One past the last wanted entry within this page (None = all)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1985
1986
class InAdvancePagedList(PagedList):
    """Paged list where the total number of pages is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc       # pagenum -> iterable of entries
        self._pagecount = pagecount     # total number of pages
        self._pagesize = pagesize       # number of entries per full page

    def getslice(self, start=0, end=None):
        """Return the entries with indices [start, end) as a list."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Leading entries of the first page that precede `start`
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None = everything)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page may need its head dropped
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # Final page: trim to the requested count and stop
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
2014
2015
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escapes in *s* into real characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
2022
2023
def lowercase_escape(s):
    """Decode literal \\uXXXX escapes in *s* into real characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
2030
2031
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2 quote() needs bytes input for non-ASCII text
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Keep reserved characters and sub-delims intact
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2037
2038
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    return parts._replace(
        # Non-ASCII host names go through IDNA encoding
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    ).geturl()
2049
2050
def read_batch_urls(batch_fd):
    """Read a batch-file object and return its list of usable URLs.

    Strips a UTF-8 BOM and surrounding whitespace; lines starting with
    '#', ';' or ']' are treated as comments and dropped."""
    def sanitize(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(sanitize, fd) if url]
2065
2066
def urlencode_postdata(*args, **kargs):
    # urlencode the arguments and convert to ASCII bytes, ready for use
    # as an HTTP POST body
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2069
2070
def update_url_query(url, query):
    """Return *url* with the pairs in *query* merged into its query string."""
    if not query:
        return url
    parsed = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parsed.query)
    params.update(query)
    return compat_urlparse.urlunparse(parsed._replace(
        query=compat_urllib_parse_urlencode(params, True)))
2079
2080
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone *req*, optionally overriding its url, data, headers or query."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)
    # Preserve the HTTP verb of the original request
    method = req.get_method()
    if method == 'HEAD':
        req_type = HEADRequest
    elif method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        new_url, data=new_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2099
2100
def _multipart_encode_impl(data, boundary):
    """Serialize the dict *data* as multipart/form-data with *boundary*;
    raises ValueError when the boundary occurs inside a part."""
    content_type = 'multipart/form-data; boundary=%s' % boundary

    boundary_bytes = boundary.encode('ascii')
    out = b''
    for name, value in data.items():
        out += b'--' + boundary_bytes + b'\r\n'
        if isinstance(name, compat_str):
            name = name.encode('utf-8')
        if isinstance(value, compat_str):
            value = value.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        part = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary_bytes in part:
            raise ValueError('Boundary overlaps with data')
        out += part

    out += b'--' + boundary_bytes + b'--\r\n'

    return out, content_type
2121
2122
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    # Retry with fresh random boundaries until one does not collide with
    # the payload; a caller-supplied boundary is never replaced.
    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None
2151
2152
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple, in *d*.

    A candidate is skipped when it is missing, None, or (with
    skip_false_values) falsy; *default* is returned when nothing matches.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for candidate in key_or_keys:
        if candidate in d:
            value = d[candidate]
            if value is not None and not (skip_false_values and not value):
                return value
    return default
2161
2162
def try_get(src, getter, expected_type=None):
    """Apply each getter to *src* in turn and return the first result that
    neither raises a lookup error nor fails the *expected_type* check."""
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for fn in getters:
        try:
            result = fn(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2174
2175
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce *string* to compat_str, decoding byte strings with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2178
2179
# MPAA-style US movie rating -> equivalent age limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2187
2188
# US TV Parental Guidelines rating -> equivalent age limit
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2197
2198
def parse_age_limit(s):
    """Interpret an int, an 'NN+'-style string, or a US rating label as an
    age limit; returns an int or None."""
    # NOTE(review): type() == int rather than isinstance — presumably to
    # reject bool (a subclass of int); confirm before changing.
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
2210
2211
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, leaving only the JSON payload."""
    jsonp_wrapper = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_wrapper.sub(r'\g<callback_data>', code)
2220
2221
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text.

    Handles single-quoted strings, /* */ and // comments, trailing commas,
    hex/octal integer keys and bare identifiers.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # Integer literals that may appear as object keys: (regex, numeric base).
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Rewrite a single token matched by the scanner regex below.
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            # Comments and trailing commas are dropped entirely.
            return ""

        if v[0] in ("'", '"'):
            # Normalize escapes inside string literals to JSON form.
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        # Hex/octal integers (optionally used as keys) become decimal.
        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        # Bare identifiers become double-quoted strings.
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2261
2262
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank -1.
        return quality_ids.index(qid) if qid in quality_ids else -1
    return q
2271
2272
# Default output filename template (used when no --output template is given)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2274
2275
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    # Truncate so the result (including the ellipses) fits in *length*.
    return s if len(s) <= length else s[:length - len(ELLIPSES)] + ELLIPSES
2284
2285
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2288
2289
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; empty or unparsable input falls back to
    the *assume_new* default."""
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
2297
2298
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zip bundle or a frozen executable.
    if hasattr(sys, 'frozen'):
        return True
    return isinstance(globals().get('__loader__'), zipimporter)
2304
2305
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2309
2310
def error_to_compat_str(err):
    """Return the message of an exception as a text string.

    On Python 2, str(err) yields bytes that must be decoded with the
    locale's preferred encoding rather than ascii.
    """
    message = str(err)
    if sys.version_info[0] < 3:
        message = message.decode(preferredencoding())
    return message
2318
2319
def mimetype2ext(mt):
    """Map a MIME type string to a file extension; unknown subtypes are
    returned as-is (lower-cased, parameters stripped)."""
    if mt is None:
        return None

    # Full-type special cases checked before the generic subtype table.
    FULL_TYPE_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    if mt in FULL_TYPE_MAP:
        return FULL_TYPE_MAP[mt]

    # Keep only the subtype, dropping any ';'-separated parameters.
    subtype = mt.rpartition('/')[2].split(';')[0].strip().lower()

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }
    return SUBTYPE_MAP.get(subtype, subtype)
2354
2355
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into 'vcodec'/'acodec' fields.

    Returns a dict with both keys, or {} when nothing could be determined.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # The first dotted component identifies the codec family.
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Neither entry was recognized.
        if len(splited_codecs) == 2:
            # NOTE(review): both values are None here; an unknown pair is
            # reported as undetermined — confirm this is intended.
            return {
                'vcodec': vcodec,
                'acodec': acodec,
            }
        elif len(splited_codecs) == 1:
            # NOTE(review): 'acodec' is vcodec, which is None at this point —
            # possibly splited_codecs[0] was intended; confirm upstream.
            return {
                'vcodec': 'none',
                'acodec': vcodec,
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
2390
2391
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a URL handle's response headers."""
    headers = url_handle.headers

    # Prefer an explicit filename in Content-Disposition.
    cd = headers.get('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    # Fall back to mapping the MIME type.
    return mimetype2ext(headers.get('Content-Type'))
2404
2405
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for the bytes *data* with *mime_type*."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2408
2409
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer limit configured, or content available for everyone.
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2418
2419
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Longer BOMs are listed before their prefixes (utf-32 before utf-16).
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, encoding in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
2438
2439
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict, falling back to the
    URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Streaming protocols identified by URL prefix.
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest-based protocols identified by file extension.
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2460
2461
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Column width = widest cell in that column.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*table)]
    # Left-align every column but the last, with one padding space.
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in table)
2468
2469
def _match_one(filter_part, dct):
    """Evaluate one filter expression (e.g. 'height >= 720', 'like_count>?100',
    '!is_live') against the dict *dct*; raises ValueError on bad syntax."""
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # Binary form: key <op>[?] value — the '?' suffix makes a missing key
    # count as a match (none_inclusive).
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None or
                m.group('strval') is not None or
                # If the original field is a string and matching comparisonvalue is
                # a number we should respect the origin of the original field
                # and process comparison value as a string (see
                # https://github.com/rg3/youtube-dl/issues/11082).
                actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            # String comparison: only equality operators are meaningful.
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape the quoting character inside quoted values.
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            # Numeric comparison; values with size suffixes ('500k',
            # '1.2MiB') are parsed as filesizes.
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # Missing key matches only when the '?' suffix was given.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Unary form: 'key' (must be present) or '!key' (must be absent).
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2538
2539
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&'-separated parts must all match.
    parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in parts)
2545
2546
def match_filter_func(filter_str):
    """Build a match_filter callable for the given filter expression; it
    returns None on match and a skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2555
2556
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None."""
    if not time_expr:
        return None

    # Plain seconds, optionally suffixed with 's'.
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock format HH:MM:SS[.f]; a ':' before the fraction is read as '.'.
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
2568
2569
def srt_subtitles_timecode(seconds):
    """Format a float number of seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2572
2573
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Older TTML namespaces are rewritten to the current ones before parsing.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # Style properties translated into SRT <font>/<b>/<i>/<u> markup.
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> resolved style property dict
    default_style = {}  # style inherited from the body/div element

    class TTMLPElementParser(object):
        # NOTE(review): these mutable defaults are class attributes shared
        # between instances; appends/pops balance out for well-formed input,
        # but state could leak on malformed XML — confirm before relying on it.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already applied by an ancestor.
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close tags opened in start(), innermost first.
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialize the node and re-feed it through the event-based parser.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; repeat until all parent styles are known.
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id')
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # Styles attached to body/div become the default for all paragraphs.
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2733
2734
def cli_option(params, command_option, param):
    """Render a params entry as [option, value] for an external CLI;
    returns [] when the entry is unset."""
    value = params.get(param)
    if value:
        value = compat_str(value)
    return [command_option, value] if value is not None else []
2740
2741
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean params entry as CLI argument(s); [] when unset."""
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        # Single 'option<sep>value' token.
        return [command_option + separator + rendered]
    return [command_option, rendered]
2750
2751
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare CLI flag when the params entry equals *expected_value*."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2755
2756
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under *param*, or *default*.

    NOTE(review): the shared mutable default list is returned as-is;
    callers must not mutate the result when they did not pass *default*.
    """
    extra_args = params.get(param)
    if extra_args is None:
        return default
    assert isinstance(extra_args, list)
    return extra_args
2763
2764
class ISO639Utils(object):
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps ISO 639-1 codes to their ISO 639-2/T equivalents.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters matter (e.g. 'en-US' -> 'eng');
        # returns None for unknown codes.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup by scanning the map; returns None when unmatched.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2965
2966
class ISO3166Utils(object):
    """Lookup of full country names from ISO 3166 two-letter country codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive on the code; returns None for unknown codes.
        return cls._country_map.get(code.upper())
3225
3226
class GeoUtils(object):
    """Helpers for picking an IPv4 address inside a given country's
    address space (used to fake a geographic location)."""

    # Major IPv4 address blocks per country
    # Keys are two-letter country codes; NOTE(review): a few entries
    # ('AP', 'EU') look like regional registry blocks rather than
    # countries - confirm against callers before relying on that.
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code):
        """Return a uniformly random IPv4 address (dotted-quad str) from the
        given country's block, or None if the country code is unknown."""
        block = cls._country_ip_map.get(code.upper())
        if not block:
            return None
        addr, preflen = block.split('/')
        # Lowest address of the block, as a 32-bit big-endian integer.
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        # Highest address: set all host bits. Assumes addr is the network
        # base address (host bits already zero), which holds for the table
        # above.
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
3479
3480
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that lets individual requests override the
    configured proxy via a 'Ytdl-request-proxy' request header."""

    def __init__(self, proxies=None):
        # Set default handlers
        # urllib only routes a scheme through this handler if a matching
        # <scheme>_open method exists, so install one for http and https.
        # '__noproxy__' is a sentinel meaning "no per-request override".
        # NOTE: type=type and meth=self.proxy_open are bound as lambda
        # defaults on purpose - without them both closures would see the
        # loop variable's final value.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (from the request header) takes precedence
        # over the proxies configured on the handler.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers wrap the socket with SOCKS
            # themselves, so just tag the request and decline to handle it.
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3504
3505
3506 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3507 # released into Public Domain
3508 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3509
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a non-negative (long) integer to a big-endian byte string.

    If optional blocksize is given and greater than zero, the result is
    front-padded with NUL bytes so its length is a multiple of blocksize.
    """
    # Emit the integer 32 bits at a time, most significant word first.
    words = []
    remaining = int(n)
    while remaining > 0:
        words.insert(0, compat_struct_pack('>I', remaining & 0xffffffff))
        remaining >>= 32
    # Drop leading NUL bytes; zero still encodes as a single NUL byte.
    result = b''.join(words).lstrip(b'\000')
    if not result:
        result = b'\000'
    # Front-pad so the length is a multiple of blocksize, if requested.
    if blocksize > 0 and len(result) % blocksize:
        result = (blocksize - len(result) % blocksize) * b'\000' + result
    return result
3538
3539
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a (long) integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Front-pad with NULs to a whole number of 32-bit words.
    remainder = len(s) % 4
    if remainder:
        s = b'\000' * (4 - remainder) + s
    result = 0
    for offset in range(0, len(s), 4):
        result = (result << 32) | compat_struct_unpack('>I', s[offset:offset + 4])[0]
    return result
3555
3556
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is interpreted little-endian, hence the reversal before
    # converting the hex representation to an integer.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
3572
3573
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    @raises ValueError if data does not fit in length with 11 bytes overhead

    The encryption block is 00 || 02 || PS || 00 || data (RFC 8017,
    EME-PKCS1-v1_5). PS must be at least 8 octets, hence the length - 11
    limit.
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding string PS must consist of *nonzero* pseudo-random octets
    # (RFC 8017, section 7.2.1); a zero octet would be mistaken for the
    # 00 separator when the block is decoded. The previous
    # randint(0, 254) could emit zeros.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
3587
3588
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n, using the given digit
    table (defaults to 0-9a-zA-Z truncated to n symbols)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Collect digits least-significant first, then reverse.
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
3605
3606
def decode_packed_codes(code):
    """Deobfuscate JavaScript packed with Dean Edwards' p.a.c.k.e.r:
    every word in the packed source is replaced by its entry from the
    base-n-keyed symbol table."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Map each base-n-encoded index to its symbol; an empty symbol means
    # the word stands for itself.
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        encoded = encode_base_n(index, base)
        symbol_table[encoded] = symbols[index] or encoded

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
3623
3624
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=VAL,KEY="quoted,val",...') into a
    dict, stripping the surrounding quotes from quoted values."""
    # Quoted values may contain commas; unquoted values end at a comma.
    attr_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    return dict(
        (key, val[1:-1] if val.startswith('"') else val)
        for key, val in re.findall(attr_re, attrib))
3632
3633
def urshift(val, n):
    """Unsigned right shift of a 32-bit value (JavaScript's '>>>'):
    negative inputs are reinterpreted as their unsigned 32-bit form."""
    if val < 0:
        val += 0x100000000
    return val >> n
3636
3637
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG image into (width, height, pixels), where pixels is a
    list of rows and each row is a flat list of byte values (3 per pixel).

    NOTE(review): stride = width * 3 implies the decoder assumes
    non-interlaced 8-bit RGB data (no alpha); the IHDR colour type and bit
    depth are not inspected - confirm inputs match before reuse.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # The 8-byte signature must be followed directly by the IHDR chunk
    # (header[0:4] is IHDR's length field, header[4:8] its type code).
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned integer of 1, 2 or 4 bytes.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Split the rest of the file into chunks. Each chunk is:
    # 4-byte length, 4-byte type, <length> bytes of data, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is the first chunk (verified above); its first 8 bytes hold the
    # image dimensions.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # All IDAT chunks concatenated form a single zlib stream.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Each decompressed scanline is one filter-type byte followed by
    # `stride` bytes of filtered pixel data (3 bytes per pixel).
    stride = width * 3
    pixels = []

    # Look up an already-decoded byte by its flat index into the image.
    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # 'left' is the corresponding byte of the previous pixel
            # (3 bytes back); 'up' is the same byte one scanline above.
            # Both default to 0 at the image edges.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Undo the per-scanline filter (PNG spec, section 9).
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                # Predict from left (a), up (b) and up-left (c): choose
                # whichever is closest to a + b - c.
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
3743
3744
def write_xattr(path, key, value):
    """Set extended attribute `key` to `value` (bytes) on the file at `path`.

    Backends are tried in order: the pyxattr/xattr Python modules, NTFS
    Alternate Data Streams on Windows, then the setfattr or xattr
    command-line tools.

    Raises XAttrUnavailableError when no usable backend exists and
    XAttrMetadataError when the chosen backend fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        # Both the 'pyxattr' and 'xattr' PyPI packages install a module
        # named 'xattr' but expose different APIs; distinguish by attribute.
        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            # ':' is the ADS separator, so it must not appear in the key.
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The CLI tools take the value as a text argument.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    # e.g. the executable vanished between the check and now
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
3827
3828
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the string components
    of a random plausible date of birth (year 1950-1995)."""
    rand = random.randint
    return {
        year_field: str(rand(1950, 1995)),
        month_field: str(rand(1, 12)),
        day_field: str(rand(1, 31)),
    }