]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
Initiate new release.
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import email.header
15 import errno
16 import functools
17 import gzip
18 import io
19 import itertools
20 import json
21 import locale
22 import math
23 import operator
24 import os
25 import platform
26 import random
27 import re
28 import socket
29 import ssl
30 import subprocess
31 import sys
32 import tempfile
33 import traceback
34 import xml.etree.ElementTree
35 import zlib
36
37 from .compat import (
38 compat_HTMLParseError,
39 compat_HTMLParser,
40 compat_basestring,
41 compat_chr,
42 compat_etree_fromstring,
43 compat_expanduser,
44 compat_html_entities,
45 compat_html_entities_html5,
46 compat_http_client,
47 compat_kwargs,
48 compat_os_name,
49 compat_parse_qs,
50 compat_shlex_quote,
51 compat_socket_create_connection,
52 compat_str,
53 compat_struct_pack,
54 compat_struct_unpack,
55 compat_urllib_error,
56 compat_urllib_parse,
57 compat_urllib_parse_urlencode,
58 compat_urllib_parse_urlparse,
59 compat_urllib_parse_unquote_plus,
60 compat_urllib_request,
61 compat_urlparse,
62 compat_xpath,
63 )
64
65 from .socks import (
66 ProxyType,
67 sockssocket,
68 )
69
70
71 def register_socks_protocols():
72 # "Register" SOCKS protocols
73 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
74 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
75 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
76 if scheme not in compat_urlparse.uses_netloc:
77 compat_urlparse.uses_netloc.append(scheme)
78
79
80 # This is not clearly defined otherwise
81 compiled_regex_type = type(re.compile(''))
82
83 std_headers = {
84 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
85 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
86 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
87 'Accept-Encoding': 'gzip, deflate',
88 'Accept-Language': 'en-us,en;q=0.5',
89 }
90
91
92 USER_AGENTS = {
93 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
94 }
95
96
97 NO_DEFAULT = object()
98
99 ENGLISH_MONTH_NAMES = [
100 'January', 'February', 'March', 'April', 'May', 'June',
101 'July', 'August', 'September', 'October', 'November', 'December']
102
103 MONTH_NAMES = {
104 'en': ENGLISH_MONTH_NAMES,
105 'fr': [
106 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
107 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
108 }
109
110 KNOWN_EXTENSIONS = (
111 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
112 'flv', 'f4v', 'f4a', 'f4b',
113 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
114 'mkv', 'mka', 'mk3d',
115 'avi', 'divx',
116 'mov',
117 'asf', 'wmv', 'wma',
118 '3gp', '3g2',
119 'mp3',
120 'flac',
121 'ape',
122 'wav',
123 'f4f', 'f4m', 'm3u8', 'smil')
124
125 # needed for sanitizing filenames in restricted mode
126 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
127 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
128 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
129
130 DATE_FORMATS = (
131 '%d %B %Y',
132 '%d %b %Y',
133 '%B %d %Y',
134 '%B %dst %Y',
135 '%B %dnd %Y',
136 '%B %dth %Y',
137 '%b %d %Y',
138 '%b %dst %Y',
139 '%b %dnd %Y',
140 '%b %dth %Y',
141 '%b %dst %Y %I:%M',
142 '%b %dnd %Y %I:%M',
143 '%b %dth %Y %I:%M',
144 '%Y %m %d',
145 '%Y-%m-%d',
146 '%Y/%m/%d',
147 '%Y/%m/%d %H:%M',
148 '%Y/%m/%d %H:%M:%S',
149 '%Y-%m-%d %H:%M',
150 '%Y-%m-%d %H:%M:%S',
151 '%Y-%m-%d %H:%M:%S.%f',
152 '%d.%m.%Y %H:%M',
153 '%d.%m.%Y %H.%M',
154 '%Y-%m-%dT%H:%M:%SZ',
155 '%Y-%m-%dT%H:%M:%S.%fZ',
156 '%Y-%m-%dT%H:%M:%S.%f0Z',
157 '%Y-%m-%dT%H:%M:%S',
158 '%Y-%m-%dT%H:%M:%S.%f',
159 '%Y-%m-%dT%H:%M',
160 '%b %d %Y at %H:%M',
161 '%b %d %Y at %H:%M:%S',
162 )
163
164 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
165 DATE_FORMATS_DAY_FIRST.extend([
166 '%d-%m-%Y',
167 '%d.%m.%Y',
168 '%d.%m.%y',
169 '%d/%m/%Y',
170 '%d/%m/%y',
171 '%d/%m/%Y %H:%M:%S',
172 ])
173
174 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
175 DATE_FORMATS_MONTH_FIRST.extend([
176 '%m-%d-%Y',
177 '%m.%d.%Y',
178 '%m/%d/%Y',
179 '%m/%d/%y',
180 '%m/%d/%Y %H:%M:%S',
181 ])
182
183 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
184
185
186 def preferredencoding():
187 """Get preferred encoding.
188
189 Returns the best encoding scheme for the system, based on
190 locale.getpreferredencoding() and some further tweaks.
191 """
192 try:
193 pref = locale.getpreferredencoding()
194 'TEST'.encode(pref)
195 except Exception:
196 pref = 'UTF-8'
197
198 return pref
199
200
201 def write_json_file(obj, fn):
202 """ Encode obj as JSON and write it to fn, atomically if possible """
203
204 fn = encodeFilename(fn)
205 if sys.version_info < (3, 0) and sys.platform != 'win32':
206 encoding = get_filesystem_encoding()
207 # os.path.basename returns a bytes object, but NamedTemporaryFile
208 # will fail if the filename contains non ascii characters unless we
209 # use a unicode object
210 path_basename = lambda f: os.path.basename(fn).decode(encoding)
211 # the same for os.path.dirname
212 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
213 else:
214 path_basename = os.path.basename
215 path_dirname = os.path.dirname
216
217 args = {
218 'suffix': '.tmp',
219 'prefix': path_basename(fn) + '.',
220 'dir': path_dirname(fn),
221 'delete': False,
222 }
223
224 # In Python 2.x, json.dump expects a bytestream.
225 # In Python 3.x, it writes to a character stream
226 if sys.version_info < (3, 0):
227 args['mode'] = 'wb'
228 else:
229 args.update({
230 'mode': 'w',
231 'encoding': 'utf-8',
232 })
233
234 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
235
236 try:
237 with tf:
238 json.dump(obj, tf)
239 if sys.platform == 'win32':
240 # Need to remove existing file on Windows, else os.rename raises
241 # WindowsError or FileExistsError.
242 try:
243 os.unlink(fn)
244 except OSError:
245 pass
246 os.rename(tf.name, fn)
247 except Exception:
248 try:
249 os.remove(tf.name)
250 except OSError:
251 pass
252 raise
253
254
255 if sys.version_info >= (2, 7):
256 def find_xpath_attr(node, xpath, key, val=None):
257 """ Find the xpath xpath[@key=val] """
258 assert re.match(r'^[a-zA-Z_-]+$', key)
259 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
260 return node.find(expr)
261 else:
262 def find_xpath_attr(node, xpath, key, val=None):
263 for f in node.findall(compat_xpath(xpath)):
264 if key not in f.attrib:
265 continue
266 if val is None or f.attrib.get(key) == val:
267 return f
268 return None
269
270 # On python2.6 the xml.etree.ElementTree.Element methods don't support
271 # the namespace parameter
272
273
274 def xpath_with_ns(path, ns_map):
275 components = [c.split(':') for c in path.split('/')]
276 replaced = []
277 for c in components:
278 if len(c) == 1:
279 replaced.append(c[0])
280 else:
281 ns, tag = c
282 replaced.append('{%s}%s' % (ns_map[ns], tag))
283 return '/'.join(replaced)
284
285
286 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
287 def _find_xpath(xpath):
288 return node.find(compat_xpath(xpath))
289
290 if isinstance(xpath, (str, compat_str)):
291 n = _find_xpath(xpath)
292 else:
293 for xp in xpath:
294 n = _find_xpath(xp)
295 if n is not None:
296 break
297
298 if n is None:
299 if default is not NO_DEFAULT:
300 return default
301 elif fatal:
302 name = xpath if name is None else name
303 raise ExtractorError('Could not find XML element %s' % name)
304 else:
305 return None
306 return n
307
308
309 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
310 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
311 if n is None or n == default:
312 return n
313 if n.text is None:
314 if default is not NO_DEFAULT:
315 return default
316 elif fatal:
317 name = xpath if name is None else name
318 raise ExtractorError('Could not find XML element\'s text %s' % name)
319 else:
320 return None
321 return n.text
322
323
324 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
325 n = find_xpath_attr(node, xpath, key)
326 if n is None:
327 if default is not NO_DEFAULT:
328 return default
329 elif fatal:
330 name = '%s[@%s]' % (xpath, key) if name is None else name
331 raise ExtractorError('Could not find XML attribute %s' % name)
332 else:
333 return None
334 return n.attrib[key]
335
336
337 def get_element_by_id(id, html):
338 """Return the content of the tag with the specified ID in the passed HTML document"""
339 return get_element_by_attribute('id', id, html)
340
341
342 def get_element_by_class(class_name, html):
343 """Return the content of the first tag with the specified class in the passed HTML document"""
344 retval = get_elements_by_class(class_name, html)
345 return retval[0] if retval else None
346
347
348 def get_element_by_attribute(attribute, value, html, escape_value=True):
349 retval = get_elements_by_attribute(attribute, value, html, escape_value)
350 return retval[0] if retval else None
351
352
353 def get_elements_by_class(class_name, html):
354 """Return the content of all tags with the specified class in the passed HTML document as a list"""
355 return get_elements_by_attribute(
356 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
357 html, escape_value=False)
358
359
360 def get_elements_by_attribute(attribute, value, html, escape_value=True):
361 """Return the content of the tag with the specified attribute in the passed HTML document"""
362
363 value = re.escape(value) if escape_value else value
364
365 retlist = []
366 for m in re.finditer(r'''(?xs)
367 <([a-zA-Z0-9:._-]+)
368 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
369 \s+%s=['"]?%s['"]?
370 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
371 \s*>
372 (?P<content>.*?)
373 </\1>
374 ''' % (re.escape(attribute), value), html):
375 res = m.group('content')
376
377 if res.startswith('"') or res.startswith("'"):
378 res = res[1:-1]
379
380 retlist.append(unescapeHTML(res))
381
382 return retlist
383
384
385 class HTMLAttributeParser(compat_HTMLParser):
386 """Trivial HTML parser to gather the attributes for a single element"""
387 def __init__(self):
388 self.attrs = {}
389 compat_HTMLParser.__init__(self)
390
391 def handle_starttag(self, tag, attrs):
392 self.attrs = dict(attrs)
393
394
395 def extract_attributes(html_element):
396 """Given a string for an HTML element such as
397 <el
398 a="foo" B="bar" c="&98;az" d=boz
399 empty= noval entity="&amp;"
400 sq='"' dq="'"
401 >
402 Decode and return a dictionary of attributes.
403 {
404 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
405 'empty': '', 'noval': None, 'entity': '&',
406 'sq': '"', 'dq': '\''
407 }.
408 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
409 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
410 """
411 parser = HTMLAttributeParser()
412 try:
413 parser.feed(html_element)
414 parser.close()
415 # Older Python may throw HTMLParseError in case of malformed HTML
416 except compat_HTMLParseError:
417 pass
418 return parser.attrs
419
420
421 def clean_html(html):
422 """Clean an HTML snippet into a readable string"""
423
424 if html is None: # Convenience for sanitizing descriptions etc.
425 return html
426
427 # Newline vs <br />
428 html = html.replace('\n', ' ')
429 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
430 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
431 # Strip html tags
432 html = re.sub('<.*?>', '', html)
433 # Replace html entities
434 html = unescapeHTML(html)
435 return html.strip()
436
437
438 def sanitize_open(filename, open_mode):
439 """Try to open the given filename, and slightly tweak it if this fails.
440
441 Attempts to open the given filename. If this fails, it tries to change
442 the filename slightly, step by step, until it's either able to open it
443 or it fails and raises a final exception, like the standard open()
444 function.
445
446 It returns the tuple (stream, definitive_file_name).
447 """
448 try:
449 if filename == '-':
450 if sys.platform == 'win32':
451 import msvcrt
452 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
453 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
454 stream = open(encodeFilename(filename), open_mode)
455 return (stream, filename)
456 except (IOError, OSError) as err:
457 if err.errno in (errno.EACCES,):
458 raise
459
460 # In case of error, try to remove win32 forbidden chars
461 alt_filename = sanitize_path(filename)
462 if alt_filename == filename:
463 raise
464 else:
465 # An exception here should be caught in the caller
466 stream = open(encodeFilename(alt_filename), open_mode)
467 return (stream, alt_filename)
468
469
470 def timeconvert(timestr):
471 """Convert RFC 2822 defined time string into system timestamp"""
472 timestamp = None
473 timetuple = email.utils.parsedate_tz(timestr)
474 if timetuple is not None:
475 timestamp = email.utils.mktime_tz(timetuple)
476 return timestamp
477
478
479 def sanitize_filename(s, restricted=False, is_id=False):
480 """Sanitizes a string so it could be used as part of a filename.
481 If restricted is set, use a stricter subset of allowed characters.
482 Set is_id if this is not an arbitrary string, but an ID that should be kept
483 if possible.
484 """
485 def replace_insane(char):
486 if restricted and char in ACCENT_CHARS:
487 return ACCENT_CHARS[char]
488 if char == '?' or ord(char) < 32 or ord(char) == 127:
489 return ''
490 elif char == '"':
491 return '' if restricted else '\''
492 elif char == ':':
493 return '_-' if restricted else ' -'
494 elif char in '\\/|*<>':
495 return '_'
496 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
497 return '_'
498 if restricted and ord(char) > 127:
499 return '_'
500 return char
501
502 # Handle timestamps
503 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
504 result = ''.join(map(replace_insane, s))
505 if not is_id:
506 while '__' in result:
507 result = result.replace('__', '_')
508 result = result.strip('_')
509 # Common case of "Foreign band name - English song title"
510 if restricted and result.startswith('-_'):
511 result = result[2:]
512 if result.startswith('-'):
513 result = '_' + result[len('-'):]
514 result = result.lstrip('.')
515 if not result:
516 result = '_'
517 return result
518
519
520 def sanitize_path(s):
521 """Sanitizes and normalizes path on Windows"""
522 if sys.platform != 'win32':
523 return s
524 drive_or_unc, _ = os.path.splitdrive(s)
525 if sys.version_info < (2, 7) and not drive_or_unc:
526 drive_or_unc, _ = os.path.splitunc(s)
527 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
528 if drive_or_unc:
529 norm_path.pop(0)
530 sanitized_path = [
531 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
532 for path_part in norm_path]
533 if drive_or_unc:
534 sanitized_path.insert(0, drive_or_unc + os.path.sep)
535 return os.path.join(*sanitized_path)
536
537
538 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
539 # unwanted failures due to missing protocol
540 def sanitize_url(url):
541 return 'http:%s' % url if url.startswith('//') else url
542
543
544 def sanitized_Request(url, *args, **kwargs):
545 return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
546
547
548 def expand_path(s):
549 """Expand shell variables and ~"""
550 return os.path.expandvars(compat_expanduser(s))
551
552
553 def orderedSet(iterable):
554 """ Remove all duplicates from the input iterable """
555 res = []
556 for el in iterable:
557 if el not in res:
558 res.append(el)
559 return res
560
561
562 def _htmlentity_transform(entity_with_semicolon):
563 """Transforms an HTML entity to a character."""
564 entity = entity_with_semicolon[:-1]
565
566 # Known non-numeric HTML entity
567 if entity in compat_html_entities.name2codepoint:
568 return compat_chr(compat_html_entities.name2codepoint[entity])
569
570 # TODO: HTML5 allows entities without a semicolon. For example,
571 # '&Eacuteric' should be decoded as 'Éric'.
572 if entity_with_semicolon in compat_html_entities_html5:
573 return compat_html_entities_html5[entity_with_semicolon]
574
575 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
576 if mobj is not None:
577 numstr = mobj.group(1)
578 if numstr.startswith('x'):
579 base = 16
580 numstr = '0%s' % numstr
581 else:
582 base = 10
583 # See https://github.com/rg3/youtube-dl/issues/7518
584 try:
585 return compat_chr(int(numstr, base))
586 except ValueError:
587 pass
588
589 # Unknown entity in name, return its literal representation
590 return '&%s;' % entity
591
592
593 def unescapeHTML(s):
594 if s is None:
595 return None
596 assert type(s) == compat_str
597
598 return re.sub(
599 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
600
601
602 def get_subprocess_encoding():
603 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
604 # For subprocess calls, encode with locale encoding
605 # Refer to http://stackoverflow.com/a/9951851/35070
606 encoding = preferredencoding()
607 else:
608 encoding = sys.getfilesystemencoding()
609 if encoding is None:
610 encoding = 'utf-8'
611 return encoding
612
613
614 def encodeFilename(s, for_subprocess=False):
615 """
616 @param s The name of the file
617 """
618
619 assert type(s) == compat_str
620
621 # Python 3 has a Unicode API
622 if sys.version_info >= (3, 0):
623 return s
624
625 # Pass '' directly to use Unicode APIs on Windows 2000 and up
626 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
627 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
628 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
629 return s
630
631 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
632 if sys.platform.startswith('java'):
633 return s
634
635 return s.encode(get_subprocess_encoding(), 'ignore')
636
637
638 def decodeFilename(b, for_subprocess=False):
639
640 if sys.version_info >= (3, 0):
641 return b
642
643 if not isinstance(b, bytes):
644 return b
645
646 return b.decode(get_subprocess_encoding(), 'ignore')
647
648
649 def encodeArgument(s):
650 if not isinstance(s, compat_str):
651 # Legacy code that uses byte strings
652 # Uncomment the following line after fixing all post processors
653 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
654 s = s.decode('ascii')
655 return encodeFilename(s, True)
656
657
658 def decodeArgument(b):
659 return decodeFilename(b, True)
660
661
662 def decodeOption(optval):
663 if optval is None:
664 return optval
665 if isinstance(optval, bytes):
666 optval = optval.decode(preferredencoding())
667
668 assert isinstance(optval, compat_str)
669 return optval
670
671
672 def formatSeconds(secs):
673 if secs > 3600:
674 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
675 elif secs > 60:
676 return '%d:%02d' % (secs // 60, secs % 60)
677 else:
678 return '%d' % secs
679
680
681 def make_HTTPS_handler(params, **kwargs):
682 opts_no_check_certificate = params.get('nocheckcertificate', False)
683 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
684 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
685 if opts_no_check_certificate:
686 context.check_hostname = False
687 context.verify_mode = ssl.CERT_NONE
688 try:
689 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
690 except TypeError:
691 # Python 2.7.8
692 # (create_default_context present but HTTPSHandler has no context=)
693 pass
694
695 if sys.version_info < (3, 2):
696 return YoutubeDLHTTPSHandler(params, **kwargs)
697 else: # Python < 3.4
698 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
699 context.verify_mode = (ssl.CERT_NONE
700 if opts_no_check_certificate
701 else ssl.CERT_REQUIRED)
702 context.set_default_verify_paths()
703 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
704
705
706 def bug_reports_message():
707 if ytdl_is_updateable():
708 update_cmd = 'type youtube-dl -U to update'
709 else:
710 update_cmd = 'see https://yt-dl.org/update on how to update'
711 msg = '; please report this issue on https://yt-dl.org/bug .'
712 msg += ' Make sure you are using the latest version; %s.' % update_cmd
713 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
714 return msg
715
716
717 class YoutubeDLError(Exception):
718 """Base exception for YoutubeDL errors."""
719 pass
720
721
722 class ExtractorError(YoutubeDLError):
723 """Error during info extraction."""
724
725 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
726 """ tb, if given, is the original traceback (so that it can be printed out).
727 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
728 """
729
730 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
731 expected = True
732 if video_id is not None:
733 msg = video_id + ': ' + msg
734 if cause:
735 msg += ' (caused by %r)' % cause
736 if not expected:
737 msg += bug_reports_message()
738 super(ExtractorError, self).__init__(msg)
739
740 self.traceback = tb
741 self.exc_info = sys.exc_info() # preserve original exception
742 self.cause = cause
743 self.video_id = video_id
744
745 def format_traceback(self):
746 if self.traceback is None:
747 return None
748 return ''.join(traceback.format_tb(self.traceback))
749
750
751 class UnsupportedError(ExtractorError):
752 def __init__(self, url):
753 super(UnsupportedError, self).__init__(
754 'Unsupported URL: %s' % url, expected=True)
755 self.url = url
756
757
758 class RegexNotFoundError(ExtractorError):
759 """Error when a regex didn't match"""
760 pass
761
762
763 class GeoRestrictedError(ExtractorError):
764 """Geographic restriction Error exception.
765
766 This exception may be thrown when a video is not available from your
767 geographic location due to geographic restrictions imposed by a website.
768 """
769 def __init__(self, msg, countries=None):
770 super(GeoRestrictedError, self).__init__(msg, expected=True)
771 self.msg = msg
772 self.countries = countries
773
774
775 class DownloadError(YoutubeDLError):
776 """Download Error exception.
777
778 This exception may be thrown by FileDownloader objects if they are not
779 configured to continue on errors. They will contain the appropriate
780 error message.
781 """
782
783 def __init__(self, msg, exc_info=None):
784 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
785 super(DownloadError, self).__init__(msg)
786 self.exc_info = exc_info
787
788
789 class SameFileError(YoutubeDLError):
790 """Same File exception.
791
792 This exception will be thrown by FileDownloader objects if they detect
793 multiple files would have to be downloaded to the same file on disk.
794 """
795 pass
796
797
798 class PostProcessingError(YoutubeDLError):
799 """Post Processing exception.
800
801 This exception may be raised by PostProcessor's .run() method to
802 indicate an error in the postprocessing task.
803 """
804
805 def __init__(self, msg):
806 super(PostProcessingError, self).__init__(msg)
807 self.msg = msg
808
809
810 class MaxDownloadsReached(YoutubeDLError):
811 """ --max-downloads limit has been reached. """
812 pass
813
814
815 class UnavailableVideoError(YoutubeDLError):
816 """Unavailable Format exception.
817
818 This exception will be thrown when a video is requested
819 in a format that is not available for that video.
820 """
821 pass
822
823
824 class ContentTooShortError(YoutubeDLError):
825 """Content Too Short exception.
826
827 This exception may be raised by FileDownloader objects when a file they
828 download is too small for what the server announced first, indicating
829 the connection was probably interrupted.
830 """
831
832 def __init__(self, downloaded, expected):
833 super(ContentTooShortError, self).__init__(
834 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
835 )
836 # Both in bytes
837 self.downloaded = downloaded
838 self.expected = expected
839
840
841 class XAttrMetadataError(YoutubeDLError):
842 def __init__(self, code=None, msg='Unknown error'):
843 super(XAttrMetadataError, self).__init__(msg)
844 self.code = code
845 self.msg = msg
846
847 # Parsing code and msg
848 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
849 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
850 self.reason = 'NO_SPACE'
851 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
852 self.reason = 'VALUE_TOO_LONG'
853 else:
854 self.reason = 'NOT_SUPPORTED'
855
856
857 class XAttrUnavailableError(YoutubeDLError):
858 pass
859
860
861 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
862 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
863 # expected HTTP responses to meet HTTP/1.0 or later (see also
864 # https://github.com/rg3/youtube-dl/issues/6727)
865 if sys.version_info < (3, 0):
866 kwargs[b'strict'] = True
867 hc = http_class(*args, **kwargs)
868 source_address = ydl_handler._params.get('source_address')
869 if source_address is not None:
870 sa = (source_address, 0)
871 if hasattr(hc, 'source_address'): # Python 2.7+
872 hc.source_address = sa
873 else: # Python 2.6
874 def _hc_connect(self, *args, **kwargs):
875 sock = compat_socket_create_connection(
876 (self.host, self.port), self.timeout, sa)
877 if is_https:
878 self.sock = ssl.wrap_socket(
879 sock, self.key_file, self.cert_file,
880 ssl_version=ssl.PROTOCOL_TLSv1)
881 else:
882 self.sock = sock
883 hc.connect = functools.partial(_hc_connect, hc)
884
885 return hc
886
887
888 def handle_youtubedl_headers(headers):
889 filtered_headers = headers
890
891 if 'Youtubedl-no-compression' in filtered_headers:
892 filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
893 del filtered_headers['Youtubedl-no-compression']
894
895 return filtered_headers
896
897
898 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
899 """Handler for HTTP requests and responses.
900
901 This class, when installed with an OpenerDirector, automatically adds
902 the standard headers to every HTTP request and handles gzipped and
903 deflated responses from web servers. If compression is to be avoided in
904 a particular request, the original request in the program code only has
905 to include the HTTP header "Youtubedl-no-compression", which will be
906 removed before making the real request.
907
908 Part of this code was copied from:
909
910 http://techknack.net/python-urllib2-handlers/
911
912 Andrew Rowls, the author of that code, agreed to release it to the
913 public domain.
914 """
915
916 def __init__(self, params, *args, **kwargs):
917 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
918 self._params = params
919
920 def http_open(self, req):
921 conn_class = compat_http_client.HTTPConnection
922
923 socks_proxy = req.headers.get('Ytdl-socks-proxy')
924 if socks_proxy:
925 conn_class = make_socks_conn_class(conn_class, socks_proxy)
926 del req.headers['Ytdl-socks-proxy']
927
928 return self.do_open(functools.partial(
929 _create_http_connection, self, conn_class, False),
930 req)
931
932 @staticmethod
933 def deflate(data):
934 try:
935 return zlib.decompress(data, -zlib.MAX_WBITS)
936 except zlib.error:
937 return zlib.decompress(data)
938
939 def http_request(self, req):
940 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
941 # always respected by websites, some tend to give out URLs with non percent-encoded
942 # non-ASCII characters (see telemb.py, ard.py [#3412])
943 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
944 # To work around aforementioned issue we will replace request's original URL with
945 # percent-encoded one
946 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
947 # the code of this workaround has been moved here from YoutubeDL.urlopen()
948 url = req.get_full_url()
949 url_escaped = escape_url(url)
950
951 # Substitute URL if any change after escaping
952 if url != url_escaped:
953 req = update_Request(req, url=url_escaped)
954
955 for h, v in std_headers.items():
956 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
957 # The dict keys are capitalized because of this bug by urllib
958 if h.capitalize() not in req.headers:
959 req.add_header(h, v)
960
961 req.headers = handle_youtubedl_headers(req.headers)
962
963 if sys.version_info < (2, 7) and '#' in req.get_full_url():
964 # Python 2.6 is brain-dead when it comes to fragments
965 req._Request__original = req._Request__original.partition('#')[0]
966 req._Request__r_type = req._Request__r_type.partition('#')[0]
967
968 return req
969
970 def http_response(self, req, resp):
971 old_resp = resp
972 # gzip
973 if resp.headers.get('Content-encoding', '') == 'gzip':
974 content = resp.read()
975 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
976 try:
977 uncompressed = io.BytesIO(gz.read())
978 except IOError as original_ioerror:
979 # There may be junk add the end of the file
980 # See http://stackoverflow.com/q/4928560/35070 for details
981 for i in range(1, 1024):
982 try:
983 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
984 uncompressed = io.BytesIO(gz.read())
985 except IOError:
986 continue
987 break
988 else:
989 raise original_ioerror
990 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
991 resp.msg = old_resp.msg
992 del resp.headers['Content-encoding']
993 # deflate
994 if resp.headers.get('Content-encoding', '') == 'deflate':
995 gz = io.BytesIO(self.deflate(resp.read()))
996 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
997 resp.msg = old_resp.msg
998 del resp.headers['Content-encoding']
999 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1000 # https://github.com/rg3/youtube-dl/issues/6457).
1001 if 300 <= resp.code < 400:
1002 location = resp.headers.get('Location')
1003 if location:
1004 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1005 if sys.version_info >= (3, 0):
1006 location = location.encode('iso-8859-1').decode('utf-8')
1007 else:
1008 location = location.decode('utf-8')
1009 location_escaped = escape_url(location)
1010 if location != location_escaped:
1011 del resp.headers['Location']
1012 if sys.version_info < (3, 0):
1013 location_escaped = location_escaped.encode('utf-8')
1014 resp.headers['Location'] = location_escaped
1015 return resp
1016
1017 https_request = http_request
1018 https_response = http_response
1019
1020
1021 def make_socks_conn_class(base_class, socks_proxy):
1022 assert issubclass(base_class, (
1023 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1024
1025 url_components = compat_urlparse.urlparse(socks_proxy)
1026 if url_components.scheme.lower() == 'socks5':
1027 socks_type = ProxyType.SOCKS5
1028 elif url_components.scheme.lower() in ('socks', 'socks4'):
1029 socks_type = ProxyType.SOCKS4
1030 elif url_components.scheme.lower() == 'socks4a':
1031 socks_type = ProxyType.SOCKS4A
1032
1033 def unquote_if_non_empty(s):
1034 if not s:
1035 return s
1036 return compat_urllib_parse_unquote_plus(s)
1037
1038 proxy_args = (
1039 socks_type,
1040 url_components.hostname, url_components.port or 1080,
1041 True, # Remote DNS
1042 unquote_if_non_empty(url_components.username),
1043 unquote_if_non_empty(url_components.password),
1044 )
1045
1046 class SocksConnection(base_class):
1047 def connect(self):
1048 self.sock = sockssocket()
1049 self.sock.setproxy(*proxy_args)
1050 if type(self.timeout) in (int, float):
1051 self.sock.settimeout(self.timeout)
1052 self.sock.connect((self.host, self.port))
1053
1054 if isinstance(self, compat_http_client.HTTPSConnection):
1055 if hasattr(self, '_context'): # Python > 2.6
1056 self.sock = self._context.wrap_socket(
1057 self.sock, server_hostname=self.host)
1058 else:
1059 self.sock = ssl.wrap_socket(self.sock)
1060
1061 return SocksConnection
1062
1063
1064 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1065 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1066 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1067 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1068 self._params = params
1069
1070 def https_open(self, req):
1071 kwargs = {}
1072 conn_class = self._https_conn_class
1073
1074 if hasattr(self, '_context'): # python > 2.6
1075 kwargs['context'] = self._context
1076 if hasattr(self, '_check_hostname'): # python 3.x
1077 kwargs['check_hostname'] = self._check_hostname
1078
1079 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1080 if socks_proxy:
1081 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1082 del req.headers['Ytdl-socks-proxy']
1083
1084 return self.do_open(functools.partial(
1085 _create_http_connection, self, conn_class, True),
1086 req, **kwargs)
1087
1088
1089 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1090 def __init__(self, cookiejar=None):
1091 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1092
1093 def http_response(self, request, response):
1094 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1095 # characters in Set-Cookie HTTP header of last response (see
1096 # https://github.com/rg3/youtube-dl/issues/6769).
1097 # In order to at least prevent crashing we will percent encode Set-Cookie
1098 # header before HTTPCookieProcessor starts processing it.
1099 # if sys.version_info < (3, 0) and response.headers:
1100 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1101 # set_cookie = response.headers.get(set_cookie_header)
1102 # if set_cookie:
1103 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1104 # if set_cookie != set_cookie_escaped:
1105 # del response.headers[set_cookie_header]
1106 # response.headers[set_cookie_header] = set_cookie_escaped
1107 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1108
1109 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1110 https_response = http_response
1111
1112
1113 def extract_timezone(date_str):
1114 m = re.search(
1115 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1116 date_str)
1117 if not m:
1118 timezone = datetime.timedelta()
1119 else:
1120 date_str = date_str[:-len(m.group('tz'))]
1121 if not m.group('sign'):
1122 timezone = datetime.timedelta()
1123 else:
1124 sign = 1 if m.group('sign') == '+' else -1
1125 timezone = datetime.timedelta(
1126 hours=sign * int(m.group('hours')),
1127 minutes=sign * int(m.group('minutes')))
1128 return timezone, date_str
1129
1130
1131 def parse_iso8601(date_str, delimiter='T', timezone=None):
1132 """ Return a UNIX timestamp from the given date """
1133
1134 if date_str is None:
1135 return None
1136
1137 date_str = re.sub(r'\.[0-9]+', '', date_str)
1138
1139 if timezone is None:
1140 timezone, date_str = extract_timezone(date_str)
1141
1142 try:
1143 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1144 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1145 return calendar.timegm(dt.timetuple())
1146 except ValueError:
1147 pass
1148
1149
1150 def date_formats(day_first=True):
1151 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1152
1153
1154 def unified_strdate(date_str, day_first=True):
1155 """Return a string with the date in the format YYYYMMDD"""
1156
1157 if date_str is None:
1158 return None
1159 upload_date = None
1160 # Replace commas
1161 date_str = date_str.replace(',', ' ')
1162 # Remove AM/PM + timezone
1163 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1164 _, date_str = extract_timezone(date_str)
1165
1166 for expression in date_formats(day_first):
1167 try:
1168 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1169 except ValueError:
1170 pass
1171 if upload_date is None:
1172 timetuple = email.utils.parsedate_tz(date_str)
1173 if timetuple:
1174 try:
1175 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1176 except ValueError:
1177 pass
1178 if upload_date is not None:
1179 return compat_str(upload_date)
1180
1181
1182 def unified_timestamp(date_str, day_first=True):
1183 if date_str is None:
1184 return None
1185
1186 date_str = re.sub(r'[,|]', '', date_str)
1187
1188 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1189 timezone, date_str = extract_timezone(date_str)
1190
1191 # Remove AM/PM + timezone
1192 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1193
1194 # Remove unrecognized timezones from ISO 8601 alike timestamps
1195 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1196 if m:
1197 date_str = date_str[:-len(m.group('tz'))]
1198
1199 for expression in date_formats(day_first):
1200 try:
1201 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1202 return calendar.timegm(dt.timetuple())
1203 except ValueError:
1204 pass
1205 timetuple = email.utils.parsedate_tz(date_str)
1206 if timetuple:
1207 return calendar.timegm(timetuple) + pm_delta * 3600
1208
1209
1210 def determine_ext(url, default_ext='unknown_video'):
1211 if url is None:
1212 return default_ext
1213 guess = url.partition('?')[0].rpartition('.')[2]
1214 if re.match(r'^[A-Za-z0-9]+$', guess):
1215 return guess
1216 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1217 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1218 return guess.rstrip('/')
1219 else:
1220 return default_ext
1221
1222
1223 def subtitles_filename(filename, sub_lang, sub_format):
1224 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
1225
1226
1227 def date_from_str(date_str):
1228 """
1229 Return a datetime object from a string in the format YYYYMMDD or
1230 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1231 today = datetime.date.today()
1232 if date_str in ('now', 'today'):
1233 return today
1234 if date_str == 'yesterday':
1235 return today - datetime.timedelta(days=1)
1236 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1237 if match is not None:
1238 sign = match.group('sign')
1239 time = int(match.group('time'))
1240 if sign == '-':
1241 time = -time
1242 unit = match.group('unit')
1243 # A bad approximation?
1244 if unit == 'month':
1245 unit = 'day'
1246 time *= 30
1247 elif unit == 'year':
1248 unit = 'day'
1249 time *= 365
1250 unit += 's'
1251 delta = datetime.timedelta(**{unit: time})
1252 return today + delta
1253 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1254
1255
1256 def hyphenate_date(date_str):
1257 """
1258 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1259 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1260 if match is not None:
1261 return '-'.join(match.groups())
1262 else:
1263 return date_str
1264
1265
1266 class DateRange(object):
1267 """Represents a time interval between two dates"""
1268
1269 def __init__(self, start=None, end=None):
1270 """start and end must be strings in the format accepted by date"""
1271 if start is not None:
1272 self.start = date_from_str(start)
1273 else:
1274 self.start = datetime.datetime.min.date()
1275 if end is not None:
1276 self.end = date_from_str(end)
1277 else:
1278 self.end = datetime.datetime.max.date()
1279 if self.start > self.end:
1280 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1281
1282 @classmethod
1283 def day(cls, day):
1284 """Returns a range that only contains the given day"""
1285 return cls(day, day)
1286
1287 def __contains__(self, date):
1288 """Check if the date is in the range"""
1289 if not isinstance(date, datetime.date):
1290 date = date_from_str(date)
1291 return self.start <= date <= self.end
1292
1293 def __str__(self):
1294 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1295
1296
1297 def platform_name():
1298 """ Returns the platform name as a compat_str """
1299 res = platform.platform()
1300 if isinstance(res, bytes):
1301 res = res.decode(preferredencoding())
1302
1303 assert isinstance(res, compat_str)
1304 return res
1305
1306
1307 def _windows_write_string(s, out):
1308 """ Returns True if the string was written using special methods,
1309 False if it has yet to be written out."""
1310 # Adapted from http://stackoverflow.com/a/3259271/35070
1311
1312 import ctypes
1313 import ctypes.wintypes
1314
1315 WIN_OUTPUT_IDS = {
1316 1: -11,
1317 2: -12,
1318 }
1319
1320 try:
1321 fileno = out.fileno()
1322 except AttributeError:
1323 # If the output stream doesn't have a fileno, it's virtual
1324 return False
1325 except io.UnsupportedOperation:
1326 # Some strange Windows pseudo files?
1327 return False
1328 if fileno not in WIN_OUTPUT_IDS:
1329 return False
1330
1331 GetStdHandle = ctypes.WINFUNCTYPE(
1332 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1333 (b'GetStdHandle', ctypes.windll.kernel32))
1334 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1335
1336 WriteConsoleW = ctypes.WINFUNCTYPE(
1337 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1338 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1339 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1340 written = ctypes.wintypes.DWORD(0)
1341
1342 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1343 FILE_TYPE_CHAR = 0x0002
1344 FILE_TYPE_REMOTE = 0x8000
1345 GetConsoleMode = ctypes.WINFUNCTYPE(
1346 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1347 ctypes.POINTER(ctypes.wintypes.DWORD))(
1348 (b'GetConsoleMode', ctypes.windll.kernel32))
1349 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1350
1351 def not_a_console(handle):
1352 if handle == INVALID_HANDLE_VALUE or handle is None:
1353 return True
1354 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1355 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1356
1357 if not_a_console(h):
1358 return False
1359
1360 def next_nonbmp_pos(s):
1361 try:
1362 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1363 except StopIteration:
1364 return len(s)
1365
1366 while s:
1367 count = min(next_nonbmp_pos(s), 1024)
1368
1369 ret = WriteConsoleW(
1370 h, s, count if count else 2, ctypes.byref(written), None)
1371 if ret == 0:
1372 raise OSError('Failed to write string')
1373 if not count: # We just wrote a non-BMP character
1374 assert written.value == 2
1375 s = s[1:]
1376 else:
1377 assert written.value > 0
1378 s = s[written.value:]
1379 return True
1380
1381
1382 def write_string(s, out=None, encoding=None):
1383 if out is None:
1384 out = sys.stderr
1385 assert type(s) == compat_str
1386
1387 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1388 if _windows_write_string(s, out):
1389 return
1390
1391 if ('b' in getattr(out, 'mode', '') or
1392 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1393 byt = s.encode(encoding or preferredencoding(), 'ignore')
1394 out.write(byt)
1395 elif hasattr(out, 'buffer'):
1396 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1397 byt = s.encode(enc, 'ignore')
1398 out.buffer.write(byt)
1399 else:
1400 out.write(s)
1401 out.flush()
1402
1403
1404 def bytes_to_intlist(bs):
1405 if not bs:
1406 return []
1407 if isinstance(bs[0], int): # Python 3
1408 return list(bs)
1409 else:
1410 return [ord(c) for c in bs]
1411
1412
1413 def intlist_to_bytes(xs):
1414 if not xs:
1415 return b''
1416 return compat_struct_pack('%dB' % len(xs), *xs)
1417
1418
1419 # Cross-platform file locking
1420 if sys.platform == 'win32':
1421 import ctypes.wintypes
1422 import msvcrt
1423
1424 class OVERLAPPED(ctypes.Structure):
1425 _fields_ = [
1426 ('Internal', ctypes.wintypes.LPVOID),
1427 ('InternalHigh', ctypes.wintypes.LPVOID),
1428 ('Offset', ctypes.wintypes.DWORD),
1429 ('OffsetHigh', ctypes.wintypes.DWORD),
1430 ('hEvent', ctypes.wintypes.HANDLE),
1431 ]
1432
1433 kernel32 = ctypes.windll.kernel32
1434 LockFileEx = kernel32.LockFileEx
1435 LockFileEx.argtypes = [
1436 ctypes.wintypes.HANDLE, # hFile
1437 ctypes.wintypes.DWORD, # dwFlags
1438 ctypes.wintypes.DWORD, # dwReserved
1439 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1440 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1441 ctypes.POINTER(OVERLAPPED) # Overlapped
1442 ]
1443 LockFileEx.restype = ctypes.wintypes.BOOL
1444 UnlockFileEx = kernel32.UnlockFileEx
1445 UnlockFileEx.argtypes = [
1446 ctypes.wintypes.HANDLE, # hFile
1447 ctypes.wintypes.DWORD, # dwReserved
1448 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1449 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1450 ctypes.POINTER(OVERLAPPED) # Overlapped
1451 ]
1452 UnlockFileEx.restype = ctypes.wintypes.BOOL
1453 whole_low = 0xffffffff
1454 whole_high = 0x7fffffff
1455
1456 def _lock_file(f, exclusive):
1457 overlapped = OVERLAPPED()
1458 overlapped.Offset = 0
1459 overlapped.OffsetHigh = 0
1460 overlapped.hEvent = 0
1461 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1462 handle = msvcrt.get_osfhandle(f.fileno())
1463 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1464 whole_low, whole_high, f._lock_file_overlapped_p):
1465 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1466
1467 def _unlock_file(f):
1468 assert f._lock_file_overlapped_p
1469 handle = msvcrt.get_osfhandle(f.fileno())
1470 if not UnlockFileEx(handle, 0,
1471 whole_low, whole_high, f._lock_file_overlapped_p):
1472 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1473
1474 else:
1475 # Some platforms, such as Jython, is missing fcntl
1476 try:
1477 import fcntl
1478
1479 def _lock_file(f, exclusive):
1480 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1481
1482 def _unlock_file(f):
1483 fcntl.flock(f, fcntl.LOCK_UN)
1484 except ImportError:
1485 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1486
1487 def _lock_file(f, exclusive):
1488 raise IOError(UNSUPPORTED_MSG)
1489
1490 def _unlock_file(f):
1491 raise IOError(UNSUPPORTED_MSG)
1492
1493
1494 class locked_file(object):
1495 def __init__(self, filename, mode, encoding=None):
1496 assert mode in ['r', 'a', 'w']
1497 self.f = io.open(filename, mode, encoding=encoding)
1498 self.mode = mode
1499
1500 def __enter__(self):
1501 exclusive = self.mode != 'r'
1502 try:
1503 _lock_file(self.f, exclusive)
1504 except IOError:
1505 self.f.close()
1506 raise
1507 return self
1508
1509 def __exit__(self, etype, value, traceback):
1510 try:
1511 _unlock_file(self.f)
1512 finally:
1513 self.f.close()
1514
1515 def __iter__(self):
1516 return iter(self.f)
1517
1518 def write(self, *args):
1519 return self.f.write(*args)
1520
1521 def read(self, *args):
1522 return self.f.read(*args)
1523
1524
1525 def get_filesystem_encoding():
1526 encoding = sys.getfilesystemencoding()
1527 return encoding if encoding is not None else 'utf-8'
1528
1529
1530 def shell_quote(args):
1531 quoted_args = []
1532 encoding = get_filesystem_encoding()
1533 for a in args:
1534 if isinstance(a, bytes):
1535 # We may get a filename encoded with 'encodeFilename'
1536 a = a.decode(encoding)
1537 quoted_args.append(compat_shlex_quote(a))
1538 return ' '.join(quoted_args)
1539
1540
1541 def smuggle_url(url, data):
1542 """ Pass additional data in a URL for internal use. """
1543
1544 url, idata = unsmuggle_url(url, {})
1545 data.update(idata)
1546 sdata = compat_urllib_parse_urlencode(
1547 {'__youtubedl_smuggle': json.dumps(data)})
1548 return url + '#' + sdata
1549
1550
1551 def unsmuggle_url(smug_url, default=None):
1552 if '#__youtubedl_smuggle' not in smug_url:
1553 return smug_url, default
1554 url, _, sdata = smug_url.rpartition('#')
1555 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1556 data = json.loads(jsond)
1557 return url, data
1558
1559
1560 def format_bytes(bytes):
1561 if bytes is None:
1562 return 'N/A'
1563 if type(bytes) is str:
1564 bytes = float(bytes)
1565 if bytes == 0.0:
1566 exponent = 0
1567 else:
1568 exponent = int(math.log(bytes, 1024.0))
1569 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1570 converted = float(bytes) / float(1024 ** exponent)
1571 return '%.2f%s' % (converted, suffix)
1572
1573
1574 def lookup_unit_table(unit_table, s):
1575 units_re = '|'.join(re.escape(u) for u in unit_table)
1576 m = re.match(
1577 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
1578 if not m:
1579 return None
1580 num_str = m.group('num').replace(',', '.')
1581 mult = unit_table[m.group('unit')]
1582 return int(float(num_str) * mult)
1583
1584
1585 def parse_filesize(s):
1586 if s is None:
1587 return None
1588
1589 # The lower-case forms are of course incorrect and unofficial,
1590 # but we support those too
1591 _UNIT_TABLE = {
1592 'B': 1,
1593 'b': 1,
1594 'bytes': 1,
1595 'KiB': 1024,
1596 'KB': 1000,
1597 'kB': 1024,
1598 'Kb': 1000,
1599 'kb': 1000,
1600 'kilobytes': 1000,
1601 'kibibytes': 1024,
1602 'MiB': 1024 ** 2,
1603 'MB': 1000 ** 2,
1604 'mB': 1024 ** 2,
1605 'Mb': 1000 ** 2,
1606 'mb': 1000 ** 2,
1607 'megabytes': 1000 ** 2,
1608 'mebibytes': 1024 ** 2,
1609 'GiB': 1024 ** 3,
1610 'GB': 1000 ** 3,
1611 'gB': 1024 ** 3,
1612 'Gb': 1000 ** 3,
1613 'gb': 1000 ** 3,
1614 'gigabytes': 1000 ** 3,
1615 'gibibytes': 1024 ** 3,
1616 'TiB': 1024 ** 4,
1617 'TB': 1000 ** 4,
1618 'tB': 1024 ** 4,
1619 'Tb': 1000 ** 4,
1620 'tb': 1000 ** 4,
1621 'terabytes': 1000 ** 4,
1622 'tebibytes': 1024 ** 4,
1623 'PiB': 1024 ** 5,
1624 'PB': 1000 ** 5,
1625 'pB': 1024 ** 5,
1626 'Pb': 1000 ** 5,
1627 'pb': 1000 ** 5,
1628 'petabytes': 1000 ** 5,
1629 'pebibytes': 1024 ** 5,
1630 'EiB': 1024 ** 6,
1631 'EB': 1000 ** 6,
1632 'eB': 1024 ** 6,
1633 'Eb': 1000 ** 6,
1634 'eb': 1000 ** 6,
1635 'exabytes': 1000 ** 6,
1636 'exbibytes': 1024 ** 6,
1637 'ZiB': 1024 ** 7,
1638 'ZB': 1000 ** 7,
1639 'zB': 1024 ** 7,
1640 'Zb': 1000 ** 7,
1641 'zb': 1000 ** 7,
1642 'zettabytes': 1000 ** 7,
1643 'zebibytes': 1024 ** 7,
1644 'YiB': 1024 ** 8,
1645 'YB': 1000 ** 8,
1646 'yB': 1024 ** 8,
1647 'Yb': 1000 ** 8,
1648 'yb': 1000 ** 8,
1649 'yottabytes': 1000 ** 8,
1650 'yobibytes': 1024 ** 8,
1651 }
1652
1653 return lookup_unit_table(_UNIT_TABLE, s)
1654
1655
1656 def parse_count(s):
1657 if s is None:
1658 return None
1659
1660 s = s.strip()
1661
1662 if re.match(r'^[\d,.]+$', s):
1663 return str_to_int(s)
1664
1665 _UNIT_TABLE = {
1666 'k': 1000,
1667 'K': 1000,
1668 'm': 1000 ** 2,
1669 'M': 1000 ** 2,
1670 'kk': 1000 ** 2,
1671 'KK': 1000 ** 2,
1672 }
1673
1674 return lookup_unit_table(_UNIT_TABLE, s)
1675
1676
1677 def month_by_name(name, lang='en'):
1678 """ Return the number of a month by (locale-independently) English name """
1679
1680 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
1681
1682 try:
1683 return month_names.index(name) + 1
1684 except ValueError:
1685 return None
1686
1687
1688 def month_by_abbreviation(abbrev):
1689 """ Return the number of a month by (locale-independently) English
1690 abbreviations """
1691
1692 try:
1693 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1694 except ValueError:
1695 return None
1696
1697
1698 def fix_xml_ampersands(xml_str):
1699 """Replace all the '&' by '&amp;' in XML"""
1700 return re.sub(
1701 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1702 '&amp;',
1703 xml_str)
1704
1705
1706 def setproctitle(title):
1707 assert isinstance(title, compat_str)
1708
1709 # ctypes in Jython is not complete
1710 # http://bugs.jython.org/issue2148
1711 if sys.platform.startswith('java'):
1712 return
1713
1714 try:
1715 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1716 except OSError:
1717 return
1718 except TypeError:
1719 # LoadLibrary in Windows Python 2.7.13 only expects
1720 # a bytestring, but since unicode_literals turns
1721 # every string into a unicode string, it fails.
1722 return
1723 title_bytes = title.encode('utf-8')
1724 buf = ctypes.create_string_buffer(len(title_bytes))
1725 buf.value = title_bytes
1726 try:
1727 libc.prctl(15, buf, 0, 0, 0)
1728 except AttributeError:
1729 return # Strange libc, just skip this
1730
1731
1732 def remove_start(s, start):
1733 return s[len(start):] if s is not None and s.startswith(start) else s
1734
1735
1736 def remove_end(s, end):
1737 return s[:-len(end)] if s is not None and s.endswith(end) else s
1738
1739
1740 def remove_quotes(s):
1741 if s is None or len(s) < 2:
1742 return s
1743 for quote in ('"', "'", ):
1744 if s[0] == quote and s[-1] == quote:
1745 return s[1:-1]
1746 return s
1747
1748
1749 def url_basename(url):
1750 path = compat_urlparse.urlparse(url).path
1751 return path.strip('/').split('/')[-1]
1752
1753
1754 def base_url(url):
1755 return re.match(r'https?://[^?#&]+/', url).group()
1756
1757
1758 def urljoin(base, path):
1759 if isinstance(path, bytes):
1760 path = path.decode('utf-8')
1761 if not isinstance(path, compat_str) or not path:
1762 return None
1763 if re.match(r'^(?:https?:)?//', path):
1764 return path
1765 if isinstance(base, bytes):
1766 base = base.decode('utf-8')
1767 if not isinstance(base, compat_str) or not re.match(
1768 r'^(?:https?:)?//', base):
1769 return None
1770 return compat_urlparse.urljoin(base, path)
1771
1772
1773 class HEADRequest(compat_urllib_request.Request):
1774 def get_method(self):
1775 return 'HEAD'
1776
1777
1778 class PUTRequest(compat_urllib_request.Request):
1779 def get_method(self):
1780 return 'PUT'
1781
1782
1783 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1784 if get_attr:
1785 if v is not None:
1786 v = getattr(v, get_attr, None)
1787 if v == '':
1788 v = None
1789 if v is None:
1790 return default
1791 try:
1792 return int(v) * invscale // scale
1793 except ValueError:
1794 return default
1795
1796
1797 def str_or_none(v, default=None):
1798 return default if v is None else compat_str(v)
1799
1800
1801 def str_to_int(int_str):
1802 """ A more relaxed version of int_or_none """
1803 if int_str is None:
1804 return None
1805 int_str = re.sub(r'[,\.\+]', '', int_str)
1806 return int(int_str)
1807
1808
1809 def float_or_none(v, scale=1, invscale=1, default=None):
1810 if v is None:
1811 return default
1812 try:
1813 return float(v) * invscale / scale
1814 except ValueError:
1815 return default
1816
1817
1818 def bool_or_none(v, default=None):
1819 return v if isinstance(v, bool) else default
1820
1821
1822 def strip_or_none(v):
1823 return None if v is None else v.strip()
1824
1825
1826 def parse_duration(s):
1827 if not isinstance(s, compat_basestring):
1828 return None
1829
1830 s = s.strip()
1831
1832 days, hours, mins, secs, ms = [None] * 5
1833 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
1834 if m:
1835 days, hours, mins, secs, ms = m.groups()
1836 else:
1837 m = re.match(
1838 r'''(?ix)(?:P?
1839 (?:
1840 [0-9]+\s*y(?:ears?)?\s*
1841 )?
1842 (?:
1843 [0-9]+\s*m(?:onths?)?\s*
1844 )?
1845 (?:
1846 [0-9]+\s*w(?:eeks?)?\s*
1847 )?
1848 (?:
1849 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1850 )?
1851 T)?
1852 (?:
1853 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1854 )?
1855 (?:
1856 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1857 )?
1858 (?:
1859 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1860 )?Z?$''', s)
1861 if m:
1862 days, hours, mins, secs, ms = m.groups()
1863 else:
1864 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
1865 if m:
1866 hours, mins = m.groups()
1867 else:
1868 return None
1869
1870 duration = 0
1871 if secs:
1872 duration += float(secs)
1873 if mins:
1874 duration += float(mins) * 60
1875 if hours:
1876 duration += float(hours) * 60 * 60
1877 if days:
1878 duration += float(days) * 24 * 60 * 60
1879 if ms:
1880 duration += float(ms)
1881 return duration
1882
1883
1884 def prepend_extension(filename, ext, expected_real_ext=None):
1885 name, real_ext = os.path.splitext(filename)
1886 return (
1887 '{0}.{1}{2}'.format(name, ext, real_ext)
1888 if not expected_real_ext or real_ext[1:] == expected_real_ext
1889 else '{0}.{1}'.format(filename, ext))
1890
1891
1892 def replace_extension(filename, ext, expected_real_ext=None):
1893 name, real_ext = os.path.splitext(filename)
1894 return '{0}.{1}'.format(
1895 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1896 ext)
1897
1898
1899 def check_executable(exe, args=[]):
1900 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1901 args can be a list of arguments for a short output (like -version) """
1902 try:
1903 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1904 except OSError:
1905 return False
1906 return exe
1907
1908
1909 def get_exe_version(exe, args=['--version'],
1910 version_re=None, unrecognized='present'):
1911 """ Returns the version of the specified executable,
1912 or False if the executable is not present """
1913 try:
1914 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1915 # SIGTTOU if youtube-dl is run in the background.
1916 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1917 out, _ = subprocess.Popen(
1918 [encodeArgument(exe)] + args,
1919 stdin=subprocess.PIPE,
1920 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1921 except OSError:
1922 return False
1923 if isinstance(out, bytes): # Python 2.x
1924 out = out.decode('ascii', 'ignore')
1925 return detect_exe_version(out, version_re, unrecognized)
1926
1927
1928 def detect_exe_version(output, version_re=None, unrecognized='present'):
1929 assert isinstance(output, compat_str)
1930 if version_re is None:
1931 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1932 m = re.search(version_re, output)
1933 if m:
1934 return m.group(1)
1935 else:
1936 return unrecognized
1937
1938
1939 class PagedList(object):
1940 def __len__(self):
1941 # This is only useful for tests
1942 return len(self.getslice())
1943
1944
1945 class OnDemandPagedList(PagedList):
1946 def __init__(self, pagefunc, pagesize, use_cache=True):
1947 self._pagefunc = pagefunc
1948 self._pagesize = pagesize
1949 self._use_cache = use_cache
1950 if use_cache:
1951 self._cache = {}
1952
1953 def getslice(self, start=0, end=None):
1954 res = []
1955 for pagenum in itertools.count(start // self._pagesize):
1956 firstid = pagenum * self._pagesize
1957 nextfirstid = pagenum * self._pagesize + self._pagesize
1958 if start >= nextfirstid:
1959 continue
1960
1961 page_results = None
1962 if self._use_cache:
1963 page_results = self._cache.get(pagenum)
1964 if page_results is None:
1965 page_results = list(self._pagefunc(pagenum))
1966 if self._use_cache:
1967 self._cache[pagenum] = page_results
1968
1969 startv = (
1970 start % self._pagesize
1971 if firstid <= start < nextfirstid
1972 else 0)
1973
1974 endv = (
1975 ((end - 1) % self._pagesize) + 1
1976 if (end is not None and firstid <= end <= nextfirstid)
1977 else None)
1978
1979 if startv != 0 or endv is not None:
1980 page_results = page_results[startv:endv]
1981 res.extend(page_results)
1982
1983 # A little optimization - if current page is not "full", ie. does
1984 # not contain page_size videos then we can assume that this page
1985 # is the last one - there are no more ids on further pages -
1986 # i.e. no need to query again.
1987 if len(page_results) + startv < self._pagesize:
1988 break
1989
1990 # If we got the whole page, but the next page is not interesting,
1991 # break out early as well
1992 if end == nextfirstid:
1993 break
1994 return res
1995
1996
1997 class InAdvancePagedList(PagedList):
1998 def __init__(self, pagefunc, pagecount, pagesize):
1999 self._pagefunc = pagefunc
2000 self._pagecount = pagecount
2001 self._pagesize = pagesize
2002
2003 def getslice(self, start=0, end=None):
2004 res = []
2005 start_page = start // self._pagesize
2006 end_page = (
2007 self._pagecount if end is None else (end // self._pagesize + 1))
2008 skip_elems = start - start_page * self._pagesize
2009 only_more = None if end is None else end - start
2010 for pagenum in range(start_page, end_page):
2011 page = list(self._pagefunc(pagenum))
2012 if skip_elems:
2013 page = page[skip_elems:]
2014 skip_elems = None
2015 if only_more is not None:
2016 if len(page) < only_more:
2017 only_more -= len(page)
2018 else:
2019 page = page[:only_more]
2020 res.extend(page)
2021 break
2022 res.extend(page)
2023 return res
2024
2025
2026 def uppercase_escape(s):
2027 unicode_escape = codecs.getdecoder('unicode_escape')
2028 return re.sub(
2029 r'\\U[0-9a-fA-F]{8}',
2030 lambda m: unicode_escape(m.group(0))[0],
2031 s)
2032
2033
2034 def lowercase_escape(s):
2035 unicode_escape = codecs.getdecoder('unicode_escape')
2036 return re.sub(
2037 r'\\u[0-9a-fA-F]{4}',
2038 lambda m: unicode_escape(m.group(0))[0],
2039 s)
2040
2041
2042 def escape_rfc3986(s):
2043 """Escape non-ASCII characters as suggested by RFC 3986"""
2044 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2045 s = s.encode('utf-8')
2046 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2047
2048
2049 def escape_url(url):
2050 """Escape URL as suggested by RFC 3986"""
2051 url_parsed = compat_urllib_parse_urlparse(url)
2052 return url_parsed._replace(
2053 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2054 path=escape_rfc3986(url_parsed.path),
2055 params=escape_rfc3986(url_parsed.params),
2056 query=escape_rfc3986(url_parsed.query),
2057 fragment=escape_rfc3986(url_parsed.fragment)
2058 ).geturl()
2059
2060
2061 def read_batch_urls(batch_fd):
2062 def fixup(url):
2063 if not isinstance(url, compat_str):
2064 url = url.decode('utf-8', 'replace')
2065 BOM_UTF8 = '\xef\xbb\xbf'
2066 if url.startswith(BOM_UTF8):
2067 url = url[len(BOM_UTF8):]
2068 url = url.strip()
2069 if url.startswith(('#', ';', ']')):
2070 return False
2071 return url
2072
2073 with contextlib.closing(batch_fd) as fd:
2074 return [url for url in map(fixup, fd) if url]
2075
2076
2077 def urlencode_postdata(*args, **kargs):
2078 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2079
2080
2081 def update_url_query(url, query):
2082 if not query:
2083 return url
2084 parsed_url = compat_urlparse.urlparse(url)
2085 qs = compat_parse_qs(parsed_url.query)
2086 qs.update(query)
2087 return compat_urlparse.urlunparse(parsed_url._replace(
2088 query=compat_urllib_parse_urlencode(qs, True)))
2089
2090
2091 def update_Request(req, url=None, data=None, headers={}, query={}):
2092 req_headers = req.headers.copy()
2093 req_headers.update(headers)
2094 req_data = data or req.data
2095 req_url = update_url_query(url or req.get_full_url(), query)
2096 req_get_method = req.get_method()
2097 if req_get_method == 'HEAD':
2098 req_type = HEADRequest
2099 elif req_get_method == 'PUT':
2100 req_type = PUTRequest
2101 else:
2102 req_type = compat_urllib_request.Request
2103 new_req = req_type(
2104 req_url, data=req_data, headers=req_headers,
2105 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2106 if hasattr(req, 'timeout'):
2107 new_req.timeout = req.timeout
2108 return new_req
2109
2110
2111 def _multipart_encode_impl(data, boundary):
2112 content_type = 'multipart/form-data; boundary=%s' % boundary
2113
2114 out = b''
2115 for k, v in data.items():
2116 out += b'--' + boundary.encode('ascii') + b'\r\n'
2117 if isinstance(k, compat_str):
2118 k = k.encode('utf-8')
2119 if isinstance(v, compat_str):
2120 v = v.encode('utf-8')
2121 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2122 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2123 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2124 if boundary.encode('ascii') in content:
2125 raise ValueError('Boundary overlaps with data')
2126 out += content
2127
2128 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2129
2130 return out, content_type
2131
2132
2133 def multipart_encode(data, boundary=None):
2134 '''
2135 Encode a dict to RFC 7578-compliant form-data
2136
2137 data:
2138 A dict where keys and values can be either Unicode or bytes-like
2139 objects.
2140 boundary:
2141 If specified a Unicode object, it's used as the boundary. Otherwise
2142 a random boundary is generated.
2143
2144 Reference: https://tools.ietf.org/html/rfc7578
2145 '''
2146 has_specified_boundary = boundary is not None
2147
2148 while True:
2149 if boundary is None:
2150 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2151
2152 try:
2153 out, content_type = _multipart_encode_impl(data, boundary)
2154 break
2155 except ValueError:
2156 if has_specified_boundary:
2157 raise
2158 boundary = None
2159
2160 return out, content_type
2161
2162
2163 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2164 if isinstance(key_or_keys, (list, tuple)):
2165 for key in key_or_keys:
2166 if key not in d or d[key] is None or skip_false_values and not d[key]:
2167 continue
2168 return d[key]
2169 return default
2170 return d.get(key_or_keys, default)
2171
2172
2173 def try_get(src, getter, expected_type=None):
2174 if not isinstance(getter, (list, tuple)):
2175 getter = [getter]
2176 for get in getter:
2177 try:
2178 v = get(src)
2179 except (AttributeError, KeyError, TypeError, IndexError):
2180 pass
2181 else:
2182 if expected_type is None or isinstance(v, expected_type):
2183 return v
2184
2185
2186 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2187 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2188
2189
2190 US_RATINGS = {
2191 'G': 0,
2192 'PG': 10,
2193 'PG-13': 13,
2194 'R': 16,
2195 'NC': 18,
2196 }
2197
2198
2199 TV_PARENTAL_GUIDELINES = {
2200 'TV-Y': 0,
2201 'TV-Y7': 7,
2202 'TV-G': 0,
2203 'TV-PG': 0,
2204 'TV-14': 14,
2205 'TV-MA': 17,
2206 }
2207
2208
2209 def parse_age_limit(s):
2210 if type(s) == int:
2211 return s if 0 <= s <= 21 else None
2212 if not isinstance(s, compat_basestring):
2213 return None
2214 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2215 if m:
2216 return int(m.group('age'))
2217 if s in US_RATINGS:
2218 return US_RATINGS[s]
2219 return TV_PARENTAL_GUIDELINES.get(s)
2220
2221
2222 def strip_jsonp(code):
2223 return re.sub(
2224 r'''(?sx)^
2225 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
2226 (?:\s*&&\s*(?P=func_name))?
2227 \s*\(\s*(?P<callback_data>.*)\);?
2228 \s*?(?://[^\n]*)*$''',
2229 r'\g<callback_data>', code)
2230
2231
2232 def js_to_json(code):
2233 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2234 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2235 INTEGER_TABLE = (
2236 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2237 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2238 )
2239
2240 def fix_kv(m):
2241 v = m.group(0)
2242 if v in ('true', 'false', 'null'):
2243 return v
2244 elif v.startswith('/*') or v.startswith('//') or v == ',':
2245 return ""
2246
2247 if v[0] in ("'", '"'):
2248 v = re.sub(r'(?s)\\.|"', lambda m: {
2249 '"': '\\"',
2250 "\\'": "'",
2251 '\\\n': '',
2252 '\\x': '\\u00',
2253 }.get(m.group(0), m.group(0)), v[1:-1])
2254
2255 for regex, base in INTEGER_TABLE:
2256 im = re.match(regex, v)
2257 if im:
2258 i = int(im.group(1), base)
2259 return '"%d":' % i if v.endswith(':') else '%d' % i
2260
2261 return '"%s"' % v
2262
2263 return re.sub(r'''(?sx)
2264 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2265 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2266 {comment}|,(?={skip}[\]}}])|
2267 [a-zA-Z_][.a-zA-Z_0-9]*|
2268 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2269 [0-9]+(?={skip}:)
2270 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2271
2272
2273 def qualities(quality_ids):
2274 """ Get a numeric quality value out of a list of possible values """
2275 def q(qid):
2276 try:
2277 return quality_ids.index(qid)
2278 except ValueError:
2279 return -1
2280 return q
2281
2282
2283 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2284
2285
2286 def limit_length(s, length):
2287 """ Add ellipses to overly long strings """
2288 if s is None:
2289 return None
2290 ELLIPSES = '...'
2291 if len(s) > length:
2292 return s[:length - len(ELLIPSES)] + ELLIPSES
2293 return s
2294
2295
2296 def version_tuple(v):
2297 return tuple(int(e) for e in re.split(r'[-.]', v))
2298
2299
2300 def is_outdated_version(version, limit, assume_new=True):
2301 if not version:
2302 return not assume_new
2303 try:
2304 return version_tuple(version) < version_tuple(limit)
2305 except ValueError:
2306 return not assume_new
2307
2308
2309 def ytdl_is_updateable():
2310 """ Returns if youtube-dl can be updated with -U """
2311 from zipimport import zipimporter
2312
2313 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
2314
2315
2316 def args_to_str(args):
2317 # Get a short string representation for a subprocess command
2318 return ' '.join(compat_shlex_quote(a) for a in args)
2319
2320
2321 def error_to_compat_str(err):
2322 err_str = str(err)
2323 # On python 2 error byte string must be decoded with proper
2324 # encoding rather than ascii
2325 if sys.version_info[0] < 3:
2326 err_str = err_str.decode(preferredencoding())
2327 return err_str
2328
2329
2330 def mimetype2ext(mt):
2331 if mt is None:
2332 return None
2333
2334 ext = {
2335 'audio/mp4': 'm4a',
2336 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2337 # it's the most popular one
2338 'audio/mpeg': 'mp3',
2339 }.get(mt)
2340 if ext is not None:
2341 return ext
2342
2343 _, _, res = mt.rpartition('/')
2344 res = res.split(';')[0].strip().lower()
2345
2346 return {
2347 '3gpp': '3gp',
2348 'smptett+xml': 'tt',
2349 'ttaf+xml': 'dfxp',
2350 'ttml+xml': 'ttml',
2351 'x-flv': 'flv',
2352 'x-mp4-fragmented': 'mp4',
2353 'x-ms-wmv': 'wmv',
2354 'mpegurl': 'm3u8',
2355 'x-mpegurl': 'm3u8',
2356 'vnd.apple.mpegurl': 'm3u8',
2357 'dash+xml': 'mpd',
2358 'f4m+xml': 'f4m',
2359 'hds+xml': 'f4m',
2360 'vnd.ms-sstr+xml': 'ism',
2361 'quicktime': 'mov',
2362 'mp2t': 'ts',
2363 }.get(res, res)
2364
2365
2366 def parse_codecs(codecs_str):
2367 # http://tools.ietf.org/html/rfc6381
2368 if not codecs_str:
2369 return {}
2370 splited_codecs = list(filter(None, map(
2371 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2372 vcodec, acodec = None, None
2373 for full_codec in splited_codecs:
2374 codec = full_codec.split('.')[0]
2375 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2376 if not vcodec:
2377 vcodec = full_codec
2378 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
2379 if not acodec:
2380 acodec = full_codec
2381 else:
2382 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
2383 if not vcodec and not acodec:
2384 if len(splited_codecs) == 2:
2385 return {
2386 'vcodec': vcodec,
2387 'acodec': acodec,
2388 }
2389 elif len(splited_codecs) == 1:
2390 return {
2391 'vcodec': 'none',
2392 'acodec': vcodec,
2393 }
2394 else:
2395 return {
2396 'vcodec': vcodec or 'none',
2397 'acodec': acodec or 'none',
2398 }
2399 return {}
2400
2401
2402 def urlhandle_detect_ext(url_handle):
2403 getheader = url_handle.headers.get
2404
2405 cd = getheader('Content-Disposition')
2406 if cd:
2407 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2408 if m:
2409 e = determine_ext(m.group('filename'), default_ext=None)
2410 if e:
2411 return e
2412
2413 return mimetype2ext(getheader('Content-Type'))
2414
2415
2416 def encode_data_uri(data, mime_type):
2417 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2418
2419
2420 def age_restricted(content_limit, age_limit):
2421 """ Returns True iff the content should be blocked """
2422
2423 if age_limit is None: # No limit set
2424 return False
2425 if content_limit is None:
2426 return False # Content available for everyone
2427 return age_limit < content_limit
2428
2429
2430 def is_html(first_bytes):
2431 """ Detect whether a file contains HTML by examining its first bytes. """
2432
2433 BOMS = [
2434 (b'\xef\xbb\xbf', 'utf-8'),
2435 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2436 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2437 (b'\xff\xfe', 'utf-16-le'),
2438 (b'\xfe\xff', 'utf-16-be'),
2439 ]
2440 for bom, enc in BOMS:
2441 if first_bytes.startswith(bom):
2442 s = first_bytes[len(bom):].decode(enc, 'replace')
2443 break
2444 else:
2445 s = first_bytes.decode('utf-8', 'replace')
2446
2447 return re.match(r'^\s*<', s)
2448
2449
2450 def determine_protocol(info_dict):
2451 protocol = info_dict.get('protocol')
2452 if protocol is not None:
2453 return protocol
2454
2455 url = info_dict['url']
2456 if url.startswith('rtmp'):
2457 return 'rtmp'
2458 elif url.startswith('mms'):
2459 return 'mms'
2460 elif url.startswith('rtsp'):
2461 return 'rtsp'
2462
2463 ext = determine_ext(url)
2464 if ext == 'm3u8':
2465 return 'm3u8'
2466 elif ext == 'f4m':
2467 return 'f4m'
2468
2469 return compat_urllib_parse_urlparse(url).scheme
2470
2471
2472 def render_table(header_row, data):
2473 """ Render a list of rows, each as a list of values """
2474 table = [header_row] + data
2475 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2476 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2477 return '\n'.join(format_str % tuple(row) for row in table)
2478
2479
2480 def _match_one(filter_part, dct):
2481 COMPARISON_OPERATORS = {
2482 '<': operator.lt,
2483 '<=': operator.le,
2484 '>': operator.gt,
2485 '>=': operator.ge,
2486 '=': operator.eq,
2487 '!=': operator.ne,
2488 }
2489 operator_rex = re.compile(r'''(?x)\s*
2490 (?P<key>[a-z_]+)
2491 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2492 (?:
2493 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2494 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
2495 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2496 )
2497 \s*$
2498 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2499 m = operator_rex.search(filter_part)
2500 if m:
2501 op = COMPARISON_OPERATORS[m.group('op')]
2502 actual_value = dct.get(m.group('key'))
2503 if (m.group('quotedstrval') is not None or
2504 m.group('strval') is not None or
2505 # If the original field is a string and matching comparisonvalue is
2506 # a number we should respect the origin of the original field
2507 # and process comparison value as a string (see
2508 # https://github.com/rg3/youtube-dl/issues/11082).
2509 actual_value is not None and m.group('intval') is not None and
2510 isinstance(actual_value, compat_str)):
2511 if m.group('op') not in ('=', '!='):
2512 raise ValueError(
2513 'Operator %s does not support string values!' % m.group('op'))
2514 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2515 quote = m.group('quote')
2516 if quote is not None:
2517 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
2518 else:
2519 try:
2520 comparison_value = int(m.group('intval'))
2521 except ValueError:
2522 comparison_value = parse_filesize(m.group('intval'))
2523 if comparison_value is None:
2524 comparison_value = parse_filesize(m.group('intval') + 'B')
2525 if comparison_value is None:
2526 raise ValueError(
2527 'Invalid integer value %r in filter part %r' % (
2528 m.group('intval'), filter_part))
2529 if actual_value is None:
2530 return m.group('none_inclusive')
2531 return op(actual_value, comparison_value)
2532
2533 UNARY_OPERATORS = {
2534 '': lambda v: v is not None,
2535 '!': lambda v: v is None,
2536 }
2537 operator_rex = re.compile(r'''(?x)\s*
2538 (?P<op>%s)\s*(?P<key>[a-z_]+)
2539 \s*$
2540 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2541 m = operator_rex.search(filter_part)
2542 if m:
2543 op = UNARY_OPERATORS[m.group('op')]
2544 actual_value = dct.get(m.group('key'))
2545 return op(actual_value)
2546
2547 raise ValueError('Invalid filter part %r' % filter_part)
2548
2549
2550 def match_str(filter_str, dct):
2551 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2552
2553 return all(
2554 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2555
2556
2557 def match_filter_func(filter_str):
2558 def _match_func(info_dict):
2559 if match_str(filter_str, info_dict):
2560 return None
2561 else:
2562 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2563 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2564 return _match_func
2565
2566
2567 def parse_dfxp_time_expr(time_expr):
2568 if not time_expr:
2569 return
2570
2571 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2572 if mobj:
2573 return float(mobj.group('time_offset'))
2574
2575 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2576 if mobj:
2577 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
2578
2579
2580 def srt_subtitles_timecode(seconds):
2581 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
2582
2583
2584 def dfxp2srt(dfxp_data):
2585 '''
2586 @param dfxp_data A bytes-like object containing DFXP data
2587 @returns A unicode object containing converted SRT data
2588 '''
2589 LEGACY_NAMESPACES = (
2590 (b'http://www.w3.org/ns/ttml', [
2591 b'http://www.w3.org/2004/11/ttaf1',
2592 b'http://www.w3.org/2006/04/ttaf1',
2593 b'http://www.w3.org/2006/10/ttaf1',
2594 ]),
2595 (b'http://www.w3.org/ns/ttml#styling', [
2596 b'http://www.w3.org/ns/ttml#style',
2597 ]),
2598 )
2599
2600 SUPPORTED_STYLING = [
2601 'color',
2602 'fontFamily',
2603 'fontSize',
2604 'fontStyle',
2605 'fontWeight',
2606 'textDecoration'
2607 ]
2608
2609 _x = functools.partial(xpath_with_ns, ns_map={
2610 'ttml': 'http://www.w3.org/ns/ttml',
2611 'tts': 'http://www.w3.org/ns/ttml#styling',
2612 })
2613
2614 styles = {}
2615 default_style = {}
2616
2617 class TTMLPElementParser(object):
2618 _out = ''
2619 _unclosed_elements = []
2620 _applied_styles = []
2621
2622 def start(self, tag, attrib):
2623 if tag in (_x('ttml:br'), 'br'):
2624 self._out += '\n'
2625 else:
2626 unclosed_elements = []
2627 style = {}
2628 element_style_id = attrib.get('style')
2629 if default_style:
2630 style.update(default_style)
2631 if element_style_id:
2632 style.update(styles.get(element_style_id, {}))
2633 for prop in SUPPORTED_STYLING:
2634 prop_val = attrib.get(_x('tts:' + prop))
2635 if prop_val:
2636 style[prop] = prop_val
2637 if style:
2638 font = ''
2639 for k, v in sorted(style.items()):
2640 if self._applied_styles and self._applied_styles[-1].get(k) == v:
2641 continue
2642 if k == 'color':
2643 font += ' color="%s"' % v
2644 elif k == 'fontSize':
2645 font += ' size="%s"' % v
2646 elif k == 'fontFamily':
2647 font += ' face="%s"' % v
2648 elif k == 'fontWeight' and v == 'bold':
2649 self._out += '<b>'
2650 unclosed_elements.append('b')
2651 elif k == 'fontStyle' and v == 'italic':
2652 self._out += '<i>'
2653 unclosed_elements.append('i')
2654 elif k == 'textDecoration' and v == 'underline':
2655 self._out += '<u>'
2656 unclosed_elements.append('u')
2657 if font:
2658 self._out += '<font' + font + '>'
2659 unclosed_elements.append('font')
2660 applied_style = {}
2661 if self._applied_styles:
2662 applied_style.update(self._applied_styles[-1])
2663 applied_style.update(style)
2664 self._applied_styles.append(applied_style)
2665 self._unclosed_elements.append(unclosed_elements)
2666
2667 def end(self, tag):
2668 if tag not in (_x('ttml:br'), 'br'):
2669 unclosed_elements = self._unclosed_elements.pop()
2670 for element in reversed(unclosed_elements):
2671 self._out += '</%s>' % element
2672 if unclosed_elements and self._applied_styles:
2673 self._applied_styles.pop()
2674
2675 def data(self, data):
2676 self._out += data
2677
2678 def close(self):
2679 return self._out.strip()
2680
2681 def parse_node(node):
2682 target = TTMLPElementParser()
2683 parser = xml.etree.ElementTree.XMLParser(target=target)
2684 parser.feed(xml.etree.ElementTree.tostring(node))
2685 return parser.close()
2686
2687 for k, v in LEGACY_NAMESPACES:
2688 for ns in v:
2689 dfxp_data = dfxp_data.replace(ns, k)
2690
2691 dfxp = compat_etree_fromstring(dfxp_data)
2692 out = []
2693 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
2694
2695 if not paras:
2696 raise ValueError('Invalid dfxp/TTML subtitle')
2697
2698 repeat = False
2699 while True:
2700 for style in dfxp.findall(_x('.//ttml:style')):
2701 style_id = style.get('id')
2702 parent_style_id = style.get('style')
2703 if parent_style_id:
2704 if parent_style_id not in styles:
2705 repeat = True
2706 continue
2707 styles[style_id] = styles[parent_style_id].copy()
2708 for prop in SUPPORTED_STYLING:
2709 prop_val = style.get(_x('tts:' + prop))
2710 if prop_val:
2711 styles.setdefault(style_id, {})[prop] = prop_val
2712 if repeat:
2713 repeat = False
2714 else:
2715 break
2716
2717 for p in ('body', 'div'):
2718 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2719 if ele is None:
2720 continue
2721 style = styles.get(ele.get('style'))
2722 if not style:
2723 continue
2724 default_style.update(style)
2725
2726 for para, index in zip(paras, itertools.count(1)):
2727 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2728 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2729 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2730 if begin_time is None:
2731 continue
2732 if not end_time:
2733 if not dur:
2734 continue
2735 end_time = begin_time + dur
2736 out.append('%d\n%s --> %s\n%s\n\n' % (
2737 index,
2738 srt_subtitles_timecode(begin_time),
2739 srt_subtitles_timecode(end_time),
2740 parse_node(para)))
2741
2742 return ''.join(out)
2743
2744
2745 def cli_option(params, command_option, param):
2746 param = params.get(param)
2747 if param:
2748 param = compat_str(param)
2749 return [command_option, param] if param is not None else []
2750
2751
2752 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2753 param = params.get(param)
2754 if param is None:
2755 return []
2756 assert isinstance(param, bool)
2757 if separator:
2758 return [command_option + separator + (true_value if param else false_value)]
2759 return [command_option, true_value if param else false_value]
2760
2761
2762 def cli_valueless_option(params, command_option, param, expected_value=True):
2763 param = params.get(param)
2764 return [command_option] if param == expected_value else []
2765
2766
2767 def cli_configuration_args(params, param, default=[]):
2768 ex_args = params.get(param)
2769 if ex_args is None:
2770 return default
2771 assert isinstance(ex_args, list)
2772 return ex_args
2773
2774
2775 class ISO639Utils(object):
2776 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2777 _lang_map = {
2778 'aa': 'aar',
2779 'ab': 'abk',
2780 'ae': 'ave',
2781 'af': 'afr',
2782 'ak': 'aka',
2783 'am': 'amh',
2784 'an': 'arg',
2785 'ar': 'ara',
2786 'as': 'asm',
2787 'av': 'ava',
2788 'ay': 'aym',
2789 'az': 'aze',
2790 'ba': 'bak',
2791 'be': 'bel',
2792 'bg': 'bul',
2793 'bh': 'bih',
2794 'bi': 'bis',
2795 'bm': 'bam',
2796 'bn': 'ben',
2797 'bo': 'bod',
2798 'br': 'bre',
2799 'bs': 'bos',
2800 'ca': 'cat',
2801 'ce': 'che',
2802 'ch': 'cha',
2803 'co': 'cos',
2804 'cr': 'cre',
2805 'cs': 'ces',
2806 'cu': 'chu',
2807 'cv': 'chv',
2808 'cy': 'cym',
2809 'da': 'dan',
2810 'de': 'deu',
2811 'dv': 'div',
2812 'dz': 'dzo',
2813 'ee': 'ewe',
2814 'el': 'ell',
2815 'en': 'eng',
2816 'eo': 'epo',
2817 'es': 'spa',
2818 'et': 'est',
2819 'eu': 'eus',
2820 'fa': 'fas',
2821 'ff': 'ful',
2822 'fi': 'fin',
2823 'fj': 'fij',
2824 'fo': 'fao',
2825 'fr': 'fra',
2826 'fy': 'fry',
2827 'ga': 'gle',
2828 'gd': 'gla',
2829 'gl': 'glg',
2830 'gn': 'grn',
2831 'gu': 'guj',
2832 'gv': 'glv',
2833 'ha': 'hau',
2834 'he': 'heb',
2835 'hi': 'hin',
2836 'ho': 'hmo',
2837 'hr': 'hrv',
2838 'ht': 'hat',
2839 'hu': 'hun',
2840 'hy': 'hye',
2841 'hz': 'her',
2842 'ia': 'ina',
2843 'id': 'ind',
2844 'ie': 'ile',
2845 'ig': 'ibo',
2846 'ii': 'iii',
2847 'ik': 'ipk',
2848 'io': 'ido',
2849 'is': 'isl',
2850 'it': 'ita',
2851 'iu': 'iku',
2852 'ja': 'jpn',
2853 'jv': 'jav',
2854 'ka': 'kat',
2855 'kg': 'kon',
2856 'ki': 'kik',
2857 'kj': 'kua',
2858 'kk': 'kaz',
2859 'kl': 'kal',
2860 'km': 'khm',
2861 'kn': 'kan',
2862 'ko': 'kor',
2863 'kr': 'kau',
2864 'ks': 'kas',
2865 'ku': 'kur',
2866 'kv': 'kom',
2867 'kw': 'cor',
2868 'ky': 'kir',
2869 'la': 'lat',
2870 'lb': 'ltz',
2871 'lg': 'lug',
2872 'li': 'lim',
2873 'ln': 'lin',
2874 'lo': 'lao',
2875 'lt': 'lit',
2876 'lu': 'lub',
2877 'lv': 'lav',
2878 'mg': 'mlg',
2879 'mh': 'mah',
2880 'mi': 'mri',
2881 'mk': 'mkd',
2882 'ml': 'mal',
2883 'mn': 'mon',
2884 'mr': 'mar',
2885 'ms': 'msa',
2886 'mt': 'mlt',
2887 'my': 'mya',
2888 'na': 'nau',
2889 'nb': 'nob',
2890 'nd': 'nde',
2891 'ne': 'nep',
2892 'ng': 'ndo',
2893 'nl': 'nld',
2894 'nn': 'nno',
2895 'no': 'nor',
2896 'nr': 'nbl',
2897 'nv': 'nav',
2898 'ny': 'nya',
2899 'oc': 'oci',
2900 'oj': 'oji',
2901 'om': 'orm',
2902 'or': 'ori',
2903 'os': 'oss',
2904 'pa': 'pan',
2905 'pi': 'pli',
2906 'pl': 'pol',
2907 'ps': 'pus',
2908 'pt': 'por',
2909 'qu': 'que',
2910 'rm': 'roh',
2911 'rn': 'run',
2912 'ro': 'ron',
2913 'ru': 'rus',
2914 'rw': 'kin',
2915 'sa': 'san',
2916 'sc': 'srd',
2917 'sd': 'snd',
2918 'se': 'sme',
2919 'sg': 'sag',
2920 'si': 'sin',
2921 'sk': 'slk',
2922 'sl': 'slv',
2923 'sm': 'smo',
2924 'sn': 'sna',
2925 'so': 'som',
2926 'sq': 'sqi',
2927 'sr': 'srp',
2928 'ss': 'ssw',
2929 'st': 'sot',
2930 'su': 'sun',
2931 'sv': 'swe',
2932 'sw': 'swa',
2933 'ta': 'tam',
2934 'te': 'tel',
2935 'tg': 'tgk',
2936 'th': 'tha',
2937 'ti': 'tir',
2938 'tk': 'tuk',
2939 'tl': 'tgl',
2940 'tn': 'tsn',
2941 'to': 'ton',
2942 'tr': 'tur',
2943 'ts': 'tso',
2944 'tt': 'tat',
2945 'tw': 'twi',
2946 'ty': 'tah',
2947 'ug': 'uig',
2948 'uk': 'ukr',
2949 'ur': 'urd',
2950 'uz': 'uzb',
2951 've': 'ven',
2952 'vi': 'vie',
2953 'vo': 'vol',
2954 'wa': 'wln',
2955 'wo': 'wol',
2956 'xh': 'xho',
2957 'yi': 'yid',
2958 'yo': 'yor',
2959 'za': 'zha',
2960 'zh': 'zho',
2961 'zu': 'zul',
2962 }
2963
2964 @classmethod
2965 def short2long(cls, code):
2966 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2967 return cls._lang_map.get(code[:2])
2968
2969 @classmethod
2970 def long2short(cls, code):
2971 """Convert language code from ISO 639-2/T to ISO 639-1"""
2972 for short_name, long_name in cls._lang_map.items():
2973 if long_name == code:
2974 return short_name
2975
2976
2977 class ISO3166Utils(object):
2978 # From http://data.okfn.org/data/core/country-list
2979 _country_map = {
2980 'AF': 'Afghanistan',
2981 'AX': 'Åland Islands',
2982 'AL': 'Albania',
2983 'DZ': 'Algeria',
2984 'AS': 'American Samoa',
2985 'AD': 'Andorra',
2986 'AO': 'Angola',
2987 'AI': 'Anguilla',
2988 'AQ': 'Antarctica',
2989 'AG': 'Antigua and Barbuda',
2990 'AR': 'Argentina',
2991 'AM': 'Armenia',
2992 'AW': 'Aruba',
2993 'AU': 'Australia',
2994 'AT': 'Austria',
2995 'AZ': 'Azerbaijan',
2996 'BS': 'Bahamas',
2997 'BH': 'Bahrain',
2998 'BD': 'Bangladesh',
2999 'BB': 'Barbados',
3000 'BY': 'Belarus',
3001 'BE': 'Belgium',
3002 'BZ': 'Belize',
3003 'BJ': 'Benin',
3004 'BM': 'Bermuda',
3005 'BT': 'Bhutan',
3006 'BO': 'Bolivia, Plurinational State of',
3007 'BQ': 'Bonaire, Sint Eustatius and Saba',
3008 'BA': 'Bosnia and Herzegovina',
3009 'BW': 'Botswana',
3010 'BV': 'Bouvet Island',
3011 'BR': 'Brazil',
3012 'IO': 'British Indian Ocean Territory',
3013 'BN': 'Brunei Darussalam',
3014 'BG': 'Bulgaria',
3015 'BF': 'Burkina Faso',
3016 'BI': 'Burundi',
3017 'KH': 'Cambodia',
3018 'CM': 'Cameroon',
3019 'CA': 'Canada',
3020 'CV': 'Cape Verde',
3021 'KY': 'Cayman Islands',
3022 'CF': 'Central African Republic',
3023 'TD': 'Chad',
3024 'CL': 'Chile',
3025 'CN': 'China',
3026 'CX': 'Christmas Island',
3027 'CC': 'Cocos (Keeling) Islands',
3028 'CO': 'Colombia',
3029 'KM': 'Comoros',
3030 'CG': 'Congo',
3031 'CD': 'Congo, the Democratic Republic of the',
3032 'CK': 'Cook Islands',
3033 'CR': 'Costa Rica',
3034 'CI': 'Côte d\'Ivoire',
3035 'HR': 'Croatia',
3036 'CU': 'Cuba',
3037 'CW': 'Curaçao',
3038 'CY': 'Cyprus',
3039 'CZ': 'Czech Republic',
3040 'DK': 'Denmark',
3041 'DJ': 'Djibouti',
3042 'DM': 'Dominica',
3043 'DO': 'Dominican Republic',
3044 'EC': 'Ecuador',
3045 'EG': 'Egypt',
3046 'SV': 'El Salvador',
3047 'GQ': 'Equatorial Guinea',
3048 'ER': 'Eritrea',
3049 'EE': 'Estonia',
3050 'ET': 'Ethiopia',
3051 'FK': 'Falkland Islands (Malvinas)',
3052 'FO': 'Faroe Islands',
3053 'FJ': 'Fiji',
3054 'FI': 'Finland',
3055 'FR': 'France',
3056 'GF': 'French Guiana',
3057 'PF': 'French Polynesia',
3058 'TF': 'French Southern Territories',
3059 'GA': 'Gabon',
3060 'GM': 'Gambia',
3061 'GE': 'Georgia',
3062 'DE': 'Germany',
3063 'GH': 'Ghana',
3064 'GI': 'Gibraltar',
3065 'GR': 'Greece',
3066 'GL': 'Greenland',
3067 'GD': 'Grenada',
3068 'GP': 'Guadeloupe',
3069 'GU': 'Guam',
3070 'GT': 'Guatemala',
3071 'GG': 'Guernsey',
3072 'GN': 'Guinea',
3073 'GW': 'Guinea-Bissau',
3074 'GY': 'Guyana',
3075 'HT': 'Haiti',
3076 'HM': 'Heard Island and McDonald Islands',
3077 'VA': 'Holy See (Vatican City State)',
3078 'HN': 'Honduras',
3079 'HK': 'Hong Kong',
3080 'HU': 'Hungary',
3081 'IS': 'Iceland',
3082 'IN': 'India',
3083 'ID': 'Indonesia',
3084 'IR': 'Iran, Islamic Republic of',
3085 'IQ': 'Iraq',
3086 'IE': 'Ireland',
3087 'IM': 'Isle of Man',
3088 'IL': 'Israel',
3089 'IT': 'Italy',
3090 'JM': 'Jamaica',
3091 'JP': 'Japan',
3092 'JE': 'Jersey',
3093 'JO': 'Jordan',
3094 'KZ': 'Kazakhstan',
3095 'KE': 'Kenya',
3096 'KI': 'Kiribati',
3097 'KP': 'Korea, Democratic People\'s Republic of',
3098 'KR': 'Korea, Republic of',
3099 'KW': 'Kuwait',
3100 'KG': 'Kyrgyzstan',
3101 'LA': 'Lao People\'s Democratic Republic',
3102 'LV': 'Latvia',
3103 'LB': 'Lebanon',
3104 'LS': 'Lesotho',
3105 'LR': 'Liberia',
3106 'LY': 'Libya',
3107 'LI': 'Liechtenstein',
3108 'LT': 'Lithuania',
3109 'LU': 'Luxembourg',
3110 'MO': 'Macao',
3111 'MK': 'Macedonia, the Former Yugoslav Republic of',
3112 'MG': 'Madagascar',
3113 'MW': 'Malawi',
3114 'MY': 'Malaysia',
3115 'MV': 'Maldives',
3116 'ML': 'Mali',
3117 'MT': 'Malta',
3118 'MH': 'Marshall Islands',
3119 'MQ': 'Martinique',
3120 'MR': 'Mauritania',
3121 'MU': 'Mauritius',
3122 'YT': 'Mayotte',
3123 'MX': 'Mexico',
3124 'FM': 'Micronesia, Federated States of',
3125 'MD': 'Moldova, Republic of',
3126 'MC': 'Monaco',
3127 'MN': 'Mongolia',
3128 'ME': 'Montenegro',
3129 'MS': 'Montserrat',
3130 'MA': 'Morocco',
3131 'MZ': 'Mozambique',
3132 'MM': 'Myanmar',
3133 'NA': 'Namibia',
3134 'NR': 'Nauru',
3135 'NP': 'Nepal',
3136 'NL': 'Netherlands',
3137 'NC': 'New Caledonia',
3138 'NZ': 'New Zealand',
3139 'NI': 'Nicaragua',
3140 'NE': 'Niger',
3141 'NG': 'Nigeria',
3142 'NU': 'Niue',
3143 'NF': 'Norfolk Island',
3144 'MP': 'Northern Mariana Islands',
3145 'NO': 'Norway',
3146 'OM': 'Oman',
3147 'PK': 'Pakistan',
3148 'PW': 'Palau',
3149 'PS': 'Palestine, State of',
3150 'PA': 'Panama',
3151 'PG': 'Papua New Guinea',
3152 'PY': 'Paraguay',
3153 'PE': 'Peru',
3154 'PH': 'Philippines',
3155 'PN': 'Pitcairn',
3156 'PL': 'Poland',
3157 'PT': 'Portugal',
3158 'PR': 'Puerto Rico',
3159 'QA': 'Qatar',
3160 'RE': 'Réunion',
3161 'RO': 'Romania',
3162 'RU': 'Russian Federation',
3163 'RW': 'Rwanda',
3164 'BL': 'Saint Barthélemy',
3165 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3166 'KN': 'Saint Kitts and Nevis',
3167 'LC': 'Saint Lucia',
3168 'MF': 'Saint Martin (French part)',
3169 'PM': 'Saint Pierre and Miquelon',
3170 'VC': 'Saint Vincent and the Grenadines',
3171 'WS': 'Samoa',
3172 'SM': 'San Marino',
3173 'ST': 'Sao Tome and Principe',
3174 'SA': 'Saudi Arabia',
3175 'SN': 'Senegal',
3176 'RS': 'Serbia',
3177 'SC': 'Seychelles',
3178 'SL': 'Sierra Leone',
3179 'SG': 'Singapore',
3180 'SX': 'Sint Maarten (Dutch part)',
3181 'SK': 'Slovakia',
3182 'SI': 'Slovenia',
3183 'SB': 'Solomon Islands',
3184 'SO': 'Somalia',
3185 'ZA': 'South Africa',
3186 'GS': 'South Georgia and the South Sandwich Islands',
3187 'SS': 'South Sudan',
3188 'ES': 'Spain',
3189 'LK': 'Sri Lanka',
3190 'SD': 'Sudan',
3191 'SR': 'Suriname',
3192 'SJ': 'Svalbard and Jan Mayen',
3193 'SZ': 'Swaziland',
3194 'SE': 'Sweden',
3195 'CH': 'Switzerland',
3196 'SY': 'Syrian Arab Republic',
3197 'TW': 'Taiwan, Province of China',
3198 'TJ': 'Tajikistan',
3199 'TZ': 'Tanzania, United Republic of',
3200 'TH': 'Thailand',
3201 'TL': 'Timor-Leste',
3202 'TG': 'Togo',
3203 'TK': 'Tokelau',
3204 'TO': 'Tonga',
3205 'TT': 'Trinidad and Tobago',
3206 'TN': 'Tunisia',
3207 'TR': 'Turkey',
3208 'TM': 'Turkmenistan',
3209 'TC': 'Turks and Caicos Islands',
3210 'TV': 'Tuvalu',
3211 'UG': 'Uganda',
3212 'UA': 'Ukraine',
3213 'AE': 'United Arab Emirates',
3214 'GB': 'United Kingdom',
3215 'US': 'United States',
3216 'UM': 'United States Minor Outlying Islands',
3217 'UY': 'Uruguay',
3218 'UZ': 'Uzbekistan',
3219 'VU': 'Vanuatu',
3220 'VE': 'Venezuela, Bolivarian Republic of',
3221 'VN': 'Viet Nam',
3222 'VG': 'Virgin Islands, British',
3223 'VI': 'Virgin Islands, U.S.',
3224 'WF': 'Wallis and Futuna',
3225 'EH': 'Western Sahara',
3226 'YE': 'Yemen',
3227 'ZM': 'Zambia',
3228 'ZW': 'Zimbabwe',
3229 }
3230
3231 @classmethod
3232 def short2full(cls, code):
3233 """Convert an ISO 3166-2 country code to the corresponding full name"""
3234 return cls._country_map.get(code.upper())
3235
3236
3237 class GeoUtils(object):
3238 # Major IPv4 address blocks per country
3239 _country_ip_map = {
3240 'AD': '85.94.160.0/19',
3241 'AE': '94.200.0.0/13',
3242 'AF': '149.54.0.0/17',
3243 'AG': '209.59.64.0/18',
3244 'AI': '204.14.248.0/21',
3245 'AL': '46.99.0.0/16',
3246 'AM': '46.70.0.0/15',
3247 'AO': '105.168.0.0/13',
3248 'AP': '159.117.192.0/21',
3249 'AR': '181.0.0.0/12',
3250 'AS': '202.70.112.0/20',
3251 'AT': '84.112.0.0/13',
3252 'AU': '1.128.0.0/11',
3253 'AW': '181.41.0.0/18',
3254 'AZ': '5.191.0.0/16',
3255 'BA': '31.176.128.0/17',
3256 'BB': '65.48.128.0/17',
3257 'BD': '114.130.0.0/16',
3258 'BE': '57.0.0.0/8',
3259 'BF': '129.45.128.0/17',
3260 'BG': '95.42.0.0/15',
3261 'BH': '37.131.0.0/17',
3262 'BI': '154.117.192.0/18',
3263 'BJ': '137.255.0.0/16',
3264 'BL': '192.131.134.0/24',
3265 'BM': '196.12.64.0/18',
3266 'BN': '156.31.0.0/16',
3267 'BO': '161.56.0.0/16',
3268 'BQ': '161.0.80.0/20',
3269 'BR': '152.240.0.0/12',
3270 'BS': '24.51.64.0/18',
3271 'BT': '119.2.96.0/19',
3272 'BW': '168.167.0.0/16',
3273 'BY': '178.120.0.0/13',
3274 'BZ': '179.42.192.0/18',
3275 'CA': '99.224.0.0/11',
3276 'CD': '41.243.0.0/16',
3277 'CF': '196.32.200.0/21',
3278 'CG': '197.214.128.0/17',
3279 'CH': '85.0.0.0/13',
3280 'CI': '154.232.0.0/14',
3281 'CK': '202.65.32.0/19',
3282 'CL': '152.172.0.0/14',
3283 'CM': '165.210.0.0/15',
3284 'CN': '36.128.0.0/10',
3285 'CO': '181.240.0.0/12',
3286 'CR': '201.192.0.0/12',
3287 'CU': '152.206.0.0/15',
3288 'CV': '165.90.96.0/19',
3289 'CW': '190.88.128.0/17',
3290 'CY': '46.198.0.0/15',
3291 'CZ': '88.100.0.0/14',
3292 'DE': '53.0.0.0/8',
3293 'DJ': '197.241.0.0/17',
3294 'DK': '87.48.0.0/12',
3295 'DM': '192.243.48.0/20',
3296 'DO': '152.166.0.0/15',
3297 'DZ': '41.96.0.0/12',
3298 'EC': '186.68.0.0/15',
3299 'EE': '90.190.0.0/15',
3300 'EG': '156.160.0.0/11',
3301 'ER': '196.200.96.0/20',
3302 'ES': '88.0.0.0/11',
3303 'ET': '196.188.0.0/14',
3304 'EU': '2.16.0.0/13',
3305 'FI': '91.152.0.0/13',
3306 'FJ': '144.120.0.0/16',
3307 'FM': '119.252.112.0/20',
3308 'FO': '88.85.32.0/19',
3309 'FR': '90.0.0.0/9',
3310 'GA': '41.158.0.0/15',
3311 'GB': '25.0.0.0/8',
3312 'GD': '74.122.88.0/21',
3313 'GE': '31.146.0.0/16',
3314 'GF': '161.22.64.0/18',
3315 'GG': '62.68.160.0/19',
3316 'GH': '45.208.0.0/14',
3317 'GI': '85.115.128.0/19',
3318 'GL': '88.83.0.0/19',
3319 'GM': '160.182.0.0/15',
3320 'GN': '197.149.192.0/18',
3321 'GP': '104.250.0.0/19',
3322 'GQ': '105.235.224.0/20',
3323 'GR': '94.64.0.0/13',
3324 'GT': '168.234.0.0/16',
3325 'GU': '168.123.0.0/16',
3326 'GW': '197.214.80.0/20',
3327 'GY': '181.41.64.0/18',
3328 'HK': '113.252.0.0/14',
3329 'HN': '181.210.0.0/16',
3330 'HR': '93.136.0.0/13',
3331 'HT': '148.102.128.0/17',
3332 'HU': '84.0.0.0/14',
3333 'ID': '39.192.0.0/10',
3334 'IE': '87.32.0.0/12',
3335 'IL': '79.176.0.0/13',
3336 'IM': '5.62.80.0/20',
3337 'IN': '117.192.0.0/10',
3338 'IO': '203.83.48.0/21',
3339 'IQ': '37.236.0.0/14',
3340 'IR': '2.176.0.0/12',
3341 'IS': '82.221.0.0/16',
3342 'IT': '79.0.0.0/10',
3343 'JE': '87.244.64.0/18',
3344 'JM': '72.27.0.0/17',
3345 'JO': '176.29.0.0/16',
3346 'JP': '126.0.0.0/8',
3347 'KE': '105.48.0.0/12',
3348 'KG': '158.181.128.0/17',
3349 'KH': '36.37.128.0/17',
3350 'KI': '103.25.140.0/22',
3351 'KM': '197.255.224.0/20',
3352 'KN': '198.32.32.0/19',
3353 'KP': '175.45.176.0/22',
3354 'KR': '175.192.0.0/10',
3355 'KW': '37.36.0.0/14',
3356 'KY': '64.96.0.0/15',
3357 'KZ': '2.72.0.0/13',
3358 'LA': '115.84.64.0/18',
3359 'LB': '178.135.0.0/16',
3360 'LC': '192.147.231.0/24',
3361 'LI': '82.117.0.0/19',
3362 'LK': '112.134.0.0/15',
3363 'LR': '41.86.0.0/19',
3364 'LS': '129.232.0.0/17',
3365 'LT': '78.56.0.0/13',
3366 'LU': '188.42.0.0/16',
3367 'LV': '46.109.0.0/16',
3368 'LY': '41.252.0.0/14',
3369 'MA': '105.128.0.0/11',
3370 'MC': '88.209.64.0/18',
3371 'MD': '37.246.0.0/16',
3372 'ME': '178.175.0.0/17',
3373 'MF': '74.112.232.0/21',
3374 'MG': '154.126.0.0/17',
3375 'MH': '117.103.88.0/21',
3376 'MK': '77.28.0.0/15',
3377 'ML': '154.118.128.0/18',
3378 'MM': '37.111.0.0/17',
3379 'MN': '49.0.128.0/17',
3380 'MO': '60.246.0.0/16',
3381 'MP': '202.88.64.0/20',
3382 'MQ': '109.203.224.0/19',
3383 'MR': '41.188.64.0/18',
3384 'MS': '208.90.112.0/22',
3385 'MT': '46.11.0.0/16',
3386 'MU': '105.16.0.0/12',
3387 'MV': '27.114.128.0/18',
3388 'MW': '105.234.0.0/16',
3389 'MX': '187.192.0.0/11',
3390 'MY': '175.136.0.0/13',
3391 'MZ': '197.218.0.0/15',
3392 'NA': '41.182.0.0/16',
3393 'NC': '101.101.0.0/18',
3394 'NE': '197.214.0.0/18',
3395 'NF': '203.17.240.0/22',
3396 'NG': '105.112.0.0/12',
3397 'NI': '186.76.0.0/15',
3398 'NL': '145.96.0.0/11',
3399 'NO': '84.208.0.0/13',
3400 'NP': '36.252.0.0/15',
3401 'NR': '203.98.224.0/19',
3402 'NU': '49.156.48.0/22',
3403 'NZ': '49.224.0.0/14',
3404 'OM': '5.36.0.0/15',
3405 'PA': '186.72.0.0/15',
3406 'PE': '186.160.0.0/14',
3407 'PF': '123.50.64.0/18',
3408 'PG': '124.240.192.0/19',
3409 'PH': '49.144.0.0/13',
3410 'PK': '39.32.0.0/11',
3411 'PL': '83.0.0.0/11',
3412 'PM': '70.36.0.0/20',
3413 'PR': '66.50.0.0/16',
3414 'PS': '188.161.0.0/16',
3415 'PT': '85.240.0.0/13',
3416 'PW': '202.124.224.0/20',
3417 'PY': '181.120.0.0/14',
3418 'QA': '37.210.0.0/15',
3419 'RE': '139.26.0.0/16',
3420 'RO': '79.112.0.0/13',
3421 'RS': '178.220.0.0/14',
3422 'RU': '5.136.0.0/13',
3423 'RW': '105.178.0.0/15',
3424 'SA': '188.48.0.0/13',
3425 'SB': '202.1.160.0/19',
3426 'SC': '154.192.0.0/11',
3427 'SD': '154.96.0.0/13',
3428 'SE': '78.64.0.0/12',
3429 'SG': '152.56.0.0/14',
3430 'SI': '188.196.0.0/14',
3431 'SK': '78.98.0.0/15',
3432 'SL': '197.215.0.0/17',
3433 'SM': '89.186.32.0/19',
3434 'SN': '41.82.0.0/15',
3435 'SO': '197.220.64.0/19',
3436 'SR': '186.179.128.0/17',
3437 'SS': '105.235.208.0/21',
3438 'ST': '197.159.160.0/19',
3439 'SV': '168.243.0.0/16',
3440 'SX': '190.102.0.0/20',
3441 'SY': '5.0.0.0/16',
3442 'SZ': '41.84.224.0/19',
3443 'TC': '65.255.48.0/20',
3444 'TD': '154.68.128.0/19',
3445 'TG': '196.168.0.0/14',
3446 'TH': '171.96.0.0/13',
3447 'TJ': '85.9.128.0/18',
3448 'TK': '27.96.24.0/21',
3449 'TL': '180.189.160.0/20',
3450 'TM': '95.85.96.0/19',
3451 'TN': '197.0.0.0/11',
3452 'TO': '175.176.144.0/21',
3453 'TR': '78.160.0.0/11',
3454 'TT': '186.44.0.0/15',
3455 'TV': '202.2.96.0/19',
3456 'TW': '120.96.0.0/11',
3457 'TZ': '156.156.0.0/14',
3458 'UA': '93.72.0.0/13',
3459 'UG': '154.224.0.0/13',
3460 'US': '3.0.0.0/8',
3461 'UY': '167.56.0.0/13',
3462 'UZ': '82.215.64.0/18',
3463 'VA': '212.77.0.0/19',
3464 'VC': '24.92.144.0/20',
3465 'VE': '186.88.0.0/13',
3466 'VG': '172.103.64.0/18',
3467 'VI': '146.226.0.0/16',
3468 'VN': '14.160.0.0/11',
3469 'VU': '202.80.32.0/20',
3470 'WF': '117.20.32.0/21',
3471 'WS': '202.4.32.0/19',
3472 'YE': '134.35.0.0/16',
3473 'YT': '41.242.116.0/22',
3474 'ZA': '41.0.0.0/11',
3475 'ZM': '165.56.0.0/13',
3476 'ZW': '41.85.192.0/19',
3477 }
3478
3479 @classmethod
3480 def random_ipv4(cls, code):
3481 block = cls._country_ip_map.get(code.upper())
3482 if not block:
3483 return None
3484 addr, preflen = block.split('/')
3485 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3486 addr_max = addr_min | (0xffffffff >> int(preflen))
3487 return compat_str(socket.inet_ntoa(
3488 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
3489
3490
3491 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
3492 def __init__(self, proxies=None):
3493 # Set default handlers
3494 for type in ('http', 'https'):
3495 setattr(self, '%s_open' % type,
3496 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
3497 meth(r, proxy, type))
3498 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
3499
3500 def proxy_open(self, req, proxy, type):
3501 req_proxy = req.headers.get('Ytdl-request-proxy')
3502 if req_proxy is not None:
3503 proxy = req_proxy
3504 del req.headers['Ytdl-request-proxy']
3505
3506 if proxy == '__noproxy__':
3507 return None # No Proxy
3508 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
3509 req.add_header('Ytdl-socks-proxy', proxy)
3510 # youtube-dl's http/https handlers do wrapping the socket with socks
3511 return None
3512 return compat_urllib_request.ProxyHandler.proxy_open(
3513 self, req, proxy, type)
3514
3515
3516 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3517 # released into Public Domain
3518 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3519
3520 def long_to_bytes(n, blocksize=0):
3521 """long_to_bytes(n:long, blocksize:int) : string
3522 Convert a long integer to a byte string.
3523
3524 If optional blocksize is given and greater than zero, pad the front of the
3525 byte string with binary zeros so that the length is a multiple of
3526 blocksize.
3527 """
3528 # after much testing, this algorithm was deemed to be the fastest
3529 s = b''
3530 n = int(n)
3531 while n > 0:
3532 s = compat_struct_pack('>I', n & 0xffffffff) + s
3533 n = n >> 32
3534 # strip off leading zeros
3535 for i in range(len(s)):
3536 if s[i] != b'\000'[0]:
3537 break
3538 else:
3539 # only happens when n == 0
3540 s = b'\000'
3541 i = 0
3542 s = s[i:]
3543 # add back some pad bytes. this could be done more efficiently w.r.t. the
3544 # de-padding being done above, but sigh...
3545 if blocksize > 0 and len(s) % blocksize:
3546 s = (blocksize - len(s) % blocksize) * b'\000' + s
3547 return s
3548
3549
3550 def bytes_to_long(s):
3551 """bytes_to_long(string) : long
3552 Convert a byte string to a long integer.
3553
3554 This is (essentially) the inverse of long_to_bytes().
3555 """
3556 acc = 0
3557 length = len(s)
3558 if length % 4:
3559 extra = (4 - length % 4)
3560 s = b'\000' * extra + s
3561 length = length + extra
3562 for i in range(0, length, 4):
3563 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
3564 return acc
3565
3566
3567 def ohdave_rsa_encrypt(data, exponent, modulus):
3568 '''
3569 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3570
3571 Input:
3572 data: data to encrypt, bytes-like object
3573 exponent, modulus: parameter e and N of RSA algorithm, both integer
3574 Output: hex string of encrypted data
3575
3576 Limitation: supports one block encryption only
3577 '''
3578
3579 payload = int(binascii.hexlify(data[::-1]), 16)
3580 encrypted = pow(payload, exponent, modulus)
3581 return '%x' % encrypted
3582
3583
3584 def pkcs1pad(data, length):
3585 """
3586 Padding input data with PKCS#1 scheme
3587
3588 @param {int[]} data input data
3589 @param {int} length target length
3590 @returns {int[]} padded data
3591 """
3592 if len(data) > length - 11:
3593 raise ValueError('Input data too long for PKCS#1 padding')
3594
3595 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
3596 return [0, 2] + pseudo_random + [0] + data
3597
3598
3599 def encode_base_n(num, n, table=None):
3600 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
3601 if not table:
3602 table = FULL_TABLE[:n]
3603
3604 if n > len(table):
3605 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3606
3607 if num == 0:
3608 return table[0]
3609
3610 ret = ''
3611 while num:
3612 ret = table[num % n] + ret
3613 num = num // n
3614 return ret
3615
3616
3617 def decode_packed_codes(code):
3618 mobj = re.search(PACKED_CODES_RE, code)
3619 obfucasted_code, base, count, symbols = mobj.groups()
3620 base = int(base)
3621 count = int(count)
3622 symbols = symbols.split('|')
3623 symbol_table = {}
3624
3625 while count:
3626 count -= 1
3627 base_n_count = encode_base_n(count, base)
3628 symbol_table[base_n_count] = symbols[count] or base_n_count
3629
3630 return re.sub(
3631 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3632 obfucasted_code)
3633
3634
3635 def parse_m3u8_attributes(attrib):
3636 info = {}
3637 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3638 if val.startswith('"'):
3639 val = val[1:-1]
3640 info[key] = val
3641 return info
3642
3643
3644 def urshift(val, n):
3645 return val >> n if val >= 0 else (val + 0x100000000) >> n
3646
3647
3648 # Based on png2str() written by @gdkchan and improved by @yokrysty
3649 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3650 def decode_png(png_data):
3651 # Reference: https://www.w3.org/TR/PNG/
3652 header = png_data[8:]
3653
3654 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3655 raise IOError('Not a valid PNG file.')
3656
3657 int_map = {1: '>B', 2: '>H', 4: '>I'}
3658 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3659
3660 chunks = []
3661
3662 while header:
3663 length = unpack_integer(header[:4])
3664 header = header[4:]
3665
3666 chunk_type = header[:4]
3667 header = header[4:]
3668
3669 chunk_data = header[:length]
3670 header = header[length:]
3671
3672 header = header[4:] # Skip CRC
3673
3674 chunks.append({
3675 'type': chunk_type,
3676 'length': length,
3677 'data': chunk_data
3678 })
3679
3680 ihdr = chunks[0]['data']
3681
3682 width = unpack_integer(ihdr[:4])
3683 height = unpack_integer(ihdr[4:8])
3684
3685 idat = b''
3686
3687 for chunk in chunks:
3688 if chunk['type'] == b'IDAT':
3689 idat += chunk['data']
3690
3691 if not idat:
3692 raise IOError('Unable to read PNG data.')
3693
3694 decompressed_data = bytearray(zlib.decompress(idat))
3695
3696 stride = width * 3
3697 pixels = []
3698
3699 def _get_pixel(idx):
3700 x = idx % stride
3701 y = idx // stride
3702 return pixels[y][x]
3703
3704 for y in range(height):
3705 basePos = y * (1 + stride)
3706 filter_type = decompressed_data[basePos]
3707
3708 current_row = []
3709
3710 pixels.append(current_row)
3711
3712 for x in range(stride):
3713 color = decompressed_data[1 + basePos + x]
3714 basex = y * stride + x
3715 left = 0
3716 up = 0
3717
3718 if x > 2:
3719 left = _get_pixel(basex - 3)
3720 if y > 0:
3721 up = _get_pixel(basex - stride)
3722
3723 if filter_type == 1: # Sub
3724 color = (color + left) & 0xff
3725 elif filter_type == 2: # Up
3726 color = (color + up) & 0xff
3727 elif filter_type == 3: # Average
3728 color = (color + ((left + up) >> 1)) & 0xff
3729 elif filter_type == 4: # Paeth
3730 a = left
3731 b = up
3732 c = 0
3733
3734 if x > 2 and y > 0:
3735 c = _get_pixel(basex - stride - 3)
3736
3737 p = a + b - c
3738
3739 pa = abs(p - a)
3740 pb = abs(p - b)
3741 pc = abs(p - c)
3742
3743 if pa <= pb and pa <= pc:
3744 color = (color + a) & 0xff
3745 elif pb <= pc:
3746 color = (color + b) & 0xff
3747 else:
3748 color = (color + c) & 0xff
3749
3750 current_row.append(color)
3751
3752 return width, height, pixels
3753
3754
3755 def write_xattr(path, key, value):
3756 # This mess below finds the best xattr tool for the job
3757 try:
3758 # try the pyxattr module...
3759 import xattr
3760
3761 if hasattr(xattr, 'set'): # pyxattr
3762 # Unicode arguments are not supported in python-pyxattr until
3763 # version 0.5.0
3764 # See https://github.com/rg3/youtube-dl/issues/5498
3765 pyxattr_required_version = '0.5.0'
3766 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3767 # TODO: fallback to CLI tools
3768 raise XAttrUnavailableError(
3769 'python-pyxattr is detected but is too old. '
3770 'youtube-dl requires %s or above while your version is %s. '
3771 'Falling back to other xattr implementations' % (
3772 pyxattr_required_version, xattr.__version__))
3773
3774 setxattr = xattr.set
3775 else: # xattr
3776 setxattr = xattr.setxattr
3777
3778 try:
3779 setxattr(path, key, value)
3780 except EnvironmentError as e:
3781 raise XAttrMetadataError(e.errno, e.strerror)
3782
3783 except ImportError:
3784 if compat_os_name == 'nt':
3785 # Write xattrs to NTFS Alternate Data Streams:
3786 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3787 assert ':' not in key
3788 assert os.path.exists(path)
3789
3790 ads_fn = path + ':' + key
3791 try:
3792 with open(ads_fn, 'wb') as f:
3793 f.write(value)
3794 except EnvironmentError as e:
3795 raise XAttrMetadataError(e.errno, e.strerror)
3796 else:
3797 user_has_setfattr = check_executable('setfattr', ['--version'])
3798 user_has_xattr = check_executable('xattr', ['-h'])
3799
3800 if user_has_setfattr or user_has_xattr:
3801
3802 value = value.decode('utf-8')
3803 if user_has_setfattr:
3804 executable = 'setfattr'
3805 opts = ['-n', key, '-v', value]
3806 elif user_has_xattr:
3807 executable = 'xattr'
3808 opts = ['-w', key, value]
3809
3810 cmd = ([encodeFilename(executable, True)] +
3811 [encodeArgument(o) for o in opts] +
3812 [encodeFilename(path, True)])
3813
3814 try:
3815 p = subprocess.Popen(
3816 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3817 except EnvironmentError as e:
3818 raise XAttrMetadataError(e.errno, e.strerror)
3819 stdout, stderr = p.communicate()
3820 stderr = stderr.decode('utf-8', 'replace')
3821 if p.returncode != 0:
3822 raise XAttrMetadataError(p.returncode, stderr)
3823
3824 else:
3825 # On Unix, and can't find pyxattr, setfattr, or xattr.
3826 if sys.platform.startswith('linux'):
3827 raise XAttrUnavailableError(
3828 "Couldn't find a tool to set the xattrs. "
3829 "Install either the python 'pyxattr' or 'xattr' "
3830 "modules, or the GNU 'attr' package "
3831 "(which contains the 'setfattr' tool).")
3832 else:
3833 raise XAttrUnavailableError(
3834 "Couldn't find a tool to set the xattrs. "
3835 "Install either the python 'xattr' module, "
3836 "or the 'xattr' binary.")
3837
3838
3839 def random_birthday(year_field, month_field, day_field):
3840 return {
3841 year_field: str(random.randint(1950, 1995)),
3842 month_field: str(random.randint(1, 12)),
3843 day_field: str(random.randint(1, 31)),
3844 }