]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
New upstream version 2019.01.16
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import email.header
15 import errno
16 import functools
17 import gzip
18 import io
19 import itertools
20 import json
21 import locale
22 import math
23 import operator
24 import os
25 import platform
26 import random
27 import re
28 import socket
29 import ssl
30 import subprocess
31 import sys
32 import tempfile
33 import traceback
34 import xml.etree.ElementTree
35 import zlib
36
37 from .compat import (
38 compat_HTMLParseError,
39 compat_HTMLParser,
40 compat_basestring,
41 compat_chr,
42 compat_cookiejar,
43 compat_ctypes_WINFUNCTYPE,
44 compat_etree_fromstring,
45 compat_expanduser,
46 compat_html_entities,
47 compat_html_entities_html5,
48 compat_http_client,
49 compat_kwargs,
50 compat_os_name,
51 compat_parse_qs,
52 compat_shlex_quote,
53 compat_str,
54 compat_struct_pack,
55 compat_struct_unpack,
56 compat_urllib_error,
57 compat_urllib_parse,
58 compat_urllib_parse_urlencode,
59 compat_urllib_parse_urlparse,
60 compat_urllib_parse_unquote_plus,
61 compat_urllib_request,
62 compat_urlparse,
63 compat_xpath,
64 )
65
66 from .socks import (
67 ProxyType,
68 sockssocket,
69 )
70
71
def register_socks_protocols():
    """Teach urlparse that the SOCKS URL schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is missing from
    urlparse.uses_netloc are not handled correctly.  Calling this function
    more than once is harmless because schemes already listed are skipped.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme in netloc_schemes:
            continue
        netloc_schemes.append(socks_scheme)
79
80
# This is not clearly defined otherwise
# (the type object of a compiled pattern, obtained by compiling an empty regex)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers.  YoutubeDLHandler.http_request() adds each of
# these to an outgoing request that does not already carry the header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name.
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel meaning "no default supplied"; lets helpers such as
# xpath_element() tell an omitted `default` apart from an explicit None.
NO_DEFAULT = object()
99
# Full English month names, January first.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Full month names keyed by language code, each list in calendar order.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# File extensions (without the leading dot) known to this module.
# NOTE(review): the consumers of this tuple are outside this excerpt.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement; ligatures and the
# sharp s expand to two letters ('AE', 'OE', 'ss', 'ae', 'oe'), which is why
# those replacements are chained in as one-element lists.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
130
# strptime-style date formats that are unambiguous about day/month order.
# NOTE(review): the code that tries these formats is outside this excerpt.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
)

# The shared formats plus numeric formats read as day-before-month.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# The shared formats plus numeric formats read as month-before-day.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
185
# Captures four groups after a closing brace: a quoted string, two integers
# and a quoted string that is followed by .split('|').
# NOTE(review): presumably the argument list of packed JavaScript — confirm
# against the callers, which are outside this excerpt.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type is application/ld+json (case
# insensitive, dot matches newlines); the body is captured as `json_ld`.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
188
189
def preferredencoding():
    """Return the encoding to use for text on this system.

    The value comes from locale.getpreferredencoding(); if that call fails
    or names a codec that cannot actually encode text, 'UTF-8' is returned
    instead.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken name raises here
        'TEST'.encode(encoding)
        return encoding
    except Exception:
        return 'UTF-8'
203
204
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn.  If anything fails, the temporary file is removed
    (best effort) and the exception is re-raised.
    """

    fn = encodeFilename(fn)
    is_python2 = sys.version_info < (3, 0)

    if is_python2 and sys.platform != 'win32':
        fs_encoding = get_filesystem_encoding()
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use unicode objects
        tmp_prefix = os.path.basename(fn).decode(fs_encoding) + '.'
        tmp_dir = os.path.dirname(fn).decode(fs_encoding)
    else:
        tmp_prefix = os.path.basename(fn) + '.'
        tmp_dir = os.path.dirname(fn)

    tmp_options = {
        'suffix': '.tmp',
        'prefix': tmp_prefix,
        'dir': tmp_dir,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if is_python2:
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_options))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind, then report the failure
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
257
258
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        Returns the first element under node matching xpath that has the
        attribute key (equal to val when val is given), or None.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] by filtering the matches by hand """
        for candidate in node.findall(compat_xpath(xpath)):
            if key in candidate.attrib and (
                    val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
273
274 # On python2.6 the xml.etree.ElementTree.Element methods don't support
275 # the namespace parameter
276
277
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI.  Steps without a prefix
    are kept unchanged.
    """
    def _expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(_expand(step) for step in path.split('/'))
288
289
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element under node matching xpath.

    xpath is either a single expression or an iterable of expressions that
    are tried in order until one matches.  When nothing matches: default is
    returned if one was supplied, otherwise ExtractorError is raised if
    fatal is set, otherwise None is returned.  name is only used in the
    error message and falls back to xpath itself.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # Start from "not found" so that an empty iterable of expressions is
    # treated as no match instead of leaving `n` unbound.
    n = None
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
311
312
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element under node matching xpath.

    A missing element is handled by xpath_element().  An element without
    text yields default if one was supplied, raises ExtractorError if fatal
    is set, and yields None otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
326
327
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    When no such element exists: default is returned if one was supplied,
    ExtractorError is raised if fatal is set, otherwise None is returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
339
340
def get_element_by_id(id, html):
    """Return the content of the first tag in html whose id attribute equals id, or None"""
    element_content = get_element_by_attribute('id', id, html)
    return element_content
344
345
def get_element_by_class(class_name, html):
    """Return the content of the first tag in html carrying the class class_name, or None"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
350
351
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag in html whose attribute matches value, or None

    value is matched literally unless escape_value is false, in which case
    it is used as a regular expression fragment.
    """
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
355
356
def get_elements_by_class(class_name, html):
    """Return, as a list, the content of every tag in html carrying the class class_name"""
    # The class attribute may list several classes, so allow arbitrary
    # non-quote text around the word we are looking for
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute(
        'class', class_pattern, html, escape_value=False)
362
363
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return, as a list, the content of every tag in html whose attribute matches value

    value is escaped and matched literally unless escape_value is false, in
    which case it is inserted into the pattern as a regular expression.
    HTML entities in each returned content string are decoded.
    """
    if escape_value:
        value = re.escape(value)

    element_re = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value)

    def _strip_quotes(content):
        # Content that starts with a quote loses its first and last character
        if content.startswith('"') or content.startswith("'"):
            return content[1:-1]
        return content

    return [
        unescapeHTML(_strip_quotes(match.group('content')))
        for match in re.finditer(element_re, html)]
387
388
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    After feeding markup, `attrs` holds the attributes of the most recently
    seen start tag as a dict (empty if no start tag was seen).
    """

    def __init__(self):
        compat_HTMLParser.__init__(self)
        # Attributes of the last start tag handled so far
        self.attrs = {}

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs; keep them as a mapping
        self.attrs = dict(attrs)
397
398
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dictionary.

    Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    the result is
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    try:
        attr_parser.feed(html_element)
        attr_parser.close()
    except compat_HTMLParseError:
        # Older Python may throw HTMLParseError in case of malformed HTML;
        # whatever was gathered before the error is still returned
        pass
    return attr_parser.attrs
423
424
def clean_html(html):
    """Turn an HTML snippet into readable plain text (None is passed through)"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines carry no meaning; only <br> and paragraph
    # boundaries become line breaks
    text = html.replace('\n', ' ')
    substitutions = (
        (r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Replace html entities
    return unescapeHTML(text).strip()
440
441
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a sanitized name if that fails.

    '-' stands for standard output (its binary buffer where available).
    If opening fails for any reason other than a permission error, the name
    is passed through sanitize_path() and, if that changed it, opened again;
    an error from this second attempt propagates to the caller.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename != '-':
            return (open(encodeFilename(filename), open_mode), filename)

        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        if hasattr(sys.stdout, 'buffer'):
            out_stream = sys.stdout.buffer
        else:
            out_stream = sys.stdout
        return (out_stream, filename)
    except (IOError, OSError) as err:
        # Renaming cannot help with a permission problem
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        fallback_name = sanitize_path(filename)
        if fallback_name == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(fallback_name), open_mode), fallback_name)
472
473
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
481
482
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def _sanitize_char(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    cleaned = ''.join(_sanitize_char(char) for char in s)
    if is_id:
        # IDs are kept as close to the input as possible
        return cleaned

    # Collapse runs of underscores and trim them from both ends
    cleaned = re.sub(r'__+', '_', cleaned).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    if cleaned.startswith('-'):
        cleaned = '_' + cleaned[len('-'):]
    cleaned = cleaned.lstrip('.')
    return cleaned if cleaned else '_'
522
523
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform s is returned unchanged.  On Windows each path
    component other than '.' and '..' has its forbidden characters, and a
    trailing space or dot, replaced by '#'; the drive or UNC prefix is kept.
    """
    if sys.platform != 'win32':
        return s

    drive_or_unc = os.path.splitdrive(s)[0]
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc = os.path.splitunc(s)[0]

    components = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component left in front of the leading separator
        components.pop(0)

    sanitized = []
    if drive_or_unc:
        sanitized.append(drive_or_unc + os.path.sep)
    for component in components:
        if component in ('.', '..'):
            sanitized.append(component)
        else:
            sanitized.append(
                re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', component))
    return os.path.join(*sanitized)
540
541
def sanitize_url(url):
    """Repair a few well-known defects in url and return the result.

    Protocol-relative URLs get an `http:` scheme and some scheme typos seen
    in the wild are corrected; any other URL is returned unchanged.
    """
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url

    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/rg3/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        # Every pattern is anchored at the start, so a non-zero count means
        # exactly that the URL began with the typo
        fixed_url, replacements = re.subn(mistake, fixup, url)
        if replacements:
            return fixed_url
    return url
558
559
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url()

    All further arguments are handed to the Request constructor unchanged.
    """
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
562
563
def expand_path(s):
    """Expand ~ first and then shell variables in s"""
    with_home = compat_expanduser(s)
    return os.path.expandvars(with_home)
567
568
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, in first-seen order

    Items only need to support equality comparison, not hashing.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
576
577
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    entity_with_semicolon is the entity without the leading '&' but with
    the trailing ';' (for example 'amp;' or '#x41;').  Entities that cannot
    be decoded are returned in their literal '&...;' form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        # Out-of-range code points raise ValueError; numbers too large for
        # a C integer raise OverflowError instead.  Either way the entity
        # is not decodable and falls through to the literal form.
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
607
608
def unescapeHTML(s):
    """Decode the HTML entities in s (None is passed through)"""
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode_entity(match):
        # Group 1 is the entity name including its trailing semicolon
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^&;]+;)', _decode_entity, s)
616
617
def get_subprocess_encoding():
    """Return the encoding to use for arguments passed to subprocesses

    Falls back to 'utf-8' when no encoding can be determined.
    """
    on_windows_2000_or_later = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if on_windows_2000_or_later:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
628
629
def encodeFilename(s, for_subprocess=False):
    """Return s in the form expected by file system or subprocess APIs.

    @param s The name of the file
    @param for_subprocess Set if the result is going to be passed to a
                          subprocess rather than to a file system call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    unicode_api = sys.version_info >= (3, 0)

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not unicode_api and not for_subprocess and sys.platform == 'win32':
        unicode_api = sys.getwindowsversion()[0] >= 5

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if unicode_api or sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
652
653
def decodeFilename(b, for_subprocess=False):
    """Return b as text.

    On Python 3, and for anything that is not a byte string, b is returned
    unchanged; otherwise it is decoded with the subprocess encoding and
    undecodable bytes are dropped.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
663
664
def encodeArgument(s):
    """Encode the command line argument s for use in a subprocess call"""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy code that uses byte strings
    # Enable the following check after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
672
673
def decodeArgument(b):
    """Decode the subprocess argument b; the counterpart of encodeArgument()"""
    return decodeFilename(b, for_subprocess=True)
676
677
def decodeOption(optval):
    """Return the option value optval as text (None is passed through)

    Byte strings are decoded with the preferred encoding.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
686
687
def formatSeconds(secs):
    """Format a duration given in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used.  The thresholds are inclusive so
    that exactly one hour is rendered as '1:00:00' (not '60:00') and
    exactly one minute as '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
695
696
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler configured for the running Python.

    Certificate verification is switched off when params has a true
    'nocheckcertificate' entry.  Extra keyword arguments are passed on to
    the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        default_ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            default_ctx.check_hostname = False
            default_ctx.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=default_ctx, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    legacy_ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        legacy_ctx.verify_mode = ssl.CERT_NONE
    else:
        legacy_ctx.verify_mode = ssl.CERT_REQUIRED
    legacy_ctx.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=legacy_ctx, **kwargs)
720
721
def bug_reports_message():
    """Return the text appended to error messages that asks for a bug report

    The update hint depends on whether this installation can update itself.
    """
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'

    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
731
732
class YoutubeDLError(Exception):
    """Common base class of the exceptions defined by youtube-dl."""
736
737
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ Build the final message and record the error context.

        tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is
        prepended to it.
        """

        # Network failures and unavailable videos are never youtube-dl bugs
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none"""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
765
766
class UnsupportedError(ExtractorError):
    """Raised for a URL that is not supported; the URL is kept in `url`."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
772
773
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
777
778
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    The plain message is kept in `msg` and the optional `countries`
    argument is stored as given.
    """

    def __init__(self, msg, countries=None):
        # Always an expected error, so no bug report hint is appended
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.countries = countries
        self.msg = msg
789
790
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ Store msg as the error message.

        exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info()).
        """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
803
804
class SameFileError(YoutubeDLError):
    """Same File exception.

    Raised by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
812
813
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.  The message is also
    available as `msg`.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        # Keep the message directly accessible as an attribute
        self.msg = msg
824
825
class MaxDownloadsReached(YoutubeDLError):
    """ Raised when the --max-downloads limit has been reached. """
829
830
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not available
    for that video.
    """
838
839
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        super(ContentTooShortError, self).__init__(message)
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
855
856
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended attribute metadata fails.

    `code` is the errno value, if known, and `msg` the error text.  Both are
    used to classify the failure in `reason` as one of 'NO_SPACE',
    'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # ("Disk quota exceeded" is the standard message text for EDQUOT)
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
871
872
class XAttrUnavailableError(YoutubeDLError):
    """Error type for unavailable extended attribute support.

    NOTE(review): meaning inferred from the name; the code raising it is
    outside this excerpt.
    """
875
876
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and, if configured, bind it to a source address.

    ydl_handler supplies the options through its _params mapping; only the
    'source_address' entry is read here.  http_class is called with the
    remaining positional and keyword arguments.  is_https is only consulted
    on the Python 2.6 code path, where the socket has to be wrapped in TLS
    by hand.  Returns the connection object.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NB: this parameter shadows the outer source_address; here it is
            # indexed as a sequence whose first item is the IP address
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source IP selects IPv4, anything else IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            # Keep only remote addresses of the same family as the source
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn; the first successful connect wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    # Remember the failure and release the half-open socket
                    err = _
                    if sock is not None:
                        sock.close()
            # Every candidate failed: re-raise the last error, if any
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        # (address, port) pair; port 0 is passed to bind()
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: replace connect() with a version
            # that uses the filtering helper above
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
940
941
def handle_youtubedl_headers(headers):
    """Apply youtube-dl's internal pseudo headers to headers.

    If the marker header 'Youtubedl-no-compression' is present, a new dict
    is returned without the marker and without any Accept-Encoding header
    (matched case-insensitively).  Otherwise headers itself is returned.
    The mapping passed in is never modified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    return dict(
        (name, content) for name, content in headers.items()
        if name != marker and name.lower() != 'accept-encoding')
950
951
952 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
953 """Handler for HTTP requests and responses.
954
955 This class, when installed with an OpenerDirector, automatically adds
956 the standard headers to every HTTP request and handles gzipped and
957 deflated responses from web servers. If compression is to be avoided in
958 a particular request, the original request in the program code only has
959 to include the HTTP header "Youtubedl-no-compression", which will be
960 removed before making the real request.
961
962 Part of this code was copied from:
963
964 http://techknack.net/python-urllib2-handlers/
965
966 Andrew Rowls, the author of that code, agreed to release it to the
967 public domain.
968 """
969
970 def __init__(self, params, *args, **kwargs):
971 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
972 self._params = params
973
974 def http_open(self, req):
975 conn_class = compat_http_client.HTTPConnection
976
977 socks_proxy = req.headers.get('Ytdl-socks-proxy')
978 if socks_proxy:
979 conn_class = make_socks_conn_class(conn_class, socks_proxy)
980 del req.headers['Ytdl-socks-proxy']
981
982 return self.do_open(functools.partial(
983 _create_http_connection, self, conn_class, False),
984 req)
985
986 @staticmethod
987 def deflate(data):
988 try:
989 return zlib.decompress(data, -zlib.MAX_WBITS)
990 except zlib.error:
991 return zlib.decompress(data)
992
993 def http_request(self, req):
994 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
995 # always respected by websites, some tend to give out URLs with non percent-encoded
996 # non-ASCII characters (see telemb.py, ard.py [#3412])
997 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
998 # To work around aforementioned issue we will replace request's original URL with
999 # percent-encoded one
1000 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1001 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1002 url = req.get_full_url()
1003 url_escaped = escape_url(url)
1004
1005 # Substitute URL if any change after escaping
1006 if url != url_escaped:
1007 req = update_Request(req, url=url_escaped)
1008
1009 for h, v in std_headers.items():
1010 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1011 # The dict keys are capitalized because of this bug by urllib
1012 if h.capitalize() not in req.headers:
1013 req.add_header(h, v)
1014
1015 req.headers = handle_youtubedl_headers(req.headers)
1016
1017 if sys.version_info < (2, 7) and '#' in req.get_full_url():
1018 # Python 2.6 is brain-dead when it comes to fragments
1019 req._Request__original = req._Request__original.partition('#')[0]
1020 req._Request__r_type = req._Request__r_type.partition('#')[0]
1021
1022 return req
1023
def http_response(self, req, resp):
    """Decode compressed bodies and percent-encode redirect targets.

    A gzip or deflate encoded body is read, decoded and wrapped in a new
    response object that reuses the headers, URL, code and msg of the
    response passed in; the Content-encoding header is then removed.
    For 3xx responses the Location header is replaced by its escaped form
    when escaping changes it.
    """
    old_resp = resp

    def rewrap(body):
        # New response object around the decoded body, same metadata
        wrapped = compat_urllib_request.addinfourl(
            body, old_resp.headers, old_resp.url, old_resp.code)
        wrapped.msg = old_resp.msg
        return wrapped

    def gunzip(blob):
        return gzip.GzipFile(fileobj=io.BytesIO(blob), mode='rb').read()

    # gzip
    if resp.headers.get('Content-encoding', '') == 'gzip':
        content = resp.read()
        try:
            decoded = gunzip(content)
        except IOError as original_ioerror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            # Retry with 1..1023 trailing bytes cut off until one attempt decodes
            for trim in range(1, 1024):
                try:
                    decoded = gunzip(content[:-trim])
                except IOError:
                    continue
                break
            else:
                raise original_ioerror
        resp = rewrap(io.BytesIO(decoded))
        del resp.headers['Content-encoding']
    # deflate
    if resp.headers.get('Content-encoding', '') == 'deflate':
        resp = rewrap(io.BytesIO(self.deflate(resp.read())))
        del resp.headers['Content-encoding']
    # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
    # https://github.com/rg3/youtube-dl/issues/6457).
    if 300 <= resp.code < 400:
        location = resp.headers.get('Location')
        if location:
            is_py3 = sys.version_info >= (3, 0)
            # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
            if is_py3:
                location = location.encode('iso-8859-1').decode('utf-8')
            else:
                location = location.decode('utf-8')
            location_escaped = escape_url(location)
            if location != location_escaped:
                del resp.headers['Location']
                if not is_py3:
                    location_escaped = location_escaped.encode('utf-8')
                resp.headers['Location'] = location_escaped
    return resp
1070
# HTTPS requests and responses go through the same hooks as plain HTTP
https_request = http_request
https_response = http_response
1073
1074
def make_socks_conn_class(base_class, socks_proxy):
    """Create a connection class that connects through a SOCKS proxy.

    base_class must be HTTPConnection, HTTPSConnection or a subclass of
    either.  socks_proxy is the proxy URL; its scheme selects the protocol
    (socks5, socks4a, or socks/socks4), the port defaults to 1080 and any
    username/password are URL-unquoted before use.

    Raises ValueError when the scheme is not one of the supported ones.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Without this branch socks_type stayed unbound and the function
        # failed further down with an unhelpful UnboundLocalError
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %s' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # Wrap the proxied socket in TLS for HTTPS connections
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1116
1117
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that can open its connections through a SOCKS proxy."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        self._https_conn_class = (
            https_conn_class or compat_http_client.HTTPSConnection)

    def https_open(self, req):
        """Open req, using a SOCKS connection class when the request
        carries a Ytdl-socks-proxy header (the header is removed)."""
        conn_class = self._https_conn_class
        open_kwargs = {}

        # Forward the TLS settings the base handler exposes on this Python:
        # _context exists on python > 2.6, _check_hostname on python 3.x
        for attr, key in (('_context', 'context'),
                          ('_check_hostname', 'check_hostname')):
            if hasattr(self, attr):
                open_kwargs[key] = getattr(self, attr)

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1141
1142
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """MozillaCookieJar that round-trips session cookies as `expires=0`."""

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for session_cookie in self:
            if session_cookie.expires is None:
                session_cookie.expires = 0
        compat_cookiejar.MozillaCookieJar.save(
            self, filename,
            ignore_discard=ignore_discard, ignore_expires=ignore_expires)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        compat_cookiejar.MozillaCookieJar.load(
            self, filename,
            ignore_discard=ignore_discard, ignore_expires=ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as
        # session cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            if cookie.expires != 0:
                continue
            # Treat `expires=0` cookies as session cookies
            cookie.expires = None
            cookie.discard = True
1168
1169
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles HTTPS requests and responses."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is currently disabled (commented out),
        # so this method only delegates to the base class.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        base = compat_urllib_request.HTTPCookieProcessor
        return base.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1192
1193
def extract_timezone(date_str):
    """Split a trailing UTC offset off a date string.

    Returns a (timezone, date_str) tuple: timezone is a datetime.timedelta
    (zero when there is no offset or the suffix is "Z") and date_str is the
    input with the recognized suffix removed.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    stripped = date_str[:-len(tz_match.group('tz'))]
    if not tz_match.group('sign'):
        # Matched the bare "Z" alternative
        return datetime.timedelta(), stripped

    direction = 1 if tz_match.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, stripped
1210
1211
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter separates the date and time parts.  When timezone (a
    datetime.timedelta) is not given it is extracted from date_str.
    Returns None for a None input or when the date cannot be parsed.
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped before parsing
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        utc_dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_dt.timetuple())
    except ValueError:
        return None
1229
1230
def date_formats(day_first=True):
    """Return the day-first or month-first list of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1233
1234
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    # Every format is tried; the last one that parses determines the result
    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return compat_str(upload_date)
1261
1262
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # Any "PM" in the string shifts the result by 12 hours
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_tail = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if tz_tail:
        date_str = date_str[:-len(tz_tail.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    micro = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if micro:
        date_str = micro.group(1)

    pm_shift = datetime.timedelta(hours=pm_delta)
    for fmt in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, fmt) - timezone + pm_shift
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    # Fall back to RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
    return None
1294
1295
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, or return default_ext."""
    if url is None or '.' not in url:
        return default_ext
    # Text after the last dot, ignoring the query string
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1307
1308
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with `<sub_lang>.<sub_format>`."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1311
1312
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')
    # A bad approximation? Months count as 30 days, years as 365 days
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'
    return today + datetime.timedelta(**{unit + 's': amount})
1340
1341
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1350
1351
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing start or end leaves that side of the range open (the
        smallest/largest date datetime supports).  Raises ValueError when
        start is later than end.
        """
        self.start = (
            date_from_str(start) if start is not None
            else datetime.datetime.min.date())
        self.end = (
            date_from_str(end) if end is not None
            else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not a date object is parsed as a date string
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1381
1382
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode with the preferred encoding when platform() returned bytes
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1391
1392
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> GetStdHandle argument
    # (-11 / -12 are STD_OUTPUT_HANDLE / STD_ERROR_HANDLE in the Windows API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # AttributeError: the output stream doesn't have a fileno, it's virtual
        # UnsupportedOperation: some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    console = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(console):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none
        return next(
            (i for i, c in enumerate(text) if ord(c) > 0xffff), len(text))

    while s:
        # Write at most 1024 characters at a time, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; 2 is passed
        # as the length for it
        ret = WriteConsoleW(
            console, s, count or 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if count:
            assert written.value > 0
            s = s[written.value:]
        else:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
    return True
1466
1467
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32, when no encoding is given and out has a fileno, the write is
    first attempted through _windows_write_string().  Otherwise the text is
    encoded (unencodable characters are ignored) for byte streams, or
    written as-is to streams without a binary mode or buffer.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    wants_bytes = (
        'b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1488
1489
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and 1-character strings on Python 2
    if not isinstance(bs[0], int):
        return [ord(c) for c in bs]
    return list(bs)
1497
1498
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a byte string."""
    if xs:
        # One unsigned byte ('B') per element
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1503
1504
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on win32, fcntl.flock elsewhere, and
# stubs that raise IOError where fcntl is unavailable.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to lock and unlock
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        lock_flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, lock_flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
    else:
        def _lock_file(f, exclusive):
            operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            fcntl.flock(f, operation)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
1578
1579
class locked_file(object):
    """Context manager around a file that is locked while in use.

    The file is opened on construction, locked on entering the `with`
    block (exclusively unless the mode is 'r') and unlocked and closed on
    leaving it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1609
1610
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1614
1615
def shell_quote(args):
    """Quote each argument for the shell and join them with spaces."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(as_text(a)) for a in args)
1625
1626
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into a `#__youtubedl_smuggle=...` fragment.  Data
    already smuggled in url is kept and takes precedence over equal keys
    in data.  The caller's dict is not modified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a copy: updating `data` in place leaked previously
    # smuggled values into the dict owned by the caller
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
1635
1636
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1644
1645
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    Returns 'N/A' for None.  A str argument is converted with float().
    Values below one byte are shown in 'B' and values beyond the largest
    known unit are shown in 'YiB'.  Negative values raise ValueError
    (from math.log).
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: tiny fractions used to produce a
        # negative index (picking 'YiB') and huge values an IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1658
1659
def lookup_unit_table(unit_table, s):
    """Parse `<number> <unit>` at the start of s using unit_table.

    unit_table maps unit strings to multipliers; a comma in the number is
    treated as the decimal separator.  Returns the product as an int, or
    None when s does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1669
1670
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns None for None or when no known unit is found.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    # (prefix letter, decimal unit name, binary unit name), in ascending
    # order; the position gives the power of 1000 / 1024
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        lower = letter.lower()
        _UNIT_TABLE[letter + 'iB'] = binary
        _UNIT_TABLE[letter + 'B'] = decimal
        _UNIT_TABLE[lower + 'B'] = binary
        _UNIT_TABLE[letter + 'b'] = decimal
        _UNIT_TABLE[lower + 'b'] = decimal
        _UNIT_TABLE[decimal_name + 'bytes'] = decimal
        _UNIT_TABLE[binary_name + 'bytes'] = binary

    return lookup_unit_table(_UNIT_TABLE, s)
1740
1741
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an int.

    Returns None for None or when the text is neither a plain number nor a
    number followed by a known multiplier suffix.
    """
    if s is None:
        return None

    s = s.strip()

    # Digits with only separators: parse directly
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    thousand = 1000
    million = 1000 ** 2
    _UNIT_TABLE = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1761
1762
def parse_resolution(s):
    """Extract width/height information from a text.

    Returns a dict with 'width' and 'height' for `<w>x<h>`, with only
    'height' for forms like `720p`, `1080i`, `4k` or `8k`, and an empty
    dict when s is None or nothing matches.
    """
    if s is None:
        return {}

    dims = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if dims:
        width, height = (int(dims.group(g)) for g in ('w', 'h'))
        return {
            'width': width,
            'height': height,
        }

    lines = re.search(r'\b(\d+)[pPiI]\b', s)
    if lines:
        return {'height': int(lines.group(1))}

    # 4k / 8k: the digit times 540 gives the height
    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        return {'height': int(kilo.group(1)) * 540}

    return {}
1783
1784
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name

    Names are looked up in MONTH_NAMES[lang], falling back to the 'en'
    list for unknown languages.  Returns None for an unknown name.
    """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
1794
1795
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations

    The abbreviation is the first three letters of the month name.
    Returns None when abbrev matches no month. """

    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev not in abbreviations:
        return None
    return abbreviations.index(abbrev) + 1
1804
1805
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1812
1813
def setproctitle(title):
    """Set the process title through libc's prctl(), where available.

    Does nothing on Jython, when libc.so.6 cannot be loaded, or when the
    loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes makes it one item larger than
    # the title, so it always ends with a NUL byte.  A buffer of exactly
    # len(title_bytes) items had no terminator for prctl to stop at.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1838
1839
def remove_start(s, start):
    """Return s without the prefix start; s is returned as-is when it is
    None or does not begin with start."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1842
1843
def remove_end(s, end):
    """Return s without the suffix end; s is returned as-is when it is
    None, does not finish with end, or end is empty."""
    if s is None or not s.endswith(end):
        return s
    # An empty suffix must not go through the slice: s[:-0] is s[:0],
    # which would return an empty string instead of s
    return s[:-len(end)] if end else s
1846
1847
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around s."""
    if s is None or len(s) < 2:
        return s
    # Both ends must be the same quote character
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1855
1856
def url_basename(url):
    """Return the last segment of the URL's path, ignoring slashes at
    either end of the path."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1860
1861
def base_url(url):
    """Return the http(s) URL up to and including the last '/' that comes
    before any '?', '#' or '&'."""
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group(0)
1864
1865
def urljoin(base, path):
    """Join path onto base, returning None for unusable input.

    Byte strings are decoded as UTF-8.  A path that is already an http(s)
    or protocol-relative URL is returned unchanged; otherwise base must be
    such a URL, else None is returned.
    """
    has_netloc = r'^(?:https?:)?//'

    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(has_netloc, path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(has_netloc, base):
        return None
    return compat_urlparse.urljoin(base, path)
1879
1880
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1884
1885
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1889
1890
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    When get_attr is given, the attribute of that name is read from v
    first.  The result is `int(v) * invscale // scale`.  None, the empty
    string and values int() cannot convert all yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: unsupported type (e.g. a list); OverflowError: inf
        return default
1903
1904
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1907
1908
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before conversion; None is
    passed through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1915
1916
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when that is not possible.

    The result is `float(v) * invscale / scale`.  None and values float()
    cannot convert yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: unsupported type (e.g. a list or dict)
        return default
1924
1925
def bool_or_none(v, default=None):
    """Return v when it is a bool, default otherwise."""
    if isinstance(v, bool):
        return v
    return default
1928
1929
def strip_or_none(v):
    """Return v.strip(), or None when v is None."""
    if v is None:
        return None
    return v.strip()
1932
1933
def url_or_none(url):
    """Return the stripped url when it looks like a URL, else None.

    Accepted are strings starting with `<scheme>://` or the
    protocol-relative `//`.
    """
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url):
        return url
    return None
1939
1940
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in turn: colon separated fields
    (`[[[DD:]HH:]MM:]SS[.fff]`), unit-suffixed fields (ISO 8601 like
    `PT1H2M3S` or `1h 2min 3s`), and a single decimal amount of hours or
    minutes.  Returns None when s is not a string or matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5

    # 1. Colon separated fields
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()

    # 2. Fields with unit suffixes; years, months and weeks are matched
    # but not captured, so they do not count towards the result
    if not m:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()

    # 3. A single, possibly fractional, number of hours or minutes
    if not m:
        m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
        if m:
            hours, mins = m.groups()

    if not m:
        return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms holds the fractional part including the leading dot
        duration += float(ms)
    return duration
1997
1998
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename.

    When expected_real_ext is given and the real extension differs from
    it, ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
2005
2006
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename with ext.

    When expected_real_ext is given and the real extension differs from
    it, ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        stem = name
    else:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
2012
2013
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when the binary cannot be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
2022
2023
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The combined stdout/stderr of `exe args...` is passed on to
    detect_exe_version() together with version_re and unrecognized. """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
2041
2042
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re must contain one capture group for the version; by default
    the word after "version" is used.  Returns unrecognized when the
    pattern does not match.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2052
2053
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
2058
2059
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one at a time until a short page.

    pagefunc(pagenum) returns the entries of one page and pagesize is the
    number of entries on a full page.  Fetched pages are kept in a cache
    unless use_cache is false.
    """

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        pagesize = self._pagesize
        res = []
        for pagenum in itertools.count(start // pagesize):
            firstid = pagenum * pagesize
            nextfirstid = firstid + pagesize
            if start >= nextfirstid:
                continue

            page_results = self._cache.get(pagenum) if self._use_cache else None
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offsets inside this page; only pages containing start/end
            # need trimming
            startv = start % pagesize if firstid <= start < nextfirstid else 0
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
2110
2111
class InAdvancePagedList(PagedList):
    """ Paged list for sources whose number of pages is known up front.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum; pagecount is the total number of pages and
    pagesize the number of entries per page """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Returns the entries with indices in [start, end) as a list;
        end=None means "up to the last entry" """
        res = []
        start_page = start // self._pagesize
        # Never go past the last existing page, even if end points beyond
        # it: pagefunc must not be asked for pages that do not exist.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted (None: no upper bound)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the rest of the request
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
2139
2140
def uppercase_escape(s):
    """ Decodes every 8-hex-digit uppercase-U unicode escape sequence that
    appears literally in s and leaves the rest of the string untouched """
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode_match, s)
2147
2148
def lowercase_escape(s):
    """ Decodes every 4-hex-digit lowercase-u unicode escape sequence that
    appears literally in s and leaves the rest of the string untouched """
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode_match, s)
2155
2156
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 unicode input is handed to quote() as UTF-8 bytes
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, compat_str)
    to_quote = s.encode('utf-8') if is_py2_text else s
    return compat_urllib_parse.quote(to_quote, b"%/;:@&=+$,!~*'()?#[]")
2162
2163
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded; every other component that may carry
    # non-ASCII characters is percent-escaped
    escaped_parts = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped_parts.geturl()
2174
2175
def read_batch_urls(batch_fd):
    """ Reads the URLs of a batch file, one per line, and returns them as a
    list.

    Lines may be text or UTF-8 byte strings. A leading byte order mark and
    surrounding whitespace are removed; empty lines and lines starting with
    '#', ';' or ']' are dropped. batch_fd is closed before returning """
    # A UTF-8 BOM is the single character U+FEFF once the data has been
    # decoded as UTF-8 (which is also what the bytes branch below produces).
    # The three-character form is what the same bytes look like when they
    # were decoded one byte per character; it is kept for compatibility.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2190
2191
def urlencode_postdata(*args, **kargs):
    """ URL-encodes the given arguments and returns the result as an ASCII
    byte string """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2194
2195
def update_url_query(url, query):
    """ Returns url with the parameters of the query dict merged into its
    query string; parameters already present are overridden. A false query
    returns url unchanged """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2204
2205
def update_Request(req, url=None, data=None, headers={}, query={}):
    """ Builds a new request from req with url, data, headers and query
    parameters overridden/extended as given.

    The request class is picked from the HTTP method of req (HEAD and PUT
    have their own classes); origin_req_host, unverifiable and, if set,
    timeout are carried over """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    request_classes = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }
    request_class = request_classes.get(method, compat_urllib_request.Request)
    new_req = request_class(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2224
2225
def _multipart_encode_impl(data, boundary):
    """ Encodes the data dict as multipart/form-data using boundary.

    Returns a (body bytes, Content-Type value) tuple. Raises ValueError if
    the boundary occurs inside one of the encoded fields """
    content_type = 'multipart/form-data; boundary=%s' % boundary

    pieces = []
    for name, value in data.items():
        pieces.append(b'--' + boundary.encode('ascii') + b'\r\n')
        if isinstance(name, compat_str):
            name = name.encode('utf-8')
        if isinstance(value, compat_str):
            value = value.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        field = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary.encode('ascii') in field:
            raise ValueError('Boundary overlaps with data')
        pieces.append(field)

    pieces.append(b'--' + boundary.encode('ascii') + b'--\r\n')

    return b''.join(pieces), content_type
2246
2247
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple of the encoded body (bytes) and the matching
    Content-Type header value. A ValueError is raised if a caller-supplied
    boundary occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-chosen boundary is used as is; a clash with the data is
        # reported to the caller
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary clashed with the data - draw another one
            continue
2276
2277
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Looks up one key, or the first usable key of a list/tuple of keys.

    For a list/tuple, keys that are missing or map to None are skipped, and
    so are keys with false values unless skip_false_values is disabled;
    default is returned when no key qualifies. A single key is a plain
    d.get(key, default) """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2286
2287
def try_get(src, getter, expected_type=None):
    """ Applies getter (a callable, or a list/tuple of callables tried in
    order) to src and returns the first result obtained without an
    AttributeError, KeyError, TypeError or IndexError that is also an
    instance of expected_type (if given). Returns None otherwise """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for candidate in getters:
        try:
            value = candidate(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2299
2300
def merge_dicts(*dicts):
    """ Merges the dicts into a new one, earlier dicts taking precedence.

    None values are ignored. A value already present is only replaced when
    it is an empty string and the new value is a non-empty string """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key in merged:
                current = merged[key]
                fills_empty_string = (
                    isinstance(value, compat_str) and value
                    and isinstance(current, compat_str) and not current)
                if not fills_empty_string:
                    continue
            merged[key] = value
    return merged
2313
2314
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Returns string unchanged if it already is a compat_str, otherwise
    converts it with compat_str(string, encoding, errors).

    The default encoding is evaluated once, when the function is defined """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2317
2318
# Rating label -> age limit lookup; parse_age_limit() returns these values
# for strings that are exactly one of the keys.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2326
2327
# 'TV-*' rating label -> age limit lookup consulted by parse_age_limit(),
# which also accepts the labels written with '_' or without any separator
# after 'TV'.
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2336
2337
def parse_age_limit(s):
    """ Converts an age rating to an integer age limit, or None if it is
    not understood.

    Accepted are ints in the range 0..21, strings of one or two digits
    with an optional trailing '+', the keys of US_RATINGS and the
    TV_PARENTAL_GUIDELINES labels """
    # Exact type comparison: a bool does not take this branch
    if type(s) == int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    age_match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if age_match:
        return int(age_match.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    # The alternation is built from the label parts after the 'TV-' prefix
    tv_labels = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_match = re.match(r'^TV[_-]?(%s)$' % tv_labels, s)
    if tv_match:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_match.group(1)]
    return None
2352
2353
def strip_jsonp(code):
    """ Strips a JSONP wrapper such as callback(...); (optionally prefixed
    with "window." or guarded by "name && name", and optionally followed by
    // comments) and returns the callback argument text. Input that does
    not look like JSONP is returned unchanged """
    jsonp_re = r'''(?sx)^
        (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
        (?:\s*&&\s*(?P=func_name))?
        \s*\(\s*(?P<callback_data>.*)\);?
        \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
2362
2363
def js_to_json(code):
    """ Rewrites a JavaScript object/array literal into JSON text.

    String literals are re-quoted with double quotes, bare identifiers and
    numeric keys get quoted, hexadecimal and zero-prefixed octal integers
    are converted to decimal, comments are removed and so are commas that
    directly precede a closing bracket or brace """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )
    # Replacements applied inside string literals; anything not listed is
    # kept as it is
    STRING_FIXES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_string_part(part_match):
        part = part_match.group(0)
        return STRING_FIXES.get(part, part)

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith(('/*', '//')) or v == ',':
            # Comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Only the text between the quotes is kept from here on
            v = re.sub(r'(?s)\\.|"', fix_string_part, v[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                template = '"%d":' if v.endswith(':') else '%d'
                return template % i

        return '"%s"' % v

    token_re = r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE)
    return re.sub(token_re, fix_kv, code)
2403
2404
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The position in quality_ids is the quality; unknown ids rank
        # below all known ones
        try:
            position = quality_ids.index(qid)
        except ValueError:
            return -1
        else:
            return position
    return q
2413
2414
# Template string with %(title)s, %(id)s and %(ext)s placeholders.
# NOTE(review): presumably the default output filename template; the code
# using it is outside this part of the file.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2416
2417
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Leave room for the ellipses so that the result is length characters
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
2426
2427
def version_tuple(v):
    """ Splits a version string at '.' and '-' and returns its parts as a
    tuple of ints; raises ValueError for non-numeric parts """
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
2430
2431
def is_outdated_version(version, limit, assume_new=True):
    """ Returns whether version is older than limit.

    If version is empty or one of the versions cannot be parsed, the
    answer is derived from assume_new (new means not outdated) """
    when_unknown = not assume_new
    if not version:
        return when_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return when_unknown
2439
2440
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter ...
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    # ... or when the interpreter has a sys.frozen attribute
    return hasattr(sys, 'frozen')
2446
2447
def args_to_str(args):
    """ Returns a short string representation of a subprocess command:
    every argument shell-quoted, joined with single spaces """
    quoted_args = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
2451
2452
def error_to_compat_str(err):
    """ Returns the message of the error err as a text string """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2460
2461
def mimetype2ext(mt):
    """ Maps a MIME type to a file extension.

    A few full types are looked up directly; otherwise the subtype (the
    part after the last '/', without parameters, lowercased) is translated
    through a table and returned as is when it has no entry. None is
    passed through """
    if mt is None:
        return None

    FULL_TYPE_EXTS = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    SUBTYPE_EXTS = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }

    direct = FULL_TYPE_EXTS.get(mt)
    if direct is not None:
        return direct

    subtype = mt.rpartition('/')[2]
    subtype = subtype.split(';')[0].strip().lower()
    return SUBTYPE_EXTS.get(subtype, subtype)
2497
2498
def parse_codecs(codecs_str):
    """ Splits the value of an RFC 6381 "codecs" parameter into the
    'vcodec' and 'acodec' fields of a format dict.

    The first recognized video codec and the first recognized audio codec
    are used ('none' for a missing kind); a warning is written to stderr
    for every unrecognized entry. If nothing is recognized, exactly two
    entries are taken to be video and audio codec in that order. In every
    other case an empty dict is returned, i.e. nothing is claimed """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = [
        part.strip()
        for part in codecs_str.strip().strip(',').split(',')
        if part.strip()]
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        # Only the part before the first dot identifies the codec family
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(split_codecs) == 2:
        # Neither entry is known: rely on the usual "video, audio" order.
        # vcodec and acodec are both None at this point, so the parsed
        # entries themselves have to be returned.
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    # A single unknown codec (or three and more) cannot be classified;
    # claiming 'vcodec': 'none' for it would mislabel unknown video codecs
    return {}
2533
2534
def urlhandle_detect_ext(url_handle):
    """ Guesses the file extension for an opened URL handle: first from
    the filename of an attachment Content-Disposition header, then from
    the Content-Type header """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    filename_match = (
        re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if disposition else None)
    if filename_match:
        ext = determine_ext(filename_match.group('filename'), default_ext=None)
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
2547
2548
def encode_data_uri(data, mime_type):
    """ Returns a base64 data: URI carrying the bytes data with the given
    MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2551
2552
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked if no limit is set (age_limit is None) or if the
    # content is available for everyone (content_limit is None)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2561
2562
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The UTF-32 marks come before the UTF-16 ones because the UTF-32-LE
    # mark starts with the bytes of the UTF-16-LE one
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding = 'utf-8'
    payload = first_bytes
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            encoding = bom_encoding
            payload = first_bytes[len(bom):]
            break
    text = payload.decode(encoding, 'replace')

    # The match object (or None) is handed back as is
    return re.match(r'^\s*<', text)
2581
2582
def determine_protocol(info_dict):
    """ Returns the protocol of a format: its 'protocol' entry if set,
    otherwise a guess based on the 'url' entry (rtmp/mms/rtsp prefixes,
    then m3u8/f4m extensions, finally the URL scheme) """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2603
2604
2605 def render_table(header_row, data):
2606 """ Render a list of rows, each as a list of values """
2607 table = [header_row] + data
2608 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2609 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2610 return '\n'.join(format_str % tuple(row) for row in table)
2611
2612
def _match_one(filter_part, dct):
    """ Evaluates one filter expression against the dict dct.

    Two forms are understood: "key OP value" with the comparison operators
    <, <=, >, >=, = and != (a "?" right after the operator makes the part
    pass for a missing key), and "key" / "!key" which test for presence.
    Raises ValueError for anything else """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    comparison_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    comparison = comparison_rex.search(filter_part)
    if comparison:
        op_symbol = comparison.group('op')
        intval = comparison.group('intval')
        quotedstrval = comparison.group('quotedstrval')
        strval = comparison.group('strval')
        actual_value = dct.get(comparison.group('key'))
        # If the original field is a string and matching comparisonvalue is
        # a number we should respect the origin of the original field
        # and process comparison value as a string (see
        # https://github.com/rg3/youtube-dl/issues/11082).
        compare_as_string = (
            quotedstrval is not None
            or strval is not None
            or (actual_value is not None and intval is not None
                and isinstance(actual_value, compat_str)))
        if compare_as_string:
            if op_symbol not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_symbol)
            comparison_value = quotedstrval or strval or intval
            quote = comparison.group('quote')
            if quote is not None:
                # Undo the escaping of the quote character
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            try:
                comparison_value = int(intval)
            except ValueError:
                # Not a plain integer: try a file size, with and without
                # an explicit byte suffix
                comparison_value = parse_filesize(intval)
                if comparison_value is None:
                    comparison_value = parse_filesize(intval + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            intval, filter_part))
        if actual_value is None:
            # The captured "?" text (truthy) or None is returned as is
            return comparison.group('none_inclusive')
        return COMPARISON_OPERATORS[op_symbol](actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    unary_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    unary = unary_rex.search(filter_part)
    if unary:
        check = UNARY_OPERATORS[unary.group('op')]
        return check(dct.get(unary.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
2681
2682
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The filter is a conjunction of '&'-separated parts; evaluation stops
    # at the first part that does not pass
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2688
2689
def match_filter_func(filter_str):
    """ Returns a function that checks an info dict against filter_str.

    The returned function yields None if the dict passes the filter and
    a message explaining the skip otherwise """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by title, falling back to its id
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2698
2699
def parse_dfxp_time_expr(time_expr):
    """ Converts a DFXP time expression to seconds (float).

    Understood are plain offsets in seconds with an optional "s" suffix
    and clock times of the form H:MM:SS with an optional fraction that is
    introduced by "." or ":". Anything else, including an empty value,
    gives None """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2711
2712
def srt_subtitles_timecode(seconds):
    """ Formats a number of seconds as an SRT timecode HH:MM:SS,mmm """
    # The %d conversions drop the fractional part of every field
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    whole_seconds = seconds % 60
    milliseconds = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, whole_seconds, milliseconds)
2715
2716
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT.

    Every paragraph with a usable begin time and either an end time or a
    duration becomes one SRT cue. The styling attributes listed in
    SUPPORTED_STYLING are rendered as <b>, <i>, <u> and <font> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no paragraphs
    '''
    # Current namespace -> legacy namespaces that are rewritten to it
    # before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # Style id -> resolved style properties
    styles = {}
    # Properties of the styles referenced by the body/div elements
    default_style = {}

    class TTMLPElementParser(object):
        """ XMLParser target that renders one paragraph as SRT text """
        _out = ''
        # NOTE(review): these two lists are class attributes that are
        # mutated in place, so all parser instances created during one
        # dfxp2srt() call share them and style state can carry over from
        # one paragraph to the next. Left unchanged on purpose: the
        # produced output depends on it - confirm against the expected
        # outputs before touching it.
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence, lowest first: default style, referenced
                # style, inline tts:* attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Properties already in effect need no new tag
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close the tags opened for this element, innermost first
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialize the paragraph and feed it through the rendering target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve the style definitions. A style may refer to a parent style
    # that is only defined later in the document; such a style cannot be
    # resolved in the same pass, so passes are repeated.
    while True:
        repeat = False
        resolved_before = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # Done when every parent was found. Also stop when a whole pass
        # registered no new style: the parents still missing are never
        # going to appear (undefined, or without any supported property),
        # and repeating the pass again would loop forever.
        if not repeat or len(styles) == resolved_before:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Without an end time the duration is needed to compute one
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2879
2880
def cli_option(params, command_option, param):
    """ Returns [command_option, value] for the entry param of params, or
    an empty list if it is missing or None. True values are converted to
    text; other false values are passed through unchanged """
    value = params.get(param)
    if value:
        value = compat_str(value)
    if value is None:
        return []
    return [command_option, value]
2886
2887
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Renders the boolean entry param of params as a command line option.

    Returns [] if the entry is missing or None. Otherwise the option is
    followed by true_value or false_value, either as a separate argument
    or, if separator is given, joined into a single argument """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2896
2897
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Returns [command_option] if the entry param of params equals
    expected_value, otherwise an empty list """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2901
2902
def cli_configuration_args(params, param, default=[]):
    """ Returns the list of extra arguments stored under param in params,
    or default if there is none. Neither list is copied """
    ex_args = params.get(param)
    if ex_args is not None:
        assert isinstance(ex_args, list)
        return ex_args
    return default
2909
2910
class ISO639Utils(object):
    """ Conversion between ISO 639-1 (two letter) and ISO 639-2/T (three
    letter) language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters of code take part in the lookup
        two_letter_code = code[:2]
        return cls._lang_map.get(two_letter_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # The first entry (in the map's iteration order) that maps to code
        # wins; None if there is none
        candidates = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(candidates, None)
3114
3115
3116 class ISO3166Utils(object):
3117 # From http://data.okfn.org/data/core/country-list
3118 _country_map = {
3119 'AF': 'Afghanistan',
3120 'AX': 'Åland Islands',
3121 'AL': 'Albania',
3122 'DZ': 'Algeria',
3123 'AS': 'American Samoa',
3124 'AD': 'Andorra',
3125 'AO': 'Angola',
3126 'AI': 'Anguilla',
3127 'AQ': 'Antarctica',
3128 'AG': 'Antigua and Barbuda',
3129 'AR': 'Argentina',
3130 'AM': 'Armenia',
3131 'AW': 'Aruba',
3132 'AU': 'Australia',
3133 'AT': 'Austria',
3134 'AZ': 'Azerbaijan',
3135 'BS': 'Bahamas',
3136 'BH': 'Bahrain',
3137 'BD': 'Bangladesh',
3138 'BB': 'Barbados',
3139 'BY': 'Belarus',
3140 'BE': 'Belgium',
3141 'BZ': 'Belize',
3142 'BJ': 'Benin',
3143 'BM': 'Bermuda',
3144 'BT': 'Bhutan',
3145 'BO': 'Bolivia, Plurinational State of',
3146 'BQ': 'Bonaire, Sint Eustatius and Saba',
3147 'BA': 'Bosnia and Herzegovina',
3148 'BW': 'Botswana',
3149 'BV': 'Bouvet Island',
3150 'BR': 'Brazil',
3151 'IO': 'British Indian Ocean Territory',
3152 'BN': 'Brunei Darussalam',
3153 'BG': 'Bulgaria',
3154 'BF': 'Burkina Faso',
3155 'BI': 'Burundi',
3156 'KH': 'Cambodia',
3157 'CM': 'Cameroon',
3158 'CA': 'Canada',
3159 'CV': 'Cape Verde',
3160 'KY': 'Cayman Islands',
3161 'CF': 'Central African Republic',
3162 'TD': 'Chad',
3163 'CL': 'Chile',
3164 'CN': 'China',
3165 'CX': 'Christmas Island',
3166 'CC': 'Cocos (Keeling) Islands',
3167 'CO': 'Colombia',
3168 'KM': 'Comoros',
3169 'CG': 'Congo',
3170 'CD': 'Congo, the Democratic Republic of the',
3171 'CK': 'Cook Islands',
3172 'CR': 'Costa Rica',
3173 'CI': 'Côte d\'Ivoire',
3174 'HR': 'Croatia',
3175 'CU': 'Cuba',
3176 'CW': 'Curaçao',
3177 'CY': 'Cyprus',
3178 'CZ': 'Czech Republic',
3179 'DK': 'Denmark',
3180 'DJ': 'Djibouti',
3181 'DM': 'Dominica',
3182 'DO': 'Dominican Republic',
3183 'EC': 'Ecuador',
3184 'EG': 'Egypt',
3185 'SV': 'El Salvador',
3186 'GQ': 'Equatorial Guinea',
3187 'ER': 'Eritrea',
3188 'EE': 'Estonia',
3189 'ET': 'Ethiopia',
3190 'FK': 'Falkland Islands (Malvinas)',
3191 'FO': 'Faroe Islands',
3192 'FJ': 'Fiji',
3193 'FI': 'Finland',
3194 'FR': 'France',
3195 'GF': 'French Guiana',
3196 'PF': 'French Polynesia',
3197 'TF': 'French Southern Territories',
3198 'GA': 'Gabon',
3199 'GM': 'Gambia',
3200 'GE': 'Georgia',
3201 'DE': 'Germany',
3202 'GH': 'Ghana',
3203 'GI': 'Gibraltar',
3204 'GR': 'Greece',
3205 'GL': 'Greenland',
3206 'GD': 'Grenada',
3207 'GP': 'Guadeloupe',
3208 'GU': 'Guam',
3209 'GT': 'Guatemala',
3210 'GG': 'Guernsey',
3211 'GN': 'Guinea',
3212 'GW': 'Guinea-Bissau',
3213 'GY': 'Guyana',
3214 'HT': 'Haiti',
3215 'HM': 'Heard Island and McDonald Islands',
3216 'VA': 'Holy See (Vatican City State)',
3217 'HN': 'Honduras',
3218 'HK': 'Hong Kong',
3219 'HU': 'Hungary',
3220 'IS': 'Iceland',
3221 'IN': 'India',
3222 'ID': 'Indonesia',
3223 'IR': 'Iran, Islamic Republic of',
3224 'IQ': 'Iraq',
3225 'IE': 'Ireland',
3226 'IM': 'Isle of Man',
3227 'IL': 'Israel',
3228 'IT': 'Italy',
3229 'JM': 'Jamaica',
3230 'JP': 'Japan',
3231 'JE': 'Jersey',
3232 'JO': 'Jordan',
3233 'KZ': 'Kazakhstan',
3234 'KE': 'Kenya',
3235 'KI': 'Kiribati',
3236 'KP': 'Korea, Democratic People\'s Republic of',
3237 'KR': 'Korea, Republic of',
3238 'KW': 'Kuwait',
3239 'KG': 'Kyrgyzstan',
3240 'LA': 'Lao People\'s Democratic Republic',
3241 'LV': 'Latvia',
3242 'LB': 'Lebanon',
3243 'LS': 'Lesotho',
3244 'LR': 'Liberia',
3245 'LY': 'Libya',
3246 'LI': 'Liechtenstein',
3247 'LT': 'Lithuania',
3248 'LU': 'Luxembourg',
3249 'MO': 'Macao',
3250 'MK': 'Macedonia, the Former Yugoslav Republic of',
3251 'MG': 'Madagascar',
3252 'MW': 'Malawi',
3253 'MY': 'Malaysia',
3254 'MV': 'Maldives',
3255 'ML': 'Mali',
3256 'MT': 'Malta',
3257 'MH': 'Marshall Islands',
3258 'MQ': 'Martinique',
3259 'MR': 'Mauritania',
3260 'MU': 'Mauritius',
3261 'YT': 'Mayotte',
3262 'MX': 'Mexico',
3263 'FM': 'Micronesia, Federated States of',
3264 'MD': 'Moldova, Republic of',
3265 'MC': 'Monaco',
3266 'MN': 'Mongolia',
3267 'ME': 'Montenegro',
3268 'MS': 'Montserrat',
3269 'MA': 'Morocco',
3270 'MZ': 'Mozambique',
3271 'MM': 'Myanmar',
3272 'NA': 'Namibia',
3273 'NR': 'Nauru',
3274 'NP': 'Nepal',
3275 'NL': 'Netherlands',
3276 'NC': 'New Caledonia',
3277 'NZ': 'New Zealand',
3278 'NI': 'Nicaragua',
3279 'NE': 'Niger',
3280 'NG': 'Nigeria',
3281 'NU': 'Niue',
3282 'NF': 'Norfolk Island',
3283 'MP': 'Northern Mariana Islands',
3284 'NO': 'Norway',
3285 'OM': 'Oman',
3286 'PK': 'Pakistan',
3287 'PW': 'Palau',
3288 'PS': 'Palestine, State of',
3289 'PA': 'Panama',
3290 'PG': 'Papua New Guinea',
3291 'PY': 'Paraguay',
3292 'PE': 'Peru',
3293 'PH': 'Philippines',
3294 'PN': 'Pitcairn',
3295 'PL': 'Poland',
3296 'PT': 'Portugal',
3297 'PR': 'Puerto Rico',
3298 'QA': 'Qatar',
3299 'RE': 'Réunion',
3300 'RO': 'Romania',
3301 'RU': 'Russian Federation',
3302 'RW': 'Rwanda',
3303 'BL': 'Saint Barthélemy',
3304 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3305 'KN': 'Saint Kitts and Nevis',
3306 'LC': 'Saint Lucia',
3307 'MF': 'Saint Martin (French part)',
3308 'PM': 'Saint Pierre and Miquelon',
3309 'VC': 'Saint Vincent and the Grenadines',
3310 'WS': 'Samoa',
3311 'SM': 'San Marino',
3312 'ST': 'Sao Tome and Principe',
3313 'SA': 'Saudi Arabia',
3314 'SN': 'Senegal',
3315 'RS': 'Serbia',
3316 'SC': 'Seychelles',
3317 'SL': 'Sierra Leone',
3318 'SG': 'Singapore',
3319 'SX': 'Sint Maarten (Dutch part)',
3320 'SK': 'Slovakia',
3321 'SI': 'Slovenia',
3322 'SB': 'Solomon Islands',
3323 'SO': 'Somalia',
3324 'ZA': 'South Africa',
3325 'GS': 'South Georgia and the South Sandwich Islands',
3326 'SS': 'South Sudan',
3327 'ES': 'Spain',
3328 'LK': 'Sri Lanka',
3329 'SD': 'Sudan',
3330 'SR': 'Suriname',
3331 'SJ': 'Svalbard and Jan Mayen',
3332 'SZ': 'Swaziland',
3333 'SE': 'Sweden',
3334 'CH': 'Switzerland',
3335 'SY': 'Syrian Arab Republic',
3336 'TW': 'Taiwan, Province of China',
3337 'TJ': 'Tajikistan',
3338 'TZ': 'Tanzania, United Republic of',
3339 'TH': 'Thailand',
3340 'TL': 'Timor-Leste',
3341 'TG': 'Togo',
3342 'TK': 'Tokelau',
3343 'TO': 'Tonga',
3344 'TT': 'Trinidad and Tobago',
3345 'TN': 'Tunisia',
3346 'TR': 'Turkey',
3347 'TM': 'Turkmenistan',
3348 'TC': 'Turks and Caicos Islands',
3349 'TV': 'Tuvalu',
3350 'UG': 'Uganda',
3351 'UA': 'Ukraine',
3352 'AE': 'United Arab Emirates',
3353 'GB': 'United Kingdom',
3354 'US': 'United States',
3355 'UM': 'United States Minor Outlying Islands',
3356 'UY': 'Uruguay',
3357 'UZ': 'Uzbekistan',
3358 'VU': 'Vanuatu',
3359 'VE': 'Venezuela, Bolivarian Republic of',
3360 'VN': 'Viet Nam',
3361 'VG': 'Virgin Islands, British',
3362 'VI': 'Virgin Islands, U.S.',
3363 'WF': 'Wallis and Futuna',
3364 'EH': 'Western Sahara',
3365 'YE': 'Yemen',
3366 'ZM': 'Zambia',
3367 'ZW': 'Zimbabwe',
3368 }
3369
@classmethod
def short2full(cls, code):
    """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name

    The lookup is case-insensitive (the code is upper-cased first).
    Returns None when the code is not present in the country map.
    """
    normalized_code = code.upper()
    return cls._country_map.get(normalized_code)
3374
3375
class GeoUtils(object):
    """Helpers for picking IPv4 addresses that belong to a given country"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random dotted-quad IPv4 address from a country or CIDR block

        A two-character argument is treated as a country code and looked up
        (case-insensitively) in _country_ip_map; None is returned when the
        code has no entry. Any other argument is used as an "address/prefix"
        block as is.
        """
        block = code_or_block
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_len = block.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting every host bit yields the highest address of the block
        highest = lowest | (0xffffffff >> int(prefix_len))
        chosen = random.randint(lowest, highest)
        return compat_str(socket.inet_ntoa(compat_struct_pack('!L', chosen)))
3631
3632
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden per request

    A request carrying a 'Ytdl-request-proxy' header uses that value as its
    proxy; the header is removed before the request is processed further.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            setattr(
                self, scheme + '_open',
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # youtube-dl's http/https handlers wrap the socket with socks;
            # only the chosen proxy is recorded on the request here
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3656
3657
3658 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3659 # released into Public Domain
3660 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3661
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    If optional blocksize is given and greater than zero, the front of the
    byte string is padded with binary zeros so that its length is a multiple
    of blocksize. Values that are not greater than zero encode as a single
    zero byte (before padding).
    """
    n = int(n)
    words = []
    while n > 0:
        # Collected least-significant 32-bit word first
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    # Drop leading zero bytes, but never return an empty string
    s = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    remainder = len(s) % blocksize if blocksize > 0 else 0
    if remainder:
        s = b'\000' * (blocksize - remainder) + s
    return s
3690
3691
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes(). An empty string
    converts to 0.
    """
    # Left-pad with zero bytes so the data splits into whole 32-bit words
    padding = -len(s) % 4
    if padding:
        s = b'\000' * padding + s
    acc = 0
    for offset in range(0, len(s), 4):
        word = compat_struct_unpack('>I', s[offset:offset + 4])[0]
        acc = (acc << 32) + word
    return acc
3707
3708
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The bytes are reversed before conversion, i.e. the data is read as a
    # little-endian integer
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
3724
3725
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme (v1.5 encryption padding, block type 2)

    The result has the layout 00 02 <nonzero padding> 00 <data>.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves fewer than 11 octets for padding
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding octets must be nonzero: the first zero octet after the 00 02
    # header marks where the message starts, so a zero inside the padding
    # would truncate it on the receiving side.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
3739
3740
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num as a string in base n

    @param num    integer to encode; must not be negative
    @param n      base; must not exceed the length of the digit table
    @param table  sequence of digit characters; defaults to the first n
                  characters of 0-9, a-z, A-Z
    @returns      the encoded string, most significant digit first
    @raises ValueError if n exceeds the table length, if num is negative,
                  or if n is smaller than 2 for a nonzero num
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Floor division never brings a negative number (or any nonzero number
    # in base 1) down to zero, so the loop below would not terminate
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small' % n)

    ret = ''
    while num:
        num, digit = divmod(num, n)
        ret = table[digit] + ret
    return ret
3757
3758
def decode_packed_codes(code):
    """Expand code whose words were replaced by base-n indices into a symbol list

    PACKED_CODES_RE must capture, in order, the obfuscated code, the base,
    the symbol count and the '|'-separated symbols. Every word of the
    obfuscated code is replaced by the symbol stored at the index the word
    encodes; an empty symbol stands for the word itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    packed_source, radix, remaining, words = match.groups()
    radix = int(radix)
    remaining = int(remaining)
    words = words.split('|')

    lookup = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        lookup[token] = words[remaining] or token

    return re.sub(
        r'\b(\w+)\b', lambda word: lookup[word.group(0)],
        packed_source)
3775
3776
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=VALUE attribute list into a dict

    Keys consist of upper-case letters, digits and dashes. Surrounding double
    quotes are stripped from quoted values; quoted values may contain commas.
    """
    info = {}
    for mobj in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        val = mobj.group('val')
        info[mobj.group('key')] = val[1:-1] if val.startswith('"') else val
    return info
3784
3785
def urshift(val, n):
    """Shift val right by n bits without sign extension

    A negative val has 2**32 added first, so it is shifted as its unsigned
    32-bit two's complement representation.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
3788
3789
3790 # Based on png2str() written by @gdkchan and improved by @yokrysty
3791 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG data into rows of unfiltered sample values

    Scanlines are read as width * 3 bytes each, i.e. 3 bytes per pixel
    (presumably 8-bit RGB; other layouts are not handled here).

    Returns a (width, height, pixels) tuple where pixels is a list of rows
    and every row is a flat list of width * 3 integers.
    Raises IOError if the signature or the leading IHDR chunk is missing, or
    if there is no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature, remaining = png_data[:8], png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        return compat_struct_unpack(formats_by_size[len(raw)], raw)[0]

    # Each chunk is: 4-byte length, 4-byte type, payload, 4-byte CRC
    chunks = []
    while remaining:
        size = read_uint(remaining[:4])
        chunks.append((remaining[4:8], remaining[8:8 + size]))
        remaining = remaining[8 + size + 4:]  # CRC is skipped, not verified

    ihdr = chunks[0][1]
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    idat = b''.join(payload for kind, payload in chunks if kind == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for row_index in range(height):
        # Every scanline is prefixed with one filter-type byte
        row_start = row_index * (1 + stride)
        filter_type = decompressed[row_start]

        above = pixels[row_index - 1] if row_index > 0 else None
        row = []
        pixels.append(row)

        for col in range(stride):
            value = decompressed[row_start + 1 + col]
            # Neighbours are the same channel of the pixel to the left
            # (3 bytes back) and of the pixel above; 0 when outside the image
            left = row[col - 3] if col > 2 else 0
            up = above[col] if above is not None else 0

            if filter_type == 1:  # Sub
                value = (value + left) & 0xff
            elif filter_type == 2:  # Up
                value = (value + up) & 0xff
            elif filter_type == 3:  # Average
                value = (value + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = 0
                if col > 2 and above is not None:
                    upper_left = above[col - 3]

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                # Pick the neighbour closest to the estimate; ties are
                # resolved in the order left, up, upper-left
                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
                value = (value + predictor) & 0xff

            row.append(value)

    return width, height, pixels
3895
3896
def write_xattr(path, key, value):
    """Set the extended attribute key to value on the file at path

    The Python xattr bindings are tried first. If importing them fails, the
    attribute is written to an NTFS Alternate Data Stream on Windows, or set
    through the 'setfattr' or 'xattr' command line tools elsewhere.

    Raises XAttrUnavailableError if the detected pyxattr is too old or no
    suitable tool is found, and XAttrMetadataError if setting the attribute
    fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        is_pyxattr = hasattr(xattr, 'set')
        if is_pyxattr:
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

        # pyxattr exposes set(), the xattr module exposes setxattr()
        setxattr = xattr.set if is_pyxattr else xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        # The value is passed as a command line argument, so it is decoded
        # to text first
        value = value.decode('utf-8')
        if user_has_setfattr:
            executable = 'setfattr'
            opts = ['-n', key, '-v', value]
        else:
            executable = 'xattr'
            opts = ['-w', key, value]

        cmd = ([encodeFilename(executable, True)] +
               [encodeArgument(o) for o in opts] +
               [encodeFilename(path, True)])

        try:
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        stdout, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)
3979
3980
def random_birthday(year_field, month_field, day_field):
    """Generate a random date of birth keyed by the given field names

    The date is drawn uniformly from 1950-01-01 to 1995-12-31 (inclusive).
    Returns a dict mapping year_field, month_field and day_field to the
    year, month and day as strings (no zero padding).
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    field_names = (year_field, month_field, day_field)
    date_parts = (birthday.year, birthday.month, birthday.day)
    return dict(
        (field, str(part)) for field, part in zip(field_names, date_parts))