]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
New upstream version 2017.02.24.1
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import random
27 import re
28 import socket
29 import ssl
30 import subprocess
31 import sys
32 import tempfile
33 import traceback
34 import xml.etree.ElementTree
35 import zlib
36
37 from .compat import (
38 compat_HTMLParser,
39 compat_basestring,
40 compat_chr,
41 compat_etree_fromstring,
42 compat_html_entities,
43 compat_html_entities_html5,
44 compat_http_client,
45 compat_kwargs,
46 compat_os_name,
47 compat_parse_qs,
48 compat_shlex_quote,
49 compat_socket_create_connection,
50 compat_str,
51 compat_struct_pack,
52 compat_struct_unpack,
53 compat_urllib_error,
54 compat_urllib_parse,
55 compat_urllib_parse_urlencode,
56 compat_urllib_parse_urlparse,
57 compat_urllib_parse_unquote_plus,
58 compat_urllib_request,
59 compat_urlparse,
60 compat_xpath,
61 )
62
63 from .socks import (
64 ProxyType,
65 sockssocket,
66 )
67
68
def register_socks_protocols():
    """Register the SOCKS proxy schemes with urlparse.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not listed in
    urlparse.uses_netloc are not handled correctly, so every SOCKS scheme
    that is still missing from that list is appended to it.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    missing = [
        scheme for scheme in ('socks', 'socks4', 'socks4a', 'socks5')
        if scheme not in netloc_schemes]
    netloc_schemes.extend(missing)
76
77
# The type of a compiled regular expression is not exposed under a stable
# public name, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Headers added to every HTTP request that does not already carry them.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name.
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel meaning "no default supplied" (None is a legitimate default).
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Month names per language code, January first.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre',
    ],
}

# Known media file extensions, grouped by container family.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil',
)

# needed for sanitizing filenames in restricted mode:
# maps each accented character to its ASCII replacement (possibly two letters)
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy',
    ),
))

# strptime() formats that are unambiguous regardless of day/month order.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

# Common formats followed by the ambiguous ones read as day-first.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) + [
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
]

# Common formats followed by the ambiguous ones read as month-first.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) + [
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
]

# Captures the four arguments handed to a packed-JavaScript unpacker call.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
182
183
def preferredencoding():
    """Return the encoding to use for text on this system.

    The locale's preferred encoding is used when it can actually encode
    text; if looking it up or using it fails for any reason, 'UTF-8' is
    returned instead.
    """
    try:
        encoding_name = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken name raises here
        'TEST'.encode(encoding_name)
    except Exception:
        return 'UTF-8'
    return encoding_name
197
198
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; cleanup is best effort
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
251
252
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the first element matching xpath[@key] or xpath[@key='val']

        Returns the element, or None when nothing matches. key may only
        consist of ASCII letters, '_' and '-'.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the first element matching xpath that carries attribute key
        (equal to val when val is given); the filtering is done by hand """
        for candidate in node.findall(compat_xpath(xpath)):
            if key in candidate.attrib and (
                    val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
267
268 # On python2.6 the xml.etree.ElementTree.Element methods don't support
269 # the namespace parameter
270
271
def xpath_with_ns(path, ns_map):
    """Expand the 'prefix:tag' steps of path into '{namespace}tag' form.

    ns_map maps each prefix to its namespace URI. Steps without a prefix
    are left untouched. An unknown prefix raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
282
283
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element of node matching xpath.

    xpath is either a single expression or an iterable of expressions that
    are tried in order until one matches. When nothing matches (including
    when the iterable is empty): default is returned if one was given;
    otherwise ExtractorError is raised if fatal is set (name, or xpath when
    name is None, is used in the message); otherwise None is returned.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Start from "not found" so an empty iterable is handled like a miss
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
305
306
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element that xpath_element() finds.

    Whatever xpath_element() returns for a miss (None or default) is passed
    through. When the element exists but has no text: default is returned
    if one was given; otherwise ExtractorError is raised if fatal is set;
    otherwise None is returned.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element
    if element.text is not None:
        return element.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
320
321
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath[@key].

    When no such element exists: default is returned if one was given;
    otherwise ExtractorError is raised if fatal is set; otherwise None is
    returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        if name is None:
            name = '%s[@%s]' % (xpath, key)
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
333
334
def get_element_by_id(id, html):
    """Return the content of the first tag in html whose id attribute equals id, or None"""
    content = get_element_by_attribute('id', id, html)
    return content
338
339
def get_element_by_class(class_name, html):
    """Return the content of the first tag in html carrying class class_name, or None"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
344
345
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag in html whose attribute matches value, or None

    escape_value is forwarded to get_elements_by_attribute().
    """
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
349
350
def get_elements_by_class(class_name, html):
    """Return a list with the content of every tag in html carrying class class_name"""
    # Match class_name as a whole word anywhere inside the class attribute
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute(
        'class', class_pattern, html, escape_value=False)
356
357
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return a list with the content of every tag in html whose attribute matches value

    value is matched literally unless escape_value is False, in which case
    it is used as a regular expression fragment. HTML entities in the
    returned contents are decoded.
    """

    if escape_value:
        value = re.escape(value)

    element_re = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value)

    contents = []
    for match in re.finditer(element_re, html):
        content = match.group('content')

        # Drop a surrounding pair of quote characters from the content
        if content.startswith(('"', "'")):
            content = content[1:-1]

        contents.append(unescapeHTML(content))

    return contents
381
382
class HTMLAttributeParser(compat_HTMLParser):
    """Minimal HTML parser that records the attributes of an element

    After feeding markup, attrs holds the attributes of the most recently
    seen start tag as a dict (empty if no start tag was seen).
    """

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs; keep them as a mapping
        self.attrs = dict(attrs)
391
392
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dictionary.

    Given the string of an HTML element such as
        <el
            a="foo" B="bar" c="&#98;az" d=boz
            empty= noval entity="&amp;"
            sq='"' dq="'"
        >
    the result is
        {
            'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
            'empty': '', 'noval': None, 'entity': '&',
            'sq': '"', 'dq': "'"
        }
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attribute_parser = HTMLAttributeParser()
    attribute_parser.feed(html_element)
    attribute_parser.close()
    return attribute_parser.attrs
413
414
def clean_html(html):
    """Turn an HTML snippet into readable plain text (None is passed through)"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    text = html.replace('\n', ' ')
    substitutions = (
        # <br> and paragraph boundaries become the only line breaks
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Every remaining tag is dropped
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Decode the HTML entities last
    return unescapeHTML(text).strip()
430
431
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a sanitized name if that fails.

    '-' stands for standard output (its binary buffer where available).
    If opening fails for a reason other than a permission error, the name
    is passed through sanitize_path() and, if that changed it, opened
    again; an error from that second attempt propagates to the caller.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            out = sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout
            return (out, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
462
463
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
471
472
def sanitize_filename(s, restricted=False, is_id=False):
    """Make a string usable as part of a filename.

    If restricted is set, a stricter subset of characters is allowed.
    Set is_id if s is not an arbitrary string but an ID that should be kept
    as intact as possible: the post-processing that collapses and trims
    underscores, dashes and dots is then skipped.
    """
    def clean_char(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    # Handle timestamps: 12:34:56 becomes 12_34_56
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join([clean_char(char) for char in s])
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    result = re.sub(r'_{2,}', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result or '_'
511
512
def sanitize_path(s):
    """Sanitize and normalize a path on Windows; a no-op elsewhere.

    Within every path component except '.' and '..', each character from
    /<>:"|\\?* as well as a trailing whitespace character or dot is
    replaced by '#'. A drive or UNC prefix is kept as is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc = os.path.splitdrive(s)[0]
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc = os.path.splitunc(s)[0]
    components = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the first (leading) component left after the prefix was cut off
        components.pop(0)
    cleaned = []
    for component in components:
        if component not in ('.', '..'):
            component = re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', component)
        cleaned.append(component)
    if drive_or_unc:
        cleaned.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*cleaned)
529
530
def sanitize_url(url):
    """Prepend `http:` to protocol-less URLs (those starting with `//`)

    This mitigates the number of unwanted failures due to missing protocol.
    Any other URL is returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
535
536
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url()

    All remaining arguments are handed to the Request constructor as is.
    """
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
539
540
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates

    The first occurrence of every item is kept, in the original order.
    Items are compared by equality, so they need not be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
548
549
def _htmlentity_transform(entity_with_semicolon):
    """Transform an HTML entity (without '&', with ';') to a character.

    Named entities are looked up first, then numeric ones are decoded.
    Anything that cannot be decoded is returned in its literal
    '&...;' representation.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    named = compat_html_entities.name2codepoint
    if entity in named:
        return compat_chr(named[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    numeric = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if numeric is not None:
        digits = numeric.group(1)
        if digits.startswith('x'):
            # 'x41' becomes '0x41' so that int() accepts it in base 16
            base, digits = 16, '0%s' % digits
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(digits, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
579
580
def unescapeHTML(s):
    """Decode the HTML entities in s (None is passed through)

    s must be a text string; entities that cannot be decoded are kept.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+;)', decode_entity, s)
588
589
def get_subprocess_encoding():
    """Return the encoding used for file names and subprocess arguments.

    On Windows 2000 and up this is the locale encoding, elsewhere the
    filesystem encoding; 'utf-8' is used when that is not known.
    """
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    # For subprocess calls, encode with locale encoding
    # Refer to http://stackoverflow.com/a/9951851/35070
    encoding = preferredencoding() if on_modern_windows else sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
600
601
def encodeFilename(s, for_subprocess=False):
    """
    Encode a file name for use with the filesystem or a subprocess.

    @param s The name of the file
    @param for_subprocess Whether the name is meant for a subprocess call

    The name is returned unchanged wherever Unicode names can be used
    directly; otherwise it is encoded with get_subprocess_encoding(),
    dropping characters that cannot be encoded.
    """

    assert type(s) == compat_str

    keep_unicode = (
        # Python 3 has a Unicode API
        sys.version_info >= (3, 0) or
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        (not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5) or
        # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
        sys.platform.startswith('java'))
    if keep_unicode:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
624
625
def decodeFilename(b, for_subprocess=False):
    """Decode a file name obtained from the filesystem or a subprocess.

    On Python 3, and for anything that is not a byte string, the value is
    returned unchanged; otherwise it is decoded with
    get_subprocess_encoding(), dropping undecodable bytes.
    for_subprocess is accepted but not used.
    """
    already_text = sys.version_info >= (3, 0) or not isinstance(b, bytes)
    if already_text:
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
635
636
def encodeArgument(s):
    """Encode a command line argument for a subprocess call

    Byte strings (legacy callers) are first decoded as ASCII.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
644
645
def decodeArgument(b):
    """Decode a command line argument; decodeFilename() in subprocess mode"""
    return decodeFilename(b, for_subprocess=True)
648
649
def decodeOption(optval):
    """Return an option value as text (None is passed through)

    Byte strings are decoded with the preferred encoding; the result must
    be a text string.
    """
    if optval is None:
        return optval
    text = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(text, compat_str)
    return text
658
659
def formatSeconds(secs):
    """Format a duration given in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used: hours appear from 3600 seconds on
    and minutes from 60 seconds on, so 60 is rendered '1:00' and 3600 is
    rendered '1:00:00'.
    """
    # The bounds are inclusive: exactly one minute/hour rolls over
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
667
668
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is switched off when params has a true
    'nocheckcertificate' entry. Extra keyword arguments are passed on to
    the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
692
693
def bug_reports_message():
    """Return the text appended to unexpected errors: where to report the
    issue, how to update and how to produce useful output"""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    sentences = (
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    )
    return ''.join(sentences)
703
704
class YoutubeDLError(Exception):
    """Common base class of the YoutubeDL exceptions defined in this module."""
708
709
class ExtractorError(YoutubeDLError):
    """Raised when extracting information about a video fails."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ Build the error message and remember the error context.

        tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is mentioned in the message; video_id, if given,
        is put in front of it.
        """

        # Network trouble and unavailable formats are never youtube-dl bugs
        currently_handled = sys.exc_info()[0]
        if currently_handled in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors ask the user to file a bug report
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none"""
        if self.traceback is None:
            return None
        frames = traceback.format_tb(self.traceback)
        return ''.join(frames)
737
738
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported.

    The offending URL is available as the url attribute.
    """

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
744
745
class RegexNotFoundError(ExtractorError):
    """Extraction error for a regular expression that did not match"""
749
750
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    It is always an expected error; the message and the optional countries
    value are kept as the msg and countries attributes.
    """

    def __init__(self, msg, countries=None):
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.countries = countries
        self.msg = msg
761
762
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ Store the message and the optional original error.

        exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info()).
        """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
775
776
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
784
785
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task. The message is also
    available as the msg attribute.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
796
797
class MaxDownloadsReached(YoutubeDLError):
    """ Signals that the --max-downloads limit has been reached. """
801
802
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
810
811
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        super(ContentTooShortError, self).__init__(message)
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
827
828
class XAttrMetadataError(YoutubeDLError):
    """Error while writing extended attribute (xattr) metadata.

    code is the errno value of the failure, if known, and msg its message.
    From them a reason attribute is derived:
    'NO_SPACE' (no space left or quota exceeded), 'VALUE_TOO_LONG'
    (argument list too long) or 'NOT_SUPPORTED' (anything else).
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # The system message reads 'Disk quota exceeded'; the misspelled
        # variant is still accepted for callers that relied on it.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or
                'Disk quota exceeded' in self.msg or
                'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
843
844
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDL error concerning unavailable extended attributes (xattr)."""
    # NOTE(review): nothing in this file section shows where this is raised;
    # the description above is inferred from the class name - confirm.
847
848
849 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
850 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
851 # expected HTTP responses to meet HTTP/1.0 or later (see also
852 # https://github.com/rg3/youtube-dl/issues/6727)
853 if sys.version_info < (3, 0):
854 kwargs[b'strict'] = True
855 hc = http_class(*args, **kwargs)
856 source_address = ydl_handler._params.get('source_address')
857 if source_address is not None:
858 sa = (source_address, 0)
859 if hasattr(hc, 'source_address'): # Python 2.7+
860 hc.source_address = sa
861 else: # Python 2.6
862 def _hc_connect(self, *args, **kwargs):
863 sock = compat_socket_create_connection(
864 (self.host, self.port), self.timeout, sa)
865 if is_https:
866 self.sock = ssl.wrap_socket(
867 sock, self.key_file, self.cert_file,
868 ssl_version=ssl.PROTOCOL_TLSv1)
869 else:
870 self.sock = sock
871 hc.connect = functools.partial(_hc_connect, hc)
872
873 return hc
874
875
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, headers is returned as is. With it, a new dict is
    returned that lacks both the marker and any Accept-Encoding header
    (matched case-insensitively); headers itself is not modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    filtered_headers = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    del filtered_headers['Youtubedl-no-compression']
    return filtered_headers
884
885
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, going through a SOCKS proxy when the internal
        'Ytdl-socks-proxy' header asks for one"""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # The header is internal and must not be sent out
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a
        zlib-wrapped one second"""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying code, whether or not the
        addinfourl constructor accepts it"""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        response = compat_urllib_request.addinfourl(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        """Prepare req: escape its URL, add the standard headers and apply
        the internal youtube-dl headers"""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        original_url = req.get_full_url()
        escaped_url = escape_url(original_url)

        # Substitute URL if any change after escaping
        if original_url != escaped_url:
            req = update_Request(req, url=escaped_url)

        for header, value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if header.capitalize() not in req.headers:
                req.add_header(header, value)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process resp: undo gzip/deflate content encoding and
        percent-encode the Location header of redirects"""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets exactly the same pre- and post-processing
    https_request = http_request
    https_response = http_response
1015
1016
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of base_class that connects through a SOCKS proxy.

    base_class must be (a subclass of) HTTPConnection or HTTPSConnection.
    socks_proxy is the proxy URL; its scheme selects the protocol (socks5,
    socks4a, or socks/socks4), the port defaults to 1080 and user name and
    password are URL-unquoted when present.

    Raises ValueError for a proxy URL with any other scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of an unbound local further down
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, TLS is layered on top of the proxied socket
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1058
1059
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that supports a custom connection class and SOCKS proxies."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        if https_conn_class:
            self._https_conn_class = https_conn_class
        else:
            self._https_conn_class = compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, routing it through a SOCKS proxy when the internal
        Ytdl-socks-proxy header is present (the header is removed first)."""
        connection_class = self._https_conn_class
        open_kwargs = {}

        # Only forward what this Python version's HTTPSHandler provides
        if hasattr(self, '_context'):  # python > 2.6
            open_kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            open_kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            connection_class = make_socks_conn_class(connection_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1083
1084
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same handling to HTTP and HTTPS.

    http_response currently just delegates to the base class; the
    Set-Cookie escaping workaround described below is disabled.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS traffic goes through exactly the same request/response hooks
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1107
1108
def extract_timezone(date_str):
    """Split a trailing UTC offset ("Z", "+HH:MM", "-HHMM", ...) off date_str.

    Returns a (timedelta, date_str) tuple.  When an offset is found it is
    removed from the returned string; otherwise the offset is zero and the
    string is returned unchanged.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    stripped = date_str[:-len(tz_match.group('tz'))]
    sign_str = tz_match.group('sign')
    if not sign_str:
        # A bare "Z" suffix carries no numeric offset
        return datetime.timedelta(), stripped

    factor = 1 if sign_str == '+' else -1
    offset = datetime.timedelta(
        hours=factor * int(tz_match.group('hours')),
        minutes=factor * int(tz_match.group('minutes')))
    return offset, stripped
1125
1126
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date, or None if date_str
    is None or cannot be parsed.

    delimiter separates the date from the time.  When timezone (a
    timedelta) is not given, it is extracted from date_str itself.
    """
    if date_str is None:
        return None

    # Drop fractional seconds; the strptime format below has no slot for them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        utc_time = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
1144
1145
def date_formats(day_first=True):
    """Return the day-first or the month-first list of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1148
1149
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD, or None if
    date_str is None or no known format matches."""
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; if several match, the last one wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                result = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1176
1177
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for date_str, or None if date_str is None
    or cannot be parsed."""
    if date_str is None:
        return None

    text = date_str.replace(',', ' ')

    # Any "PM" marker shifts the parsed time by twelve hours
    pm_delta = 12 if re.search(r'(?i)PM', text) else 0
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    pm_offset = datetime.timedelta(hours=pm_delta)
    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - timezone + pm_offset
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Fall back to RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(text)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
    return None
1199
1200
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url; return default_ext if url is
    None or nothing plausible is found."""
    if url is None:
        return default_ext

    # Text after the last dot of the part preceding the query string
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1212
1213
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
1216
1217
def date_from_str(date_str):
    """
    Return a datetime.date from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' are accepted on their own as well.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount

    unit = match.group('unit')
    # timedelta knows neither months nor years, so approximate them in days
    approx_days = {'month': 30, 'year': 365}
    if unit in approx_days:
        amount *= approx_days[unit]
        unit = 'day'
    return today + datetime.timedelta(**{unit + 's': amount})
1245
1246
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not in 'YYYYMMDD' format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1255
1256
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound is left open (earliest/latest
        representable date)"""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1286
1287
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Byte strings are decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1296
1297
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1 and 2 to the argument passed to GetStdHandle
    # (-11 and -12 are the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device, or
        # GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; it is written
        # on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
1371
1372
def write_string(s, out=None, encoding=None):
    """Write the text string s to out (sys.stderr by default) and flush.

    Binary streams, and every stream on Python 2, receive s encoded with
    encoding (or the preferred encoding); unencodable characters are
    dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if try_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    if 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode ourselves
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1393
1394
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # Python 2: indexing a byte string yields one-character strings
    return [ord(char) for char in bs]
1402
1403
def intlist_to_bytes(xs):
    """Pack a list of integer byte values (0-255) into a byte string."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1408
1409
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using LockFileEx on
# Windows, fcntl.flock where fcntl is importable, and stubs that raise
# IOError everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Field layout handed to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count to lock, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can reuse it
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is the Win32 LOCKFILE_EXCLUSIVE_LOCK flag; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Requires a prior _lock_file call on the same file object
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1483
1484
class locked_file(object):
    """Context manager around a file that holds a lock while it is open.

    The file is opened on construction; the lock is taken on entering the
    with block (shared for mode 'r', exclusive otherwise) and released,
    together with the file itself, on leaving it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the already opened file if locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1514
1515
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when the
    interpreter reports none."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1519
1520
def shell_quote(args):
    """Return args joined into a single shell-escaped command line string.

    Byte string arguments (e.g. file names produced by encodeFilename) are
    decoded with the filesystem encoding before quoting.
    """
    # pipes.quote is deprecated and the pipes module is gone in Python 3.13;
    # shlex.quote is the same function under its documented name.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return ' '.join(quoted_args)
1530
1531
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled into url is merged into data (which is updated
    in place); on conflicts the already smuggled values win.
    """
    url, previous = unsmuggle_url(url, {})
    data.update(previous)
    payload = json.dumps(data)
    fragment = compat_urllib_parse_urlencode({'__youtubedl_smuggle': payload})
    return url + '#' + fragment
1540
1541
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # Everything after the last '#' is the urlencoded JSON payload
    url, _, sdata = smug_url.rpartition('#')
    payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1549
1550
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    Returns 'N/A' for None.  Numeric strings are converted to float first.
    The result has two decimals, e.g. format_bytes(1536) == '1.50KiB'.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: tiny fractions would
        # otherwise yield a negative index (and a bogus 'YiB' suffix) and
        # huge values an IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1563
1564
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' in s using the multipliers in unit_table.

    The number may use ',' or '.' as decimal separator.  Returns the
    scaled value as an int, or None if s does not start with a number
    followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    number_text = found.group('num').replace(',', '.')
    multiplier = unit_table[found.group('unit')]
    return int(float(number_text) * multiplier)
1574
1575
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into a number of
    bytes; return None if s is None or has no recognized unit."""
    if s is None:
        return None

    # (prefix letter, decimal unit name, binary unit name), in ascending order
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        _UNIT_TABLE[letter + 'iB'] = binary
        _UNIT_TABLE[letter + 'B'] = decimal
        _UNIT_TABLE[lower + 'B'] = binary
        _UNIT_TABLE[letter + 'b'] = decimal
        _UNIT_TABLE[lower + 'b'] = decimal
        _UNIT_TABLE[decimal_name + 'bytes'] = decimal
        _UNIT_TABLE[binary_name + 'bytes'] = binary

    return lookup_unit_table(_UNIT_TABLE, s)
1645
1646
def parse_count(s):
    """Parse a count such as '1,000', '1.2k' or '3M' into an int; return
    None if s is None or is not understood."""
    if s is None:
        return None

    text = s.strip()

    # Only digits and separators: no unit suffix to take care of
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand, million = 1000, 1000 ** 2
    _UNIT_TABLE = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }

    return lookup_unit_table(_UNIT_TABLE, text)
1666
1667
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month by its full name in the given
    language, or None if the name is unknown.

    Languages missing from MONTH_NAMES fall back to English. """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in names:
        return names.index(name) + 1
    return None
1677
1678
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month by its three-letter English
    abbreviation (locale-independently), or None if it is unknown """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
1687
1688
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an XML entity or a numeric
    character reference by '&amp;'"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1695
1696
def setproctitle(title):
    """Set the process name to title via prctl() from libc.so.6.

    This is best effort: nothing happens on Jython, when libc.so.6 cannot
    be loaded, or when the library offers no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes makes it one byte longer than
    # the title, so the C string handed to prctl is NUL-terminated.  A
    # buffer of exactly len(title_bytes) has no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1721
1722
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged when it
    is None or does not begin with start."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1725
1726
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged when it
    is None or does not finish with end."""
    if s is None or not s.endswith(end):
        return s
    # Slice by absolute length: s[:-len(end)] would be s[:0] == '' for an
    # empty suffix
    return s[:len(s) - len(end)]
1729
1730
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing s, if any."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1738
1739
def url_basename(url):
    """Return the last segment of the path of url, ignoring surrounding slashes."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1743
1744
def base_url(url):
    """Return url up to and including the last '/' that precedes any
    '?', '#' or '&'.

    Only http(s) URLs are supported; anything else fails with an
    AttributeError."""
    matched = re.match(r'https?://[^?#&]+/', url)
    return matched.group()
1747
1748
def urljoin(base, path):
    """Join path onto base.

    Returns path itself when it is already an absolute or protocol-relative
    http(s) URL, and None when path is empty or not a text string, or when
    base is not a text string holding such a URL.
    """
    absolute_re = r'^(?:https?:)?//'
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(absolute_re, path):
        return path
    if not isinstance(base, compat_str):
        return None
    if not re.match(absolute_re, base):
        return None
    return compat_urlparse.urljoin(base, path)
1757
1758
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports 'HEAD'."""

    def get_method(self):
        return 'HEAD'
1762
1763
class PUTRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports 'PUT'."""

    def get_method(self):
        return 'PUT'
1767
1768
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    If get_attr is given, the attribute of that name is read from v first.
    The converted value is multiplied by invscale and floor-divided by
    scale.  None, '' and anything int() rejects yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError covers objects int() cannot handle at all (lists, dicts, ...)
        return default
1781
1782
def str_or_none(v, default=None):
    """Return v converted to a text string, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1785
1786
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before the conversion;
    None is passed through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1793
1794
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when that is not possible.

    The converted value is multiplied by invscale and divided by scale.
    None and anything float() rejects yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError covers objects float() cannot handle at all (lists, dicts, ...)
        return default
1802
1803
def strip_or_none(v):
    """Return v.strip(), or None if v is None."""
    if v is None:
        return None
    return v.strip()
1806
1807
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in turn: colon-separated ('1:02:03.5'),
    unit-suffixed ('1h 2min 3s', 'PT1H2M3S') and a single possibly
    fractional amount of hours or minutes ('1.5 hours').  Returns None if
    s is not a string or matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    colon_match = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if colon_match:
        days, hours, mins, secs, ms = colon_match.groups()
    else:
        unit_match = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if unit_match:
            days, hours, mins, secs, ms = unit_match.groups()
        else:
            single_match = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not single_match:
                return None
            hours, mins = single_match.groups()

    # Add up whichever components were captured
    total = 0
    if secs:
        total += float(secs)
    if mins:
        total += float(mins) * 60
    if hours:
        total += float(hours) * 60 * 60
    if days:
        total += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its leading dot, i.e. it is a fraction of a second
        total += float(ms)
    return total
1854
1855
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the extension of filename.

    If expected_real_ext is given and the actual extension differs from
    it, ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
1862
1863
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename by ext.

    If expected_real_ext is given and the actual extension differs from
    it, ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        stem = name
    else:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
1869
1870
def check_executable(exe, args=[]):
    """ Checks if the given binary can be run, and returns its name
    (False if launching it fails with an OSError).
    args can be a list of arguments for a short output (like -version) """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1879
1880
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    # The captured output arrives as bytes; non-ASCII bytes are dropped
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1898
1899
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    version_re must capture the version in its first group and defaults
    to matching 'version <something>'.  Returns unrecognized if the
    pattern is not found.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    found = re.search(version_re, output)
    if not found:
        return unrecognized
    return found.group(1)
1909
1910
class PagedList(object):
    """Base class of paged lists; getslice() is not defined here and has
    to be supplied elsewhere."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1915
1916
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily through pagefunc(pagenum).

    With use_cache the result of every fetched page is remembered.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def _load_page(self, pagenum):
        # Return the entries of one page, consulting the cache if enabled
        cached = self._cache.get(pagenum) if self._use_cache else None
        if cached is not None:
            return cached
        fetched = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = fetched
        return fetched

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = self._load_page(pagenum)

            # Offset of the first wanted entry inside this page
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize
            else:
                startv = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
1967
1968
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        collected = []
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the first fetched page
        to_skip = start - first_page * self._pagesize

        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1996
1997
def uppercase_escape(s):
    """Decode every \\UXXXXXXXX escape sequence (eight hex digits) found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2004
2005
def lowercase_escape(s):
    """Decode every \\uXXXX escape sequence (four hex digits) found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2012
2013
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 the text string is converted to UTF-8 bytes before quoting
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2019
2020
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host part is IDNA-encoded; path, params, query and fragment are
    percent-encoded."""
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped.geturl()
2031
2032
def read_batch_urls(batch_fd):
    """Read the URLs listed in batch_fd, one per line, and close it.

    Lines may be byte or text strings.  A leading byte order mark and
    surrounding whitespace are removed; empty lines and lines starting
    with '#', ';' or ']' are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The UTF-8 BOM shows up as '\xef\xbb\xbf' when its bytes were
        # decoded one by one (e.g. as latin-1), but as '\ufeff' when they
        # were decoded as UTF-8, which is what the decode above produces
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2047
2048
def urlencode_postdata(*args, **kargs):
    """Urlencode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2051
2052
def update_url_query(url, query):
    """Return url with the parameters in query added to its query string,
    replacing existing parameters of the same name.

    url is returned unchanged when query is empty.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2061
2062
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Return a copy of req with some of its parts replaced.

    headers are merged into the existing headers, query is merged into
    the query string of the (new or existing) URL, and data replaces the
    body when truthy.  HEAD and PUT requests keep their method.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)

    # Pick the request class that reports the same method as the original
    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = compat_urllib_request.Request

    new_req = request_class(
        new_url, data=new_data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2081
2082
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up a single key, or the first usable one of several keys, in d.

    For a list/tuple of keys, missing keys and None values are skipped,
    and so are falsy values unless skip_false_values is off; default is
    returned if nothing is left.  A single key is a plain d.get().
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
2091
2092
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None if the call fails with an
    AttributeError, KeyError, TypeError or IndexError, or if the result
    is not an instance of expected_type (when given)."""
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is not None and not isinstance(value, expected_type):
        return None
    return value
2101
2102
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as compat_str, decoding it with encoding if needed.

    Note that the default encoding is evaluated once, when the function is
    defined.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2105
2106
# Rating label -> age limit, as looked up by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2114
2115
# TV rating label -> age limit, used by parse_age_limit() as the last lookup
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2124
2125
def parse_age_limit(s):
    """Turn an age rating into a numeric age limit, or None.

    Accepts an int (kept when within 0..21), a string such as '18' or
    '18+', or a label found in US_RATINGS or TV_PARENTAL_GUIDELINES.
    Anything else gives None.
    """
    # Exact type comparison on purpose: subclasses of int (e.g. bool) are
    # not treated as ages
    if type(s) == int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    age_match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if age_match is not None:
        return int(age_match.group('age'))
    try:
        return US_RATINGS[s]
    except KeyError:
        return TV_PARENTAL_GUIDELINES.get(s)
2137
2138
def strip_jsonp(code):
    """Remove a JSONP wrapper, leaving only the callback's argument.

    'callback(<payload>);' (optionally followed by // comments) becomes
    '<payload>'. Input that does not look like JSONP is returned unchanged.
    """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
2142
2143
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    The conversion is token based: single- and double-quoted strings are
    re-emitted as double-quoted JSON strings, bare identifiers are quoted,
    comments and trailing commas are dropped, and unquoted hexadecimal and
    octal integer literals (optionally used as keys) are rewritten in
    decimal. Everything else is copied through untouched.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            # comments and trailing commas have no JSON counterpart
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the string body for a double-quoted JSON string
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            # Only unquoted tokens may be integer literals; the contents of
            # a quoted string such as "010" must stay a string
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2183
2184
def qualities(quality_ids):
    """Return a function ranking a quality id by its position in quality_ids.

    The returned function gives the index of the id within quality_ids, or
    -1 for an id that is not listed.
    """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
2193
2194
# %-style template combining the title, id and ext fields (presumably the
# default output file name pattern; confirm against its users)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2196
2197
def limit_length(s, length):
    """Shorten s to length characters, ending it with '...' when cut.

    None is passed through, and strings that already fit are returned
    unchanged.
    """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
2206
2207
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints.

    Raises ValueError when a component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2210
2211
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either version cannot be parsed, the answer
    is the opposite of assume_new.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
2219
2220
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True for frozen executables (sys.frozen is set) ...
    if hasattr(sys, 'frozen'):
        return True
    # ... and when this module was loaded by a zipimporter
    module_loader = globals().get('__loader__')
    return isinstance(module_loader, zipimporter)
2226
2227
def args_to_str(args):
    """Return a short, shell-quoted string for a subprocess command."""
    quoted_args = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
2231
2232
def error_to_compat_str(err):
    """Return the message of err as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On Python 2 str() gives a byte string, which has to be decoded with
    # the preferred encoding rather than ascii
    return message.decode(preferredencoding())
2240
2241
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    Parameters (anything after ';'), surrounding whitespace and letter case
    are ignored. A few full types have a dedicated extension; otherwise the
    subtype is translated through a table and, when it is not listed, used
    as the extension itself. Returns None when mt is None.
    """
    if mt is None:
        return None

    # Normalise before any lookup so that values such as
    # 'audio/mp4; codecs="mp4a.40.2"' still hit the full-type table and a
    # '/' inside a parameter value cannot be mistaken for the type separator
    mimetype = mt.split(';')[0].strip().lower()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mimetype)
    if ext is not None:
        return ext

    _, _, res = mimetype.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }.get(res, res)
2278
2279
def parse_codecs(codecs_str):
    """Split an RFC 6381 "codecs" value into video and audio codec.

    Returns a dict with the keys 'vcodec' and 'acodec' ('none' marks a
    missing stream), or an empty dict when nothing can be determined. The
    first recognised codec of each kind wins; unknown entries produce a
    warning on stderr. When exactly two entries are given and neither is
    recognised, they are taken as video and audio codec, in that order.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = [
        entry for entry in (
            part.strip() for part in codecs_str.strip().strip(',').split(','))
        if entry]
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(split_codecs) == 2:
        # Nothing recognised: vcodec and acodec are both None here, so the
        # parsed entries themselves have to be reported
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
2314
2315
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of a response.

    The file name of an 'attachment' Content-Disposition header is tried
    first; the Content-Type header is the fallback.
    """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"',
            content_disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2328
2329
def encode_data_uri(data, mime_type):
    """Return a base64 'data:' URI carrying data (bytes) as mime_type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2332
2333
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is set on either side
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2342
2343
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The UTF-32 marks come before the UTF-16 ones because the UTF-16-LE
    # mark is a prefix of the UTF-32-LE mark
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is taken to be UTF-8
    encoding, payload = 'utf-8', first_bytes
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = enc, first_bytes[len(bom):]
            break
    text = payload.decode(encoding, 'replace')

    # The match object (or None) is returned as is
    return re.match(r'^\s*<', text)
2362
2363
def determine_protocol(info_dict):
    """Return the protocol of the 'url' entry of info_dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the URL prefix (rtmp, mms, rtsp), then from the extension (m3u8,
    f4m), and finally from the URL scheme.
    """
    explicit_protocol = info_dict.get('protocol')
    if explicit_protocol is not None:
        return explicit_protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2384
2385
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    column_widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # Every column but the last is left-aligned and padded to its widest
    # cell plus one; the last column is printed as is
    padded_columns = [
        '%-' + compat_str(width + 1) + 's' for width in column_widths[:-1]]
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2392
2393
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against dct.

    Two forms are understood: a comparison ('key OP value', with '?' after
    the operator making the test pass-through for a missing key) and a
    presence test ('key' or '!key'). Raises ValueError for anything else,
    for a non-equality operator applied to a string and for an unparsable
    number.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    comparison_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = comparison_rex.search(filter_part)
    if m:
        op_symbol = m.group('op')
        compare = COMPARISON_OPERATORS[op_symbol]
        actual_value = dct.get(m.group('key'))
        intval = m.group('intval')
        strval = m.group('strval')
        quotedstrval = m.group('quotedstrval')
        # If the original field is a string and matching comparisonvalue is
        # a number we should respect the origin of the original field
        # and process comparison value as a string (see
        # https://github.com/rg3/youtube-dl/issues/11082).
        compare_as_string = (
            quotedstrval is not None or
            strval is not None or
            (actual_value is not None and intval is not None and
             isinstance(actual_value, compat_str)))
        if compare_as_string:
            if op_symbol not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_symbol)
            comparison_value = quotedstrval or strval or intval
            quote = m.group('quote')
            if quote is not None:
                # Unescape the quote character inside a quoted value
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            try:
                comparison_value = int(intval)
            except ValueError:
                # Not a plain integer: try a file size, with and without
                # an explicit trailing 'B'
                comparison_value = parse_filesize(intval)
                if comparison_value is None:
                    comparison_value = parse_filesize(intval + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            intval, filter_part))
        if actual_value is None:
            return m.group('none_inclusive')
        return compare(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    unary_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = unary_rex.search(filter_part)
    if m:
        check = UNARY_OPERATORS[m.group('op')]
        return check(dct.get(m.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
2462
2463
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The filter passes only if every '&'-separated part passes
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2469
2470
def match_filter_func(filter_str):
    """Build a callable checking an info dict against filter_str.

    The callable returns None when the info dict passes the filter and a
    message naming the video otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2479
2480
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds (float).

    Supported are plain offsets such as '12.5' or '12.5s' and clock times
    such as 'HH:MM:SS', 'HH:MM:SS.mmm' or 'HH:MM:SS:mmm'. Returns None for
    empty or unrecognised input.
    """
    if not time_expr:
        return

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        # the fraction may be separated by ':' instead of '.'
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2492
2493
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode, 'HH:MM:SS,mmm'."""
    # %d drops the fractional part of each component
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    whole_seconds = seconds % 60
    milliseconds = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, whole_seconds, milliseconds)
2496
2497
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) into SRT text.

    Paragraphs without a begin time, or with neither an end time nor a
    duration, are left out. Raises ValueError when the document contains no
    paragraph at all.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target collecting the text of a paragraph; line break
        # elements become newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialise the node and feed it through the collecting target
        xml_parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        xml_parser.feed(xml.etree.ElementTree.tostring(node))
        return xml_parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Use the first namespace variant that yields any paragraph
    paras = (
        dfxp.findall(_x('.//ttml:p')) or
        dfxp.findall(_x('.//ttaf1:p')) or
        dfxp.findall(_x('.//ttaf1_0604:p')) or
        dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    # Cue numbers follow the paragraph position, so skipped paragraphs
    # leave gaps in the numbering
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2551
2552
def cli_option(params, command_option, param):
    """Build the argument list for an option taking a value.

    Returns [command_option, value] when params holds a value for param
    (truthy values are converted to compat_str), and [] when it is missing
    or None.
    """
    value = params.get(param)
    if value is None:
        return []
    if value:
        value = compat_str(value)
    return [command_option, value]
2558
2559
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argument list for an option taking a boolean value.

    The value stored under param must be a bool (checked with an assert).
    Option and value are joined into one argument when separator is given,
    and returned as two arguments otherwise.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered_value = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered_value]
    return [command_option, rendered_value]
2566
2567
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2571
2572
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra command line arguments stored under param.

    When params has no value for param, default is used instead. A list
    default is returned as a fresh copy, so that callers extending the
    result cannot alter the shared default of this function (or their own
    default list).

    The stored value must be a list (checked with an assert).
    """
    ex_args = params.get(param)
    if ex_args is None:
        # Never hand out the default list object itself
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2579
2580
class ISO639Utils(object):
    """Conversion between two- and three-letter language codes."""

    # Two-letter code -> three-letter code
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so 'en-US' is
        # treated like 'en'
        two_letter_code = code[:2]
        return cls._lang_map.get(two_letter_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup; None when the code is not in the table
        return next(
            (short_name
             for short_name, long_name in cls._lang_map.items()
             if long_name == code),
            None)
2781
2782
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # Two-letter country code -> country name
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the full name

        The lookup ignores letter case; None is returned for unknown codes.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
3041
3042
class GeoUtils(object):
    """Helper producing IPv4 addresses from per-country address blocks."""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code):
        """Return a random IPv4 address from the block listed for code.

        code is looked up case-insensitively; None is returned when no
        block is listed for it.
        """
        block = cls._country_ip_map.get(code.upper())
        if not block:
            return None
        network, prefix_length = block.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting all host bits gives the last address of the block
        highest = lowest | (0xffffffff >> int(prefix_length))
        packed_address = compat_struct_pack(
            '!L', random.randint(lowest, highest))
        return compat_str(socket.inet_ntoa(packed_address))
3295
3296
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """
    ProxyHandler whose proxy can be overridden for a single request

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to
    use for that request only; the header is removed once it has been read.
    """

    def __init__(self, proxies=None):
        # Install http_open/https_open defaults that go through proxy_open
        # with the '__noproxy__' marker
        for scheme in ('http', 'https'):
            setattr(
                self, '%s_open' % scheme,
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """
        Resolve the proxy for req and delegate to ProxyHandler when needed

        Returns None when no proxy applies or when a SOCKS proxy was
        recorded in the 'Ytdl-socks-proxy' header; otherwise returns
        whatever ProxyHandler.proxy_open returns.
        """
        socks_schemes = ('socks', 'socks4', 'socks4a', 'socks5')

        # A per-request header takes precedence over the handler-wide proxy
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            proxy = override
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            # No proxy
            return None

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in socks_schemes:
            # Only record the proxy here: youtube-dl's http/https handlers
            # are the ones wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3320
3321
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Encrypt one block with OHDave's RSA algorithm.
    See http://www.ohdave.com/rsa/

    Input:
        data: bytes-like object holding the block to encrypt
        exponent, modulus: integers, parameters e and N of the RSA algorithm
    Output: the encrypted block as a hex string

    Limitation: only a single block can be encrypted
    """
    # The bytes are reversed before being read as one hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '%x' % cipher_number
3337
3338
def encode_base_n(num, n, table=None):
    """
    Convert a non-negative integer to its representation in base n

    Input:
        num: non-negative integer to convert
        n: the base
        table: optional sequence of digit characters, where digit value i
            is written as table[i]. When omitted (or empty) the first n
            characters of 0-9, a-z, A-Z are used.
    Output: the base-n string; zero is encoded as table[0]

    Raises ValueError when n exceeds the table length, and when a non-zero
    num is negative or n is below 2: with floor division those inputs
    never reach zero (or divide by zero), so they are rejected up front.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small to encode %d' % (n, num))

    # Collect digits least significant first, then flip them once at the end
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
3355
3356
def decode_packed_codes(code):
    """
    Expand packed code found in the given string

    The packed payload is located with PACKED_CODES_RE, which yields the
    obfuscated code, the base, the symbol count and the '|'-separated
    symbols. Every word in the obfuscated code is then replaced by its
    entry in the symbol table and the resulting string is returned.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, radix, symbol_count, packed_symbols = match.groups()
    radix, symbol_count = int(radix), int(symbol_count)
    words = packed_symbols.split('|')

    replacements = {}
    # Visit the indices from the highest one down to zero
    for index in range(symbol_count - 1, -1, -1):
        token = encode_base_n(index, radix)
        # An empty symbol means the token stands for itself
        replacements[token] = words[index] or token

    def expand(word_match):
        return replacements[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', expand, obfuscated_code)
3373
3374
def parse_m3u8_attributes(attrib):
    """
    Parse a comma-separated KEY=value attribute string into a dict

    Values may be bare or enclosed in double quotes (in which case they may
    contain commas); the surrounding quotes are dropped from the result.
    When a key occurs several times the last value is kept.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    parsed = {}
    for match in re.finditer(attribute_re, attrib):
        name, raw_value = match.group('key', 'val')
        parsed[name] = raw_value[1:-1] if raw_value.startswith('"') else raw_value
    return parsed
3382
3383
def urshift(val, n):
    """
    Shift val right by n bits, treating a negative val as unsigned

    A negative val has 2**32 added to it before the shift; non-negative
    values are shifted as they are.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
3386
3387
3388 # Based on png2str() written by @gdkchan and improved by @yokrysty
3389 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """
    Decode PNG data into rows of unfiltered byte values

    Returns a (width, height, pixels) tuple where pixels is a list of
    height rows, each a list of width * 3 integers. Every scanline is read
    as 3 bytes per pixel; the bit depth and color type stored in IHDR are
    not inspected, and chunk CRCs are skipped without being verified.

    Raises IOError when the signature or the leading IHDR chunk is missing,
    or when the data holds no IDAT payload.
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature, remaining = png_data[:8], png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        return compat_struct_unpack(formats_by_size[len(raw)], raw)[0]

    # Chunk layout: 4-byte length, 4-byte type, payload, 4-byte CRC
    chunks = []
    while remaining:
        size = read_uint(remaining[:4])
        chunks.append({
            'type': remaining[4:8],
            'length': size,
            'data': remaining[8:8 + size],
        })
        # Step over the payload and the CRC that follows it
        remaining = remaining[8 + size + 4:]

    ihdr = chunks[0]['data']
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    # The compressed image may be spread over several IDAT chunks
    idat = b''.join(
        chunk['data'] for chunk in chunks if chunk['type'] == b'IDAT')

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for row_index in range(height):
        # Each scanline is one filter-type byte followed by stride bytes
        row_start = row_index * (1 + stride)
        filter_type = decompressed_data[row_start]

        previous_row = pixels[row_index - 1] if row_index > 0 else None
        current_row = []
        pixels.append(current_row)

        for col in range(stride):
            color = decompressed_data[1 + row_start + col]

            # Neighbours lying outside the image count as zero
            has_left = col > 2
            has_up = previous_row is not None
            left = current_row[col - 3] if has_left else 0
            up = previous_row[col] if has_up else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[col - 3] if has_left and has_up else 0

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                # Use the neighbour closest to the estimate; ties are
                # resolved in the order left, up, upper left
                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
                color = (color + predictor) & 0xff

            current_row.append(color)

    return width, height, pixels
3493
3494
def write_xattr(path, key, value):
    """
    Set the extended attribute key of the file at path to value

    The first available mechanism is used: the python xattr module (either
    pyxattr or xattr), NTFS Alternate Data Streams when compat_os_name is
    'nt', and finally the 'setfattr' or 'xattr' command line tools. value
    is written as is by the first two and decoded as UTF-8 for the tools.

    Raises XAttrMetadataError when the attribute can not be written and
    XAttrUnavailableError when no usable mechanism is found.
    """
    try:
        # First choice: a python xattr module
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # python-pyxattr only supports unicode arguments from
            # version 0.5.0 on
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))
            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as err:
            raise XAttrMetadataError(err.errno, err.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as stream:
                    stream.write(value)
            except EnvironmentError as err:
                raise XAttrMetadataError(err.errno, err.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        # The command line tools are given the value as text
        value = value.decode('utf-8')
        if user_has_setfattr:
            executable, opts = 'setfattr', ['-n', key, '-v', value]
        else:
            executable, opts = 'xattr', ['-w', key, value]

        cmd = (
            [encodeFilename(executable, True)] +
            [encodeArgument(opt) for opt in opts] +
            [encodeFilename(path, True)])

        try:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        _, stderr = proc.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if proc.returncode != 0:
            raise XAttrMetadataError(proc.returncode, stderr)