]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
debian/changelog: Prepare for release.
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import socket
28 import ssl
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_HTMLParser,
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_html_entities_html5,
43 compat_http_client,
44 compat_kwargs,
45 compat_os_name,
46 compat_parse_qs,
47 compat_shlex_quote,
48 compat_socket_create_connection,
49 compat_str,
50 compat_struct_pack,
51 compat_struct_unpack,
52 compat_urllib_error,
53 compat_urllib_parse,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
58 compat_urlparse,
59 compat_xpath,
60 )
61
62 from .socks import (
63 ProxyType,
64 sockssocket,
65 )
66
67
def register_socks_protocols():
    """Add the SOCKS proxy schemes to ``compat_urlparse.uses_netloc``.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not listed in
    urlparse.uses_netloc are not handled correctly. Schemes that are
    already listed are left alone.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme in netloc_schemes:
            continue
        netloc_schemes.append(socks_scheme)
75
76
# The type of a compiled regular expression is not clearly defined
# otherwise, so take it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default request headers. YoutubeDLHandler.http_request adds each of them
# to a request that does not already carry the header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name.
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel for "no default supplied", so that None can itself be passed as
# a default value (see the ``default`` parameter of xpath_element).
NO_DEFAULT = object()
95
# Month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]

# Month names per language code, each list in calendar order.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril',
        'mai', 'juin', 'juillet', 'août',
        'septembre', 'octobre', 'novembre', 'décembre',
    ],
}

# File extensions (without the leading dot) treated as known.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil',
)

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement; the replacements
# are chained in the same order as the characters of the first string, with
# the multi-letter ones ('AE', 'OE', 'ss', 'ae', 'oe') given as list items.
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
126
# strptime-style formats that do not depend on day/month ordering.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

# The common formats followed by the ambiguous numeric ones, read day first.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) + [
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
]

# The common formats followed by the ambiguous numeric ones, read month first.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) + [
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
]

# Captures four groups: a quoted payload, two integers and a quoted
# '|'-separated list, as found in "}('...',N,M,'a|b|c'.split('|')".
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
181
182
def preferredencoding():
    """Get preferred encoding.

    Returns locale.getpreferredencoding() when that call succeeds and the
    reported encoding can actually encode text; otherwise falls back to
    'UTF-8'.
    """
    try:
        candidate = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken encoding name raises here
        'TEST'.encode(candidate)
    except Exception:
        return 'UTF-8'
    return candidate
196
197
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the exception is re-raised.
    """

    fn = encodeFilename(fn)
    is_py2 = sys.version_info < (3, 0)

    tmp_prefix = os.path.basename(fn)
    tmp_dir = os.path.dirname(fn)
    if is_py2 and sys.platform != 'win32':
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use unicode objects
        fs_encoding = get_filesystem_encoding()
        tmp_prefix = tmp_prefix.decode(fs_encoding)
        tmp_dir = tmp_dir.decode(fs_encoding)

    tmp_options = {
        'suffix': '.tmp',
        'prefix': tmp_prefix + '.',
        'dir': tmp_dir,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if is_py2:
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_options))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; the original error wins
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
250
251
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # No attribute predicates are used here: filter the matches by hand
        for candidate in node.findall(compat_xpath(xpath)):
            if key in candidate.attrib and (val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        With val=None any element matching xpath that carries the
        attribute key is accepted. Returns the first match or None.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
266
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' path components to '{namespace}tag' form.

    path is split on '/'; every component of the form 'prefix:tag' is
    rewritten using ns_map[prefix], components without ':' are kept as-is.
    """
    def _expand(component):
        pieces = component.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([_expand(component) for component in path.split('/')])
281
282
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element of node matching xpath.

    xpath is either a single expression (a string) or an iterable of
    expressions that are tried in order until one matches.

    When nothing matches: default is returned if one was given; otherwise
    ExtractorError is raised if fatal is set (name, or xpath when name is
    None, is used in the message); otherwise None is returned.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Start from "not found" so that an empty iterable of expressions
        # takes the regular not-found path instead of leaving n unbound
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
304
305
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element.

    Whatever xpath_element returns for "not found" (None or default) is
    passed through unchanged. If the element exists but has no text:
    default is returned if given, else ExtractorError is raised if fatal
    is set, else None is returned.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
319
320
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath[@key].

    When no such element exists: default is returned if given, else
    ExtractorError is raised if fatal is set, else None is returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]

    if default is not NO_DEFAULT:
        return default
    if not fatal:
        return None
    if name is None:
        name = '%s[@%s]' % (xpath, key)
    raise ExtractorError('Could not find XML attribute %s' % name)
332
333
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is an attribute lookup on the 'id' attribute
    content = get_element_by_attribute('id', id, html)
    return content
337
338
def get_element_by_class(class_name, html):
    """Return the content of the tag with the specified class in the passed HTML document"""
    # class_name must appear as a whole word anywhere inside the (possibly
    # multi-valued) class attribute; the value is already a regex, so it
    # must not be escaped again
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute(
        'class', class_value_re, html, escape_value=False)
343
344
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document

    value is matched literally unless escape_value is False, in which case
    it is used as a regular expression fragment. Returns None when no tag
    matches; otherwise the HTML-unescaped content of the matched tag.
    """

    if escape_value:
        value = re.escape(value)

    # Verbose pattern: opening tag with the wanted attribute somewhere
    # among its attributes, lazily captured content, matching closing tag
    element_re = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value)

    mobj = re.search(element_re, html)
    if mobj is None:
        return None

    content = mobj.group('content')
    # Content starting with a quote character loses its first and last
    # character
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
368
369
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the most recently seen start tag ({} until then)
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Store the (name, value) pairs of the start tag as a dict."""
        self.attrs = dict(attrs)
378
379
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dictionary.

    Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    the result is
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
400
401
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Literal newlines become spaces, <br> tags and </p><p> boundaries
    become newlines, all remaining tags are dropped and HTML entities are
    decoded. None is passed through unchanged.
    """

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text).strip()
417
418
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    The filename '-' selects standard output (its binary buffer when it
    has one). Otherwise the file is opened as given; if that fails for a
    reason other than EACCES, the name is passed through sanitize_path()
    and, if that changed it, opening is retried once with the new name.
    Errors that cannot be worked around are raised like with the standard
    open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            if hasattr(sys.stdout, 'buffer'):
                return (sys.stdout.buffer, filename)
            return (sys.stdout, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        # A permission problem is not fixed by renaming
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise

        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
449
450
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
458
459
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        codepoint = ord(char)
        # '?', control characters and DEL are dropped
        if char == '?' or codepoint < 32 or codepoint == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or codepoint > 127):
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    result = re.sub(r'_{2,}', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result or '_'
498
499
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On any other platform s is returned unchanged. On Windows the drive or
    UNC prefix is kept, the rest is normalized, and within every path
    component other than '.' and '..' the forbidden characters as well as
    a trailing whitespace character or dot are replaced with '#'.
    """
    if sys.platform != 'win32':
        return s

    drive_or_unc = os.path.splitdrive(s)[0]
    if not drive_or_unc and sys.version_info < (2, 7):
        drive_or_unc = os.path.splitunc(s)[0]

    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)

    sanitized_parts = []
    for part in parts:
        if part not in ('.', '..'):
            part = re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', part)
        sanitized_parts.append(part)

    if drive_or_unc:
        sanitized_parts.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_parts)
516
517
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    """Return url with 'http:' prepended if it starts with '//'."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
522
523
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request for url after sanitize_url().

    All further positional and keyword arguments are passed on unchanged.
    """
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
526
527
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element in
    its original order. Elements are compared with ==, so they do not
    have to be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
535
536
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    entity_with_semicolon is the entity without the leading '&' but with
    the trailing ';' (e.g. 'amp;' or '#x27;'). Named entities are looked
    up in the HTML4 and HTML5 tables, numeric ones (decimal or 'x'-prefixed
    hexadecimal) are converted to the corresponding character. Anything
    that cannot be decoded is returned in its literal '&...;' form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        # Out-of-range code points raise ValueError; numbers too large for
        # the underlying C integer raise OverflowError instead. Both mean
        # "not a valid character", so keep the entity literal.
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
566
567
def unescapeHTML(s):
    """Replace the HTML entities in s by the characters they stand for.

    None is passed through; any other value must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode_entity(mobj):
        # group(1) is the entity without '&' but including the ';'
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^;]+;)', _decode_entity, s)
575
576
def get_subprocess_encoding():
    """Return the encoding to use for subprocess arguments.

    On Windows (major version >= 5) this is preferredencoding(); elsewhere
    it is sys.getfilesystemencoding(). 'utf-8' is used when the chosen
    source reports None.
    """
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if on_modern_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
587
588
def encodeFilename(s, for_subprocess=False):
    """
    Encode a file name for use with OS-level APIs.

    @param s The name of the file
    @param for_subprocess Whether the name is going to be passed to a
                          subprocess (disables the Windows shortcut below)

    The name is returned unchanged where a Unicode API is available and
    is encoded with get_subprocess_encoding() otherwise (characters that
    cannot be encoded are ignored).
    """

    assert type(s) == compat_str

    unicode_api_available = (
        # Python 3 has a Unicode API
        sys.version_info >= (3, 0) or
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        (not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5) or
        # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
        sys.platform.startswith('java'))
    if unicode_api_available:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
611
612
def decodeFilename(b, for_subprocess=False):
    """Decode a file name obtained from OS-level APIs.

    On Python 3, and for anything that is not a bytes object, b is
    returned unchanged. Otherwise it is decoded with
    get_subprocess_encoding(), ignoring undecodable bytes.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
622
623
def encodeArgument(s):
    """Encode a command line argument via encodeFilename(s, True).

    Byte strings are still accepted and are decoded as ASCII first.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
631
632
def decodeArgument(b):
    """Decode a command line argument (decodeFilename in subprocess mode)."""
    return decodeFilename(b, for_subprocess=True)
635
636
def decodeOption(optval):
    """Return an option value as text.

    None is passed through; bytes are decoded with preferredencoding().
    The resulting value must be a compat_str.
    """
    if optval is None:
        return None

    text = optval
    if isinstance(text, bytes):
        text = text.decode(preferredencoding())

    assert isinstance(text, compat_str)
    return text
645
646
def formatSeconds(secs):
    """Format a duration given in seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The hour and minute fields appear as soon as the duration reaches a
    full hour or a full minute respectively, so 60 gives '1:00' and 3600
    gives '1:00:00'.
    """
    # >= rather than >: exactly one minute/hour must already roll over
    # into the next field ('1:00', not '60'; '1:00:00', not '60:00')
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
654
655
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is switched off when
    params['nocheckcertificate'] is true. Extra keyword arguments are
    passed on to YoutubeDLHTTPSHandler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        default_context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            default_context.check_hostname = False
            default_context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=default_context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    tls_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        tls_context.verify_mode = ssl.CERT_NONE
    else:
        tls_context.verify_mode = ssl.CERT_REQUIRED
    tls_context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=tls_context, **kwargs)
679
680
def bug_reports_message():
    """Return the text asking the user to report a bug.

    The update hint depends on ytdl_is_updateable(). The text starts with
    '; ' so that it can be appended directly to an error message.
    """
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join([
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ])
690
691
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.

        video_id, if given, is prepended to the message; cause, if given,
        is appended to it. Unexpected errors additionally get the bug
        report instructions appended.
        """

        # Errors raised while handling a network error or an unavailable
        # video are always treated as expected
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as one string, or None if there is none."""
        tb = self.traceback
        if tb is None:
            return None
        return ''.join(traceback.format_tb(tb))
719
720
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        # The offending URL
        self.url = url
726
727
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match

    Behaves exactly like ExtractorError; the separate type only allows
    callers to tell this case apart.
    """
731
732
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
745
746
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
754
755
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class explicitly; the message is also kept
        # as the msg attribute
        Exception.__init__(self, msg)
        self.msg = msg
765
766
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
770
771
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
779
780
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Pass both values on so that args stays (downloaded, expected)
        super(ContentTooShortError, self).__init__(downloaded, expected)
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected

    def __str__(self):
        # Without this the message would be the bare tuple of the two numbers
        return 'Downloaded %s bytes, expected %s bytes' % (self.downloaded, self.expected)
793
794
class XAttrMetadataError(Exception):
    """Error raised when writing extended attribute metadata fails.

    Besides code and msg, the instance carries a reason attribute derived
    from them: 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # 'Disk quota exceeded' is the correctly spelt message; the
        # historical misspelling is still accepted for compatibility
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or
                'Disk quota exceeded' in self.msg or
                'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
809
810
class XAttrUnavailableError(Exception):
    """Plain exception type; it adds nothing to Exception."""
813
814
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the configured source address.

    The connection is created as http_class(*args, **kwargs). If
    ydl_handler._params contains a 'source_address', the connection is
    set up to connect from (source_address, 0).
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)

    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return hc

    sa = (source_address, 0)
    if hasattr(hc, 'source_address'):  # Python 2.7+
        hc.source_address = sa
        return hc

    # Python 2.6: the connection has no source_address attribute, so
    # replace its connect() with one that binds the socket explicitly
    def _hc_connect(self, *args, **kwargs):
        sock = compat_socket_create_connection(
            (self.host, self.port), self.timeout, sa)
        if is_https:
            self.sock = ssl.wrap_socket(
                sock, self.key_file, self.cert_file,
                ssl_version=ssl.PROTOCOL_TLSv1)
        else:
            self.sock = sock
    hc.connect = functools.partial(_hc_connect, hc)

    return hc
840
841
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    If the marker is present, a new dict without the marker and without
    any Accept-Encoding header (matched case-insensitively) is returned.
    Otherwise the headers object is returned as-is.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    kept = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    del kept['Youtubedl-no-compression']
    return kept
850
851
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, going through a SOCKS proxy if the request asks for one."""
        connection_class = compat_http_client.HTTPConnection

        # The proxy is requested through an internal header that must not
        # be sent out
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress data with negative wbits first, then with zlib defaults."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the given status code."""
        addinfourl = compat_urllib_request.addinfourl
        if not hasattr(addinfourl, 'getcode'):
            # No getcode(): construct without the code and attach it afterwards
            wrapped = addinfourl(stream, headers, url)
            wrapped.code = code
            return wrapped
        return addinfourl(stream, headers, url, code)

    def http_request(self, req):
        """Escape the request URL and add the standard headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        original_url = req.get_full_url()
        escaped_url = escape_url(original_url)

        # Substitute URL if any change after escaping
        if original_url != escaped_url:
            req = update_Request(req, url=escaped_url)

        for header_name, header_value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if header_name.capitalize() in req.headers:
                continue
            req.add_header(header_name, header_value)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Decompress gzip/deflate bodies and escape redirect locations."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            raw_body = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(raw_body), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes cut off
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(raw_body[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
981
982
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of base_class that connects through a SOCKS proxy.

    socks_proxy is the proxy URL; its scheme selects the protocol
    ('socks5', 'socks4a', or 'socks'/'socks4' for SOCKS4), the port
    defaults to 1080 and user name and password are taken from the URL
    if present.

    Raises ValueError for a proxy URL with any other scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # socks_type further down
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, layer TLS on top of the proxied socket
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1024
1025
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that creates its connections via _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        if https_conn_class:
            self._https_conn_class = https_conn_class
        else:
            self._https_conn_class = compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, going through a SOCKS proxy if the request asks for one."""
        connection_class = self._https_conn_class

        # Pass on whichever TLS settings the base handler stored
        open_kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            open_kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            open_kwargs['check_hostname'] = self._check_hostname

        # The proxy is requested through an internal header that must not
        # be sent out
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1049
1050
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same handling to HTTP and HTTPS.

    Both hooks currently delegate straight to HTTPCookieProcessor; the
    Set-Cookie escaping workaround described below is commented out.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is disabled; it is kept for reference only.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS traffic reuses the HTTP hooks
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1073
1074
def extract_timezone(date_str):
    """Split a trailing 'Z' or numeric UTC offset off a date string.

    Returns a (timedelta, remaining_string) tuple. The timedelta is zero
    when no offset is present or the suffix is 'Z'; the string is returned
    unchanged when nothing was recognised.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if tz_match is None:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # Plain 'Z' suffix: UTC
        return datetime.timedelta(), remainder

    factor = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=factor * int(tz_match.group('hours')),
        minutes=factor * int(tz_match.group('minutes')))
    return offset, remainder
1091
1092
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter separates the date and time parts. timezone, when given, is
    a timedelta UTC offset; otherwise the offset is taken from date_str.
    Returns None for a None input or an unparseable date.
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped before parsing
    cleaned = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_moment = datetime.datetime.strptime(cleaned, layout) - timezone
        return calendar.timegm(utc_moment.timetuple())
    except ValueError:
        return None
1110
1111
def date_formats(day_first=True):
    """Return the list of date formats for day-first or month-first dates."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1114
1115
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas, then drop AM/PM (+ timezone name) and numeric offset
    cleaned = date_str.replace(',', ' ')
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # No early exit: when several formats match, the last one wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style parsing
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1142
1143
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    The string is tried against the known date formats (day-first or
    month-first) and then against RFC 2822 parsing. A trailing UTC offset
    and a 'PM' marker are taken into account. Returns None when date_str
    is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The numeric offset was already stripped from date_str above, so
        # it has to be applied here as well (it used to be ignored)
        tz_seconds = timezone.days * 86400 + timezone.seconds
        return calendar.timegm(timetuple) + pm_delta * 3600 - tz_seconds
    return None
1165
1166
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1178
1179
def subtitles_filename(filename, sub_lang, sub_format):
    """Swap the last extension of filename for '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1182
1183
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount

    unit = relative.group('unit')
    # A bad approximation? Months count as 30 days, years as 365 days
    approx_days = {'month': 30, 'year': 365}
    if unit in approx_days:
        amount *= approx_days[unit]
        unit = 'day'
    return today + datetime.timedelta(**{unit + 's': amount})
1211
1212
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not in 'YYYYMMDD' format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '%s-%s-%s' % parts.groups()
1221
1222
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the earliest/latest
        representable date. Raises ValueError if start is after end."""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
1252
1253
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode byte strings with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1262
1263
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Win32 WriteConsoleW call; it is only used
    when out is file descriptor 1 or 2 and that handle is a real console.
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of units written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    # True when the handle is invalid, is not a character device, or
    # GetConsoleMode fails on it (e.g. output redirected to a file or pipe)
    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    # Index of the first character outside the Basic Multilingual Plane,
    # or len(s) when there is none
    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, stopping before each
    # non-BMP character, which is then written on its own with a length of 2
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1337
1338
def write_string(s, out=None, encoding=None):
    """Write the text s to out (default: sys.stderr) and flush it.

    On Windows, when no encoding is forced, the console API is tried first.
    Otherwise the text is encoded (unencodable characters are dropped) for
    binary streams and for streams exposing a .buffer, and written as-is
    to anything else.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        stream_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(stream_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1359
1360
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and one-character strings on Python 2
    already_ints = isinstance(bs[0], int)
    return list(bs) if already_ints else [ord(ch) for ch in bs]
1368
1369
def intlist_to_bytes(xs):
    """Pack a list of integer byte values into a byte string."""
    if xs:
        layout = '%dB' % len(xs)
        return compat_struct_pack(layout, *xs)
    return b''
1374
1375
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using LockFileEx on
# Windows, fcntl.flock elsewhere, and stubs raising IOError when neither
# is available.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count to lock, i.e. the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1449
1450
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened immediately; the lock (shared for mode 'r',
    exclusive otherwise) is taken on entry and released on exit, after
    which the file is closed.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1480
1481
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' when it is unknown."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1485
1486
def shell_quote(args):
    """Return args as a single shell-quoted command line string.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # compat_shlex_quote replaces pipes.quote: the pipes module is
        # deprecated and no longer exists on Python 3.13+
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
1496
1497
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into the URL fragment. Data already smuggled in
    url is kept and takes precedence over data on key conflicts.
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a copy so that the caller's dict is not modified
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
1506
1507
def unsmuggle_url(smug_url, default=None):
    """Return (url, data) for a URL produced by smuggle_url.

    URLs without smuggled data are returned unchanged together with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition('#')
        payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(payload)
    return smug_url, default
1515
1516
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    Returns 'N/A' for None; str input is converted to float first.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log() is undefined for 0, which is reported in plain bytes
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffixes = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    suffix = suffixes[exponent]
    scaled = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (scaled, suffix)
1529
1530
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number> <unit>' from s using unit_table multipliers.

    The number may use ',' or '.' as decimal separator. Returns the scaled
    value truncated to int, or None when s does not start with a number
    followed by one of the table's units.
    """
    alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1540
1541
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns None when s is None or is not a recognised size.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower = letter.lower()
        unit_table[letter + 'iB'] = binary
        unit_table[letter + 'B'] = decimal
        unit_table[lower + 'B'] = binary
        unit_table[letter + 'b'] = decimal
        unit_table[lower + 'b'] = decimal
        unit_table[decimal_name + 'bytes'] = decimal
        unit_table[binary_name + 'bytes'] = binary

    return lookup_unit_table(unit_table, s)
1611
1612
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an int.

    Returns None when s is None or cannot be parsed.
    """
    if s is None:
        return None

    text = s.strip()

    # Plain digits with separators need no unit lookup
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(multipliers, text)
1632
1633
def month_by_name(name, lang='en'):
    """ Return the number of a month by its name in the given language

    Unknown languages fall back to English; returns None for an unknown name.
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
1643
1644
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations, or None if the abbreviation is unknown """
    abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
1653
1654
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    Ampersands that start one of the five predefined entities or a numeric
    character reference are left alone. Numeric references may have up to
    6 hexadecimal or 7 decimal digits, enough for U+10FFFF; the previous
    limit of 4 double-escaped references such as '&#x1F600;'.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        '&amp;',
        xml_str)
1661
1662
def setproctitle(title):
    """Best-effort: set the process name via libc's prctl().

    Does nothing on Jython, when libc.so.6 cannot be loaded, or when the
    loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item longer than
    # the title, so it is NUL-terminated. Sizing it to len(title_bytes)
    # left no room for the terminator prctl relies on.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1682
1683
def remove_start(s, start):
    """Return s without the prefix start; s is returned as-is otherwise."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1686
1687
def remove_end(s, end):
    """Return s without the suffix end; s is returned as-is otherwise.

    An empty end leaves s unchanged (s[:-0] used to yield an empty string).
    """
    if s is not None and s.endswith(end) and end:
        return s[:-len(end)]
    return s
1690
1691
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around s."""
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
1699
1700
def url_basename(url):
    """Return the last path component of url, ignoring surrounding slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1704
1705
def base_url(url):
    """Return url up to and including the last '/' before any query or fragment.

    Only http(s) URLs are supported; anything else raises AttributeError.
    """
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
1708
1709
def urljoin(base, path):
    """Join path onto base, returning None for unusable input.

    An absolute or protocol-relative path is returned as-is. Otherwise
    base must be an http(s) or protocol-relative URL.
    """
    absolute_re = r'^(?:https?:)?//'
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(absolute_re, path):
        return path
    if not (isinstance(base, compat_str) and re.match(absolute_re, base)):
        return None
    return compat_urlparse.urljoin(base, path)
1718
1719
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1723
1724
class PUTRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1728
1729
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default when that is not possible.

    With get_attr set, the attribute of that name is read from v first.
    The result is multiplied by invscale and floor-divided by scale.
    None, the empty string and values int() rejects (including
    non-numeric objects such as lists) all yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # int() raises TypeError, not ValueError, for e.g. lists and dicts
        return default
1742
1743
def str_or_none(v, default=None):
    """Convert v to a text string, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1746
1747
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are stripped before conversion. None is
    passed through; other unparseable input raises ValueError.
    """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1754
1755
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, returning default when that is not possible.

    The result is multiplied by invscale and divided by scale. None and
    values float() rejects (including non-numeric objects such as lists)
    yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # float() raises TypeError, not ValueError, for e.g. lists and dicts
        return default
1763
1764
def strip_or_none(v):
    """Return v.strip(), or None when v is None."""
    if v is None:
        return None
    return v.strip()
1767
1768
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in turn: colon-separated
    ([[days:]hours:]mins:]secs[.ms]), unit-suffixed ('1h 2m 3s',
    'PT1H2M3S') and a single fractional amount of hours or minutes
    ('1.5 hours'). Returns None for non-string or unparseable input.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None

    found = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if found is None:
        found = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)

    if found is not None:
        # Both patterns above define the same five groups in the same order
        days, hours, mins, secs, ms = found.groups()
    else:
        found = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
        if found is None:
            return None
        hours, mins = found.groups()

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms keeps its leading dot, so it is a fraction of a second
        duration += float(ms)
    return duration
1815
1816
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename.

    When expected_real_ext is given and does not match the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1823
1824
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    When expected_real_ext is given and does not match the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1830
1831
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1840
1841
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1859
1860
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re must capture the version in group 1; the default looks for
    'version <token>'. Returns unrecognized when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1870
1871
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1876
1877
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily via pagefunc(pagenum).

    Each page is expected to hold pagesize entries; a shorter page is
    taken to be the last one. With use_cache, fetched pages are kept.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            firstid = pagenum * size
            nextfirstid = pagenum * size + size
            if start >= nextfirstid:
                continue

            page = self._cache.get(pagenum) if self._use_cache else None
            if page is None:
                page = list(self._pagefunc(pagenum))
                if self._use_cache:
                    self._cache[pagenum] = page

            # Offsets of the requested slice within this page
            if firstid <= start < nextfirstid:
                startv = start % size
            else:
                startv = 0

            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % size) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page = page[startv:endv]
            collected.extend(page)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page) + startv < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
1928
1929
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        collected = []
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the first fetched page
        to_skip = start - first_page * self._pagesize

        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1957
1958
def uppercase_escape(s):
    r"""Decode every \UXXXXXXXX escape sequence found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1965
1966
def lowercase_escape(s):
    r"""Decode every \uXXXX escape sequence found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
1973
1974
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # Text strings are UTF-8 encoded before quoting on Python 2
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1980
1981
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host part is IDNA-encoded; every other component is percent-escaped
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped.geturl()
1992
1993
def read_batch_urls(batch_fd):
    """Read URLs, one per line, from batch_fd and close it.

    Byte lines are decoded as UTF-8. A byte order mark and surrounding
    whitespace are removed; empty lines and lines starting with '#', ';'
    or ']' are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # '\xef\xbb\xbf' is the UTF-8 BOM mis-decoded as latin-1, while
        # '\ufeff' is what a correct UTF-8 decode yields. The latter was
        # not handled before, leaving a stray BOM in the first URL.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2008
2009
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2012
2013
def update_url_query(url, query):
    """Return url with the query parameters in query added or replaced.

    An empty query returns url unchanged.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2022
2023
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Return a copy of req with url, data, headers and/or query updated.

    The HTTP method (HEAD, PUT or the default), origin_req_host,
    unverifiable and, if set, timeout are carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = compat_urllib_request.Request

    updated = request_class(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        updated.timeout = req.timeout
    return updated
2042
2043
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple, in d.

    For a list or tuple, keys that are missing or map to None are skipped,
    as are falsy values unless skip_false_values is False. A single key
    behaves like d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
2052
2053
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None if the lookup fails.

    AttributeError, KeyError, TypeError and IndexError raised by getter
    are swallowed. With expected_type given, a result of any other type
    also yields None.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(value, expected_type):
        return value
    return None
2062
2063
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as text, decoding it with encoding if it is not text yet."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2066
2067
# Age limit associated with each US rating label; used by parse_age_limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2075
2076
# Age limit associated with each TV Parental Guidelines label; used by
# parse_age_limit for strings that are not in US_RATINGS
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2085
2086
def parse_age_limit(s):
    """Convert an age rating into a numeric age limit.

    Accepts an int in the range 0-21, a string like '18' or '18+', or a
    label from US_RATINGS / TV_PARENTAL_GUIDELINES. Returns None for
    anything else.
    """
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
2098
2099
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the payload inside it.

    Input that does not look like 'callback(...)' is returned unchanged.
    """
    wrapper_re = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(wrapper_re, r'\1', code)
2103
2104
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Handles single-quoted strings, unquoted keys and identifiers, comments,
    trailing commas and hexadecimal/octal integer literals. Quoted strings
    are never reinterpreted as numbers, so "010" stays a string.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            # Comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the string body for a double-quoted JSON string
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            # Only unquoted tokens can be hex/octal literals. Doing this
            # for quoted strings too turned e.g. "010" into the number 8.
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2144
2145
def qualities(quality_ids):
    """Build a ranking function from a list of quality ids.

    The returned callable maps a quality id to its position in quality_ids,
    or to -1 when the id is not present.
    """
    def rank_of(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position

    return rank_of
2154
2155
# Default output template: a %-style mapping format string using the
# 'title', 'id' and 'ext' fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2157
2158
def limit_length(s, length):
    """Shorten s to at most length characters, marking the cut with '...'.

    None is passed through; strings that already fit are returned unchanged.
    """
    if s is None:
        return None
    if len(s) <= length:
        return s
    suffix = '...'
    return s[:length - len(suffix)] + suffix
2167
2168
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints.

    Raises ValueError if any component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2171
2172
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is strictly older than limit.

    When version is empty, or either string cannot be parsed by
    version_tuple(), the answer is "not outdated" if assume_new is set and
    "outdated" otherwise.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, threshold = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < threshold
2180
2181
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Running from a zip archive ...
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    # ... or as a frozen executable
    return hasattr(sys, 'frozen')
2187
2188
def args_to_str(args):
    """Get a short string representation for a subprocess command.

    Every argument is quoted with compat_shlex_quote and the results are
    joined with single spaces.
    """
    return ' '.join(map(compat_shlex_quote, args))
2192
2193
def error_to_compat_str(err):
    """Return the message of err as a text string on Python 2 and 3."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2201
2202
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    A few complete MIME types are matched first. Otherwise the subtype (the
    part after the last '/', without parameters, lowercased) is looked up in
    a table of known subtypes; unknown subtypes are returned as they are.
    None is passed through.
    """
    if mt is None:
        return None

    full_type_map = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    if mt in full_type_map:
        return full_type_map[mt]

    subtype = mt.rsplit('/', 1)[-1]
    subtype = subtype.split(';')[0].strip().lower()

    subtype_map = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }
    return subtype_map.get(subtype, subtype)
2239
2240
def parse_codecs(codecs_str):
    """Split a 'codecs' parameter into video and audio codec entries.

    Returns a dict with 'vcodec' and 'acodec' keys, or an empty dict when
    nothing can be derived. The first recognised video codec and the first
    recognised audio codec win; a side that has no recognised codec while the
    other has one is reported as 'none'. A warning is written to stderr for
    each unrecognised codec.

    If no codec is recognised at all and exactly two codecs are listed, they
    are taken positionally as (video, audio).
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = [
        part.strip()
        for part in codecs_str.strip().strip(',').split(',')
        if part.strip()]

    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)

    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }

    # Nothing was recognised, so vcodec and acodec are both None here and
    # must not be used as the result
    if len(splited_codecs) == 2:
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    elif len(splited_codecs) == 1:
        # NOTE(review): a single unrecognised codec is reported as "no video,
        # unknown audio"; this is kept as it was - confirm whether the codec
        # string itself should be reported as the audio codec instead
        return {
            'vcodec': 'none',
            'acodec': None,
        }
    return {}
2275
2276
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of an opened URL handle.

    The filename of an 'attachment' Content-Disposition header is preferred;
    if it yields no extension the Content-Type header is mapped with
    mimetype2ext().
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = determine_ext(mobj.group('filename'), default_ext=None) if mobj else None
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
2289
2290
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for the bytes in data."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2293
2294
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2303
2304
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # UTF-32 marks are listed before UTF-16 ones because the UTF-32-LE mark
    # starts with the UTF-16-LE mark
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    # Without a byte order mark the data is decoded as UTF-8
    encoding, offset = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = enc, len(bom)
            break

    text = first_bytes[offset:].decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
2323
2324
def determine_protocol(info_dict):
    """Return the download protocol for info_dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived from
    the 'url' entry: its rtmp/mms/rtsp prefix, then an m3u8/f4m extension,
    and finally the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2345
2346
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data

    # Widest cell of every column, measured on the text form of the values
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))

    # Every column but the last is left-aligned and padded; the last one is
    # appended unpadded
    padded_specs = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_specs) + '%s'

    return '\n'.join(row_format % tuple(row) for row in rows)
2353
2354
2355 def _match_one(filter_part, dct):
2356 COMPARISON_OPERATORS = {
2357 '<': operator.lt,
2358 '<=': operator.le,
2359 '>': operator.gt,
2360 '>=': operator.ge,
2361 '=': operator.eq,
2362 '!=': operator.ne,
2363 }
2364 operator_rex = re.compile(r'''(?x)\s*
2365 (?P<key>[a-z_]+)
2366 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2367 (?:
2368 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2369 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2370 )
2371 \s*$
2372 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2373 m = operator_rex.search(filter_part)
2374 if m:
2375 op = COMPARISON_OPERATORS[m.group('op')]
2376 actual_value = dct.get(m.group('key'))
2377 if (m.group('strval') is not None or
2378 # If the original field is a string and matching comparisonvalue is
2379 # a number we should respect the origin of the original field
2380 # and process comparison value as a string (see
2381 # https://github.com/rg3/youtube-dl/issues/11082).
2382 actual_value is not None and m.group('intval') is not None and
2383 isinstance(actual_value, compat_str)):
2384 if m.group('op') not in ('=', '!='):
2385 raise ValueError(
2386 'Operator %s does not support string values!' % m.group('op'))
2387 comparison_value = m.group('strval') or m.group('intval')
2388 else:
2389 try:
2390 comparison_value = int(m.group('intval'))
2391 except ValueError:
2392 comparison_value = parse_filesize(m.group('intval'))
2393 if comparison_value is None:
2394 comparison_value = parse_filesize(m.group('intval') + 'B')
2395 if comparison_value is None:
2396 raise ValueError(
2397 'Invalid integer value %r in filter part %r' % (
2398 m.group('intval'), filter_part))
2399 if actual_value is None:
2400 return m.group('none_inclusive')
2401 return op(actual_value, comparison_value)
2402
2403 UNARY_OPERATORS = {
2404 '': lambda v: v is not None,
2405 '!': lambda v: v is None,
2406 }
2407 operator_rex = re.compile(r'''(?x)\s*
2408 (?P<op>%s)\s*(?P<key>[a-z_]+)
2409 \s*$
2410 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2411 m = operator_rex.search(filter_part)
2412 if m:
2413 op = UNARY_OPERATORS[m.group('op')]
2414 actual_value = dct.get(m.group('key'))
2415 return op(actual_value)
2416
2417 raise ValueError('Invalid filter part %r' % filter_part)
2418
2419
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The '&'-separated parts must all hold; evaluation stops at the first
    # part that fails
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2425
2426
def match_filter_func(filter_str):
    """Wrap filter_str into a callable taking an info dict.

    The callable returns None when the info dict passes the filter and a
    human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None

    return _match_func
2435
2436
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Supported are offset times ("12.5" or "12.5s") and clock times
    ("HH:MM:SS", optionally followed by a '.' or ':' and a fraction).
    Returns None for empty input and for anything that is not recognised.
    """
    if not time_expr:
        return None

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match is not None:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match is None:
        return None

    hours, minutes, seconds = clock_match.groups()
    # A ':' before the fraction is treated like a decimal point
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2448
2449
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode, "HH:MM:SS,mmm".

    The time is rounded to the nearest millisecond before it is split up, so
    that values which are not exactly representable as floats (such as 5.84)
    do not lose a millisecond to truncation.
    """
    total_millis = int(round(seconds * 1000))
    total_secs, millis = divmod(total_millis, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
2452
2453
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) into SRT text.

    Every <p> element with a usable begin time and either an end time or a
    duration becomes one SRT cue; <br> elements become line breaks and all
    other markup is dropped. Raises ValueError if the document contains no
    <p> elements at all.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        """Parser target collecting the text of one paragraph."""

        def __init__(self):
            self._chunks = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self._chunks.append('\n')

        def end(self, tag):
            pass

        def data(self, data):
            self._chunks.append(data)

        def close(self):
            return ''.join(self._chunks).strip()

    def parse_node(node):
        # Serialise the element and feed it through the collecting target
        parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    # Cue numbers follow the paragraph position, so skipped paragraphs leave
    # gaps in the numbering
    for index, para in enumerate(paras, 1):
        attrib = para.attrib
        begin_time = parse_dfxp_time_expr(attrib.get('begin'))
        end_time = parse_dfxp_time_expr(attrib.get('end'))
        dur = parse_dfxp_time_expr(attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2507
2508
def cli_option(params, command_option, param):
    """Build the argument list for an option that takes a value.

    Returns [command_option, value] for the value stored under param in
    params, or an empty list when that value is None or missing. Truthy
    values are converted with compat_str; falsy non-None values are passed on
    as they are.
    """
    value = params.get(param)
    if value is None:
        return []
    if value:
        value = compat_str(value)
    return [command_option, value]
2514
2515
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argument list for an option that takes a boolean value.

    The value stored under param in params must be a bool (asserted). It is
    rendered as true_value or false_value and returned either as a separate
    argument or, when separator is given, joined to the option name.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2522
2523
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2527
2528
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra command line arguments stored under param.

    If params has no (or a None) entry for param, default is returned. A list
    default is returned as a copy: the default in the signature is one shared
    list object, and handing it out directly would let a caller that mutates
    the result change what every later call receives.

    A value that is present must be a list (asserted).
    """
    ex_args = params.get(param)
    if ex_args is None:
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2535
2536
class ISO639Utils(object):
    """Conversions between two-letter and three-letter language codes."""

    # Two-letter code -> three-letter code
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at; None is returned
        for unknown codes.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        None is returned for unknown codes.
        """
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
2737
2738
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # Two-letter country code -> country name
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code to the corresponding full name

        The lookup is case-insensitive; None is returned for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
2997
2998
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets individual requests choose their own proxy.

    A request can carry a 'Ytdl-request-proxy' header naming the proxy to use
    for that request only; the header is removed before the request is
    processed further.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # These are installed before the base class initialiser runs and
        # hand the placeholder '__noproxy__' to proxy_open(). Loop variable
        # and bound method are frozen as lambda defaults so that each handler
        # keeps its own scheme.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy header takes precedence over the proxy this
        # handler was invoked with, and is consumed here
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            # SOCKS proxies are only recorded on the request in the
            # 'Ytdl-socks-proxy' header; nothing is opened here
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        # Any other proxy is handled by the base class implementation
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3022
3023
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as one big hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '%x' % cipher_number
3039
3040
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n.

    table supplies the digit characters; by default the first n characters of
    0-9a-zA-Z are used. Zero is encoded as the first digit of the table.

    Raises ValueError if n exceeds the table length, if num is negative, or
    if a non-zero num is to be encoded in a base smaller than 2 (both of the
    latter would otherwise never terminate).
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small' % n)

    # Collect digits least significant first, then reverse
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
3057
3058
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE back into its original text.

    The match provides the packed body, the numeric base, the number of
    symbols and the '|'-separated symbol list. Every word of the body is
    replaced by the symbol whose index, encoded in the given base, equals
    that word; an empty symbol stands for the word itself.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    packed_body, radix, symbol_count, words = mobj.groups()
    radix = int(radix)
    words = words.split('|')

    replacements = {}
    index = int(symbol_count)
    while index:
        index -= 1
        encoded_index = encode_base_n(index, radix)
        replacements[encoded_index] = words[index] or encoded_index

    return re.sub(
        r'\b(\w+)\b', lambda word: replacements[word.group(0)],
        packed_body)
3075
3076
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list ("KEY=value,KEY2=\"quoted, value\"").

    Returns a dict of attribute names to values, with the surrounding quotes
    of quoted values removed. A later duplicate of a key replaces an earlier
    one.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return dict(
        (key, val[1:-1] if val.startswith('"') else val)
        for key, val in pairs)
3084
3085
def urshift(val, n):
    """Shift val right by n bits, treating negative values as unsigned 32-bit."""
    if val < 0:
        val += 0x100000000
    return val >> n
3088
3089
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode the pixel data of a PNG image given as bytes.

    Returns a tuple (width, height, pixels) where pixels is a list with one
    entry per image row, each a flat list of width * 3 byte values.

    Raises IOError if the signature or the leading IHDR chunk is missing, or
    if the file contains no IDAT data.

    NOTE(review): three bytes per pixel are assumed throughout (presumably
    8-bit RGB, non-interlaced); the colour type and bit depth stored in IHDR
    are not checked - confirm callers only pass such images.
    """
    # Reference: https://www.w3.org/TR/PNG/
    # Everything after the 8 byte signature; consumed chunk by chunk below
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned formats, selected by the length of the input
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Each chunk is: 4 byte length, 4 byte type, <length> bytes of data and
    # a 4 byte CRC (which is not verified)
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # The first chunk was checked to be IHDR above
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # The compressed image stream is the concatenation of all IDAT chunks
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Number of data bytes per row, not counting the leading filter byte
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # idx counts reconstructed bytes over the whole image, row by row
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Every row of the decompressed data starts with one filter type byte
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        # Appended before it is filled so that _get_pixel() can already read
        # the bytes of the current row
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbours are the corresponding bytes one pixel (3 bytes) to
            # the left and one row up; 0 is used where they do not exist
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Filter type 0 (and any unknown type) leaves the byte as it is
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                # c is the byte up and to the left
                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Use whichever of a, b, c is closest to p, preferring a,
                # then b, on ties
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
3195
3196
def write_xattr(path, key, value):
    """Set the extended attribute key of the file at path to value.

    The first available mechanism is used: an importable 'xattr' module
    (either pyxattr or xattr), NTFS Alternate Data Streams on Windows, or
    the 'setfattr' / 'xattr' command line tools.

    value is handled as bytes: it is written to a binary file on Windows and
    decoded as UTF-8 before being passed to a command line tool.

    Raises XAttrUnavailableError if no usable mechanism is found (or the
    installed pyxattr is too old) and XAttrMetadataError if setting the
    attribute fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        # pyxattr and xattr are both imported as 'xattr'; they are told
        # apart by the functions they provide
        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE(review): only ImportError is handled below, so unless
                # XAttrUnavailableError (defined elsewhere) derives from it,
                # this propagates to the caller despite what the message says
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No xattr module available: use platform specific means
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # The stream is addressed as "<path>:<key>"
            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The command line tools take the value as a text argument
                value = value.decode('utf-8')
                # setfattr is preferred when both tools are present
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                # A failing tool is reported with its exit status and stderr
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")