]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
debian/patches: Add patch from upstream to fix extraction from youtube.
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import socket
28 import ssl
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_HTMLParser,
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_html_entities_html5,
43 compat_http_client,
44 compat_kwargs,
45 compat_os_name,
46 compat_parse_qs,
47 compat_shlex_quote,
48 compat_socket_create_connection,
49 compat_str,
50 compat_struct_pack,
51 compat_struct_unpack,
52 compat_urllib_error,
53 compat_urllib_parse,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
58 compat_urlparse,
59 compat_xpath,
60 )
61
62 from .socks import (
63 ProxyType,
64 sockssocket,
65 )
66
67
def register_socks_protocols():
    """Register the SOCKS proxy schemes as netloc-bearing URL schemes.

    In Python < 2.6.5, urlsplit() suffers from
    https://bugs.python.org/issue7904: URLs whose scheme is missing from
    urlparse.uses_netloc are not handled correctly. Every SOCKS scheme that
    is not listed there yet is appended.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme in netloc_schemes:
            continue
        netloc_schemes.append(socks_scheme)
75
76
# The re module offers no stable public name for the class of compiled
# patterns, so take it from an actual compiled pattern
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds each of them to
# requests that do not carry it yet
std_headers = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept-Language', 'en-us,en;q=0.5'),
])


# Unique sentinel for "no default supplied", so that None stays usable as a
# real default value
NO_DEFAULT = object()
90
# Full English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]

# Month names per language code; 'en' is the very same list object as
# ENGLISH_MONTH_NAMES
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars',
        'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre',
        'octobre', 'novembre', 'décembre',
    ],
}

# Media file extensions (without the leading dot)
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil',
)

# needed for sanitizing filenames in restricted mode: maps each accented
# character to its ASCII replacement (a few replacements are two letters,
# hence the single-element lists inside the chain)
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
121
# strptime() formats that do not depend on the day/month order
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    # Ordinal day suffixes: "1st", "2nd", "3rd", "4th"
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

# Common formats followed by the ambiguous numeric ones read as day-first
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Common formats followed by the ambiguous numeric ones read as month-first
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Regex with four groups: a quoted payload, two integers and a quoted
# '|'-separated list followed by .split('|')
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
169
170
def preferredencoding():
    """Get preferred encoding.

    Returns locale.getpreferredencoding() when that names a codec that can
    actually encode text, and 'UTF-8' otherwise.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
184
185
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created in the directory
    of fn and then renamed onto fn. If anything fails, the temporary file is
    removed (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; a failing cleanup must not
        # mask the original error
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
238
239
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # No attribute predicate is used on this branch: filter by hand
        for candidate in node.findall(compat_xpath(xpath)):
            if key not in candidate.attrib:
                continue
            if val is None or candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        Returns the first element matching xpath that has attribute key
        (equal to val when val is given), or None.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
254
255 # On python2.6 the xml.etree.ElementTree.Element methods don't support
256 # the namespace parameter
257
258
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    Steps without a colon are kept unchanged; prefixes are looked up in
    ns_map.
    """
    def _expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(_expand(step) for step in path.split('/'))
269
270
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element below node that matches xpath.

    xpath is either a single expression or an iterable of expressions that
    are tried in order until one matches.

    When nothing matches: return default if one was given, otherwise raise
    ExtractorError if fatal is set, otherwise return None. name replaces
    xpath in the error message.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # Start from "not found" so that an empty iterable of expressions takes
    # the regular not-found path instead of raising UnboundLocalError
    n = None
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
292
293
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element that xpath_element() finds.

    A missing element is handled by xpath_element() (its result is passed
    through when it is None or equal to default). An element without text
    yields default if one was given, raises ExtractorError if fatal is set,
    and yields None otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    if element.text is not None:
        return element.text

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
307
308
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath[@key].

    When no such element exists: return default if one was given, raise
    ExtractorError if fatal is set, and return None otherwise.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
320
321
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Shorthand for an attribute lookup on "id"
    content = get_element_by_attribute('id', id, html)
    return content
325
326
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains class_name as a whole word"""
    # The attribute value may hold several classes, so allow any other
    # non-quote characters around the wanted one; this is already a regex,
    # hence escape_value=False
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute(
        'class', class_value_re, html, escape_value=False)
331
332
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document

    value is matched literally unless escape_value is False, in which case
    it is inserted into the pattern as a regular expression. Returns None
    when no tag matches; otherwise the HTML-unescaped tag content.
    """

    value_re = re.escape(value) if escape_value else value

    element_re = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value_re)

    found = re.search(element_re, html)
    if found is None:
        return None

    content = found.group('content')
    # Content starting with a quote loses its first and last character
    if content[:1] in ('"', "'"):
        content = content[1:-1]

    return unescapeHTML(content)
356
357
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    After feeding, self.attrs holds the attributes of the most recently
    seen start tag as a dict (empty if no start tag was seen).
    """
    def __init__(self):
        # Result slot, replaced wholesale by every start tag
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs
        self.attrs = dict(attrs)
366
367
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dictionary.

    Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    the result is
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': "'"
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
388
389
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines carry no meaning in HTML, <br> and paragraph
    # boundaries do
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
405
406
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    '-' selects standard output (its binary buffer when there is one).
    Otherwise the file is opened as given; if that fails for a reason other
    than EACCES, the name is passed through sanitize_path() and, if that
    changed it, opened again. A failure of that second attempt propagates
    to the caller, like with the standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            if hasattr(sys.stdout, 'buffer'):
                return (sys.stdout.buffer, filename)
            return (sys.stdout, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        # A permission problem is not fixed by renaming
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        fallback_name = sanitize_path(filename)
        if fallback_name == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(fallback_name), open_mode), fallback_name)
437
438
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
446
447
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    (the cleanup of underscores, leading dashes and leading dots is skipped then).
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        code = ord(char)
        # Question marks and control characters are dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    result = re.sub(r'_{2,}', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[1:]
    result = result.lstrip('.')
    return result or '_'
486
487
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On any other platform s is returned unchanged. On Windows the path is
    normalized and, in every component except '.' and '..', each character
    out of /<>:"|\\?* as well as a trailing whitespace character or dot is
    replaced with '#'. A drive or UNC prefix is kept as is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component in front of the first separator
        norm_path.pop(0)
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence "\s"; the compiled pattern is unchanged
    forbidden_re = r'(?:[/<>:"\|\\?\*]|[\s.]$)'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(forbidden_re, '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
504
505
def sanitize_url(url):
    """Prepend protocol-less URLs with `http:` scheme.

    This mitigates the number of unwanted failures due to missing protocol;
    any URL that does not start with '//' is returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
510
511
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url().

    All further positional and keyword arguments go to Request unchanged.
    """
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
514
515
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element in its
    original position. Elements are compared with ==, so they do not have
    to be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
523
524
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    The argument is the entity without its leading '&' but including the
    trailing ';'. Entities that cannot be decoded are returned in their
    literal '&...;' form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    named_entities = compat_html_entities.name2codepoint
    if entity in named_entities:
        return compat_chr(named_entities[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    numeric = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if numeric is not None:
        digits = numeric.group(1)
        is_hex = digits.startswith('x')
        if is_hex:
            # int() accepts the '0x' prefix together with base 16
            digits = '0%s' % digits
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(digits, 16 if is_hex else 10))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
554
555
def unescapeHTML(s):
    """Replace the HTML entities in s with the characters they stand for.

    None is passed through; any other value must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode(match):
        # Group 1 is the entity without '&' but with its ';'
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+;)', _decode, s)
563
564
def get_subprocess_encoding():
    """Return the encoding to use for subprocess arguments.

    On Windows (major version >= 5) that is the locale encoding, elsewhere
    the filesystem encoding; 'utf-8' is the fallback when neither is known.
    """
    modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if modern_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
575
576
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file

    Returns s unchanged wherever unicode filenames can be used directly and
    s encoded with get_subprocess_encoding() (unencodable characters
    dropped) otherwise.
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    windows_unicode_api = (
        not for_subprocess and
        sys.platform == 'win32' and
        sys.getwindowsversion()[0] >= 5)

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if windows_unicode_api or sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
599
600
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string filename with get_subprocess_encoding().

    On Python 3, and for anything that is not a bytes object, b is returned
    unchanged. Undecodable bytes are dropped.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
610
611
def encodeArgument(s):
    """Encode a command line argument via encodeFilename(s, True)."""
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
619
620
def decodeArgument(b):
    """Decode a command line argument via decodeFilename(b, True)."""
    decoded = decodeFilename(b, True)
    return decoded
623
624
def decodeOption(optval):
    """Return an option value as text.

    None is passed through and bytes are decoded with preferredencoding();
    the result must be a compat_str.
    """
    if optval is None:
        return optval

    text = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(text, compat_str)
    return text
633
634
def formatSeconds(secs):
    """Format a number of seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The boundaries are inclusive: exactly 60 seconds gives '1:00' and
    exactly 3600 seconds gives '1:00:00' (the strict comparisons used
    before produced '60' and '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
642
643
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is switched off when params has a true
    'nocheckcertificate' entry. Extra keyword arguments are handed to the
    handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        default_context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            default_context.check_hostname = False
            default_context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=default_context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    legacy_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        legacy_context.verify_mode = ssl.CERT_NONE
    else:
        legacy_context.verify_mode = ssl.CERT_REQUIRED
    legacy_context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=legacy_context, **kwargs)
667
668
def bug_reports_message():
    """Return the text appended to error messages to ask for a bug report.

    The update hint depends on ytdl_is_updateable().
    """
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
678
679
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.

        video_id, if given, is prepended to the message; cause, if given, is
        appended to it. Unexpected errors additionally get the
        bug_reports_message() text.
        """
        # An error raised while one of these exact exception types is being
        # handled counts as expected
        implicitly_expected = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in implicitly_expected:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        formatted = traceback.format_tb(self.traceback)
        return ''.join(formatted)
707
708
class UnsupportedError(ExtractorError):
    """Expected ExtractorError for a URL that is not supported; keeps the URL in self.url."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
714
715
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match

    Adds nothing to ExtractorError; it exists so that this case has a type
    of its own.
    """
719
720
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
733
734
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
742
743
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate an
    error in the postprocessing task. The message is available as self.msg.
    """

    def __init__(self, msg):
        # Only the attribute is set here; Exception.__init__ is not invoked
        self.msg = msg
753
754
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached.

    Plain Exception subclass without any state of its own.
    """
758
759
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available for
    that video.
    """
767
768
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts
        self.downloaded, self.expected = downloaded, expected
781
782
class XAttrMetadataError(Exception):
    """Error while handling extended attribute metadata.

    code (an errno value or None) and msg are kept as attributes. From them
    self.reason is derived as one of 'NO_SPACE', 'VALUE_TOO_LONG' or
    'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        out_of_space = (
            self.code in (errno.ENOSPC, errno.EDQUOT) or
            'No space left' in self.msg or
            'Disk quota exceeded' in self.msg or
            # Misspelling that used to be the only quota text checked for;
            # still recognised for backward compatibility
            'Disk quota excedded' in self.msg)
        if out_of_space:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
797
798
class XAttrUnavailableError(Exception):
    # NOTE(review): nothing in this part of the file raises this class;
    # judging by the name it presumably reports missing xattr support -
    # confirm against the raise sites.
    pass
801
802
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the configured source address.

    The address comes from the 'source_address' entry of
    ydl_handler._params; nothing is bound when it is None. The remaining
    arguments are passed to http_class.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    connection = http_class(*args, **kwargs)

    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return connection

    # Port 0: any local port
    bind_address = (source_address, 0)
    if hasattr(connection, 'source_address'):  # Python 2.7+
        connection.source_address = bind_address
        return connection

    # Python 2.6: no source_address attribute, so replace connect()
    def _connect_from_source(self, *args, **kwargs):
        raw_sock = compat_socket_create_connection(
            (self.host, self.port), self.timeout, bind_address)
        if is_https:
            self.sock = ssl.wrap_socket(
                raw_sock, self.key_file, self.cert_file,
                ssl_version=ssl.PROTOCOL_TLSv1)
        else:
            self.sock = raw_sock

    connection.connect = functools.partial(_connect_from_source, connection)
    return connection
828
829
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, headers is returned as is. With it, a new dict is
    returned that lacks both the marker and every Accept-Encoding header
    (matched case-insensitively); the passed mapping is not modified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    return dict(
        (name, value) for name, value in headers.items()
        if name != marker and name.lower() != 'accept-encoding')
838
839
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    Installed with an OpenerDirector, this class automatically adds the
    standard headers to every HTTP request and handles gzipped and deflated
    responses from web servers. To avoid compression for a particular
    request, the original request in the program code only has to include
    the HTTP header "Youtubedl-no-compression", which will be removed
    before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set on it."""
        connection_class = compat_http_client.HTTPConnection

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            # Internal header, must not be sent
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, raw first and with zlib header as fallback."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying code, with or without constructor support for it."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        response = addinfourl(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        """Escape the request URL and complete the request headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        original_url = req.get_full_url()
        escaped_url = escape_url(original_url)

        # Substitute URL if any change after escaping
        if original_url != escaped_url:
            req = update_Request(req, url=escaped_url)

        for header_name, header_value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if header_name.capitalize() in req.headers:
                continue
            req.add_header(header_name, header_value)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Decompress gzip/deflate bodies and percent-encode redirect locations."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            raw_body = resp.read()
            try:
                decoded = io.BytesIO(
                    gzip.GzipFile(fileobj=io.BytesIO(raw_body), mode='rb').read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for trim in range(1, 1024):
                    try:
                        decoded = io.BytesIO(
                            gzip.GzipFile(fileobj=io.BytesIO(raw_body[:-trim]), mode='rb').read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(decoded, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                escaped_location = escape_url(location)
                if location != escaped_location:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        escaped_location = escaped_location.encode('utf-8')
                    resp.headers['Location'] = escaped_location
        return resp

    # HTTPS traffic gets the very same treatment
    https_request = http_request
    https_response = http_response
969
970
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of base_class that connects through a SOCKS proxy.

    base_class must be an HTTPConnection or HTTPSConnection class.
    socks_proxy is the proxy URL; its scheme selects the protocol (socks5,
    socks4a, or socks/socks4), the port defaults to 1080 and user name and
    password are taken from the URL when present.

    Raises ValueError for any other scheme (this used to surface as an
    UnboundLocalError).
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        raise ValueError('Unsupported SOCKS proxy scheme: %s' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # HTTPS: wrap the proxied socket in TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1012
1013
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections through _create_http_connection.

    https_conn_class overrides the connection class (HTTPSConnection by
    default).
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set on it."""
        open_kwargs = {}
        # Hand over whichever TLS settings this handler instance has:
        # _context exists on python > 2.6, _check_hostname on python 3.x
        for kwarg_name, attr_name in (('context', '_context'),
                                      ('check_hostname', '_check_hostname')):
            if hasattr(self, attr_name):
                open_kwargs[kwarg_name] = getattr(self, attr_name)

        connection_class = self._https_conn_class
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            # Internal header, must not be sent
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1037
1038
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles HTTPS requests and responses.

    Both methods currently delegate straight to HTTPCookieProcessor; the
    Set-Cookie escaping workaround below is kept but disabled.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP hooks for HTTPS traffic
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1061
1062
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns a (timedelta, remaining_string) tuple. The offset is zero when
    no designator is found or when the designator is a bare "Z".
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # "Z" suffix: no offset
        return datetime.timedelta(), remainder

    direction = -1 if sign_char == '-' else 1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1079
1080
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp from the given date.

    Returns None when *date_str* is None or does not match
    'YYYY-MM-DD<delimiter>HH:MM:SS'. When *timezone* (a timedelta) is not
    given, it is taken from the string via extract_timezone().
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped before parsing
    cleaned = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    iso_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        parsed = datetime.datetime.strptime(cleaned, iso_format)
    except ValueError:
        return None
    return calendar.timegm((parsed - timezone).timetuple())
1098
1099
def date_formats(day_first=True):
    """Return the strptime format list matching the requested day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1102
1103
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style dates
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1130
1131
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string, or None."""
    if date_str is None:
        return None

    text = date_str.replace(',', ' ')

    # Any "PM" marker shifts the result by twelve hours
    pm_delta = 12 if re.search(r'(?i)PM', text) else 0
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    pm_shift = datetime.timedelta(hours=pm_delta)
    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - timezone + pm_shift
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Fall back to RFC 2822 style dates
    parsed = email.utils.parsedate_tz(text)
    if parsed:
        return calendar.timegm(parsed) + pm_delta * 3600
1153
1154
1155 def determine_ext(url, default_ext='unknown_video'):
1156 if url is None:
1157 return default_ext
1158 guess = url.partition('?')[0].rpartition('.')[2]
1159 if re.match(r'^[A-Za-z0-9]+$', guess):
1160 return guess
1161 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1162 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1163 return guess.rstrip('/')
1164 else:
1165 return default_ext
1166
1167
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the extension of *filename* with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1170
1171
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'yesterday' is accepted as well. Months and years are approximated as
    30 and 365 days. Raises ValueError if the string matches neither form.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1199
1200
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that do not look like 'YYYYMMDD' are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if parts is None else '-'.join(parts.groups())
1209
1210
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError when start is after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.end < self.start:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1240
1241
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode byte strings with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1250
1251
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    Writes s via the Win32 WriteConsoleW call when out is stdout/stderr
    (file descriptor 1 or 2) and is attached to a console. Raises OSError
    if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> GetStdHandle argument
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Build typed prototypes for the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the count written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device, or
        # GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, stopping each chunk before
    # a non-BMP character; such a character is then written on its own with
    # a length of 2
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Only drop what was actually written; the rest is retried
            s = s[written.value:]
    return True
1325
1326
def write_string(s, out=None, encoding=None):
    """Write the text *s* to *out* (default: sys.stderr) and flush it."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows try the console API first, unless an encoding was forced
    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        charset = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(charset, 'ignore'))
    else:
        out.write(s)
    out.flush()
1347
1348
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Python 3 byte strings already yield ints; otherwise convert each item
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(item) for item in bs]
1356
1357
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a byte string."""
    return compat_struct_pack('%dB' % len(xs), *xs) if xs else b''
1362
1363
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on Windows, fcntl.flock elsewhere, and
# stubs that raise IOError where fcntl is unavailable.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed to LockFileEx/UnlockFileEx; Offset/OffsetHigh
        # give the start of the byte range
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f starting at offset 0; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock() lock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1437
1438
class locked_file(object):
    """File wrapper that holds a lock on the file inside a ``with`` block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Anything other than read mode takes the lock exclusively
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even if unlocking raises
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1468
1469
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when unknown."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1473
1474
def shell_quote(args):
    """Return *args* as one shell-escaped command line string.

    Byte string arguments are decoded with the filesystem encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # Use the compat wrapper (as args_to_str does) rather than the
        # undocumented pipes.quote; the pipes module is gone in Python 3.13
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
1484
1485
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled in *url* is merged into *data* (which is updated
    in place) and the result is appended as a JSON-encoded fragment. """
    bare_url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = json.dumps(data)
    fragment = compat_urllib_parse_urlencode({'__youtubedl_smuggle': payload})
    return bare_url + '#' + fragment
1494
1495
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    bare_url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return bare_url, json.loads(encoded)
1503
1504
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    Returns 'N/A' for None; str input is converted to float first.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # Power of 1024 that selects the unit
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    units = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    unit = units[exponent]
    scaled = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (scaled, unit)
1517
1518
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of *s* using the unit -> multiplier table.

    A comma in the number is read as a decimal point. Returns the scaled
    value as an int, or None when *s* does not match.
    """
    alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if not found:
        return None
    number = float(found.group('num').replace(',', '.'))
    multiplier = unit_table[found.group('unit')]
    return int(number * multiplier)
1528
1529
def parse_filesize(s):
    """Parse a human-readable file size such as '5 MiB' into a byte count.

    Returns None for None input or when no known unit follows the number.
    """
    if s is None:
        return None

    # (prefix letter, decimal unit name, binary unit name), in ascending
    # powers starting at 1
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower = letter.lower()
        unit_table[letter + 'iB'] = binary
        unit_table[letter + 'B'] = decimal
        unit_table[lower + 'B'] = binary
        unit_table[letter + 'b'] = decimal
        unit_table[lower + 'b'] = decimal
        unit_table[decimal_name + 'bytes'] = decimal
        unit_table[binary_name + 'bytes'] = binary

    return lookup_unit_table(unit_table, s)
1599
1600
def parse_count(s):
    """Parse a count such as '1,234' or '1.2M' into an int (None if unparsable)."""
    if s is None:
        return None

    text = s.strip()

    # Plain digits with separators need no unit handling
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand = 1000
    million = 1000 ** 2
    suffix_table = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(suffix_table, text)
1620
1621
def month_by_name(name, lang='en'):
    """ Return the number of a month by its name in *lang*.

    Unknown languages fall back to the English names; returns None when
    the name is not found. """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
1631
1632
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations, i.e. the first three letters of the month name """
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
1641
1642
1643 def fix_xml_ampersands(xml_str):
1644 """Replace all the '&' by '&amp;' in XML"""
1645 return re.sub(
1646 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1647 '&amp;',
1648 xml_str)
1649
1650
def setproctitle(title):
    """Set the process name to *title* via libc's prctl.

    Does nothing on Jython, when 'libc.so.6' cannot be loaded, or when the
    loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising from the bytes allocates len + 1 so the buffer is
    # NUL-terminated. A buffer of exactly len(title_bytes) has no terminator
    # and prctl would read past its end for short titles.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1670
1671
def remove_start(s, start):
    """Return *s* without the leading *start*, if present (None passes through)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1674
1675
def remove_end(s, end):
    """Return *s* without the trailing *end*, if present (None passes through)."""
    if s is None or not s.endswith(end):
        return s
    # An empty suffix must be a no-op: s[:-0] would yield ''
    return s[:-len(end)] if end else s
1678
1679
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding *s*."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1687
1688
def url_basename(url):
    """Return the last segment of the URL's path."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
1692
1693
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any '?', '#' or '&'."""
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
1696
1697
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1701
1702
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1706
1707
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int, returning *default* when that is not possible.

    If *get_attr* is given, the value is read from that attribute of *v*.
    The result is ``int(v) * invscale // scale``. None, the empty string and
    values int() rejects all yield *default*.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError covers values such as lists or dicts
        return default
1720
1721
def str_or_none(v, default=None):
    """Return *v* converted to compat_str, or *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1724
1725
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before conversion; None passes
    through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1732
1733
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float, returning *default* when that is not possible.

    The result is ``float(v) * invscale / scale``. None and values float()
    rejects yield *default*.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError covers values such as lists or dicts
        return default
1741
1742
def strip_or_none(v):
    """Return ``v.strip()``, or None when *v* is None."""
    if v is None:
        return None
    return v.strip()
1745
1746
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in order: colon separated
    ([[DD:]HH:]MM:]SS[.ms]), unit suffixed ('1h 2m 3s', optionally prefixed
    by 'PT') and a single fractional 'N hours' / 'N mins'. Returns None for
    non-string input or when nothing matches.
    """
    if not isinstance(s, compat_basestring):
        return None

    text = s.strip()

    notations = (
        r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$',
        r'''(?ix)(?:P?T)?
            (?:
                (?P<days>[0-9]+)\s*d(?:ays?)?\s*
            )?
            (?:
                (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
            )?
            (?:
                (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
            )?
            (?:
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
            )?$''',
        r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$',
    )
    for notation in notations:
        found = re.match(notation, text)
        if found:
            break
    else:
        return None

    fields = found.groupdict()

    # Field name -> factors applied one after another, in summation order
    weights = (
        ('secs', ()),
        ('mins', (60,)),
        ('hours', (60, 60)),
        ('days', (24, 60, 60)),
        ('ms', ()),
    )
    duration = 0
    for field_name, factors in weights:
        raw = fields.get(field_name)
        if not raw:
            continue
        amount = float(raw)
        for factor in factors:
            amount = amount * factor
        duration += amount
    return duration
1793
1794
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension of *filename*.

    If *expected_real_ext* is given and the real extension differs from it,
    *ext* is appended to the whole filename instead.
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(stem, ext, current_ext)
1801
1802
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of *filename* with *ext*.

    If *expected_real_ext* is given and the real extension differs from it,
    *ext* is appended to the whole filename instead.
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = stem
    return '{0}.{1}'.format(base, ext)
1808
1809
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when the process cannot be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1818
1819
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    command = [encodeArgument(exe)] + args
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1837
1838
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program output.

    Returns group 1 of the first *version_re* match (default: the token
    following 'version'), or *unrecognized* when there is no match.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1848
1849
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        # (it fetches the complete slice just to count it)
        return len(self.getslice())
1854
1855
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily through pagefunc(pagenum).

    Paging stops at the first page that comes back shorter than pagesize.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        # pagefunc: callable taking a page number and returning an iterable
        # pagesize: number of entries on a full page
        # use_cache: keep fetched pages in a dict keyed by page number
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (end=None: to the last page)."""
        res = []
        # Start at the page that contains `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Index of this page's first entry and of the next page's first entry
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of `start` inside this page (0 unless it is the first page used)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of `end` inside this page, or None when `end` lies elsewhere
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1906
1907
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (end=None: up to the last page)."""
        collected = []
        # Page containing `start`, and how many of its leading entries to drop
        first_page, pending_skip = divmod(start, self._pagesize)
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start

        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if pending_skip:
                page = page[pending_skip:]
                pending_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1935
1936
def uppercase_escape(s):
    """Decode every '\\UXXXXXXXX' escape sequence found in *s*."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1943
1944
def lowercase_escape(s):
    """Decode every '\\uXXXX' escape sequence found in *s*."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
1951
1952
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 text is UTF-8 encoded before quoting
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    quotable = s.encode('utf-8') if needs_encoding else s
    return compat_urllib_parse.quote(quotable, b"%/;:@&=+$,!~*'()?#[]")
1958
1959
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded; the other components are percent-escaped
    replacements = {
        'netloc': parts.netloc.encode('idna').decode('ascii'),
    }
    for component in ('path', 'params', 'query', 'fragment'):
        replacements[component] = escape_rfc3986(getattr(parts, component))
    return parts._replace(**replacements).geturl()
1970
1971
def read_batch_urls(batch_fd):
    """Read URLs from the file-like *batch_fd*, one per line, and close it.

    Blank lines and lines starting with '#', ';' or ']' are skipped. A
    leading UTF-8 byte order mark is removed.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The BOM shows up either as three mis-decoded bytes or, once decoded
        # as UTF-8, as U+FEFF, which str.strip() does not remove
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1986
1987
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1990
1991
def update_url_query(url, query):
    """Return *url* with its query string updated by the *query* mapping.

    An empty *query* returns *url* unchanged.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2000
2001
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from *req* with the given overrides applied.

    Headers are merged, *data* and *url* replace the originals when given,
    *query* is merged into the URL, and HEAD/PUT methods are preserved.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    # Keep the HTTP method by picking the matching request class
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), compat_urllib_request.Request)

    new_req = request_class(
        target_url,
        data=payload,
        headers=merged_headers,
        origin_req_host=req.origin_req_host,
        unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2020
2021
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple, in *d*.

    With a list/tuple, keys that are missing or map to None are skipped, as
    are falsy values unless *skip_false_values* is False. A single key is a
    plain ``d.get(key, default)``.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2030
2031
def try_get(src, getter, expected_type=None):
    """Return ``getter(src)``, or None if it fails or has the wrong type.

    AttributeError, KeyError, TypeError and IndexError raised by the getter
    are swallowed.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is not None and not isinstance(value, expected_type):
        return None
    return value
2040
2041
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as compat_str, decoding it with *encoding* if necessary."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2044
2045
# Rating label -> integer age, as returned by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# TV rating label -> integer age, parse_age_limit()'s last lookup
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2063
2064
def parse_age_limit(s):
    """Parse an age limit given as an int, an 'NN' / 'NN+' string or a rating label.

    Ints outside 0..21, non-string values and unknown labels yield None.
    """
    if type(s) == int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
2076
2077
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback(...);' and return the wrapped payload.

    Input that is not wrapped this way is returned unchanged.
    """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
2081
2082
def js_to_json(code):
    """Rewrite a JavaScript object literal into JSON text.

    Quotes strings and identifiers with double quotes, drops /* */ comments
    and trailing commas, and converts hexadecimal/octal integer literals to
    decimal.
    """
    def fix_kv(m):
        # Called for every token matched by the pattern at the bottom
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Strip the surrounding quotes and re-escape the content for a
            # double-quoted JSON string
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        # (pattern, base) pairs for hexadecimal and octal integer literals
        INTEGER_TABLE = (
            (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
            (r'^(0+[0-7]+)\s*:?$', 8),
        )

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A trailing ':' marks an object key, which must be quoted
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    # Alternatives, in order: double-quoted string, single-quoted string,
    # /* */ comment, trailing comma before ] or }, identifier, hex/octal
    # integer (optionally followed by ':'), decimal integer used as a key
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2120
2121
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps an id to its index in *quality_ids*, or to
    -1 when the id is not listed. """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
2130
2131
# Default output template; a %-style format built from the title, id and ext
# fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2133
2134
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns None for None and s itself when it has at most length
    characters. Longer strings are cut so that the text plus a trailing
    '...' is exactly length characters long. When length is too small to
    hold the ellipses, the string is simply cut to length characters. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # length - len(ELLIPSES) would be negative here and slicing with it
        # would keep almost the whole string; cut hard instead
        return s[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
2143
2144
def version_tuple(v):
    """ Split a version string on "." and "-" and return its components as a
    tuple of ints (ValueError if a component is not an integer) """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2147
2148
def is_outdated_version(version, limit, assume_new=True):
    """ Return True if version is lower than limit

    When version is empty or either version cannot be parsed by
    version_tuple(), the answer is "not assume_new". """
    unknown_result = not assume_new
    if not version:
        return unknown_result
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return unknown_result
    return current < minimum
2156
2157
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Either a frozen executable or a module loaded from a zip archive
    if hasattr(sys, 'frozen'):
        return True
    module_loader = globals().get('__loader__')
    return isinstance(module_loader, zipimporter)
2163
2164
def args_to_str(args):
    """ Get a short string representation for a subprocess command """
    quoted_args = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
2168
2169
def error_to_compat_str(err):
    """ Return the message of err as a text string """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2177
2178
def mimetype2ext(mt):
    """
    Map a MIME type to a file extension

    Parameters (everything after the first ";"), surrounding whitespace and
    letter case are ignored. Types without a dedicated mapping yield their
    subtype. Returns None when mt is None.
    """
    if mt is None:
        return None

    # Normalise first so that both lookups below see the bare media type,
    # e.g. 'audio/mp4; codecs="mp4a.40.2"' -> 'audio/mp4'
    mimetype = mt.split(';')[0].strip().lower()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mimetype)
    if ext is not None:
        return ext

    _, _, res = mimetype.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }.get(res, res)
2215
2216
def parse_codecs(codecs_str):
    """
    Split a "codecs" parameter value into video and audio codecs

    Returns {} for empty input, otherwise a dict with 'vcodec' and 'acodec'
    keys. A missing codec is reported as 'none'. When no codec is
    recognised, two entries are taken as (video, audio) and a single entry
    as audio. A warning is written to stderr for every unrecognised codec
    that is met while at least one other codec is known or unknown alike.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = [
        codec.strip()
        for codec in codecs_str.strip().strip(',').split(',')
        if codec.strip()]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    # Nothing was recognised, so vcodec and acodec are both None here; fall
    # back on the position of the entries instead of returning those Nones
    if len(splited_codecs) == 2:
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    elif len(splited_codecs) == 1:
        return {
            'vcodec': 'none',
            'acodec': splited_codecs[0],
        }
    return {}
2251
2252
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the response headers of url_handle

    The filename of an attachment Content-Disposition header is tried first,
    then the Content-Type header is mapped with mimetype2ext(). """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        ext = determine_ext(mobj.group('filename'), default_ext=None) if mobj else None
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
2265
2266
def encode_data_uri(data, mime_type):
    """ Build a base64 "data:" URI of the given MIME type from the bytes data """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2269
2270
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is set or when the content is
    # available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2279
2280
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) when the decoded text starts with
    optional whitespace followed by "<", None otherwise. """

    # Longer BOMs come before their prefixes (utf-32-le before utf-16-le)
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    payload, encoding = first_bytes, 'utf-8'
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            payload, encoding = first_bytes[len(bom):], bom_encoding
            break

    text = payload.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
2299
2300
def determine_protocol(info_dict):
    """ Return the protocol of info_dict

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from info_dict['url']: its rtmp/mms/rtsp prefix, then its m3u8/f4m
    extension, and finally its URL scheme. """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for streaming_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_prefix):
            return streaming_prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
2321
2322
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of the widest cell of every column
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Every column but the last one is left-aligned and padded
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2329
2330
2331 def _match_one(filter_part, dct):
2332 COMPARISON_OPERATORS = {
2333 '<': operator.lt,
2334 '<=': operator.le,
2335 '>': operator.gt,
2336 '>=': operator.ge,
2337 '=': operator.eq,
2338 '!=': operator.ne,
2339 }
2340 operator_rex = re.compile(r'''(?x)\s*
2341 (?P<key>[a-z_]+)
2342 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2343 (?:
2344 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2345 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2346 )
2347 \s*$
2348 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2349 m = operator_rex.search(filter_part)
2350 if m:
2351 op = COMPARISON_OPERATORS[m.group('op')]
2352 actual_value = dct.get(m.group('key'))
2353 if (m.group('strval') is not None or
2354 # If the original field is a string and matching comparisonvalue is
2355 # a number we should respect the origin of the original field
2356 # and process comparison value as a string (see
2357 # https://github.com/rg3/youtube-dl/issues/11082).
2358 actual_value is not None and m.group('intval') is not None and
2359 isinstance(actual_value, compat_str)):
2360 if m.group('op') not in ('=', '!='):
2361 raise ValueError(
2362 'Operator %s does not support string values!' % m.group('op'))
2363 comparison_value = m.group('strval') or m.group('intval')
2364 else:
2365 try:
2366 comparison_value = int(m.group('intval'))
2367 except ValueError:
2368 comparison_value = parse_filesize(m.group('intval'))
2369 if comparison_value is None:
2370 comparison_value = parse_filesize(m.group('intval') + 'B')
2371 if comparison_value is None:
2372 raise ValueError(
2373 'Invalid integer value %r in filter part %r' % (
2374 m.group('intval'), filter_part))
2375 if actual_value is None:
2376 return m.group('none_inclusive')
2377 return op(actual_value, comparison_value)
2378
2379 UNARY_OPERATORS = {
2380 '': lambda v: v is not None,
2381 '!': lambda v: v is None,
2382 }
2383 operator_rex = re.compile(r'''(?x)\s*
2384 (?P<op>%s)\s*(?P<key>[a-z_]+)
2385 \s*$
2386 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2387 m = operator_rex.search(filter_part)
2388 if m:
2389 op = UNARY_OPERATORS[m.group('op')]
2390 actual_value = dct.get(m.group('key'))
2391 return op(actual_value)
2392
2393 raise ValueError('Invalid filter part %r' % filter_part)
2394
2395
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # All "&"-separated parts have to match; stop at the first failing one
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2401
2402
def match_filter_func(filter_str):
    """ Build a function that checks an info dict against filter_str

    The returned function yields None when the dict passes the filter and a
    message explaining the skip otherwise. """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2411
2412
def parse_dfxp_time_expr(time_expr):
    """ Parse a DFXP time expression into seconds

    Supports plain offsets ("12", "12.5", "12.5s") and clock values
    ("H:MM:SS", "H:MM:SS.fff", "H:MM:SS:fff"). Returns None for empty or
    unsupported expressions. """
    if not time_expr:
        return None

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        # The fraction may be separated from the seconds by a colon
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2424
2425
def srt_subtitles_timecode(seconds):
    """ Format a number of seconds as an SRT timecode (HH:MM:SS,mmm)

    The value is rounded to the nearest millisecond. """
    # Convert to whole milliseconds once: formatting each float component
    # separately truncates and loses a millisecond for values such as 2.3
    # (2.3 % 1 * 1000 is 299.99...)
    total_msec = int(round(seconds * 1000))
    total_secs, msec = divmod(total_msec, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msec)
2428
2429
def dfxp2srt(dfxp_data):
    """
    Convert a DFXP/TTML subtitle document (text) into SRT text

    Every <p> element with a begin time and either an end time or a
    duration becomes one SRT cue numbered after its position among all
    paragraphs. Raises ValueError when the document has no paragraphs.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target collecting the text of a paragraph; <br> elements
        # are turned into line breaks
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-parse the serialized node so that the target sees its events
        parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = (
        dfxp.findall(_x('.//ttml:p')) or
        dfxp.findall(_x('.//ttaf1:p')) or
        dfxp.findall(_x('.//ttaf1_0604:p')) or
        dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    srt_cues = []
    for index, para in enumerate(paras, 1):
        attrib = para.attrib
        begin_time = parse_dfxp_time_expr(attrib.get('begin'))
        end_time = parse_dfxp_time_expr(attrib.get('end'))
        dur = parse_dfxp_time_expr(attrib.get('dur'))
        # Skip paragraphs whose time span cannot be determined
        if begin_time is None or (not end_time and not dur):
            continue
        if not end_time:
            end_time = begin_time + dur
        srt_cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(srt_cues)
2483
2484
def cli_option(params, command_option, param):
    """ Return [command_option, value] for params[param], or [] when the
    parameter is missing or None. Truthy values are converted with
    compat_str, falsy non-None values are passed through unchanged. """
    value = params.get(param)
    if value is None:
        return []
    if value:
        value = compat_str(value)
    return [command_option, value]
2490
2491
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Render the boolean params[param] as a command line option

    Returns [command_option, value], or a single joined argument when a
    separator is given; value is true_value or false_value. The parameter
    has to be a bool. """
    flag = params.get(param)
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if not separator:
        return [command_option, value]
    return [command_option + separator + value]
2498
2499
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] when params[param] equals expected_value,
    [] otherwise """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2503
2504
def cli_configuration_args(params, param, default=[]):
    """ Return the list of extra arguments stored in params[param], or
    default when the parameter is missing or None

    The default list object is shared between calls and returned as is, so
    callers must not modify the result in place. """
    ex_args = params.get(param)
    if ex_args is not None:
        assert isinstance(ex_args, list)
        return ex_args
    return default
2511
2512
class ISO639Utils(object):
    """ Conversion between two-letter and three-letter language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so anything following
        # them (e.g. the "-US" of "en-US") is ignored
        two_letter_code = code[:2]
        return cls._lang_map.get(two_letter_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        candidates = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        # None when no entry maps to the requested code
        return next(candidates, None)
2713
2714
class ISO3166Utils(object):
    """ Lookup of country names by their two-letter country code """

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the
        corresponding full name; the lookup is case-insensitive and yields
        None for unknown codes"""
        country_code = code.upper()
        return cls._country_map.get(country_code)
2973
2974
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ Proxy handler whose proxy can be overridden for a single request
    through the Ytdl-request-proxy header """

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # The current values are bound as lambda defaults so that every
            # handler keeps its own scheme
            default_handler = (
                lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                    meth(r, proxy, type))
            setattr(self, '%s_open' % type, default_handler)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        SOCKS_SCHEMES = ('socks', 'socks4', 'socks4a', 'socks5')

        # A proxy set on the request itself replaces the configured one; the
        # header carrying it is removed from the request
        if req.headers.get('Ytdl-request-proxy') is not None:
            proxy = req.headers.pop('Ytdl-request-proxy')

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in SOCKS_SCHEMES:
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2998
2999
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as one big hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
3015
3016
def encode_base_n(num, n, table=None):
    """
    Convert the non-negative integer num to its base n representation

    table holds the digits to use and needs at least n characters; it
    defaults to the first n characters of 0-9, a-z, A-Z.
    Raises ValueError when n exceeds the table length, when num is negative
    or when a non-zero num is to be encoded with a base lower than 2.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # The digit loop below only terminates for a positive num and a base of
    # at least 2; reject everything else instead of looping forever
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('invalid base %d' % n)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
3033
3034
def decode_packed_codes(code):
    """ Unpack code matched by PACKED_CODES_RE: every word of the packed
    payload is replaced by the symbol stored under its base-n index (or by
    the word itself when that symbol is empty) """
    mobj = re.search(PACKED_CODES_RE, code)
    packed_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    # Filled from the highest index down to 0
    for index in reversed(range(count)):
        word = encode_base_n(index, base)
        symbol_table[word] = symbols[index] or word

    return re.sub(
        r'\b(\w+)\b', lambda word_match: symbol_table[word_match.group(0)],
        packed_code)
3051
3052
def parse_m3u8_attributes(attrib):
    """ Parse an m3u8 attribute list (KEY=value,KEY="quoted value",...) into
    a dict; surrounding double quotes are removed from the values """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return dict(
        (key, val[1:-1] if val.startswith('"') else val)
        for key, val in pairs)
3060
3061
def urshift(val, n):
    """ Shift val right by n bits, mapping negative values into the unsigned
    32-bit range (by adding 2**32) first """
    if val < 0:
        val += 0x100000000
    return val >> n
3064
3065
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """
    Decode the image data of a PNG file

    Returns (width, height, pixels) where pixels holds one list per row
    with 3 values per pixel. Rows are read as 3 bytes per pixel; other
    layouts are not handled.
    Raises IOError for data without PNG signature, IHDR or IDAT chunk.
    """
    # Reference: https://www.w3.org/TR/PNG/
    PNG_SIGNATURE = b'\x89PNG\x0d\x0a\x1a\x0a'
    header = png_data[8:]

    if png_data[:8] != PNG_SIGNATURE or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}

    def unpack_integer(x):
        # Big-endian unsigned integer of 1, 2 or 4 bytes
        return compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    while header:
        # Chunk layout: 4 bytes length, 4 bytes type, data, 4 bytes CRC
        length = unpack_integer(header[:4])
        chunk_type = header[4:8]
        chunk_data = header[8:8 + length]
        header = header[12 + length:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''.join(
        chunk['data'] for chunk in chunks if chunk['type'] == b'IDAT')

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every row is preceded by one byte holding its filter type
        row_start = y * (1 + stride)
        filter_type = decompressed_data[row_start]

        previous_row = pixels[y - 1] if y > 0 else None
        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + row_start + x]
            # Same channel of the pixel to the left / of the pixel above
            left = current_row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[x - 3] if x > 2 and y > 0 else 0

                estimate = left + up - upper_left

                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                # Pick the neighbour closest to the estimate
                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    color = (color + left) & 0xff
                elif dist_up <= dist_upper_left:
                    color = (color + up) & 0xff
                else:
                    color = (color + upper_left) & 0xff

            current_row.append(color)

    return width, height, pixels
3171
3172
def write_xattr(path, key, value):
    """
    Set the extended attribute key of the file path to value

    The xattr python module is used when it can be imported. Otherwise the
    value is written to an NTFS Alternate Data Stream on Windows, or passed
    to the setfattr or xattr executable on other systems.

    Raises XAttrMetadataError when writing the attribute fails and
    XAttrUnavailableError when no usable implementation is found.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE(review): this raise is only caught by the ImportError
                # handler below if XAttrUnavailableError derives from
                # ImportError - confirm; otherwise no fallback takes place
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No xattr python module: use what the platform provides
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The command line tools get the value as text
                value = value.decode('utf-8')
                # setfattr is preferred when both tools are available
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                # A non-zero exit status is reported with the tool's stderr
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")