Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
debian/control: The package complies with policy 3.9.7.
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import itertools
18 import io
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import ssl
28 import socket
29 import struct
30 import subprocess
31 import sys
32 import tempfile
33 import traceback
34 import xml.etree.ElementTree
35 import zlib
36
37 from .compat import (
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_http_client,
43 compat_kwargs,
44 compat_parse_qs,
45 compat_socket_create_connection,
46 compat_str,
47 compat_urllib_error,
48 compat_urllib_parse,
49 compat_urllib_parse_urlparse,
50 compat_urllib_request,
51 compat_urlparse,
52 shlex_quote,
53 )
54
55
# The type of a compiled regular expression is not exposed under a clearly
# defined name, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Headers added to every outgoing HTTP request that does not already set them.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel for "no default supplied", so that None stays a usable default.
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]

# File extensions recognised by determine_ext() even when followed by a slash.
KNOWN_EXTENSIONS = (
    # MPEG-4 family
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    # Flash
    'flv', 'f4v', 'f4a', 'f4b',
    # WebM / Ogg
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    # Matroska
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    # Manifests / playlists
    'f4f', 'f4m', 'm3u8', 'smil',
)
88
89
def preferredencoding():
    """Return the encoding to use for text on this system.

    Starts from locale.getpreferredencoding() and verifies that the codec
    really exists by encoding a probe string; if either step fails for any
    reason, 'UTF-8' is returned instead.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown encoding name raises here.
        'TEST'.encode(encoding)
        return encoding
    except Exception:
        return 'UTF-8'
103
104
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info >= (3, 0) or sys.platform == 'win32':
        path_basename = os.path.basename
        path_dirname = os.path.dirname
    else:
        # os.path.basename/dirname return bytes here, but NamedTemporaryFile
        # fails on non-ASCII names unless it is handed unicode objects.
        encoding = get_filesystem_encoding()

        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)

    # The temporary file lives next to the target so the final rename stays
    # on the same filesystem.
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # json.dump wants a byte stream on Python 2.x and a character stream
    # on Python 3.x.
    if sys.version_info >= (3, 0):
        args['mode'] = 'w'
        args['encoding'] = 'utf-8'
    else:
        args['mode'] = 'wb'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # os.rename refuses to overwrite on Windows (WindowsError or
            # FileExistsError), so drop any existing target first.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; the original error is
        # re-raised either way.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
157
158
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val:
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        if val is None:
            # Only require the attribute to be present.
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Python 2.6 quirk: with a unicode xpath, .//node does not match a
        # node that is a direct child of the context node.
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for candidate in node.findall(xpath):
            if key in candidate.attrib and (
                    val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
180
181 # On python2.6 the xml.etree.ElementTree.Element methods don't support
182 # the namespace parameter
183
184
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI. Steps without a colon are
    kept unchanged; an unknown prefix raises KeyError.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        ns, tag = parts
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(expand(step) for step in path.split('/'))
195
196
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element under node matching xpath.

    xpath is either a single expression or an iterable of expressions that
    are tried in order until one matches.

    When nothing matches: default is returned if one was supplied;
    otherwise ExtractorError is raised if fatal is set (name, or the xpath
    itself, is used in the message); otherwise None is returned.
    """
    def _find_xpath(xpath):
        if sys.version_info < (2, 7):  # Crazy 2.6
            xpath = xpath.encode('ascii')
        return node.find(xpath)

    # Start from "not found" so an empty iterable of expressions follows the
    # default/fatal/None contract instead of raising UnboundLocalError.
    n = None
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
220
221
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    A missing element is handled by xpath_element() itself. An element that
    exists but has no text follows the same rules: default if supplied,
    otherwise ExtractorError when fatal, otherwise None.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        # Nothing was found (or the default was handed back): pass it on.
        return element

    text = element.text
    if text is not None:
        return text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
235
236
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first xpath match that carries it.

    When no such element exists: default if supplied, otherwise
    ExtractorError when fatal, otherwise None.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
248
249
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the HTML document"""
    # An id lookup is just an attribute lookup on 'id'.
    return get_element_by_attribute('id', id, html)
253
254
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag carrying attribute=value in the HTML document.

    Returns None when no such tag is found. HTML entities in the content
    are unescaped.
    """
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content that opens with a quote loses its first and last character.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
276
277
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    # None passes through, which is convenient for optional descriptions.
    if html is None:
        return html

    # Literal newlines carry no meaning in HTML; <br> and paragraph
    # boundaries are what become line breaks.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop every remaining tag, then decode the entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
293
294
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a sanitized name if that fails.

    '-' means standard output (its binary buffer when available). If the
    first open() fails for any reason other than a permission error, the
    name is passed through sanitize_path() and, if that changed it, opened
    again; an error from the second attempt propagates to the caller.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            out = getattr(sys.stdout, 'buffer', sys.stdout)
            return (out, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        # A permission problem will not be cured by renaming.
        if err.errno == errno.EACCES:
            raise

        # Otherwise try again without the characters win32 forbids.
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
325
326
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input yields None rather than an error.
        return None
    return email.utils.mktime_tz(parsed)
334
335
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped outright.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (
                char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps: 12:34:56 becomes 12_34_56 before the generic
    # colon replacement can touch it.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are kept as close to the original as possible.
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result if result else '_'
372
373
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On any other platform s is returned unchanged. On Windows the path is
    normalized and, in every component except '.' and '..', each forbidden
    character (/ < > : " | \\ ? *) and a trailing whitespace character or
    dot is replaced with '#'. A drive or UNC prefix is preserved as is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component left in front of the leading separator.
        norm_path.pop(0)
    # Raw string: same pattern as before, but without the invalid '\s'
    # string escape that newer Python versions warn about.
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
390
391
# Protocol-less URLs (starting with `//`) get an `http:` scheme prepended,
# which avoids failures caused purely by the missing protocol.
def sanitized_Request(url, *args, **kwargs):
    if url.startswith('//'):
        url = 'http:%s' % url
    return compat_urllib_request.Request(url, *args, **kwargs)
397
398
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, first occurrence first """
    unique = []
    for item in iterable:
        # Equality-based membership test, so unhashable items work as well.
        if item in unique:
            continue
        unique.append(item)
    return unique
406
407
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Named entity known to the entity table.
    known = compat_html_entities.name2codepoint
    if entity in known:
        return compat_chr(known[entity])

    # Numeric entity, decimal (#123) or hexadecimal (#x7b).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        digits = mobj.group(1)
        is_hex = digits.startswith('x')
        if is_hex:
            # '0' + 'x7b' gives the '0x7b' form int() accepts for base 16.
            digits = '0%s' % digits
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(digits, 16 if is_hex else 10))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
430
431
def unescapeHTML(s):
    """Replace every &...; entity in s with its character; None stays None."""
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', decode_entity, s)
439
440
def get_subprocess_encoding():
    """Return the encoding used for filenames and subprocess arguments."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    # Fall back to UTF-8 when the filesystem encoding is not known.
    return 'utf-8' if encoding is None else encoding
451
452
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Set when the result is passed to a subprocess
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up: hand the unicode name through untouched so the
    # Unicode APIs are used. (Detecting Windows NT 4 is tricky because
    # 'major >= 4' would match Windows 9x series as well. Besides, NT 4 is
    # obsolete.)
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if on_modern_windows and not for_subprocess:
        return s

    # Characters the target encoding cannot represent are dropped.
    return s.encode(get_subprocess_encoding(), 'ignore')
471
472
def decodeFilename(b, for_subprocess=False):
    """Decode a byte filename on Python 2; return anything else unchanged."""
    # On Python 3, and for values that are not bytes, there is nothing to do.
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    # Undecodable bytes are dropped.
    return b.decode(get_subprocess_encoding(), 'ignore')
482
483
def encodeArgument(s):
    """Encode a command-line argument for a subprocess call."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
491
492
def decodeArgument(b):
    """Decode a subprocess argument; counterpart of encodeArgument()."""
    return decodeFilename(b, for_subprocess=True)
495
496
def decodeOption(optval):
    """Return an option value as text; bytes are decoded, None stays None."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
505
506
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The larger formats are used only strictly above 3600 and 60 seconds.
    """
    if secs > 3600:
        hours = secs // 3600
        minutes = (secs % 3600) // 60
        return '%d:%02d:%02d' % (hours, minutes, secs % 60)
    if secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
514
515
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with an SSL setup fitting this Python.

    Certificate checks are switched off when params['nocheckcertificate']
    is true.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
539
540
def bug_reports_message():
    """Return the text appended to unexpected errors, asking for a bug report."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join([
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ])
550
551
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended.
        """
        # Errors raised while handling a network failure or an unavailable
        # format are always treated as expected.
        expected_sources = (
            compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_sources:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors ask the user to file a bug report.
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as a string, or None if there is none."""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
579
580
class UnsupportedError(ExtractorError):
    """Expected error for a URL that is not supported; the URL is kept in .url."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
586
587
class RegexNotFoundError(ExtractorError):
    """Extraction error for a regular expression that did not match."""
591
592
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Keep the originating exception details available to handlers.
        self.exc_info = exc_info
605
606
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
614
615
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is exposed as .msg for handlers.
        self.msg = msg
625
626
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
630
631
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
639
640
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are in bytes.
        self.downloaded, self.expected = downloaded, expected
653
654
655 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
656 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
657 # expected HTTP responses to meet HTTP/1.0 or later (see also
658 # https://github.com/rg3/youtube-dl/issues/6727)
659 if sys.version_info < (3, 0):
660 kwargs[b'strict'] = True
661 hc = http_class(*args, **kwargs)
662 source_address = ydl_handler._params.get('source_address')
663 if source_address is not None:
664 sa = (source_address, 0)
665 if hasattr(hc, 'source_address'): # Python 2.7+
666 hc.source_address = sa
667 else: # Python 2.6
668 def _hc_connect(self, *args, **kwargs):
669 sock = compat_socket_create_connection(
670 (self.host, self.port), self.timeout, sa)
671 if is_https:
672 self.sock = ssl.wrap_socket(
673 sock, self.key_file, self.cert_file,
674 ssl_version=ssl.PROTOCOL_TLSv1)
675 else:
676 self.sock = sock
677 hc.connect = functools.partial(_hc_connect, hc)
678
679 return hc
680
681
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, headers is returned as is. With it, a new dict is
    returned that lacks both the marker and any Accept-Encoding header
    (matched case-insensitively); the dict passed in is not modified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    filtered_headers = {}
    for key, value in headers.items():
        if key == marker or key.lower() == 'accept-encoding':
            continue
        filtered_headers[key] = value
    return filtered_headers
690
691
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        # params is kept so that _create_http_connection() can read
        # 'source_address' from it.
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Connections are created through _create_http_connection so that
        # the source address workaround applies.
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and then a zlib-wrapped one."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting the status code by hand where the constructor lacks it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Escape the request URL, add the standard headers and apply internal marker headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            # Rebuild the request with the escaped URL, keeping HEAD
            # requests as HEAD requests.
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Decompress gzip/deflate bodies and percent-encode redirect locations."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; the first
                # attempt that decompresses wins.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure.
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing.
    https_request = http_request
    https_response = http_response
815
816
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that creates its connections via _create_http_connection()."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        if https_conn_class:
            self._https_conn_class = https_conn_class
        else:
            self._https_conn_class = compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward whichever SSL settings the base handler stored on self.
        extra = {}
        if hasattr(self, '_context'):  # python > 2.6
            extra['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            extra['check_hostname'] = self._check_hostname

        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **extra)
832
833
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles https requests and responses.

    The Set-Cookie escaping workaround described in http_response() is
    currently commented out, so responses are passed to the base class
    unchanged.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the http hooks for https traffic.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
856
857
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter separates the date from the time. timezone, if given, is a
    datetime.timedelta UTC offset; otherwise a trailing 'Z' or +HH:MM /
    -HHMM suffix is parsed and removed (no suffix means UTC). Returns None
    for None input or a string that does not match the expected format.
    """

    if date_str is None:
        return None

    # Fractional seconds are not supported by the format below.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        # Default to UTC; only an explicit signed offset changes that.
        timezone = datetime.timedelta()
        tz_match = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if tz_match:
            date_str = date_str[:-len(tz_match.group(0))]
            if tz_match.group('sign'):
                sign = 1 if tz_match.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(tz_match.group('hours')),
                    minutes=sign * int(tz_match.group('minutes')))
    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_time = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
887
888
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects day/month/year over month/day/year for the ambiguous
    numeric formats. Returns None for None input or an unparseable date.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2, so cut the offset
    # off, unless the whole string is a dashed date whose year would
    # otherwise be taken for an offset.
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    unambiguous_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        ambiguous_formats = [
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        ambiguous_formats = [
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    upload_date = None
    # Every format is tried; the last one that parses determines the result.
    for expression in unambiguous_formats + ambiguous_formats:
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
        except ValueError:
            continue
        upload_date = parsed.strftime('%Y%m%d')

    if upload_date is None:
        # Last resort: the e-mail style date parser.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')

    if upload_date is None:
        return None
    return compat_str(upload_date)
953
954
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, falling back to default_ext."""
    if url is None:
        return default_ext

    # Whatever follows the last dot, with any query string removed.
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    without_slash = candidate.rstrip('/')
    if without_slash in KNOWN_EXTENSIONS:
        return without_slash
    return default_ext
966
967
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
970
971
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if the string matches none of these formats."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: avoids the invalid '\d' string escape that newer Python
    # versions warn about; the pattern itself is unchanged.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names ('days', 'weeks').
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
999
1000
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1009
1010
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a missing bound defaults to the earliest/latest representable date"""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)

        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)

        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string.
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1040
1041
def platform_name():
    """ Return platform.platform() as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode byte strings with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1050
1051
def _windows_write_string(s, out):
    """ Try to write s to a Windows console through WriteConsoleW.

    Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps a file descriptor to the matching GetStdHandle argument
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # Either the stream has no fileno at all (it is virtual) or it is
        # some strange Windows pseudo file
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(
        ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(
        (b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        file_type = GetFileType(handle) & ~FILE_TYPE_REMOTE
        if file_type != FILE_TYPE_CHAR:
            return True
        return GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0

    if not_a_console(h):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none
        for pos, char in enumerate(text):
            if ord(char) > 0xffff:
                return pos
        return len(text)

    remaining = s
    while remaining:
        # Write at most 1024 characters, stopping before a non-BMP character
        count = min(next_nonbmp_pos(remaining), 1024)

        ret = WriteConsoleW(
            h, remaining, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if count:
            assert written.value > 0
            remaining = remaining[written.value:]
        else:  # We just wrote a non-BMP character
            assert written.value == 2
            remaining = remaining[1:]
    return True
1125
1126
def write_string(s, out=None, encoding=None):
    """ Write the text s to out (sys.stderr when out is None), then flush.

    Byte-oriented streams receive s encoded with encoding, the stream's own
    encoding or the preferred encoding; unencodable characters are dropped. """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if try_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    if 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1147
1148
def bytes_to_intlist(bs):
    """ Convert a byte string into a list of integer byte values """
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 and 1-char strings on Python 2
    if isinstance(bs[0], int):
        return list(bs)
    return list(map(ord, bs))
1156
1157
def intlist_to_bytes(xs):
    """ Pack a list of integer byte values into a byte string """
    if not xs:
        return b''
    spec = '%dB' % len(xs)
    return struct_pack(spec, *xs)
1162
1163
1164 # Cross-platform file locking
1165 if sys.platform == 'win32':
1166 import ctypes.wintypes
1167 import msvcrt
1168
1169 class OVERLAPPED(ctypes.Structure):
1170 _fields_ = [
1171 ('Internal', ctypes.wintypes.LPVOID),
1172 ('InternalHigh', ctypes.wintypes.LPVOID),
1173 ('Offset', ctypes.wintypes.DWORD),
1174 ('OffsetHigh', ctypes.wintypes.DWORD),
1175 ('hEvent', ctypes.wintypes.HANDLE),
1176 ]
1177
1178 kernel32 = ctypes.windll.kernel32
1179 LockFileEx = kernel32.LockFileEx
1180 LockFileEx.argtypes = [
1181 ctypes.wintypes.HANDLE, # hFile
1182 ctypes.wintypes.DWORD, # dwFlags
1183 ctypes.wintypes.DWORD, # dwReserved
1184 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1185 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1186 ctypes.POINTER(OVERLAPPED) # Overlapped
1187 ]
1188 LockFileEx.restype = ctypes.wintypes.BOOL
1189 UnlockFileEx = kernel32.UnlockFileEx
1190 UnlockFileEx.argtypes = [
1191 ctypes.wintypes.HANDLE, # hFile
1192 ctypes.wintypes.DWORD, # dwReserved
1193 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1194 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1195 ctypes.POINTER(OVERLAPPED) # Overlapped
1196 ]
1197 UnlockFileEx.restype = ctypes.wintypes.BOOL
1198 whole_low = 0xffffffff
1199 whole_high = 0x7fffffff
1200
1201 def _lock_file(f, exclusive):
1202 overlapped = OVERLAPPED()
1203 overlapped.Offset = 0
1204 overlapped.OffsetHigh = 0
1205 overlapped.hEvent = 0
1206 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1207 handle = msvcrt.get_osfhandle(f.fileno())
1208 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1209 whole_low, whole_high, f._lock_file_overlapped_p):
1210 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1211
1212 def _unlock_file(f):
1213 assert f._lock_file_overlapped_p
1214 handle = msvcrt.get_osfhandle(f.fileno())
1215 if not UnlockFileEx(handle, 0,
1216 whole_low, whole_high, f._lock_file_overlapped_p):
1217 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1218
1219 else:
1220 import fcntl
1221
1222 def _lock_file(f, exclusive):
1223 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1224
1225 def _unlock_file(f):
1226 fcntl.flock(f, fcntl.LOCK_UN)
1227
1228
class locked_file(object):
    """ Context manager that opens a file and holds a lock on it.

    The lock is taken on __enter__ (shared for mode 'r', exclusive for 'a'
    and 'w') and released, together with the file, on __exit__. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the file is never left open
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1258
1259
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' when it is None """
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1263
1264
def shell_quote(args):
    """ Quote each argument for a POSIX shell and join them with spaces.

    Byte-string arguments are decoded with the filesystem encoding first. """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote (already imported from .compat) replaces pipes.quote:
        # the pipes module is deprecated and removed in Python 3.13
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1274
1275
def smuggle_url(url, data):
    """ Append JSON-encoded data to url as a fragment, for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1282
1283
def unsmuggle_url(smug_url, default=None):
    """ Split a smuggled URL into (url, data).

    Returns (smug_url, default) when nothing was smuggled. """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1291
1292
def format_bytes(bytes):
    """ Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    bytes may be a number, a numeric string or None (rendered as 'N/A'). """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the known suffixes: values below 1 give a negative
        # exponent (which would index from the end of the list) and values
        # of 1024**9 or more would run past it
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1305
1306
def parse_filesize(s):
    """ Parse a size such as '5 GB' or '1,24 KiB' into a number of bytes.

    Returns None for None or for a string that does not start with a
    number followed by a known unit. """
    if s is None:
        return None

    # For each prefix: 'XiB' is binary and 'XB' decimal.  The lower-case
    # forms are of course incorrect and unofficial, but we support those
    # too ('xB' as binary, 'Xb' as decimal)
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    # A comma is accepted as the decimal separator
    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
1359
1360
def month_by_name(name):
    """ Return the 1-based number of the month with the given
    (locale-independent) English name, or None if there is none """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1368
1369
def month_by_abbreviation(abbrev):
    """ Return the 1-based number of the month whose (locale-independent)
    English name starts with the three-letter abbreviation, or None """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1378
1379
def fix_xml_ampersands(xml_str):
    """Escape every '&' that does not start a known entity as '&amp;'"""
    # An ampersand not followed by a named or numeric entity
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1386
1387
def setproctitle(title):
    """ Set the process name through libc's prctl.

    Does nothing when libc.so.6 cannot be loaded or has no prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # item for the terminating NUL; a buffer of exactly len(title_bytes)
    # would be handed to prctl unterminated
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1401
1402
def remove_start(s, start):
    """ Return s without the prefix start, or s unchanged if it lacks it """
    return s[len(start):] if s.startswith(start) else s
1407
1408
def remove_end(s, end):
    """ Return s without the suffix end, or s unchanged if it lacks it """
    # An empty suffix must be skipped: s[:-0] is s[:0], i.e. an empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1413
1414
def remove_quotes(s):
    """ Strip one pair of matching single or double quotes around s """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1422
1423
def url_basename(url):
    """ Return the last segment of the URL's path """
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
1427
1428
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
1432
1433
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to int, multiply by invscale and floor-divide by scale.

    If get_attr is given, the attribute of that name is read from v first.
    Returns default when v is None, '' or cannot be converted. """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        value = int(v)
    except (ValueError, TypeError):
        # TypeError covers objects int() cannot handle at all (lists, dicts)
        return default
    return value * invscale // scale
1446
1447
def str_or_none(v, default=None):
    """ Return compat_str(v), or default when v is None """
    if v is None:
        return default
    return compat_str(v)
1450
1451
def str_to_int(int_str):
    """ Like int_or_none but more relaxed: ',', '.' and '+' are ignored """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1458
1459
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to float, multiply by invscale and divide by scale.

    Returns default when v is None or cannot be converted. """
    if v is None:
        return default
    try:
        value = float(v)
    except (ValueError, TypeError):
        # TypeError covers objects float() cannot handle at all (lists, dicts)
        return default
    return value * invscale / scale
1467
1468
def parse_duration(s):
    """ Parse a textual duration ('1:02:03', '3 min', 'PT1H2M3S', ...) into
    a number of seconds; returns None when s is not a string or does not
    match any supported form """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # A lone minutes or hours value may be fractional
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Seconds contributed by one unit of each integer group
    unit_seconds = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, factor in unit_seconds:
        captured = m.group(group_name)
        if captured:
            res += int(captured) * factor
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
1513
1514
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext before the real extension of filename.

    When expected_real_ext is given and does not match the real extension,
    ext is appended to the whole filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1521
1522
def replace_extension(filename, ext, expected_real_ext=None):
    """ Replace the real extension of filename with ext.

    When expected_real_ext is given and does not match the real extension,
    ext is appended to the whole filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
1528
1529
def check_executable(exe, args=[]):
    """ Try to run the given binary and return its name, or False when it
    cannot be started.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1538
1539
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run the executable and extract its version from the output.

    Returns False if the executable cannot be started; otherwise whatever
    detect_exe_version makes of the combined stdout/stderr. """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):
        # Decode leniently; non-ASCII bytes are dropped
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1553
1554
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Return the first group of version_re found in output, or
    unrecognized when the pattern does not occur """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    m = re.search(pattern, output)
    return m.group(1) if m else unrecognized
1564
1565
class PagedList(object):
    """ Base class for paged lists; subclasses provide getslice() """

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1570
1571
class OnDemandPagedList(PagedList):
    """ Paged list that calls pagefunc(pagenum) only for the pages a slice
    touches, stopping at the first page that is not full """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the items with index in [start, end) as a list """
        res = []
        pagesize = self._pagesize
        for pagenum in itertools.count(start // pagesize):
            firstid = pagenum * pagesize
            nextfirstid = firstid + pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the slice start within this page, if it lies here
            if firstid <= start < nextfirstid:
                startv = start % pagesize
            else:
                startv = 0

            # Offset of the slice end within this page, if it lies here
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1613
1614
class InAdvancePagedList(PagedList):
    """ Paged list whose number of pages is known up front """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the items with index in [start, end) as a list """
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = end // self._pagesize + 1
            only_more = end - start
        # Items to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    # This page completes the requested slice
                    res.extend(page[:only_more])
                    break
                only_more -= len(page)
            res.extend(page)
        return res
1642
1643
def uppercase_escape(s):
    """ Decode every \\UXXXXXXXX escape in s into its character """
    decode_escape = codecs.getdecoder('unicode_escape')

    def _decode(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode, s)
1650
1651
def lowercase_escape(s):
    """ Decode every \\uXXXX escape in s into its character """
    decode_escape = codecs.getdecoder('unicode_escape')

    def _decode(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode, s)
1658
1659
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, text is encoded to UTF-8 before quoting
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1665
1666
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parsed = compat_urllib_parse_urlparse(url)
    # Scheme and netloc are left as they are
    path = escape_rfc3986(parsed.path)
    params = escape_rfc3986(parsed.params)
    query = escape_rfc3986(parsed.query)
    fragment = escape_rfc3986(parsed.fragment)
    escaped = parsed._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1676
# Probe whether struct accepts the format string type used in this file
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """ struct.pack that also accepts a text format string """
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(byte_spec, *args)

    def struct_unpack(spec, *args):
        """ struct.unpack that also accepts a text format string """
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(byte_spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1693
1694
def read_batch_urls(batch_fd):
    """ Read URLs, one per line, from the file object batch_fd and close it.

    A leading byte order mark and surrounding whitespace are removed; empty
    lines and lines starting with '#', ';' or ']' are skipped. """
    # A UTF-8 BOM decodes to U+FEFF.  The second entry is the same three
    # bytes decoded one character per byte, which is all the previous code
    # looked for; it is kept for input that was decoded that way.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1709
1710
def urlencode_postdata(*args, **kargs):
    """ urlencode the arguments and return the result as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1713
1714
def encode_dict(d, encoding='utf-8'):
    """ Return a copy of d with every string key and value encoded """
    def encode(v):
        if isinstance(v, compat_basestring):
            return v.encode(encoding)
        return v
    return dict((encode(key), encode(value)) for key, value in d.items())
1719
1720
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Look up one key, or the first usable key of a list/tuple, in d.

    With several keys, missing and None values are skipped, as are falsy
    values unless skip_false_values is False; default is returned when no
    key qualifies. """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1729
1730
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Return string as a compat_str, decoding it with encoding if needed """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1733
1734
# Age limit associated with each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """ Parse an age limit such as '18' or '18+', or a US rating label.

    Returns an int, or None when s is None or not recognised. """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s)
1749
1750
def strip_jsonp(code):
    """ Remove a JSONP callback wrapper, leaving only its argument """
    # callback name, '(', payload, ')', optional ';' and trailing // comments
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1754
1755
def js_to_json(code):
    """ Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted strings
    and trailing commas before ']' or '}' are removed. """
    # Rewrites applied inside a single-quoted string
    single_quoted_fixups = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            # \' is not a valid JSON escape
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            v = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: single_quoted_fixups[esc.group(0)],
                v[1:-1])
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas
    return re.sub(r',(\s*[\]}])', r'\1', res)
1779
1780
def qualities(quality_ids):
    """ Build a function mapping a quality id to its position in
    quality_ids, or to -1 when the id is unknown """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1789
1790
# %-style template built from the 'title', 'id' and 'ext' fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'


def limit_length(s, length):
    """ Shorten s to at most length characters, ending in an ellipsis when
    it had to be cut; None is passed through """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
1802
1803
def version_tuple(v):
    """ Split a version string on '-' and '.' into a tuple of ints """
    return tuple(map(int, re.split(r'[-.]', v)))
1806
1807
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    An empty or unparsable version counts as outdated only when assume_new
    is false. """
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
1815
1816
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter or sys.frozen is set
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
1822
1823
def args_to_str(args):
    """ Get a short string representation for a subprocess command """
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
1827
1828
def error_to_compat_str(err):
    """ Return the message of err as text """
    err_str = str(err)
    if sys.version_info[0] >= 3:
        return err_str
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return err_str.decode(preferredencoding())
1836
1837
def mimetype2ext(mt):
    """ Map a MIME type to a file extension.

    Parameters after ';' (e.g. a charset) are ignored.  Returns None for
    None; unknown types yield their subtype. """
    if mt is None:
        # E.g. a response without a Content-Type header
        return None

    # Drop parameters such as '; charset=utf-8'
    mt = mt.split(';')[0].strip()

    # Types whose extension cannot be derived from the subtype alone
    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
1858
1859
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the headers of a response.

    A filename in Content-Disposition is tried first, then Content-Type. """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) if cd else None
    if m:
        e = determine_ext(m.group('filename'), default_ext=None)
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'))
1876
1877
def encode_data_uri(data, mime_type):
    """ Build a base64 data: URI from the bytes data and a MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
1880
1881
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Either no limit is set, or the content is available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1890
1891
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) when the decoded text starts with
    optional whitespace followed by '<', None otherwise. """

    # The UTF-32 marks come first: the UTF-32-LE one starts with the
    # UTF-16-LE one
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, payload = 'utf-8', first_bytes
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = enc, first_bytes[len(bom):]
            break
    s = payload.decode(encoding, 'replace')

    return re.match(r'^\s*<', s)
1910
1911
def determine_protocol(info_dict):
    """ Return the 'protocol' of info_dict, or derive one from its 'url' """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest formats are recognised by their extension
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1932
1933
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell of each column
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    # Every column but the last is left-aligned and padded to width + 1
    padded_columns = ['%%-%ds' % (ml + 1) for ml in max_lens[:-1]]
    format_str = ' '.join(padded_columns) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
1940
1941
1942 def _match_one(filter_part, dct):
1943 COMPARISON_OPERATORS = {
1944 '<': operator.lt,
1945 '<=': operator.le,
1946 '>': operator.gt,
1947 '>=': operator.ge,
1948 '=': operator.eq,
1949 '!=': operator.ne,
1950 }
1951 operator_rex = re.compile(r'''(?x)\s*
1952 (?P<key>[a-z_]+)
1953 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1954 (?:
1955 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1956 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1957 )
1958 \s*$
1959 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1960 m = operator_rex.search(filter_part)
1961 if m:
1962 op = COMPARISON_OPERATORS[m.group('op')]
1963 if m.group('strval') is not None:
1964 if m.group('op') not in ('=', '!='):
1965 raise ValueError(
1966 'Operator %s does not support string values!' % m.group('op'))
1967 comparison_value = m.group('strval')
1968 else:
1969 try:
1970 comparison_value = int(m.group('intval'))
1971 except ValueError:
1972 comparison_value = parse_filesize(m.group('intval'))
1973 if comparison_value is None:
1974 comparison_value = parse_filesize(m.group('intval') + 'B')
1975 if comparison_value is None:
1976 raise ValueError(
1977 'Invalid integer value %r in filter part %r' % (
1978 m.group('intval'), filter_part))
1979 actual_value = dct.get(m.group('key'))
1980 if actual_value is None:
1981 return m.group('none_inclusive')
1982 return op(actual_value, comparison_value)
1983
1984 UNARY_OPERATORS = {
1985 '': lambda v: v is not None,
1986 '!': lambda v: v is None,
1987 }
1988 operator_rex = re.compile(r'''(?x)\s*
1989 (?P<op>%s)\s*(?P<key>[a-z_]+)
1990 \s*$
1991 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1992 m = operator_rex.search(filter_part)
1993 if m:
1994 op = UNARY_OPERATORS[m.group('op')]
1995 actual_value = dct.get(m.group('key'))
1996 return op(actual_value)
1997
1998 raise ValueError('Invalid filter part %r' % filter_part)
1999
2000
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # Every '&'-separated part must pass; stop at the first one that fails
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2006
2007
def match_filter_func(filter_str):
    """ Build a function that checks an info dict against filter_str.

    The function returns None when the dict passes and a message naming
    the video otherwise. """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2016
2017
def parse_dfxp_time_expr(time_expr):
    """ Parse a DFXP time expression into seconds.

    Accepts an offset such as '12.5' or '12.5s' and a clock time such as
    '00:01:02.5' or '00:01:02:500'; returns None for anything else. """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # The fraction may be separated from the seconds by ':' or '.'
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
2029
2030
def srt_subtitles_timecode(seconds):
    """ Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm' """
    # Round to whole milliseconds first: truncating the float parts
    # separately turns e.g. 2.3 into ',299' instead of ',300'
    total_ms = int(round(seconds * 1000))
    total_secs, ms = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, ms)
2033
2034
def dfxp2srt(dfxp_data):
    """ Convert DFXP/TTML subtitle text into SRT text.

    Raises ValueError when the document contains no <p> elements. """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    class TTMLPElementParser(object):
        """ Parser target that collects the text of one paragraph and turns
        <br> elements into newlines """
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialise the node again and feed it through the collecting target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Try both known namespaces, then elements without a namespace
    paras = (
        dfxp.findall(_x('.//ttml:p')) or
        dfxp.findall(_x('.//ttaf1:p')) or
        dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    out = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Without an end time the duration has to supply it
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2087
2088
def cli_option(params, command_option, param):
    """ Return [command_option, value] for params[param], or [] when the
    value is missing or None """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2092
2093
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Render the boolean params[param] as command line arguments.

    With a separator the option and value form a single argument. """
    value = params.get(param)
    assert isinstance(value, bool)
    rendered = true_value if value else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2100
2101
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] when params[param] equals expected_value,
    otherwise [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2105
2106
def cli_configuration_args(params, param, default=[]):
    """ Return the list stored under params[param], or default when the
    entry is missing or None """
    ex_args = params.get(param)
    if ex_args is not None:
        assert isinstance(ex_args, list)
        return ex_args
    return default
2113
2114
class ISO639Utils(object):
    """Conversions between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps each two-letter code to its three-letter counterpart.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of ``code`` are used for the lookup;
        returns None when they are not in the table.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None when no table entry has ``code`` as its long form.
        """
        candidates = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(candidates, None)
2315
2316
class ISO3166Utils(object):
    """Lookup of full country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    # Keys are upper-case two-letter codes; values are English names.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter ISO 3166-1 alpha-2 country code to its full name

        The lookup is case-insensitive; returns None for an unknown code.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
2575
2576
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets a single request choose its own proxy.

    A request may carry a 'Ytdl-request-proxy' header; its value replaces
    the proxy that would otherwise be used, and the special value
    '__noproxy__' disables proxying for that request.
    """

    def __init__(self, proxies=None):
        # Set default handlers before delegating to the base initializer
        # (presumably so the base class can replace them for the schemes
        # listed in ``proxies`` -- confirm against the stdlib ProxyHandler).
        for scheme in ('http', 'https'):
            # Bind scheme and method as lambda defaults so each handler
            # keeps its own values instead of the loop's last ones.
            setattr(
                self, '%s_open' % scheme,
                lambda req, proxy='__noproxy__', scheme=scheme, meth=self.proxy_open:
                    meth(req, proxy, scheme))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Open ``req`` through the proxy selected for this request.

        Returns None when the effective proxy is '__noproxy__'; otherwise
        returns whatever the base class ``proxy_open`` returns.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            # The header is consumed here and removed from the request.
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy != '__noproxy__':
            return compat_urllib_request.ProxyHandler.proxy_open(
                self, req, proxy, type)
        return None  # No Proxy
2596
2597
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA algorithm
    (see http://www.ohdave.com/rsa/).

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The byte order is reversed before the data is read as one big
    # hexadecimal integer.
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '%x' % cipher_number