]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
debian/rules: Ignore tests temporarily.
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import socket
28 import ssl
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_HTMLParser,
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_html_entities_html5,
43 compat_http_client,
44 compat_kwargs,
45 compat_parse_qs,
46 compat_shlex_quote,
47 compat_socket_create_connection,
48 compat_str,
49 compat_struct_pack,
50 compat_urllib_error,
51 compat_urllib_parse,
52 compat_urllib_parse_urlencode,
53 compat_urllib_parse_urlparse,
54 compat_urllib_parse_unquote_plus,
55 compat_urllib_request,
56 compat_urlparse,
57 compat_xpath,
58 )
59
60 from .socks import (
61 ProxyType,
62 sockssocket,
63 )
64
65
def register_socks_protocols():
    """Make URL parsing treat the SOCKS schemes as having a network location.

    Each of 'socks', 'socks4', 'socks4a' and 'socks5' is appended to
    compat_urlparse.uses_netloc unless it is already listed there.
    """
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    netloc_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme in netloc_schemes:
            continue
        netloc_schemes.append(socks_scheme)
73
74
# This is not clearly defined otherwise
# (the type of a compiled pattern object, obtained by compiling an empty regex)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds each of these to a
# request that does not already carry the header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Unique sentinel meaning "no default supplied"; compared by identity so that
# None remains usable as a real default value (see the xpath_* helpers).
NO_DEFAULT = object()

# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# File extensions (without the leading dot) that are recognised as known
# media / manifest extensions.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to its ASCII replacement.
# The replacements are zipped positionally, so the two sequences must stay the
# same length; multi-character replacements ('AE', 'OE', 'ss', ...) are passed
# as one-element lists so that itertools.chain yields them as single items.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
112
113
def preferredencoding():
    """Return the encoding to use for this system.

    The value of locale.getpreferredencoding() is used when it can be
    obtained and actually names a usable codec; otherwise 'UTF-8' is
    returned.
    """
    try:
        candidate = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken encoding name raises here
        'TEST'.encode(candidate)
    except Exception:
        return 'UTF-8'
    return candidate
127
128
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn.  If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; failure to delete it must
        # not mask the original error.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
181
182
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the first element under xpath that has attribute key (equal to val, if given) """
        for candidate in node.findall(compat_xpath(xpath)):
            if key in candidate.attrib and (val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
197
198 # On python2.6 the xml.etree.ElementTree.Element methods don't support
199 # the namespace parameter
200
201
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace}tag' form.

    ns_map maps each prefix to its namespace URI.  Steps without a prefix
    are left untouched.
    """
    def _expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([_expand(step) for step in path.split('/')])
212
213
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element of node matching xpath.

    xpath is either a single expression (string) or an iterable of
    expressions that are tried in order until one matches.

    When nothing matches: default is returned if one was supplied,
    otherwise ExtractorError is raised if fatal is set, otherwise None is
    returned.  name is only used in the error message (defaults to xpath).
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Start from "not found" so that an empty iterable of expressions
        # takes the regular not-found path instead of leaving n unbound.
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
235
236
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    A missing element is handled by xpath_element().  If the element exists
    but has no text: default is returned if supplied, otherwise
    ExtractorError is raised if fatal is set, otherwise None is returned.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element
    if element.text is not None:
        return element.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
250
251
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    When no such element exists: default is returned if supplied, otherwise
    ExtractorError is raised if fatal is set, otherwise None is returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
263
264
def get_element_by_id(id, html):
    """Return the content of the first tag in html whose id attribute equals id"""
    element_content = get_element_by_attribute('id', id, html)
    return element_content
268
269
def get_element_by_attribute(attribute, value, html):
    """Return the unescaped content of the first tag in html whose attribute equals value

    None is returned when no such tag is found.
    """
    # Verbose-mode pattern: opening tag (captured name), any attributes, the
    # wanted attribute, any further attributes, then the content up to the
    # first matching closing tag.
    element_re = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(element_re, html)
    if not found:
        return None

    content = found.group('content')
    # Drop the first and last character when the content starts with a quote
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
291
292
class HTMLAttributeParser(compat_HTMLParser):
    """Minimal HTML parser that records the attributes of a start tag.

    After feeding markup, `attrs` holds the attributes of the most recently
    seen start tag as a dict (empty if no start tag was seen).
    """

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a sequence of (name, value) pairs
        self.attrs = dict((attr_name, attr_value) for attr_name, attr_value in attrs)
301
302
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dictionary.

    Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    the result is
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': "'"
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
323
324
def clean_html(html):
    """Turn an HTML snippet into readable plain text

    None is passed through unchanged, as a convenience for sanitizing
    optional descriptions etc.
    """
    if html is None:
        return html

    substitutions = (
        # <br /> becomes a newline
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        # so does a paragraph boundary (</p><p>)
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # every remaining tag is stripped
        ('<.*?>', ''),
    )

    # Literal newlines in the markup carry no meaning
    text = html.replace('\n', ' ')
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
340
341
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a sanitized name if that fails.

    The name '-' selects standard output (its binary buffer when
    available).  If opening fails for a reason other than EACCES, the name
    is passed through sanitize_path() and, when that changes it, the open is
    retried with the new name; otherwise the original error propagates, like
    with the standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (getattr(sys.stdout, 'buffer', sys.stdout), filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
372
373
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a timestamp (None if unparsable)"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
381
382
def sanitize_filename(s, restricted=False, is_id=False):
    """Make a string usable as part of a filename.

    With restricted set, a stricter subset of characters is allowed
    (accented characters are transliterated, other non-ASCII characters,
    whitespace and shell-special characters become underscores).
    Set is_id if s is not an arbitrary string but an ID that should be kept
    as intact as possible; the cosmetic clean-up pass is skipped then.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    def _underscore_timestamp(match):
        return match.group(0).replace(':', '_')

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', _underscore_timestamp, s)
    result = ''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result if result else '_'
421
422
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On any other platform s is returned unchanged.  On Windows the path is
    normalized and, in every component except '.' and '..', characters that
    are not allowed in file names (as well as a trailing dot or whitespace
    character) are replaced with '#'.  A drive or UNC prefix is preserved.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # normpath leaves an empty first component once the prefix is removed
        norm_path.pop(0)
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequence '\s', which newer Python versions warn about.  The compiled
    # pattern is unchanged.
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
439
440
441 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
442 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Return url with 'http:' prepended when it is protocol-relative ('//...')"""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
445
446
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url()"""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
449
450
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, in first-seen order """
    unique = []
    for item in iterable:
        # Membership is tested on the list itself (equality based), so
        # unhashable items are supported as well.
        if item in unique:
            continue
        unique.append(item)
    return unique
458
459
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity (given without '&', with ';') to a character.

    Unknown entities are returned in their literal '&...;' form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    codepoints = compat_html_entities.name2codepoint
    if entity in codepoints:
        return compat_chr(codepoints[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference, decimal (#123) or hexadecimal (#x7b)
    numeric = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if numeric is not None:
        digits = numeric.group(1)
        is_hex = digits.startswith('x')
        if is_hex:
            # 'x7b' -> '0x7b', which int() accepts for base 16
            digits = '0%s' % digits
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(digits, 16 if is_hex else 10))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
489
490
def unescapeHTML(s):
    """Replace the HTML entities in s with the characters they stand for (None passes through)"""
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+;)', _decode_entity, s)
498
499
def get_subprocess_encoding():
    """Return the encoding to use for subprocess arguments and file names

    Falls back to 'utf-8' when no encoding could be determined.
    """
    use_locale_encoding = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if use_locale_encoding:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        chosen = preferredencoding()
    else:
        chosen = sys.getfilesystemencoding()
    return 'utf-8' if chosen is None else chosen
510
511
def encodeFilename(s, for_subprocess=False):
    """
    Encode a file name for use with the OS APIs of the running interpreter.

    @param s The name of the file
    @param for_subprocess Set when the name is passed to a subprocess
    """

    assert type(s) == compat_str

    if (
            # Python 3 has a Unicode API
            sys.version_info >= (3, 0) or
            # Pass '' directly to use Unicode APIs on Windows 2000 and up
            # (Detecting Windows NT 4 is tricky because 'major >= 4' would
            # match Windows 9x series as well. Besides, NT 4 is obsolete.)
            (not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5) or
            # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
            sys.platform.startswith('java')):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
534
535
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string file name on Python 2; return anything else unchanged"""
    must_decode = sys.version_info < (3, 0) and isinstance(b, bytes)
    if not must_decode:
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
545
546
def encodeArgument(s):
    """Encode a command line argument for passing to a subprocess"""
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
554
555
def decodeArgument(b):
    """Decode a subprocess argument; counterpart of encodeArgument()"""
    return decodeFilename(b, for_subprocess=True)
558
559
def decodeOption(optval):
    """Return an option value as text, decoding bytes with the preferred encoding

    None is passed through unchanged.
    """
    if optval is None:
        return None

    if isinstance(optval, bytes):
        decoded = optval.decode(preferredencoding())
    else:
        decoded = optval

    assert isinstance(decoded, compat_str)
    return decoded
568
569
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that fits is used: durations below one minute are
    printed as a bare number of seconds, durations below one hour as
    minutes and seconds.
    """
    # Inclusive bounds: exactly one hour is '1:00:00' (not '60:00') and
    # exactly one minute is '1:00' (not '60').
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
577
578
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler with an SSL setup suited to this Python.

    params is the options dictionary; its 'nocheckcertificate' entry
    (default False) disables certificate verification.  Any extra keyword
    arguments are forwarded to YoutubeDLHTTPSHandler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname is switched off before verify_mode is set to CERT_NONE
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    # Reached when create_default_context is missing or the handler rejected
    # the context= argument above.
    if sys.version_info < (3, 2):
        # No context is passed to the handler here
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        # NOTE(review): this fallback pins the protocol to TLSv1
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
602
603
def bug_reports_message():
    """Return the text appended to unexpected errors, asking for a bug report"""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    sentences = (
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    )
    return ''.join(sentences)
613
614
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ Build the error message and remember the error context.

        tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is mentioned in the message; video_id, if given, is prepended to it.
        """
        # Errors raised while handling a network failure or an unavailable
        # video are never reported as bugs.
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """ Return the stored traceback formatted as a string, or None if there is none """
        tb = self.traceback
        return None if tb is None else ''.join(traceback.format_tb(tb))
642
643
class UnsupportedError(ExtractorError):
    """Error raised for a URL that is not supported; the URL is kept in .url"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
649
650
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
654
655
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors.  It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
668
669
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
677
678
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.  The message is available as .msg.
    """

    def __init__(self, msg):
        setattr(self, 'msg', msg)
688
689
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
693
694
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
702
703
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, indicating that the
    connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
716
717
718 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
719 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
720 # expected HTTP responses to meet HTTP/1.0 or later (see also
721 # https://github.com/rg3/youtube-dl/issues/6727)
722 if sys.version_info < (3, 0):
723 kwargs[b'strict'] = True
724 hc = http_class(*args, **kwargs)
725 source_address = ydl_handler._params.get('source_address')
726 if source_address is not None:
727 sa = (source_address, 0)
728 if hasattr(hc, 'source_address'): # Python 2.7+
729 hc.source_address = sa
730 else: # Python 2.6
731 def _hc_connect(self, *args, **kwargs):
732 sock = compat_socket_create_connection(
733 (self.host, self.port), self.timeout, sa)
734 if is_https:
735 self.sock = ssl.wrap_socket(
736 sock, self.key_file, self.cert_file,
737 ssl_version=ssl.PROTOCOL_TLSv1)
738 else:
739 self.sock = sock
740 hc.connect = functools.partial(_hc_connect, hc)
741
742 return hc
743
744
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, a new dict without it and without any
    Accept-Encoding header is returned; otherwise headers is returned as is.
    The passed mapping itself is never modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    stripped = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    del stripped['Youtubedl-no-compression']
    return stripped
753
754
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params (the options dictionary); other arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, using a SOCKS connection class when the request asks for it."""
        conn_class = compat_http_client.HTTPConnection

        # 'Ytdl-socks-proxy' is an internal marker header: it selects the
        # proxy and is removed so that it is not sent to the server.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without zlib framing."""
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code."""
        # Pass the code to the constructor when addinfourl has getcode();
        # otherwise set it as an attribute afterwards.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Pre-process req: escape its URL, add the standard headers and
        apply the internal marker headers.  Returns the request to send."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process resp: decompress gzip/deflate bodies and percent-encode
        the Location header of redirects.  Returns the response to hand on."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts succeeds, re-raise the first error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
884
885
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of base_class that connects through a SOCKS proxy.

    base_class must be an HTTPConnection or HTTPSConnection (sub)class.
    socks_proxy is the proxy URL; its scheme selects the protocol ('socks'
    and 'socks4' for SOCKS4, 'socks4a' for SOCKS4A, 'socks5' for SOCKS5),
    the port defaults to 1080 and user name and password are URL-unquoted.

    Raises ValueError if the scheme of socks_proxy is not a SOCKS scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Without this branch socks_type would be left unbound and the
        # function would fail below with an obscure UnboundLocalError.
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS the TLS layer is established on top of the proxied socket
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
927
928
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens connections through _create_http_connection()
    and supports the internal 'Ytdl-socks-proxy' marker header."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward whichever SSL settings the base handler stored on this
        # instance: _context (python > 2.6), _check_hostname (python 3.x)
        open_kwargs = {}
        for option, attribute in (
                ('context', '_context'),
                ('check_hostname', '_check_hostname')):
            if hasattr(self, attribute):
                open_kwargs[option] = getattr(self, attribute)

        connection_class = self._https_conn_class
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            connection_class = make_socks_conn_class(connection_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
952
953
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor used for both HTTP and HTTPS traffic.

    It currently behaves like the standard HTTPCookieProcessor: the
    Set-Cookie escaping workaround in http_response() is disabled
    (commented out).
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response()."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS traffic is processed exactly like HTTP traffic
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
976
977
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally with
    fractional seconds and a trailing 'Z' or +HH:MM / -HHMM offset.
    timezone, if given, is a timedelta used as the UTC offset instead of
    one parsed from the string.  None is returned for None or unparsable
    input.
    """

    if date_str is None:
        return None

    # Fractional seconds are dropped
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        # Without a recognizable suffix (or with 'Z') the time is taken as UTC
        timezone = datetime.timedelta()
        tz_match = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if tz_match:
            date_str = date_str[:-len(tz_match.group(0))]
            if tz_match.group('sign'):
                direction = 1 if tz_match.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=direction * int(tz_match.group('hours')),
                    minutes=direction * int(tz_match.group('minutes')))

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_time = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
1007
1008
def unified_strdate(date_str, day_first=True):
    """Normalize a free-form date string to YYYYMMDD

    day_first selects between day/month and month/day readings of the purely
    numeric layouts. Returns None when the input is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    candidates = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    candidates += [
        '%d-%m-%Y',
        '%d.%m.%Y',
        '%d.%m.%y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%d/%m/%Y %H:%M:%S',
    ] if day_first else [
        '%m-%d-%Y',
        '%m.%d.%Y',
        '%m/%d/%Y',
        '%m/%d/%y',
        '%m/%d/%Y %H:%M:%S',
    ]

    normalized = None
    # Every layout is tried; the last one that parses wins
    for layout in candidates:
        try:
            normalized = datetime.datetime.strptime(date_str, layout).strftime('%Y%m%d')
        except ValueError:
            continue

    if normalized is None:
        # Fall back to the RFC 2822 style parser
        fields = email.utils.parsedate_tz(date_str)
        if fields:
            try:
                normalized = datetime.datetime(*fields[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if normalized is None else compat_str(normalized)
1077
1078
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension of a URL, falling back to default_ext

    The query string is ignored. Whatever follows the last dot is accepted
    when it is purely alphanumeric, or when (minus trailing slashes) it is a
    known extension.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1090
1091
def subtitles_filename(filename, sub_lang, sub_format):
    """Swap the extension of filename for '<sub_lang>.<sub_format>'"""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
1094
1095
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' are accepted on their own as well.
    Raises ValueError (from strptime) for anything else that is not YYYYMMDD.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # timedelta has no month/year units, so approximate them in days
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        unit += 's'
        return today + datetime.timedelta(**{unit: amount})
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1123
1124
def hyphenate_date(date_str):
    """
    Turn 'YYYYMMDD' into 'YYYY-MM-DD'; any other string is returned as is"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1133
1134
class DateRange(object):
    """An inclusive interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end are strings understood by date_from_str

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError when start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Build a range covering exactly the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Tell whether date (a date object or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1164
1165
def platform_name():
    """ Return platform.platform() as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1174
1175
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Win32 WriteConsoleW call; it is only used when
    out is file descriptor 1 or 2 and that descriptor is attached to a real
    console. Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # fileno -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, non-character devices, and handles for
        # which GetConsoleMode fails (returns 0)
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own as 2 units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
1249
1250
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it

    On Windows, when no encoding is forced, the console API is tried first.
    Otherwise the text is encoded (unencodable characters are dropped) for
    binary streams and streams exposing a .buffer, and written as is for
    other streams.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    binary_stream = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_stream or sys.version_info[0] < 3:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        stream_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(stream_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1271
1272
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints"""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and one-character strings on Python 2
    already_ints = isinstance(bs[0], int)
    return list(bs) if already_ints else [ord(ch) for ch in bs]
1280
1281
def intlist_to_bytes(xs):
    """Pack a sequence of byte values (0-255) into a byte string"""
    return compat_struct_pack('%dB' % len(xs), *xs) if xs else b''
1286
1287
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on win32, fcntl.flock elsewhere, and
# stubs raising IOError when fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed as the last argument of LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f from offset 0; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Stub: always raises IOError."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Stub: always raises IOError."""
            raise IOError(UNSUPPORTED_MSG)
1361
1362
class locked_file(object):
    """File wrapper that holds a lock while used as a context manager

    The file is opened on construction. Entering the context takes a shared
    lock for mode 'r' and an exclusive lock for 'a'/'w'; leaving it unlocks
    and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the handle when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1392
1393
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' when Python reports None"""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
1397
1398
def shell_quote(args):
    """Join args into one shell-quoted command line string

    Byte-string arguments (e.g. produced by encodeFilename) are decoded with
    the filesystem encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # compat_shlex_quote is what args_to_str uses as well; pipes.quote is
        # undocumented and the pipes module is gone in recent Python releases
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
1408
1409
def smuggle_url(url, data):
    """ Attach JSON-serializable data to a URL fragment for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    encoded = compat_urllib_parse_urlencode(payload)
    return url + '#' + encoded
1416
1417
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data)

    URLs without smuggled data are returned unchanged, paired with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        serialized = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(serialized)
    return smug_url, default
1425
1426
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix ('N/A' for None)"""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log() is undefined for zero, which is shown as plain bytes anyway
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    unit = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    scaled = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (scaled, unit)
1439
1440
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of s using a unit -> multiplier table

    Both '.' and ',' are accepted as decimal separator. Returns the scaled
    value as an int, or None when s does not start with a number and a unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1450
1451
def parse_filesize(s):
    """Parse a human readable file size such as '5 MiB' into bytes

    Returns None for None input or when no known unit is found.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, letter in enumerate('KMGTPEZY', 1):
        # 'XiB' and 'xB' are binary multiples, 'XB' and 'Xb' decimal ones
        _UNIT_TABLE[letter + 'iB'] = 1024 ** power
        _UNIT_TABLE[letter + 'B'] = 1000 ** power
        _UNIT_TABLE[letter.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[letter + 'b'] = 1000 ** power

    return lookup_unit_table(_UNIT_TABLE, s)
1496
1497
def parse_count(s):
    """Parse a counter such as '1,234' or '1.2M' into an int

    Returns None for None input or when the text is neither a plain number
    nor a number followed by a known multiplier suffix.
    """
    if s is None:
        return None

    text = s.strip()

    # Digits with separators only: no multiplier involved
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, text)
1517
1518
def month_by_name(name):
    """ Return the 1-based number of a month given its English name
    (independent of the locale), or None if the name is unknown """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1526
1527
def month_by_abbreviation(abbrev):
    """ Return the 1-based number of a month given the first three letters
    of its English name (independent of the locale), or None if unknown """

    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
1536
1537
def fix_xml_ampersands(xml_str):
    """Escape every '&' that does not already start an XML entity"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1544
1545
def setproctitle(title):
    """Best-effort attempt to set the process name via prctl in libc.so.6

    Silently does nothing on Jython, when libc.so.6 cannot be loaded, or
    when the library has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    # Initializing the buffer from the bytes makes ctypes allocate one extra
    # item, so the name handed to prctl is always NUL terminated. A buffer of
    # exactly len(title_bytes) would have no terminator.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1565
1566
def remove_start(s, start):
    """Strip the prefix start from s if present; None is passed through"""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1569
1570
def remove_end(s, end):
    """Strip the suffix end from s if present; None is passed through

    An empty suffix leaves s untouched.
    """
    # The explicit emptiness check matters: every string ends with '' and
    # s[:-0] would evaluate to '' instead of s
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
1573
1574
def remove_quotes(s):
    """Drop one pair of matching single or double quotes around s, if any"""
    if s is None or len(s) < 2:
        return s
    opening = s[0]
    if opening in ('"', "'") and s[-1] == opening:
        return s[1:-1]
    return s
1582
1583
def url_basename(url):
    """Return the last component of the path of url"""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1587
1588
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD"""

    def get_method(self):
        """Report 'HEAD' regardless of whether data is attached"""
        return 'HEAD'
1592
1593
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default when that is not possible

    get_attr, if given, names an attribute of v to convert instead of v
    itself. The result is multiplied by invscale and floor-divided by scale.
    None, the empty string and anything int() rejects yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        number = int(v)
    except (ValueError, TypeError, OverflowError):
        # TypeError: unsupported types such as list or dict
        # OverflowError: float infinity
        return default
    return number * invscale // scale
1606
1607
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default when v is None"""
    if v is None:
        return default
    return compat_str(v)
1610
1611
def str_to_int(int_str):
    """ Parse an integer after dropping ',', '.' and '+' characters;
    None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1618
1619
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, returning default when that is not possible

    The result is multiplied by invscale and divided by scale. None and
    anything float() rejects yield default.
    """
    if v is None:
        return default
    try:
        number = float(v)
    except (ValueError, TypeError, OverflowError):
        # TypeError: unsupported types such as list or dict
        # OverflowError: integers too large for a float
        return default
    return number * invscale / scale
1627
1628
def parse_duration(s):
    """Parse a duration string into a number of seconds

    Three notations are tried in order: colon separated
    ([[[DD:]HH:]MM:]SS[.ms]), unit suffixed (optionally prefixed by PT, e.g.
    '1h 2m 3s') and a single decimal amount of hours or minutes. Returns
    None for non-string input or when nothing matches.
    """
    if not isinstance(s, compat_basestring):
        return None

    text = s.strip()

    notations = (
        r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$',
        r'''(?ix)(?:P?T)?
            (?:
                (?P<days>[0-9]+)\s*d(?:ays?)?\s*
            )?
            (?:
                (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
            )?
            (?:
                (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
            )?
            (?:
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
            )?$''',
        r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$',
    )
    for notation in notations:
        found = re.match(notation, text)
        if found:
            break
    else:
        return None

    # Groups a notation does not define simply come back as None
    parts = found.groupdict()
    days, hours, mins, secs, ms = [
        parts.get(key) for key in ('days', 'hours', 'mins', 'secs', 'ms')]

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
1675
1676
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename

    If expected_real_ext is given and differs from the real extension, ext
    is appended to the unchanged filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1683
1684
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename with ext

    If expected_real_ext is given and differs from the real extension, ext
    is appended to the unchanged filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
1690
1691
def check_executable(exe, args=[]):
    """ Try to run exe and return its name, or False if it cannot be started.
    args can be a list of arguments for a short output (like -version) """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1700
1701
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run exe with args and extract its version from the combined output;
    returns False if the executable cannot be started """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1715
1716
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version from program output

    version_re must contain one group capturing the version; by default the
    token following the word 'version' is used. Returns unrecognized when
    the pattern is not found.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1726
1727
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()"""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1732
1733
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) as slices are requested

    With use_cache, fetched pages are kept in a dict keyed by page number.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list"""
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            page_first = pagenum * self._pagesize
            next_first = pagenum * self._pagesize + self._pagesize
            if start >= next_first:
                continue

            page_results = self._cache.get(pagenum) if self._use_cache else None
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offsets of the requested slice within this page
            startv = start % self._pagesize if page_first <= start < next_first else 0
            if end is not None and page_first <= end <= next_first:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            page_is_short = len(page_results) + startv < self._pagesize
            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if page_is_short or end == next_first:
                break
        return collected
1784
1785
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known up front"""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list"""
        collected = []
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Items to drop from the front of the first fetched page
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1813
1814
def uppercase_escape(s):
    r"""Decode every \UXXXXXXXX escape sequence found in s"""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode_match, s)
1821
1822
def lowercase_escape(s):
    r"""Decode every \uXXXX escape sequence found in s"""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode_match, s)
1829
1830
def escape_rfc3986(s):
    """Percent-encode s, leaving the listed reserved characters untouched"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # Text is converted to UTF-8 bytes before quoting on Python 2
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1836
1837
def escape_url(url):
    """Escape a URL: IDNA for the host, escape_rfc3986 for the other parts"""
    parts = compat_urllib_parse_urlparse(url)
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=ascii_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped.geturl()
1848
1849
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file object, one per line

    Surrounding whitespace and a leading byte order mark are removed; empty
    lines and lines starting with '#', ';' or ']' are skipped. batch_fd is
    closed afterwards.
    """
    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM shows up as U+FEFF when the line was decoded as UTF-8
        # and as the three characters below when it was decoded as latin-1
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1864
1865
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes"""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1868
1869
def update_url_query(url, query):
    """Return url with the parameters in query merged into its query string

    Parameters already in the URL are overridden by those in query. A falsy
    query returns url unchanged.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    # True = doseq: parse_qs yields lists that expand to repeated keys
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
1878
1879
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with selected parts replaced

    headers are merged over those of req, query is merged into the URL, and
    url/data replace the originals when truthy. A HEAD request stays a HEAD
    request and a timeout attribute, if present, is carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    body = data or req.data
    target = update_url_query(url or req.get_full_url(), query)
    if req.get_method() == 'HEAD':
        request_class = HEADRequest
    else:
        request_class = compat_urllib_request.Request
    clone = request_class(
        target, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
1892
1893
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable one out of several, in d

    With a list/tuple of keys, a key is skipped when it is missing, maps to
    None, or (with skip_false_values) maps to any falsy value. A single key
    is a plain d.get().
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        if d[key] is None:
            continue
        if skip_false_values and not d[key]:
            continue
        return d[key]
    return default
1902
1903
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None when the lookup fails

    Attribute, key, type and index errors raised by getter are swallowed.
    When expected_type is given, values of any other type yield None too.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(value, expected_type):
        return value
    return None
1912
1913
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a compat_str, decoding it with encoding if needed"""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1916
1917
# Minimum ages used for the US rating labels
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Turn an age limit such as '18', '18+' or a US rating into an int

    Returns None for None input and for unrecognized values.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
1932
1933
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback(...);' and return its payload

    Text that does not look like a JSONP call is returned unchanged.
    """
    wrapper = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(wrapper, r'\1', code)
1937
1938
def js_to_json(code):
    """Convert a JavaScript object literal into a JSON string

    Handles single-quoted strings, unquoted and numeric keys, hexadecimal
    and octal integer literals, /* */ comments and trailing commas. The
    contents of quoted strings are never reinterpreted as numbers.
    """
    INTEGER_TABLE = (
        (r'^0[xX][0-9a-fA-F]+', 16),
        (r'^0+[0-7]+', 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the string body for a double-quoted JSON string
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            # Only bare tokens may be integer literals. A quoted "0x10" or
            # "0123" is text and must stay text.
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(0), base)
                    # A trailing colon marks an object key, which JSON
                    # requires to be a string
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
1976
1977
def qualities(quality_ids):
    """ Build a function mapping a quality id to its position in
    quality_ids; unknown ids map to -1 """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1986
1987
# %-style mapping template built from the 'title', 'id' and 'ext' fields.
# NOTE(review): presumably the default output filename template; confirm
# against the callers.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1989
1990
def limit_length(s, length):
    """ Shorten s to at most length characters, ending in '...' when cut;
    None is passed through """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
1999
2000
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints"""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2003
2004
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit

    A missing or unparseable version counts as outdated only when
    assume_new is false.
    """
    when_unknown = not assume_new
    if not version:
        return when_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return when_unknown
2012
2013
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute. """
    from zipimport import zipimporter

    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
2019
2020
def args_to_str(args):
    """Return a short, shell-quoted string representation of a command"""
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2024
2025
def error_to_compat_str(err):
    """Return the message of an exception as text"""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2033
2034
def mimetype2ext(mt):
    """Map a MIME type to a file extension

    Parameters after the type ('; charset=utf-8', '; codecs=...') are
    ignored. Unknown subtypes are returned as they are. None is passed
    through.
    """
    if mt is None:
        return None

    # Raw Content-Type headers may carry parameters, which would otherwise
    # end up in the extension and defeat the table lookups below
    mt = mt.partition(';')[0].strip()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
2061
2062
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of a response

    The filename of a Content-Disposition attachment is preferred; otherwise
    the Content-Type header decides.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        attachment = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2075
2076
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI from bytes and a MIME type"""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2079
2080
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no age limit is set or when the content has no
    limit of its own. """

    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2089
2090
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    A leading byte order mark selects the encoding (UTF-8 otherwise); the
    result is truthy when the decoded text starts with '<' after optional
    whitespace. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # First matching BOM wins, so the UTF-32 marks must precede the UTF-16 ones
    encoding, skip = next(
        ((enc, len(bom)) for bom, enc in BOMS if first_bytes.startswith(bom)),
        ('utf-8', 0))
    text = first_bytes[skip:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
2109
2110
def determine_protocol(info_dict):
    """Work out the download protocol of a format dict

    An explicit 'protocol' entry wins. Otherwise the URL prefix (rtmp, mms,
    rtsp), then the extension (m3u8, f4m) and finally the URL scheme decide.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2131
2132
def render_table(header_row, data):
    """ Render a header row plus data rows as left-aligned text columns,
    one line per row """
    rows = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # Every column but the last is padded to its widest cell plus one space
    row_format = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2139
2140
2141 def _match_one(filter_part, dct):
2142 COMPARISON_OPERATORS = {
2143 '<': operator.lt,
2144 '<=': operator.le,
2145 '>': operator.gt,
2146 '>=': operator.ge,
2147 '=': operator.eq,
2148 '!=': operator.ne,
2149 }
2150 operator_rex = re.compile(r'''(?x)\s*
2151 (?P<key>[a-z_]+)
2152 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2153 (?:
2154 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2155 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2156 )
2157 \s*$
2158 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2159 m = operator_rex.search(filter_part)
2160 if m:
2161 op = COMPARISON_OPERATORS[m.group('op')]
2162 if m.group('strval') is not None:
2163 if m.group('op') not in ('=', '!='):
2164 raise ValueError(
2165 'Operator %s does not support string values!' % m.group('op'))
2166 comparison_value = m.group('strval')
2167 else:
2168 try:
2169 comparison_value = int(m.group('intval'))
2170 except ValueError:
2171 comparison_value = parse_filesize(m.group('intval'))
2172 if comparison_value is None:
2173 comparison_value = parse_filesize(m.group('intval') + 'B')
2174 if comparison_value is None:
2175 raise ValueError(
2176 'Invalid integer value %r in filter part %r' % (
2177 m.group('intval'), filter_part))
2178 actual_value = dct.get(m.group('key'))
2179 if actual_value is None:
2180 return m.group('none_inclusive')
2181 return op(actual_value, comparison_value)
2182
2183 UNARY_OPERATORS = {
2184 '': lambda v: v is not None,
2185 '!': lambda v: v is None,
2186 }
2187 operator_rex = re.compile(r'''(?x)\s*
2188 (?P<op>%s)\s*(?P<key>[a-z_]+)
2189 \s*$
2190 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2191 m = operator_rex.search(filter_part)
2192 if m:
2193 op = UNARY_OPERATORS[m.group('op')]
2194 actual_value = dct.get(m.group('key'))
2195 return op(actual_value)
2196
2197 raise ValueError('Invalid filter part %r' % filter_part)
2198
2199
def match_str(filter_str, dct):
    """
    Filter a dictionary with a simple string syntax.

    filter_str is split on '&' and every part is evaluated with _match_one;
    returns True (= passes filter) only if all parts are truthy, else False.
    """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2205
2206
def match_filter_func(filter_str):
    """
    Build a predicate that checks an info dict against filter_str.

    The returned function yields None when the info dict passes the filter
    (per match_str) and a human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None

    return _match_func
2215
2216
def parse_dfxp_time_expr(time_expr):
    """
    Parse a DFXP time expression into a number of seconds.

    Accepted forms are a plain offset ("12", "12.5", "12.5s") and a clock
    value "H:MM:SS" whose seconds may carry a fraction introduced by '.' or
    ':'. Returns None for an empty or unrecognised expression.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ':' before the fraction is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2228
2229
def srt_subtitles_timecode(seconds):
    """
    Format a time offset in seconds as an SRT timecode 'HH:MM:SS,mmm'.

    The offset is rounded to the nearest whole millisecond before it is
    split into fields. Truncating each field separately lost a millisecond
    whenever the binary float sat just below the intended value (2.3 was
    rendered as '00:00:02,299').
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    secs, millis = divmod(remainder, 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2232
2233
def dfxp2srt(dfxp_data):
    """
    Convert a DFXP/TTML subtitle document (text) to SRT text.

    Paragraph elements are looked up under the three known TTML namespaces
    and, failing that, without a namespace. A paragraph with no usable
    'begin', or with neither 'end' nor 'dur', is skipped; cue numbers follow
    the paragraph's position in the document, so skipped paragraphs leave
    gaps. Raises ValueError when the document contains no paragraph at all.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class _ParagraphTextCollector(object):
        # XMLParser target: accumulates character data and turns <br>
        # elements into newlines.
        text = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.text += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.text += data

        def close(self):
            return self.text.strip()

    def _paragraph_text(node):
        # Re-serialise the element and feed it through the collecting target
        parser = xml.etree.ElementTree.XMLParser(target=_ParagraphTextCollector())
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = (
        dfxp.findall(_x('.//ttml:p'))
        or dfxp.findall(_x('.//ttaf1:p'))
        or dfxp.findall(_x('.//ttaf1_0604:p'))
        or dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        attrs = para.attrib
        begin_time = parse_dfxp_time_expr(attrs.get('begin'))
        end_time = parse_dfxp_time_expr(attrs.get('end'))
        dur = parse_dfxp_time_expr(attrs.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # No usable end time: derive it from the duration, if any
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            _paragraph_text(para)))

    return ''.join(cues)
2287
2288
def cli_option(params, command_option, param):
    """
    Build the argument list for an option that takes a value.

    Returns [command_option, value] where value is params[param], or an
    empty list when that entry is missing or None.
    """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2292
2293
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """
    Build the argument list for a boolean option.

    The boolean params[param] is rendered as true_value or false_value.
    With a separator the result is the single argument
    command_option + separator + value; without one it is the pair
    [command_option, value].

    An entry that is missing or None yields an empty list, matching the
    other cli_* helpers, instead of tripping the type assertion. Any other
    non-bool value still fails the assertion.
    """
    value = params.get(param)
    if value is None:
        return []
    assert isinstance(value, bool)
    rendered = true_value if value else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2300
2301
def cli_valueless_option(params, command_option, param, expected_value=True):
    """
    Build the argument list for a flag that takes no value.

    Returns [command_option] when params[param] equals expected_value and an
    empty list otherwise (including when the entry is missing).
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2305
2306
def cli_configuration_args(params, param, default=[]):
    """
    Return the extra command-line arguments configured under params[param].

    When the entry is missing or None, default is returned. A list default
    is copied first, so a caller that mutates the result cannot corrupt the
    shared default list of this function (or the list it passed in).
    A configured value that is not a list fails the assertion.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2313
2314
class ISO639Utils(object):
    """Conversions between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """
        Convert language code from ISO 639-1 to ISO 639-2/T.

        Only the first two characters of code are looked up, so a longer tag
        such as 'en-US' resolves like 'en'. Returns None for unknown codes.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """
        Convert language code from ISO 639-2/T to ISO 639-1.

        Returns None when no table entry maps to code.
        """
        for short_name in cls._lang_map:
            if cls._lang_map[short_name] == code:
                return short_name
        return None
2515
2516
class ISO3166Utils(object):
    """Lookup of full country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """
        Convert a two-letter country code to the corresponding full name.

        The lookup is case-insensitive (code is upper-cased first); None is
        returned for a code that is not in the table.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
2775
2776
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """
    ProxyHandler whose proxy can be chosen per request.

    A request carrying a 'Ytdl-request-proxy' header uses that value as its
    proxy; the header is removed before the request goes on.
    """

    def __init__(self, proxies=None):
        # Set default handlers: http_open/https_open always exist and start
        # from the '__noproxy__' marker, so proxy_open runs for every request
        for scheme in ('http', 'https'):
            default_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """
        Pick the proxy for req and delegate to the base class when needed.

        Returns None (leaving the request to other handlers) when no proxy
        applies or when the proxy is a SOCKS one; in the SOCKS case the proxy
        URL is recorded on the request in a 'Ytdl-socks-proxy' header.
        """
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = per_request_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers take care of wrapping the
            # socket with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2800
2801
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload integer is read from the bytes in reversed order
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
2817
2818
def encode_base_n(num, n, table=None):
    """
    Encode the non-negative integer num as a string in base n.

    table supplies the digit characters (index = digit value); when it is
    not given, the first n characters of 0-9a-zA-Z are used.

    Raises ValueError when n exceeds the number of available digits, and,
    for a non-zero num, when num is negative or n is below 2. Those last
    two cases previously looped forever or raised ZeroDivisionError.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Floor division never reaches zero for a negative number or with a
    # base below 2, so reject both before entering the loop
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small, at least 2 is required' % n)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
2835
2836
def decode_packed_codes(code):
    """
    Expand packed code back into its source text.

    code must contain a call tail of the form
    }('<payload>',<base>,<count>,'<sym0>|<sym1>|...'.split('|')
    (presumably emitted by a JavaScript packer - not verifiable from here).
    Each word of the payload is the base-<base> encoding of an index into
    the symbol list and is replaced by that symbol; an empty symbol means
    the word stands for itself.
    """
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    packed_payload, base_text, count_text, symbol_text = mobj.groups()
    base = int(base_text)
    symbols = symbol_text.split('|')

    symbol_table = {}
    for index in range(int(count_text) - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    def _substitute(word):
        return symbol_table[word.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, packed_payload)
2855
2856
def parse_m3u8_attributes(attrib):
    """
    Parse an m3u8 attribute list ('KEY=value,KEY="quoted value",...').

    Returns a dict mapping each attribute name to its value, with the
    surrounding double quotes of quoted values removed.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)

    info = {}
    for name, raw_value in pairs:
        info[name] = raw_value[1:-1] if raw_value.startswith('"') else raw_value
    return info