]> Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
8f5463f1c9a1e1a2660867abdc0f1f62e9147032
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import os
21 import pipes
22 import platform
23 import re
24 import ssl
25 import socket
26 import struct
27 import subprocess
28 import sys
29 import tempfile
30 import traceback
31 import xml.etree.ElementTree
32 import zlib
33
34 from .compat import (
35 compat_basestring,
36 compat_chr,
37 compat_getenv,
38 compat_html_entities,
39 compat_http_client,
40 compat_parse_qs,
41 compat_socket_create_connection,
42 compat_str,
43 compat_urllib_error,
44 compat_urllib_parse,
45 compat_urllib_parse_urlparse,
46 compat_urllib_request,
47 compat_urlparse,
48 shlex_quote,
49 )
50
51
# The re module does not expose the type of a compiled pattern under a
# well-defined name, so derive it from an actual compiled object.
compiled_regex_type = type(re.compile(''))

# Default request headers; YoutubeDLHandler.http_request adds each of them
# to a request that does not already carry that header.
std_headers = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept-Language', 'en-us,en;q=0.5'),
])
62
63
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.

    Falls back to 'UTF-8' when the locale cannot be queried or names a
    codec that cannot encode text.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown name raises LookupError here
        'TEST'.encode(pref)
    except Exception:
        # Deliberately not a bare except, so that KeyboardInterrupt and
        # SystemExit are not silently turned into "use UTF-8".
        pref = 'UTF-8'

    return pref
77
78
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except BaseException:
        # Clean up the temporary file on any failure (including
        # KeyboardInterrupt), then let the original exception propagate.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
131
132
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        # The 2.6 branch filters on the attribute manually
        return next(
            (el for el in node.findall(xpath) if el.attrib.get(key) == val),
            None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # conservative character set is accepted
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find("%s[@%s='%s']" % (xpath, key, val))
151
152 # On python2.6 the xml.etree.ElementTree.Element methods don't support
153 # the namespace parameter
154
155
def xpath_with_ns(path, ns_map):
    """Rewrite every 'prefix:tag' step of path as '{uri}tag'.

    ns_map maps each prefix to its namespace URI; steps without a colon
    are left untouched.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
166
167
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    When no element matches or it has no text, returns None, or raises
    ExtractorError if fatal is set (name, defaulting to the xpath, is used
    in the error message).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text

    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
180
181
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper: an ID lookup is an attribute lookup on "id"
    return get_element_by_attribute(attribute='id', value=id, html=html)
185
186
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Verbose-mode pattern: tag name, any attributes, the wanted
    # attribute=value, any further attributes, then the content up to the
    # first closing tag of the same name.
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content that starts with a quote has its first and last character dropped
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
208
209
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Source newlines carry no meaning in HTML; only <br> and paragraph
    # boundaries become newlines in the result.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),          # <br /> -> newline
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),    # </p><p> -> newline
        ('<.*?>', ''),                             # strip remaining tags
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Replace html entities
    return unescapeHTML(text).strip()
225
226
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' selects standard output (its binary buffer when one
    exists) instead of opening a file.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join takes the components as separate arguments, so the
        # sanitized (head, tail) pair has to be unpacked.
        # NOTE(review): separators inside the directory part are replaced
        # too, exactly as the original expression spelled it out.
        alt_filename = os.path.join(*[
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
260
261
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
269
270
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters and '?' are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept as they are apart from the per-character replacement
        return result

    # Collapse runs of underscores and trim them from both ends
    result = re.sub(r'_+', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
304
305
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in input
    order. Membership is checked against the list, so elements do not have
    to be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
313
314
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and decimal or
    hexadecimal numeric references are decoded; anything else is returned
    in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references may contain the digits a-f / A-F; matching only
    # [0-9] after the 'x' would decode '#x2F' as chr(0x2) instead of '/'.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # int() accepts the '0x' prefix together with base 16
            numstr = '0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
333
334
def unescapeHTML(s):
    """Replace every '&...;' entity in s using _htmlentity_transform.

    None is passed through unchanged; any other input must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', decode_entity, s)
342
343
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    unicode_capable_windows = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if unicode_capable_windows and not for_subprocess:
        # Pass the unicode string directly to use Unicode APIs on
        # Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    if unicode_capable_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    # Characters the encoding cannot represent are dropped
    return s.encode(encoding, 'ignore')
370
371
def encodeArgument(s):
    """Encode s via encodeFilename in its for-subprocess mode."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
379
380
def decodeOption(optval):
    """Return optval as text, decoding bytes with the preferred encoding.

    None is returned unchanged.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
389
390
def formatSeconds(secs):
    """Format a duration in seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The longer forms are only used when secs is strictly greater than 60
    and 3600 respectively.
    """
    seconds = secs % 60
    if secs > 3600:
        hours = secs // 3600
        minutes = (secs % 3600) // 60
        return '%d:%02d:%02d' % (hours, minutes, seconds)
    if secs > 60:
        return '%d:%02d' % (secs // 60, seconds)
    return '%d' % secs
398
399
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is disabled when params['nocheckcertificate']
    is true. Extra keyword arguments are forwarded to the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        default_ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            default_ctx.check_hostname = False
            default_ctx.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=default_ctx, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    tls_ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        tls_ctx.verify_mode = ssl.CERT_NONE
    else:
        tls_ctx.verify_mode = ssl.CERT_REQUIRED
    tls_ctx.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=tls_ctx, **kwargs)
423
424
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network problems and unavailable videos being handled right now
        # are never treated as bugs.
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause

        if not expected:
            # Unexpected errors get bug-report instructions appended
            update_cmd = (
                'type youtube-dl -U to update'
                if ytdl_is_updateable()
                else 'see https://yt-dl.org/update on how to update')
            msg += ''.join([
                '; please report this issue on https://yt-dl.org/bug .',
                ' Make sure you are using the latest version; %s.' % update_cmd,
                ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
            ])
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
458
459
class UnsupportedError(ExtractorError):
    """Expected extractor error for a URL that is not supported."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
465
466
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
470
471
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
484
485
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
493
494
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Exception already records msg in args; the explicit call only
        # spells that out. The message is also kept as the msg attribute.
        Exception.__init__(self, msg)
        self.msg = msg
504
505
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
509
510
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
518
519
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a readable message; without it str(e) is just
        # the bare "(downloaded, expected)" argument tuple.
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
534
535
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the configured source address.

    The address is read from ydl_handler._params['source_address']; when it
    is not set the connection is returned untouched.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return hc

    # Port 0 in the (host, port) pair
    sa = (source_address, 0)
    if hasattr(hc, 'source_address'):  # Python 2.7+
        hc.source_address = sa
        return hc

    # Python 2.6: no source_address attribute, so replace connect() with a
    # version that creates the socket from the source address itself.
    def _hc_connect(self, *args, **kwargs):
        sock = compat_socket_create_connection(
            (self.host, self.port), self.timeout, sa)
        if not is_https:
            self.sock = sock
            return
        self.sock = ssl.wrap_socket(
            sock, self.key_file, self.cert_file,
            ssl_version=ssl.PROTOCOL_TLSv1)

    hc.connect = functools.partial(_hc_connect, hc)
    return hc
556
557
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # Read by _create_http_connection (source_address)
        self._params = params

    def http_open(self, req):
        connection_factory = functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting code manually where the constructor does not take it."""
        if not hasattr(compat_urllib_request.addinfourl, 'getcode'):
            ret = compat_urllib_request.addinfourl(stream, headers, url)
            ret.code = code
            return ret
        return compat_urllib_request.addinfourl(stream, headers, url, code)

    def http_request(self, req):
        for name, default_value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() not in req.headers:
                req.add_header(name, default_value)

        # The marker header asks for an uncompressed transfer; it is
        # internal and must not be sent out.
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the first failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request/response treatment
    https_request = http_request
    https_response = http_response
649
650
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that creates its connections through _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        if not https_conn_class:
            https_conn_class = compat_http_client.HTTPSConnection
        self._https_conn_class = https_conn_class
        self._params = params

    def https_open(self, req):
        # Forward whichever of these attributes the base handler has set
        # on this Python version.
        open_kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            open_kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            open_kwargs['check_hostname'] = self._check_hostname

        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
666
667
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    delimiter is the character between the date and the time part.
    None is returned for a None input.
    """

    if date_str is None:
        return None

    # Optional fractional seconds followed by 'Z' or a +HH[:]MM / -HH[:]MM
    # offset at the very end of the string
    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)

    offset = datetime.timedelta()
    if tz_match:
        # Cut the matched suffix off before handing the rest to strptime
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_dt = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_dt.timetuple())
691
692
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects how purely numeric dates such as 8/7/2009 are read
    (day/month when true, month/day otherwise). Returns None when the input
    is None or no known format matches.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    unambiguous_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        numeric_formats = [
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        numeric_formats = [
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    upload_date = None
    # Every format is tried; the last one that parses determines the result
    for expression in unambiguous_formats + numeric_formats:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
753
754
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, ignoring any query string.

    default_ext is returned when url is None or when the text after the
    last dot is not purely alphanumeric.
    """
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate) is None:
        return default_ext
    return candidate
763
764
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the part of filename after its last dot with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
767
768
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')
    # A bad approximation?
    # timedelta has no months or years, so both are converted to days
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'
    # timedelta keyword names are plural ('days', 'weeks')
    return today + datetime.timedelta(**{unit + 's': amount})
796
797
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        # Anything that is not exactly eight digits is returned unchanged
        return date_str
    return '-'.join(parts.groups())
806
807
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # A missing bound leaves that side of the range open
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            # Anything else is parsed like the constructor arguments
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
837
838
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode when platform.platform() hands back bytes
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
847
848
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    wintypes = ctypes.wintypes
    kernel32 = ctypes.windll.kernel32

    # File descriptor -> argument passed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # AttributeError: the output stream doesn't have a fileno, it's virtual
        # UnsupportedOperation: some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        wintypes.HANDLE, wintypes.DWORD)(
        (b"GetStdHandle", kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE, wintypes.LPWSTR,
        wintypes.DWORD, ctypes.POINTER(wintypes.DWORD),
        wintypes.LPVOID)((b"WriteConsoleW", kernel32))
    written = wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(wintypes.DWORD, wintypes.DWORD)((b"GetFileType", kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE,
        ctypes.POINTER(wintypes.DWORD))(
        (b"GetConsoleMode", kernel32))
    INVALID_HANDLE_VALUE = wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none
        for pos, char in enumerate(text):
            if ord(char) > 0xffff:
                return pos
        return len(text)

    while s:
        # Write at most 1024 characters at a time, stopping in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
922
923
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    encoding, when given, is used for the byte-oriented write paths instead
    of the stream's or the preferred encoding.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    binary_stream = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_stream or sys.version_info[0] < 3:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying byte buffer: encode ourselves so
        # unencodable characters are dropped instead of raising
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
944
945
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints ([] for empty input)."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and one-character strings on Python 2
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    return list(map(ord, bs))
953
954
def intlist_to_bytes(xs):
    """Pack a sequence of byte values (0-255) into a bytes object."""
    if xs:
        # One unsigned byte ('B') per element
        return struct_pack('%dB' % len(xs), *xs)
    return b''
959
960
# Cross-platform file locking
if sys.platform != 'win32':
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock() lock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock() lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)

else:
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high DWORD of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f through LockFileEx; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same
        # structure back to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, flags, 0, whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock f through UnlockFileEx; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0, whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1024
1025
class locked_file(object):
    """File wrapper that holds a lock on the file while used as a context manager.

    The file is opened on construction; the lock is taken on entering the
    with block (shared for mode 'r', exclusive otherwise) and released,
    together with closing the file, on leaving it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Do not leave the file open when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even when unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def read(self, *args):
        return self.f.read(*args)

    def write(self, *args):
        return self.f.write(*args)
1055
1056
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1060
1061
def shell_quote(args):
    """Return ``args`` as one string with every argument shell-quoted.

    Byte strings are decoded with the file system encoding before being
    quoted; the quoted arguments are joined with single spaces.
    """
    encoding = None
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            if encoding is None:
                encoding = get_filesystem_encoding()
            a = a.decode(encoding)
        # shlex_quote (already imported from .compat and used by
        # args_to_str) instead of pipes.quote: the pipes module is
        # deprecated and no longer exists in Python 3.13.
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1071
1072
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the first element for
    which pred(e) is false before stopping """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1080
1081
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    The JSON-encoded data is appended to the URL after a '#', as the
    url-encoded value of the '__youtubedl_smuggle' key. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + '#' + sdata
1088
1089
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    URLs without a '#__youtubedl_smuggle' marker are returned unchanged,
    paired with ``default``. """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(payload)
    return smug_url, default
1097
1098
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit, e.g. '1.50KiB'.

    Returns 'N/A' for None.  Numeric strings are converted with float().
    The unit is limited to the range B..YiB: values below one byte are
    shown in B and values beyond the table are shown in YiB.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, compat_basestring):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: a negative exponent would silently
        # index from the end ('YiB') and a too large one would raise
        # IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1111
1112
def parse_filesize(s):
    """Parse a file size such as '1.5 MiB' or '2,5kB' into bytes.

    Returns None if ``s`` is None or does not start with a number
    followed by a known unit.  Both ',' and '.' are accepted as the
    decimal separator; the result is truncated to an int.
    """
    if s is None:
        return None

    # For every prefix X: 'XiB' and the lower-case 'xB' are powers of
    # 1024, 'XB' and 'Xb' are powers of 1000.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % '|'.join(
        re.escape(unit) for unit in unit_table)
    found = re.match(pattern, s)
    if found is None:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1165
1166
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is tried first; if it is unset,
    empty or not a number, the output of ``stty size`` is used instead.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Unusable value, fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except (EnvironmentError, ValueError, IndexError):
        # Best effort only: stty may be missing or produce no usable
        # output.  KeyboardInterrupt/SystemExit are no longer swallowed.
        return None
1181
1182
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale; None if the name is not known """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    try:
        position = ENGLISH_NAMES.index(name)
    except ValueError:
        return None
    return position + 1
1193
1194
def fix_xml_ampersands(xml_str):
    """Replace all the bare '&' by '&amp;' in XML.

    An ampersand is kept as is when it starts one of the predefined
    entities (amp, lt, gt, apos, quot) or a numeric character reference.
    References may have up to 6 hexadecimal or 7 decimal digits so that
    code points above U+FFFF (e.g. '&#x1F600;') are not double-escaped.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{1,6};|#[0-9]{1,7};)',
        '&amp;',
        xml_str)
1201
1202
def setproctitle(title):
    """Set the process name to ``title`` through libc's prctl().

    Does nothing if libc.so.6 cannot be loaded or has no prctl symbol.
    ``title`` must be a text string; it is passed on UTF-8 encoded.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes makes ctypes reserve one
    # extra byte for the terminating NUL.  A buffer of exactly
    # len(title_bytes) bytes is left unterminated, so prctl could read
    # past its end.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1216
1217
def remove_start(s, start):
    """Return ``s`` without the prefix ``start``; unchanged if absent."""
    return s[len(start):] if s.startswith(start) else s
1222
1223
def remove_end(s, end):
    """Return ``s`` without the suffix ``end``; unchanged if absent."""
    # An empty suffix must be skipped explicitly: every string ends with
    # '' and s[:-0] is s[:0], which would return '' instead of s.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1228
1229
def url_basename(url):
    """Return the last segment of the path of ``url`` ('' if none)."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
1233
1234
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports HEAD."""

    def get_method(self):
        method = "HEAD"
        return method
1238
1239
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert ``v`` to an int, or return ``default`` for None and ''.

    If ``get_attr`` is given and ``v`` is not None, that attribute of
    ``v`` is converted instead (a missing attribute counts as None).
    The result is int(v) * invscale // scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1247
1248
def str_or_none(v, default=None):
    """Return ``v`` converted with compat_str, or ``default`` for None."""
    if v is None:
        return default
    return compat_str(v)
1251
1252
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+'
    characters are dropped before converting; None yields None """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1259
1260
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or ``default`` if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1263
1264
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Accepts forms such as '1:02:03', '3d4h5m6s', 'T30M38S',
    '12 minutes' or '1.5 hours'.  Returns None if ``s`` is not a string
    or is not recognised.
    """
    if not isinstance(s, compat_basestring):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # A lone "N minutes" / "N hours" value may be fractional
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    res = 0
    for group, unit_seconds in (
            ('secs', 1),
            ('mins', 60),
            ('hours', 60 * 60),
            ('days', 24 * 60 * 60)):
        value = m.group(group)
        if value:
            res += int(value) * unit_seconds
    # The 'ms' group holds the fractional part including its leading dot
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
1304
1305
def prepend_extension(filename, ext):
    """Insert ``ext`` before the real extension: 'a.mp4' -> 'a.<ext>.mp4'."""
    parts = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(parts[0], ext, parts[1])
1309
1310
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started, and returns its name
    (False if starting it raises OSError).
    args can be a list of arguments for a short output (like -version) """
    # The default list is only read, never mutated
    cmd = [exe] + args
    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1319
1320
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    stdout and stderr of the command are captured together and handed
    to detect_exe_version """
    cmd = [exe] + args
    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    # The captured pipe output may be a byte string
    if isinstance(out, bytes):
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1334
1335
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the version found in ``output`` (a text string).

    ``version_re`` is searched in the output and its first group is
    returned; by default the word after 'version' is used.
    ``unrecognized`` is returned when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = version_re
    if pattern is None:
        pattern = r'version\s+([-0-9._a-zA-Z]+)'
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1345
1346
class PagedList(object):
    """Base class of lists whose entries are fetched page by page.

    Subclasses implement getslice(start=0, end=None), which returns the
    entries with indices in [start, end) as a list.
    """

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    """Paged list with an unknown number of pages.

    pagefunc(pagenum) returns an iterable with the entries of one page;
    a page with fewer than ``pagesize`` entries is taken to be the last.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        if end is not None and end <= start:
            # Empty slice: nothing to fetch.  Without this guard an
            # ``end`` lying on a page boundary (e.g. getslice(0, 0)) was
            # mapped to "up to the end of the page" and fetching went on.
            return res
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = list(self._pagefunc(pagenum))

            # Offset of ``start`` inside this page (0 on later pages)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just after the last wanted entry, if the slice ends
            # on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res


class InAdvancePagedList(PagedList):
    """Paged list whose number of pages (``pagecount``) is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        if end is not None and end <= start:
            # Empty slice; a negative count would otherwise be used as a
            # (negative) slice index below.
            return res
        start_page = start // self._pagesize
        end_page = self._pagecount
        if end is not None:
            # Never ask for pages beyond the known page count
            end_page = min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page starts in the middle
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    res.extend(page[:only_more])
                    break
            res.extend(page)
        return res
1423
1424
def uppercase_escape(s):
    """Expand every literal '\\UXXXXXXXX' escape (8 hex digits) in ``s``
    into the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1431
1432
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # Text strings are passed to quote() UTF-8 encoded on Python 2
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1438
1439
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The path, params, query and fragment components are each passed
    through escape_rfc3986; the other components are left as parsed."""
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1449
# struct_pack / struct_unpack: struct.pack / struct.unpack, wrapped when
# the struct module rejects text format strings.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _bytes_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_bytes_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_bytes_spec(spec), *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1466
1467
def read_batch_urls(batch_fd):
    """Read the URLs contained in the file-like object ``batch_fd``.

    ``batch_fd`` is closed afterwards.  Each line is decoded as UTF-8 if
    it is a byte string, freed from a leading byte order mark and
    stripped.  Empty lines and lines starting with '#', ';' or ']' are
    skipped.  Returns the remaining lines as a list.
    """
    # A UTF-8 byte order mark is U+FEFF once the bytes have been decoded
    # as UTF-8; it shows up as the three characters '\xef\xbb\xbf' when
    # the bytes were decoded with a single-byte encoding instead.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1482
1483
def urlencode_postdata(*args, **kargs):
    """urlencode the given arguments and return the result ASCII-encoded."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1486
1487
# etree_iter(node): iterate over the elements of a tree, falling back to
# findall() where Element has no iter() method.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1492
1493
def parse_xml(s):
    """Parse the XML document in the text string ``s``; return its root.

    Doctype declarations are ignored.  On Python 2, byte string element
    texts are decoded as UTF-8.
    """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    data = s.encode('utf-8')
    if sys.version_info >= (2, 7):
        tree = etree.XML(data, parser=parser)
    else:
        tree = etree.XML(data)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
1509
1510
# Maps US content ratings to the age limit that parse_age_limit()
# returns for them.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1518
1519
def parse_age_limit(s):
    """Turn an age rating string into an age limit.

    One or two digits, optionally followed by '+', give that number;
    anything else is looked up in US_RATINGS.  Returns None for None
    and for unknown ratings.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1525
1526
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback(...);' and return the payload.

    A trailing '//' comment is dropped as well.  Input that does not
    look like a JSONP call is returned unchanged.
    """
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
1530
1531
def js_to_json(code):
    """Convert a JavaScript object literal into a JSON string.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings; true, false, null and double-quoted strings
    are kept.  A comma directly before a closing ']' is removed.
    """
    # Rewriting of the content of single-quoted strings
    unescape = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            v = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: unescape[esc.group(0)],
                v[1:-1])
        return '"%s"' % v

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
1555
1556
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in
    quality_ids, or to -1 if it is not listed """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
1565
1566
# Default output template: a %-format string with the keys
# title, id and ext.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1568
1569
def limit_length(s, length):
    """ Add ellipses to overly long strings: a string longer than
    length is cut so that it fits together with a trailing '...' """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
1578
1579
def version_tuple(v):
    """Split ``v`` at '-' and '.' and return the parts as a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
1582
1583
def is_outdated_version(version, limit, assume_new=True):
    """Return whether ``version`` is older than ``limit``.

    Both are compared as version_tuple() values.  If ``version`` is
    empty or one of the versions cannot be parsed, the result is
    ``not assume_new``.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
1591
1592
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. if this
    module was loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
1598
1599
def args_to_str(args):
    """Get a short string representation for a subprocess command:
    the shell-quoted arguments joined by spaces."""
    quoted = map(shlex_quote, args)
    return ' '.join(quoted)
1603
1604
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of ``url_handle``.

    The extension of the filename in an 'attachment' Content-Disposition
    header wins if there is one; otherwise the part of the Content-Type
    header after the first '/' is returned.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(name):
            return url_handle.headers[name]

    disposition = getheader('Content-Disposition')
    if disposition:
        m = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return getheader('Content-Type').split("/")[1]
1621
1622
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both
    limits are set and age_limit is below content_limit """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1631
1632
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to their byte order mark (UTF-8 if
    there is none).  Returns the match object of a '<' that follows only
    optional whitespace, or None. """

    # The UTF-32 marks must be tried before the UTF-16 ones, since
    # b'\xff\xfe' is a prefix of the UTF-32-LE mark
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
1651
1652
def determine_protocol(info_dict):
    """Return the protocol for ``info_dict``.

    An explicit 'protocol' entry wins.  Otherwise the 'url' entry is
    inspected: a URL starting with rtmp, mms or rtsp yields that name,
    an m3u8 or f4m extension yields the extension, and anything else
    yields the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1673
1674
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values.

    Every column except the last one is left-aligned and padded to the
    width of its longest value plus one; rows are joined by newlines """
    table = [header_row] + data
    widths = []
    for column in zip(*table):
        widths.append(max(len(compat_str(cell)) for cell in column))
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    lines = [row_format % tuple(row) for row in table]
    return '\n'.join(lines)