]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
Imported Upstream version 2015.01.16
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import os
21 import pipes
22 import platform
23 import re
24 import ssl
25 import socket
26 import struct
27 import subprocess
28 import sys
29 import tempfile
30 import traceback
31 import xml.etree.ElementTree
32 import zlib
33
34 from .compat import (
35 compat_chr,
36 compat_getenv,
37 compat_html_entities,
38 compat_http_client,
39 compat_parse_qs,
40 compat_socket_create_connection,
41 compat_str,
42 compat_urllib_error,
43 compat_urllib_parse,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
46 compat_urlparse,
47 shlex_quote,
48 )
49
50
# The type object of a compiled regular expression.  It is derived from an
# actual compiled pattern because it is not clearly defined otherwise.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers.  YoutubeDLHandler.http_request() adds each
# entry to an outgoing request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
61
62
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or reports a codec that cannot encode a plain ASCII
    string, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify that the reported codec exists and is usable.
        'TEST'.encode(pref)
    except Exception:
        # Intentionally broad (a broken locale can fail in several ways),
        # but not a bare except: KeyboardInterrupt and SystemExit must
        # still propagate.
        pref = 'UTF-8'

    return pref
76
77
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible.

    The data is first written to a temporary file created next to fn and
    that file is then renamed over fn.  If anything fails, the temporary
    file is removed and the exception is re-raised.
    """

    fn = encodeFilename(fn)
    is_py2 = sys.version_info < (3, 0)

    if is_py2 and sys.platform != 'win32':
        fs_encoding = get_filesystem_encoding()

        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile fails on non-ASCII names unless it is handed
        # unicode objects, hence the decoding wrappers.
        def path_basename(f):
            return os.path.basename(fn).decode(fs_encoding)

        def path_dirname(f):
            return os.path.dirname(fn).decode(fs_encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    tmp_options = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # json.dump expects a byte stream on Python 2.x and a character stream
    # on Python 3.x.
    if is_py2:
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**tmp_options)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # os.rename refuses to overwrite an existing file on Windows
            # (WindowsError / FileExistsError), so drop the target first.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        # Catch-all on purpose: the temporary file has to be cleaned up
        # whatever went wrong, and the exception is re-raised untouched.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
130
131
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Return the first element matching xpath[@key=val], or None """
        # key and val are interpolated verbatim into the expression, so only
        # a conservative character set is accepted for each of them.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Return the first element matching xpath[@key=val], or None """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        # No attribute predicates available: filter the candidates by hand.
        return next(
            (candidate for candidate in node.findall(xpath)
             if candidate.attrib.get(key) == val),
            None)
150
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path into '{uri}tag'.

    ns_map maps namespace prefixes to namespace URIs.  Steps without a
    prefix are left untouched.  An unknown prefix raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
165
166
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    Returns None when no element matches or the match has no text, unless
    fatal is set, in which case ExtractorError is raised.  name replaces
    the xpath in the error message when given.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
179
180
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    return get_element_by_attribute(attribute='id', value=id, html=html)
184
185
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Returns None when no tag carries attribute=value.  The content is
    HTML-unescaped before it is returned.
    """

    # Verbose regex: an opening tag with optional other attributes before
    # and after the wanted one, then the lazily matched content up to the
    # corresponding closing tag.
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    match = re.search(pattern, html)
    if match is None:
        return None

    content = match.group('content')
    # Content that opens with a quote character has its first and last
    # characters dropped.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
207
208
def clean_html(html):
    """Clean an HTML snippet into a readable string

    None is passed through unchanged, as a convenience for sanitizing
    optional fields such as descriptions.
    """

    if html is None:
        return html

    # Literal newlines carry no meaning in HTML; only <br> and paragraph
    # boundaries become line breaks.  The last rule strips remaining tags.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Replace html entities
    return unescapeHTML(text).strip()
224
225
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once
    with the characters forbidden on win32 replaced by '#' in the final
    path component.  If that fails as well, the exception of the second
    attempt propagates, like with the standard open() function.

    The filename '-' selects standard output (its binary buffer where
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be cured by renaming.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.  Only the
        # last path component is rewritten: the directory part contains
        # separators (and possibly a drive colon) that must stay intact.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub('[/<>:"\\|\\\\?\\*]', '#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
259
260
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
268
269
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible: the underscore clean-up below is then skipped.
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (
                char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    def colons_to_underscores(match):
        return match.group(0).replace(':', '_')

    # Handle timestamps such as 12:34:56 before the generic colon rule
    # turns every ':' into ' -'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', colons_to_underscores, s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
303
304
def orderedSet(iterable):
    """ Return a list with the elements of iterable in first-seen order, duplicates removed """
    # Membership is tested against the list itself, so elements only need
    # to support ==, not hashing.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
312
313
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'.  Named entities and decimal or
    hexadecimal numeric references are decoded; anything else (including a
    numeric reference outside the range the platform can represent) is
    returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    name2codepoint = compat_html_entities.name2codepoint
    if entity in name2codepoint:
        return compat_chr(name2codepoint[entity])

    # Hexadecimal references may contain a-f digits, and the whole entity
    # must be numeric (anchored match), otherwise it is not a reference.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)\Z', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in 'xX':
            base = 16
            numstr = numstr[1:]
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the supported range: keep it literal.
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
332
333
def unescapeHTML(s):
    """Replace the HTML entities in s by the characters they stand for.

    None is passed through.  Entities that cannot be decoded are left in
    their literal form.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    # An entity body may contain neither ';' nor '&'.  Excluding '&' keeps
    # a stray ampersand ("a & b &amp; c") from swallowing the real entity
    # that follows it.
    return re.sub(
        r'&([^&;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
341
342
def encodeFilename(s, for_subprocess=False):
    """
    Encode a file name for use with the file system or subprocess APIs.

    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up offer Unicode APIs.  (Detecting Windows NT 4 is
    # tricky because 'major >= 4' would match the Windows 9x series as
    # well. Besides, NT 4 is obsolete.)
    unicode_windows = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)

    if unicode_windows and not for_subprocess:
        # Pass the unicode string through to use the Unicode APIs.
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
369
370
def encodeArgument(s):
    """Encode a command line argument via encodeFilename(..., True)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
378
379
def decodeOption(optval):
    """Return optval as a text string; bytes are decoded with preferredencoding().

    None is passed through unchanged.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
388
389
def formatSeconds(secs):
    """Format a duration in seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The thresholds are inclusive, so exactly one minute is rendered as
    '1:00' and exactly one hour as '1:00:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
397
398
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler suited to the running Python version.

    params is the options dict; its 'nocheckcertificate' entry (default
    False) disables certificate verification.  Extra keyword arguments are
    forwarded to the YoutubeDLHTTPSHandler constructor.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname is switched off before verify_mode is set to
            # CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            # Fall through to the version-specific code below.
            pass

    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            # Connection class that wraps the socket itself: TLSv1 is tried
            # first and SSLv23 is used if that raises ssl.SSLError.
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    # Set up the tunnel on the plain socket before wrapping it.
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        return YoutubeDLHTTPSHandler(params, https_conn_class=HTTPSConnectionV3, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
438
439
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """

        # Errors raised while a network or availability error is being
        # handled are always treated as expected.
        active_exception_type = sys.exc_info()[0]
        if active_exception_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get bug reporting instructions appended.
            if ytdl_is_updateable():
                update_cmd = 'type youtube-dl -U to update'
            else:
                update_cmd = 'see https://yt-dl.org/update on how to update'
            msg += ''.join((
                '; please report this issue on https://yt-dl.org/bug .',
                ' Make sure you are using the latest version; %s.' % update_cmd,
                ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
            ))
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        formatted_lines = traceback.format_tb(self.traceback)
        return ''.join(formatted_lines)
473
474
class UnsupportedError(ExtractorError):
    """Expected ExtractorError for a URL that is not supported; the URL is kept in .url."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
480
481
class RegexNotFoundError(ExtractorError):
    """Error raised when a regular expression did not match."""
485
486
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors.  It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the sys.exc_info() triple of the original exception that caused the trouble. """
        super(DownloadError, self).__init__(msg)
        # Kept so that the original error can still be inspected later.
        self.exc_info = exc_info
499
500
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
508
509
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class so that str(), args and tracebacks
        # carry the message on every supported Python version.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
519
520
class MaxDownloadsReached(Exception):
    """ Raised to signal that the --max-downloads limit has been reached. """
524
525
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
533
534
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a readable message instead of leaving the base
        # class uninitialised; the raw numbers stay available as attributes.
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
549
550
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the configured source address.

    The connection is built as http_class(*args, **kwargs).  When the
    handler's params contain 'source_address', the connection is made to
    originate from (source_address, 0).
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return hc

    sa = (source_address, 0)
    if hasattr(hc, 'source_address'):  # Python 2.7+
        hc.source_address = sa
        return hc

    # Python 2.6: no source_address support, so connect() is replaced by a
    # version that opens the socket from the wanted address itself.
    def _hc_connect(self, *args, **kwargs):
        sock = compat_socket_create_connection(
            (self.host, self.port), self.timeout, sa)
        if is_https:
            sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
        self.sock = sock

    hc.connect = functools.partial(_hc_connect, hc)
    return hc
569
570
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params; remaining arguments go to the HTTPHandler constructor."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # Read by _create_http_connection (for 'source_address').
        self._params = params

    def http_open(self, req):
        """Open req over a connection built by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response that carries the status code.

        Where addinfourl has no getcode(), the constructor is called
        without the code argument and the code is set as an attribute
        afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and process the internal control headers."""
        for h, v in std_headers.items():
            if h not in req.headers:
                req.add_header(h, v)
        # Internal header: drop Accept-encoding for this request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal header: its value replaces the User-agent header.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments:
            # cut the fragment off the private (name-mangled) attributes.
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate Content-encoding decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; if no attempt
                # succeeds, the first error is re-raised.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request and response processing.
    https_request = http_request
    https_response = http_response
665
666
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler whose connections are built by _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Store params and the connection class (default: the standard HTTPSConnection)."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        if not https_conn_class:
            https_conn_class = compat_http_client.HTTPSConnection
        self._https_conn_class = https_conn_class
        self._params = params

    def https_open(self, req):
        """Open req over a connection of the configured class."""
        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req)
677
678
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    date_str looks like YYYY-MM-DD<delimiter>HH:MM:SS, optionally followed
    by fractional seconds and 'Z' or a +HH:MM / +HHMM style UTC offset.
    None is passed through.
    """

    if date_str is None:
        return None

    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)

    # No suffix, or a plain 'Z', means the time is already UTC.
    utc_offset = datetime.timedelta()
    if tz_match:
        # Cut the (fraction +) timezone suffix off the date string.
        date_str = date_str[:-len(tz_match.group(0))]
        sign_char = tz_match.group('sign')
        if sign_char:
            sign = 1 if sign_char == '+' else -1
            utc_offset = datetime.timedelta(
                hours=sign * int(tz_match.group('hours')),
                minutes=sign * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - utc_offset
    return calendar.timegm(utc_time.timetuple())
702
703
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects how ambiguous numeric dates such as 8/7/2009 are
    read.  Returns None when date_str is None or cannot be parsed.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions += [
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        format_expressions += [
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    # Every format is tried; when several match, the last one wins.
    upload_date = None
    for expression in format_expressions:
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
            upload_date = parsed.strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is not None:
        return upload_date

    # Last resort: the e-mail style date parser.
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
764
765
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url.

    The guess is whatever follows the last '.' before the query string;
    it is returned only if it is purely alphanumeric, otherwise (or when
    url is None) default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    guess = without_query.rpartition('.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', guess):
        return default_ext
    return guess
774
775
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
778
779
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount

    unit = match.group('unit')
    # timedelta knows neither months nor years; they are approximated as
    # 30 and 365 days respectively.
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'

    delta = datetime.timedelta(**{unit + 's': amount})
    return today + delta
807
808
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Strings that do not consist of exactly eight digits are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    year, month, day = match.groups()
    return '-'.join((year, month, day))
817
818
class DateRange(object):
    """Represents a time interval between two dates (both ends included)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing start or end leaves the range open on that side.
        Raises ValueError if start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a date string) is in the range"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
848
849
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte string result is decoded with the preferred encoding.
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
858
859
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    Only output whose file descriptor is 1 or 2 and that is attached to a
    real console is handled here, by calling WriteConsoleW directly.
    Raises OSError when WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument passed to GetStdHandle.
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported by each WriteConsoleW call.
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid handle, for a handle that is not of the
        # character type, or when GetConsoleMode fails on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write up to the next non-BMP character, at most 1024 characters
        # at a time.  A count of 0 means s starts with a non-BMP character,
        # for which a length of 2 is passed instead.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
930
931
def write_string(s, out=None, encoding=None):
    """Write the text string s to out (default: sys.stderr) and flush it.

    Characters that the chosen encoding cannot represent are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr, so it always gets bytes.
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream on top of a binary buffer: encode ourselves and write
        # to the buffer directly.
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
952
953
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Iterating yields one-character strings here (Python 2 str).
        return list(map(ord, bs))
    # Python 3: iterating bytes already yields ints.
    return list(bs)
961
962
def intlist_to_bytes(xs):
    """Pack the sequence of byte values xs into a byte string."""
    if xs:
        # One unsigned byte ('B') per element.
        return struct_pack('%dB' % len(xs), *xs)
    return b''
967
968
# Cross-platform file locking: _lock_file(f, exclusive) and _unlock_file(f)
# are defined with LockFileEx/UnlockFileEx on win32 and with fcntl.flock
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout of the structure passed to LockFileEx/UnlockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORD of the byte count passed to both calls.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # The pointer is stored on the file object so that _unlock_file can
        # pass the same structure again.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock and 0x0 otherwise.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) flock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1032
1033
class locked_file(object):
    """Context manager around a file that is locked for as long as it is in use.

    The file is opened on construction.  Entering the context takes a
    shared lock for mode 'r' and an exclusive lock for 'a' and 'w';
    leaving it releases the lock and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError, which
            # is not an IOError on Python 2.  Catch both so that the file
            # handle is never leaked when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed.
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the write() method of the underlying file."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the read() method of the underlying file."""
        return self.f.read(*args)
1063
1064
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1068
1069
def shell_quote(args):
    """Quote every argument with pipes.quote and join them with spaces.

    Byte strings are decoded with the filesystem encoding before quoting.
    """
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join(pipes.quote(as_text(arg)) for arg in args)
1079
1080
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the first element for
    which pred is false before stopping """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1088
1089
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    The data is JSON-encoded, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1096
1097
def unsmuggle_url(smug_url, default=None):
    """Split a URL carrying smuggled data into a (url, data) pair.

    Returns (smug_url, default) if the URL does not contain
    '#__youtubedl_smuggle'.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1105
1106
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit, e.g. '1.50KiB'.

    Returns 'N/A' if bytes is None. A str argument is converted with
    float() first. The value is shown with two decimals and the largest
    unit from B to YiB that does not exceed it; values beyond the YiB
    range are expressed in YiB and values below one byte in B.

    Raises ValueError (from math.log) for negative values.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: huge values would
        # otherwise raise IndexError, and values between 0 and 1 would get
        # a negative exponent and wrap around to the 'YiB' suffix.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1119
1120
def parse_filesize(s):
    """Parse a file size such as '1.5 MiB' or '3,2GB' into a byte count.

    Returns an int, or None if s is None or does not start with a number
    (with '.' or ',' as the decimal separator) followed by a known unit.
    """
    if s is None:
        return None

    # 'XiB' and the lower-case-prefix form 'xB' are binary multiples,
    # 'XB' and 'Xb' are decimal ones.
    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    multipliers = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        multipliers[prefix + 'iB'] = binary
        multipliers[prefix + 'B'] = decimal
        multipliers[prefix.lower() + 'B'] = binary
        multipliers[prefix + 'b'] = decimal

    unit_alternatives = '|'.join(re.escape(unit) for unit in multipliers)
    pattern = (
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % unit_alternatives)
    match = re.match(pattern, s)
    if match is None:
        return None

    number = float(match.group('num').replace(',', '.'))
    return int(number * multipliers[match.group('unit')])
1173
1174
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    A non-empty COLUMNS environment variable takes precedence and is
    converted with int(). Otherwise the second field printed by
    `stty size` is used; any failure while obtaining it yields None.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only: stty may be missing or print nothing usable.
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        pass
    return None
1189
1190
def month_by_name(name):
    """ Return the number (1-12) of a month by its English name,
    independently of the locale, or None if the name is unknown """

    english_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December')
    if name in english_names:
        return english_names.index(name) + 1
    return None
1201
1202
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not already start an entity by '&amp;'

    Left untouched are amp, lt, gt, apos, quot and numeric character
    references with up to four (hexadecimal) digits.
    """
    bare_ampersand = (
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return re.sub(bare_ampersand, '&amp;', xml_str)
1209
1210
def setproctitle(title):
    """Set the name of the current process through libc's prctl().

    Does nothing if libc.so.6 cannot be loaded or has no prctl function.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # byte, so the name handed to prctl is always NUL-terminated. A buffer
    # sized to exactly len(title_bytes) has no room for the terminator.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1224
1225
def remove_start(s, start):
    """Return s without its leading start, if s begins with it."""
    return s[len(start):] if s.startswith(start) else s
1230
1231
def remove_end(s, end):
    """Return s without its trailing end, if s finishes with it.

    s is returned unchanged when end is empty or is not a suffix of s.
    """
    # The emptiness check matters: every string ends with '', and
    # s[:-0] is s[:0], which would wrongly yield an empty string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1236
1237
def url_basename(url):
    """Return the last '/'-separated component of the URL's path,
    ignoring leading and trailing slashes."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1241
1242
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() always returns 'HEAD'."""

    def get_method(self):
        return 'HEAD'
1246
1247
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default if there is no value.

    If get_attr is given and v is not None, that attribute of v is used
    as the value (None if the attribute is missing). None and the empty
    string both count as "no value".
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1255
1256
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1259
1260
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are dropped before the conversion. Returns None for None. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1267
1268
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1271
1272
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Accepted forms are a bare minute or hour count such as '3 min' or
    '1.5 hours', and [[hours]minutes]seconds[.fraction] with ':' or unit
    names as separators, optionally prefixed by 'T' or 'PT'.
    Returns None if s is not a string or does not match.
    """
    string_type = basestring if sys.version_info < (3, 0) else compat_str
    if not isinstance(s, string_type):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # A lone minute or hour count may be fractional
    only_mins = m.group('only_mins')
    if only_mins:
        return float(only_mins) * 60
    only_hours = m.group('only_hours')
    if only_hours:
        return float(only_hours) * (60 * 60)

    seconds = 0
    for group, factor in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        value = m.group(group)
        if value:
            seconds += int(value) * factor
    fraction = m.group('ms')
    if fraction:
        seconds += float(fraction)
    return seconds
1307
1308
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename,
    e.g. ('video.mp4', 'temp') gives 'video.temp.mp4'."""
    stem, current_ext = os.path.splitext(filename)
    return '{stem}.{ext}{current_ext}'.format(
        stem=stem, ext=ext, current_ext=current_ext)
1312
1313
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started and returns its name,
    or False if starting it raises OSError.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1322
1323
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    The combined stdout/stderr output of `exe args` is handed to
    detect_exe_version together with version_re and unrecognized. """
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):
        # Undecodable bytes are dropped rather than reported
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1337
1338
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first group of version_re found in output.

    version_re defaults to a pattern matching 'version <something>'.
    If nothing matches, unrecognized is returned instead.
    """
    assert isinstance(output, compat_str)
    pattern = (
        r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re)
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1348
1349
class PagedList(object):
    """Base class for paged lists; subclasses supply getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1354
1355
class OnDemandPagedList(PagedList):
    """Paged list that requests pages from pagefunc one by one, stopping
    at the first page that is not full."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Collect entries from index start on, up to index end if given."""
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            page_first = pagenum * self._pagesize
            page_limit = page_first + self._pagesize
            if start >= page_limit:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            if page_first <= start < page_limit:
                skip = start % self._pagesize
            else:
                skip = 0

            # Offset just past the last wanted entry, if end falls here
            if end is not None and page_first <= end <= page_limit:
                stop = ((end - 1) % self._pagesize) + 1
            else:
                stop = None

            if skip != 0 or stop is not None:
                entries = entries[skip:stop]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + skip < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_limit:
                break
        return collected
1397
1398
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (excluding) end.

        With end=None every page from the one containing start up to the
        last page is fetched. Pages at or beyond pagecount are never
        requested, even if end points past them.
        """
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
        else:
            # Cap at the declared page count so that a large end does not
            # trigger requests for pages that do not exist
            end_page = min(self._pagecount, end // self._pagesize + 1)
        # Entries to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted; None means "no upper bound"
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1426
1427
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape (eight hex digits) found in s by
    the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1434
1435
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 a unicode string is encoded as UTF-8 before quoting
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1441
1442
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Path, params, query and fragment are each passed through
    escape_rfc3986; the other URL components are kept as parsed.
    """
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1452
# struct_pack / struct_unpack: use struct.pack / struct.unpack directly if
# they accept a text format string, else wrappers that encode it first.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1469
1470
def read_batch_urls(batch_fd):
    """Read the URLs listed in the file object batch_fd and close it.

    Each line is one entry. Byte lines are decoded as UTF-8 (undecodable
    bytes are replaced). A leading byte order mark and surrounding
    whitespace are removed; empty lines and lines starting with '#', ';'
    or ']' are skipped. Returns the remaining URLs as a list.
    """
    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # '\ufeff' is the byte order mark once decoded as UTF-8;
        # '\xef\xbb\xbf' is what the same three bytes look like when the
        # text was decoded as Latin-1 instead.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1485
1486
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments with compat_urllib_parse.urlencode and
    return the result encoded as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1489
1490
# etree_iter(n): iterate over the elements of an ElementTree node, using
# Element.iter where it exists.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1495
1496
def parse_xml(s):
    """Parse the XML text s and return its root element.

    Doctype declarations are ignored. On Python 2, byte-string element
    texts are decoded as UTF-8 afterwards.
    """
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    data = s.encode('utf-8')
    if sys.version_info >= (2, 7):
        root = xml.etree.ElementTree.XML(data, parser=parser)
    else:
        root = xml.etree.ElementTree.XML(data)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return root
1512
1513
# Numeric age limit for each rating label; parse_age_limit looks labels up
# here when its input is not a plain number.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1521
1522
def parse_age_limit(s):
    """Parse an age limit such as '18' or '16+' into an int.

    Anything else is looked up in US_RATINGS (None if unknown).
    Returns None for None.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1528
1529
def strip_jsonp(code):
    """Return the argument of a JSONP call like 'callback(...);'.

    Input that does not look like such a call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
1533
1534
def js_to_json(code):
    """Rewrite JavaScript object notation so that it is closer to JSON.

    Single-quoted strings and bare identifiers become double-quoted
    strings (true, false and null are kept), and a comma directly before
    a closing ']' is dropped.
    """
    # How escapes inside a single-quoted string read inside double quotes
    requoted = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: requoted[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', r'\1', quoted)
1558
1559
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its index in quality_ids,
    or to -1 if the id is not listed.
    """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            return -1
        return position
    return q
1568
1569
# %-style template with 'title', 'id' and 'ext' keys; presumably the default
# output filename template - confirm against the code that uses it.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1571
1572
def limit_length(s, length):
    """ Shorten strings longer than length, ending them with '...'.
    None is passed through. """
    ELLIPSES = '...'
    if s is None or len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1581
1582
def version_tuple(v):
    """Split v at '-' and '.' and return the parts as a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
1585
1586
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    If version is empty or either version cannot be parsed, the answer
    is `not assume_new`.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
1594
1595
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. whether this
    module was loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1601
1602
def args_to_str(args):
    """Return the arguments, each quoted with shlex_quote, joined by spaces."""
    # Get a short string representation for a subprocess command
    quoted = (shlex_quote(arg) for arg in args)
    return ' '.join(quoted)
1606
1607
def urlhandle_detect_ext(url_handle):
    """Return the second '/'-separated part of the handle's Content-Type
    header, e.g. 'mp4' for 'video/mp4'."""
    try:
        headers = url_handle.headers
    except AttributeError:  # Python < 3
        content_type = url_handle.info().getheader('Content-Type')
    else:
        content_type = headers['Content-Type']

    return content_type.split("/")[1]
1616
1617
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both limits
    are set and age_limit is below content_limit """

    if age_limit is None or content_limit is None:
        # No limit set, or content available for everyone
        return False
    return age_limit < content_limit