Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
debian/copyright: Update my copyright years.
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_getenv,
39 compat_html_entities,
40 compat_http_client,
41 compat_parse_qs,
42 compat_socket_create_connection,
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
49 shlex_quote,
50 )
51
52
# The class of compiled regular expressions has no public name on every
# supported Python version, so take it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default request headers; YoutubeDLHandler.http_request adds each one that
# a request does not already carry.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]
68
69
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails, or names a codec that cannot actually encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; the locale may name an encoding Python cannot use.
        'TEST'.encode(pref)
    except Exception:
        # Still broad on purpose (locale, lookup and codec errors all end up
        # here), but unlike a bare except it lets KeyboardInterrupt and
        # SystemExit propagate.
        pref = 'UTF-8'

    return pref
83
84
def write_json_file(obj, fn):
    """ Serialize obj as JSON into the file fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        fs_encoding = get_filesystem_encoding()
        # os.path.basename / os.path.dirname return bytes objects here, but
        # NamedTemporaryFile fails on non-ASCII names unless it is given
        # unicode objects, so decode both pieces.
        tmp_prefix = os.path.basename(fn).decode(fs_encoding)
        tmp_dir = os.path.dirname(fn).decode(fs_encoding)
    else:
        tmp_prefix = os.path.basename(fn)
        tmp_dir = os.path.dirname(fn)

    # The temporary file lives next to the target so that the final rename
    # stays within one directory.
    tmp_kwargs = {
        'suffix': '.tmp',
        'prefix': tmp_prefix + '.',
        'dir': tmp_dir,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        tmp_kwargs['mode'] = 'wb'
    else:
        tmp_kwargs['mode'] = 'w'
        tmp_kwargs['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**tmp_kwargs)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        # Whatever went wrong, do not leave the temporary file behind, then
        # let the original exception continue.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
137
138
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Return the first element matching xpath[@key=val], or None """
        # key and val are spliced into the XPath expression, so only accept
        # characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Return the first element matching xpath[@key=val], or None """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        # No attribute predicates are used on this branch; filter by hand.
        candidates = (el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
157
158 # On python2.6 the xml.etree.ElementTree.Element methods don't support
159 # the namespace parameter
160
161
def xpath_with_ns(path, ns_map):
    """ Expand every prefix:tag step of path to {uri}tag, using ns_map[prefix] """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            # No prefix: keep the step untouched
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
172
173
def xpath_text(node, xpath, name=None, fatal=False):
    """ Return the text of the first element matching xpath below node

    When no element matches, or the match has no text, None is returned,
    unless fatal is set, in which case ExtractorError is raised (the
    message mentions name, or the xpath itself if name is None).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    element = node.find(xpath)
    if element is not None and element.text is not None:
        return element.text
    if not fatal:
        return None
    label = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % label)
186
187
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals the given ID in the passed HTML document"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
191
192
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag carrying attribute=value in the passed HTML document

    Returns None when no such tag is found; otherwise the HTML-unescaped
    text between the opening tag and the matching closing tag.
    """

    # Verbose-mode pattern: tag name, any other attributes, the wanted
    # attribute (optionally quoted), any further attributes, the content,
    # and finally the closing tag for the same name.
    found = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not found:
        return None

    content = found.group('content')
    # Content that begins with a quote character loses its first and last
    # character.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
214
215
def clean_html(html):
    """Turn an HTML snippet into a readable plain-text string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines carry no meaning in HTML ...
    text = html.replace('\n', ' ')
    substitutions = (
        # ... while <br> and a paragraph boundary do
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
231
232
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once
    with the characters that are forbidden in win32 file names replaced by
    '#'. If that is not possible either, the error is raised like the
    standard open() function would.

    The special filename '-' selects standard output (its binary buffer
    when there is one).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem is not going to be solved by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join takes the components as separate arguments, so the
        # sanitized parts have to be unpacked (handing it a generator as
        # its only argument does not join anything).
        # NOTE: separators inside the directory part are replaced as well.
        alt_filename = os.path.join(*[
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)])
        if alt_filename == filename:
            # Nothing to sanitize: re-raise the original error
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name, which is also the name reported back.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
266
267
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp (None if it cannot be parsed)"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
275
276
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            # Shell-sensitive punctuation, whitespace and non-ASCII
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    # Handle timestamps: keep 12:34:56 together as 12_34_56
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are only cleaned character by character
        return result

    # Collapse runs of underscores, then trim them from both ends
    result = re.sub(r'_{2,}', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    return result or '_'
312
313
def orderedSet(iterable):
    """ Return the distinct elements of iterable as a list, in first-seen order """
    # Membership is tested against the list itself, so elements only need
    # to support ==; unhashable ones work as well.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
321
322
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities are looked up
    in compat_html_entities.name2codepoint; numeric references may be
    decimal (#38) or hexadecimal (#x26, #X26). Anything that cannot be
    resolved is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references need the full 0-9a-f digit set; matching
    # decimal digits only would cut '#x2F' down to '#x2'.
    mobj = re.match(r'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))', entity)
    if mobj is not None:
        hex_digits, dec_digits = mobj.groups()
        try:
            if hex_digits is not None:
                return compat_chr(int(hex_digits, 16))
            return compat_chr(int(dec_digits, 10))
        except (ValueError, OverflowError):
            # Not a valid code point: fall back to the literal form below
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
341
342
def unescapeHTML(s):
    """ Replace every &...; entity in s with the text it stands for (None is passed through) """
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        # Group 1 is the entity without its '&' and ';' delimiters
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', decode_entity, s)
350
351
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is handed to a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up have Unicode APIs, so the unicode object can be
    # passed through as is.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    # Characters the target encoding cannot represent are dropped
    return s.encode(encoding, 'ignore')
378
379
def encodeArgument(s):
    """ Encode s via encodeFilename with for_subprocess=True """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # TODO: once all post processors are fixed, fail here instead with
    # 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
387
388
def decodeOption(optval):
    """ Return optval as text, decoding bytes with the preferred encoding (None is passed through) """
    if optval is None:
        return optval

    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(decoded, compat_str)
    return decoded
397
398
def formatSeconds(secs):
    """ Format a number of seconds as H:MM:SS, M:SS or plain seconds

    The larger form is only used strictly above 3600 / 60 seconds.
    """
    if secs > 3600:
        hours, remainder = divmod(secs, 3600)
        return '%d:%02d:%02d' % (hours, remainder // 60, secs % 60)
    if secs > 60:
        return '%d:%02d' % divmod(secs, 60)
    return '%d' % secs
406
407
def make_HTTPS_handler(params, **kwargs):
    """ Build a YoutubeDLHTTPSHandler using the best SSL setup the running Python offers

    Certificate verification is switched off when params['nocheckcertificate']
    is true. Extra keyword arguments are handed to the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=ctx, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No SSL context can be passed to the handler
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        ctx.verify_mode = ssl.CERT_NONE
    else:
        ctx.verify_mode = ssl.CERT_REQUIRED
    ctx.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=ctx, **kwargs)
431
432
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is mentioned in the message; video_id, if given, prefixes it.
        """

        # Network trouble and unavailable videos count as expected when
        # this error is created while such an exception is being handled.
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the bug-reporting instructions appended
            update_cmd = (
                'type youtube-dl -U to update' if ytdl_is_updateable()
                else 'see https://yt-dl.org/update on how to update')
            msg += '; please report this issue on https://yt-dl.org/bug .'
            msg += ' Make sure you are using the latest version; %s.' % update_cmd
            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """ Return the stored traceback formatted as one string, or None if there is none """
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
466
467
class UnsupportedError(ExtractorError):
    """Extractor error for an unsupported URL; always flagged as expected."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        # Keep the URL itself available to handlers
        self.url = url
473
474
class RegexNotFoundError(ExtractorError):
    """Extractor error for a regular expression that did not match."""
478
479
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ msg is the error message.
        exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info()).
        """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
492
493
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
501
502
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """ msg is the error message; it is also kept as self.msg. """
        # Initialize the base class explicitly instead of relying on
        # BaseException.__new__ having recorded the arguments.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
512
513
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
517
518
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
526
527
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded
    file is smaller than what the server announced first, indicating the
    connection was probably interrupted.
    """
    # Both sizes are in bytes; the class-level defaults are None
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """ downloaded is the size received, expected the size announced. """
        self.downloaded, self.expected = downloaded, expected
542
543
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """ Create an http_class connection, bound to the 'source_address' param if one is set

    ydl_handler supplies the parameters (its _params dict); the remaining
    positional and keyword arguments go to http_class.
    """
    conn = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return conn

    # Port 0 is passed alongside the requested address
    bind_address = (source_address, 0)
    if hasattr(conn, 'source_address'):  # Python 2.7+
        conn.source_address = bind_address
        return conn

    # Python 2.6: no source_address attribute, so replace connect()
    def _connect_from_source(self, *args, **kwargs):
        sock = compat_socket_create_connection(
            (self.host, self.port), self.timeout, bind_address)
        if is_https:
            self.sock = ssl.wrap_socket(
                sock, self.key_file, self.cert_file,
                ssl_version=ssl.PROTOCOL_TLSv1)
        else:
            self.sock = sock

    conn.connect = functools.partial(_connect_from_source, conn)
    return conn
564
565
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class adds the standard
    headers (std_headers) to every HTTP request that lacks them, and
    unpacks gzipped and deflated responses from web servers. To avoid
    compression for a particular request, the calling code only has to
    include the HTTP header "Youtubedl-No-Compression" in it; the header
    is removed before the real request is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # Read by _create_http_connection (source_address)
        self._params = params

    def http_open(self, req):
        connection_factory = functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """ Decompress deflate data, with or without a zlib header """
        try:
            # Negative window bits: raw stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """ Build an addinfourl response, setting code by hand where the constructor cannot """
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        response = addinfourl(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        for name, default_value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() in req.headers:
                continue
            req.add_header(name, default_value)
        if 'Youtubedl-no-compression' in req.headers:
            # Opt-out marker: drop Accept-Encoding and the marker itself
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        original_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            raw = resp.read()
            try:
                unpacked = io.BytesIO(
                    gzip.GzipFile(fileobj=io.BytesIO(raw), mode='rb').read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off.
                for trim in range(1, 1024):
                    try:
                        unpacked = io.BytesIO(
                            gzip.GzipFile(fileobj=io.BytesIO(raw[:-trim]), mode='rb').read())
                    except IOError:
                        continue
                    break
                else:
                    # No amount of trimming helped: report the first error
                    raise original_ioerror
            resp = self.addinfourl_wrapper(
                unpacked, original_resp.headers, original_resp.url, original_resp.code)
            resp.msg = original_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(
                inflated, original_resp.headers, original_resp.url, original_resp.code)
            resp.msg = original_resp.msg
        return resp

    # HTTPS traffic gets the same request/response treatment
    https_request = http_request
    https_response = http_response
657
658
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """ HTTPS handler that opens its connections through _create_http_connection """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        # Fall back to the standard HTTPS connection class
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward the SSL settings the base handler stored, where present:
        # _context on python > 2.6, _check_hostname on python 3.x
        forwarded = (
            ('context', '_context'),
            ('check_hostname', '_check_hostname'),
        )
        kwargs = dict(
            (keyword, getattr(self, attr))
            for keyword, attr in forwarded if hasattr(self, attr))
        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **kwargs)
674
675
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally
    followed by fractional seconds (which are ignored) and a timezone
    designator ('Z' or an offset such as +01:00 / +0100).
    timezone, if given, is a datetime.timedelta offset from UTC to use
    instead of looking for a designator in date_str.
    Returns None if date_str is None; raises ValueError if the date does
    not have the expected form.
    """

    if date_str is None:
        return None

    if timezone is None:
        m = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            # No designator: treat the time as UTC
            timezone = datetime.timedelta()
        else:
            # Cut off the designator (and the fraction in front of it)
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    # Fractional seconds can still be present when there was no timezone
    # designator (or timezone was passed in); the format below has no
    # place for them.
    date_str = re.sub(r'\.[0-9]+$', '', date_str)
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
700
701
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first decides whether ambiguous numeric dates are read as
    day/month or as month/day. Returns None if date_str is None or
    cannot be parsed.
    """

    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)

    candidate_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        candidate_formats += [
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        candidate_formats += [
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    upload_date = None
    # Every format is tried; the last one that parses determines the result
    for fmt in candidate_formats:
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue
    if upload_date is not None:
        return upload_date

    # Last resort: the e-mail (RFC 2822) date parser
    timetuple = email.utils.parsedate_tz(cleaned)
    if timetuple:
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
762
763
def determine_ext(url, default_ext='unknown_video'):
    """ Guess the file extension from url, falling back to default_ext

    The guess is whatever follows the last '.' before the query string;
    it is only accepted when it is purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
772
773
def subtitles_filename(filename, sub_lang, sub_format):
    """ Replace the extension of filename by '<sub_lang>.<sub_format>' """
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
776
777
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months count as 30 days and years as 365 days.
    Raises ValueError for a string in none of these formats."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' is not a valid escape in a normal string literal
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta knows neither months nor years, so approximate both
        # in days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keywords are plural ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
805
806
def hyphenate_date(date_str):
    """
    Turn a date in 'YYYYMMDD' format into 'YYYY-MM-DD'; any other string is returned unchanged"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    year, month, day = match.groups()
    return '-'.join((year, month, day))
815
816
class DateRange(object):
    """Represents a time interval between two dates (both ends included)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing start means the earliest representable date, a missing
        end the latest one. Raises ValueError if start lies after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
846
847
def platform_name():
    """ Returns platform.platform() as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
856
857
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Win32 WriteConsoleW call, used only when
    out is file descriptor 1 or 2 and that descriptor is attached to a
    console. Raises OSError if the console write fails."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 / -12 are the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        # Neither stdout nor stderr
        return False

    # Bind the kernel32 functions with explicit prototypes
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A console is a character device (ignoring the remote flag) for
        # which GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        # E.g. redirected output: leave the writing to the caller
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop in front of
    # the next non-BMP character
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of 2
        # is passed for it instead
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
931
932
def write_string(s, out=None, encoding=None):
    """ Write the text s to out (sys.stderr by default) and flush it

    encoding, if given, is used whenever the text has to be encoded.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows, try the console API first, unless an encoding was requested
    console_candidate = sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno')
    if console_candidate and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream with a binary layer underneath: encode ourselves and
        # write to that layer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
953
954
def bytes_to_intlist(bs):
    """ Convert a byte string into a list of integer byte values """
    if not bs:
        return []
    # Indexing yields an int on Python 3 and a one-character string on
    # Python 2
    if isinstance(bs[0], int):
        return list(bs)
    return list(map(ord, bs))
962
963
def intlist_to_bytes(xs):
    """ Pack a list of integer byte values into a byte string """
    if xs:
        # One unsigned byte ('B') per element
        return struct_pack('%dB' % len(xs), *xs)
    return b''
968
969
970 # Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive)  lock the open file f (exclusive or shared)
#   _unlock_file(f)           release that lock again
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes mirror of the OVERLAPPED structure that LockFileEx and
    # UnlockFileEx take as their last argument
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # The locked range starts at offset 0
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the same
        # structure again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is the Win32 LOCKFILE_EXCLUSIVE_LOCK flag; 0x0 requests a
        # shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Only valid after _lock_file has been called on f
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # flock: exclusive lock or shared lock
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
1033
1034
class locked_file(object):
    """ File wrapper that holds a lock on the file while used as a context manager

    The file is opened right away; the lock is taken on entering the
    with-block (shared for mode 'r', exclusive otherwise) and released,
    together with the file, on leaving it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Without the lock the file is of no use: close it again
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1064
1065
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' when that is None """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1069
1070
def shell_quote(args):
    """ Return the argument list as a single shell-escaped command string.

    Byte string arguments are decoded with the filesystem encoding before
    being quoted; the quoted arguments are joined with single spaces.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # Use the same quoting helper as args_to_str; the pipes module is
        # deprecated and no longer available in recent Python versions.
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1080
1081
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the last evaluated element
    (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1089
1090
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1097
1098
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    URLs without a '#__youtubedl_smuggle' fragment are returned unchanged
    together with default. """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        encoded = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1106
1107
def format_bytes(bytes):
    """ Format a byte count with a binary (1024-based) unit suffix.

    bytes may be None (returns 'N/A'), a number or a numeric string. The
    value is rendered with two decimals and the largest suffix from 'B' to
    'YiB' that applies, e.g. 1536 -> '1.50KiB'. Raises ValueError for
    negative values and for strings that are not numbers.
    """
    if bytes is None:
        return 'N/A'
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    # float() handles numbers as well as any kind of numeric string, so no
    # exact type check on the argument is needed.
    size = float(bytes)
    if size == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(size, 1024.0))
    # Keep the exponent within the suffix table: tiny fractions would
    # otherwise index from the end of the list and huge values past it.
    exponent = min(max(exponent, 0), len(SUFFIXES) - 1)
    converted = size / float(1024 ** exponent)
    return '%.2f%s' % (converted, SUFFIXES[exponent])
1120
1121
def parse_filesize(s):
    """ Parse a file size such as '5 KiB' or '1,5MB' into a number of bytes.

    The string must start with a number (comma or dot as decimal separator)
    followed by optional whitespace and a unit. Returns an int, or None if
    s is None or does not match. """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
    found = re.match(pattern, s)
    if found is None:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1174
1175
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence; otherwise the output
    of `stty size` is consulted. Any failure to determine the width yields
    None rather than an exception. """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Unusable COLUMNS value, fall back to stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but do
        # not swallow KeyboardInterrupt or SystemExit.
        pass
    return None
1190
1191
def month_by_name(name):
    """ Return the number (1-12) of a month by its (locale-independent)
    English name, or None if name is not a month name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1199
1200
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month by its (locale-independent)
    three-letter English abbreviation, or None if there is no such month """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1209
1210
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    Ampersands that already start one of the predefined entities (amp, lt,
    gt, apos, quot) or a numeric character reference are left untouched.
    """
    # Character references need at least one digit and may be longer than
    # four digits (e.g. &#x1F600;), so do not bound the digit count.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        '&amp;',
        xml_str)
1217
1218
def setproctitle(title):
    """ Set the name of the current process via libc's prctl, if possible.

    title must be a text string. Nothing happens when libc.so.6 cannot be
    loaded or does not provide prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes object makes it one byte larger
    # than the title, so the string handed to prctl is NUL-terminated.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1232
1233
def remove_start(s, start):
    """ Return s without the prefix start; s is returned as is if it does
    not begin with start """
    return s[len(start):] if s.startswith(start) else s
1238
1239
def remove_end(s, end):
    """ Return s without the suffix end; s is returned as is if it does not
    finish with end """
    # An empty suffix must be special-cased: s[:-0] would be the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1244
1245
def url_basename(url):
    """ Return the last '/'-separated component of the path of url """
    parsed = compat_urlparse.urlparse(url)
    components = parsed.path.strip('/').split('/')
    return components[-1]
1249
1250
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
1254
1255
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, returning default for None and ''.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). The result is multiplied by
    invscale and floor-divided by scale. """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1263
1264
def str_or_none(v, default=None):
    """ Convert v to a text string, returning default if v is None """
    if v is None:
        return default
    return compat_str(v)
1267
1268
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are dropped before the conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1275
1276
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float multiplied by invscale and divided by scale,
    returning default if v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
1279
1280
def parse_duration(s):
    """ Parse a duration string into a number of seconds.

    Accepts forms such as '90 mins', '1.5 hours', '1:02:03', '3h 11m 53s'
    or 'PT1H2M3S'. Returns None if s is not a string or is not recognized.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # A lone (possibly fractional) amount of minutes or hours
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Integer components, each with the number of seconds it stands for
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, factor in seconds_per_unit:
        value = m.group(group_name)
        if value:
            res += int(value) * factor
    # The fractional part of the seconds (including the leading dot)
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
1325
1326
def prepend_extension(filename, ext):
    """ Insert ext in front of the existing extension of filename,
    e.g. ('a.mp4', 'temp') -> 'a.temp.mp4' """
    stem, original_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, original_ext)
1330
1331
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary cannot be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1340
1341
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    The executable is run with args and its combined stdout/stderr output
    is handed to detect_exe_version. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1355
1356
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extract a version from the output of a program.

    Returns the first group of the first match of version_re (by default
    the word following 'version') in output, or unrecognized if there is
    no match. """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1366
1367
class PagedList(object):
    """ Base class for paged lists; subclasses provide getslice() """

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1372
1373
class OnDemandPagedList(PagedList):
    """ Paged list whose pages are fetched one by one via pagefunc(pagenum)
    until a slice is complete or a page comes back short """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            firstid = pagenum * size
            nextfirstid = firstid + size
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted element within this page
            if firstid <= start < nextfirstid:
                startv = start % size
            else:
                startv = 0

            # Offset just past the last wanted element, if it is on this page
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % size) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
1415
1416
class InAdvancePagedList(PagedList):
    """ Paged list for which the number of pages is known up front """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        size = self._pagesize
        first_page = start // size
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = end // size + 1
            remaining = end - start
        # Elements to drop from the front of the first fetched page
        to_skip = start - first_page * size

        items = []
        for pagenum in range(first_page, last_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    items.extend(page[:remaining])
                    break
                remaining -= len(page)
            items.extend(page)
        return items
1444
1445
def uppercase_escape(s):
    """ Replace every literal \\UXXXXXXXX escape sequence in s by the
    character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(found):
        return decode(found.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _unescape, s)
1452
1453
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if is_py2_text:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1459
1460
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    url_parsed = compat_urllib_parse_urlparse(url)
    escaped = dict(
        (field, escape_rfc3986(getattr(url_parsed, field)))
        for field in ('path', 'params', 'query', 'fragment'))
    return url_parsed._replace(**escaped).geturl()
1470
# struct_pack / struct_unpack: struct.pack / struct.unpack that accept a text
# format string on every supported Python version.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _bytes_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_bytes_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_bytes_spec(spec), *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1487
1488
def read_batch_urls(batch_fd):
    """ Read the URLs from a batch file object, one URL per line.

    Lines are decoded as UTF-8 if necessary, stripped of a leading byte
    order mark and of surrounding whitespace; empty lines and lines starting
    with '#', ';' or ']' are skipped. batch_fd is closed afterwards. """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A properly decoded UTF-8 BOM is the single character '\ufeff';
        # '\xef\xbb\xbf' is what it looks like if its bytes were decoded
        # one character per byte. Strip either form.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1503
1504
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments like urlencode does and return the result
    as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1507
1508
# etree_iter(node): iterate over the elements of an ElementTree node
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1513
1514
def parse_xml(s):
    """ Parse the XML document in the text string s and return its root
    element; doctype declarations are ignored """
    etree = xml.etree.ElementTree

    class _DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=_DoctypeIgnoringBuilder())
    extra = {}
    if sys.version_info >= (2, 7):
        extra['parser'] = parser
    root = etree.XML(s.encode('utf-8'), **extra)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return root
1530
1531
# Age limit associated with each US rating label (see parse_age_limit)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1539
1540
def parse_age_limit(s):
    """ Parse an age limit such as '18' or '16+', or a US rating label.

    Returns the age as an int, or None if s is None or not recognized. """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1546
1547
def strip_jsonp(code):
    """ Remove a JSONP wrapper 'callback(...);' (optionally followed by
    // comments) and return the wrapped payload; other input is returned
    unchanged """
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
1551
1552
def js_to_json(code):
    """ Convert a JavaScript object literal to JSON.

    Single-quoted strings and bare identifiers are turned into double-quoted
    strings and a trailing comma before ']' is dropped. """
    # How escapes of a single-quoted string look inside a double-quoted one
    requoted = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        # Literals and double-quoted strings are valid JSON already
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: requoted[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), res)
1576
1577
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in quality_ids,
    or to -1 if it is not listed. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1586
1587
# %-style mapping template built from the title, id and ext fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1589
1590
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that the result, including the
    trailing '...', is at most length characters long. None is passed
    through unchanged. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length <= len(ELLIPSES):
        # No room for any text: a negative slice bound would keep most of
        # the string, so return as much of the ellipses as fits.
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
1599
1600
def version_tuple(v):
    """ Split a version string at '.' and '-' into a tuple of ints """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1603
1604
def is_outdated_version(version, limit, assume_new=True):
    """ Return whether version is older than limit.

    A missing or unparseable version counts as outdated only if assume_new
    is false. """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return not assume_new
1612
1613
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1619
1620
def args_to_str(args):
    """ Get a short string representation for a subprocess command """
    return ' '.join(map(shlex_quote, args))
1624
1625
def mimetype2ext(mt):
    """ Return the file extension for a MIME type: its subtype, except for
    a few subtypes that are mapped explicitly """
    subtype = mt.rpartition('/')[2]
    known_subtypes = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }
    return known_subtypes.get(subtype, subtype)
1633
1634
def urlhandle_detect_ext(url_handle):
    """ Guess the file extension of an opened URL.

    The filename of a Content-Disposition attachment header is preferred;
    otherwise the extension is derived from the Content-Type header. """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    if cd:
        attachment = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
1651
1652
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1661
1662
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The result is truthy if the decoded data starts, after optional
    whitespace, with '<'. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is treated as UTF-8
    encoding, offset = 'utf-8', 0
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = bom_encoding, len(bom)
            break
    text = first_bytes[offset:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
1681
1682
def determine_protocol(info_dict):
    """ Return the protocol of a format: its explicit 'protocol' entry if
    present, otherwise one derived from its URL """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1703
1704
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column except the last is left-aligned and padded to the width of
    its longest value; rows are separated by newlines. """
    rows = [header_row] + data
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    padded_specs = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_specs) + '%s'
    lines = [row_format % tuple(row) for row in rows]
    return '\n'.join(lines)
1711
1712
1713 def _match_one(filter_part, dct):
1714 COMPARISON_OPERATORS = {
1715 '<': operator.lt,
1716 '<=': operator.le,
1717 '>': operator.gt,
1718 '>=': operator.ge,
1719 '=': operator.eq,
1720 '!=': operator.ne,
1721 }
1722 operator_rex = re.compile(r'''(?x)\s*
1723 (?P<key>[a-z_]+)
1724 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1725 (?:
1726 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1727 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1728 )
1729 \s*$
1730 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1731 m = operator_rex.search(filter_part)
1732 if m:
1733 op = COMPARISON_OPERATORS[m.group('op')]
1734 if m.group('strval') is not None:
1735 if m.group('op') not in ('=', '!='):
1736 raise ValueError(
1737 'Operator %s does not support string values!' % m.group('op'))
1738 comparison_value = m.group('strval')
1739 else:
1740 try:
1741 comparison_value = int(m.group('intval'))
1742 except ValueError:
1743 comparison_value = parse_filesize(m.group('intval'))
1744 if comparison_value is None:
1745 comparison_value = parse_filesize(m.group('intval') + 'B')
1746 if comparison_value is None:
1747 raise ValueError(
1748 'Invalid integer value %r in filter part %r' % (
1749 m.group('intval'), filter_part))
1750 actual_value = dct.get(m.group('key'))
1751 if actual_value is None:
1752 return m.group('none_inclusive')
1753 return op(actual_value, comparison_value)
1754
1755 UNARY_OPERATORS = {
1756 '': lambda v: v is not None,
1757 '!': lambda v: v is None,
1758 }
1759 operator_rex = re.compile(r'''(?x)\s*
1760 (?P<op>%s)\s*(?P<key>[a-z_]+)
1761 \s*$
1762 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1763 m = operator_rex.search(filter_part)
1764 if m:
1765 op = UNARY_OPERATORS[m.group('op')]
1766 actual_value = dct.get(m.group('key'))
1767 return op(actual_value)
1768
1769 raise ValueError('Invalid filter part %r' % filter_part)
1770
1771
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    filter_str consists of parts separated by '&' that must all match. """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1777
1778
def match_filter_func(filter_str):
    """ Build a function that checks an info dict against filter_str.

    The returned function yields None if the dict passes the filter and a
    message explaining why it is skipped otherwise. """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        fallback_title = info_dict.get('id', 'video')
        video_title = info_dict.get('title', fallback_title)
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func