]> Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
2864e51428e69591ba9592869de7f4bbc87072f9
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import calendar
5 import codecs
6 import contextlib
7 import ctypes
8 import datetime
9 import email.utils
10 import errno
11 import getpass
12 import gzip
13 import itertools
14 import io
15 import json
16 import locale
17 import math
18 import os
19 import pipes
20 import platform
21 import re
22 import ssl
23 import socket
24 import struct
25 import subprocess
26 import sys
27 import tempfile
28 import traceback
29 import xml.etree.ElementTree
30 import zlib
31
32 try:
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
36
37 try:
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
41
42 try:
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
46
47 try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
52 try:
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
57 try:
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
61
62 try:
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
66
67 try:
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
71
72 try:
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
76
77 try:
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
82 try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Open a new writable handle on the null device.

        Used when subprocess.DEVNULL cannot be imported.
        """
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL marker."""
        return DEVNULL
93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    # Fallback used when urllib.parse is unavailable: a local unquote that
    # accepts the encoding/errors arguments.
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string with the characters they encode.

        Runs of consecutive percent-escapes are collected as bytes and
        decoded together with the given encoding and error policy, so that
        multi-byte sequences are decoded as a unit. A '%' that is not
        followed by a valid escape is kept literally. None for encoding or
        errors selects 'utf-8' / 'replace'.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # NOTE(review): on Python 2 the 'hex' codec may raise TypeError
                # (not ValueError) for malformed digits - confirm that the
                # except clause below actually covers that case.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: put the '%' back and treat as plain text
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
131
132
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. '+' is turned into a space and
        percent-escapes are decoded via compat_urllib_parse_unquote. Fields
        with an empty value are dropped unless keep_blank_values is set;
        with strict_parsing, a field without '=' raises ValueError.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = compat_urllib_parse_unquote(
                    name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = compat_urllib_parse_unquote(
                    value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Values of repeated names are appended in the order they appear.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
179
# Text type: `unicode` where that name exists, `str` otherwise.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Exception type to catch for XML parse failures.
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError: # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error

try:
    from shlex import quote as shlex_quote
except ImportError: # Python < 3.3
    def shlex_quote(s):
        """Wrap s in single quotes, rewriting each embedded ' as '"'"'."""
        return "'" + s.replace("'", "'\"'\"'") + "'"
200
201
def compat_ord(c):
    """Return c unchanged if it is exactly an int, otherwise ord(c).

    Lets callers handle items of a byte string uniformly whether indexing
    yields ints or one-character strings.
    """
    return c if type(c) is int else ord(c)
205
206
if sys.version_info >= (3, 0):
    compat_getenv = os.getenv
    compat_expanduser = os.path.expanduser
else:
    # Environment variables should be decoded with filesystem encoding.
    # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918)

    def compat_getenv(key, default=None):
        """Return environment variable key decoded with the filesystem encoding.

        Falsy values (unset with a falsy default, or an empty string) are
        returned as-is without decoding.
        """
        env = os.getenv(key, default)
        if env:
            env = env.decode(get_filesystem_encoding())
        return env

    # HACK: The default implementations of os.path.expanduser from cpython do not decode
    # environment variables with filesystem encoding. We will work around this by
    # providing adjusted implementations.
    # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib
    # for different platforms with correct environment variables decoding.

    if os.name == 'posix':
        def compat_expanduser(path):
            """Expand ~ and ~user constructions.  If user or $HOME is unknown,
            do nothing."""
            if not path.startswith('~'):
                return path
            # i marks the end of the "~" / "~user" prefix
            i = path.find('/', 1)
            if i < 0:
                i = len(path)
            if i == 1:
                # Plain "~": current user's home
                if 'HOME' not in os.environ:
                    import pwd
                    userhome = pwd.getpwuid(os.getuid()).pw_dir
                else:
                    userhome = compat_getenv('HOME')
            else:
                # "~user": look the user up in the password database
                import pwd
                try:
                    pwent = pwd.getpwnam(path[1:i])
                except KeyError:
                    return path
                userhome = pwent.pw_dir
            userhome = userhome.rstrip('/')
            return (userhome + path[i:]) or '/'
    elif os.name == 'nt' or os.name == 'ce':
        def compat_expanduser(path):
            """Expand ~ and ~user constructs.

            If user or $HOME is unknown, do nothing."""
            if path[:1] != '~':
                return path
            # i marks the end of the "~" / "~user" prefix
            i, n = 1, len(path)
            while i < n and path[i] not in '/\\':
                i = i + 1

            if 'HOME' in os.environ:
                userhome = compat_getenv('HOME')
            elif 'USERPROFILE' in os.environ:
                userhome = compat_getenv('USERPROFILE')
            elif 'HOMEPATH' not in os.environ:
                return path
            else:
                # compat_getenv never raises KeyError (it returns the default),
                # so request '' explicitly when HOMEDRIVE is unset instead of
                # passing None on to os.path.join.
                drive = compat_getenv('HOMEDRIVE', '')
                userhome = os.path.join(drive, compat_getenv('HOMEPATH'))

            if i != 1:  # ~user
                userhome = os.path.join(os.path.dirname(userhome), path[1:i])

            return userhome + path[i:]
    else:
        compat_expanduser = os.path.expanduser
280
281
# The type of compiled regular expression objects, obtained from an actual
# compiled pattern because it is not clearly defined otherwise.
compiled_regex_type = type(re.compile(''))

# Default request headers; YoutubeDLHandler.http_request adds each of these
# to a request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
292
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: the reported name may be unknown to Python
        u'TEST'.encode(pref)
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must propagate
        pref = 'UTF-8'

    return pref
306
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are emitted as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
314
315
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically """

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        stream_args = {'mode': 'wb'}
    else:
        stream_args = {'mode': 'w', 'encoding': 'utf-8'}

    # The temporary file is created next to the target so that the final
    # rename does not cross directories.
    tmp = tempfile.NamedTemporaryFile(
        suffix='.tmp',
        prefix=os.path.basename(fn) + '.',
        dir=os.path.dirname(fn),
        delete=False,
        **stream_args)

    try:
        with tmp:
            json.dump(obj, tmp)
        os.rename(tmp.name, fn)
    except:  # cleanup on any failure, then re-raise unchanged
        try:
            os.remove(tmp.name)
        except OSError:
            pass
        raise
348
349
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are spliced into the expression, so restrict them
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
368
369 # On python2.6 the xml.etree.ElementTree.Element methods don't support
370 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI. Steps without a prefix
    are kept unchanged.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
381
382
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath under node.

    If nothing matches, return None, or raise ExtractorError when fatal
    is set. The error message uses name if given, else the xpath itself.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
395
396
# Replace the parser module's start-tag locating regex at import time.
# Note this modifies the shared module attribute, not just this file's parsers.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was fed.

    The source text is kept in self.html (None until loads() is called).
    """
    def __init__(self):
        # The method must be spelled __init__ to run as the constructor;
        # otherwise self.html does not exist until loads() is called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html and parse it completely (feed followed by close)."""
        self.html = html
        self.feed(html)
        self.close()
407
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result grows to [tag, start_pos, end_pos]; positions are the
        # (line, offset) tuples returned by getpos()
        self.result = None
        # True while inside the matched element
        self.started = False
        # Open-tag counter per tag name, maintained while started
        self.depth = {}
        # True between the matching start tag and the next parser event
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Skip past a parse error by restarting on the following lines.

        Gives up (raises HTMLParseError) after more than 10 errors, or if
        the error occurs inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is also the first event after the match
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element ends when the count for its own tag name hits zero
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event also records the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped source text between the recorded start and end
        positions, or None unless both positions were recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() line numbers are 1-based
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end offset has to be
            # shifted by what was just cut from the front
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    # Step over the literal "</scr'+'ipt>" instead of parsing it as an end tag
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
473
def get_element_by_id(id, html):
    """Return the content of the element whose id attribute equals id, or None."""
    return get_element_by_attribute(attribute="id", value=id, html=html)
477
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in html whose attribute equals value.

    Parse errors are tolerated; whatever the parser collected before the
    error is used. Returns None if no complete match was found.
    """
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through with the partial parse state
        pass
    return finder.get_result()
486
class MetaParser(BaseHTMLParser):
    """
    HTMLParser that picks out the content attribute of the meta tag
    whose name attribute equals the requested name.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching tag overwrites an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content value found, or None."""
        return self.result
507
def get_meta_content(name, html):
    """
    Return the content attribute of the meta tag in html whose name
    attribute equals name, or None. Parse errors are tolerated.
    """
    meta_finder = MetaParser(name)
    try:
        meta_finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was found before the error
        pass
    return meta_finder.get_result()
518
519
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning; <br> and paragraph breaks do
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip all remaining html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
531
532
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename u'-' selects standard output (its binary buffer if there
    is one).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem will not be cured by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join takes the components as separate arguments, so the
        # sanitized parts have to be unpacked.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name - the one reported back to the caller.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
566
567
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None if the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
575
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            # Shell-special characters, whitespace and non-ASCII
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(ch) for ch in s)
    if is_id:
        # IDs are kept as close to the input as possible
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
607
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, keeping first-seen order """
    unique = []
    for item in iterable:
        # List membership (not a set) so unhashable items keep working
        if item in unique:
            continue
        unique.append(item)
    return unique
615
616
617 def _htmlentity_transform(entity):
618 """Transforms an HTML entity to a character."""
619 # Known non-numeric HTML entity
620 if entity in compat_html_entities.name2codepoint:
621 return compat_chr(compat_html_entities.name2codepoint[entity])
622
623 mobj = re.match(r'#(x?[0-9]+)', entity)
624 if mobj is not None:
625 numstr = mobj.group(1)
626 if numstr.startswith(u'x'):
627 base = 16
628 numstr = u'0%s' % numstr
629 else:
630 base = 10
631 return compat_chr(int(numstr, base))
632
633 # Unknown entity in name, return its literal representation
634 return (u'&%s;' % entity)
635
636
def unescapeHTML(s):
    """Replace every '&...;' sequence in s using _htmlentity_transform.

    None is passed through; any other input must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', decode_entity, s)
644
645
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form expected by file system (or subprocess) APIs.

    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up: u'' can be passed directly to use Unicode APIs.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
672
673
def encodeArgument(s):
    """Encode a command-line argument for a subprocess call.

    Byte strings are still accepted and decoded as ASCII first.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # TODO: once all post processors are fixed, fail here instead with
        # 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
681
682
def decodeOption(optval):
    """Return an option value as text.

    None is passed through; bytes are decoded with the preferred encoding.
    """
    if optval is None:
        return None

    if isinstance(optval, bytes):
        decoded = optval.decode(preferredencoding())
    else:
        decoded = optval

    assert isinstance(decoded, compat_str)
    return decoded
691
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The largest unit shown is the largest one that is non-zero; exactly
    one hour / one minute roll over into the next unit ('1:00:00', '1:00').
    """
    # >= so that the boundary values are not rendered as '60:00' / '60'
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
699
700
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler appropriate for this Python version.

    opts_no_check_certificate disables certificate verification where the
    interpreter supports verification at all. Remaining keyword arguments
    are passed on to the handler constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Prefer TLSv1, fall back to protocol negotiation
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # We are the client authenticating a server, hence SERVER_AUTH.
        # (CLIENT_AUTH builds a server-side context that does not verify
        # the peer certificate.)
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        # NOTE(security): deliberately kept from the original behaviour.
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname must be off before verification can be disabled
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
740
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        video_id, if given, is prefixed to the message; cause is appended to it.
        """
        # Errors raised while one of these exceptions is being handled are
        # always treated as expected.
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += u' (caused by %r)' % cause
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None."""
        if self.traceback is None:
            return None
        formatted = traceback.format_tb(self.traceback)
        return u''.join(formatted)
767
768
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
772
773
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this when they are not configured to
    continue on errors. The exception carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
785
786
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple
    files would have to be downloaded to the same file on disk.
    """
794
795
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this to signal an error
    in the postprocessing task. The message is available as .msg.
    """
    def __init__(self, msg):
        setattr(self, 'msg', msg)
804
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
808
809
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not
    available for that video.
    """
817
818
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this when a downloaded file is
    smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """
    # Byte counts: what was received / what was announced
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
833
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, falling back to zlib framing."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the given status code."""
        # Pass the code to the constructor where addinfourl has getcode();
        # otherwise set it as an attribute afterwards.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers and apply the Youtubedl-* control headers."""
        for h, v in std_headers.items():
            if h not in req.headers:
                req.add_header(h, v)
        # Youtubedl-no-compression: drop Accept-encoding and the marker itself
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Youtubedl-user-agent: its value replaces the User-agent header
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body transparently decoded."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the first failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same treatment
    https_request = http_request
    https_response = http_response
919
920
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    delimiter is the character between the date and the time part.
    A trailing 'Z' or +HH:MM / -HHMM offset (optionally preceded by
    fractional seconds) is recognised; without one the time is taken as
    UTC. None is passed through.
    """
    if date_str is None:
        return None

    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    utc_offset = datetime.timedelta()
    if tz_match:
        # Cut the matched suffix off before handing the rest to strptime
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            utc_offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - utc_offset
    return calendar.timegm(utc_time.timetuple())
944
945
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    A number of common date formats are tried in turn; as a last resort
    the string is parsed as an RFC 2822 date. Returns None if date_str is
    None or nothing could be parsed.
    """

    if date_str is None:
        return None

    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        # Ordinal day suffixes: 1st, 2nd, 3rd, 4th ...
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %drd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Every format is tried; if several match, the last one wins
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
993
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The guess is whatever follows the last '.' before the query string;
    it is accepted only if purely alphanumeric, else default_ext is
    returned (also for a None url).
    """
    if url is None:
        return default_ext
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
1002
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
1005
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if the string matches neither format.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    # Raw string: '\d' in a normal literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta knows neither months nor years: approximate in days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
1031
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1040
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a bound left as None is replaced by the smallest or
        largest representable date respectively.

        Raises ValueError when start lies after end."""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range (both bounds inclusive).

        Anything that is not a datetime.date is parsed with date_from_str."""
        when = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= when <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1066
1067
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte string is decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1076
1077
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The string is only written here when out is file descriptor 1 or 2 and
    the matching standard handle is a console; it is then sent through
    WriteConsoleW.  Raises OSError when WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> id passed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    console_handle = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle is None or handle == INVALID_HANDLE_VALUE:
            return True
        if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
            return True
        # GetConsoleMode returning 0 means the handle is no console
        mode = ctypes.wintypes.DWORD()
        return GetConsoleMode(handle, ctypes.byref(mode)) == 0

    if not_a_console(console_handle):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none
        for idx, ch in enumerate(text):
            if ord(ch) > 0xffff:
                return idx
        return len(text)

    while s:
        # Write at most 1024 characters, stopping before a non-BMP character
        count = min(next_nonbmp_pos(s), 1024)
        units = count if count else 2

        ret = WriteConsoleW(
            console_handle, s, units, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if count:
            assert written.value > 0
            s = s[written.value:]
        else:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
    return True
1148
1149
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no encoding is forced and out has a fileno, the console
    writer is tried first.  Otherwise s is encoded (dropping unencodable
    characters) for binary streams and on Python 2, encoded and written to
    out.buffer when that exists, and written as text as a last resort."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None
        and hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        data = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(data)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        data = s.encode(enc, 'ignore')
        out.buffer.write(data)
    else:
        out.write(s)
    out.flush()
1170
1171
def bytes_to_intlist(bs):
    """Return the items of the byte sequence bs as a list of ints."""
    if not bs:
        return []
    first = bs[0]
    if isinstance(first, int):  # Python 3
        return list(bs)
    return list(map(ord, bs))
1179
1180
def intlist_to_bytes(xs):
    """Return the byte string made of the integers in xs."""
    if not xs:
        return b''
    chr_gives_bytes = isinstance(chr(0), bytes)  # Python 2
    if not chr_gives_bytes:
        return bytes(xs)
    return ''.join(map(chr, xs))
1188
1189
# Cross-platform file locking: _lock_file/_unlock_file are built on
# LockFileEx/UnlockFileEx on Windows and on fcntl.flock everywhere else
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.restype = ctypes.wintypes.BOOL
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f starting at offset 0; raises OSError on failure."""
        ov = OVERLAPPED()
        ov.Offset = 0
        ov.OffsetHigh = 0
        ov.hEvent = 0
        # Kept on the file object so that _unlock_file can reuse it
        f._lock_file_overlapped_p = ctypes.pointer(ov)
        os_handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            os_handle, flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        os_handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            os_handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock on f."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        """Drop the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1253
1254
class locked_file(object):
    """File wrapper that holds a lock on the file inside a with block.

    The file is opened in __init__; entering the context locks it (shared
    for mode 'r', exclusive for 'a' and 'w') and leaving the context unlocks
    and closes it."""

    def __init__(self, filename, mode, encoding=None):
        """Open filename with io.open; mode must be 'r', 'a' or 'w'."""
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which is not an IOError
            # on Python 2; close the file in both cases before re-raising
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even when unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1284
1285
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1289
1290
def shell_quote(args):
    """Return args as one space-separated, shell-quoted command line.

    Byte-string arguments are decoded with the filesystem encoding before
    being quoted."""
    # pipes.quote is shlex.quote on Python 3 and the pipes module is gone in
    # Python 3.13, so prefer shlex and keep pipes for Python 2 only
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = None
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            if encoding is None:
                encoding = get_filesystem_encoding()
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1300
1301
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yields the element that ends the
    run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1309
1310
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded, URL-encoded under the key __youtubedl_smuggle and
    appended to url after a '#'. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + u'#' + sdata
1317
1318
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    When smug_url carries no '#__youtubedl_smuggle' part it is returned
    unchanged together with default."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1326
1327
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. u'1.50KiB'.

    None yields u'N/A'; a str is converted with float() first.  The unit is
    clamped to the range B..YiB, so tiny values are reported in B and huge
    ones in YiB.  Negative values raise ValueError (from math.log)."""
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Without clamping, values below 1/1024 pick suffixes[-1] and values
        # of 1024**9 or more run off the end of the list
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1340
1341
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    A non-empty COLUMNS environment variable wins; otherwise the second
    field printed by `stty size` is used."""
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, not a terminal, odd output); unlike
        # a bare except this lets KeyboardInterrupt and SystemExit through
        return None
1356
1357
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale; None if name is not a month name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1368
1369
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start one of the entities covered by
    the pattern below (amp, lt, gt, apos, quot or a numeric reference of up
    to four digits) by '&amp;'"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1376
1377
def setproctitle(title):
    """Set the process name through libc's prctl, when that is available.

    Does nothing if libc.so.6 cannot be loaded or has no prctl symbol."""
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes allocates len + 1 bytes, so the
    # name handed to prctl is NUL-terminated; a buffer of exactly len bytes
    # has no terminator
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1391
1392
def remove_start(s, start):
    """Return s without the prefix start; s itself if it has no such prefix."""
    return s[len(start):] if s.startswith(start) else s
1397
1398
def remove_end(s, end):
    """Return s without the suffix end; s itself if it has no such suffix."""
    # An empty suffix must be skipped: s[:-0] is s[:0], i.e. an empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1403
1404
def url_basename(url):
    """Return the last segment of the path of url, ignoring leading and
    trailing slashes."""
    trimmed = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed.rpartition(u'/')[2]
1408
1409
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return "HEAD"
1413
1414
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default when there is no value.

    With get_attr set, the value is read from that attribute of v (a missing
    attribute counts as None).  None and the empty string are "no value"."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1422
1423
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1426
1427
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are removed before converting; None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
1434
1435
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1438
1439
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3h11m53s' or '49 seconds'.

    Returns the number of seconds (a float only when the seconds carry a
    fractional part), or None when s is None or cannot be parsed."""
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    hours, mins, secs, ms = m.group('hours', 'mins', 'secs', 'ms')
    total = int(secs)
    if mins:
        total += int(mins) * 60
    if hours:
        total += int(hours) * 60 * 60
    if ms:
        # ms keeps its leading dot, so this adds the fractional seconds
        total += float(ms)
    return total
1458
1459
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename
    ('abc.ext' with 'temp' gives 'abc.temp.ext')."""
    stem, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, real_ext)
1463
1464
def check_executable(exe, args=()):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a sequence of arguments for a short output (like -version).

    Returns False when the binary cannot be started (OSError). """
    # Immutable default instead of a shared list; list() accepts any sequence
    command = [exe] + list(args)
    try:
        subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1473
1474
class PagedList(object):
    """Base class of the paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1479
1480
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) for each page it needs and
    stops at the first page that is not full."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list."""
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start inside this page (0 on every later page)
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize
            else:
                startv = 0

            # Cut-off inside this page when end falls on it
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
1522
1523
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list."""
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the front of the first page
        to_skip = start - first_page * self._pagesize

        collected = []
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1551
1552
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape (eight hex digits) in s by the
    character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1559
1560
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are passed through unquoted
    safe_chars = "%/;:@&=+$,!~*'()?#[]"
    if sys.version_info < (3, 0):
        # On Python 2 unicode strings are UTF-8 encoded before quoting
        if isinstance(s, unicode):
            s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
1566
1567
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The path, params, query and fragment components are each passed through
    escape_rfc3986; the other components are left as parsed."""
    parts = compat_urllib_parse_urlparse(url)
    escaped = dict(
        (field, escape_rfc3986(getattr(parts, field)))
        for field in ('path', 'params', 'query', 'fragment'))
    return parts._replace(**escaped).geturl()
1577
# struct_pack/struct_unpack: struct.pack/struct.unpack that also accept a
# text format specification on interpreters where struct rejects one
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # Text specifications work natively; use the struct functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1594
1595
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file; batch_fd is closed afterwards.

    Each line is decoded as UTF-8 if it is a byte string, freed from a
    leading byte order mark and stripped.  Empty lines and lines starting
    with '#', ';' or ']' are dropped.  Returns the list of remaining URLs."""
    # A UTF-8 BOM reads as U+FEFF once decoded as UTF-8 (which is what the
    # decoding below produces) and as the three characters \xef\xbb\xbf when
    # the file was decoded with a single-byte codec
    BOMS = (u'\ufeff', u'\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1610
1611
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments with urlencode and return the result as
    ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1614
1615
# etree_iter(node): Element.iter where the interpreter provides it
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1620
1621
def parse_xml(s):
    """Parse the XML document in the text s and return its root element.

    Doctype declarations are ignored.  On Python 2, element texts that come
    back as byte strings are decoded as UTF-8."""
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    kwargs = {}
    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
1637
1638
# compat_getpass: getpass.getpass, except on Python 2 under Windows where a
# text prompt is first encoded with the preferred encoding
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1646
1647
# Age limit that parse_age_limit returns for each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1655
1656
def parse_age_limit(s):
    """Turn an age limit such as '18' or '16+', or a US rating label, into
    a number.  Returns None for None and for anything unrecognised."""
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1662
1663
def strip_jsonp(code):
    """Strip a JSONP wrapper: 'callback(<payload>);' becomes '<payload>'.
    Text that does not look like a JSONP call is returned unchanged."""
    wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(wrapper, r'\1', code)
1666
1667
def js_to_json(code):
    """Convert a JavaScript object literal to JSON text.

    Bare identifiers and single-quoted strings are turned into double-quoted
    strings (true, false and null are kept as they are) and a comma directly
    before a closing ']' is removed."""
    # Rewrites applied inside a single-quoted string
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda e: requote[e.group(0)], token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', r'\1', converted)
1691
1692
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 when the id is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1701
1702
# %-style template with the named fields title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1704
# subprocess_check_output: subprocess.check_output, with a small stand-in
# for interpreters whose subprocess module lacks it
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return what it wrote to stdout.

        Raises subprocess.CalledProcessError (with the captured output in
        its output attribute) when the command exits with a non-zero code."""
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Interpreters without check_output have neither Popen.args nor
            # an output parameter on CalledProcessError, so recover the
            # command from the call and attach the output by hand
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output
1716
1717
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that, with '...' appended, they
    are length characters long.  None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1726
1727
def version_tuple(v):
    """Split a dotted version string into a list of ints.

    Raises ValueError if a component is not an integer."""
    return list(map(int, v.split('.')))
1730
1731
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or one of the two cannot be parsed, the answer is
    the opposite of assume_new."""
    unknown = not assume_new
    if not version:
        return unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return unknown