]> Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
Merge tag 'upstream/2014.01.17.2'
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import ctypes
5 import datetime
6 import email.utils
7 import errno
8 import gzip
9 import io
10 import json
11 import locale
12 import math
13 import os
14 import pipes
15 import platform
16 import re
17 import ssl
18 import socket
19 import subprocess
20 import sys
21 import traceback
22 import zlib
23
24 try:
25 import urllib.request as compat_urllib_request
26 except ImportError: # Python 2
27 import urllib2 as compat_urllib_request
28
29 try:
30 import urllib.error as compat_urllib_error
31 except ImportError: # Python 2
32 import urllib2 as compat_urllib_error
33
34 try:
35 import urllib.parse as compat_urllib_parse
36 except ImportError: # Python 2
37 import urllib as compat_urllib_parse
38
39 try:
40 from urllib.parse import urlparse as compat_urllib_parse_urlparse
41 except ImportError: # Python 2
42 from urlparse import urlparse as compat_urllib_parse_urlparse
43
44 try:
45 import urllib.parse as compat_urlparse
46 except ImportError: # Python 2
47 import urlparse as compat_urlparse
48
49 try:
50 import http.cookiejar as compat_cookiejar
51 except ImportError: # Python 2
52 import cookielib as compat_cookiejar
53
54 try:
55 import html.entities as compat_html_entities
56 except ImportError: # Python 2
57 import htmlentitydefs as compat_html_entities
58
59 try:
60 import html.parser as compat_html_parser
61 except ImportError: # Python 2
62 import HTMLParser as compat_html_parser
63
64 try:
65 import http.client as compat_http_client
66 except ImportError: # Python 2
67 import httplib as compat_http_client
68
69 try:
70 from urllib.error import HTTPError as compat_HTTPError
71 except ImportError: # Python 2
72 from urllib2 import HTTPError as compat_HTTPError
73
74 try:
75 from urllib.request import urlretrieve as compat_urlretrieve
76 except ImportError: # Python 2
77 from urllib import urlretrieve as compat_urlretrieve
78
79
# subprocess.DEVNULL is not importable on every supported interpreter; when it
# is missing, hand out a freshly opened handle on the null device instead.
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a writable file object opened on os.path.devnull."""
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL sentinel."""
        return DEVNULL
85
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in `string` with the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with `encoding`/`errors`, so multi-byte characters survive.
        """
        if string == '':
            return string
        chunks = string.split('%')
        if len(chunks) == 1:
            # No percent sign at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pending: contiguous run of percent-encoded bytes not yet decoded
        pending = b''
        string = chunks[0]
        for chunk in chunks[1:]:
            try:
                if not chunk:
                    raise ValueError
                pending += chunk[:2].decode('hex')
                tail = chunk[2:]
                if not tail:
                    # Only an escape in this chunk; it may continue a
                    # multi-byte sequence, so keep collecting.
                    continue
            except ValueError:
                tail = '%' + chunk
            # Literal text follows: flush the collected bytes first
            string += pending.decode(encoding, errors) + tail
            pending = b''
        if pending:
            # Flush whatever is left at the end of the input
            string += pending.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs."""
        def _decode(part):
            unquoted = _unquote(
                part.replace('+', ' '), encoding=encoding, errors=errors)
            return unicode(unquoted)

        fields = [piece
                  for amp_part in qs.split('&')
                  for piece in amp_part.split(';')]
        result = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            nv = field.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name without an equal sign
                if not keep_blank_values:
                    continue
                nv.append('')
            if len(nv[1]) or keep_blank_values:
                name = _decode(nv[0])
                value = _decode(nv[1])
                result.append((name, value))
        return result

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            parsed_result.setdefault(name, []).append(value)
        return parsed_result
164
165 try:
166 compat_str = unicode # Python 2
167 except NameError:
168 compat_str = str
169
170 try:
171 compat_chr = unichr # Python 2
172 except NameError:
173 compat_chr = chr
174
def compat_ord(c):
    """Return `c` unchanged if it is exactly an int, otherwise ord(c).

    Indexing a bytes object yields ints on Python 3 and one-character
    strings on Python 2; this helper accepts either.
    """
    return c if type(c) is int else ord(c)
178
179 # This is not clearly defined otherwise
180 compiled_regex_type = type(re.compile(''))
181
182 std_headers = {
183 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
184 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
185 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
186 'Accept-Encoding': 'gzip, deflate',
187 'Accept-Language': 'en-us,en;q=0.5',
188 }
189
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    cannot be queried, or names a codec that cannot encode a test string,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale.Error, LookupError, TypeError, ...), but
        # no longer a bare except: KeyboardInterrupt/SystemExit propagate.
        pref = 'UTF-8'

    return pref
203
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string `s` (Python 3 variant)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print `s` encoded with the preferred encoding (Python 2 variant).

        Characters the encoding cannot represent are emitted as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
211
# json.dump needs a byte stream on Python 2.x but a character stream on
# Python 3.x, so the file is opened differently per major version.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize `obj` as JSON into the file named `fn` (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as stream:
            json.dump(obj, stream)
else:
    def write_json_file(obj, fn):
        """Serialize `obj` as JSON into the file named `fn` (binary mode)."""
        with open(fn, 'wb') as stream:
            json.dump(obj, stream)
222
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching `xpath` whose `key` attribute equals `val`.

        Filters the findall() results by hand; returns None when nothing
        matches.
        """
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are spliced into the expression, so restrict them to
        # characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
236
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of `path` into '{uri}tag' form.

    `ns_map` maps each prefix to its namespace URI. Steps without a prefix
    are kept unchanged.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        ns, tag = parts
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join([expand(step) for step in path.split('/')])
249
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric reference. A hexadecimal reference must accept the digits a-f
    # as well; matching only \d after the 'x' would cut '#x2F' down to
    # '#x2' and decode the wrong character.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
274
275 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps the document it was fed in `self.html`."""

    # The constructor was spelled `__init`, which Python never calls, so
    # `self.html` stayed undefined until loads() ran.
    def __init__(self):
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store `html`, feed it to the parser in one go and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
285
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_pos, end_pos] once the element has been seen
        self.result = None
        self.started = False
        # Open-tag counter per tag name while inside the wanted element
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Skip past a parse error, unless too many occurred or matching has started."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        remaining_lines = self.html.split('\n')[current_line:]  # skip one line
        self.rawdata = '\n'.join(remaining_lines)
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The wanted element is closed once its own tag count is back to zero
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any event following the opening tag can mark the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the recorded positions, or None if incomplete."""
        if self.result is None or len(self.result) != 3:
            return None
        start_pos = self.result[1]
        end_pos = self.result[2]
        lines = self.html.split('\n')
        lines = lines[start_pos[0] - 1:end_pos[0]]
        lines[0] = lines[0][start_pos[1]:]
        if len(lines) == 1:
            # Same line: the end column must be shifted by the removed prefix
            lines[-1] = lines[-1][:end_pos[1] - start_pos[1]]
        lines[-1] = lines[-1][:end_pos[1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _parse_endtag_workaround(self, i):
        """Step over a literal "</scr'+'ipt>" instead of parsing it as an end tag."""
        marker = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(marker):
            return i + len(marker)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _parse_endtag_workaround
    del _parse_endtag_workaround
351
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals `id` in `html`.

    Thin wrapper around get_element_by_attribute().
    """
    return get_element_by_attribute("id", id, html)
355
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the parse error
        pass
    return attr_parser.get_result()
364
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        """Record the "content" attribute of a <meta> tag with the wanted name."""
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                self.result = attributes.get('content')

    def get_result(self):
        """Return the most recently recorded content value, or None."""
        return self.result
385
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was found before the parse error
        pass
    return meta_parser.get_result()
396
397
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning; only <br> and paragraph breaks do
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),          # <br> / <br />
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),    # </p><p ...>
        ('<.*?>', ''),                             # any remaining tag
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
409
410
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename u'-' selects standard output (its binary buffer when one
    exists).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join needs the components as separate arguments; passing
        # the generator itself raised TypeError and hid the original error.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name, not the one that has just failed.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
444
445
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
453
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
485
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of each element, in input
    order. Elements are compared with ==, so they need not be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
493
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in `s` using htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
502
503
def encodeFilename(s, for_subprocess=False):
    """
    Return `s` in the form expected by the file APIs of this interpreter.

    @param s The name of the file
    @param for_subprocess Set when the name is passed to a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
530
531
def decodeOption(optval):
    """Return the option value `optval` as text.

    None is passed through; bytes are decoded with the preferred encoding.
    """
    if optval is None:
        return None
    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
540
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The hour form is used only above 3600 and the minute form only above
    60; the boundary values themselves fall into the next smaller form.
    """
    if secs <= 60:
        return '%d' % secs
    if secs <= 3600:
        return '%d:%02d' % (secs // 60, secs % 60)
    hours = secs // 3600
    minutes = (secs % 3600) // 60
    return '%d:%02d:%02d' % (hours, minutes, secs % 60)
548
549
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the HTTPS handler used for all HTTPS requests.

    opts_no_check_certificate -- if true, server certificates are not
                                 verified (Python >= 3.2 branch only)
    kwargs                    -- passed through to the handler constructor
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try SSLv3 first and fall back to protocol negotiation
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        # Negotiate the best protocol both sides support instead of pinning
        # SSLv3, which is insecure and missing from current ssl builds (its
        # constant raises AttributeError there). PROTOCOL_SSLv23 is the old
        # name of the negotiating method, PROTOCOL_TLS the newer one.
        protocol = getattr(ssl, 'PROTOCOL_TLS', None)
        if protocol is None:
            protocol = ssl.PROTOCOL_SSLv23
        context = ssl.SSLContext(protocol)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
583
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Errors raised while one of these exceptions is being handled are
        # always treated as expected.
        expected_causes = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_causes:
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None."""
        if self.traceback is not None:
            return u''.join(traceback.format_tb(self.traceback))
        return None
605
606
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
610
611
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
623
624
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
632
633
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is kept on the instance as `msg`
        self.msg = msg
642
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
646
647
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
655
656
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both sizes are in bytes; the class-level None is the default
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
671
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress `data` as a raw deflate stream, else as a zlib stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting `code` by hand if needed."""
        response_cls = compat_urllib_request.addinfourl
        if hasattr(response_cls, 'getcode'):
            return response_cls(stream, headers, url, code)
        # This addinfourl takes no status code argument
        response = response_cls(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        """Apply the standard headers and resolve the Youtubedl-* pseudo headers."""
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return `resp` with a gzip or deflate encoded body decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off.
                for cut in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-cut]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same treatment
    https_request = http_request
    https_response = http_response
752
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Each known format is tried in turn; if none applies, the string is
    parsed as an RFC 2822 date. Returns None when nothing matches.
    """
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Format does not apply; only this error is expected, so other
            # exceptions (and KeyboardInterrupt) are no longer swallowed.
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
784
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from `url`.

    Takes the text after the last '.' that precedes any '?'. If that text is
    not purely alphanumeric, `default_ext` is returned.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
791
def subtitles_filename(filename, sub_lang, sub_format):
    """Return `filename` with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
794
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days. Any other string
    is parsed as YYYYMMDD; datetime.strptime raises ValueError if it does
    not have that form.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a normal literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
820
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
829
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing bound defaults to the earliest or latest representable
        date. ValueError is raised if start lies after end.
        """
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date
        if not isinstance(candidate, datetime.date):
            # Strings are converted first
            candidate = date_from_str(candidate)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
855
856
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # A byte string is decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
865
866
def write_string(s, out=None):
    """Write the text `s` to `out` (sys.stderr by default) and flush it."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    binary_stream = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_stream or sys.version_info[0] < 3:
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if not (sys.platform == 'win32' and hasattr(out, 'encoding')):
            raise
        # Drop what the stream's own codec cannot represent and retry
        s = s.encode(out.encoding, 'ignore').decode(out.encoding)
        out.write(s)

    out.flush()
887
888
def bytes_to_intlist(bs):
    """Return the byte values of `bs` as a list of ints."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # Indexing gives one-character strings here
    return [ord(char) for char in bs]
896
897
def intlist_to_bytes(xs):
    """Return a bytes object built from the list of byte values `xs`."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already yields byte strings
    return ''.join([chr(x) for x in xs])
905
906
def get_cachedir(params={}):
    """Return the cache directory.

    params['cachedir'] wins if present; otherwise 'youtube-dl' below
    $XDG_CACHE_HOME, or below ~/.cache when that variable is unset.
    """
    home_cache = os.path.expanduser('~/.cache')
    cache_root = os.environ.get('XDG_CACHE_HOME', home_cache)
    default_dir = os.path.join(cache_root, 'youtube-dl')
    return params.get('cachedir', default_dir)
911
912
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx/UnlockFileEx below
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low and high halves of the byte count passed to the lock calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file `f`; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(handle, flags, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(handle, 0,
                                whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared fcntl lock on the open file `f`."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.lockf(f, operation)

    def _unlock_file(f):
        """Release the fcntl lock held on the open file `f`."""
        fcntl.lockf(f, fcntl.LOCK_UN)
976
977
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened in the constructor, locked on entry (exclusively
    unless the mode is 'r'), and unlocked and closed on exit.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leak the open handle when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Delegate to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Delegate to the underlying file's read()."""
        return self.f.read(*args)
1007
1008
def shell_quote(args):
    """Return `args` joined into one shell-quoted command line string.

    Byte-string arguments are decoded with the file system encoding (UTF-8
    if that is unknown) before being quoted.
    """
    # `pipes` no longer exists on Python 3.13+; shlex.quote is the same
    # function under its documented name. Fall back for Python 2.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    quoted_args = []
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1020
1021
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for element in seq:
        yield element
        if pred(element):
            continue
        # First failing element has been emitted: stop here
        break
1029
1030
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is JSON-encoded and appended to `url` after a '#', as the
    urlencoded field '__youtubedl_smuggle'.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    encoded = compat_urllib_parse.urlencode(payload)
    return url + u'#' + encoded
1037
1038
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs that carry no smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    fields = compat_parse_qs(fragment)
    data = json.loads(fields[u'__youtubedl_smuggle'][0])
    return url, data
1046
1047
def format_bytes(bytes):
    """Format a byte count with a binary suffix, e.g. u'1.50KiB'.

    None gives u'N/A'; a str argument is converted with float() first.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log() is undefined for zero, which belongs to the plain 'B' bucket
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1060
1061
def str_to_int(int_str):
    """Parse an integer from a string, ignoring ',' and '.' separators."""
    digits_only = re.sub(r'[,\.]', u'', int_str)
    return int(digits_only)
1065
1066
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable is used when it holds a number;
    otherwise the second field printed by `stty size` is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Unusable COLUMNS value: ask stty instead of crashing
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except (OSError, ValueError, IndexError):
        # stty is missing or printed nothing usable. Only these failures
        # are expected; KeyboardInterrupt is no longer swallowed.
        pass
    return None
1081
1082
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None if `name` is not one of the full English month names.
    """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1093
1094
def fix_xml_all_ampersand(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Every ampersand is replaced, including those of existing entities.
    """
    escaped = xml_str.replace(u'&', u'&amp;')
    return escaped
1098
1099
def setproctitle(title):
    """Pass `title` to prctl(15, ...) of libc.so.6 as the new process name.

    Does nothing when libc.so.6 cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer from the encoded bytes, not the character count: a
    # non-ASCII title is longer in UTF-8 than len(title) and overflowed the
    # old buffer with ValueError. create_string_buffer(bytes) also reserves
    # room for the terminating NUL.
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1113
1114
def remove_start(s, start):
    """Return `s` without the prefix `start`, or unchanged if it has no such prefix."""
    if not s.startswith(start):
        return s
    return s[len(start):]
1119
1120
def url_basename(url):
    """Return the last segment of the path of `url`.

    Slashes at either end of the path are ignored; query string and
    fragment are not part of the path.
    """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
1124
1125
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return "HEAD"
1129
1130
def int_or_none(v):
    """Return int(v), or None when `v` is None."""
    if v is None:
        return v
    return int(v)
1133
1134
def parse_duration(s):
    """Parse '[[H:]M:]S' into a number of seconds.

    Returns None when `s` is None or does not have that form.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)$', s)
    if not m:
        return None
    total = int(m.group('secs'))
    # The optional fields add their weight in seconds when present
    for field, factor in (('mins', 60), ('hours', 60 * 60)):
        if m.group(field):
            total += int(m.group(field)) * factor
    return total
1149
1150
def prepend_extension(filename, ext):
    """Insert `ext` as an extra extension before the real one of `filename`.

    For example 'abc.ext' with ext 'temp' becomes 'abc.temp.ext'.
    """
    stem, suffix = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, suffix)
1154
1155
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when launching the binary raises OSError. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe