]> Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
debian/control: Update list of extractors.
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import email.utils
6 import errno
7 import gzip
8 import io
9 import json
10 import locale
11 import math
12 import os
13 import pipes
14 import platform
15 import re
16 import ssl
17 import socket
18 import sys
19 import traceback
20 import xml.etree.ElementTree
21 import zlib
22
23 try:
24 import urllib.request as compat_urllib_request
25 except ImportError: # Python 2
26 import urllib2 as compat_urllib_request
27
28 try:
29 import urllib.error as compat_urllib_error
30 except ImportError: # Python 2
31 import urllib2 as compat_urllib_error
32
33 try:
34 import urllib.parse as compat_urllib_parse
35 except ImportError: # Python 2
36 import urllib as compat_urllib_parse
37
38 try:
39 from urllib.parse import urlparse as compat_urllib_parse_urlparse
40 except ImportError: # Python 2
41 from urlparse import urlparse as compat_urllib_parse_urlparse
42
43 try:
44 import urllib.parse as compat_urlparse
45 except ImportError: # Python 2
46 import urlparse as compat_urlparse
47
48 try:
49 import http.cookiejar as compat_cookiejar
50 except ImportError: # Python 2
51 import cookielib as compat_cookiejar
52
53 try:
54 import html.entities as compat_html_entities
55 except ImportError: # Python 2
56 import htmlentitydefs as compat_html_entities
57
58 try:
59 import html.parser as compat_html_parser
60 except ImportError: # Python 2
61 import HTMLParser as compat_html_parser
62
63 try:
64 import http.client as compat_http_client
65 except ImportError: # Python 2
66 import httplib as compat_http_client
67
68 try:
69 from urllib.error import HTTPError as compat_HTTPError
70 except ImportError: # Python 2
71 from urllib2 import HTTPError as compat_HTTPError
72
73 try:
74 from urllib.request import urlretrieve as compat_urlretrieve
75 except ImportError: # Python 2
76 from urllib import urlretrieve as compat_urlretrieve
77
78
# subprocess.DEVNULL only exists on Python 3.3+; otherwise open os.devnull
# by hand. Exposed as a callable so the fallback file is opened lazily.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
84
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode `string`. Python 2 only: relies on the 'hex' str codec.
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid %XX escape; keep the '%' literally.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Split a query string into an ordered list of (name, value) pairs.
        # Both '&' and ';' act as pair separators, per the CGI convention.
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Group the pairs from _parse_qsl into a {name: [value, ...]} dict,
        # preserving the order of values for repeated names.
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
163
# Text type and code-point constructor that work on both Python 2 and 3.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
173
def compat_ord(c):
    """Return the integer code point for c.

    Indexing bytes yields ints on Python 3 but 1-char strings on Python 2;
    this accepts either and normalizes to an int.
    """
    return c if type(c) is int else ord(c)
177
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers added to every HTTP request (see YoutubeDLHandler).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
188
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works.
        u'TEST'.encode(pref)
    except Exception:
        # Was a bare "except:", which also swallowed KeyboardInterrupt and
        # SystemExit; Exception is wide enough for locale/codec failures.
        pref = 'UTF-8'

    return pref
202
# Unicode-safe print: Python 2 needs the string encoded explicitly,
# replacing unencodable characters with XML character references.
if sys.version_info < (3,0):
    def compat_print(s):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        assert type(s) == type(u'')
        print(s)
210
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON to the file named fn."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON to the file named fn (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
221
if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # The asserts keep the interpolated expression free of characters
        # that would break the XPath predicate quoting.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # Python 2.6 ElementTree has no attribute predicates; scan manually.
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
235
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form.

    ns_map maps each prefix to its namespace URI; steps without a prefix
    are passed through unchanged.
    """
    expanded = []
    for step in path.split('/'):
        parts = step.split(':')
        if len(parts) == 1:
            expanded.append(parts[0])
        else:
            prefix, tag = parts
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
248
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#47;) or hexadecimal (&#x2F;).
    # BUG FIX: the old pattern x?\d+ rejected hex digits a-f, so e.g.
    # &#x2F; was decoded as chr(2) instead of '/'.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
273
# Replace HTMLParser's start-tag regex with a fixed upstream version that
# handles unquoted/odd attribute values correctly (see original comment).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """Base class for the ad-hoc HTML scrapers below.

    Keeps a reference to the full document in self.html so subclasses
    (e.g. AttrParser.error) can restart parsing from an arbitrary line.
    """
    def __init__(self):
        # BUG FIX: this method was misspelled "__init" and therefore never
        # ran as the constructor, leaving self.html undefined until loads().
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse the complete HTML document given as a string."""
        self.html = html
        self.feed(html)
        self.close()
284
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # The attribute name/value pair identifying the tag to isolate.
        self.attribute = attribute
        self.value = value
        # Grows to [tag, start_pos, end_pos] as parsing progresses.
        self.result = None
        self.started = False
        # Per-tag-name nesting depth, tracked once the target tag is found.
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Tolerate up to 10 parse errors before the target tag is found by
        # skipping the offending line and resuming; re-raise otherwise.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Any event after the opening tag fixes the content start position.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # Back at depth 0 for the target tag: record the end position.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any parser event counts as "content started" for position tracking.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Returns the text between the opening and closing tag, or None if
        # the tag was not found or parsing stopped before the end tag.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Single-line match: the end column shifts by the stripped prefix.
            # (The unconditional slice below is then a no-op.)
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Old HTMLParser versions choke on the "</scr'+'ipt>" obfuscation trick
# inside scripts; treat it as an end tag of that literal length.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
350
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is just a special case of attribute lookup.
    return get_element_by_attribute("id", id, html)
354
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort on malformed markup: return whatever was isolated.
        pass
    return finder.get_result()
363
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        # Only <meta> tags are of interest; everything else is skipped.
        if tag != 'meta':
            return
        attr_map = dict(attrs)
        if attr_map.get('name') == self.name:
            self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute of the matched tag, or None."""
        return self.result
384
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    finder = MetaParser(name)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort on malformed markup: return whatever was found.
        pass
    return finder.get_result()
395
396
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Collapse source newlines; real line breaks come from the markup.
    text = html.replace('\n', ' ')
    # Turn explicit <br> tags and paragraph boundaries into newlines.
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop all remaining tags, then decode entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
408
409
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors will not be fixed by renaming; propagate.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # BUG FIX: os.path.join takes separate path components, not a
        # generator object, so the sanitized parts must be unpacked.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # BUG FIX: open the sanitized name; opening the original
            # filename again would just fail the same way.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
443
444
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable date string.
        return None
    return email.utils.mktime_tz(parsed)
452
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _sanitize_char(ch):
        code = ord(ch)
        # Control characters and '?' are dropped entirely.
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted and (ch in '!&\'()[]{}$;`^,#' or ch.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return ch

    result = u''.join(_sanitize_char(ch) for ch in s)
    if not is_id:
        # Collapse runs of underscores and trim them from the ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
484
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Linear membership test keeps support for unhashable elements.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
492
def unescapeHTML(s):
    """Replace HTML entities in s by the characters they represent.

    @param s a string
    """
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
501
def encodeFilename(s):
    """Encode a unicode filename for use with the current platform/Python.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode filesystem API, and Windows NT 5+ (2000 and
    # up) accepts u'' strings directly, so pass the string through as-is.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.version_info >= (3, 0) or (
            sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5):
        return s

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
523
def decodeOption(optval):
    """Decode a command-line option value to text (None passes through)."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
532
def formatSeconds(secs):
    """Format a duration in seconds as 'S', 'M:SS' or 'H:MM:SS'."""
    if secs > 3600:
        hours, remainder = divmod(secs, 3600)
        minutes, seconds = divmod(remainder, 60)
        return '%d:%02d:%02d' % (hours, minutes, seconds)
    elif secs > 60:
        minutes, seconds = divmod(secs, 60)
        return '%d:%02d' % (minutes, seconds)
    else:
        return '%d' % secs
540
def make_HTTPS_handler(opts_no_check_certificate):
    """Build an HTTPS handler that forces an SSLv3 handshake first.

    NOTE(review): SSLv3 is obsolete and ssl.PROTOCOL_SSLv3 has been removed
    from modern Python builds; this reflects the historical behavior only.
    """
    if sys.version_info < (3, 2):
        # Python < 3.2 has no ssl.SSLContext; emulate via a custom
        # HTTPSConnection subclass that wraps the socket itself.
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if self._tunnel_host:
                    # Going through an HTTP proxy (CONNECT tunnel).
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    # Fall back to protocol auto-negotiation.
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3()
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.set_default_verify_paths()

        # Certificate verification is skipped when the user passed
        # --no-check-certificate.
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        return compat_urllib_request.HTTPSHandler(context=context)
571
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are always treated as expected.
        network_exceptions = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in network_exceptions:
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Render the stored traceback as text, or None if there is none."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
593
594
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # (etype, value, traceback) tuple from sys.exc_info(), if provided.
        self.exc_info = exc_info
611
612
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
620
621
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # BUG FIX: pass msg to Exception.__init__ so that str(err) carries
        # the message instead of being empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
630
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
643
644
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # downloaded: bytes actually received; expected: bytes announced
        # by the server (e.g. via Content-Length).
        self.downloaded = downloaded
        self.expected = expected
659
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate (no zlib header) first, then standard zlib —
        # servers are inconsistent about which variant they send.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older Python's addinfourl takes no `code` argument; set the
        # attribute manually in that case.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Delete-then-add so our standard headers replace urllib defaults.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal marker headers are consumed here, never sent on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively more trailing bytes stripped.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
740
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD, or None if
    date_str matches none of the known formats."""
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Format did not match; try the next one. The previous bare
            # "except:" also hid unrelated programming errors.
            pass
    return upload_date
767
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess a file extension from the path part of a URL."""
    path = url.partition(u'?')[0]
    candidate = path.rpartition(u'.')[2]
    # Only accept a purely alphanumeric suffix as an extension.
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
774
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name for a media file (base.lang.format)."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
777
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain YYYYMMDD date.
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Months and years are approximated as fixed day counts.
    if unit == 'month':
        unit = 'day'
        amount *= 30
    elif unit == 'year':
        unit = 'day'
        amount *= 365
    return today + datetime.timedelta(**{unit + 's': amount})
803
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable range.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
829
830
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
839
840
def write_string(s, out=None):
    """Write the unicode string s to out (default sys.stderr) and flush."""
    if out is None:
        out = sys.stderr
    assert type(s) == type(u'')

    # Binary streams need encoded bytes; Python 2's std streams lie about
    # their mode, so always encode there.
    if 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3:
        s = s.encode(preferredencoding(), 'ignore')
    out.write(s)
    out.flush()
851
852
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # Python 3: indexing bytes already yields ints.
        return list(bs)
    # Python 2: indexing a str yields 1-char strings.
    return [ord(ch) for ch in bs]
860
861
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):
        # Python 2: bytes is str, so assemble via chr().
        return ''.join([chr(x) for x in xs])
    return bytes(xs)
869
870
def get_cachedir(params=None):
    """Return the cache directory: the 'cachedir' option if set, otherwise
    $XDG_CACHE_HOME/youtube-dl (defaulting to ~/.cache/youtube-dl).

    The previous mutable default argument (params={}) is replaced by the
    None sentinel; behavior for all callers is unchanged.
    """
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
875
876
# Cross-platform file locking
if sys.platform == 'win32':
    # Windows: use LockFileEx/UnlockFileEx through ctypes, locking the
    # maximum possible byte range of the file.
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE, # hFile
        ctypes.wintypes.DWORD, # dwFlags
        ctypes.wintypes.DWORD, # dwReserved
        ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED) # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE, # hFile
        ctypes.wintypes.DWORD, # dwReserved
        ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED) # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high dwords of the byte-range length to (un)lock.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED struct alive on the file object; UnlockFileEx
        # must receive the same pointer later.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # POSIX: fcntl.lockf provides advisory locks directly.
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
940
941
class locked_file(object):
    """File wrapper holding an advisory lock for the life of a with-block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers can share the lock; writers/appenders need it exclusively.
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Always close, even if unlocking raises.
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
971
972
def shell_quote(args):
    """Join args into one shell-escaped command line string."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    quoted = []
    for arg in args:
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(fs_encoding)
        quoted.append(pipes.quote(arg))
    return u' '.join(quoted)
984
985
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for item in seq:
        yield item
        if not pred(item):
            break
993
994
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload rides in the fragment, which servers never see.
    payload = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'%s#%s' % (url, payload)
1001
1002
def unsmuggle_url(smug_url, default=None):
    """Extract data previously embedded by smuggle_url.

    Returns (url, data). If the URL carries no smuggled data, returns
    (smug_url, default); the new optional `default` parameter keeps the old
    behavior (None) for existing callers.
    """
    if not '#__youtubedl_smuggle' in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
1010
1011
def format_bytes(bytes):
    """Render a byte count as a human-readable string like '1.00KiB'.

    None yields u'N/A'; strings are converted to float first.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # math.log(0) is undefined, so zero gets exponent 0 explicitly.
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])