Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
debian/NEWS: Add link to my webpage to give more details.
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import ctypes
5 import datetime
6 import email.utils
7 import errno
8 import gzip
9 import itertools
10 import io
11 import json
12 import locale
13 import math
14 import os
15 import pipes
16 import platform
17 import re
18 import ssl
19 import socket
20 import struct
21 import subprocess
22 import sys
23 import traceback
24 import zlib
25
26 try:
27 import urllib.request as compat_urllib_request
28 except ImportError: # Python 2
29 import urllib2 as compat_urllib_request
30
31 try:
32 import urllib.error as compat_urllib_error
33 except ImportError: # Python 2
34 import urllib2 as compat_urllib_error
35
36 try:
37 import urllib.parse as compat_urllib_parse
38 except ImportError: # Python 2
39 import urllib as compat_urllib_parse
40
41 try:
42 from urllib.parse import urlparse as compat_urllib_parse_urlparse
43 except ImportError: # Python 2
44 from urlparse import urlparse as compat_urllib_parse_urlparse
45
46 try:
47 import urllib.parse as compat_urlparse
48 except ImportError: # Python 2
49 import urlparse as compat_urlparse
50
51 try:
52 import http.cookiejar as compat_cookiejar
53 except ImportError: # Python 2
54 import cookielib as compat_cookiejar
55
56 try:
57 import html.entities as compat_html_entities
58 except ImportError: # Python 2
59 import htmlentitydefs as compat_html_entities
60
61 try:
62 import html.parser as compat_html_parser
63 except ImportError: # Python 2
64 import HTMLParser as compat_html_parser
65
66 try:
67 import http.client as compat_http_client
68 except ImportError: # Python 2
69 import httplib as compat_http_client
70
71 try:
72 from urllib.error import HTTPError as compat_HTTPError
73 except ImportError: # Python 2
74 from urllib2 import HTTPError as compat_HTTPError
75
76 try:
77 from urllib.request import urlretrieve as compat_urlretrieve
78 except ImportError: # Python 2
79 from urllib import urlretrieve as compat_urlretrieve
80
81
# subprocess.DEVNULL is not available on every supported interpreter; when it
# is missing, hand out a freshly opened writable handle on the null device.
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a new writable file object opened on os.path.devnull."""
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL sentinel."""
        return DEVNULL
87
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: the helpers below mirror the parse_qs implementation shipped with
    # CPython 3, because the Python 2 version does not behave correctly.
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Decode the %xx escapes in *string* and return the result."""
        if string == '':
            return string
        pieces = string.split('%')
        if len(pieces) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pending: run of consecutive percent-encoded bytes not yet decoded
        pending = b''
        decoded = pieces[0]
        for piece in pieces[1:]:
            try:
                if not piece:
                    raise ValueError
                pending += piece[:2].decode('hex')
                tail = piece[2:]
                if not tail:
                    # Only an escape in this piece; it may be one unit of a
                    # multi-byte character, so keep it in `pending` for now.
                    continue
            except ValueError:
                tail = '%' + piece
            # Plain characters reached: flush the pending escaped bytes.
            decoded += pending.decode(encoding, errors) + tail
            pending = b''
        if pending:
            # Flush whatever escaped bytes are left at the end.
            decoded += pending.decode(encoding, errors)
        return decoded

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs."""
        qs, _coerce_result = qs, unicode

        def _decode(part):
            part = part.replace('+', ' ')
            part = _unquote(part, encoding=encoding, errors=errors)
            return _coerce_result(part)

        fields = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        result = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            nv = field.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name without an equal sign
                if not keep_blank_values:
                    continue
                nv.append('')
            if len(nv[1]) or keep_blank_values:
                name = _decode(nv[0])
                value = _decode(nv[1])
                result.append((name, value))
        return result

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping names to value lists."""
        parsed_result = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            parsed_result.setdefault(name, []).append(value)
        return parsed_result
166
# Text type: `unicode` where it exists (Python 2), otherwise `str`.
try:
    unicode
except NameError:
    compat_str = str
else:
    compat_str = unicode

# Code point -> character: `unichr` where it exists (Python 2), otherwise `chr`.
try:
    unichr
except NameError:
    compat_chr = chr
else:
    compat_chr = unichr
176
def compat_ord(c):
    """Return *c* unchanged if it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
180
# The type of a compiled pattern, obtained by compiling an empty pattern.
_empty_pattern = re.compile('')
compiled_regex_type = type(_empty_pattern)

# Headers that YoutubeDLHandler.http_request puts on every request.
std_headers = {
    'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset':
        'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding':
        'gzip, deflate',
    'Accept-Language':
        'en-us,en;q=0.5',
}
191
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding(), or
    'UTF-8' when that lookup fails or the reported codec cannot encode a
    simple test string.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the codec really exists and works.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad, but no longer a bare except: KeyboardInterrupt
        # and SystemExit must propagate.
        pref = 'UTF-8'

    return pref
205
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string *s*."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print *s* encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
213
# json.dump wants a byte stream on Python 2.x and a character stream on
# Python 3.x, so the file is opened differently for each.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as json_file:
            json.dump(obj, json_file)
else:
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (binary mode)."""
        with open(fn, 'wb') as json_file:
            json.dump(obj, json_file)
224
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching xpath whose attribute key equals val."""
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so restrict them
        # to characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
238
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of *path* to '{uri}tag' using *ns_map*."""
    steps = [step.split(':') for step in path.split('/')]

    def expand(parts):
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(parts) for parts in steps])
251
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must be the entity body
    (the text between '&' and ';').

    Returns the decoded character, or the literal '&entity;' text when the
    entity is unknown or names a code point that cannot be represented.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric entity: decimal (#47) or hexadecimal (#x2F). The hexadecimal
    # alternative must accept a-f, otherwise '#x2F' is cut short to '#x2'.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point out of range: fall through to the literal form.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
276
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document handed to loads()."""

    def __init__(self):
        # This was spelled `__init`, which Python name-mangles and never
        # calls, so self.html did not exist until loads() ran.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store *html* on the instance, feed it to the parser and close it."""
        self.html = html
        self.feed(html)
        self.close()
287
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result grows into [tag, start position, end position]
        self.result = None
        self.started = False
        # open-tag counters, recorded only once the wanted tag was seen
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Skip past a parse error, unless capturing or after too many errors."""
        give_up = self.error_count > 10 or self.started
        if give_up:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        self.rawdata = '\n'.join(self.html.split('\n')[current_line:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attr_map and attr_map[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The capture ends once the wanted tag has been closed as often as opened.
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the recorded positions, or None."""
        if self.result is None or len(self.result) != 3:
            return None
        start_line, start_col = self.result[1][0], self.result[1][1]
        end_line, end_col = self.result[2][0], self.result[2][1]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()


# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _attrparser_parse_endtag(self, i):
        broken_endtag = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(broken_endtag):
            return i + len(broken_endtag)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _attrparser_parse_endtag
353
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
357
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Parse errors are ignored; whatever was captured is returned.
        pass
    result = attr_parser.get_result()
    return result
366
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching <meta> tag overwrites an earlier one.
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute of the matched meta tag, or None."""
        return self.result
387
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Parse errors are ignored; whatever was found is returned.
        pass
    result = meta_parser.get_result()
    return result
398
399
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines become spaces; <br> tags and </p><p> pairs become newlines.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop all remaining tags.
    text = re.sub('<.*?>', '', text)
    # Decode HTML entities and trim surrounding whitespace.
    return unescapeHTML(text).strip()
411
412
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it replaces the
    characters that are forbidden on win32 and tries once more; if that
    fails too, the exception propagates like with the standard open()
    function. The filename u'-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join takes the parts as separate arguments, so the
        # sanitized parts must be unpacked rather than passed as a generator.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name; reopening `filename` would just fail
            # the same way again.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
446
447
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable date string.
        return None
    return email.utils.mktime_tz(parsed)
455
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
487
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
495
def unescapeHTML(s):
    """
    Replace the HTML entities in s with the characters they stand for.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
504
505
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form expected by the file APIs of this interpreter.

    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up accept u'' directly through the Unicode APIs.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
532
533
def decodeOption(optval):
    """Return optval as text, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
542
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The widest unit is chosen from the magnitude of secs. The bounds are
    inclusive so that exactly one hour renders as '1:00:00' (not '60:00')
    and exactly one minute as '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
550
551
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler used for https:// requests.

    opts_no_check_certificate: when true, server certificates are not
        verified (only honoured on Python >= 3.2).
    kwargs: passed through to the handler constructor.

    The protocol is no longer pinned to SSLv3: that constant is missing
    from current ssl builds and the protocol is insecure, so the handshake
    is left to negotiate the best version both sides support.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Keep the historical SSLv3-first attempt where the constant
                # still exists, then fall back to version negotiation.
                sslv3 = getattr(ssl, 'PROTOCOL_SSLv3', None)
                if sslv3 is not None:
                    try:
                        self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=sslv3)
                        return
                    except ssl.SSLError:
                        pass
                self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        # PROTOCOL_TLS is the newer name of PROTOCOL_SSLv23; both negotiate
        # the highest protocol version shared by client and server.
        protocol = getattr(ssl, 'PROTOCOL_TLS', None)
        if protocol is None:
            protocol = ssl.PROTOCOL_SSLv23
        context = ssl.SSLContext(protocol)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
585
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # An error raised while one of these exceptions is being handled is
        # always treated as expected.
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            report_hint = u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
            msg = msg + report_hint
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None."""
        if self.traceback is None:
            return None
        formatted = traceback.format_tb(self.traceback)
        return u''.join(formatted)
607
608
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
612
613
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
625
626
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
634
635
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """Keep the error message available as the msg attribute."""
        self.msg = msg
644
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
648
649
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
657
658
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults; both values are in bytes.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """Record the number of bytes received and the number announced."""
        self.downloaded, self.expected = downloaded, expected
673
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    Installed in an OpenerDirector, this class puts the standard headers
    on every HTTP request and transparently decodes gzipped and deflated
    responses. A request that must not be compressed only needs to carry
    the HTTP header "Youtubedl-No-Compression"; that header is removed
    before the real request is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Inflate data, trying a raw deflate stream first, then zlib-wrapped."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting code by hand if unsupported."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        ret = addinfourl(stream, headers, url)
        ret.code = code
        return ret

    @staticmethod
    def _gunzip(content):
        """Return a BytesIO holding the gunzipped content."""
        try:
            return io.BytesIO(gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb').read())
        except IOError as original_ioerror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    trimmed = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                    return io.BytesIO(trimmed.read())
                except IOError:
                    continue
            raise original_ioerror

    def http_request(self, req):
        # The standard headers replace whatever the request already carries.
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        # Internal marker headers are consumed here and never sent.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        content_encoding = resp.headers.get('Content-encoding', '')
        if content_encoding == 'gzip':
            uncompressed = self._gunzip(resp.read())
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        elif content_encoding == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
754
755
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    date_str is tried against a list of known formats and, failing that,
    parsed as an RFC 2822 date. Returns None when date_str is None or
    cannot be parsed.
    """
    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue
        # Stop at the first format that parses; the rest need not be tried.
        break
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                # The fields did not form a valid date: report unparseable.
                pass
    return upload_date
789
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url, or return default_ext.

    The candidate is the text after the last '.' in the part of the URL
    before the first '?'; it is accepted only if purely alphanumeric.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
796
def subtitles_filename(filename, sub_lang, sub_format):
    """Swap the last extension of filename for '.<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
799
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months count as 30 days and years as 365 days. A string that is in
    neither form raises ValueError (from strptime).
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no months or years, so approximate them in days.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks').
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
825
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        # Not in YYYYMMDD form: hand it back untouched.
        return date_str
    return '-'.join(parts.groups())
834
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # A missing bound falls back to the earliest / latest possible date.
        self.start = (date_from_str(start) if start is not None
                      else datetime.datetime.min.date())
        self.end = (date_from_str(end) if end is not None
                    else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date and date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
860
861
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
870
871
def write_string(s, out=None):
    """Write the text s to out (sys.stderr by default) and flush it."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    binary_stream = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_stream or sys.version_info[0] < 3:
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if not (sys.platform == 'win32' and hasattr(out, 'encoding')):
            raise
        # Drop whatever the stream's own encoding cannot represent and retry.
        s = s.encode(out.encoding, 'ignore').decode(out.encoding)
        out.write(s)

    out.flush()
892
893
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    # Python 3 bytes already index to ints; Python 2 strings index to chars.
    if not isinstance(bs[0], int):
        return [ord(c) for c in bs]
    return list(bs)
901
902
def intlist_to_bytes(xs):
    """Return a byte string made from the list of byte values xs."""
    if not xs:
        return b''
    # chr() yields a byte string only on Python 2.
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    return ''.join([chr(x) for x in xs])
910
911
def get_cachedir(params={}):
    """Return params['cachedir'] if present, else the default cache directory.

    The default is 'youtube-dl' under $XDG_CACHE_HOME, or under ~/.cache
    when that variable is not set.
    """
    fallback_root = os.path.expanduser('~/.cache')
    cache_root = os.environ.get('XDG_CACHE_HOME', fallback_root)
    default_dir = os.path.join(cache_root, 'youtube-dl')
    return params.get('cachedir', default_dir)
916
917
# Cross-platform file locking: fcntl.lockf everywhere except win32, where
# LockFileEx / UnlockFileEx from kernel32 are called through ctypes.
if sys.platform != 'win32':
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared lock on the open file f."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.lockf(f, operation)

    def _unlock_file(f):
        """Release the lock held on the open file f."""
        fcntl.lockf(f, fcntl.LOCK_UN)

else:
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low / high halves of the byte count passed to both calls.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError when LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same structure.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(handle, flags, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock the open file f; raises OSError when UnlockFileEx fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(handle, 0,
                                whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
981
982
class locked_file(object):
    """Context manager around an opened file that is locked while in use.

    Mode 'r' takes a shared lock; 'a' and 'w' take an exclusive one.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Locking failed: close the file before passing the error on.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking raises.
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the write method of the underlying file."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the read method of the underlying file."""
        return self.f.read(*args)
1012
1013
def shell_quote(args):
    """Return args joined into one shell-escaped command-line string.

    Byte-string arguments (e.g. filenames from encodeFilename) are decoded
    with the filesystem encoding, or UTF-8 if that is unknown, first.
    """
    # shlex.quote is the documented API; pipes.quote was an undocumented
    # alias and the pipes module no longer exists on current Python.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    quoted_args = []
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1025
1026
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for element in seq:
        yield element
        if pred(element):
            continue
        break
1034
1035
def smuggle_url(url, data):
    """Attach JSON-serializable data to a URL for internal use.

    The data is JSON-encoded, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1042
1043
def unsmuggle_url(smug_url, default=None):
    """Split a URL into (url, smuggled data).

    If smug_url carries no '#__youtubedl_smuggle' marker it is returned
    unchanged together with default; otherwise the part after the last '#'
    is parsed and its JSON payload decoded.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    pieces = smug_url.rpartition(u'#')
    plain_url = pieces[0]
    query = compat_parse_qs(pieces[2])
    encoded = query[u'__youtubedl_smuggle'][0]
    return plain_url, json.loads(encoded)
1051
1052
def format_bytes(bytes):
    """Format a byte count as text with a binary (1024-based) unit suffix.

    Returns u'N/A' for None. A str argument is converted with float()
    first. The result has two decimals, e.g. u'1.50KiB'.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)

    # log() is undefined for zero, so zero is pinned to the plain byte unit.
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))

    units = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    suffix = units[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1065
1066
def str_to_int(int_str):
    """Parse an integer from a string, ignoring ',' and '.' separators."""
    digits = re.sub(r'[,\.]', u'', int_str)
    return int(digits)
1070
1071
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the second field of the output of `stty size` is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Malformed COLUMNS value: fall back to asking stty instead of
            # crashing.
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output, ...).
        # Unlike a bare except this lets KeyboardInterrupt and SystemExit
        # propagate.
        return None
1086
1087
def month_by_name(name):
    """Map an English month name to its number (1-12), independent of locale.

    Returns None when name is not one of the twelve full English names.
    """
    english_names = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name not in english_names:
        return None
    return english_names.index(name) + 1
1098
1099
def fix_xml_ampersands(xml_str):
    """Escape bare '&' characters in XML text as '&amp;'.

    Ampersands that already start one of the entities recognised by the
    pattern below (named, decimal or hexadecimal) are left untouched.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1106
1107
def setproctitle(title):
    """Try to set the process name via libc's prctl; silently do nothing
    when libc.so.6 cannot be loaded or has no prctl.

    title must be a text string (compat_str).
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer from the encoded bytes, not the character count:
    # non-ASCII titles are longer in UTF-8 and would otherwise not fit.
    # Initialising from the bytes also guarantees a trailing NUL.
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1121
1122
def remove_start(s, start):
    """Return s without the prefix start; s is returned as is if it does not
    begin with start."""
    if not s.startswith(start):
        return s
    prefix_length = len(start)
    return s[prefix_length:]
1127
1128
def url_basename(url):
    """Return the last path segment of url, ignoring surrounding slashes."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
1132
1133
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports HEAD."""

    def get_method(self):
        method = "HEAD"
        return method
1137
1138
def int_or_none(v, scale=1):
    """Convert v to int and floor-divide it by scale; None passes through."""
    if v is None:
        return v
    return int(v) // scale
1141
1142
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Accepts forms matched by the pattern below, such as '90', '2:30',
    '1:02:03' or '1h2m3s'. Returns None for None or for unmatched input.
    """
    if s is None:
        return None

    match = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if match is None:
        return None

    hours = match.group('hours')
    minutes = match.group('mins')
    total = int(match.group('secs'))
    if minutes:
        total += int(minutes) * 60
    if hours:
        total += int(hours) * 60 * 60
    return total
1157
1158
def prepend_extension(filename, ext):
    """Insert ext before the real extension of filename.

    For example ('abc.ext', 'temp') gives u'abc.temp.ext'.
    """
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1162
1163
def check_executable(exe, args=[]):
    """Check whether the binary exe can be launched (e.g. is found in PATH).

    args can be a list of arguments for a short output (like -version).
    Returns exe when the process could be started, False when starting it
    raised OSError.
    """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1172
1173
class PagedList(object):
    """List-like view over results that are fetched one page at a time.

    pagefunc(pagenum) is called with a page number (counting from 0) and
    must return an iterable with that page's entries. A page holding fewer
    than pagesize entries is taken to be the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with index start <= i < end as a list.

        end=None means "up to the last entry". Only the pages overlapping
        the requested range are fetched.
        """
        # An empty or inverted range needs no page at all. Without this
        # guard the end-of-range arithmetic below wraps around (e.g.
        # getslice(0, 0) would yield every entry).
        if end is not None and end <= start:
            return []

        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry; non-zero only on the page
            # that contains start.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive offset of the last wanted entry; set only on the
            # page that contains end.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1219
1220
def uppercase_escape(s):
    """Replace each literal \\UXXXXXXXX escape (eight hex digits) in s with
    the character that has that code point."""
    def _to_char(match):
        return compat_chr(int(match.group(1), base=16))

    return re.sub(r'\\U([0-9a-fA-F]{8})', _to_char, s)
1225
# Pick struct_pack/struct_unpack implementations depending on whether
# struct accepts a text format string on this interpreter.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that also accepts a text format string."""
        bytes_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(bytes_spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text format string."""
        bytes_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(bytes_spec, *args)
else:
    # Text format strings work natively; use the stdlib functions directly.
    struct_unpack = struct.unpack
    struct_pack = struct.pack