Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
5dd5b2923d2a773a526006d71769d10486fe8730
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import errno
5 import gzip
6 import io
7 import json
8 import locale
9 import os
10 import re
11 import sys
12 import traceback
13 import zlib
14 import email.utils
15 import socket
16 import datetime
17
18 try:
19 import urllib.request as compat_urllib_request
20 except ImportError: # Python 2
21 import urllib2 as compat_urllib_request
22
23 try:
24 import urllib.error as compat_urllib_error
25 except ImportError: # Python 2
26 import urllib2 as compat_urllib_error
27
28 try:
29 import urllib.parse as compat_urllib_parse
30 except ImportError: # Python 2
31 import urllib as compat_urllib_parse
32
33 try:
34 from urllib.parse import urlparse as compat_urllib_parse_urlparse
35 except ImportError: # Python 2
36 from urlparse import urlparse as compat_urllib_parse_urlparse
37
38 try:
39 import urllib.parse as compat_urlparse
40 except ImportError: # Python 2
41 import urlparse as compat_urlparse
42
43 try:
44 import http.cookiejar as compat_cookiejar
45 except ImportError: # Python 2
46 import cookielib as compat_cookiejar
47
48 try:
49 import html.entities as compat_html_entities
50 except ImportError: # Python 2
51 import htmlentitydefs as compat_html_entities
52
53 try:
54 import html.parser as compat_html_parser
55 except ImportError: # Python 2
56 import HTMLParser as compat_html_parser
57
58 try:
59 import http.client as compat_http_client
60 except ImportError: # Python 2
61 import httplib as compat_http_client
62
# subprocess.DEVNULL is not importable on older Pythons; fall back to opening
# the platform's null device for writing on every call.
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a writable handle on the null device."""
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return subprocess.DEVNULL."""
        return DEVNULL
68
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the parse_qs implementation from cpython 3's
    # stdlib; Python 2's own version is apparently broken.

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with `encoding`/`errors`. Malformed escapes are kept
        literally (including the leading '%').
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (ValueError, TypeError):
                # Python 2's hex codec signals malformed input (odd length,
                # non-hex digit) with TypeError, so catch it as well and keep
                # the text literally.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'. With strict_parsing, a field
        without '=' raises ValueError; otherwise it is skipped unless
        keep_blank_values is set.
        """
        _coerce_result = unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping names to value lists."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
147
# Text type and code-point-to-character function for the running interpreter.
try:
    compat_str, compat_chr = unicode, unichr  # Python 2
except NameError:
    compat_str, compat_chr = str, chr
157
def compat_ord(c):
    """Return c unchanged if it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
161
# The type of compiled pattern objects is not clearly defined otherwise,
# so derive it from an actual compiled pattern.
compiled_regex_type = re.compile('').__class__
164
# Default request headers; YoutubeDLHandler.http_request applies them to
# every outgoing request.
std_headers = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept-Language', 'en-us,en;q=0.5'),
])
172
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding() if it can
    actually be used to encode text, and 'UTF-8' otherwise.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported encoding is known and usable.
        u'TEST'.encode(pref)
    except Exception:
        # Any failure to obtain or use the locale's encoding falls back to
        # UTF-8; KeyboardInterrupt/SystemExit are deliberately not swallowed.
        pref = 'UTF-8'

    return pref
186
def compat_print(s):
    """Print the text s to standard output.

    On Python 2 the text is encoded with the preferred encoding first,
    replacing unencodable characters by XML character references.
    """
    if sys.version_info < (3, 0):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
    else:
        assert type(s) == type(u'')
        print(s)
194
def write_json_file(obj, fn):
    """Serialize obj as JSON into the file named fn."""
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream.
    if sys.version_info < (3, 0):
        out = open(fn, 'wb')
    else:
        out = open(fn, 'w', encoding='utf-8')
    with out as f:
        json.dump(obj, f)
205
def find_xpath_attr(node, xpath, key, val):
    """ Find the xpath xpath[@key=val] """
    if sys.version_info >= (2, 7):
        # Attribute predicates are usable here; key and val are restricted to
        # simple characters because they are spliced into the expression.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z@\s]*$', val)
        return node.find(xpath + u"[@%s='%s']" % (key, val))

    # Older interpreters: filter the matching nodes by hand.
    for candidate in node.findall(xpath):
        if candidate.attrib.get(key) == val:
            return candidate
    return None
219
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must be the entity text
    without the surrounding '&' and ';'.

    Named entities and decimal/hexadecimal numeric references are decoded;
    anything else is returned unchanged in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric reference: hex references need the full hex digit set
    # (a bare \d+ would stop at the first a-f digit and decode garbage).
    mobj = re.match(u'#([xX][0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[:1] in (u'x', u'X'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range: leave the text untouched.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
244
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix


class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Once complete: [tag name, position after the opening tag,
        # position of the matching closing tag].
        self.result = None
        # True while inside the element being isolated.
        self.started = False
        # Open-tag counters per tag name, tracked while started.
        self.depth = {}
        self.html = None
        # True until the first event after the matching opening tag.
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Skip the offending line and resume, unless too many errors
        occurred or the wanted element is currently being read."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        remaining = self.html.split('\n')[self.getpos()[0]:]  # skip one line
        self.rawdata = '\n'.join(remaining)
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document html."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attr_map and attr_map[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element ends when its own tag name is balanced again.
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event only serves to record the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None
        if no complete element was found."""
        if self.result is None or len(self.result) != 3:
            return None
        start, end = self.result[1], self.result[2]
        lines = self.html.split('\n')[start[0] - 1:end[0]]
        lines[0] = lines[0][start[1]:]
        if len(lines) == 1:
            # Same line: the end offset is relative to the trimmed line.
            lines[-1] = lines[-1][:end[1] - start[1]]
        lines[-1] = lines[-1][:end[1]]
        return '\n'.join(lines).strip()


# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _attrparser_parse_endtag(self, i):
        marker = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(marker):
            return i + len(marker)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)
    AttrParser.parse_endtag = _attrparser_parse_endtag
317
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id in the
    passed HTML document (delegates to get_element_by_attribute)."""
    return get_element_by_attribute('id', id, html)
321
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the parse error.
        pass
    return attr_parser.get_result()
330
331
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning; only <br> and paragraph boundaries
    # become line breaks.
    text = html.replace('\n', ' ')
    for pattern, replacement in (
            (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
            (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
            # Strip html tags
            ('<.*?>', ''),
    ):
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
343
344
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries with the
    characters forbidden on win32 replaced by '#'. If that also fails (or
    the name contained no such characters), the exception is raised, like
    the standard open() function would.

    The filename u'-' selects standard output (its binary buffer when
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join needs the parts as separate arguments, not a generator.
        # NOTE(review): separators inside the directory part are replaced as
        # well, as the original expression did -- confirm this is intended.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name, which is also the one reported back.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
378
379
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
387
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped outright.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    cleaned = u''.join(replace_insane(ch) for ch in s)
    if is_id:
        # IDs are kept as close to the original as possible.
        return cleaned

    # Collapse runs of underscores and trim them from both ends.
    while '__' in cleaned:
        cleaned = cleaned.replace('__', '_')
    cleaned = cleaned.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    return cleaned if cleaned else '_'
419
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list with the first occurrence of each element, in input
    order. Elements are compared by equality and need not be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
427
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in s via htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
436
def encodeFilename(s):
    """
    Return s in the form expected by the file APIs of this interpreter.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    fs_encoding = sys.getfilesystemencoding()
    return s.encode('utf-8' if fs_encoding is None else fs_encoding, 'ignore')
458
def decodeOption(optval):
    """Return the option value optval as text.

    None is passed through; byte strings are decoded with the preferred
    encoding.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
467
def formatSeconds(secs):
    """Format a duration in seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The boundaries are inclusive, so exactly 60 seconds is '1:00' and
    exactly 3600 seconds is '1:00:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
475
def make_HTTPS_handler(opts):
    """Build the HTTPS handler for the opener.

    opts.no_check_certificate selects whether server certificates are
    verified (only honoured where an SSL context can be supplied).
    """
    if sys.version_info >= (3, 2):
        import ssl
        ssl_context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        ssl_context.set_default_verify_paths()

        if opts.no_check_certificate:
            ssl_context.verify_mode = ssl.CERT_NONE
        else:
            ssl_context.verify_mode = ssl.CERT_REQUIRED
        return compat_urllib_request.HTTPSHandler(context=ssl_context)

    # Python's 2.x handler is very simplistic
    return compat_urllib_request.HTTPSHandler()
489
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        current_exc = sys.exc_info()  # preserve original exception

        # Network problems and unavailable videos being handled right now
        # are never treated as bugs.
        expected = expected or current_exc[0] in (
            compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = current_exc

    def format_traceback(self):
        """Return the stored traceback as text, or None if there is none."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
510
511
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept for callers that want to inspect or print the root cause.
        self.exc_info = exc_info
523
524
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
532
533
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is exposed to callers as the .msg attribute.
        self.msg = msg
542
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
546
547
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
555
556
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, indicating the
    connection was probably interrupted.
    """
    # Byte counts: what was received and what was announced.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
571
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class automatically adds
    the standard headers to every HTTP request and transparently decodes
    gzipped and deflated responses from web servers. To avoid compression
    in a particular request, the original request in the program code only
    has to include the HTTP header "Youtubedl-No-Compression", which will
    be removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data, trying a raw deflate stream first and a
        zlib-wrapped stream second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, also on
        versions whose constructor does not accept it."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        response = addinfourl(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        # Force the standard headers onto the request.
        for name, value in std_headers.items():
            req.headers.pop(name, None)
            req.add_header(name, value)
        # Internal marker headers are consumed here and never sent.
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        original = resp

        def rewrap(stream):
            # New response around the decoded stream, keeping the metadata
            # of the original response.
            wrapped = self.addinfourl_wrapper(
                stream, original.headers, original.url, original.code)
            wrapped.msg = original.msg
            return wrapped

        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            resp = rewrap(gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r'))
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            resp = rewrap(io.BytesIO(self.deflate(resp.read())))
        return resp

    https_request = http_request
    https_response = http_response
637
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or matches none of the known formats.
    """
    if date_str is None:
        return None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = (
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
    )
    for expression in format_expressions:
        try:
            # The first format that parses wins; no need to try the rest.
            return datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Not this format (or a date strftime cannot render); try next.
            continue
    return None
652
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The candidate is the text after the last dot of the part before the
    query string; it is returned only if purely alphanumeric, otherwise
    default_ext is returned.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
659
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if the string matches neither format.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no month/year units, so approximate them in days.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks').
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
685
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the earliest/latest
        representable date."""
        self.start = (date_from_str(start) if start is not None
                      else datetime.datetime.min.date())
        self.end = (date_from_str(end) if end is not None
                    else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Strings are parsed first; date objects are compared as they are.
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())