]> Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
debian/rules: Remove egg-info directory in the clean target.
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import errno
5 import gzip
6 import io
7 import json
8 import locale
9 import os
10 import re
11 import sys
12 import traceback
13 import zlib
14 import email.utils
15 import socket
16 import datetime
17
18 try:
19 import urllib.request as compat_urllib_request
20 except ImportError: # Python 2
21 import urllib2 as compat_urllib_request
22
23 try:
24 import urllib.error as compat_urllib_error
25 except ImportError: # Python 2
26 import urllib2 as compat_urllib_error
27
28 try:
29 import urllib.parse as compat_urllib_parse
30 except ImportError: # Python 2
31 import urllib as compat_urllib_parse
32
33 try:
34 from urllib.parse import urlparse as compat_urllib_parse_urlparse
35 except ImportError: # Python 2
36 from urlparse import urlparse as compat_urllib_parse_urlparse
37
38 try:
39 import http.cookiejar as compat_cookiejar
40 except ImportError: # Python 2
41 import cookielib as compat_cookiejar
42
43 try:
44 import html.entities as compat_html_entities
45 except ImportError: # Python 2
46 import htmlentitydefs as compat_html_entities
47
48 try:
49 import html.parser as compat_html_parser
50 except ImportError: # Python 2
51 import HTMLParser as compat_html_parser
52
53 try:
54 import http.client as compat_http_client
55 except ImportError: # Python 2
56 import httplib as compat_http_client
57
try:
    from subprocess import DEVNULL
except ImportError:
    # No subprocess.DEVNULL on this interpreter: hand out a freshly opened,
    # writable handle on the platform's null device instead.
    def compat_subprocess_get_DEVNULL():
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        return DEVNULL
63
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in *string* by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with *encoding*/*errors*, so multi-byte sequences survive.
        Malformed escapes are kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (ValueError, TypeError):
                # Python 2's hex codec reports malformed input (non-hex or
                # odd-length digits) with TypeError rather than ValueError;
                # either way the escape is invalid and is kept verbatim.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse the query string *qs* into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. Blank values are dropped unless
        *keep_blank_values* is true; with *strict_parsing* a field without
        '=' raises ValueError instead of being skipped.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse *qs* into a dict mapping each name to the list of its values.

        Values appear in the order they occur in the query string.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
142
# Text type and code-point-to-character function of the running interpreter.
try:
    compat_str, compat_chr = unicode, unichr  # Python 2
except NameError:
    # Neither name exists on Python 3, where str/chr are already Unicode-aware.
    compat_str, compat_chr = str, chr
152
def compat_ord(c):
    """Return *c* unchanged if it is exactly an int, otherwise ``ord(c)``.

    Lets callers index either a Python 3 bytes object (items are ints) or a
    Python 2 string (items are 1-char strings) and get an integer back.
    """
    return c if type(c) is int else ord(c)
156
# This is not clearly defined otherwise
# (obtained here by compiling an empty pattern and taking the type of the
# resulting object, so other code can compare against it).
compiled_regex_type = type(re.compile(''))
159
# Default HTTP request headers. YoutubeDLHandler.http_request() in this file
# iterates over this mapping and sets every entry on each outgoing request,
# replacing a same-named header already present on the request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
167
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported codec cannot actually encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale.Error, LookupError, ...), but no longer
        # a bare except, so KeyboardInterrupt/SystemExit still propagate.
        pref = 'UTF-8'

    return pref
181
def compat_print(s):
    """Print the text string *s* to standard output on Python 2 or 3."""
    if sys.version_info < (3, 0):
        # Python 2: encode explicitly; characters the console encoding cannot
        # represent are emitted as XML character references.
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
    else:
        assert type(s) == type(u'')
        print(s)
189
def write_json_file(obj, fn):
    """Serialize *obj* as JSON into the file named *fn*."""
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        out = open(fn, 'wb')
    else:
        out = open(fn, 'w', encoding='utf-8')
    with out:
        json.dump(obj, out)
200
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities and decimal/hexadecimal numeric references are decoded;
    anything else (including numeric references that do not denote a valid
    code point) is returned unchanged as '&<entity>;'.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references need hex digits after the 'x'; the pattern is anchored
    # so trailing garbage is not silently dropped.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a representable code point: fall through to the literal.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
225
# Replace the `locatestarttagend` pattern stored on the HTML parser module
# with the regex below (per the trailing note, a backported bug fix).
# NOTE(review): presumably this is the pattern HTMLParser uses to find the end
# of a start tag -- confirm against the stdlib version in use.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Attribute name/value pair the target start tag must carry.
        self.attribute = attribute
        self.value = value
        # Grows to [tag, start_pos, end_pos] as the target element is parsed;
        # both positions are values returned by getpos().
        self.result = None
        # True from the matching start tag until its element is closed.
        self.started = False
        # Per tag name: start tags seen minus end tags seen while started.
        self.depth = {}
        # The document passed to loads(); used by error() and get_result().
        self.html = None
        # Set when the target tag opens, cleared by find_startpos().
        self.watch_startpos = False
        # Number of parse errors recovered from so far.
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by resuming on a later line.

        Re-raises as HTMLParseError once more than 10 errors were recovered
        from, or when the error occurs inside the target element.
        NOTE(review): presumably invoked by the base HTMLParser on malformed
        markup -- confirm for the targeted Python versions.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Remember *html*, feed it to the parser and close the parser."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A child tag right after the target's opening tag: record the
            # start position now if it has not been recorded yet.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The target element is closed once the count for its own tag
            # name is back to zero; record the end position.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other content callback only needs to record the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None unless exactly [tag, start, end] was recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are used as (line, column) with the line counted from 1.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        # Drop everything before the start column on the first line.
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end column has to be
            # shifted by the prefix that was just removed.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        # Cut the last line at the end column (also applied, after the
        # adjustment above, in the single-line case).
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3 only: when the remaining input starts with
# the literal text </scr'+'ipt>, skip over it (return the index just past it)
# instead of handing it to HTMLParser.parse_endtag; every other position is
# delegated to the stock implementation.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
298
def get_element_by_id(id, html):
    """Return the content of the tag whose ``id`` attribute equals *id* in the HTML document *html*."""
    return get_element_by_attribute(attribute="id", value=id, html=html)
302
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag carrying attribute=value in the HTML document *html*.

    Parse errors are ignored; whatever the parser isolated up to that point
    (possibly None) is returned.
    """
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through and report what was found so far.
        pass
    return finder.get_result()
311
312
def clean_html(html):
    """Turn an HTML snippet into readable plain text."""
    # Source newlines carry no meaning in HTML; real line breaks come from
    # <br> tags and paragraph boundaries, handled below.
    text = html.replace('\n', ' ')
    substitutions = (
        # <br>, <br/>, ... -> newline
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        # </p><p ...> -> newline
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip every remaining tag
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
324
325
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails for a reason other
    than a permission error, characters that are forbidden in win32 file
    names are replaced by '#' in the last path component and the open is
    retried once with that name. If the name needs no change, or the retry
    fails too, the exception propagates like with the standard open()
    function.

    The special name u'-' selects standard output (its binary buffer where
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the final component is rewritten: applying the substitution
        # to the directory part would destroy its separators/drive colon.
        dirname, basename = os.path.split(filename)
        alt_basename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', basename)
        if alt_basename == basename:
            raise
        else:
            alt_filename = os.path.join(dirname, alt_basename)
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
359
360
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
368
def sanitize_filename(s, restricted=False, is_id=False):
    """Make a string usable as part of a filename.

    With restricted set, only a stricter subset of characters is kept.
    Pass is_id for identifiers (as opposed to arbitrary strings): they are
    only mapped character by character and otherwise kept as they are.
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped outright.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    # Never hand back an empty name.
    return result if result else '_'
400
def orderedSet(iterable):
    """Return a list of the items of *iterable* without duplicates, keeping first occurrences in order."""
    unique = []
    for item in iterable:
        # Membership is tested against the list itself, so unhashable items
        # are accepted as well.
        if item in unique:
            continue
        unique.append(item)
    return unique
408
def unescapeHTML(s):
    """Replace the HTML entities in *s* by the characters they stand for.

    @param s a string (must be a text/unicode string)
    """
    assert type(s) == type(u'')

    # Each '&...;' span is handed to htmlentity_transform for decoding.
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
417
def encodeFilename(s):
    """Return *s* in the form the file system functions expect.

    @param s The name of the file (must be a text/unicode string)
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API; so does Python 2 on Windows 2000 and up,
    # where u'' can be passed through directly.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    takes_unicode = sys.version_info >= (3, 0) or (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if takes_unicode:
        return s

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
439
def decodeOption(optval):
    """Return the option value *optval* as text; None is passed through.

    Byte strings are decoded with the preferred encoding.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())

        assert isinstance(optval, compat_str)
    return optval
448
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    Hours are shown only above 3600 seconds and minutes only above 60.
    """
    if secs > 3600:
        hours = secs // 3600
        minutes = (secs % 3600) // 60
        return '%d:%02d:%02d' % (hours, minutes, secs % 60)
    if secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
456
def make_HTTPS_handler(opts):
    """Build the HTTPS handler for the opener.

    opts.no_check_certificate selects whether server certificates are
    verified (only honoured on Python 3.2 and later).
    """
    if sys.version_info < (3,2):
        # Python's 2.x handler is very simplistic
        return compat_urllib_request.HTTPSHandler()

    import ssl
    context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
    context.set_default_verify_paths()

    if opts.no_check_certificate:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    return compat_urllib_request.HTTPSHandler(context=context)
470
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None):
        """ tb, if given, is the original traceback (so that it can be printed out). """
        active_exc = sys.exc_info()
        # Network trouble and unavailable videos are not worth a bug report;
        # for anything else the user is asked to report the problem.
        expected = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if active_exc[0] not in expected:
            msg = msg + u'; please report this issue on GitHub.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = active_exc # preserve original exception

    def format_traceback(self):
        """Return the stored traceback as one string, or None if there is none."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
487
488
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
500
501
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    several downloads would end up in the same file on disk.
    """
509
510
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Initialise the base class too (as DownloadError and ExtractorError
        # do) so that args/str() carry the message on every interpreter.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
519
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
523
524
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
532
533
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Forward both values to the base class so that args (and with it
        # str()/repr()) is populated identically on every interpreter.
        super(ContentTooShortError, self).__init__(downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected
548
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    Once installed in an OpenerDirector, this class adds the standard
    headers to every HTTP request and transparently decodes gzipped and
    deflated responses. A request that must not use compression only needs
    to carry the HTTP header "Youtubedl-No-Compression"; that header is
    removed again before the real request is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress *data*, trying a raw deflate stream first and a
        zlib-wrapped stream second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying *code*, whether or not the
        addinfourl class of this interpreter accepts the code itself."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        wrapped = addinfourl(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        """Apply the standard headers and the Youtubedl-* pseudo headers."""
        for name, value in std_headers.items():
            req.headers.pop(name, None)
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            custom_agent = req.headers['Youtubedl-user-agent']
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = custom_agent
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return *resp*, re-wrapped around the decoded body when it was
        sent gzip- or deflate-encoded."""
        encoding = resp.headers.get('Content-encoding', '')
        if encoding == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
        elif encoding == 'deflate':
            body = io.BytesIO(self.deflate(resp.read()))
        else:
            return resp
        # The wrapper reuses the original headers, URL, status and message.
        decoded = self.addinfourl_wrapper(body, resp.headers, resp.url, resp.code)
        decoded.msg = resp.msg
        return decoded

    https_request = http_request
    https_response = http_response
614
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Commas and a trailing UTC offset are ignored. Returns None when
    *date_str* matches none of the known formats.
    """
    upload_date = None
    #Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Not this format (or a date strftime cannot render): try the
            # next one. Anything else is a real error and must surface.
            pass
    return upload_date
629
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Raises ValueError if the string has neither form.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' is a regex escape, not a string escape.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no month/year units, so approximate them with
        # 30 and 365 days respectively.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks').
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
655
class DateRange(object):
    """A time interval delimited by two dates (both ends included)"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a missing bound defaults to the earliest/latest representable date"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Build the range that holds only the given day"""
        return cls(day,day)

    def __contains__(self, date):
        """Tell whether date (a date object or a date string) lies in the range"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())