Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
Merge tag 'upstream/2013.07.10'
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import errno
5 import gzip
6 import io
7 import json
8 import locale
9 import os
10 import re
11 import sys
12 import traceback
13 import zlib
14 import email.utils
15 import socket
16 import datetime
17
18 try:
19 import urllib.request as compat_urllib_request
20 except ImportError: # Python 2
21 import urllib2 as compat_urllib_request
22
23 try:
24 import urllib.error as compat_urllib_error
25 except ImportError: # Python 2
26 import urllib2 as compat_urllib_error
27
28 try:
29 import urllib.parse as compat_urllib_parse
30 except ImportError: # Python 2
31 import urllib as compat_urllib_parse
32
33 try:
34 from urllib.parse import urlparse as compat_urllib_parse_urlparse
35 except ImportError: # Python 2
36 from urlparse import urlparse as compat_urllib_parse_urlparse
37
38 try:
39 import http.cookiejar as compat_cookiejar
40 except ImportError: # Python 2
41 import cookielib as compat_cookiejar
42
43 try:
44 import html.entities as compat_html_entities
45 except ImportError: # Python 2
46 import htmlentitydefs as compat_html_entities
47
48 try:
49 import html.parser as compat_html_parser
50 except ImportError: # Python 2
51 import HTMLParser as compat_html_parser
52
53 try:
54 import http.client as compat_http_client
55 except ImportError: # Python 2
56 import httplib as compat_http_client
57
# compat_subprocess_get_DEVNULL() returns something usable as a "discard
# output" target for subprocess calls: the subprocess.DEVNULL constant when
# the running interpreter provides it, otherwise a newly opened writable file
# object on the platform's null device.
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a new file object opened for writing on os.path.devnull."""
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL constant."""
        return DEVNULL
63
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with the given encoding/errors, so multi-byte sequences
        survive. Malformed escapes are kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (TypeError, ValueError):
                # Python 2's 'hex' codec signals non-hexadecimal digits and
                # odd-length input with TypeError, not ValueError; catch both
                # so a malformed escape is kept as literal text instead of
                # aborting the whole parse.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'. Fields with an empty value are
        dropped unless keep_blank_values is set; with strict_parsing a field
        without '=' raises ValueError.
        """
        _coerce_result = unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            parsed_result.setdefault(name, []).append(value)
        return parsed_result
142
# Text string type: the ``unicode`` builtin where it exists (Python 2),
# otherwise plain ``str``.
try:
    compat_str = unicode
except NameError:
    compat_str = str

# Code point -> one-character text string: ``unichr`` where it exists
# (Python 2), otherwise ``chr``.
try:
    compat_chr = unichr
except NameError:
    compat_chr = chr
152
def compat_ord(c):
    """Return c unchanged when it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
156
# The type of compiled patterns is not clearly defined otherwise, so it is
# taken from a freshly compiled pattern.
compiled_regex_type = type(re.compile(''))

# Headers that YoutubeDLHandler.http_request adds to each request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
167
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; an unknown or unusable name raises here.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any lookup/codec failure means "fall back"),
        # but no longer a bare except, so KeyboardInterrupt and SystemExit
        # are not swallowed.
        pref = 'UTF-8'

    return pref
181
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded in the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
189
# json.dump writes to a character stream in Python 3.x but expects a
# bytestream in Python 2.x, so the file is opened differently per version.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as out:
            json.dump(obj, out)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (binary mode)."""
        with open(fn, 'wb') as out:
            json.dump(obj, out)
200
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities are looked up in compat_html_entities.name2codepoint;
    numeric references are accepted in decimal (#47) and hexadecimal
    (#x2F / #X2f) form. Anything else, including numeric references whose
    code point cannot be turned into a character, is returned as the
    literal '&...;' text.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference. The whole entity must match: hexadecimal
    # references may contain the digits a-f, and trailing garbage must not be
    # silently dropped.
    mobj = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hexstr, decstr = mobj.groups()
        if hexstr is not None:
            codepoint = int(hexstr, 16)
        else:
            codepoint = int(decstr, 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Code point outside the range the interpreter supports; fall
            # through and keep the reference as-is.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
225
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    After loads(), get_result() returns the text between the opening tag
    whose `attribute` equals `value` and its matching closing tag.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # [tag, (line, col) after the opening tag, (line, col) of the closing tag]
        self.result = None
        # True while inside the matched element.
        self.started = False
        # Open-tag counter per tag name, maintained while started.
        self.depth = {}
        self.html = None
        # True until the first parser event after the matched opening tag.
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Skip the current line and resume parsing, giving up after repeated
        errors or when the error occurs inside the matched element."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        remaining_lines = self.html.split('\n')[self.getpos()[0]:] # skip one line
        self.rawdata = '\n'.join(remaining_lines)
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document `html`."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element is complete once every opened tag of its kind is closed.
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped content of the matched element, or None when
        no complete element was recorded."""
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1], self.result[2]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line, whose head was already cut off.
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (
        lambda self, i:
            i + len("</scr'+'ipt>")
            if self.rawdata[i:].startswith("</scr'+'ipt>")
            else compat_html_parser.HTMLParser.parse_endtag(self, i)
    )
298
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is an attribute lookup on the "id" attribute.
    return get_element_by_attribute(attribute="id", value=id, html=html)
302
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Parse errors are ignored; whatever the parser recorded before the error
    is returned (None if no complete element was recorded).
    """
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through to the result collected so far.
        pass
    return finder.get_result()
311
312
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning in HTML; line breaks come from markup.
    text = html.replace('\n', ' ')
    substitutions = (
        # <br> and paragraph boundaries become newlines
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
324
325
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, the characters that
    win32 forbids in file names are replaced by '#' in the last path
    component and the open is retried once with that name. If the name needs
    no change, or the first failure was a permission error (EACCES), the
    original exception is re-raised; a failure of the retry propagates to
    the caller, like the standard open() function.

    The special filename u'-' yields standard output (its binary buffer when
    available) instead of a file.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars. Only the
        # final component is rewritten: the directory part legitimately
        # contains path separators (and possibly a drive colon), which the
        # pattern would otherwise destroy.
        head, tail = os.path.split(filename)
        alt_tail = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail)
        if alt_tail == tail:
            # Nothing to change, so retrying would fail the same way.
            raise
        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
359
360
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
368
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        """Map one character to its replacement (possibly empty)."""
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are kept exactly as the per-character mapping left them.
        return result

    # Collapse runs of underscores and trim them from both ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
400
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of each element, in order.
    Elements are compared by equality, so they need not be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
408
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in s via htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
417
def encodeFilename(s):
    """
    Return s in the form expected by the file APIs of this interpreter.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Everywhere else, encode with the file system encoding, silently
    # dropping characters it cannot represent.
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    return s.encode(fs_encoding, 'ignore')
439
def decodeOption(optval):
    """Return optval as a text string.

    None is passed through; bytes are decoded with the preferred encoding.
    """
    if optval is None:
        return None

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
448
def formatSeconds(secs):
    """Format a number of seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The shortest form that fits is used. The thresholds are inclusive, so
    exactly one minute is '1:00' (not '60') and exactly one hour is
    '1:00:00' (not '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
456
def make_HTTPS_handler(opts):
    """Build the HTTPS handler for the opener.

    On Python >= 3.2 the handler gets an SSL context that loads the default
    verify paths and requires a certificate unless
    opts.no_check_certificate is set. Older interpreters get a plain
    HTTPSHandler.
    """
    if sys.version_info >= (3, 2):
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        if opts.no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        else:
            context.verify_mode = ssl.CERT_REQUIRED
        return compat_urllib_request.HTTPSHandler(context=context)

    # Python's 2.x handler is very simplistic
    return compat_urllib_request.HTTPSHandler()
470
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # An error raised while one of these is being handled counts as
        # expected, whatever the caller passed.
        expected_causes = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_causes:
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if
        no traceback was given."""
        tb = self.traceback
        return None if tb is None else u''.join(traceback.format_tb(tb))
491
492
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
504
505
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
513
514
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # The message is kept on the instance as .msg.
        self.msg = msg
523
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
527
528
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
536
537
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults; both values are byte counts.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.expected = expected
        self.downloaded = downloaded
552
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a
        zlib-wrapped stream second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, whether or
        not the addinfourl class accepts it as a constructor argument."""
        response_cls = compat_urllib_request.addinfourl
        if hasattr(response_cls, 'getcode'):
            return response_cls(stream, headers, url, code)
        response = response_cls(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        """Add the standard headers and resolve the Youtubedl-* pseudo headers."""
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            # The caller asked for an uncompressed response.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            # The caller supplied its own user agent for this request.
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return a response whose body is decompressed according to its
        Content-encoding header (gzip or deflate); other responses are
        returned unchanged."""
        original = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(body, original.headers, original.url, original.code)
            resp.msg = original.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            body = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(body, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp

    # HTTPS traffic goes through the same processing.
    https_request = http_request
    https_response = http_response
618
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Commas and a trailing UTC offset (" +0000", " -0500") are ignored. The
    known date formats are tried in order and the first one that parses
    wins. Returns None when no format matches.
    """
    #Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
    for expression in format_expressions:
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
        except ValueError:
            # strptime reports a non-matching format with ValueError; any
            # other exception is a real error and must not be hidden.
            continue
        return parsed.strftime('%Y%m%d')
    return None
633
def determine_ext(url):
    """Guess the file extension from a URL.

    The guess is whatever follows the last '.' before the query string; it
    is returned only if it is purely alphanumeric, otherwise
    u'unknown_video' is returned.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return u'unknown_video'
    return candidate
640
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Relative offsets are applied to today's date; a month counts as 30 days
    and a year as 365 days. A string in neither format raises ValueError
    (from strptime).
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain string literal is an invalid escape sequence.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # timedelta has no month/year units, so approximate them in days.
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta keyword arguments are plural: days=, weeks=.
        unit += 's'
        delta = datetime.timedelta(**{unit: amount})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
666
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest / latest representable
        date. Raises ValueError when start lies after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            day = date
        else:
            # Anything else is parsed with date_from_str.
            day = date_from_str(date)
        return self.start <= day <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))