Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
Imported Upstream version 2013.05.14
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import errno
5 import gzip
6 import io
7 import json
8 import locale
9 import os
10 import re
11 import sys
12 import traceback
13 import zlib
14 import email.utils
15 import json
16 import datetime
17
18 try:
19 import urllib.request as compat_urllib_request
20 except ImportError: # Python 2
21 import urllib2 as compat_urllib_request
22
23 try:
24 import urllib.error as compat_urllib_error
25 except ImportError: # Python 2
26 import urllib2 as compat_urllib_error
27
28 try:
29 import urllib.parse as compat_urllib_parse
30 except ImportError: # Python 2
31 import urllib as compat_urllib_parse
32
33 try:
34 from urllib.parse import urlparse as compat_urllib_parse_urlparse
35 except ImportError: # Python 2
36 from urlparse import urlparse as compat_urllib_parse_urlparse
37
38 try:
39 import http.cookiejar as compat_cookiejar
40 except ImportError: # Python 2
41 import cookielib as compat_cookiejar
42
43 try:
44 import html.entities as compat_html_entities
45 except ImportError: # Python 2
46 import htmlentitydefs as compat_html_entities
47
48 try:
49 import html.parser as compat_html_parser
50 except ImportError: # Python 2
51 import HTMLParser as compat_html_parser
52
53 try:
54 import http.client as compat_http_client
55 except ImportError: # Python 2
56 import httplib as compat_http_client
57
# compat_subprocess_get_DEVNULL() returns an object usable as a "discard
# everything" stream for subprocess calls.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    # subprocess has no DEVNULL here: fall back to opening os.devnull.
    # NOTE: every call opens a new file object that this module never closes.
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
63
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in *string* with the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with *encoding* / *errors*; malformed escapes are kept
        verbatim.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 only: str.decode('hex') turns two hex digits into one byte.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the text, including the '%', as is.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. Fields without a value are
        dropped unless *keep_blank_values* is true. With *strict_parsing*,
        a field without '=' raises ValueError instead of being skipped.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' stands for a space; it is replaced before unquoting.
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Values of repeated names are appended in order of appearance.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
142
# compat_str: the text (unicode) string type of the running interpreter.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: converts a code point to a one-character text string.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
152
# Default HTTP request headers (a browser-like set of values).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
160
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails, or the reported encoding cannot be used to encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad best-effort fallback, but unlike a bare
        # ``except`` it lets KeyboardInterrupt and SystemExit propagate.
        pref = 'UTF-8'

    return pref
174
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string *s*; anything but a text string is rejected."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print *s* encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
182
# json.dump wants a byte stream on Python 2.x and a character stream on
# Python 3.x, so the file is opened differently on each.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as out:
            json.dump(obj, out)
else:
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (binary mode)."""
        with open(fn, 'wb') as out:
            json.dump(obj, out)
193
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must contain the entity
    text without the surrounding '&' and ';'.

    Named entities and decimal ('#233') or hexadecimal ('#xe9') character
    references are converted to the character they denote. Anything else,
    including references to code points that cannot be represented, is
    returned unchanged in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric reference. Hex digits include a-f, and the whole entity must
    # match, so trailing text is never silently discarded.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|\\d+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the supported range: keep the literal text.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
218
# Replaces the regular expression the HTML parser module uses to locate the
# end of a start tag.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Once complete: [tag name, start position, end position], where the
        # positions are the tuples returned by getpos().
        self.result = None
        # True while the parser is inside the matching element.
        self.started = False
        # Number of currently open tags, per tag name, counted from the match on.
        self.depth = {}
        self.html = None
        # True between the matching start tag and the next parser event.
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by resuming on a later line.

        Gives up (raises HTMLParseError) after more than 10 errors, or when
        the error happens inside the element being extracted.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document *html*."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A child tag directly after the matching tag also marks the
            # start of its content.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete once every tag with the matched tag's
            # name has been closed again.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event records the start position in the same way.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the recorded start and end positions.

        Returns None when no complete element was recorded.
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (line number, column offset); the slice below treats
        # line numbers as 1-based.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on one line: the end offset has to be shifted
            # by what was just cut from the front.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    # On these versions, step over the literal text "</scr'+'ipt>" instead
    # of letting the parser treat it as an end tag.
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
291
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals *id*.

    Shorthand for get_element_by_attribute() with the attribute name fixed
    to "id"; *html* is the document to search.
    """
    return get_element_by_attribute("id", id, html)
295
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in *html* whose *attribute* equals *value*.

    Parse errors are ignored: whatever the parser collected before the error
    is used. Returns None when no complete element was found.
    """
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through and use what has been parsed so far.
        pass
    return finder.get_result()
304
305
def clean_html(html):
    """Turn an HTML snippet into readable plain text.

    Literal newlines become spaces, <br> tags and paragraph boundaries
    become newlines, all remaining tags are dropped and HTML entities are
    decoded. The result is stripped of surrounding whitespace.
    """
    text = html.replace('\n', ' ')
    substitutions = (
        # <br> and <br /> start a new line
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        # so does the boundary between two paragraphs
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # every other tag is simply removed
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
317
318
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails (for any reason other
    than a permission error), the characters that are forbidden in win32
    file names are replaced by '#' in the final path component and the open
    is retried once with that name. If the retry fails too, or there was
    nothing to replace, the exception propagates like with the standard
    open() function.

    The special name u'-' stands for standard output (its binary buffer when
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars. Only the
        # last path component is rewritten: the directory part legitimately
        # contains separators (and possibly a drive colon).
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
352
353
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
361
def sanitize_filename(s, restricted=False, is_id=False):
    """Make a string usable as part of a filename.

    With restricted set, a stricter subset of characters is allowed
    (no whitespace, no non-ASCII characters, fewer punctuation marks).
    Set is_id when s is an ID rather than arbitrary text: the result is
    then left as it is after the per-character replacement, so that the ID
    is kept if possible.
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped altogether.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    cleaned = u''.join(replace_insane(char) for char in s)
    if is_id:
        return cleaned

    # Squeeze runs of underscores and trim them from both ends.
    while '__' in cleaned:
        cleaned = cleaned.replace('__', '_')
    cleaned = cleaned.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    # Never return an empty name.
    if not cleaned:
        cleaned = '_'
    return cleaned
393
def orderedSet(iterable):
    """Return a list of the distinct elements of iterable, in first-seen order."""
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
401
def unescapeHTML(s):
    """
    Replace the HTML entities in s by the characters they stand for.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
410
def encodeFilename(s):
    """
    Return s in the form expected by the file system functions.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API. So has Python 2 on Windows 2000 and up,
    # where u'' strings are passed on unchanged.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    takes_unicode = (
        sys.version_info >= (3, 0)
        or (sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    )
    if takes_unicode:
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    return s.encode(fs_encoding, 'ignore')
432
def decodeOption(optval):
    """Return the option value optval as a text string.

    None is passed through unchanged; byte strings are decoded with the
    preferred encoding.
    """
    if optval is None:
        return optval

    if isinstance(optval, bytes):
        decoded = optval.decode(preferredencoding())
    else:
        decoded = optval

    assert isinstance(decoded, compat_str)
    return decoded
441
def formatSeconds(secs):
    """Format a duration given in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used: hours appear from 3600 seconds on,
    minutes from 60 seconds on.
    """
    # '>=' so that exactly one hour reads '1:00:00' rather than '60:00',
    # and exactly one minute reads '1:00' rather than '60'.
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
449
def make_HTTPS_handler(opts):
    """Build the HTTPS handler to install into the URL opener.

    opts.no_check_certificate decides whether server certificates are
    verified. On Python versions before 3.2 a plain HTTPSHandler is
    returned and the option is not consulted.
    """
    if sys.version_info < (3,2):
        # Python's 2.x handler is very simplistic
        return compat_urllib_request.HTTPSHandler()

    import ssl
    ssl_context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
    ssl_context.set_default_verify_paths()

    if opts.no_check_certificate:
        ssl_context.verify_mode = ssl.CERT_NONE
    else:
        ssl_context.verify_mode = ssl.CERT_REQUIRED
    return compat_urllib_request.HTTPSHandler(context=ssl_context)
463
class ExtractorError(Exception):
    """Raised when extracting the information about a video fails."""
    def __init__(self, msg, tb=None):
        """ tb, if given, is the traceback of the original error (so that it can be printed out). """
        super(ExtractorError, self).__init__(msg)
        # Remember the exception being handled at construction time, if any.
        self.exc_info = sys.exc_info()
        self.traceback = tb

    def format_traceback(self):
        """Return the stored traceback as text, or None when there is none."""
        tb = self.traceback
        return None if tb is None else u''.join(traceback.format_tb(tb))
476
477
class DownloadError(Exception):
    """Raised when a download fails.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, describes the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
489
490
class SameFileError(Exception):
    """Raised when several downloads would end up in the same file.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be written to the same file on disk.
    """
498
499
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        """ msg describes the failure; it is also available as the .msg attribute. """
        # Initialise the base class as well, so that str(err) and err.args
        # carry the message instead of being empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
508
class MaxDownloadsReached(Exception):
    """ Signals that the --max-downloads limit has been reached. """
512
513
class UnavailableVideoError(Exception):
    """Raised for an unavailable format.

    This exception is thrown when a video is requested in a format
    that is not available for that video.
    """
521
522
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """ downloaded is the number of bytes received, expected the number that was announced. """
        # Give the exception a readable message; without this str(err) is empty.
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
537
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress *data*, first as a raw deflate stream, then as zlib data."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response object that carries the status *code*."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        # addinfourl without getcode: build it without the code and set the
        # attribute afterwards.
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Apply the standard headers and the Youtubedl-* pseudo headers to *req*."""
        # The standard headers replace any value already set under the same key.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # "Youtubedl-no-compression" turns off Accept-encoding for this request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # "Youtubedl-user-agent" overrides the User-agent for this request.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return *resp*, wrapped so that reading it yields the decompressed body."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            # The whole body is read into memory before it is decompressed.
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic is processed in exactly the same way.
    https_request = http_request
    https_response = http_response
603
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD.

    Commas and a trailing UTC offset are ignored. Returns None when
    date_str is None or matches none of the known formats.
    """
    if date_str is None:
        return None
    #Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
    for expression in format_expressions:
        try:
            # The first format that matches determines the result.
            return datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Not in this format (or not representable): try the next one.
            continue
    return None
618
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError when the string is in neither format."""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' is not a valid escape sequence in a normal string literal.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        #A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword arguments ('days', 'weeks').
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
644
class DateRange(object):
    """A closed interval of dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing start or end leaves the range open on that side.
        """
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that consists of the given day only"""
        return cls(day,day)

    def __contains__(self, date):
        """Tell whether date (a date object or a date string) lies in the range"""
        when = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= when <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds