Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
Imported Upstream version 2013.11.11
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import email.utils
6 import errno
7 import gzip
8 import io
9 import json
10 import locale
11 import os
12 import pipes
13 import platform
14 import re
15 import socket
16 import sys
17 import traceback
18 import zlib
19
20 try:
21 import urllib.request as compat_urllib_request
22 except ImportError: # Python 2
23 import urllib2 as compat_urllib_request
24
25 try:
26 import urllib.error as compat_urllib_error
27 except ImportError: # Python 2
28 import urllib2 as compat_urllib_error
29
30 try:
31 import urllib.parse as compat_urllib_parse
32 except ImportError: # Python 2
33 import urllib as compat_urllib_parse
34
35 try:
36 from urllib.parse import urlparse as compat_urllib_parse_urlparse
37 except ImportError: # Python 2
38 from urlparse import urlparse as compat_urllib_parse_urlparse
39
40 try:
41 import urllib.parse as compat_urlparse
42 except ImportError: # Python 2
43 import urlparse as compat_urlparse
44
45 try:
46 import http.cookiejar as compat_cookiejar
47 except ImportError: # Python 2
48 import cookielib as compat_cookiejar
49
50 try:
51 import html.entities as compat_html_entities
52 except ImportError: # Python 2
53 import htmlentitydefs as compat_html_entities
54
55 try:
56 import html.parser as compat_html_parser
57 except ImportError: # Python 2
58 import HTMLParser as compat_html_parser
59
60 try:
61 import http.client as compat_http_client
62 except ImportError: # Python 2
63 import httplib as compat_http_client
64
65 try:
66 from urllib.error import HTTPError as compat_HTTPError
67 except ImportError: # Python 2
68 from urllib2 import HTTPError as compat_HTTPError
69
70 try:
71 from urllib.request import urlretrieve as compat_urlretrieve
72 except ImportError: # Python 2
73 from urllib import urlretrieve as compat_urlretrieve
74
75
# subprocess.DEVNULL is not importable on every supported interpreter; when
# it is missing, hand out a writable file object opened on the null device.
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a new file object opened for writing on os.path.devnull."""
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL constant."""
        return DEVNULL
81
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with `encoding`/`errors`, so multi-byte sequences survive.
        Malformed escapes are kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No percent sign at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (ValueError, TypeError):
                # Python 2's hex codec signals non-hexadecimal or odd-length
                # input with TypeError, not ValueError; treat both as
                # "not an escape" and keep the text verbatim.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. With strict_parsing a field
        without '=' raises ValueError; otherwise it is dropped unless
        keep_blank_values is set.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to its list of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
160
# Text type and "code point -> character" function: Python 2 calls them
# unicode/unichr, elsewhere the builtins str/chr are used.
try:
    compat_str, compat_chr = unicode, unichr  # Python 2
except NameError:
    compat_str, compat_chr = str, chr
170
def compat_ord(c):
    """Return c unchanged when it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
174
# The type of a compiled pattern is not clearly defined otherwise, so take
# it from an actual compiled pattern object.
compiled_regex_type = type(re.compile(''))

# Standard headers; YoutubeDLHandler.http_request adds them to every request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
185
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    cannot be queried or names an encoding that cannot encode a test
    string, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: unknown or broken encodings raise here
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad fallback, but no longer a bare except, so
        # KeyboardInterrupt and SystemExit still propagate.
        pref = 'UTF-8'

    return pref
199
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s; anything but a text string fails the assertion."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded in the preferred encoding.

        Characters the encoding cannot represent are emitted as XML
        character references.
        """
        encoded = s.encode(preferredencoding(), 'xmlcharrefreplace')
        print(encoded)
207
# json.dump wants a byte stream on Python 2.x and a character stream on
# Python 3.x, so the file is opened differently on each.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text file)."""
        with open(fn, 'w', encoding='utf-8') as json_file:
            json.dump(obj, json_file)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (binary file)."""
        with open(fn, 'wb') as json_file:
            json.dump(obj, json_file)
218
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first node matching xpath whose attribute key equals val """
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are spliced into the expression unquoted, so only
        # harmless characters are accepted.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
232
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{uri}tag' using ns_map.

    Steps without a prefix are copied unchanged. A prefix missing from
    ns_map raises KeyError.
    """
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            expanded.append(pieces[0])
            continue
        prefix, tag = pieces
        expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
245
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities and decimal (#47) or hexadecimal (#x2F) character
    references are converted; anything else is returned in its literal
    '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references need the full hex digit range after the 'x';
    # matching only decimal digits there would cut '#x2F' down to '#x2'.
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the supported range: keep the text as is
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
270
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps the document it was fed in self.html."""

    def __init__(self):
        # This must be spelled __init__: under any other name the method is
        # never invoked and self.html stays undefined until loads() runs.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Remember html, feed it to the parser and finish parsing."""
        self.html = html
        self.feed(html)
        self.close()
281
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start position, end position] once fully located
        self.result = None
        # True while parsing inside the element that matched
        self.started = False
        # Number of currently open tags, per tag name, since the match
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by dropping input and resuming.

        Gives up with HTMLParseError after more than 10 errors, or as soon
        as an error occurs inside the element of interest.
        """
        if self.started or self.error_count > 10:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        remaining_lines = self.html.split('\n')[current_line:]
        self.rawdata = '\n'.join(remaining_lines) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attr_map and attr_map[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element ends when every tag of its own kind has been closed
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())
    # Whatever token follows the matching start tag marks the content start
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None unless both positions were recorded."""
        if self.result is None or len(self.result) != 3:
            return None
        start_line, start_col = self.result[1]
        end_line, end_col = self.result[2]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # The only line was already cut at the front; shift the end offset
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _parse_endtag_skipping_split_script(self, i):
        """Step over a literal "</scr'+'ipt>" instead of parsing it as an end tag."""
        if self.rawdata[i:].startswith("</scr'+'ipt>"):
            return i + len("</scr'+'ipt>")
        return compat_html_parser.HTMLParser.parse_endtag(self, i)
    AttrParser.parse_endtag = _parse_endtag_skipping_split_script
347
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id in the
    passed HTML document (see get_element_by_attribute)"""
    content = get_element_by_attribute("id", id, html)
    return content
351
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag whose attribute has the given value in
    the passed HTML document.

    Parse errors are ignored; whatever the parser found before the error
    is returned.
    """
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through with what was collected so far
        pass
    return attr_parser.get_result()
360
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that picks the content attribute of the meta tag
    whose name attribute matches the requested name.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching meta tag overwrites an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute found, or None."""
        return self.result
381
def get_meta_content(name, html):
    """
    Return the content attribute of the meta tag whose name attribute is
    name. Parse errors are ignored.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: return whatever was found before the error
        pass
    return meta_parser.get_result()
392
393
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Literal newlines carry no meaning in HTML; only <br> and paragraph
    # boundaries become line breaks.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),          # <br />
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),    # </p><p>
        ('<.*?>', ''),                             # any remaining tag
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
405
406
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it replaces the
    characters that are forbidden in win32 file names by '#' in the last
    path component and tries again; if that fails too, the exception is
    raised like the standard open() function would.

    The filename u'-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A different file name does not help against missing permissions
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the last component is rewritten: the directory part must keep
        # its separators (and drive colon) to stay a usable path.
        dir_part, name_part = os.path.split(filename)
        alt_filename = os.path.join(
            dir_part,
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', name_part))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
440
441
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
449
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            # Shell-sensitive characters, whitespace and non-ASCII
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are kept as close to the original as possible
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
481
def orderedSet(iterable):
    """ Return a list with the elements of iterable in first-seen order and
    all duplicates removed (compared by equality, so unhashable elements
    are fine) """
    unique = []
    for element in iterable:
        if element in unique:
            continue
        unique.append(element)
    return unique
489
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in s through htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
498
def encodeFilename(s):
    """
    Return s in the form the file APIs of the running platform expect.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    # Characters the file system encoding cannot represent are dropped
    return s.encode(fs_encoding, 'ignore')
520
def decodeOption(optval):
    """Return optval as a text string.

    None is passed through; bytes are decoded with the preferred encoding.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
529
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The larger unit is used as soon as it is reached, so exactly one hour
    is '1:00:00' (not '60:00') and exactly one minute is '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
537
def make_HTTPS_handler(opts):
    """Return an HTTPSHandler for the running Python version.

    From Python 3.2 on the handler gets an SSL context loaded with the
    default verify paths; certificates are required unless
    opts.no_check_certificate is set. Older versions get a plain handler.
    """
    if sys.version_info >= (3, 2):
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        if opts.no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        else:
            context.verify_mode = ssl.CERT_REQUIRED
        return compat_urllib_request.HTTPSHandler(context=context)

    # Python's 2.x handler is very simplistic
    return compat_urllib_request.HTTPSHandler()
551
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        An error raised while a URLError, socket.timeout or UnavailableVideoError
        is being handled always counts as expected.
        """

        always_expected = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in always_expected:
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        formatted_entries = traceback.format_tb(self.traceback)
        return u''.join(formatted_entries)
573
574
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
578
579
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
591
592
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
600
601
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """
    def __init__(self, msg):
        # The message is kept on the instance as .msg
        self.msg = msg
610
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
614
615
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
623
624
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Number of bytes received / number of bytes announced
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
639
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class adds the standard
    headers to every HTTP request and transparently decodes gzipped and
    deflated responses from web servers. To avoid compression for a
    particular request, the calling code only has to include the HTTP
    header "Youtubedl-No-Compression" in the original request; it is
    removed before the real request is made. Likewise, a
    "Youtubedl-User-Agent" header overrides the User-Agent and is removed.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data, trying a raw deflate stream first and a
        zlib-wrapped stream second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying a status code, also where
        the constructor does not take one."""
        addinfourl = compat_urllib_request.addinfourl
        if not hasattr(addinfourl, 'getcode'):
            wrapped = addinfourl(stream, headers, url)
            wrapped.code = code
            return wrapped
        return addinfourl(stream, headers, url, code)

    def http_request(self, req):
        # Standard headers replace whatever the request already carried
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        # Internal marker headers: act on them, then strip them
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        original = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            compressed = resp.read()
            try:
                plain = io.BytesIO(
                    gzip.GzipFile(fileobj=io.BytesIO(compressed), mode='rb').read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for trim in range(1, 1024):
                    try:
                        plain = io.BytesIO(
                            gzip.GzipFile(fileobj=io.BytesIO(compressed[:-trim]), mode='rb').read())
                    except IOError:
                        continue
                    break
                else:
                    # No amount of trimming helped: report the first failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(plain, original.headers, original.url, original.code)
            resp.msg = original.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp

    https_request = http_request
    https_response = http_response
720
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Commas and a trailing UTC offset are ignored. Returns None when
    date_str matches none of the known formats.
    """
    #Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            # The first format that parses wins; no need to try the rest
            return datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Not this format (strptime reports a mismatch as ValueError)
            continue
    return None
745
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The guess is the text after the last dot of the part before the first
    '?'; it is returned only if it is purely alphanumeric, otherwise
    default_ext is returned.
    """
    before_query = url.partition(u'?')[0]
    candidate = before_query.rpartition(u'.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
752
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
755
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months count as 30 days and years as 365 days.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')
    #A bad aproximation?
    approximate_days = {'month': 30, 'year': 365}
    if unit in approximate_days:
        amount *= approximate_days[unit]
        unit = 'day'
    # timedelta takes plural keyword names: days=..., weeks=...
    offset = datetime.timedelta(**{unit + 's': amount})
    return today + offset
781
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the smallest / largest
        representable date"""
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day,day)

    def __contains__(self, date):
        """Check if the date is in the range (both bounds included)"""
        candidate = date
        if not isinstance(candidate, datetime.date):
            candidate = date_from_str(candidate)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
807
808
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode a byte string result with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
817
818
def write_string(s, out=None):
    """Write the text string s to out (sys.stderr by default) and flush.

    s is encoded with the preferred encoding first when out reports a
    binary mode, and always on Python 2.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == type(u'')

    payload = s
    if ('b' in getattr(stream, 'mode', '') or
            sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
        payload = s.encode(preferredencoding(), 'ignore')
    stream.write(payload)
    stream.flush()
829
830
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing gave a one-character string (Python 2)
        return [ord(c) for c in bs]
    return list(bs) # Python 3
838
839
def intlist_to_bytes(xs):
    """Return the byte string made of the byte values in xs."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already yields one-byte strings
    return ''.join([chr(x) for x in xs])
847
848
def get_cachedir(params=None):
    """Return the cache directory.

    params is an optional dict; its 'cachedir' entry wins when present.
    Otherwise the result is 'youtube-dl' below $XDG_CACHE_HOME, or below
    ~/.cache when that variable is not set.
    """
    # None instead of a shared mutable {} default
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
853
854
855 # Cross-platform file locking
856 if sys.platform == 'win32':
857 import ctypes.wintypes
858 import msvcrt
859
860 class OVERLAPPED(ctypes.Structure):
861 _fields_ = [
862 ('Internal', ctypes.wintypes.LPVOID),
863 ('InternalHigh', ctypes.wintypes.LPVOID),
864 ('Offset', ctypes.wintypes.DWORD),
865 ('OffsetHigh', ctypes.wintypes.DWORD),
866 ('hEvent', ctypes.wintypes.HANDLE),
867 ]
868
869 kernel32 = ctypes.windll.kernel32
870 LockFileEx = kernel32.LockFileEx
871 LockFileEx.argtypes = [
872 ctypes.wintypes.HANDLE, # hFile
873 ctypes.wintypes.DWORD, # dwFlags
874 ctypes.wintypes.DWORD, # dwReserved
875 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
876 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
877 ctypes.POINTER(OVERLAPPED) # Overlapped
878 ]
879 LockFileEx.restype = ctypes.wintypes.BOOL
880 UnlockFileEx = kernel32.UnlockFileEx
881 UnlockFileEx.argtypes = [
882 ctypes.wintypes.HANDLE, # hFile
883 ctypes.wintypes.DWORD, # dwReserved
884 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
885 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
886 ctypes.POINTER(OVERLAPPED) # Overlapped
887 ]
888 UnlockFileEx.restype = ctypes.wintypes.BOOL
889 whole_low = 0xffffffff
890 whole_high = 0x7fffffff
891
892 def _lock_file(f, exclusive):
893 overlapped = OVERLAPPED()
894 overlapped.Offset = 0
895 overlapped.OffsetHigh = 0
896 overlapped.hEvent = 0
897 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
898 handle = msvcrt.get_osfhandle(f.fileno())
899 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
900 whole_low, whole_high, f._lock_file_overlapped_p):
901 raise OSError('Locking file failed: %r' % ctypes.FormatError())
902
903 def _unlock_file(f):
904 assert f._lock_file_overlapped_p
905 handle = msvcrt.get_osfhandle(f.fileno())
906 if not UnlockFileEx(handle, 0,
907 whole_low, whole_high, f._lock_file_overlapped_p):
908 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
909
910 else:
911 import fcntl
912
913 def _lock_file(f, exclusive):
914 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
915
916 def _unlock_file(f):
917 fcntl.lockf(f, fcntl.LOCK_UN)
918
919
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened on construction; entering locks it (shared for
    mode 'r', exclusive for 'a' and 'w'), leaving unlocks and closes it.
    """
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 lock helper raises OSError, which Python 2 does not
            # treat as IOError; catch both so the file never stays open
            # after a failed lock.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()."""
        return self.f.read(*args)
949
950
def shell_quote(args):
    """Return the strings in args quoted for a POSIX shell and joined by spaces."""
    # shlex.quote is the documented spelling; pipes.quote is kept only as
    # the fallback for interpreters without it (Python 2).
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote
    return ' '.join(map(quote, args))
953
954
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but the first element for which pred is
    false is yielded too before the iteration stops """
    for element in seq:
        yield element
        if pred(element):
            continue
        break
962
963
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a '#'-separated
    '__youtubedl_smuggle' query field. """

    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    encoded_payload = compat_urllib_parse.urlencode(payload)
    return url + u'#' + encoded_payload
970
971
def unsmuggle_url(smug_url):
    """Split a URL built by smuggle_url into (url, data).

    data is None when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    # Everything after the last '#' is the encoded payload
    url, _, encoded_payload = smug_url.rpartition(u'#')
    json_text = compat_parse_qs(encoded_payload)[u'__youtubedl_smuggle'][0]
    return url, json.loads(json_text)