]> Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
debian/control: Update long description with list of supported sites.
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import email.utils
6 import errno
7 import gzip
8 import io
9 import json
10 import locale
11 import os
12 import pipes
13 import platform
14 import re
15 import socket
16 import sys
17 import traceback
18 import zlib
19
20 try:
21 import urllib.request as compat_urllib_request
22 except ImportError: # Python 2
23 import urllib2 as compat_urllib_request
24
25 try:
26 import urllib.error as compat_urllib_error
27 except ImportError: # Python 2
28 import urllib2 as compat_urllib_error
29
30 try:
31 import urllib.parse as compat_urllib_parse
32 except ImportError: # Python 2
33 import urllib as compat_urllib_parse
34
35 try:
36 from urllib.parse import urlparse as compat_urllib_parse_urlparse
37 except ImportError: # Python 2
38 from urlparse import urlparse as compat_urllib_parse_urlparse
39
40 try:
41 import urllib.parse as compat_urlparse
42 except ImportError: # Python 2
43 import urlparse as compat_urlparse
44
45 try:
46 import http.cookiejar as compat_cookiejar
47 except ImportError: # Python 2
48 import cookielib as compat_cookiejar
49
50 try:
51 import html.entities as compat_html_entities
52 except ImportError: # Python 2
53 import htmlentitydefs as compat_html_entities
54
55 try:
56 import html.parser as compat_html_parser
57 except ImportError: # Python 2
58 import HTMLParser as compat_html_parser
59
60 try:
61 import http.client as compat_http_client
62 except ImportError: # Python 2
63 import httplib as compat_http_client
64
65 try:
66 from urllib.error import HTTPError as compat_HTTPError
67 except ImportError: # Python 2
68 from urllib2 import HTTPError as compat_HTTPError
69
70 try:
71 from urllib.request import urlretrieve as compat_urlretrieve
72 except ImportError: # Python 2
73 from urllib import urlretrieve as compat_urlretrieve
74
75
# compat_subprocess_get_DEVNULL() returns something usable as a "discard"
# stream for subprocess calls: subprocess.DEVNULL when the running Python
# provides it, otherwise a freshly opened writable handle on os.devnull
# (a new file object is opened on every call in that case).
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
81
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in `string` by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with `encoding`/`errors` (None selects 'utf-8' / 'replace').
        Malformed escapes are left in the output unchanged.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (ValueError, TypeError):
                # Python 2's hex codec signals malformed input ("%zz", "%a")
                # with TypeError rather than ValueError; treat both as
                # "not an escape" and keep the text literally.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'. Fields without '=' raise
        ValueError under strict_parsing, are kept with an empty value under
        keep_blank_values, and are dropped otherwise. Fields with an empty
        value are dropped unless keep_blank_values is set.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to the list of
        its values, in order of appearance."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
160
# compat_str: the text (unicode) string type of the running interpreter.
# `unicode` only exists as a builtin on Python 2; the NameError fallback
# selects `str` elsewhere.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: code point -> one-character text string (unichr on Python 2,
# chr where unichr does not exist).
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
170
def compat_ord(c):
    """Return the integer value of `c`.

    Values whose type is exactly int are passed through untouched (this is
    what indexing a bytes object yields on Python 3); anything else is
    handed to ord().
    """
    return c if type(c) is int else ord(c)
174
# This is not clearly defined otherwise
# (obtained by taking the type of a freshly compiled empty pattern)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds every entry of
# this dict to each outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
185
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.

    The reported encoding is probed by encoding a short test string; if the
    lookup or the probe fails for any reason, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (unknown codec, broken locale, ...), but no
        # longer a bare except: KeyboardInterrupt/SystemExit must propagate.
        pref = 'UTF-8'

    return pref
199
def compat_print(s):
    """Print the text string `s` to standard output.

    On Python 2 the string is first encoded with the preferred encoding,
    replacing unencodable characters by XML character references. On
    Python 3 `s` must be a text string and is printed as is.
    """
    if sys.version_info < (3,0):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
    else:
        assert type(s) == type(u'')
        print(s)
207
def write_json_file(obj, fn):
    """Serialize `obj` as JSON into the file named `fn`.

    In Python 2.x, json.dump expects a bytestream, so the file is opened in
    binary mode. In Python 3.x, it writes to a character stream, so the file
    is opened as UTF-8 text.
    """
    if sys.version_info < (3,0):
        stream = open(fn, 'wb')
    else:
        stream = open(fn, 'w', encoding='utf-8')
    with stream as f:
        json.dump(obj, f)
218
def find_xpath_attr(node, xpath, key, val):
    """ Find the xpath xpath[@key=val]

    Returns the first element below `node` that matches `xpath` and whose
    attribute `key` equals `val`, or None when there is none. From Python
    2.7 on the lookup is a single attribute-predicate query (key and val
    are asserted to be plain enough to embed in the expression); on older
    interpreters the candidates are filtered by hand.
    """
    if sys.version_info >= (2,7):
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        return node.find(xpath + u"[@%s='%s']" % (key, val))

    candidates = (f for f in node.findall(xpath) if f.attrib.get(key) == val)
    return next(candidates, None)
232
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of `path` into '{uri}tag' form.

    `ns_map` maps prefixes to namespace URIs. Steps without a colon are
    copied unchanged; an unknown prefix raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        ns, tag = pieces
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join([expand(step) for step in path.split('/')])
245
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity name
    without the surrounding '&' and ';'.

    Named entities and decimal (#65) or hexadecimal (#x2F, #X2f) character
    references are converted; anything else, including references to code
    points the interpreter cannot represent, is returned in its literal
    '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric reference. The whole entity must match: the previous pattern
    # '#(x?\d+)' stopped at the first hex letter, so '#x2F' was decoded as
    # U+0002 instead of '/'.
    mobj = re.match(u'(?u)#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hexstr, decstr = mobj.groups()
        if hexstr is not None:
            codepoint = int(hexstr, 16)
        else:
            codepoint = int(decstr, 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Code point out of range: keep the literal text below
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
270
# Replace the html parser module's start-tag-end locator pattern with a
# fixed version for every parser created afterwards. NOTE(review): the
# pattern is compiled with re.VERBOSE - confirm that is intended, since it
# changes how whitespace and '#' inside the pattern are interpreted.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was asked to parse.

    The text passed to loads() is kept in self.html so subclasses can slice
    the original markup using positions reported by getpos().
    """
    def __init__(self):
        # This method used to be misspelled "__init" and therefore never
        # ran, leaving self.html undefined until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse the complete document `html` in one go."""
        self.html = html
        self.feed(html)
        self.close()
281
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    While parsing, self.result grows into [tag, start_pos, end_pos], where
    the positions are getpos() values; get_result() cuts the text between
    them out of the original document.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None              # [tag, start_pos, end_pos] once complete
        self.started = False            # True while inside the wanted element
        self.depth = {}                 # open-tag counters per tag name
        self.watch_startpos = False     # start position still to be recorded
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by restarting after the current line.

        Gives up (raises HTMLParseError) once more than 10 errors were seen
        or when the wanted element has already been entered.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag is the first thing after the opening tag
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The wanted element is closed once its own tag counter is back to 0
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser callback merely records the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None when no complete element was found."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (line, column) pairs; the line part is 1-based here
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column has to be
            # shifted by what was just cut off the front
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, AttrParser steps over the literal text
# "</scr'+'ipt>" by itself instead of handing it to the stock
# HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
347
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document

    Shorthand for get_element_by_attribute("id", id, html).
    """
    return get_element_by_attribute("id", id, html)
351
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Parse errors are ignored: whatever AttrParser collected before the
    error is returned (None when no complete element was found).
    """
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep what has been parsed so far
        pass
    return parser.get_result()
360
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.

    After parsing, get_result() yields the content attribute of the last
    matching <meta> tag seen (None if there was no match).
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.result = None
        self.content = None
        self.name = name

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                self.result = attr_map.get('content')

    def get_result(self):
        return self.result
381
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.

    Parse errors are ignored; the result collected up to the error (possibly
    None) is returned.
    """
    parser = MetaParser(name)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep what has been parsed so far
        pass
    return parser.get_result()
392
393
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning in HTML
    html = html.replace('\n', ' ')
    substitutions = (
        # <br> and </p><p> boundaries become real newlines
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip all remaining html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities, then trim
    return unescapeHTML(html).strip()
405
406
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special name u'-' selects standard output (its binary buffer where
    available) instead of a file.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the last path component is rewritten: the directory part
        # legitimately contains separators (and a drive colon on Windows)
        # that the substitution would otherwise destroy.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
440
441
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
449
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped altogether
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        # Restricted mode also bans shell-special characters, whitespace and
        # everything outside ASCII
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept as close to the original as possible
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
481
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    original order. Elements are compared by equality, so they do not need
    to be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
489
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in s using htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
498
def encodeFilename(s):
    """
    Return s in the form expected by the file APIs of this interpreter.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API; so does Python 2 on Windows 2000 and up
    # when handed u'' strings directly.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    has_unicode_api = (
        sys.version_info >= (3, 0)
        or (sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5))
    if has_unicode_api:
        return s

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    # Characters the file system encoding cannot express are dropped
    return s.encode(encoding, 'ignore')
520
def decodeOption(optval):
    """Return the option value `optval` as a text string.

    None is passed through; byte strings are decoded with the preferred
    encoding.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
529
def formatSeconds(secs):
    """Format a duration given in seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The shortest form that fits is used: hours appear from 3600 seconds on,
    minutes from 60 seconds on. (The limits used to be exclusive, which
    rendered exactly one hour as '60:00' and exactly one minute as '60'.)
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
537
def make_HTTPS_handler(opts):
    """Create the HTTPS handler for the opener.

    From Python 3.2 on, the handler gets an SSL context that uses the
    default verification paths and requires a valid certificate unless
    opts.no_check_certificate is set. Older interpreters get a plain
    HTTPSHandler.
    """
    if sys.version_info >= (3,2):
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        if opts.no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        else:
            context.verify_mode = ssl.CERT_REQUIRED
        return compat_urllib_request.HTTPSHandler(context=context)

    # Python's 2.x handler is very simplistic
    return compat_urllib_request.HTTPSHandler()
551
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Errors raised while one of these is being handled are never bugs
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        current = sys.exc_info()
        if current[0] in expected_types:
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.cause = cause
        self.exc_info = sys.exc_info() # preserve original exception
        self.traceback = tb

    def format_traceback(self):
        """Return the stored traceback as one string, or None if there is none."""
        return (None if self.traceback is None
                else u''.join(traceback.format_tb(self.traceback)))
573
574
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept for callers that want to inspect the underlying failure
        self.exc_info = exc_info
586
587
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    that multiple files would have to be downloaded to the same file on disk.
    """
    pass
595
596
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # The message is exposed as the .msg attribute
        self.msg = msg
605
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
    pass
609
610
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.

    ExtractorError treats errors raised while this exception is being
    handled as expected ones.
    """
    pass
618
619
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None   # size actually received
    expected = None     # size that was expected

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
634
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Likewise, a "Youtubedl-user-agent" header on the original request
    replaces the User-agent header and is removed itself.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data: raw stream first, zlib-wrapped as fallback."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status `code`.

        When the addinfourl class has no getcode attribute, the object is
        built without the code and the code is attached afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Install the standard headers and apply the Youtubedl-* pseudo headers."""
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return `resp`, replaced by a decompressed copy if it was gzipped or deflated."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts works, report the very first error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic is processed exactly like HTTP traffic
    https_request = http_request
    https_response = http_response
715
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Commas and a trailing numeric UTC offset are removed, then the known
    date layouts are tried in order. Returns None when none of them fits.
    """
    #Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            # The first layout that parses wins; there is no point in trying
            # the remaining ones
            return datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Layout does not fit (or the date cannot be formatted): next one
            continue
    return None
740
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from `url`.

    The candidate is whatever follows the last dot of the URL with its
    query string removed; it is accepted only if it is purely alphanumeric,
    otherwise `default_ext` is returned.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
747
def subtitles_filename(filename, sub_lang, sub_format):
    """Return `filename` with its last extension replaced by
    '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
750
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if the string has neither format.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # timedelta knows neither months nor years; a bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta keyword arguments are plural: days=, weeks=
        unit += 's'
        delta = datetime.timedelta(**{unit: amount})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
776
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a missing bound defaults to the earliest / latest representable date"""
        self.start = (datetime.datetime.min.date() if start is None
                      else date_from_str(start))
        self.end = (datetime.datetime.max.date() if end is None
                    else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            # Anything that is not a date is parsed like start and end
            date = date_from_str(date)
        return self.start <= date and date <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
802
803
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
812
813
def write_string(s, out=None):
    """Write the text string `s` to `out` (sys.stderr by default) and flush.

    The string is encoded with the preferred encoding, dropping characters
    that cannot be encoded, when the stream is in binary mode or when
    running on Python 2.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == type(u'')

    # Python 2 lies about mode of sys.stderr
    wants_bytes = ('b' in getattr(out, 'mode', '') or
                   sys.version_info[0] < 3)
    if wants_bytes:
        s = s.encode(preferredencoding(), 'ignore')
    out.write(s)
    out.flush()
824
825
def bytes_to_intlist(bs):
    """Return the byte values of `bs` as a list of ints ([] for empty input)."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and one-character strings on Python 2
    yields_ints = isinstance(bs[0], int)
    return list(bs) if yields_ints else [ord(c) for c in bs]
833
834
def intlist_to_bytes(xs):
    """Return a byte string made of the byte values in `xs` (b'' for empty input)."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already yields one-byte strings
    return ''.join([chr(x) for x in xs])
842
843
def get_cachedir(params=None):
    """Return the cache directory.

    params['cachedir'] wins if present. Otherwise the directory is
    'youtube-dl' below $XDG_CACHE_HOME, or below ~/.cache when that
    variable is not set.
    """
    # None replaces the former mutable default argument ({})
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
848
849
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f) for an
# open file object f: through kernel32's LockFileEx/UnlockFileEx on win32,
# through fcntl.lockf everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed by pointer as the last argument of
        # LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high word of the byte count passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file behind f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # The pointer is stored on the file object; _unlock_file passes the
        # same one to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # NOTE(review): 0x2 is presumably LOCKFILE_EXCLUSIVE_LOCK - confirm
        # against the LockFileEx documentation
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock a file locked by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) lock on f."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
913
914
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened (with io.open) as soon as the object is created. On
    entering the block it is locked: exclusively for modes 'a' and 'w',
    non-exclusively for 'r'. On leaving the block it is unlocked and closed.
    """
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Do not leak the open file when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        """Iterate over the underlying file."""
        return iter(self.f)

    def write(self, *args):
        """Forwarded to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forwarded to the underlying file's read()."""
        return self.f.read(*args)
944
945
def shell_quote(args):
    """Return the sequence of strings `args` as one shell-escaped command line."""
    # pipes.quote is the Python 2 spelling; the pipes module has been
    # removed from recent Python 3 releases, where shlex.quote provides the
    # same function.
    try:
        from shlex import quote
    except ImportError: # Python 2
        from pipes import quote
    return ' '.join(map(quote, args))
948
949
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
        (the first element so that Not pred(e)) """
    for element in seq:
        yield element
        if pred(element):
            continue
        # First element failing the predicate: it has been yielded, stop here
        break
957
958
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is serialized as JSON, url-encoded under the key
    '__youtubedl_smuggle' and appended to `url` after a '#'.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
965
966
def unsmuggle_url(smug_url):
    """Split a URL built by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged, with None as data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    url, _, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)