# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import gzip
import io
import itertools
import json
import locale
import math
import os
import pipes
import platform
import re
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_chr,
    compat_getenv,
    compat_html_entities,
    compat_parse_qs,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
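

# Illustrative usage sketch (added, not part of the original module): shows the
# intended call pattern for write_json_file; the file name and payload are made up.
def _example_write_json_file():
    state = {'version': '2014.11.02', 'downloads': 3}
    write_json_file(state, 'ytdl-example-state.json')  # atomic: temp file + os.rename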


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None


# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
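

# Illustrative usage sketch (added, not part of the original module): expands a
# namespaced path into ElementTree's '{uri}tag' form before querying. The 'media'
# prefix and URI below are made-up example values.
def _example_xpath_with_ns():
    doc = xml.etree.ElementTree.fromstring(
        '<root xmlns:media="http://example.com/media">'
        '<media:content url="http://example.com/v.mp4"/></root>')
    path = xpath_with_ns('media:content', {'media': 'http://example.com/media'})
    # path == '{http://example.com/media}content'
    return doc.find(path).attrib['url']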


def xpath_text(node, xpath, name=None, fatal=False):
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    n = node.find(xpath)
    if n is None:
        if fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n.text


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute("id", id, html)


def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)


def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = os.path.join(*(
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
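

# Illustrative usage sketch (added, not part of the original module): shows the
# effect of the restricted flag on a made-up title.
def _example_sanitize_filename():
    plain = sanitize_filename('AC/DC: Live', restricted=False)   # 'AC_DC - Live'
    strict = sanitize_filename('AC/DC: Live', restricted=True)   # 'AC_DC_-_Live'
    return plain, strict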


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x?[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python 3.2 and 3.3
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)


class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            if ytdl_is_updateable():
                update_cmd = 'type youtube-dl -U to update'
            else:
                update_cmd = 'see https://yt-dl.org/update on how to update'
            msg += '; please report this issue on https://yt-dl.org/bug .'
            msg += ' Make sure you are using the latest version; %s.' % update_cmd
            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h, v in std_headers.items():
            if h not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response


def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    m = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group(0))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
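

# Illustrative usage sketch (added, not part of the original module): timezone
# offsets are folded into the returned UNIX timestamp. The dates are made-up examples.
def _example_parse_iso8601():
    utc = parse_iso8601('2014-11-02T10:00:00Z')            # 1414922400
    shifted = parse_iso8601('2014-11-02T10:00:00+02:00')   # 1414915200 (two hours earlier in UTC)
    return utc, shifted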


def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
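

# Illustrative usage sketch (added, not part of the original module): different
# input styles normalize to the same YYYYMMDD string. The dates are made-up examples.
def _example_unified_strdate():
    a = unified_strdate('2014-11-02T10:00:00Z')  # '20141102'
    b = unified_strdate('2014-11-02')            # '20141102'
    return a, b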


def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # Months and years are approximated as 30 and 365 days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
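

# Illustrative usage sketch (added, not part of the original module): membership
# tests accept either date objects or the string formats of date_from_str.
# The bounds are made-up examples.
def _example_daterange():
    november = DateRange(start='20141101', end='20141130')
    return '20141115' in november, '20141201' in november   # (True, False)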
758 """ Returns the platform name as a compat_str """
759 res
= platform
.platform()
760 if isinstance(res
, bytes):
761 res
= res
.decode(preferredencoding())
763 assert isinstance(res
, compat_str
)


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
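

# Illustrative usage sketch (added, not part of the original module): locked_file is
# a context manager; the lock is taken in __enter__ and released (and the file
# closed) in __exit__. The file name is a made-up example.
def _example_locked_file():
    with locked_file('ytdl-example.lock', 'w', encoding='utf-8') as f:
        f.write('owned by this process\n')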


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
        (the first element so that Not pred(e)) """
    for e in seq:
        yield e
        if not pred(e):
            return


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    sdata = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
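

# Illustrative usage sketch (added, not part of the original module): smuggle_url and
# unsmuggle_url round-trip extra data through the URL fragment. The URL and payload
# are made-up examples.
def _example_smuggle_roundtrip():
    url = smuggle_url('http://example.com/video', {'referrer': 'http://example.com/'})
    plain_url, data = unsmuggle_url(url)
    # plain_url == 'http://example.com/video'; data == {'referrer': 'http://example.com/'}
    return plain_url, data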


def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
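

# Illustrative usage sketch (added, not part of the original module): sizes are
# rendered with binary (1024-based) units and two decimals.
def _example_format_bytes():
    return format_bytes(0), format_bytes(1536)   # ('0.00B', '1.50KiB')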


def get_term_width():
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except:
        pass
    return None


def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    try:
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)


def setproctitle(title):
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    if s.startswith(start):
        return s[len(start):]
    return s


def remove_end(s, end):
    if s.endswith(end):
        return s[:-len(end)]
    return s


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]


class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return "HEAD"


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    return default if v is None else (int(v) * invscale // scale)
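

# Illustrative usage sketch (added, not part of the original module): None is passed
# through to the default, and scale divides the parsed integer.
def _example_int_or_none():
    return int_or_none('1024'), int_or_none(None, default=0), int_or_none(2500, scale=1000)
    # (1024, 0, 2)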


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    return default if v is None else (float(v) * invscale / scale)


def parse_duration(s):
    if s is None:
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)^
        (?:
            (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
            (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        )?
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''', s)
    if not m:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
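

# Illustrative usage sketch (added, not part of the original module): both
# colon-separated and unit-suffixed durations match the pattern above.
def _example_parse_duration():
    return parse_duration('1:02:03'), parse_duration('3 min 5 s')   # (3723, 185)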


def prepend_extension(filename, ext):
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(name, ext, real_ext)


def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe


def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([0-9._-a-zA-Z]+)',
                    unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    else:
        return unrecognized


class PagedList(object):
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
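

# Illustrative usage sketch (added, not part of the original module): pagefunc
# returns the items of one page; getslice stops querying once it sees a short
# (non-full) page. The data source below is made up.
def _example_on_demand_paged_list():
    def pagefunc(pagenum):
        items = list(range(25))  # pretend backend with 25 items, 10 per page
        return items[pagenum * 10:(pagenum + 1) * 10]

    pl = OnDemandPagedList(pagefunc, 10)
    return pl.getslice(5, 12)   # [5, 6, 7, 8, 9, 10, 11]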


class InAdvancePagedList(PagedList):
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res


def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")


def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
.pack('!I', 0)
1291 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1292 def struct_pack(spec
, *args
):
1293 if isinstance(spec
, compat_str
):
1294 spec
= spec
.encode('ascii')
1295 return struct
.pack(spec
, *args
)
1297 def struct_unpack(spec
, *args
):
1298 if isinstance(spec
, compat_str
):
1299 spec
= spec
.encode('ascii')
1300 return struct
.unpack(spec
, *args
)
1302 struct_pack
= struct
.pack
1303 struct_unpack
= struct
.unpack


def read_batch_urls(batch_fd):
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]


def urlencode_postdata(*args, **kargs):
    return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')


try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')


def parse_xml(s):
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree


US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)


def strip_jsonp(code):
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)


def js_to_json(code):
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
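

# Illustrative usage sketch (added, not part of the original module): single-quoted
# strings and bare identifiers are rewritten so json.loads accepts the result.
def _example_js_to_json():
    return json.loads(js_to_json("{abc: 'def'}"))   # {'abc': 'def'}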


def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
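

# Illustrative usage sketch (added, not part of the original module): the returned
# callable maps a format id to its position in the preference list (higher is
# better) and -1 for unknown ids. The ids below are made-up examples.
def _example_qualities():
    q = qualities(['flv', 'sd', 'hd'])
    return q('hd'), q('flv'), q('4k')   # (2, 0, -1)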


DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'


def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s


def version_tuple(v):
    return [int(e) for e in v.split('.')]


def is_outdated_version(version, limit, assume_new=True):
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new


def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')