2 # -*- coding: utf-8 -*-
4 from __future__
import unicode_literals
30 import xml
.etree
.ElementTree
41 compat_urllib_parse_urlparse
,
42 compat_urllib_request
,
48 # This is not clearly defined otherwise
49 compiled_regex_type
= type(re
.compile(''))
52 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
53 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
54 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
55 'Accept-Encoding': 'gzip, deflate',
56 'Accept-Language': 'en-us,en;q=0.5',
60 def preferredencoding():
61 """Get preferred encoding.
63 Returns the best encoding scheme for the system, based on
64 locale.getpreferredencoding() and some further tweaks.
67 pref
= locale
.getpreferredencoding()
75 def write_json_file(obj
, fn
):
76 """ Encode obj as JSON and write it to fn, atomically if possible """
78 fn
= encodeFilename(fn
)
79 if sys
.version_info
< (3, 0) and sys
.platform
!= 'win32':
80 encoding
= get_filesystem_encoding()
81 # os.path.basename returns a bytes object, but NamedTemporaryFile
82 # will fail if the filename contains non ascii characters unless we
83 # use a unicode object
84 path_basename
= lambda f
: os
.path
.basename(fn
).decode(encoding
)
85 # the same for os.path.dirname
86 path_dirname
= lambda f
: os
.path
.dirname(fn
).decode(encoding
)
88 path_basename
= os
.path
.basename
89 path_dirname
= os
.path
.dirname
93 'prefix': path_basename(fn
) + '.',
94 'dir': path_dirname(fn
),
98 # In Python 2.x, json.dump expects a bytestream.
99 # In Python 3.x, it writes to a character stream
100 if sys
.version_info
< (3, 0):
108 tf
= tempfile
.NamedTemporaryFile(**args
)
113 if sys
.platform
== 'win32':
114 # Need to remove existing file on Windows, else os.rename raises
115 # WindowsError or FileExistsError.
120 os
.rename(tf
.name
, fn
)
129 if sys
.version_info
>= (2, 7):
130 def find_xpath_attr(node
, xpath
, key
, val
):
131 """ Find the xpath xpath[@key=val] """
132 assert re
.match(r
'^[a-zA-Z-]+$', key
)
133 assert re
.match(r
'^[a-zA-Z0-9@\s:._-]*$', val
)
134 expr
= xpath
+ "[@%s='%s']" % (key
, val
)
135 return node
.find(expr
)
137 def find_xpath_attr(node
, xpath
, key
, val
):
138 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
139 # .//node does not match if a node is a direct child of . !
140 if isinstance(xpath
, unicode):
141 xpath
= xpath
.encode('ascii')
143 for f
in node
.findall(xpath
):
144 if f
.attrib
.get(key
) == val
:
148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
149 # the namespace parameter
152 def xpath_with_ns(path
, ns_map
):
153 components
= [c
.split(':') for c
in path
.split('/')]
157 replaced
.append(c
[0])
160 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
161 return '/'.join(replaced
)
164 def xpath_text(node
, xpath
, name
=None, fatal
=False):
165 if sys
.version_info
< (2, 7): # Crazy 2.6
166 xpath
= xpath
.encode('ascii')
171 name
= xpath
if name
is None else name
172 raise ExtractorError('Could not find XML element %s' % name
)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document

    Thin wrapper: delegates to get_element_by_attribute with the attribute
    name fixed to "id", and returns whatever that call returns.
    """
    # NOTE: the parameter is named `id` (shadowing the builtin) because it is
    # part of the public signature; it is kept for backward compatibility.
    return get_element_by_attribute("id", id, html)
183 def get_element_by_attribute(attribute
, value
, html
):
184 """Return the content of the tag with the specified attribute in the passed HTML document"""
186 m
= re
.search(r
'''(?xs)
188 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
190 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
194 ''' % (re
.escape(attribute
), re
.escape(value
)), html
)
198 res
= m
.group('content')
200 if res
.startswith('"') or res
.startswith("'"):
203 return unescapeHTML(res
)
206 def clean_html(html
):
207 """Clean an HTML snippet into a readable string"""
209 html
= html
.replace('\n', ' ')
210 html
= re
.sub(r
'\s*<\s*br\s*/?\s*>\s*', '\n', html
)
211 html
= re
.sub(r
'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html
)
213 html
= re
.sub('<.*?>', '', html
)
214 # Replace html entities
215 html
= unescapeHTML(html
)
219 def sanitize_open(filename
, open_mode
):
220 """Try to open the given filename, and slightly tweak it if this fails.
222 Attempts to open the given filename. If this fails, it tries to change
223 the filename slightly, step by step, until it's either able to open it
224 or it fails and raises a final exception, like the standard open()
227 It returns the tuple (stream, definitive_file_name).
231 if sys
.platform
== 'win32':
233 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
234 return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
)
235 stream
= open(encodeFilename(filename
), open_mode
)
236 return (stream
, filename
)
237 except (IOError, OSError) as err
:
238 if err
.errno
in (errno
.EACCES
,):
241 # In case of error, try to remove win32 forbidden chars
242 alt_filename
= os
.path
.join(
243 re
.sub('[/<>:"\\|\\\\?\\*]', '#', path_part
)
244 for path_part
in os
.path
.split(filename
)
246 if alt_filename
== filename
:
249 # An exception here should be caught in the caller
250 stream
= open(encodeFilename(filename
), open_mode
)
251 return (stream
, alt_filename
)
254 def timeconvert(timestr
):
255 """Convert RFC 2822 defined time string into system timestamp"""
257 timetuple
= email
.utils
.parsedate_tz(timestr
)
258 if timetuple
is not None:
259 timestamp
= email
.utils
.mktime_tz(timetuple
)
263 def sanitize_filename(s
, restricted
=False, is_id
=False):
264 """Sanitizes a string so it could be used as part of a filename.
265 If restricted is set, use a stricter subset of allowed characters.
266 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
268 def replace_insane(char
):
269 if char
== '?' or ord(char
) < 32 or ord(char
) == 127:
272 return '' if restricted
else '\''
274 return '_-' if restricted
else ' -'
275 elif char
in '\\/|*<>':
277 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace()):
279 if restricted
and ord(char
) > 127:
283 result
= ''.join(map(replace_insane
, s
))
285 while '__' in result
:
286 result
= result
.replace('__', '_')
287 result
= result
.strip('_')
288 # Common case of "Foreign band name - English song title"
289 if restricted
and result
.startswith('-_'):
296 def orderedSet(iterable
):
297 """ Remove all duplicates from the input iterable """
305 def _htmlentity_transform(entity
):
306 """Transforms an HTML entity to a character."""
307 # Known non-numeric HTML entity
308 if entity
in compat_html_entities
.name2codepoint
:
309 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
311 mobj
= re
.match(r
'#(x?[0-9]+)', entity
)
313 numstr
= mobj
.group(1)
314 if numstr
.startswith('x'):
316 numstr
= '0%s' % numstr
319 return compat_chr(int(numstr
, base
))
321 # Unknown entity in name, return its literal representation
322 return ('&%s;' % entity
)
328 assert type(s
) == compat_str
331 r
'&([^;]+);', lambda m
: _htmlentity_transform(m
.group(1)), s
)
334 def encodeFilename(s
, for_subprocess
=False):
336 @param s The name of the file
339 assert type(s
) == compat_str
341 # Python 3 has a Unicode API
342 if sys
.version_info
>= (3, 0):
345 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
346 # Pass '' directly to use Unicode APIs on Windows 2000 and up
347 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
348 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
349 if not for_subprocess
:
352 # For subprocess calls, encode with locale encoding
353 # Refer to http://stackoverflow.com/a/9951851/35070
354 encoding
= preferredencoding()
356 encoding
= sys
.getfilesystemencoding()
359 return s
.encode(encoding
, 'ignore')
def encodeArgument(s):
    """Encode a single argument for use in a subprocess call.

    @param s  The argument; normally a compat_str.  Anything that is not a
              compat_str is treated as a legacy byte string and decoded as
              ASCII first (this raises if it contains non-ASCII bytes).

    The result is whatever encodeFilename returns when called with
    for_subprocess=True.
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, for_subprocess=True)
371 def decodeOption(optval
):
374 if isinstance(optval
, bytes):
375 optval
= optval
.decode(preferredencoding())
377 assert isinstance(optval
, compat_str
)
381 def formatSeconds(secs
):
383 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
385 return '%d:%02d' % (secs
// 60, secs
% 60)
390 def make_HTTPS_handler(opts_no_check_certificate
, **kwargs
):
391 if sys
.version_info
< (3, 2):
394 class HTTPSConnectionV3(httplib
.HTTPSConnection
):
395 def __init__(self
, *args
, **kwargs
):
396 httplib
.HTTPSConnection
.__init
__(self
, *args
, **kwargs
)
399 sock
= socket
.create_connection((self
.host
, self
.port
), self
.timeout
)
400 if getattr(self
, '_tunnel_host', False):
404 self
.sock
= ssl
.wrap_socket(sock
, self
.key_file
, self
.cert_file
, ssl_version
=ssl
.PROTOCOL_TLSv1
)
406 self
.sock
= ssl
.wrap_socket(sock
, self
.key_file
, self
.cert_file
, ssl_version
=ssl
.PROTOCOL_SSLv23
)
408 class HTTPSHandlerV3(compat_urllib_request
.HTTPSHandler
):
409 def https_open(self
, req
):
410 return self
.do_open(HTTPSConnectionV3
, req
)
411 return HTTPSHandlerV3(**kwargs
)
412 elif hasattr(ssl
, 'create_default_context'): # Python >= 3.4
413 context
= ssl
.create_default_context(ssl
.Purpose
.CLIENT_AUTH
)
414 context
.options
&= ~ssl
.OP_NO_SSLv3
# Allow older, not-as-secure SSLv3
415 if opts_no_check_certificate
:
416 context
.verify_mode
= ssl
.CERT_NONE
417 return compat_urllib_request
.HTTPSHandler(context
=context
, **kwargs
)
419 context
= ssl
.SSLContext(ssl
.PROTOCOL_SSLv23
)
420 context
.verify_mode
= (ssl
.CERT_NONE
421 if opts_no_check_certificate
422 else ssl
.CERT_REQUIRED
)
423 context
.set_default_verify_paths()
425 context
.load_default_certs()
426 except AttributeError:
428 return compat_urllib_request
.HTTPSHandler(context
=context
, **kwargs
)
431 class ExtractorError(Exception):
432 """Error during info extraction."""
434 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
435 """ tb, if given, is the original traceback (so that it can be printed out).
436 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
439 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
441 if video_id
is not None:
442 msg
= video_id
+ ': ' + msg
444 msg
+= ' (caused by %r)' % cause
446 if ytdl_is_updateable():
447 update_cmd
= 'type youtube-dl -U to update'
449 update_cmd
= 'see https://yt-dl.org/update on how to update'
450 msg
+= '; please report this issue on https://yt-dl.org/bug .'
451 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
452 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
453 super(ExtractorError
, self
).__init
__(msg
)
456 self
.exc_info
= sys
.exc_info() # preserve original exception
458 self
.video_id
= video_id
460 def format_traceback(self
):
461 if self
.traceback
is None:
463 return ''.join(traceback
.format_tb(self
.traceback
))
466 class RegexNotFoundError(ExtractorError
):
467 """Error when a regex didn't match"""
471 class DownloadError(Exception):
472 """Download Error exception.
474 This exception may be thrown by FileDownloader objects if they are not
475 configured to continue on errors. They will contain the appropriate
479 def __init__(self
, msg
, exc_info
=None):
480 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
481 super(DownloadError
, self
).__init
__(msg
)
482 self
.exc_info
= exc_info
485 class SameFileError(Exception):
486 """Same File exception.
488 This exception will be thrown by FileDownloader objects if they detect
489 multiple files would have to be downloaded to the same file on disk.
494 class PostProcessingError(Exception):
495 """Post Processing exception.
497 This exception may be raised by PostProcessor's .run() method to
498 indicate an error in the postprocessing task.
501 def __init__(self
, msg
):
505 class MaxDownloadsReached(Exception):
506 """ --max-downloads limit has been reached. """
510 class UnavailableVideoError(Exception):
511 """Unavailable Format exception.
513 This exception will be thrown when a video is requested
514 in a format that is not available for that video.
519 class ContentTooShortError(Exception):
520 """Content Too Short exception.
522 This exception may be raised by FileDownloader objects when a file they
523 download is too small for what the server announced first, indicating
524 the connection was probably interrupted.
530 def __init__(self
, downloaded
, expected
):
531 self
.downloaded
= downloaded
532 self
.expected
= expected
535 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
536 """Handler for HTTP requests and responses.
538 This class, when installed with an OpenerDirector, automatically adds
539 the standard headers to every HTTP request and handles gzipped and
540 deflated responses from web servers. If compression is to be avoided in
541 a particular request, the original request in the program code only has
542 to include the HTTP header "Youtubedl-No-Compression", which will be
543 removed before making the real request.
545 Part of this code was copied from:
547 http://techknack.net/python-urllib2-handlers/
549 Andrew Rowls, the author of that code, agreed to release it to the
556 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
558 return zlib
.decompress(data
)
561 def addinfourl_wrapper(stream
, headers
, url
, code
):
562 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
563 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
564 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
568 def http_request(self
, req
):
569 for h
, v
in std_headers
.items():
570 if h
not in req
.headers
:
572 if 'Youtubedl-no-compression' in req
.headers
:
573 if 'Accept-encoding' in req
.headers
:
574 del req
.headers
['Accept-encoding']
575 del req
.headers
['Youtubedl-no-compression']
576 if 'Youtubedl-user-agent' in req
.headers
:
577 if 'User-agent' in req
.headers
:
578 del req
.headers
['User-agent']
579 req
.headers
['User-agent'] = req
.headers
['Youtubedl-user-agent']
580 del req
.headers
['Youtubedl-user-agent']
582 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
583 # Python 2.6 is brain-dead when it comes to fragments
584 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
585 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
589 def http_response(self
, req
, resp
):
592 if resp
.headers
.get('Content-encoding', '') == 'gzip':
593 content
= resp
.read()
594 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
596 uncompressed
= io
.BytesIO(gz
.read())
597 except IOError as original_ioerror
:
598 # There may be junk at the end of the file
599 # See http://stackoverflow.com/q/4928560/35070 for details
600 for i
in range(1, 1024):
602 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
603 uncompressed
= io
.BytesIO(gz
.read())
608 raise original_ioerror
609 resp
= self
.addinfourl_wrapper(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
610 resp
.msg
= old_resp
.msg
612 if resp
.headers
.get('Content-encoding', '') == 'deflate':
613 gz
= io
.BytesIO(self
.deflate(resp
.read()))
614 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
615 resp
.msg
= old_resp
.msg
618 https_request
= http_request
619 https_response
= http_response
622 def parse_iso8601(date_str
, delimiter
='T'):
623 """ Return a UNIX timestamp from the given date """
629 r
'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
632 timezone
= datetime
.timedelta()
634 date_str
= date_str
[:-len(m
.group(0))]
635 if not m
.group('sign'):
636 timezone
= datetime
.timedelta()
638 sign
= 1 if m
.group('sign') == '+' else -1
639 timezone
= datetime
.timedelta(
640 hours
=sign
* int(m
.group('hours')),
641 minutes
=sign
* int(m
.group('minutes')))
642 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
643 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
644 return calendar
.timegm(dt
.timetuple())
647 def unified_strdate(date_str
):
648 """Return a string with the date in the format YYYYMMDD"""
655 date_str
= date_str
.replace(',', ' ')
656 # %z (UTC offset) is only supported in python>=3.2
657 date_str
= re
.sub(r
' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str
)
658 format_expressions
= [
663 '%b %dst %Y %I:%M%p',
664 '%b %dnd %Y %I:%M%p',
665 '%b %dth %Y %I:%M%p',
674 '%Y-%m-%d %H:%M:%S.%f',
677 '%Y-%m-%dT%H:%M:%SZ',
678 '%Y-%m-%dT%H:%M:%S.%fZ',
679 '%Y-%m-%dT%H:%M:%S.%f0Z',
681 '%Y-%m-%dT%H:%M:%S.%f',
684 for expression
in format_expressions
:
686 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
689 if upload_date
is None:
690 timetuple
= email
.utils
.parsedate_tz(date_str
)
692 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
696 def determine_ext(url
, default_ext
='unknown_video'):
699 guess
= url
.partition('?')[0].rpartition('.')[2]
700 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitles file name from a media file name.

    Everything after the last '.' in filename is dropped (the whole string
    is kept when it contains no '.'), then '.<sub_lang>.<sub_format>' is
    appended.
    """
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
710 def date_from_str(date_str
):
712 Return a datetime object from a string in the format YYYYMMDD or
713 (now|today)[+-][0-9](day|week|month|year)(s)?"""
714 today
= datetime
.date
.today()
715 if date_str
== 'now'or date_str
== 'today':
717 match
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
718 if match
is not None:
719 sign
= match
.group('sign')
720 time
= int(match
.group('time'))
723 unit
= match
.group('unit')
724 # A bad approximation?
732 delta
= datetime
.timedelta(**{unit
: time
})
734 return datetime
.datetime
.strptime(date_str
, "%Y%m%d").date()
737 def hyphenate_date(date_str
):
739 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
740 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
741 if match
is not None:
742 return '-'.join(match
.groups())
747 class DateRange(object):
748 """Represents a time interval between two dates"""
750 def __init__(self
, start
=None, end
=None):
751 """start and end must be strings in the format accepted by date"""
752 if start
is not None:
753 self
.start
= date_from_str(start
)
755 self
.start
= datetime
.datetime
.min.date()
757 self
.end
= date_from_str(end
)
759 self
.end
= datetime
.datetime
.max.date()
760 if self
.start
> self
.end
:
761 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
765 """Returns a range that only contains the given day"""
768 def __contains__(self
, date
):
769 """Check if the date is in the range"""
770 if not isinstance(date
, datetime
.date
):
771 date
= date_from_str(date
)
772 return self
.start
<= date
<= self
.end
775 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
779 """ Returns the platform name as a compat_str """
780 res
= platform
.platform()
781 if isinstance(res
, bytes):
782 res
= res
.decode(preferredencoding())
784 assert isinstance(res
, compat_str
)
788 def _windows_write_string(s
, out
):
789 """ Returns True if the string was written using special methods,
790 False if it has yet to be written out."""
791 # Adapted from http://stackoverflow.com/a/3259271/35070
794 import ctypes
.wintypes
802 fileno
= out
.fileno()
803 except AttributeError:
804 # If the output stream doesn't have a fileno, it's virtual
806 if fileno
not in WIN_OUTPUT_IDS
:
809 GetStdHandle
= ctypes
.WINFUNCTYPE(
810 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
811 ("GetStdHandle", ctypes
.windll
.kernel32
))
812 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
814 WriteConsoleW
= ctypes
.WINFUNCTYPE(
815 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
816 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
817 ctypes
.wintypes
.LPVOID
)(("WriteConsoleW", ctypes
.windll
.kernel32
))
818 written
= ctypes
.wintypes
.DWORD(0)
820 GetFileType
= ctypes
.WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(("GetFileType", ctypes
.windll
.kernel32
))
821 FILE_TYPE_CHAR
= 0x0002
822 FILE_TYPE_REMOTE
= 0x8000
823 GetConsoleMode
= ctypes
.WINFUNCTYPE(
824 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
825 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
826 ("GetConsoleMode", ctypes
.windll
.kernel32
))
827 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
829 def not_a_console(handle
):
830 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
832 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
833 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
838 def next_nonbmp_pos(s
):
840 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
841 except StopIteration:
845 count
= min(next_nonbmp_pos(s
), 1024)
848 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
850 raise OSError('Failed to write string')
851 if not count
: # We just wrote a non-BMP character
852 assert written
.value
== 2
855 assert written
.value
> 0
856 s
= s
[written
.value
:]
860 def write_string(s
, out
=None, encoding
=None):
863 assert type(s
) == compat_str
865 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
866 if _windows_write_string(s
, out
):
869 if ('b' in getattr(out
, 'mode', '') or
870 sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
871 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
873 elif hasattr(out
, 'buffer'):
874 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
875 byt
= s
.encode(enc
, 'ignore')
876 out
.buffer.write(byt
)
882 def bytes_to_intlist(bs
):
885 if isinstance(bs
[0], int): # Python 3
888 return [ord(c
) for c
in bs
]
891 def intlist_to_bytes(xs
):
894 return struct_pack('%dB' % len(xs
), *xs
)
897 # Cross-platform file locking
898 if sys
.platform
== 'win32':
899 import ctypes
.wintypes
902 class OVERLAPPED(ctypes
.Structure
):
904 ('Internal', ctypes
.wintypes
.LPVOID
),
905 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
906 ('Offset', ctypes
.wintypes
.DWORD
),
907 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
908 ('hEvent', ctypes
.wintypes
.HANDLE
),
911 kernel32
= ctypes
.windll
.kernel32
912 LockFileEx
= kernel32
.LockFileEx
913 LockFileEx
.argtypes
= [
914 ctypes
.wintypes
.HANDLE
, # hFile
915 ctypes
.wintypes
.DWORD
, # dwFlags
916 ctypes
.wintypes
.DWORD
, # dwReserved
917 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
918 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
919 ctypes
.POINTER(OVERLAPPED
) # Overlapped
921 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
922 UnlockFileEx
= kernel32
.UnlockFileEx
923 UnlockFileEx
.argtypes
= [
924 ctypes
.wintypes
.HANDLE
, # hFile
925 ctypes
.wintypes
.DWORD
, # dwReserved
926 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
927 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
928 ctypes
.POINTER(OVERLAPPED
) # Overlapped
930 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
931 whole_low
= 0xffffffff
932 whole_high
= 0x7fffffff
934 def _lock_file(f
, exclusive
):
935 overlapped
= OVERLAPPED()
936 overlapped
.Offset
= 0
937 overlapped
.OffsetHigh
= 0
938 overlapped
.hEvent
= 0
939 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
940 handle
= msvcrt
.get_osfhandle(f
.fileno())
941 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
942 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
943 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
946 assert f
._lock
_file
_overlapped
_p
947 handle
= msvcrt
.get_osfhandle(f
.fileno())
948 if not UnlockFileEx(handle
, 0,
949 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
950 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
955 def _lock_file(f
, exclusive
):
956 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
959 fcntl
.flock(f
, fcntl
.LOCK_UN
)
962 class locked_file(object):
963 def __init__(self
, filename
, mode
, encoding
=None):
964 assert mode
in ['r', 'a', 'w']
965 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
969 exclusive
= self
.mode
!= 'r'
971 _lock_file(self
.f
, exclusive
)
977 def __exit__(self
, etype
, value
, traceback
):
986 def write(self
, *args
):
987 return self
.f
.write(*args
)
989 def read(self
, *args
):
990 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when it is None."""
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'
998 def shell_quote(args
):
1000 encoding
= get_filesystem_encoding()
1002 if isinstance(a
, bytes):
1003 # We may get a filename encoded with 'encodeFilename'
1004 a
= a
.decode(encoding
)
1005 quoted_args
.append(pipes
.quote(a
))
1006 return ' '.join(quoted_args
)
1009 def takewhile_inclusive(pred
, seq
):
1010 """ Like itertools.takewhile, but include the latest evaluated element
1011 (the first element so that Not pred(e)) """
1018 def smuggle_url(url
, data
):
1019 """ Pass additional data in a URL for internal use. """
1021 sdata
= compat_urllib_parse
.urlencode(
1022 {'__youtubedl_smuggle': json
.dumps(data
)})
1023 return url
+ '#' + sdata
1026 def unsmuggle_url(smug_url
, default
=None):
1027 if not '#__youtubedl_smuggle' in smug_url
:
1028 return smug_url
, default
1029 url
, _
, sdata
= smug_url
.rpartition('#')
1030 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
1031 data
= json
.loads(jsond
)
1035 def format_bytes(bytes):
1038 if type(bytes) is str:
1039 bytes = float(bytes)
1043 exponent
= int(math
.log(bytes, 1024.0))
1044 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
1045 converted
= float(bytes) / float(1024 ** exponent
)
1046 return '%.2f%s' % (converted
, suffix
)
1049 def parse_filesize(s
):
1053 # The lower-case forms are of course incorrect and unofficial,
1054 # but we support those too
1092 units_re
= '|'.join(re
.escape(u
) for u
in _UNIT_TABLE
)
1093 m
= re
.match(r
'(?P<num>[0-9]+(?:\.[0-9]*)?)\s*(?P<unit>%s)' % units_re
, s
)
1097 return int(float(m
.group('num')) * _UNIT_TABLE
[m
.group('unit')])
1100 def get_term_width():
1101 columns
= compat_getenv('COLUMNS', None)
1106 sp
= subprocess
.Popen(
1108 stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
)
1109 out
, err
= sp
.communicate()
1110 return int(out
.split()[1])
1116 def month_by_name(name
):
1117 """ Return the number of a month by (locale-independently) English name """
1120 'January', 'February', 'March', 'April', 'May', 'June',
1121 'July', 'August', 'September', 'October', 'November', 'December']
1123 return ENGLISH_NAMES
.index(name
) + 1
1128 def fix_xml_ampersands(xml_str
):
1129 """Replace all the '&' by '&amp;' in XML"""
1131 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1136 def setproctitle(title
):
1137 assert isinstance(title
, compat_str
)
1139 libc
= ctypes
.cdll
.LoadLibrary("libc.so.6")
1142 title_bytes
= title
.encode('utf-8')
1143 buf
= ctypes
.create_string_buffer(len(title_bytes
))
1144 buf
.value
= title_bytes
1146 libc
.prctl(15, buf
, 0, 0, 0)
1147 except AttributeError:
1148 return # Strange libc, just skip this
1151 def remove_start(s
, start
):
1152 if s
.startswith(start
):
1153 return s
[len(start
):]
1157 def remove_end(s
, end
):
1159 return s
[:-len(end
)]
def url_basename(url):
    """Return the last '/'-separated component of the path of url.

    Leading and trailing slashes of the path are stripped before splitting,
    so a path that is empty or consists only of slashes yields ''.
    """
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]
1168 class HEADRequest(compat_urllib_request
.Request
):
1169 def get_method(self
):
1173 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
1176 v
= getattr(v
, get_attr
, None)
1179 return default
if v
is None else (int(v
) * invscale
// scale
)
def str_or_none(v, default=None):
    """Return compat_str(v), or default when v is None."""
    return default if v is None else compat_str(v)
1186 def str_to_int(int_str
):
1187 """ A more relaxed version of int_or_none """
1190 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None.

    Only None triggers the default; any other value is passed to float(),
    which raises for values it cannot convert.
    """
    if v is None:
        return default
    return float(v) * invscale / scale
1198 def parse_duration(s
):
1207 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1208 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1210 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''', s
)
1213 res
= int(m
.group('secs'))
1215 res
+= int(m
.group('mins')) * 60
1216 if m
.group('hours'):
1217 res
+= int(m
.group('hours')) * 60 * 60
1219 res
+= float(m
.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename.

    filename is split with os.path.splitext and rebuilt as
    '<name>.<ext><real_ext>'; when there is no existing extension the
    result is simply '<filename>.<ext>'.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1228 def check_executable(exe
, args
=[]):
1229 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1230 args can be a list of arguments for a short output (like -version) """
1232 subprocess
.Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate()
1238 def get_exe_version(exe
, args
=['--version'],
1239 version_re
=r
'version\s+([0-9._-a-zA-Z]+)',
1240 unrecognized
='present'):
1241 """ Returns the version of the specified executable,
1242 or False if the executable is not present """
1244 out
, err
= subprocess
.Popen(
1246 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate()
1249 firstline
= out
.partition(b
'\n')[0].decode('ascii', 'ignore')
1250 m
= re
.search(version_re
, firstline
)
1257 class PagedList(object):
1259 # This is only useful for tests
1260 return len(self
.getslice())
1263 class OnDemandPagedList(PagedList
):
1264 def __init__(self
, pagefunc
, pagesize
):
1265 self
._pagefunc
= pagefunc
1266 self
._pagesize
= pagesize
1268 def getslice(self
, start
=0, end
=None):
1270 for pagenum
in itertools
.count(start
// self
._pagesize
):
1271 firstid
= pagenum
* self
._pagesize
1272 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
1273 if start
>= nextfirstid
:
1276 page_results
= list(self
._pagefunc
(pagenum
))
1279 start
% self
._pagesize
1280 if firstid
<= start
< nextfirstid
1284 ((end
- 1) % self
._pagesize
) + 1
1285 if (end
is not None and firstid
<= end
<= nextfirstid
)
1288 if startv
!= 0 or endv
is not None:
1289 page_results
= page_results
[startv
:endv
]
1290 res
.extend(page_results
)
1292 # A little optimization - if current page is not "full", ie. does
1293 # not contain page_size videos then we can assume that this page
1294 # is the last one - there are no more ids on further pages -
1295 # i.e. no need to query again.
1296 if len(page_results
) + startv
< self
._pagesize
:
1299 # If we got the whole page, but the next page is not interesting,
1300 # break out early as well
1301 if end
== nextfirstid
:
1306 class InAdvancePagedList(PagedList
):
1307 def __init__(self
, pagefunc
, pagecount
, pagesize
):
1308 self
._pagefunc
= pagefunc
1309 self
._pagecount
= pagecount
1310 self
._pagesize
= pagesize
1312 def getslice(self
, start
=0, end
=None):
1314 start_page
= start
// self
._pagesize
1316 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
1317 skip_elems
= start
- start_page
* self
._pagesize
1318 only_more
= None if end
is None else end
- start
1319 for pagenum
in range(start_page
, end_page
):
1320 page
= list(self
._pagefunc
(pagenum
))
1322 page
= page
[skip_elems
:]
1324 if only_more
is not None:
1325 if len(page
) < only_more
:
1326 only_more
-= len(page
)
1328 page
= page
[:only_more
]
1335 def uppercase_escape(s
):
1336 unicode_escape
= codecs
.getdecoder('unicode_escape')
1338 r
'\\U[0-9a-fA-F]{8}',
1339 lambda m
: unicode_escape(m
.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 a unicode object is first encoded to UTF-8 bytes before
    # quoting.  The version check short-circuits on Python 3, so the
    # Python-2-only name `unicode` is never evaluated there.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    # Second positional argument to quote(): presumably the `safe` set of
    # characters left unescaped, as in the stdlib quote() -- confirm against
    # the compat_urllib_parse definition.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1350 def escape_url(url
):
1351 """Escape URL as suggested by RFC 3986"""
1352 url_parsed
= compat_urllib_parse_urlparse(url
)
1353 return url_parsed
._replace
(
1354 path
=escape_rfc3986(url_parsed
.path
),
1355 params
=escape_rfc3986(url_parsed
.params
),
1356 query
=escape_rfc3986(url_parsed
.query
),
1357 fragment
=escape_rfc3986(url_parsed
.fragment
)
1361 struct
.pack('!I', 0)
1363 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1364 def struct_pack(spec
, *args
):
1365 if isinstance(spec
, compat_str
):
1366 spec
= spec
.encode('ascii')
1367 return struct
.pack(spec
, *args
)
1369 def struct_unpack(spec
, *args
):
1370 if isinstance(spec
, compat_str
):
1371 spec
= spec
.encode('ascii')
1372 return struct
.unpack(spec
, *args
)
1374 struct_pack
= struct
.pack
1375 struct_unpack
= struct
.unpack
1378 def read_batch_urls(batch_fd
):
1380 if not isinstance(url
, compat_str
):
1381 url
= url
.decode('utf-8', 'replace')
1382 BOM_UTF8
= '\xef\xbb\xbf'
1383 if url
.startswith(BOM_UTF8
):
1384 url
= url
[len(BOM_UTF8
):]
1386 if url
.startswith(('#', ';', ']')):
1390 with contextlib
.closing(batch_fd
) as fd
:
1391 return [url
for url
in map(fixup
, fd
) if url
]
def urlencode_postdata(*args, **kargs):
    """Return compat_urllib_parse.urlencode(*args, **kargs) encoded to ASCII bytes.

    All arguments are forwarded unchanged to urlencode; the resulting string
    is then encoded with the 'ascii' codec.
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1399 etree_iter
= xml
.etree
.ElementTree
.Element
.iter
1400 except AttributeError: # Python <=2.6
1401 etree_iter
= lambda n
: n
.findall('.//*')
1405 class TreeBuilder(xml
.etree
.ElementTree
.TreeBuilder
):
1406 def doctype(self
, name
, pubid
, system
):
1407 pass # Ignore doctypes
1409 parser
= xml
.etree
.ElementTree
.XMLParser(target
=TreeBuilder())
1410 kwargs
= {'parser': parser
} if sys
.version_info
>= (2, 7) else {}
1411 tree
= xml
.etree
.ElementTree
.XML(s
.encode('utf-8'), **kwargs
)
1412 # Fix up XML parser in Python 2.x
1413 if sys
.version_info
< (3, 0):
1414 for n
in etree_iter(tree
):
1415 if n
.text
is not None:
1416 if not isinstance(n
.text
, compat_str
):
1417 n
.text
= n
.text
.decode('utf-8')
1430 def parse_age_limit(s
):
1433 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
1434 return int(m
.group('age')) if m
else US_RATINGS
.get(s
, None)
1437 def strip_jsonp(code
):
1439 r
'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r
'\1', code
)
1442 def js_to_json(code
):
1445 if v
in ('true', 'false', 'null'):
1447 if v
.startswith('"'):
1449 if v
.startswith("'"):
1451 v
= re
.sub(r
"\\\\|\\'|\"", lambda m: {
1458 res = re.sub(r'''(?x)
1459 "(?
:[^
"\\]*(?:\\\\|\\")?
)*"|
1460 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1461 [a-zA-Z_][a-zA-Z_0-9]*
1463 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1467 def qualities(quality_ids):
1468 """ Get a numeric quality value out of a list of possible values """
1471 return quality_ids.index(qid)
1477 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1480 def limit_length(s, length):
1481 """ Add ellipses to overly long strings """
1486 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted version string into its integer components.

    Despite the name, the result is a list (e.g. '2014.11.23' gives
    [2014, 11, 23]); the return type is kept for backward compatibility.
    int() raises ValueError for any component that is not an integer.
    """
    return [int(e) for e in v.split('.')]
1494 def is_outdated_version(version, limit, assume_new=True):
1496 return not assume_new
1498 return version_tuple(version) < version_tuple(limit)
1500 return not assume_new
1503 def ytdl_is_updateable():
1504 """ Returns if youtube-dl can be updated with -U """
1505 from zipimport import zipimporter
1507 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    """Get a short string representation for a subprocess command.

    Each element of args is passed through shlex_quote and the results are
    joined with single spaces.
    """
    return ' '.join(shlex_quote(a) for a in args)