2 # -*- coding: utf-8 -*-
4 from __future__
import unicode_literals
31 import xml
.etree
.ElementTree
40 compat_socket_create_connection
,
44 compat_urllib_parse_urlparse
,
45 compat_urllib_request
,
51 # This is not clearly defined otherwise
52 compiled_regex_type
= type(re
.compile(''))
55 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
57 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58 'Accept-Encoding': 'gzip, deflate',
59 'Accept-Language': 'en-us,en;q=0.5',
63 def preferredencoding():
64 """Get preferred encoding.
66 Returns the best encoding scheme for the system, based on
67 locale.getpreferredencoding() and some further tweaks.
70 pref
= locale
.getpreferredencoding()
78 def write_json_file(obj
, fn
):
79 """ Encode obj as JSON and write it to fn, atomically if possible """
81 fn
= encodeFilename(fn
)
82 if sys
.version_info
< (3, 0) and sys
.platform
!= 'win32':
83 encoding
= get_filesystem_encoding()
84 # os.path.basename returns a bytes object, but NamedTemporaryFile
85 # will fail if the filename contains non ascii characters unless we
86 # use a unicode object
87 path_basename
= lambda f
: os
.path
.basename(fn
).decode(encoding
)
88 # the same for os.path.dirname
89 path_dirname
= lambda f
: os
.path
.dirname(fn
).decode(encoding
)
91 path_basename
= os
.path
.basename
92 path_dirname
= os
.path
.dirname
96 'prefix': path_basename(fn
) + '.',
97 'dir': path_dirname(fn
),
101 # In Python 2.x, json.dump expects a bytestream.
102 # In Python 3.x, it writes to a character stream
103 if sys
.version_info
< (3, 0):
111 tf
= tempfile
.NamedTemporaryFile(**args
)
116 if sys
.platform
== 'win32':
117 # Need to remove existing file on Windows, else os.rename raises
118 # WindowsError or FileExistsError.
123 os
.rename(tf
.name
, fn
)
132 if sys
.version_info
>= (2, 7):
133 def find_xpath_attr(node
, xpath
, key
, val
):
134 """ Find the xpath xpath[@key=val] """
135 assert re
.match(r
'^[a-zA-Z-]+$', key
)
136 assert re
.match(r
'^[a-zA-Z0-9@\s:._-]*$', val
)
137 expr
= xpath
+ "[@%s='%s']" % (key
, val
)
138 return node
.find(expr
)
140 def find_xpath_attr(node
, xpath
, key
, val
):
141 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
142 # .//node does not match if a node is a direct child of . !
143 if isinstance(xpath
, unicode):
144 xpath
= xpath
.encode('ascii')
146 for f
in node
.findall(xpath
):
147 if f
.attrib
.get(key
) == val
:
151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
152 # the namespace parameter
155 def xpath_with_ns(path
, ns_map
):
156 components
= [c
.split(':') for c
in path
.split('/')]
160 replaced
.append(c
[0])
163 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
164 return '/'.join(replaced
)
167 def xpath_text(node
, xpath
, name
=None, fatal
=False):
168 if sys
.version_info
< (2, 7): # Crazy 2.6
169 xpath
= xpath
.encode('ascii')
172 if n
is None or n
.text
is None:
174 name
= xpath
if name
is None else name
175 raise ExtractorError('Could not find XML element %s' % name
)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper: an id lookup is just an attribute match on "id".
    return get_element_by_attribute("id", id, html)
186 def get_element_by_attribute(attribute
, value
, html
):
187 """Return the content of the tag with the specified attribute in the passed HTML document"""
189 m
= re
.search(r
'''(?xs)
191 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
197 ''' % (re
.escape(attribute
), re
.escape(value
)), html
)
201 res
= m
.group('content')
203 if res
.startswith('"') or res
.startswith("'"):
206 return unescapeHTML(res
)
209 def clean_html(html
):
210 """Clean an HTML snippet into a readable string"""
212 if html
is None: # Convenience for sanitizing descriptions etc.
216 html
= html
.replace('\n', ' ')
217 html
= re
.sub(r
'\s*<\s*br\s*/?\s*>\s*', '\n', html
)
218 html
= re
.sub(r
'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html
)
220 html
= re
.sub('<.*?>', '', html
)
221 # Replace html entities
222 html
= unescapeHTML(html
)
226 def sanitize_open(filename
, open_mode
):
227 """Try to open the given filename, and slightly tweak it if this fails.
229 Attempts to open the given filename. If this fails, it tries to change
230 the filename slightly, step by step, until it's either able to open it
231 or it fails and raises a final exception, like the standard open()
234 It returns the tuple (stream, definitive_file_name).
238 if sys
.platform
== 'win32':
240 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
241 return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
)
242 stream
= open(encodeFilename(filename
), open_mode
)
243 return (stream
, filename
)
244 except (IOError, OSError) as err
:
245 if err
.errno
in (errno
.EACCES
,):
248 # In case of error, try to remove win32 forbidden chars
249 alt_filename
= os
.path
.join(
250 re
.sub('[/<>:"\\|\\\\?\\*]', '#', path_part
)
251 for path_part
in os
.path
.split(filename
)
253 if alt_filename
== filename
:
256 # An exception here should be caught in the caller
257 stream
= open(encodeFilename(filename
), open_mode
)
258 return (stream
, alt_filename
)
261 def timeconvert(timestr
):
262 """Convert RFC 2822 defined time string into system timestamp"""
264 timetuple
= email
.utils
.parsedate_tz(timestr
)
265 if timetuple
is not None:
266 timestamp
= email
.utils
.mktime_tz(timetuple
)
270 def sanitize_filename(s
, restricted
=False, is_id
=False):
271 """Sanitizes a string so it could be used as part of a filename.
272 If restricted is set, use a stricter subset of allowed characters.
273 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
275 def replace_insane(char
):
276 if char
== '?' or ord(char
) < 32 or ord(char
) == 127:
279 return '' if restricted
else '\''
281 return '_-' if restricted
else ' -'
282 elif char
in '\\/|*<>':
284 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace()):
286 if restricted
and ord(char
) > 127:
291 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
292 result
= ''.join(map(replace_insane
, s
))
294 while '__' in result
:
295 result
= result
.replace('__', '_')
296 result
= result
.strip('_')
297 # Common case of "Foreign band name - English song title"
298 if restricted
and result
.startswith('-_'):
305 def orderedSet(iterable
):
306 """ Remove all duplicates from the input iterable """
314 def _htmlentity_transform(entity
):
315 """Transforms an HTML entity to a character."""
316 # Known non-numeric HTML entity
317 if entity
in compat_html_entities
.name2codepoint
:
318 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
320 mobj
= re
.match(r
'#(x?[0-9]+)', entity
)
322 numstr
= mobj
.group(1)
323 if numstr
.startswith('x'):
325 numstr
= '0%s' % numstr
328 return compat_chr(int(numstr
, base
))
330 # Unknown entity in name, return its literal representation
331 return ('&%s;' % entity
)
337 assert type(s
) == compat_str
340 r
'&([^;]+);', lambda m
: _htmlentity_transform(m
.group(1)), s
)
343 def encodeFilename(s
, for_subprocess
=False):
345 @param s The name of the file
348 assert type(s
) == compat_str
350 # Python 3 has a Unicode API
351 if sys
.version_info
>= (3, 0):
354 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
355 # Pass '' directly to use Unicode APIs on Windows 2000 and up
356 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
357 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
358 if not for_subprocess
:
361 # For subprocess calls, encode with locale encoding
362 # Refer to http://stackoverflow.com/a/9951851/35070
363 encoding
= preferredencoding()
365 encoding
= sys
.getfilesystemencoding()
368 return s
.encode(encoding
, 'ignore')
371 def encodeArgument(s
):
372 if not isinstance(s
, compat_str
):
373 # Legacy code that uses byte strings
374 # Uncomment the following line after fixing all post processors
375 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
376 s
= s
.decode('ascii')
377 return encodeFilename(s
, True)
380 def decodeOption(optval
):
383 if isinstance(optval
, bytes):
384 optval
= optval
.decode(preferredencoding())
386 assert isinstance(optval
, compat_str
)
390 def formatSeconds(secs
):
392 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
394 return '%d:%02d' % (secs
// 60, secs
% 60)
399 def make_HTTPS_handler(params
, **kwargs
):
400 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
401 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
402 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
403 if opts_no_check_certificate
:
404 context
.check_hostname
= False
405 context
.verify_mode
= ssl
.CERT_NONE
407 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
410 # (create_default_context present but HTTPSHandler has no context=)
413 if sys
.version_info
< (3, 2):
416 class HTTPSConnectionV3(httplib
.HTTPSConnection
):
417 def __init__(self
, *args
, **kwargs
):
418 httplib
.HTTPSConnection
.__init
__(self
, *args
, **kwargs
)
421 sock
= socket
.create_connection((self
.host
, self
.port
), self
.timeout
)
422 if getattr(self
, '_tunnel_host', False):
426 self
.sock
= ssl
.wrap_socket(sock
, self
.key_file
, self
.cert_file
, ssl_version
=ssl
.PROTOCOL_TLSv1
)
428 self
.sock
= ssl
.wrap_socket(sock
, self
.key_file
, self
.cert_file
, ssl_version
=ssl
.PROTOCOL_SSLv23
)
430 return YoutubeDLHTTPSHandler(params
, https_conn_class
=HTTPSConnectionV3
, **kwargs
)
432 context
= ssl
.SSLContext(ssl
.PROTOCOL_SSLv23
)
433 context
.verify_mode
= (ssl
.CERT_NONE
434 if opts_no_check_certificate
435 else ssl
.CERT_REQUIRED
)
436 context
.set_default_verify_paths()
437 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
440 class ExtractorError(Exception):
441 """Error during info extraction."""
443 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
444 """ tb, if given, is the original traceback (so that it can be printed out).
445 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
448 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
450 if video_id
is not None:
451 msg
= video_id
+ ': ' + msg
453 msg
+= ' (caused by %r)' % cause
455 if ytdl_is_updateable():
456 update_cmd
= 'type youtube-dl -U to update'
458 update_cmd
= 'see https://yt-dl.org/update on how to update'
459 msg
+= '; please report this issue on https://yt-dl.org/bug .'
460 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
461 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
462 super(ExtractorError
, self
).__init
__(msg
)
465 self
.exc_info
= sys
.exc_info() # preserve original exception
467 self
.video_id
= video_id
469 def format_traceback(self
):
470 if self
.traceback
is None:
472 return ''.join(traceback
.format_tb(self
.traceback
))
475 class UnsupportedError(ExtractorError
):
476 def __init__(self
, url
):
477 super(UnsupportedError
, self
).__init
__(
478 'Unsupported URL: %s' % url
, expected
=True)
482 class RegexNotFoundError(ExtractorError
):
483 """Error when a regex didn't match"""
487 class DownloadError(Exception):
488 """Download Error exception.
490 This exception may be thrown by FileDownloader objects if they are not
491 configured to continue on errors. They will contain the appropriate
495 def __init__(self
, msg
, exc_info
=None):
496 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
497 super(DownloadError
, self
).__init
__(msg
)
498 self
.exc_info
= exc_info
501 class SameFileError(Exception):
502 """Same File exception.
504 This exception will be thrown by FileDownloader objects if they detect
505 multiple files would have to be downloaded to the same file on disk.
510 class PostProcessingError(Exception):
511 """Post Processing exception.
513 This exception may be raised by PostProcessor's .run() method to
514 indicate an error in the postprocessing task.
517 def __init__(self
, msg
):
521 class MaxDownloadsReached(Exception):
522 """ --max-downloads limit has been reached. """
526 class UnavailableVideoError(Exception):
527 """Unavailable Format exception.
529 This exception will be thrown when a video is requested
530 in a format that is not available for that video.
535 class ContentTooShortError(Exception):
536 """Content Too Short exception.
538 This exception may be raised by FileDownloader objects when a file they
539 download is too small for what the server announced first, indicating
540 the connection was probably interrupted.
546 def __init__(self
, downloaded
, expected
):
547 self
.downloaded
= downloaded
548 self
.expected
= expected
551 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
552 hc
= http_class(*args
, **kwargs
)
553 source_address
= ydl_handler
._params
.get('source_address')
554 if source_address
is not None:
555 sa
= (source_address
, 0)
556 if hasattr(hc
, 'source_address'): # Python 2.7+
557 hc
.source_address
= sa
559 def _hc_connect(self
, *args
, **kwargs
):
560 sock
= compat_socket_create_connection(
561 (self
.host
, self
.port
), self
.timeout
, sa
)
563 self
.sock
= ssl
.wrap_socket(sock
, self
.key_file
, self
.cert_file
)
566 hc
.connect
= functools
.partial(_hc_connect
, hc
)
571 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
572 """Handler for HTTP requests and responses.
574 This class, when installed with an OpenerDirector, automatically adds
575 the standard headers to every HTTP request and handles gzipped and
576 deflated responses from web servers. If compression is to be avoided in
577 a particular request, the original request in the program code only has
578 to include the HTTP header "Youtubedl-No-Compression", which will be
579 removed before making the real request.
581 Part of this code was copied from:
583 http://techknack.net/python-urllib2-handlers/
585 Andrew Rowls, the author of that code, agreed to release it to the
589 def __init__(self
, params
, *args
, **kwargs
):
590 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
591 self
._params
= params
593 def http_open(self
, req
):
594 return self
.do_open(functools
.partial(
595 _create_http_connection
, self
, compat_http_client
.HTTPConnection
, False),
601 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
603 return zlib
.decompress(data
)
606 def addinfourl_wrapper(stream
, headers
, url
, code
):
607 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
608 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
609 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
613 def http_request(self
, req
):
614 for h
, v
in std_headers
.items():
615 if h
not in req
.headers
:
617 if 'Youtubedl-no-compression' in req
.headers
:
618 if 'Accept-encoding' in req
.headers
:
619 del req
.headers
['Accept-encoding']
620 del req
.headers
['Youtubedl-no-compression']
621 if 'Youtubedl-user-agent' in req
.headers
:
622 if 'User-agent' in req
.headers
:
623 del req
.headers
['User-agent']
624 req
.headers
['User-agent'] = req
.headers
['Youtubedl-user-agent']
625 del req
.headers
['Youtubedl-user-agent']
627 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
628 # Python 2.6 is brain-dead when it comes to fragments
629 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
630 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
634 def http_response(self
, req
, resp
):
637 if resp
.headers
.get('Content-encoding', '') == 'gzip':
638 content
= resp
.read()
639 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
641 uncompressed
= io
.BytesIO(gz
.read())
642 except IOError as original_ioerror
:
643 # There may be junk add the end of the file
644 # See http://stackoverflow.com/q/4928560/35070 for details
645 for i
in range(1, 1024):
647 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
648 uncompressed
= io
.BytesIO(gz
.read())
653 raise original_ioerror
654 resp
= self
.addinfourl_wrapper(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
655 resp
.msg
= old_resp
.msg
657 if resp
.headers
.get('Content-encoding', '') == 'deflate':
658 gz
= io
.BytesIO(self
.deflate(resp
.read()))
659 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
660 resp
.msg
= old_resp
.msg
663 https_request
= http_request
664 https_response
= http_response
667 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
668 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
669 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
670 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
671 self
._params
= params
673 def https_open(self
, req
):
674 return self
.do_open(functools
.partial(
675 _create_http_connection
, self
, self
._https
_conn
_class
, True),
679 def parse_iso8601(date_str
, delimiter
='T'):
680 """ Return a UNIX timestamp from the given date """
686 r
'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
689 timezone
= datetime
.timedelta()
691 date_str
= date_str
[:-len(m
.group(0))]
692 if not m
.group('sign'):
693 timezone
= datetime
.timedelta()
695 sign
= 1 if m
.group('sign') == '+' else -1
696 timezone
= datetime
.timedelta(
697 hours
=sign
* int(m
.group('hours')),
698 minutes
=sign
* int(m
.group('minutes')))
699 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
700 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
701 return calendar
.timegm(dt
.timetuple())
704 def unified_strdate(date_str
, day_first
=True):
705 """Return a string with the date in the format YYYYMMDD"""
711 date_str
= date_str
.replace(',', ' ')
712 # %z (UTC offset) is only supported in python>=3.2
713 date_str
= re
.sub(r
' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str
)
714 # Remove AM/PM + timezone
715 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str
)
717 format_expressions
= [
722 '%b %dst %Y %I:%M%p',
723 '%b %dnd %Y %I:%M%p',
724 '%b %dth %Y %I:%M%p',
730 '%Y-%m-%d %H:%M:%S.%f',
733 '%Y-%m-%dT%H:%M:%SZ',
734 '%Y-%m-%dT%H:%M:%S.%fZ',
735 '%Y-%m-%dT%H:%M:%S.%f0Z',
737 '%Y-%m-%dT%H:%M:%S.%f',
741 format_expressions
.extend([
748 format_expressions
.extend([
754 for expression
in format_expressions
:
756 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
759 if upload_date
is None:
760 timetuple
= email
.utils
.parsedate_tz(date_str
)
762 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
766 def determine_ext(url
, default_ext
='unknown_video'):
769 guess
= url
.partition('?')[0].rpartition('.')[2]
770 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: the base of *filename* (extension stripped)
    joined with the subtitle language and format, dot-separated."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
780 def date_from_str(date_str
):
782 Return a datetime object from a string in the format YYYYMMDD or
783 (now|today)[+-][0-9](day|week|month|year)(s)?"""
784 today
= datetime
.date
.today()
785 if date_str
in ('now', 'today'):
787 if date_str
== 'yesterday':
788 return today
- datetime
.timedelta(days
=1)
789 match
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
790 if match
is not None:
791 sign
= match
.group('sign')
792 time
= int(match
.group('time'))
795 unit
= match
.group('unit')
796 # A bad aproximation?
804 delta
= datetime
.timedelta(**{unit
: time
})
806 return datetime
.datetime
.strptime(date_str
, "%Y%m%d").date()
809 def hyphenate_date(date_str
):
811 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
812 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
813 if match
is not None:
814 return '-'.join(match
.groups())
819 class DateRange(object):
820 """Represents a time interval between two dates"""
822 def __init__(self
, start
=None, end
=None):
823 """start and end must be strings in the format accepted by date"""
824 if start
is not None:
825 self
.start
= date_from_str(start
)
827 self
.start
= datetime
.datetime
.min.date()
829 self
.end
= date_from_str(end
)
831 self
.end
= datetime
.datetime
.max.date()
832 if self
.start
> self
.end
:
833 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
837 """Returns a range that only contains the given day"""
840 def __contains__(self
, date
):
841 """Check if the date is in the range"""
842 if not isinstance(date
, datetime
.date
):
843 date
= date_from_str(date
)
844 return self
.start
<= date
<= self
.end
847 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
851 """ Returns the platform name as a compat_str """
852 res
= platform
.platform()
853 if isinstance(res
, bytes):
854 res
= res
.decode(preferredencoding())
856 assert isinstance(res
, compat_str
)
860 def _windows_write_string(s
, out
):
861 """ Returns True if the string was written using special methods,
862 False if it has yet to be written out."""
863 # Adapted from http://stackoverflow.com/a/3259271/35070
866 import ctypes
.wintypes
874 fileno
= out
.fileno()
875 except AttributeError:
876 # If the output stream doesn't have a fileno, it's virtual
878 if fileno
not in WIN_OUTPUT_IDS
:
881 GetStdHandle
= ctypes
.WINFUNCTYPE(
882 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
883 (b
"GetStdHandle", ctypes
.windll
.kernel32
))
884 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
886 WriteConsoleW
= ctypes
.WINFUNCTYPE(
887 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
888 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
889 ctypes
.wintypes
.LPVOID
)((b
"WriteConsoleW", ctypes
.windll
.kernel32
))
890 written
= ctypes
.wintypes
.DWORD(0)
892 GetFileType
= ctypes
.WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)((b
"GetFileType", ctypes
.windll
.kernel32
))
893 FILE_TYPE_CHAR
= 0x0002
894 FILE_TYPE_REMOTE
= 0x8000
895 GetConsoleMode
= ctypes
.WINFUNCTYPE(
896 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
897 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
898 (b
"GetConsoleMode", ctypes
.windll
.kernel32
))
899 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
901 def not_a_console(handle
):
902 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
904 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
905 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
910 def next_nonbmp_pos(s
):
912 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
913 except StopIteration:
917 count
= min(next_nonbmp_pos(s
), 1024)
920 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
922 raise OSError('Failed to write string')
923 if not count
: # We just wrote a non-BMP character
924 assert written
.value
== 2
927 assert written
.value
> 0
928 s
= s
[written
.value
:]
932 def write_string(s
, out
=None, encoding
=None):
935 assert type(s
) == compat_str
937 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
938 if _windows_write_string(s
, out
):
941 if ('b' in getattr(out
, 'mode', '') or
942 sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
943 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
945 elif hasattr(out
, 'buffer'):
946 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
947 byt
= s
.encode(enc
, 'ignore')
948 out
.buffer.write(byt
)
954 def bytes_to_intlist(bs
):
957 if isinstance(bs
[0], int): # Python 3
960 return [ord(c
) for c
in bs
]
963 def intlist_to_bytes(xs
):
966 return struct_pack('%dB' % len(xs
), *xs
)
969 # Cross-platform file locking
970 if sys
.platform
== 'win32':
971 import ctypes
.wintypes
974 class OVERLAPPED(ctypes
.Structure
):
976 ('Internal', ctypes
.wintypes
.LPVOID
),
977 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
978 ('Offset', ctypes
.wintypes
.DWORD
),
979 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
980 ('hEvent', ctypes
.wintypes
.HANDLE
),
983 kernel32
= ctypes
.windll
.kernel32
984 LockFileEx
= kernel32
.LockFileEx
985 LockFileEx
.argtypes
= [
986 ctypes
.wintypes
.HANDLE
, # hFile
987 ctypes
.wintypes
.DWORD
, # dwFlags
988 ctypes
.wintypes
.DWORD
, # dwReserved
989 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
990 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
991 ctypes
.POINTER(OVERLAPPED
) # Overlapped
993 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
994 UnlockFileEx
= kernel32
.UnlockFileEx
995 UnlockFileEx
.argtypes
= [
996 ctypes
.wintypes
.HANDLE
, # hFile
997 ctypes
.wintypes
.DWORD
, # dwReserved
998 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
999 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1000 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1002 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
1003 whole_low
= 0xffffffff
1004 whole_high
= 0x7fffffff
1006 def _lock_file(f
, exclusive
):
1007 overlapped
= OVERLAPPED()
1008 overlapped
.Offset
= 0
1009 overlapped
.OffsetHigh
= 0
1010 overlapped
.hEvent
= 0
1011 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
1012 handle
= msvcrt
.get_osfhandle(f
.fileno())
1013 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
1014 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1015 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
1017 def _unlock_file(f
):
1018 assert f
._lock
_file
_overlapped
_p
1019 handle
= msvcrt
.get_osfhandle(f
.fileno())
1020 if not UnlockFileEx(handle
, 0,
1021 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1022 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
1027 def _lock_file(f
, exclusive
):
1028 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
1030 def _unlock_file(f
):
1031 fcntl
.flock(f
, fcntl
.LOCK_UN
)
1034 class locked_file(object):
1035 def __init__(self
, filename
, mode
, encoding
=None):
1036 assert mode
in ['r', 'a', 'w']
1037 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
1040 def __enter__(self
):
1041 exclusive
= self
.mode
!= 'r'
1043 _lock_file(self
.f
, exclusive
)
1049 def __exit__(self
, etype
, value
, traceback
):
1051 _unlock_file(self
.f
)
1058 def write(self
, *args
):
1059 return self
.f
.write(*args
)
1061 def read(self
, *args
):
1062 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return the file system encoding, falling back to 'utf-8' when the
    interpreter reports None (possible on some old Python 2 platforms)."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
1070 def shell_quote(args
):
1072 encoding
= get_filesystem_encoding()
1074 if isinstance(a
, bytes):
1075 # We may get a filename encoded with 'encodeFilename'
1076 a
= a
.decode(encoding
)
1077 quoted_args
.append(pipes
.quote(a
))
1078 return ' '.join(quoted_args
)
1081 def takewhile_inclusive(pred
, seq
):
1082 """ Like itertools.takewhile, but include the latest evaluated element
1083 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Serialize the payload as JSON and tuck it into the URL fragment, where
    # servers ignore it; unsmuggle_url() performs the reverse operation.
    payload = json.dumps(data)
    sdata = compat_urllib_parse.urlencode({'__youtubedl_smuggle': payload})
    return '%s#%s' % (url, sdata)
1098 def unsmuggle_url(smug_url
, default
=None):
1099 if '#__youtubedl_smuggle' not in smug_url
:
1100 return smug_url
, default
1101 url
, _
, sdata
= smug_url
.rpartition('#')
1102 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
1103 data
= json
.loads(jsond
)
1107 def format_bytes(bytes):
1110 if type(bytes) is str:
1111 bytes = float(bytes)
1115 exponent
= int(math
.log(bytes, 1024.0))
1116 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
1117 converted
= float(bytes) / float(1024 ** exponent
)
1118 return '%.2f%s' % (converted
, suffix
)
1121 def parse_filesize(s
):
1125 # The lower-case forms are of course incorrect and inofficial,
1126 # but we support those too
1164 units_re
= '|'.join(re
.escape(u
) for u
in _UNIT_TABLE
)
1166 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
, s
)
1170 num_str
= m
.group('num').replace(',', '.')
1171 mult
= _UNIT_TABLE
[m
.group('unit')]
1172 return int(float(num_str
) * mult
)
1175 def get_term_width():
1176 columns
= compat_getenv('COLUMNS', None)
1181 sp
= subprocess
.Popen(
1183 stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
)
1184 out
, err
= sp
.communicate()
1185 return int(out
.split()[1])
1191 def month_by_name(name
):
1192 """ Return the number of a month by (locale-independently) English name """
1195 'January', 'February', 'March', 'April', 'May', 'June',
1196 'July', 'August', 'September', 'October', 'November', 'December']
1198 return ENGLISH_NAMES
.index(name
) + 1
1203 def fix_xml_ampersands(xml_str
):
1204 """Replace all the '&' by '&' in XML"""
1206 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1211 def setproctitle(title
):
1212 assert isinstance(title
, compat_str
)
1214 libc
= ctypes
.cdll
.LoadLibrary("libc.so.6")
1217 title_bytes
= title
.encode('utf-8')
1218 buf
= ctypes
.create_string_buffer(len(title_bytes
))
1219 buf
.value
= title_bytes
1221 libc
.prctl(15, buf
, 0, 0, 0)
1222 except AttributeError:
1223 return # Strange libc, just skip this
1226 def remove_start(s
, start
):
1227 if s
.startswith(start
):
1228 return s
[len(start
):]
1232 def remove_end(s
, end
):
1234 return s
[:-len(end
)]
def url_basename(url):
    """Return the last path component of *url* ('' when the path is empty),
    ignoring query string and fragment."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
1243 class HEADRequest(compat_urllib_request
.Request
):
1244 def get_method(self
):
1248 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
1251 v
= getattr(v
, get_attr
, None)
1254 return default
if v
is None else (int(v
) * invscale
// scale
)
def str_or_none(v, default=None):
    """Coerce *v* to a text string, returning *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1261 def str_to_int(int_str
):
1262 """ A more relaxed version of int_or_none """
1265 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to a float scaled by invscale/scale.

    Returns *default* when *v* is None or cannot be converted to a float,
    keeping the "or none" contract instead of raising on bad input.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # e.g. v is a non-numeric string scraped from a page
        return default
1273 def parse_duration(s
):
1274 if not isinstance(s
, basestring
if sys
.version_info
< (3, 0) else compat_str
):
1282 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1283 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1286 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1287 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1289 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1294 if m
.group('only_mins'):
1295 return float_or_none(m
.group('only_mins'), invscale
=60)
1296 if m
.group('only_hours'):
1297 return float_or_none(m
.group('only_hours'), invscale
=60 * 60)
1299 res
+= int(m
.group('secs'))
1301 res
+= int(m
.group('mins')) * 60
1302 if m
.group('hours'):
1303 res
+= int(m
.group('hours')) * 60 * 60
1305 res
+= float(m
.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* in front of the real extension of *filename*,
    e.g. ('video.mp4', 'part') -> 'video.part.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (stem, ext, real_ext)
1314 def check_executable(exe
, args
=[]):
1315 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1316 args can be a list of arguments for a short output (like -version) """
1318 subprocess
.Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate()
1324 def get_exe_version(exe
, args
=['--version'],
1325 version_re
=None, unrecognized
='present'):
1326 """ Returns the version of the specified executable,
1327 or False if the executable is not present """
1329 out
, _
= subprocess
.Popen(
1331 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate()
1334 if isinstance(out
, bytes): # Python 2.x
1335 out
= out
.decode('ascii', 'ignore')
1336 return detect_exe_version(out
, version_re
, unrecognized
)
1339 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
1340 assert isinstance(output
, compat_str
)
1341 if version_re
is None:
1342 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
1343 m
= re
.search(version_re
, output
)
1350 class PagedList(object):
1352 # This is only useful for tests
1353 return len(self
.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via pagefunc(pagenum) as needed."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the elements in [start, end) (end=None means "to the end")."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted element within this page (0 unless
            # the requested range starts inside the page).
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # One-past-the-last wanted element within this page, or None
            # when the whole tail of the page is wanted.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the elements in [start, end) (end=None means "to the end")."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to drop from the first fetched page.
        skip_elems = start - start_page * self._pagesize
        # Total number of elements still wanted (None = unlimited).
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the request; truncate and stop.
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences in s into real characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 quote() needs a byte string.  The version check MUST come
    # first: the name `unicode` does not exist on Python 3, and the `and`
    # short-circuits before evaluating it there.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    # The safe set keeps RFC 3986 reserved / sub-delimiter characters intact
    # so already-valid URL structure is not double-escaped.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component separately so the URL structure (scheme, netloc,
    # delimiters) survives, then reassemble back into a string with geturl().
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
try:
    # Probe: old interpreters reject text format strings for struct.
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern interpreters accept str format specs directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of cleaned-up URLs.

    Decodes bytes lines, strips a UTF-8 BOM and whitespace, and drops blank
    lines and comment lines starting with '#', ';' or ']'.  Closes batch_fd.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            # Comment line -> filtered out below (falsy).
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes suitable for a request body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    # Element.iter was added in 2.7; findall('.//*') misses the root element
    # itself but is the closest available fallback.
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse the XML document in string s, ignoring doctypes, and return the root Element."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Passing a custom parser via keyword is only supported from 2.7 on.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x: promote plain-str text nodes to unicode.
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int.

    Returns None for None input; falls back to the US_RATINGS table
    (e.g. 'PG-13') when the string is not a plain number.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper -- callback(<payload>); plus trailing // comments -- down to <payload>."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object/array literal into parseable JSON.

    Handles single-quoted strings, bare identifier keys, and trailing
    commas before ']'.  true/false/null and double-quoted strings pass
    through unchanged.
    """
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Translate JS single-quote escaping into JSON double-quote escaping.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before a closing bracket.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ids sort below every known one.
            return -1
    return q
# Default output filename template, e.g. "Some Title-abc123.mp4".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so that the result, ellipses included, fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Turn a version string like '2014.12.06-1' into a tuple of ints for comparison."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when `version` is older than `limit`.

    For an empty/None or unparseable version, returns the pessimistic/optimistic
    default controlled by assume_new (True => treat unknown as up to date).
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    # Updateable when running from a zip bundle or a frozen (py2exe-style) build.
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    """Get a short, shell-quoted one-line representation for a subprocess command."""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a URL handle's Content-Type header (e.g. 'video/mp4' -> 'mp4')."""
    try:
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    return getheader('Content-Type').split("/")[1]
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit