2 # -*- coding: utf-8 -*-
4 from __future__
import unicode_literals
32 import xml
.etree
.ElementTree
42 compat_socket_create_connection
,
46 compat_urllib_parse_urlparse
,
47 compat_urllib_request
,
53 # This is not clearly defined otherwise
54 compiled_regex_type
= type(re
.compile(''))
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
# Locale-independent English month names; list index + 1 is the month number.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
70 def preferredencoding():
71 """Get preferred encoding.
73 Returns the best encoding scheme for the system, based on
74 locale.getpreferredencoding() and some further tweaks.
77 pref
= locale
.getpreferredencoding()
85 def write_json_file(obj
, fn
):
86 """ Encode obj as JSON and write it to fn, atomically if possible """
88 fn
= encodeFilename(fn
)
89 if sys
.version_info
< (3, 0) and sys
.platform
!= 'win32':
90 encoding
= get_filesystem_encoding()
91 # os.path.basename returns a bytes object, but NamedTemporaryFile
92 # will fail if the filename contains non ascii characters unless we
93 # use a unicode object
94 path_basename
= lambda f
: os
.path
.basename(fn
).decode(encoding
)
95 # the same for os.path.dirname
96 path_dirname
= lambda f
: os
.path
.dirname(fn
).decode(encoding
)
98 path_basename
= os
.path
.basename
99 path_dirname
= os
.path
.dirname
103 'prefix': path_basename(fn
) + '.',
104 'dir': path_dirname(fn
),
108 # In Python 2.x, json.dump expects a bytestream.
109 # In Python 3.x, it writes to a character stream
110 if sys
.version_info
< (3, 0):
118 tf
= tempfile
.NamedTemporaryFile(**compat_kwargs(args
))
123 if sys
.platform
== 'win32':
124 # Need to remove existing file on Windows, else os.rename raises
125 # WindowsError or FileExistsError.
130 os
.rename(tf
.name
, fn
)
139 if sys
.version_info
>= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Restrict key and val to safe character sets so the XPath
        # expression interpolated below cannot be malformed or escape
        # the attribute predicate.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + "[@%s='%s']" % (key, val)
        return node.find(expr)
147 def find_xpath_attr(node
, xpath
, key
, val
):
148 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
149 # .//node does not match if a node is a direct child of . !
150 if isinstance(xpath
, compat_str
):
151 xpath
= xpath
.encode('ascii')
153 for f
in node
.findall(xpath
):
154 if f
.attrib
.get(key
) == val
:
158 # On python2.6 the xml.etree.ElementTree.Element methods don't support
159 # the namespace parameter
162 def xpath_with_ns(path
, ns_map
):
163 components
= [c
.split(':') for c
in path
.split('/')]
167 replaced
.append(c
[0])
170 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
171 return '/'.join(replaced
)
174 def xpath_text(node
, xpath
, name
=None, fatal
=False):
175 if sys
.version_info
< (2, 7): # Crazy 2.6
176 xpath
= xpath
.encode('ascii')
179 if n
is None or n
.text
is None:
181 name
= xpath
if name
is None else name
182 raise ExtractorError('Could not find XML element %s' % name
)
188 def get_element_by_id(id, html
):
189 """Return the content of the tag with the specified ID in the passed HTML document"""
190 return get_element_by_attribute("id", id, html
)
193 def get_element_by_attribute(attribute
, value
, html
):
194 """Return the content of the tag with the specified attribute in the passed HTML document"""
196 m
= re
.search(r
'''(?xs)
198 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
200 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
204 ''' % (re
.escape(attribute
), re
.escape(value
)), html
)
208 res
= m
.group('content')
210 if res
.startswith('"') or res
.startswith("'"):
213 return unescapeHTML(res
)
216 def clean_html(html
):
217 """Clean an HTML snippet into a readable string"""
219 if html
is None: # Convenience for sanitizing descriptions etc.
223 html
= html
.replace('\n', ' ')
224 html
= re
.sub(r
'\s*<\s*br\s*/?\s*>\s*', '\n', html
)
225 html
= re
.sub(r
'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html
)
227 html
= re
.sub('<.*?>', '', html
)
228 # Replace html entities
229 html
= unescapeHTML(html
)
233 def sanitize_open(filename
, open_mode
):
234 """Try to open the given filename, and slightly tweak it if this fails.
236 Attempts to open the given filename. If this fails, it tries to change
237 the filename slightly, step by step, until it's either able to open it
238 or it fails and raises a final exception, like the standard open()
241 It returns the tuple (stream, definitive_file_name).
245 if sys
.platform
== 'win32':
247 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
248 return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
)
249 stream
= open(encodeFilename(filename
), open_mode
)
250 return (stream
, filename
)
251 except (IOError, OSError) as err
:
252 if err
.errno
in (errno
.EACCES
,):
255 # In case of error, try to remove win32 forbidden chars
256 alt_filename
= sanitize_path(filename
)
257 if alt_filename
== filename
:
260 # An exception here should be caught in the caller
261 stream
= open(encodeFilename(alt_filename
), open_mode
)
262 return (stream
, alt_filename
)
265 def timeconvert(timestr
):
266 """Convert RFC 2822 defined time string into system timestamp"""
268 timetuple
= email
.utils
.parsedate_tz(timestr
)
269 if timetuple
is not None:
270 timestamp
= email
.utils
.mktime_tz(timetuple
)
274 def sanitize_filename(s
, restricted
=False, is_id
=False):
275 """Sanitizes a string so it could be used as part of a filename.
276 If restricted is set, use a stricter subset of allowed characters.
277 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
279 def replace_insane(char
):
280 if char
== '?' or ord(char
) < 32 or ord(char
) == 127:
283 return '' if restricted
else '\''
285 return '_-' if restricted
else ' -'
286 elif char
in '\\/|*<>':
288 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace()):
290 if restricted
and ord(char
) > 127:
295 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
296 result
= ''.join(map(replace_insane
, s
))
298 while '__' in result
:
299 result
= result
.replace('__', '_')
300 result
= result
.strip('_')
301 # Common case of "Foreign band name - English song title"
302 if restricted
and result
.startswith('-_'):
304 if result
.startswith('-'):
305 result
= '_' + result
[len('-'):]
306 result
= result
.lstrip('.')
312 def sanitize_path(s
):
313 """Sanitizes and normalizes path on Windows"""
314 if sys
.platform
!= 'win32':
316 drive_or_unc
, _
= os
.path
.splitdrive(s
)
317 if sys
.version_info
< (2, 7) and not drive_or_unc
:
318 drive_or_unc
, _
= os
.path
.splitunc(s
)
319 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
323 path_part
if path_part
in ['.', '..'] else re
.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part
)
324 for path_part
in norm_path
]
326 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
327 return os
.path
.join(*sanitized_path
)
330 def orderedSet(iterable
):
331 """ Remove all duplicates from the input iterable """
339 def _htmlentity_transform(entity
):
340 """Transforms an HTML entity to a character."""
341 # Known non-numeric HTML entity
342 if entity
in compat_html_entities
.name2codepoint
:
343 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
345 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
347 numstr
= mobj
.group(1)
348 if numstr
.startswith('x'):
350 numstr
= '0%s' % numstr
353 return compat_chr(int(numstr
, base
))
355 # Unknown entity in name, return its literal representation
356 return ('&%s;' % entity
)
362 assert type(s
) == compat_str
365 r
'&([^;]+);', lambda m
: _htmlentity_transform(m
.group(1)), s
)
368 def get_subprocess_encoding():
369 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
370 # For subprocess calls, encode with locale encoding
371 # Refer to http://stackoverflow.com/a/9951851/35070
372 encoding
= preferredencoding()
374 encoding
= sys
.getfilesystemencoding()
380 def encodeFilename(s
, for_subprocess
=False):
382 @param s The name of the file
385 assert type(s
) == compat_str
387 # Python 3 has a Unicode API
388 if sys
.version_info
>= (3, 0):
391 # Pass '' directly to use Unicode APIs on Windows 2000 and up
392 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
393 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
394 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
397 return s
.encode(get_subprocess_encoding(), 'ignore')
400 def decodeFilename(b
, for_subprocess
=False):
402 if sys
.version_info
>= (3, 0):
405 if not isinstance(b
, bytes):
408 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument via encodeFilename(s, for_subprocess=True)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    # Thin wrapper around decodeFilename with for_subprocess=True,
    # mirroring encodeArgument.
    return decodeFilename(b, True)
424 def decodeOption(optval
):
427 if isinstance(optval
, bytes):
428 optval
= optval
.decode(preferredencoding())
430 assert isinstance(optval
, compat_str
)
434 def formatSeconds(secs
):
436 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
438 return '%d:%02d' % (secs
// 60, secs
% 60)
443 def make_HTTPS_handler(params
, **kwargs
):
444 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
445 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
446 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
447 if opts_no_check_certificate
:
448 context
.check_hostname
= False
449 context
.verify_mode
= ssl
.CERT_NONE
451 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
454 # (create_default_context present but HTTPSHandler has no context=)
457 if sys
.version_info
< (3, 2):
458 return YoutubeDLHTTPSHandler(params
, **kwargs
)
460 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
461 context
.verify_mode
= (ssl
.CERT_NONE
462 if opts_no_check_certificate
463 else ssl
.CERT_REQUIRED
)
464 context
.set_default_verify_paths()
465 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
468 def bug_reports_message():
469 if ytdl_is_updateable():
470 update_cmd
= 'type youtube-dl -U to update'
472 update_cmd
= 'see https://yt-dl.org/update on how to update'
473 msg
= '; please report this issue on https://yt-dl.org/bug .'
474 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
475 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
479 class ExtractorError(Exception):
480 """Error during info extraction."""
482 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
483 """ tb, if given, is the original traceback (so that it can be printed out).
484 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
487 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
489 if video_id
is not None:
490 msg
= video_id
+ ': ' + msg
492 msg
+= ' (caused by %r)' % cause
494 msg
+= bug_reports_message()
495 super(ExtractorError
, self
).__init
__(msg
)
498 self
.exc_info
= sys
.exc_info() # preserve original exception
500 self
.video_id
= video_id
502 def format_traceback(self
):
503 if self
.traceback
is None:
505 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """ExtractorError raised as 'Unsupported URL: <url>' with expected=True."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
515 class RegexNotFoundError(ExtractorError
):
516 """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Keep the original sys.exc_info() tuple for later reporting.
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
543 class PostProcessingError(Exception):
544 """Post Processing exception.
546 This exception may be raised by PostProcessor's .run() method to
547 indicate an error in the postprocessing task.
550 def __init__(self
, msg
):
554 class MaxDownloadsReached(Exception):
555 """ --max-downloads limit has been reached. """
559 class UnavailableVideoError(Exception):
560 """Unavailable Format exception.
562 This exception will be thrown when a video is requested
563 in a format that is not available for that video.
568 class ContentTooShortError(Exception):
569 """Content Too Short exception.
571 This exception may be raised by FileDownloader objects when a file they
572 download is too small for what the server announced first, indicating
573 the connection was probably interrupted.
579 def __init__(self
, downloaded
, expected
):
580 self
.downloaded
= downloaded
581 self
.expected
= expected
584 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
585 hc
= http_class(*args
, **kwargs
)
586 source_address
= ydl_handler
._params
.get('source_address')
587 if source_address
is not None:
588 sa
= (source_address
, 0)
589 if hasattr(hc
, 'source_address'): # Python 2.7+
590 hc
.source_address
= sa
592 def _hc_connect(self
, *args
, **kwargs
):
593 sock
= compat_socket_create_connection(
594 (self
.host
, self
.port
), self
.timeout
, sa
)
596 self
.sock
= ssl
.wrap_socket(
597 sock
, self
.key_file
, self
.cert_file
,
598 ssl_version
=ssl
.PROTOCOL_TLSv1
)
601 hc
.connect
= functools
.partial(_hc_connect
, hc
)
606 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
607 """Handler for HTTP requests and responses.
609 This class, when installed with an OpenerDirector, automatically adds
610 the standard headers to every HTTP request and handles gzipped and
611 deflated responses from web servers. If compression is to be avoided in
612 a particular request, the original request in the program code only has
613 to include the HTTP header "Youtubedl-No-Compression", which will be
614 removed before making the real request.
616 Part of this code was copied from:
618 http://techknack.net/python-urllib2-handlers/
620 Andrew Rowls, the author of that code, agreed to release it to the
624 def __init__(self
, params
, *args
, **kwargs
):
625 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
626 self
._params
= params
628 def http_open(self
, req
):
629 return self
.do_open(functools
.partial(
630 _create_http_connection
, self
, compat_http_client
.HTTPConnection
, False),
636 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
638 return zlib
.decompress(data
)
641 def addinfourl_wrapper(stream
, headers
, url
, code
):
642 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
643 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
644 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
648 def http_request(self
, req
):
649 for h
, v
in std_headers
.items():
650 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
651 # The dict keys are capitalized because of this bug by urllib
652 if h
.capitalize() not in req
.headers
:
654 if 'Youtubedl-no-compression' in req
.headers
:
655 if 'Accept-encoding' in req
.headers
:
656 del req
.headers
['Accept-encoding']
657 del req
.headers
['Youtubedl-no-compression']
659 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
660 # Python 2.6 is brain-dead when it comes to fragments
661 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
662 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
666 def http_response(self
, req
, resp
):
669 if resp
.headers
.get('Content-encoding', '') == 'gzip':
670 content
= resp
.read()
671 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
673 uncompressed
= io
.BytesIO(gz
.read())
674 except IOError as original_ioerror
:
675 # There may be junk add the end of the file
676 # See http://stackoverflow.com/q/4928560/35070 for details
677 for i
in range(1, 1024):
679 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
680 uncompressed
= io
.BytesIO(gz
.read())
685 raise original_ioerror
686 resp
= self
.addinfourl_wrapper(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
687 resp
.msg
= old_resp
.msg
689 if resp
.headers
.get('Content-encoding', '') == 'deflate':
690 gz
= io
.BytesIO(self
.deflate(resp
.read()))
691 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
692 resp
.msg
= old_resp
.msg
695 https_request
= http_request
696 https_response
= http_response
699 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
700 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
701 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
702 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
703 self
._params
= params
705 def https_open(self
, req
):
707 if hasattr(self
, '_context'): # python > 2.6
708 kwargs
['context'] = self
._context
709 if hasattr(self
, '_check_hostname'): # python 3.x
710 kwargs
['check_hostname'] = self
._check
_hostname
711 return self
.do_open(functools
.partial(
712 _create_http_connection
, self
, self
._https
_conn
_class
, True),
716 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
717 """ Return a UNIX timestamp from the given date """
724 r
'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
727 timezone
= datetime
.timedelta()
729 date_str
= date_str
[:-len(m
.group(0))]
730 if not m
.group('sign'):
731 timezone
= datetime
.timedelta()
733 sign
= 1 if m
.group('sign') == '+' else -1
734 timezone
= datetime
.timedelta(
735 hours
=sign
* int(m
.group('hours')),
736 minutes
=sign
* int(m
.group('minutes')))
737 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
738 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
739 return calendar
.timegm(dt
.timetuple())
742 def unified_strdate(date_str
, day_first
=True):
743 """Return a string with the date in the format YYYYMMDD"""
749 date_str
= date_str
.replace(',', ' ')
750 # %z (UTC offset) is only supported in python>=3.2
751 if not re
.match(r
'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str
):
752 date_str
= re
.sub(r
' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str
)
753 # Remove AM/PM + timezone
754 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
756 format_expressions
= [
761 '%b %dst %Y %I:%M%p',
762 '%b %dnd %Y %I:%M%p',
763 '%b %dth %Y %I:%M%p',
769 '%Y-%m-%d %H:%M:%S.%f',
772 '%Y-%m-%dT%H:%M:%SZ',
773 '%Y-%m-%dT%H:%M:%S.%fZ',
774 '%Y-%m-%dT%H:%M:%S.%f0Z',
776 '%Y-%m-%dT%H:%M:%S.%f',
780 format_expressions
.extend([
788 format_expressions
.extend([
795 for expression
in format_expressions
:
797 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
800 if upload_date
is None:
801 timetuple
= email
.utils
.parsedate_tz(date_str
)
803 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
807 def determine_ext(url
, default_ext
='unknown_video'):
810 guess
= url
.partition('?')[0].rpartition('.')[2]
811 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: the stem of *filename* plus
    '.<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
821 def date_from_str(date_str
):
823 Return a datetime object from a string in the format YYYYMMDD or
824 (now|today)[+-][0-9](day|week|month|year)(s)?"""
825 today
= datetime
.date
.today()
826 if date_str
in ('now', 'today'):
828 if date_str
== 'yesterday':
829 return today
- datetime
.timedelta(days
=1)
830 match
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
831 if match
is not None:
832 sign
= match
.group('sign')
833 time
= int(match
.group('time'))
836 unit
= match
.group('unit')
837 # A bad aproximation?
845 delta
= datetime
.timedelta(**{unit
: time
})
847 return datetime
.datetime
.strptime(date_str
, "%Y%m%d").date()
850 def hyphenate_date(date_str
):
852 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
853 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
854 if match
is not None:
855 return '-'.join(match
.groups())
860 class DateRange(object):
861 """Represents a time interval between two dates"""
863 def __init__(self
, start
=None, end
=None):
864 """start and end must be strings in the format accepted by date"""
865 if start
is not None:
866 self
.start
= date_from_str(start
)
868 self
.start
= datetime
.datetime
.min.date()
870 self
.end
= date_from_str(end
)
872 self
.end
= datetime
.datetime
.max.date()
873 if self
.start
> self
.end
:
874 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
878 """Returns a range that only contains the given day"""
881 def __contains__(self
, date
):
882 """Check if the date is in the range"""
883 if not isinstance(date
, datetime
.date
):
884 date
= date_from_str(date
)
885 return self
.start
<= date
<= self
.end
888 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
892 """ Returns the platform name as a compat_str """
893 res
= platform
.platform()
894 if isinstance(res
, bytes):
895 res
= res
.decode(preferredencoding())
897 assert isinstance(res
, compat_str
)
901 def _windows_write_string(s
, out
):
902 """ Returns True if the string was written using special methods,
903 False if it has yet to be written out."""
904 # Adapted from http://stackoverflow.com/a/3259271/35070
907 import ctypes
.wintypes
915 fileno
= out
.fileno()
916 except AttributeError:
917 # If the output stream doesn't have a fileno, it's virtual
919 except io
.UnsupportedOperation
:
920 # Some strange Windows pseudo files?
922 if fileno
not in WIN_OUTPUT_IDS
:
925 GetStdHandle
= ctypes
.WINFUNCTYPE(
926 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
927 (b
"GetStdHandle", ctypes
.windll
.kernel32
))
928 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
930 WriteConsoleW
= ctypes
.WINFUNCTYPE(
931 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
932 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
933 ctypes
.wintypes
.LPVOID
)((b
"WriteConsoleW", ctypes
.windll
.kernel32
))
934 written
= ctypes
.wintypes
.DWORD(0)
936 GetFileType
= ctypes
.WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)((b
"GetFileType", ctypes
.windll
.kernel32
))
937 FILE_TYPE_CHAR
= 0x0002
938 FILE_TYPE_REMOTE
= 0x8000
939 GetConsoleMode
= ctypes
.WINFUNCTYPE(
940 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
941 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
942 (b
"GetConsoleMode", ctypes
.windll
.kernel32
))
943 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
945 def not_a_console(handle
):
946 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
948 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
or
949 GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
954 def next_nonbmp_pos(s
):
956 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
957 except StopIteration:
961 count
= min(next_nonbmp_pos(s
), 1024)
964 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
966 raise OSError('Failed to write string')
967 if not count
: # We just wrote a non-BMP character
968 assert written
.value
== 2
971 assert written
.value
> 0
972 s
= s
[written
.value
:]
976 def write_string(s
, out
=None, encoding
=None):
979 assert type(s
) == compat_str
981 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
982 if _windows_write_string(s
, out
):
985 if ('b' in getattr(out
, 'mode', '') or
986 sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
987 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
989 elif hasattr(out
, 'buffer'):
990 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
991 byt
= s
.encode(enc
, 'ignore')
992 out
.buffer.write(byt
)
998 def bytes_to_intlist(bs
):
1001 if isinstance(bs
[0], int): # Python 3
1004 return [ord(c
) for c
in bs
]
1007 def intlist_to_bytes(xs
):
1010 return struct_pack('%dB' % len(xs
), *xs
)
1013 # Cross-platform file locking
1014 if sys
.platform
== 'win32':
1015 import ctypes
.wintypes
1018 class OVERLAPPED(ctypes
.Structure
):
1020 ('Internal', ctypes
.wintypes
.LPVOID
),
1021 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
1022 ('Offset', ctypes
.wintypes
.DWORD
),
1023 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
1024 ('hEvent', ctypes
.wintypes
.HANDLE
),
1027 kernel32
= ctypes
.windll
.kernel32
1028 LockFileEx
= kernel32
.LockFileEx
1029 LockFileEx
.argtypes
= [
1030 ctypes
.wintypes
.HANDLE
, # hFile
1031 ctypes
.wintypes
.DWORD
, # dwFlags
1032 ctypes
.wintypes
.DWORD
, # dwReserved
1033 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1034 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1035 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1037 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
1038 UnlockFileEx
= kernel32
.UnlockFileEx
1039 UnlockFileEx
.argtypes
= [
1040 ctypes
.wintypes
.HANDLE
, # hFile
1041 ctypes
.wintypes
.DWORD
, # dwReserved
1042 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1043 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1044 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1046 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
1047 whole_low
= 0xffffffff
1048 whole_high
= 0x7fffffff
1050 def _lock_file(f
, exclusive
):
1051 overlapped
= OVERLAPPED()
1052 overlapped
.Offset
= 0
1053 overlapped
.OffsetHigh
= 0
1054 overlapped
.hEvent
= 0
1055 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
1056 handle
= msvcrt
.get_osfhandle(f
.fileno())
1057 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
1058 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1059 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
1061 def _unlock_file(f
):
1062 assert f
._lock
_file
_overlapped
_p
1063 handle
= msvcrt
.get_osfhandle(f
.fileno())
1064 if not UnlockFileEx(handle
, 0,
1065 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1066 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
    def _lock_file(f, exclusive):
        # POSIX branch: exclusive -> LOCK_EX (write lock), else LOCK_SH
        # (shared read lock). Blocks until the lock is acquired.
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
    def _unlock_file(f):
        # Release whichever flock() lock is currently held on f.
        fcntl.flock(f, fcntl.LOCK_UN)
1078 class locked_file(object):
1079 def __init__(self
, filename
, mode
, encoding
=None):
1080 assert mode
in ['r', 'a', 'w']
1081 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
1084 def __enter__(self
):
1085 exclusive
= self
.mode
!= 'r'
1087 _lock_file(self
.f
, exclusive
)
1093 def __exit__(self
, etype
, value
, traceback
):
1095 _unlock_file(self
.f
)
    def write(self, *args):
        # Delegate directly to the underlying file object.
        return self.f.write(*args)
    def read(self, *args):
        # Delegate directly to the underlying file object.
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when it is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1114 def shell_quote(args
):
1116 encoding
= get_filesystem_encoding()
1118 if isinstance(a
, bytes):
1119 # We may get a filename encoded with 'encodeFilename'
1120 a
= a
.decode(encoding
)
1121 quoted_args
.append(pipes
.quote(a
))
1122 return ' '.join(quoted_args
)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    smuggled = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '#'.join((url, smuggled))
1133 def unsmuggle_url(smug_url
, default
=None):
1134 if '#__youtubedl_smuggle' not in smug_url
:
1135 return smug_url
, default
1136 url
, _
, sdata
= smug_url
.rpartition('#')
1137 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
1138 data
= json
.loads(jsond
)
1142 def format_bytes(bytes):
1145 if type(bytes) is str:
1146 bytes = float(bytes)
1150 exponent
= int(math
.log(bytes, 1024.0))
1151 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
1152 converted
= float(bytes) / float(1024 ** exponent
)
1153 return '%.2f%s' % (converted
, suffix
)
1156 def parse_filesize(s
):
1160 # The lower-case forms are of course incorrect and inofficial,
1161 # but we support those too
1199 units_re
= '|'.join(re
.escape(u
) for u
in _UNIT_TABLE
)
1201 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
, s
)
1205 num_str
= m
.group('num').replace(',', '.')
1206 mult
= _UNIT_TABLE
[m
.group('unit')]
1207 return int(float(num_str
) * mult
)
1210 def month_by_name(name
):
1211 """ Return the number of a month by (locale-independently) English name """
1214 return ENGLISH_MONTH_NAMES
.index(name
) + 1
1219 def month_by_abbreviation(abbrev
):
1220 """ Return the number of a month by (locale-independently) English
1224 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
1229 def fix_xml_ampersands(xml_str
):
1230 """Replace all the '&' by '&' in XML"""
1232 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1237 def setproctitle(title
):
1238 assert isinstance(title
, compat_str
)
1240 libc
= ctypes
.cdll
.LoadLibrary("libc.so.6")
1243 title_bytes
= title
.encode('utf-8')
1244 buf
= ctypes
.create_string_buffer(len(title_bytes
))
1245 buf
.value
= title_bytes
1247 libc
.prctl(15, buf
, 0, 0, 0)
1248 except AttributeError:
1249 return # Strange libc, just skip this
1252 def remove_start(s
, start
):
1253 if s
.startswith(start
):
1254 return s
[len(start
):]
1258 def remove_end(s
, end
):
1260 return s
[:-len(end
)]
def url_basename(url):
    """Return the last path component of *url* ('' when the path is empty)."""
    url_path = compat_urlparse.urlparse(url).path
    components = url_path.strip('/').split('/')
    return components[-1]
1269 class HEADRequest(compat_urllib_request
.Request
):
1270 def get_method(self
):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to int, returning `default` when it is None.

    When `get_attr` is given, the named attribute of `v` is converted
    instead.  The result is multiplied by `invscale` and floor-divided
    by `scale`.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Convert `v` to compat_str, returning `default` when it is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and '+' signs before converting
    for ch in (',', '.', '+'):
        int_str = int_str.replace(ch, '')
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to float scaled by invscale/scale; `default` when None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1299 def parse_duration(s
):
1300 if not isinstance(s
, compat_basestring
):
1308 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1309 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1311 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
1314 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1315 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1317 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1319 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1324 if m
.group('only_mins'):
1325 return float_or_none(m
.group('only_mins'), invscale
=60)
1326 if m
.group('only_hours'):
1327 return float_or_none(m
.group('only_hours'), invscale
=60 * 60)
1329 res
+= int(m
.group('secs'))
1330 if m
.group('mins_reversed'):
1331 res
+= int(m
.group('mins_reversed')) * 60
1333 res
+= int(m
.group('mins')) * 60
1334 if m
.group('hours'):
1335 res
+= int(m
.group('hours')) * 60 * 60
1336 if m
.group('hours_reversed'):
1337 res
+= int(m
.group('hours_reversed')) * 60 * 60
1339 res
+= int(m
.group('days')) * 24 * 60 * 60
1341 res
+= float(m
.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the file extension of `filename`.

    If `expected_real_ext` is given and the actual extension differs,
    `ext` is appended after the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of `filename` with `ext`.

    If `expected_real_ext` is given and the actual extension differs,
    `ext` is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # None sentinel instead of a mutable [] default argument
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE: args defaults to a mutable list, but it is never mutated here.
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program output.

    Searches `output` with `version_re` (default: 'version <token>');
    returns the captured group, or `unrecognized` if nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = version_re if version_re is not None else r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(pattern, output)
    return m.group(1) if m else unrecognized
class PagedList(object):
    # Base class for page-wise lists; subclasses provide getslice(start, end).
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    # Paged list that fetches each page lazily via pagefunc(pagenum).

    def __init__(self, pagefunc, pagesize):
        # pagefunc: callable(pagenum) -> iterable with that page's entries
        self._pagefunc = pagefunc
        # number of entries per full page
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Collect entries in [start, end) by iterating pages from the first
        # page that can contain `start`.
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # Page entirely before the requested range
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset into this page where the requested range begins
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset into this page where the requested range ends (exclusive)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    # Paged list where the total page count is known in advance.

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc: callable(pagenum) -> iterable with that page's entries
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Entries to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Total number of entries still wanted (None = unbounded)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the remaining demand
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences in `s`."""
    unicode_escape = codecs.getdecoder('unicode_escape')

    def _decode(m):
        return unicode_escape(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode, s)
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences in `s`."""
    unicode_escape = codecs.getdecoder('unicode_escape')

    def _decode(m):
        return unicode_escape(m.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode, s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() cannot handle unicode input; pre-encode to UTF-8
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component separately so structural characters
    # (/, ?, #, ;) keep their meaning
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
try:
    # Probe whether struct accepts str format specs (Python 3 and most 2.7)
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # str specs work natively - no wrapper needed
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    # Read URLs from an open batch file, dropping BOMs, blank lines and
    # comment lines; closes the file when done.
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with '#', ';' or ']' are comments
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
try:
    # Element.iter exists on Python >= 2.7
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    # Parse an XML string while ignoring doctype declarations; on Python 2
    # additionally coerce all text nodes to unicode.
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The parser keyword is only supported from Python 2.7 on
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+'; fall back to the US ratings table."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip the JSONP wrapper (callback name, parentheses, trailing ';'
    and // comments) from `code`, leaving the bare JSON payload."""
    wrapper_re = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return wrapper_re.sub(r'\1', code)
1589 def js_to_json(code
):
1592 if v
in ('true', 'false', 'null'):
1594 if v
.startswith('"'):
1596 if v
.startswith("'"):
1598 v
= re
.sub(r
"\\\\|\\'|\"", lambda m: {
1605 res = re.sub(r'''(?x)
1606 "(?
:[^
"\\]*(?:\\\\|\\['"nu
]))*[^
"\\]*"|
1607 '(?:[^'\\]*(?
:\\\\|
\\['"nu]))*[^'\\]*'|
1608 [a-zA-Z_][.a-zA-Z_0-9]*
1610 res = re.sub(r',(\s
*[\
]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank -1
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1624 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Turn a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
def is_outdated_version(version, limit, assume_new=True):
    # Compare two version strings; when `version` is empty or unparsable,
    # assume_new decides the answer (True -> treat as up to date).
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updatable when running from the zip bundle or a frozen executable
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
1662 def mimetype2ext(mt):
1663 _, _, res = mt.rpartition('/')
1667 'x
-mp4
-fragmented
': 'mp4
',
def urlhandle_detect_ext(url_handle):
    # Guess a file extension from a urllib response handle.
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Prefer the filename given in Content-Disposition, if any
    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    # Otherwise map the MIME type from Content-Type
    return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Missing information on either side means: do not block
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Decode according to a leading BOM if one is present, else assume UTF-8
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
def determine_protocol(info_dict):
    # An explicitly set protocol wins
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    # Fall back to the URL scheme (http, https, ...)
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell per column determines the column width
    widths = []
    for col in zip(*table):
        widths.append(max(len(compat_str(v)) for v in col))
    # Left-align every column but let the last one run free
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in table)
1749 def _match_one(filter_part, dct):
1750 COMPARISON_OPERATORS = {
1758 operator_rex = re.compile(r'''(?x)\s*
1760 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1762 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1763 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1766 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1767 m = operator_rex.search(filter_part)
1769 op = COMPARISON_OPERATORS[m.group('op')]
1770 if m.group('strval') is not None:
1771 if m.group('op') not in ('=', '!='):
1773 'Operator %s does not support string values!' % m.group('op'))
1774 comparison_value = m.group('strval')
1777 comparison_value = int(m.group('intval'))
1779 comparison_value = parse_filesize(m.group('intval'))
1780 if comparison_value is None:
1781 comparison_value = parse_filesize(m.group('intval') + 'B')
1782 if comparison_value is None:
1784 'Invalid integer value %r in filter part %r' % (
1785 m.group('intval'), filter_part))
1786 actual_value = dct.get(m.group('key'))
1787 if actual_value is None:
1788 return m.group('none_inclusive')
1789 return op(actual_value, comparison_value)
1792 '': lambda v: v is not None,
1793 '!': lambda v: v is None,
1795 operator_rex = re.compile(r'''(?x)\s*
1796 (?P<op>%s)\s*(?P<key>[a-z_]+)
1798 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1799 m = operator_rex.search(filter_part)
1801 op = UNARY_OPERATORS[m.group('op')]
1802 actual_value = dct.get(m.group('key'))
1803 return op(actual_value)
1805 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&'-separated parts must all match
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
def match_filter_func(filter_str):
    # Build a match-filter callback: returns None when the video passes,
    # otherwise a human-readable skip message.
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Returns None for empty or unrecognized input.
    """
    if not time_expr:
        return

    # Plain offset in seconds, optionally suffixed with 's': '12.3' / '12.3s'
    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    # Clock format HH:MM:SS(.mmm)
    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock_match:
        hours, mins, secs = clock_match.groups()
        return 3600 * int(hours) + 60 * int(mins) + float(secs)
def srt_subtitles_timecode(seconds):
    """Format a seconds value as an SRT timecode: HH:MM:SS,mmm."""
    hrs = int(seconds / 3600)
    mins = int((seconds % 3600) / 60)
    secs = int(seconds % 60)
    msecs = int((seconds % 1) * 1000)
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
def dfxp2srt(dfxp_data):
    # Convert a DFXP/TTML subtitle document (string) into SRT format.
    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})

    def parse_node(node):
        # Flatten a <p>/<span> node (including <br/> tags) into plain text
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag == _x('ttml:br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag == _x('ttml:span'):
                out += str_or_empty(parse_node(child))
            else:
                # Unknown tags are serialized verbatim
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p'))

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No explicit end: derive it from the duration attribute
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler variant that lets individual requests override the proxy
    # via a 'Ytdl-request-proxy' header.

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # Per-request proxy (internal header) takes precedence
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)