2 # -*- coding: utf-8 -*-
4 from __future__
import unicode_literals
32 import xml
.etree
.ElementTree
42 compat_socket_create_connection
,
46 compat_urllib_parse_urlparse
,
47 compat_urllib_request
,
# The type of a compiled regular expression. The re module does not export
# it under a public name, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
# Full English month names, used for locale-independent date parsing.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June', 'July',
    'August', 'September', 'October', 'November', 'December']
72 def preferredencoding():
73 """Get preferred encoding.
75 Returns the best encoding scheme for the system, based on
76 locale.getpreferredencoding() and some further tweaks.
79 pref
= locale
.getpreferredencoding()
87 def write_json_file(obj
, fn
):
88 """ Encode obj as JSON and write it to fn, atomically if possible """
90 fn
= encodeFilename(fn
)
91 if sys
.version_info
< (3, 0) and sys
.platform
!= 'win32':
92 encoding
= get_filesystem_encoding()
93 # os.path.basename returns a bytes object, but NamedTemporaryFile
94 # will fail if the filename contains non ascii characters unless we
95 # use a unicode object
96 path_basename
= lambda f
: os
.path
.basename(fn
).decode(encoding
)
97 # the same for os.path.dirname
98 path_dirname
= lambda f
: os
.path
.dirname(fn
).decode(encoding
)
100 path_basename
= os
.path
.basename
101 path_dirname
= os
.path
.dirname
105 'prefix': path_basename(fn
) + '.',
106 'dir': path_dirname(fn
),
110 # In Python 2.x, json.dump expects a bytestream.
111 # In Python 3.x, it writes to a character stream
112 if sys
.version_info
< (3, 0):
120 tf
= tempfile
.NamedTemporaryFile(**compat_kwargs(args
))
125 if sys
.platform
== 'win32':
126 # Need to remove existing file on Windows, else os.rename raises
127 # WindowsError or FileExistsError.
132 os
.rename(tf
.name
, fn
)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated straight into the XPath expression,
        # so restrict them to safe character sets first.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = "%s[@%s='%s']" % (xpath, key, val)
        return node.find(expr)
149 def find_xpath_attr(node
, xpath
, key
, val
):
150 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
151 # .//node does not match if a node is a direct child of . !
152 if isinstance(xpath
, compat_str
):
153 xpath
= xpath
.encode('ascii')
155 for f
in node
.findall(xpath
):
156 if f
.attrib
.get(key
) == val
:
160 # On python2.6 the xml.etree.ElementTree.Element methods don't support
161 # the namespace parameter
164 def xpath_with_ns(path
, ns_map
):
165 components
= [c
.split(':') for c
in path
.split('/')]
169 replaced
.append(c
[0])
172 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
173 return '/'.join(replaced
)
176 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
177 if sys
.version_info
< (2, 7): # Crazy 2.6
178 xpath
= xpath
.encode('ascii')
181 if n
is None or n
.text
is None:
182 if default
is not NO_DEFAULT
:
185 name
= xpath
if name
is None else name
186 raise ExtractorError('Could not find XML element %s' % name
)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is simply an attribute lookup on the 'id' attribute.
    return get_element_by_attribute('id', id, html)
197 def get_element_by_attribute(attribute
, value
, html
):
198 """Return the content of the tag with the specified attribute in the passed HTML document"""
200 m
= re
.search(r
'''(?xs)
202 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
204 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
208 ''' % (re
.escape(attribute
), re
.escape(value
)), html
)
212 res
= m
.group('content')
214 if res
.startswith('"') or res
.startswith("'"):
217 return unescapeHTML(res
)
220 def clean_html(html
):
221 """Clean an HTML snippet into a readable string"""
223 if html
is None: # Convenience for sanitizing descriptions etc.
227 html
= html
.replace('\n', ' ')
228 html
= re
.sub(r
'\s*<\s*br\s*/?\s*>\s*', '\n', html
)
229 html
= re
.sub(r
'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html
)
231 html
= re
.sub('<.*?>', '', html
)
232 # Replace html entities
233 html
= unescapeHTML(html
)
237 def sanitize_open(filename
, open_mode
):
238 """Try to open the given filename, and slightly tweak it if this fails.
240 Attempts to open the given filename. If this fails, it tries to change
241 the filename slightly, step by step, until it's either able to open it
242 or it fails and raises a final exception, like the standard open()
245 It returns the tuple (stream, definitive_file_name).
249 if sys
.platform
== 'win32':
251 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
252 return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
)
253 stream
= open(encodeFilename(filename
), open_mode
)
254 return (stream
, filename
)
255 except (IOError, OSError) as err
:
256 if err
.errno
in (errno
.EACCES
,):
259 # In case of error, try to remove win32 forbidden chars
260 alt_filename
= sanitize_path(filename
)
261 if alt_filename
== filename
:
264 # An exception here should be caught in the caller
265 stream
= open(encodeFilename(alt_filename
), open_mode
)
266 return (stream
, alt_filename
)
269 def timeconvert(timestr
):
270 """Convert RFC 2822 defined time string into system timestamp"""
272 timetuple
= email
.utils
.parsedate_tz(timestr
)
273 if timetuple
is not None:
274 timestamp
= email
.utils
.mktime_tz(timetuple
)
278 def sanitize_filename(s
, restricted
=False, is_id
=False):
279 """Sanitizes a string so it could be used as part of a filename.
280 If restricted is set, use a stricter subset of allowed characters.
281 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
283 def replace_insane(char
):
284 if char
== '?' or ord(char
) < 32 or ord(char
) == 127:
287 return '' if restricted
else '\''
289 return '_-' if restricted
else ' -'
290 elif char
in '\\/|*<>':
292 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace()):
294 if restricted
and ord(char
) > 127:
299 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
300 result
= ''.join(map(replace_insane
, s
))
302 while '__' in result
:
303 result
= result
.replace('__', '_')
304 result
= result
.strip('_')
305 # Common case of "Foreign band name - English song title"
306 if restricted
and result
.startswith('-_'):
308 if result
.startswith('-'):
309 result
= '_' + result
[len('-'):]
310 result
= result
.lstrip('.')
316 def sanitize_path(s
):
317 """Sanitizes and normalizes path on Windows"""
318 if sys
.platform
!= 'win32':
320 drive_or_unc
, _
= os
.path
.splitdrive(s
)
321 if sys
.version_info
< (2, 7) and not drive_or_unc
:
322 drive_or_unc
, _
= os
.path
.splitunc(s
)
323 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
327 path_part
if path_part
in ['.', '..'] else re
.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part
)
328 for path_part
in norm_path
]
330 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
331 return os
.path
.join(*sanitized_path
)
334 def orderedSet(iterable
):
335 """ Remove all duplicates from the input iterable """
343 def _htmlentity_transform(entity
):
344 """Transforms an HTML entity to a character."""
345 # Known non-numeric HTML entity
346 if entity
in compat_html_entities
.name2codepoint
:
347 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
349 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
351 numstr
= mobj
.group(1)
352 if numstr
.startswith('x'):
354 numstr
= '0%s' % numstr
357 return compat_chr(int(numstr
, base
))
359 # Unknown entity in name, return its literal representation
360 return ('&%s;' % entity
)
366 assert type(s
) == compat_str
369 r
'&([^;]+);', lambda m
: _htmlentity_transform(m
.group(1)), s
)
372 def get_subprocess_encoding():
373 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
374 # For subprocess calls, encode with locale encoding
375 # Refer to http://stackoverflow.com/a/9951851/35070
376 encoding
= preferredencoding()
378 encoding
= sys
.getfilesystemencoding()
384 def encodeFilename(s
, for_subprocess
=False):
386 @param s The name of the file
389 assert type(s
) == compat_str
391 # Python 3 has a Unicode API
392 if sys
.version_info
>= (3, 0):
395 # Pass '' directly to use Unicode APIs on Windows 2000 and up
396 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
397 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
398 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
401 return s
.encode(get_subprocess_encoding(), 'ignore')
404 def decodeFilename(b
, for_subprocess
=False):
406 if sys
.version_info
>= (3, 0):
409 if not isinstance(b
, bytes):
412 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument the same way a filename would be
    encoded for subprocess use."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode a command-line argument; inverse of encodeArgument."""
    return decodeFilename(b, True)
428 def decodeOption(optval
):
431 if isinstance(optval
, bytes):
432 optval
= optval
.decode(preferredencoding())
434 assert isinstance(optval
, compat_str
)
438 def formatSeconds(secs
):
440 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
442 return '%d:%02d' % (secs
// 60, secs
% 60)
447 def make_HTTPS_handler(params
, **kwargs
):
448 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
449 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
450 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
451 if opts_no_check_certificate
:
452 context
.check_hostname
= False
453 context
.verify_mode
= ssl
.CERT_NONE
455 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
458 # (create_default_context present but HTTPSHandler has no context=)
461 if sys
.version_info
< (3, 2):
462 return YoutubeDLHTTPSHandler(params
, **kwargs
)
464 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
465 context
.verify_mode
= (ssl
.CERT_NONE
466 if opts_no_check_certificate
467 else ssl
.CERT_REQUIRED
)
468 context
.set_default_verify_paths()
469 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
472 def bug_reports_message():
473 if ytdl_is_updateable():
474 update_cmd
= 'type youtube-dl -U to update'
476 update_cmd
= 'see https://yt-dl.org/update on how to update'
477 msg
= '; please report this issue on https://yt-dl.org/bug .'
478 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
479 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
483 class ExtractorError(Exception):
484 """Error during info extraction."""
486 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
487 """ tb, if given, is the original traceback (so that it can be printed out).
488 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
491 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
493 if video_id
is not None:
494 msg
= video_id
+ ': ' + msg
496 msg
+= ' (caused by %r)' % cause
498 msg
+= bug_reports_message()
499 super(ExtractorError
, self
).__init
__(msg
)
502 self
.exc_info
= sys
.exc_info() # preserve original exception
504 self
.video_id
= video_id
506 def format_traceback(self
):
507 if self
.traceback
is None:
509 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL.

    Marked as an expected error, i.e. not a bug in youtube-dl.
    """

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
524 class DownloadError(Exception):
525 """Download Error exception.
527 This exception may be thrown by FileDownloader objects if they are not
528 configured to continue on errors. They will contain the appropriate
532 def __init__(self
, msg
, exc_info
=None):
533 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
534 super(DownloadError
, self
).__init
__(msg
)
535 self
.exc_info
= exc_info
538 class SameFileError(Exception):
539 """Same File exception.
541 This exception will be thrown by FileDownloader objects if they detect
542 multiple files would have to be downloaded to the same file on disk.
547 class PostProcessingError(Exception):
548 """Post Processing exception.
550 This exception may be raised by PostProcessor's .run() method to
551 indicate an error in the postprocessing task.
554 def __init__(self
, msg
):
558 class MaxDownloadsReached(Exception):
559 """ --max-downloads limit has been reached. """
563 class UnavailableVideoError(Exception):
564 """Unavailable Format exception.
566 This exception will be thrown when a video is requested
567 in a format that is not available for that video.
572 class ContentTooShortError(Exception):
573 """Content Too Short exception.
575 This exception may be raised by FileDownloader objects when a file they
576 download is too small for what the server announced first, indicating
577 the connection was probably interrupted.
583 def __init__(self
, downloaded
, expected
):
584 self
.downloaded
= downloaded
585 self
.expected
= expected
588 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
589 hc
= http_class(*args
, **kwargs
)
590 source_address
= ydl_handler
._params
.get('source_address')
591 if source_address
is not None:
592 sa
= (source_address
, 0)
593 if hasattr(hc
, 'source_address'): # Python 2.7+
594 hc
.source_address
= sa
596 def _hc_connect(self
, *args
, **kwargs
):
597 sock
= compat_socket_create_connection(
598 (self
.host
, self
.port
), self
.timeout
, sa
)
600 self
.sock
= ssl
.wrap_socket(
601 sock
, self
.key_file
, self
.cert_file
,
602 ssl_version
=ssl
.PROTOCOL_TLSv1
)
605 hc
.connect
= functools
.partial(_hc_connect
, hc
)
610 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
611 """Handler for HTTP requests and responses.
613 This class, when installed with an OpenerDirector, automatically adds
614 the standard headers to every HTTP request and handles gzipped and
615 deflated responses from web servers. If compression is to be avoided in
616 a particular request, the original request in the program code only has
617 to include the HTTP header "Youtubedl-No-Compression", which will be
618 removed before making the real request.
620 Part of this code was copied from:
622 http://techknack.net/python-urllib2-handlers/
624 Andrew Rowls, the author of that code, agreed to release it to the
628 def __init__(self
, params
, *args
, **kwargs
):
629 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
630 self
._params
= params
632 def http_open(self
, req
):
633 return self
.do_open(functools
.partial(
634 _create_http_connection
, self
, compat_http_client
.HTTPConnection
, False),
640 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
642 return zlib
.decompress(data
)
645 def addinfourl_wrapper(stream
, headers
, url
, code
):
646 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
647 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
648 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
652 def http_request(self
, req
):
653 for h
, v
in std_headers
.items():
654 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
655 # The dict keys are capitalized because of this bug by urllib
656 if h
.capitalize() not in req
.headers
:
658 if 'Youtubedl-no-compression' in req
.headers
:
659 if 'Accept-encoding' in req
.headers
:
660 del req
.headers
['Accept-encoding']
661 del req
.headers
['Youtubedl-no-compression']
663 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
664 # Python 2.6 is brain-dead when it comes to fragments
665 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
666 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
670 def http_response(self
, req
, resp
):
673 if resp
.headers
.get('Content-encoding', '') == 'gzip':
674 content
= resp
.read()
675 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
677 uncompressed
= io
.BytesIO(gz
.read())
678 except IOError as original_ioerror
:
679 # There may be junk add the end of the file
680 # See http://stackoverflow.com/q/4928560/35070 for details
681 for i
in range(1, 1024):
683 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
684 uncompressed
= io
.BytesIO(gz
.read())
689 raise original_ioerror
690 resp
= self
.addinfourl_wrapper(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
691 resp
.msg
= old_resp
.msg
693 if resp
.headers
.get('Content-encoding', '') == 'deflate':
694 gz
= io
.BytesIO(self
.deflate(resp
.read()))
695 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
696 resp
.msg
= old_resp
.msg
699 https_request
= http_request
700 https_response
= http_response
703 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
704 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
705 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
706 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
707 self
._params
= params
709 def https_open(self
, req
):
711 if hasattr(self
, '_context'): # python > 2.6
712 kwargs
['context'] = self
._context
713 if hasattr(self
, '_check_hostname'): # python 3.x
714 kwargs
['check_hostname'] = self
._check
_hostname
715 return self
.do_open(functools
.partial(
716 _create_http_connection
, self
, self
._https
_conn
_class
, True),
720 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
721 """ Return a UNIX timestamp from the given date """
728 r
'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
731 timezone
= datetime
.timedelta()
733 date_str
= date_str
[:-len(m
.group(0))]
734 if not m
.group('sign'):
735 timezone
= datetime
.timedelta()
737 sign
= 1 if m
.group('sign') == '+' else -1
738 timezone
= datetime
.timedelta(
739 hours
=sign
* int(m
.group('hours')),
740 minutes
=sign
* int(m
.group('minutes')))
741 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
742 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
743 return calendar
.timegm(dt
.timetuple())
746 def unified_strdate(date_str
, day_first
=True):
747 """Return a string with the date in the format YYYYMMDD"""
753 date_str
= date_str
.replace(',', ' ')
754 # %z (UTC offset) is only supported in python>=3.2
755 if not re
.match(r
'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str
):
756 date_str
= re
.sub(r
' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str
)
757 # Remove AM/PM + timezone
758 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
760 format_expressions
= [
765 '%b %dst %Y %I:%M%p',
766 '%b %dnd %Y %I:%M%p',
767 '%b %dth %Y %I:%M%p',
773 '%Y-%m-%d %H:%M:%S.%f',
776 '%Y-%m-%dT%H:%M:%SZ',
777 '%Y-%m-%dT%H:%M:%S.%fZ',
778 '%Y-%m-%dT%H:%M:%S.%f0Z',
780 '%Y-%m-%dT%H:%M:%S.%f',
784 format_expressions
.extend([
792 format_expressions
.extend([
799 for expression
in format_expressions
:
801 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
804 if upload_date
is None:
805 timetuple
= email
.utils
.parsedate_tz(date_str
)
807 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
811 def determine_ext(url
, default_ext
='unknown_video'):
814 guess
= url
.partition('?')[0].rpartition('.')[2]
815 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name of the form <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
825 def date_from_str(date_str
):
827 Return a datetime object from a string in the format YYYYMMDD or
828 (now|today)[+-][0-9](day|week|month|year)(s)?"""
829 today
= datetime
.date
.today()
830 if date_str
in ('now', 'today'):
832 if date_str
== 'yesterday':
833 return today
- datetime
.timedelta(days
=1)
834 match
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
835 if match
is not None:
836 sign
= match
.group('sign')
837 time
= int(match
.group('time'))
840 unit
= match
.group('unit')
841 # A bad aproximation?
849 delta
= datetime
.timedelta(**{unit
: time
})
851 return datetime
.datetime
.strptime(date_str
, "%Y%m%d").date()
854 def hyphenate_date(date_str
):
856 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
857 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
858 if match
is not None:
859 return '-'.join(match
.groups())
864 class DateRange(object):
865 """Represents a time interval between two dates"""
867 def __init__(self
, start
=None, end
=None):
868 """start and end must be strings in the format accepted by date"""
869 if start
is not None:
870 self
.start
= date_from_str(start
)
872 self
.start
= datetime
.datetime
.min.date()
874 self
.end
= date_from_str(end
)
876 self
.end
= datetime
.datetime
.max.date()
877 if self
.start
> self
.end
:
878 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
882 """Returns a range that only contains the given day"""
885 def __contains__(self
, date
):
886 """Check if the date is in the range"""
887 if not isinstance(date
, datetime
.date
):
888 date
= date_from_str(date
)
889 return self
.start
<= date
<= self
.end
892 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
896 """ Returns the platform name as a compat_str """
897 res
= platform
.platform()
898 if isinstance(res
, bytes):
899 res
= res
.decode(preferredencoding())
901 assert isinstance(res
, compat_str
)
905 def _windows_write_string(s
, out
):
906 """ Returns True if the string was written using special methods,
907 False if it has yet to be written out."""
908 # Adapted from http://stackoverflow.com/a/3259271/35070
911 import ctypes
.wintypes
919 fileno
= out
.fileno()
920 except AttributeError:
921 # If the output stream doesn't have a fileno, it's virtual
923 except io
.UnsupportedOperation
:
924 # Some strange Windows pseudo files?
926 if fileno
not in WIN_OUTPUT_IDS
:
929 GetStdHandle
= ctypes
.WINFUNCTYPE(
930 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
931 (b
"GetStdHandle", ctypes
.windll
.kernel32
))
932 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
934 WriteConsoleW
= ctypes
.WINFUNCTYPE(
935 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
936 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
937 ctypes
.wintypes
.LPVOID
)((b
"WriteConsoleW", ctypes
.windll
.kernel32
))
938 written
= ctypes
.wintypes
.DWORD(0)
940 GetFileType
= ctypes
.WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)((b
"GetFileType", ctypes
.windll
.kernel32
))
941 FILE_TYPE_CHAR
= 0x0002
942 FILE_TYPE_REMOTE
= 0x8000
943 GetConsoleMode
= ctypes
.WINFUNCTYPE(
944 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
945 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
946 (b
"GetConsoleMode", ctypes
.windll
.kernel32
))
947 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
949 def not_a_console(handle
):
950 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
952 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
or
953 GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
958 def next_nonbmp_pos(s
):
960 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
961 except StopIteration:
965 count
= min(next_nonbmp_pos(s
), 1024)
968 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
970 raise OSError('Failed to write string')
971 if not count
: # We just wrote a non-BMP character
972 assert written
.value
== 2
975 assert written
.value
> 0
976 s
= s
[written
.value
:]
980 def write_string(s
, out
=None, encoding
=None):
983 assert type(s
) == compat_str
985 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
986 if _windows_write_string(s
, out
):
989 if ('b' in getattr(out
, 'mode', '') or
990 sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
991 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
993 elif hasattr(out
, 'buffer'):
994 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
995 byt
= s
.encode(enc
, 'ignore')
996 out
.buffer.write(byt
)
1002 def bytes_to_intlist(bs
):
1005 if isinstance(bs
[0], int): # Python 3
1008 return [ord(c
) for c
in bs
]
1011 def intlist_to_bytes(xs
):
1014 return struct_pack('%dB' % len(xs
), *xs
)
1017 # Cross-platform file locking
1018 if sys
.platform
== 'win32':
1019 import ctypes
.wintypes
1022 class OVERLAPPED(ctypes
.Structure
):
1024 ('Internal', ctypes
.wintypes
.LPVOID
),
1025 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
1026 ('Offset', ctypes
.wintypes
.DWORD
),
1027 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
1028 ('hEvent', ctypes
.wintypes
.HANDLE
),
1031 kernel32
= ctypes
.windll
.kernel32
1032 LockFileEx
= kernel32
.LockFileEx
1033 LockFileEx
.argtypes
= [
1034 ctypes
.wintypes
.HANDLE
, # hFile
1035 ctypes
.wintypes
.DWORD
, # dwFlags
1036 ctypes
.wintypes
.DWORD
, # dwReserved
1037 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1038 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1039 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1041 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
1042 UnlockFileEx
= kernel32
.UnlockFileEx
1043 UnlockFileEx
.argtypes
= [
1044 ctypes
.wintypes
.HANDLE
, # hFile
1045 ctypes
.wintypes
.DWORD
, # dwReserved
1046 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1047 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1048 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1050 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
1051 whole_low
= 0xffffffff
1052 whole_high
= 0x7fffffff
1054 def _lock_file(f
, exclusive
):
1055 overlapped
= OVERLAPPED()
1056 overlapped
.Offset
= 0
1057 overlapped
.OffsetHigh
= 0
1058 overlapped
.hEvent
= 0
1059 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
1060 handle
= msvcrt
.get_osfhandle(f
.fileno())
1061 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
1062 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1063 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
1065 def _unlock_file(f
):
1066 assert f
._lock
_file
_overlapped
_p
1067 handle
= msvcrt
.get_osfhandle(f
.fileno())
1068 if not UnlockFileEx(handle
, 0,
1069 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1070 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
1075 def _lock_file(f
, exclusive
):
1076 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
1078 def _unlock_file(f
):
1079 fcntl
.flock(f
, fcntl
.LOCK_UN
)
1082 class locked_file(object):
1083 def __init__(self
, filename
, mode
, encoding
=None):
1084 assert mode
in ['r', 'a', 'w']
1085 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
1088 def __enter__(self
):
1089 exclusive
= self
.mode
!= 'r'
1091 _lock_file(self
.f
, exclusive
)
1097 def __exit__(self
, etype
, value
, traceback
):
1099 _unlock_file(self
.f
)
1106 def write(self
, *args
):
1107 return self
.f
.write(*args
)
1109 def read(self
, *args
):
1110 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when
    the interpreter reports None."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1118 def shell_quote(args
):
1120 encoding
= get_filesystem_encoding()
1122 if isinstance(a
, bytes):
1123 # We may get a filename encoded with 'encodeFilename'
1124 a
= a
.decode(encoding
)
1125 quoted_args
.append(pipes
.quote(a
))
1126 return ' '.join(quoted_args
)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Encode the payload as JSON inside a query-string fragment, so that
    # unsmuggle_url can recover it from the part after '#'.
    payload = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
1137 def unsmuggle_url(smug_url
, default
=None):
1138 if '#__youtubedl_smuggle' not in smug_url
:
1139 return smug_url
, default
1140 url
, _
, sdata
= smug_url
.rpartition('#')
1141 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
1142 data
= json
.loads(jsond
)
1146 def format_bytes(bytes):
1149 if type(bytes) is str:
1150 bytes = float(bytes)
1154 exponent
= int(math
.log(bytes, 1024.0))
1155 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
1156 converted
= float(bytes) / float(1024 ** exponent
)
1157 return '%.2f%s' % (converted
, suffix
)
1160 def parse_filesize(s
):
1164 # The lower-case forms are of course incorrect and inofficial,
1165 # but we support those too
1203 units_re
= '|'.join(re
.escape(u
) for u
in _UNIT_TABLE
)
1205 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
, s
)
1209 num_str
= m
.group('num').replace(',', '.')
1210 mult
= _UNIT_TABLE
[m
.group('unit')]
1211 return int(float(num_str
) * mult
)
1214 def month_by_name(name
):
1215 """ Return the number of a month by (locale-independently) English name """
1218 return ENGLISH_MONTH_NAMES
.index(name
) + 1
1223 def month_by_abbreviation(abbrev
):
1224 """ Return the number of a month by (locale-independently) English
1228 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
1233 def fix_xml_ampersands(xml_str
):
1234 """Replace all the '&' by '&' in XML"""
1236 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1241 def setproctitle(title
):
1242 assert isinstance(title
, compat_str
)
1244 libc
= ctypes
.cdll
.LoadLibrary("libc.so.6")
1247 title_bytes
= title
.encode('utf-8')
1248 buf
= ctypes
.create_string_buffer(len(title_bytes
))
1249 buf
.value
= title_bytes
1251 libc
.prctl(15, buf
, 0, 0, 0)
1252 except AttributeError:
1253 return # Strange libc, just skip this
1256 def remove_start(s
, start
):
1257 if s
.startswith(start
):
1258 return s
[len(start
):]
1262 def remove_end(s
, end
):
1264 return s
[:-len(end
)]
def url_basename(url):
    """Return the last path component of *url* (query and fragment ignored)."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').split('/')[-1]
1273 class HEADRequest(compat_urllib_request
.Request
):
1274 def get_method(self
):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v (or getattr(v, get_attr)) to a scaled int; default when None."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Return v converted to a text string, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and a possible leading '+'.
    return int(re.sub(r'[,\.\+]', '', int_str))
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a scaled float (v * invscale / scale); default when None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1303 def parse_duration(s
):
1304 if not isinstance(s
, compat_basestring
):
1312 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1313 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1315 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
1318 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1319 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1321 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1323 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1328 if m
.group('only_mins'):
1329 return float_or_none(m
.group('only_mins'), invscale
=60)
1330 if m
.group('only_hours'):
1331 return float_or_none(m
.group('only_hours'), invscale
=60 * 60)
1333 res
+= int(m
.group('secs'))
1334 if m
.group('mins_reversed'):
1335 res
+= int(m
.group('mins_reversed')) * 60
1337 res
+= int(m
.group('mins')) * 60
1338 if m
.group('hours'):
1339 res
+= int(m
.group('hours')) * 60 * 60
1340 if m
.group('hours_reversed'):
1341 res
+= int(m
.group('hours_reversed')) * 60 * 60
1343 res
+= int(m
.group('days')) * 24 * 60 * 60
1345 res
+= float(m
.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename.

    When expected_real_ext is given and does not match the actual extension,
    ext is appended instead of inserted.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace filename's extension with ext.

    When expected_real_ext is given and does not match the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    stem = name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename
    return '{0}.{1}'.format(stem, ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # args defaults to None instead of a mutable [] (shared-default pitfall);
    # passing a list explicitly behaves exactly as before.
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args,
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Pull a version string out of program output; unrecognized if none matches."""
    assert isinstance(output, compat_str)
    pattern = version_re if version_re is not None else r'version\s+([-0-9._a-zA-Z]+)'
    match = re.search(pattern, output)
    return match.group(1) if match else unrecognized
class PagedList(object):
    """Base class for lazily paged sequences; subclasses implement getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset into this page where the requested slice begins.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset into this page where the requested slice ends.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Leading elements of the first page that fall before `start`.
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences in s to real characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences in s to real characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    # Python 2's quote() cannot handle unicode input, so pre-encode there.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    escaped = url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment))
    return escaped.geturl()
1512 struct
.pack('!I', 0)
1514 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1515 def struct_pack(spec
, *args
):
1516 if isinstance(spec
, compat_str
):
1517 spec
= spec
.encode('ascii')
1518 return struct
.pack(spec
, *args
)
1520 def struct_unpack(spec
, *args
):
1521 if isinstance(spec
, compat_str
):
1522 spec
= spec
.encode('ascii')
1523 return struct
.unpack(spec
, *args
)
1525 struct_pack
= struct
.pack
1526 struct_unpack
= struct
.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping BOMs, blanks and comments."""
    def fixup(url):
        url = url.strip()
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        if not url:
            return False
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1550 etree_iter
= xml
.etree
.ElementTree
.Element
.iter
1551 except AttributeError: # Python <=2.6
1552 etree_iter
= lambda n
: n
.findall('.//*')
def parse_xml(s):
    """Parse an XML string into an ElementTree element, ignoring doctypes."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python 2.6's XML() does not accept a parser keyword argument.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+'; fall back to US rating names."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parens, trailing ; and // comments)."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1593 def js_to_json(code
):
1596 if v
in ('true', 'false', 'null'):
1598 if v
.startswith('"'):
1600 if v
.startswith("'"):
1602 v
= re
.sub(r
"\\\\|\\'|\"", lambda m: {
1609 res = re.sub(r'''(?x)
1610 "(?
:[^
"\\]*(?:\\\\|\\['"nu
]))*[^
"\\]*"|
1611 '(?:[^'\\]*(?
:\\\\|
\\['"nu]))*[^'\\]*'|
1612 [a-zA-Z_][.a-zA-Z_0-9]*
1614 res = re.sub(r',(\s
*[\
]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a version string like '2015.02.03-1' into a tuple of ints."""
    return tuple(int(e) for e in re.split(r'[-.]', v))


def is_outdated_version(version, limit, assume_new=True):
    """True when version is older than limit; assume_new decides odd cases."""
    if not limit:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(shlex_quote, args))
1666 def mimetype2ext(mt):
1667 _, _, res = mt.rpartition('/')
1671 'x
-mp4
-fragmented
': 'mp4
',
1676 def urlhandle_detect_ext(url_handle):
1679 getheader = lambda h: url_handle.headers[h]
1680 except AttributeError: # Python < 3
1681 getheader = url_handle.info().getheader
1683 cd = getheader('Content
-Disposition
')
1685 m = re.match(r'attachment
;\s
*filename
="(?P<filename>[^"]+)"', cd)
1687 e = determine_ext(m.group('filename'), default_ext=None)
1691 return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Best-effort protocol detection from an info dict's fields and URL."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    col_widths = [max(len(compat_str(v)) for v in col) for col in zip(*rows)]
    # Left-pad every column except the last to its widest value.
    row_fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in col_widths[:-1]) + '%s'
    return '\n'.join(row_fmt % tuple(row) for row in rows)
1754 def _match_one(filter_part, dct):
1755 COMPARISON_OPERATORS = {
1763 operator_rex = re.compile(r'''(?x)\s*
1765 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1767 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1768 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1771 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1772 m = operator_rex.search(filter_part)
1774 op = COMPARISON_OPERATORS[m.group('op')]
1775 if m.group('strval') is not None:
1776 if m.group('op') not in ('=', '!='):
1778 'Operator %s does not support string values!' % m.group('op'))
1779 comparison_value = m.group('strval')
1782 comparison_value = int(m.group('intval'))
1784 comparison_value = parse_filesize(m.group('intval'))
1785 if comparison_value is None:
1786 comparison_value = parse_filesize(m.group('intval') + 'B')
1787 if comparison_value is None:
1789 'Invalid integer value %r in filter part %r' % (
1790 m.group('intval'), filter_part))
1791 actual_value = dct.get(m.group('key'))
1792 if actual_value is None:
1793 return m.group('none_inclusive')
1794 return op(actual_value, comparison_value)
1797 '': lambda v: v is not None,
1798 '!': lambda v: v is None,
1800 operator_rex = re.compile(r'''(?x)\s*
1801 (?P<op>%s)\s*(?P<key>[a-z_]+)
1803 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1804 m = operator_rex.search(filter_part)
1806 op = UNARY_OPERATORS[m.group('op')]
1807 actual_value = dct.get(m.group('key'))
1808 return op(actual_value)
1810 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callable: None accepts, a string is the skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS.mmm') to seconds."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a (possibly fractional) second count as an SRT timecode HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
1847 def dfxp2srt(dfxp_data):
1848 _x = functools.partial(xpath_with_ns, ns_map={
1849 'ttml': 'http://www.w3.org/ns/ttml',
1850 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
1853 def parse_node(node):
1854 str_or_empty = functools.partial(str_or_none, default='')
1856 out = str_or_empty(node.text)
1859 if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
1860 out += '\n' + str_or_empty(child.tail)
1861 elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
1862 out += str_or_empty(parse_node(child))
1864 out += str_or_empty(xml.etree.ElementTree.tostring(child))
1868 dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
1870 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
1873 raise ValueError('Invalid dfxp/TTML subtitle')
1875 for para, index in zip(paras, itertools.count(1)):
1876 begin_time = parse_dfxp_time_expr(para.attrib['begin'])
1877 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
1879 end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
1880 out.append('%d\n%s --> %s\n%s\n\n' % (
1882 srt_subtitles_timecode(begin_time),
1883 srt_subtitles_timecode(end_time),
1889 class ISO639Utils(object):
1890 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2079 def short2long(cls, code):
2080 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2081 return cls._lang_map.get(code[:2])
2084 def long2short(cls, code):
2085 """Convert language code from ISO 639-2/T to ISO 639-1"""
2086 for short_name, long_name in cls._lang_map.items():
2087 if long_name == code:
2091 class ISO3166Utils(object):
2092 # From http://data.okfn.org/data/core/country-list
2094 'AF': 'Afghanistan',
2095 'AX': 'Åland Islands',
2098 'AS': 'American Samoa',
2103 'AG': 'Antigua and Barbuda',
2120 'BO': 'Bolivia, Plurinational State of',
2121 'BQ': 'Bonaire, Sint Eustatius and Saba',
2122 'BA': 'Bosnia and Herzegovina',
2124 'BV': 'Bouvet Island',
2126 'IO': 'British Indian Ocean Territory',
2127 'BN': 'Brunei Darussalam',
2129 'BF': 'Burkina Faso',
2135 'KY': 'Cayman Islands',
2136 'CF': 'Central African Republic',
2140 'CX': 'Christmas Island',
2141 'CC': 'Cocos (Keeling) Islands',
2145 'CD': 'Congo, the Democratic Republic of the',
2146 'CK': 'Cook Islands',
2148 'CI': 'Côte d\'Ivoire',
2153 'CZ': 'Czech Republic',
2157 'DO': 'Dominican Republic',
2160 'SV': 'El Salvador',
2161 'GQ': 'Equatorial Guinea',
2165 'FK': 'Falkland Islands (Malvinas)',
2166 'FO': 'Faroe Islands',
2170 'GF': 'French Guiana',
2171 'PF': 'French Polynesia',
2172 'TF': 'French Southern Territories',
2187 'GW': 'Guinea-Bissau',
2190 'HM': 'Heard Island and McDonald Islands',
2191 'VA': 'Holy See (Vatican City State)',
2198 'IR': 'Iran, Islamic Republic of',
2201 'IM': 'Isle of Man',
2211 'KP': 'Korea, Democratic People\'s Republic of',
2212 'KR': 'Korea, Republic of',
2215 'LA': 'Lao People\'s Democratic Republic',
2221 'LI': 'Liechtenstein',
2225 'MK': 'Macedonia, the Former Yugoslav Republic of',
2232 'MH': 'Marshall Islands',
2238 'FM': 'Micronesia, Federated States of',
2239 'MD': 'Moldova, Republic of',
2250 'NL': 'Netherlands',
2251 'NC': 'New Caledonia',
2252 'NZ': 'New Zealand',
2257 'NF': 'Norfolk Island',
2258 'MP': 'Northern Mariana Islands',
2263 'PS': 'Palestine, State of',
2265 'PG': 'Papua New Guinea',
2268 'PH': 'Philippines',
2272 'PR': 'Puerto Rico',
2276 'RU': 'Russian Federation',
2278 'BL': 'Saint Barthélemy',
2279 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2280 'KN': 'Saint Kitts and Nevis',
2281 'LC': 'Saint Lucia',
2282 'MF': 'Saint Martin (French part)',
2283 'PM': 'Saint Pierre and Miquelon',
2284 'VC': 'Saint Vincent and the Grenadines',
2287 'ST': 'Sao Tome and Principe',
2288 'SA': 'Saudi Arabia',
2292 'SL': 'Sierra Leone',
2294 'SX': 'Sint Maarten (Dutch part)',
2297 'SB': 'Solomon Islands',
2299 'ZA': 'South Africa',
2300 'GS': 'South Georgia and the South Sandwich Islands',
2301 'SS': 'South Sudan',
2306 'SJ': 'Svalbard and Jan Mayen',
2309 'CH': 'Switzerland',
2310 'SY': 'Syrian Arab Republic',
2311 'TW': 'Taiwan, Province of China',
2313 'TZ': 'Tanzania, United Republic of',
2315 'TL': 'Timor-Leste',
2319 'TT': 'Trinidad and Tobago',
2322 'TM': 'Turkmenistan',
2323 'TC': 'Turks and Caicos Islands',
2327 'AE': 'United Arab Emirates',
2328 'GB': 'United Kingdom',
2329 'US': 'United States',
2330 'UM': 'United States Minor Outlying Islands',
2334 'VE': 'Venezuela, Bolivarian Republic of',
2336 'VG': 'Virgin Islands, British',
2337 'VI': 'Virgin Islands, U.S.',
2338 'WF': 'Wallis and Futuna',
2339 'EH': 'Western Sahara',
2346 def short2full(cls, code):
2347 """Convert an ISO 3166-2 country code to the corresponding full name"""
2348 return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler honouring a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)