4 from __future__
import unicode_literals
34 import xml
.etree
.ElementTree
41 compat_etree_fromstring
,
43 compat_html_entities_html5
,
49 compat_socket_create_connection
,
55 compat_urllib_parse_urlencode
,
56 compat_urllib_parse_urlparse
,
57 compat_urllib_parse_unquote_plus
,
58 compat_urllib_request
,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc component.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    known_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
78 # This is not clearly defined otherwise
79 compiled_regex_type
= type(re
.compile(''))
82 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
83 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
84 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
85 'Accept-Encoding': 'gzip, deflate',
86 'Accept-Language': 'en-us,en;q=0.5',
91 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Full English month names (January first), used for date parsing.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December']
102 'en': ENGLISH_MONTH_NAMES
,
104 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
105 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
109 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
110 'flv', 'f4v', 'f4a', 'f4b',
111 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
112 'mkv', 'mka', 'mk3d',
121 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement; multi-character
# replacements (AE, OE, ss, ...) are passed as single-element lists so that
# itertools.chain yields them as one unit.
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
149 '%Y-%m-%d %H:%M:%S.%f',
152 '%Y-%m-%dT%H:%M:%SZ',
153 '%Y-%m-%dT%H:%M:%S.%fZ',
154 '%Y-%m-%dT%H:%M:%S.%f0Z',
156 '%Y-%m-%dT%H:%M:%S.%f',
159 '%b %d %Y at %H:%M:%S',
162 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
163 DATE_FORMATS_DAY_FIRST
.extend([
172 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
173 DATE_FORMATS_MONTH_FIRST
.extend([
181 PACKED_CODES_RE
= r
"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
184 def preferredencoding():
185 """Get preferred encoding.
187 Returns the best encoding scheme for the system, based on
188 locale.getpreferredencoding() and some further tweaks.
191 pref
= locale
.getpreferredencoding()
199 def write_json_file(obj
, fn
):
200 """ Encode obj as JSON and write it to fn, atomically if possible """
202 fn
= encodeFilename(fn
)
203 if sys
.version_info
< (3, 0) and sys
.platform
!= 'win32':
204 encoding
= get_filesystem_encoding()
205 # os.path.basename returns a bytes object, but NamedTemporaryFile
206 # will fail if the filename contains non ascii characters unless we
207 # use a unicode object
208 path_basename
= lambda f
: os
.path
.basename(fn
).decode(encoding
)
209 # the same for os.path.dirname
210 path_dirname
= lambda f
: os
.path
.dirname(fn
).decode(encoding
)
212 path_basename
= os
.path
.basename
213 path_dirname
= os
.path
.dirname
217 'prefix': path_basename(fn
) + '.',
218 'dir': path_dirname(fn
),
222 # In Python 2.x, json.dump expects a bytestream.
223 # In Python 3.x, it writes to a character stream
224 if sys
.version_info
< (3, 0):
232 tf
= tempfile
.NamedTemporaryFile(**compat_kwargs(args
))
237 if sys
.platform
== 'win32':
238 # Need to remove existing file on Windows, else os.rename raises
239 # WindowsError or FileExistsError.
244 os
.rename(tf
.name
, fn
)
253 if sys
.version_info
>= (2, 7):
254 def find_xpath_attr(node
, xpath
, key
, val
=None):
255 """ Find the xpath xpath[@key=val] """
256 assert re
.match(r
'^[a-zA-Z_-]+$', key
)
257 expr
= xpath
+ ('[@%s]' % key
if val
is None else "[@%s='%s']" % (key
, val
))
258 return node
.find(expr
)
260 def find_xpath_attr(node
, xpath
, key
, val
=None):
261 for f
in node
.findall(compat_xpath(xpath
)):
262 if key
not in f
.attrib
:
264 if val
is None or f
.attrib
.get(key
) == val
:
268 # On python2.6 the xml.etree.ElementTree.Element methods don't support
269 # the namespace parameter
272 def xpath_with_ns(path
, ns_map
):
273 components
= [c
.split(':') for c
in path
.split('/')]
277 replaced
.append(c
[0])
280 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
281 return '/'.join(replaced
)
284 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
285 def _find_xpath(xpath
):
286 return node
.find(compat_xpath(xpath
))
288 if isinstance(xpath
, (str, compat_str
)):
289 n
= _find_xpath(xpath
)
297 if default
is not NO_DEFAULT
:
300 name
= xpath
if name
is None else name
301 raise ExtractorError('Could not find XML element %s' % name
)
307 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
308 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
309 if n
is None or n
== default
:
312 if default
is not NO_DEFAULT
:
315 name
= xpath
if name
is None else name
316 raise ExtractorError('Could not find XML element\'s text %s' % name
)
322 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
323 n
= find_xpath_attr(node
, xpath
, key
)
325 if default
is not NO_DEFAULT
:
328 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
329 raise ExtractorError('Could not find XML attribute %s' % name
)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NOTE: the parameter is named `id` (shadowing the builtin) to keep the
    # existing keyword interface unchanged.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose attribute equals value, or None."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # \b anchors match the class name as a whole token inside a (possibly
    # multi-valued) class attribute; the value is already a regex, so it must
    # not be escaped again downstream.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
358 def get_elements_by_attribute(attribute, value, html, escape_value=True):
359 """Return the content of the tag with the specified attribute in the passed HTML document"""
361 value = re.escape(value) if escape_value else value
364 for m in re.finditer(r'''(?xs)
366 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'))*?
368 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'))*?
372 ''' % (re.escape(attribute), value), html):
373 res = m.group('content
')
375 if res.startswith('"') or res.startswith("'"):
378 retlist.append(unescapeHTML(res))
383 class HTMLAttributeParser(compat_HTMLParser):
384 """Trivial HTML parser to gather the attributes for a single element"""
387 compat_HTMLParser.__init__(self)
389 def handle_starttag(self, tag, attrs):
390 self.attrs = dict(attrs)
393 def extract_attributes(html_element):
394 """Given a string for an HTML element such as
396 a="foo" B="bar" c="&98;az" d=boz
397 empty= noval entity="&"
400 Decode and return a dictionary of attributes.
402 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
403 'empty
': '', 'noval
': None, 'entity
': '&',
404 'sq
': '"', 'dq': '\''
406 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
407 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
409 parser = HTMLAttributeParser()
410 parser.feed(html_element)
415 def clean_html(html):
416 """Clean an HTML snippet into a readable string"""
418 if html is None: # Convenience for sanitizing descriptions etc.
422 html = html.replace('\n', ' ')
423 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
424 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
426 html = re.sub('<.*?>', '', html)
427 # Replace html entities
428 html = unescapeHTML(html)
432 def sanitize_open(filename, open_mode):
433 """Try to open the given filename, and slightly tweak it if this fails.
435 Attempts to open the given filename. If this fails, it tries to change
436 the filename slightly, step by step, until it's either able to open it
437 or it fails and raises a final exception, like the standard open()
440 It returns the tuple (stream, definitive_file_name).
444 if sys.platform == 'win32':
446 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
447 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
448 stream = open(encodeFilename(filename), open_mode)
449 return (stream, filename)
450 except (IOError, OSError) as err:
451 if err.errno in (errno.EACCES,):
454 # In case of error, try to remove win32 forbidden chars
455 alt_filename = sanitize_path(filename)
456 if alt_filename == filename:
459 # An exception here should be caught in the caller
460 stream = open(encodeFilename(alt_filename), open_mode)
461 return (stream, alt_filename)
464 def timeconvert(timestr):
465 """Convert RFC 2822 defined time string into system timestamp"""
467 timetuple = email.utils.parsedate_tz(timestr)
468 if timetuple is not None:
469 timestamp = email.utils.mktime_tz(timetuple)
473 def sanitize_filename(s, restricted=False, is_id=False):
474 """Sanitizes a string so it could be used as part of a filename.
475 If restricted is set, use a stricter subset of allowed characters.
476 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
478 def replace_insane(char):
479 if restricted and char in ACCENT_CHARS:
480 return ACCENT_CHARS[char]
481 if char == '?' or ord(char) < 32 or ord(char) == 127:
484 return '' if restricted else '\''
486 return '_
-' if restricted else ' -'
487 elif char in '\\/|
*<>':
489 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
491 if restricted
and ord(char
) > 127:
496 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
497 result
= ''.join(map(replace_insane
, s
))
499 while '__' in result
:
500 result
= result
.replace('__', '_')
501 result
= result
.strip('_')
502 # Common case of "Foreign band name - English song title"
503 if restricted
and result
.startswith('-_'):
505 if result
.startswith('-'):
506 result
= '_' + result
[len('-'):]
507 result
= result
.lstrip('.')
513 def sanitize_path(s
):
514 """Sanitizes and normalizes path on Windows"""
515 if sys
.platform
!= 'win32':
517 drive_or_unc
, _
= os
.path
.splitdrive(s
)
518 if sys
.version_info
< (2, 7) and not drive_or_unc
:
519 drive_or_unc
, _
= os
.path
.splitunc(s
)
520 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
524 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
525 for path_part
in norm_path
]
527 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
528 return os
.path
.join(*sanitized_path
)
531 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
532 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prefix scheme-relative ('//...') URLs with 'http:'; return others unchanged."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a compat urllib Request after sanitizing scheme-relative URLs."""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
541 def orderedSet(iterable
):
542 """ Remove all duplicates from the input iterable """
550 def _htmlentity_transform(entity_with_semicolon
):
551 """Transforms an HTML entity to a character."""
552 entity
= entity_with_semicolon
[:-1]
554 # Known non-numeric HTML entity
555 if entity
in compat_html_entities
.name2codepoint
:
556 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
558 # TODO: HTML5 allows entities without a semicolon. For example,
559 # 'Éric' should be decoded as 'Éric'.
560 if entity_with_semicolon
in compat_html_entities_html5
:
561 return compat_html_entities_html5
[entity_with_semicolon
]
563 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
565 numstr
= mobj
.group(1)
566 if numstr
.startswith('x'):
568 numstr
= '0%s' % numstr
571 # See https://github.com/rg3/youtube-dl/issues/7518
573 return compat_chr(int(numstr
, base
))
577 # Unknown entity in name, return its literal representation
578 return '&%s;' % entity
584 assert type(s
) == compat_str
587 r
'&([^;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
590 def get_subprocess_encoding():
591 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
592 # For subprocess calls, encode with locale encoding
593 # Refer to http://stackoverflow.com/a/9951851/35070
594 encoding
= preferredencoding()
596 encoding
= sys
.getfilesystemencoding()
602 def encodeFilename(s
, for_subprocess
=False):
604 @param s The name of the file
607 assert type(s
) == compat_str
609 # Python 3 has a Unicode API
610 if sys
.version_info
>= (3, 0):
613 # Pass '' directly to use Unicode APIs on Windows 2000 and up
614 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
615 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
616 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
619 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
620 if sys
.platform
.startswith('java'):
623 return s
.encode(get_subprocess_encoding(), 'ignore')
626 def decodeFilename(b
, for_subprocess
=False):
628 if sys
.version_info
>= (3, 0):
631 if not isinstance(b
, bytes):
634 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for subprocess use (via encodeFilename)."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code still passes byte strings; decode so the value can be
        # re-encoded uniformly below.
        # TODO: turn this into an assertion once all post processors pass text.
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    """Inverse of encodeArgument: decode a subprocess argument back to text."""
    return decodeFilename(b, for_subprocess=True)
650 def decodeOption(optval
):
653 if isinstance(optval
, bytes):
654 optval
= optval
.decode(preferredencoding())
656 assert isinstance(optval
, compat_str
)
def formatSeconds(secs):
    """Render a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds."""
    hours, rem = divmod(secs, 3600)
    minutes, seconds = divmod(rem, 60)
    if secs > 3600:
        return '%d:%02d:%02d' % (hours, minutes, seconds)
    elif secs > 60:
        return '%d:%02d' % (minutes, seconds)
    return '%d' % secs
669 def make_HTTPS_handler(params
, **kwargs
):
670 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
671 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
672 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
673 if opts_no_check_certificate
:
674 context
.check_hostname
= False
675 context
.verify_mode
= ssl
.CERT_NONE
677 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
680 # (create_default_context present but HTTPSHandler has no context=)
683 if sys
.version_info
< (3, 2):
684 return YoutubeDLHTTPSHandler(params
, **kwargs
)
686 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
687 context
.verify_mode
= (ssl
.CERT_NONE
688 if opts_no_check_certificate
689 else ssl
.CERT_REQUIRED
)
690 context
.set_default_verify_paths()
691 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
694 def bug_reports_message():
695 if ytdl_is_updateable():
696 update_cmd
= 'type youtube-dl -U to update'
698 update_cmd
= 'see https://yt-dl.org/update on how to update'
699 msg
= '; please report this issue on https://yt-dl.org/bug .'
700 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
701 msg
+= ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class YoutubeDLError(Exception):
    """Base exception for all YoutubeDL errors."""
710 class ExtractorError(YoutubeDLError
):
711 """Error during info extraction."""
713 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
714 """ tb, if given, is the original traceback (so that it can be printed out).
715 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
718 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
720 if video_id
is not None:
721 msg
= video_id
+ ': ' + msg
723 msg
+= ' (caused by %r)' % cause
725 msg
+= bug_reports_message()
726 super(ExtractorError
, self
).__init
__(msg
)
729 self
.exc_info
= sys
.exc_info() # preserve original exception
731 self
.video_id
= video_id
733 def format_traceback(self
):
734 if self
.traceback
is None:
736 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""

    def __init__(self, url):
        # expected=True: an unsupported site is not a bug in youtube-dl.
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression fails to match."""
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None):
        # expected=True: a geo block is site policy, not a youtube-dl bug.
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        # countries: optional list of country codes where the video is
        # presumably available — TODO confirm format against callers.
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors; it carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info())."""
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(YoutubeDLError):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    May be raised by a PostProcessor's .run() method to indicate an error
    in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        # Keep the message as an attribute for callers that report it
        # separately from the exception string.
        self.msg = msg
class MaxDownloadsReached(YoutubeDLError):
    """Raised once the --max-downloads limit has been reached."""
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller than
    what the server announced first, indicating the connection was probably
    interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        self.downloaded = downloaded  # bytes actually received
        self.expected = expected      # bytes announced by the server
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended-attribute (xattr) metadata fails.

    The `reason` attribute classifies the failure so callers can print a
    targeted hint: 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        """code: errno value if available; msg: OS error message text."""
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or
                # BUGFIX: the kernel message is spelled 'Disk quota exceeded';
                # the original only checked the misspelled 'excedded', which
                # never matched. Keep both for backward compatibility.
                'Disk quota excedded' in self.msg or
                'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
class XAttrUnavailableError(YoutubeDLError):
    """Raised when no usable xattr implementation is available."""
849 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
850 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
851 # expected HTTP responses to meet HTTP/1.0 or later (see also
852 # https://github.com/rg3/youtube-dl/issues/6727)
853 if sys
.version_info
< (3, 0):
854 kwargs
[b
'strict'] = True
855 hc
= http_class(*args
, **kwargs
)
856 source_address
= ydl_handler
._params
.get('source_address')
857 if source_address
is not None:
858 sa
= (source_address
, 0)
859 if hasattr(hc
, 'source_address'): # Python 2.7+
860 hc
.source_address
= sa
862 def _hc_connect(self
, *args
, **kwargs
):
863 sock
= compat_socket_create_connection(
864 (self
.host
, self
.port
), self
.timeout
, sa
)
866 self
.sock
= ssl
.wrap_socket(
867 sock
, self
.key_file
, self
.cert_file
,
868 ssl_version
=ssl
.PROTOCOL_TLSv1
)
871 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Strip internal 'Youtubedl-no-compression' marker before a real request.

    When the marker is present, return a copy of *headers* without any
    Accept-Encoding header (so the server sends an uncompressed response)
    and without the marker itself. Otherwise return *headers* untouched.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict((name, value) for name, value in headers.items()
                    if name.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
886 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
887 """Handler for HTTP requests and responses.
889 This class, when installed with an OpenerDirector, automatically adds
890 the standard headers to every HTTP request and handles gzipped and
891 deflated responses from web servers. If compression is to be avoided in
892 a particular request, the original request in the program code only has
893 to include the HTTP header "Youtubedl-no-compression", which will be
894 removed before making the real request.
896 Part of this code was copied from:
898 http://techknack.net/python-urllib2-handlers/
900 Andrew Rowls, the author of that code, agreed to release it to the
904 def __init__(self
, params
, *args
, **kwargs
):
905 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
906 self
._params
= params
908 def http_open(self
, req
):
909 conn_class
= compat_http_client
.HTTPConnection
911 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
913 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
914 del req
.headers
['Ytdl-socks-proxy']
916 return self
.do_open(functools
.partial(
917 _create_http_connection
, self
, conn_class
, False),
923 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
925 return zlib
.decompress(data
)
928 def addinfourl_wrapper(stream
, headers
, url
, code
):
929 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
930 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
931 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
935 def http_request(self
, req
):
936 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
937 # always respected by websites, some tend to give out URLs with non percent-encoded
938 # non-ASCII characters (see telemb.py, ard.py [#3412])
939 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
940 # To work around aforementioned issue we will replace request's original URL with
941 # percent-encoded one
942 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
943 # the code of this workaround has been moved here from YoutubeDL.urlopen()
944 url
= req
.get_full_url()
945 url_escaped
= escape_url(url
)
947 # Substitute URL if any change after escaping
948 if url
!= url_escaped
:
949 req
= update_Request(req
, url
=url_escaped
)
951 for h
, v
in std_headers
.items():
952 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
953 # The dict keys are capitalized because of this bug by urllib
954 if h
.capitalize() not in req
.headers
:
957 req
.headers
= handle_youtubedl_headers(req
.headers
)
959 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
960 # Python 2.6 is brain-dead when it comes to fragments
961 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
962 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
966 def http_response(self
, req
, resp
):
969 if resp
.headers
.get('Content-encoding', '') == 'gzip':
970 content
= resp
.read()
971 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
973 uncompressed
= io
.BytesIO(gz
.read())
974 except IOError as original_ioerror
:
975 # There may be junk add the end of the file
976 # See http://stackoverflow.com/q/4928560/35070 for details
977 for i
in range(1, 1024):
979 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
980 uncompressed
= io
.BytesIO(gz
.read())
985 raise original_ioerror
986 resp
= self
.addinfourl_wrapper(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
987 resp
.msg
= old_resp
.msg
988 del resp
.headers
['Content-encoding']
990 if resp
.headers
.get('Content-encoding', '') == 'deflate':
991 gz
= io
.BytesIO(self
.deflate(resp
.read()))
992 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
993 resp
.msg
= old_resp
.msg
994 del resp
.headers
['Content-encoding']
995 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
996 # https://github.com/rg3/youtube-dl/issues/6457).
997 if 300 <= resp
.code
< 400:
998 location
= resp
.headers
.get('Location')
1000 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1001 if sys
.version_info
>= (3, 0):
1002 location
= location
.encode('iso-8859-1').decode('utf-8')
1004 location
= location
.decode('utf-8')
1005 location_escaped
= escape_url(location
)
1006 if location
!= location_escaped
:
1007 del resp
.headers
['Location']
1008 if sys
.version_info
< (3, 0):
1009 location_escaped
= location_escaped
.encode('utf-8')
1010 resp
.headers
['Location'] = location_escaped
1013 https_request
= http_request
1014 https_response
= http_response
1017 def make_socks_conn_class(base_class
, socks_proxy
):
1018 assert issubclass(base_class
, (
1019 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
1021 url_components
= compat_urlparse
.urlparse(socks_proxy
)
1022 if url_components
.scheme
.lower() == 'socks5':
1023 socks_type
= ProxyType
.SOCKS5
1024 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
1025 socks_type
= ProxyType
.SOCKS4
1026 elif url_components
.scheme
.lower() == 'socks4a':
1027 socks_type
= ProxyType
.SOCKS4A
1029 def unquote_if_non_empty(s
):
1032 return compat_urllib_parse_unquote_plus(s
)
1036 url_components
.hostname
, url_components
.port
or 1080,
1038 unquote_if_non_empty(url_components
.username
),
1039 unquote_if_non_empty(url_components
.password
),
1042 class SocksConnection(base_class
):
1044 self
.sock
= sockssocket()
1045 self
.sock
.setproxy(*proxy_args
)
1046 if type(self
.timeout
) in (int, float):
1047 self
.sock
.settimeout(self
.timeout
)
1048 self
.sock
.connect((self
.host
, self
.port
))
1050 if isinstance(self
, compat_http_client
.HTTPSConnection
):
1051 if hasattr(self
, '_context'): # Python > 2.6
1052 self
.sock
= self
._context
.wrap_socket(
1053 self
.sock
, server_hostname
=self
.host
)
1055 self
.sock
= ssl
.wrap_socket(self
.sock
)
1057 return SocksConnection
1060 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
1061 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
1062 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
1063 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
1064 self
._params
= params
1066 def https_open(self
, req
):
1068 conn_class
= self
._https
_conn
_class
1070 if hasattr(self
, '_context'): # python > 2.6
1071 kwargs
['context'] = self
._context
1072 if hasattr(self
, '_check_hostname'): # python 3.x
1073 kwargs
['check_hostname'] = self
._check
_hostname
1075 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
1077 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
1078 del req
.headers
['Ytdl-socks-proxy']
1080 return self
.do_open(functools
.partial(
1081 _create_http_connection
, self
, conn_class
, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that reuses the HTTP handlers for HTTPS as well."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are
        # non-ASCII characters in the Set-Cookie HTTP header of the last
        # response (see https://github.com/rg3/youtube-dl/issues/6769).
        # A percent-encoding workaround for Set-Cookie used to live here;
        # it is kept below, deliberately disabled:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (timezone, date_str): *timezone* is a datetime.timedelta (zero
    for 'Z', for a missing designator, or an unsigned one), and *date_str*
    has the designator removed when one was found.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        return datetime.timedelta(), date_str
    # Trim the designator off the date string.
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # 'Z' (UTC) carries no numeric offset.
        return datetime.timedelta(), date_str
    sign = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    return offset, date_str
1127 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
1128 """ Return a UNIX timestamp from the given date """
1130 if date_str
is None:
1133 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
1135 if timezone
is None:
1136 timezone
, date_str
= extract_timezone(date_str
)
1139 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
1140 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
1141 return calendar
.timegm(dt
.timetuple())
def date_formats(day_first=True):
    """Return the date-format table matching the requested day/month ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1150 def unified_strdate(date_str
, day_first
=True):
1151 """Return a string with the date in the format YYYYMMDD"""
1153 if date_str
is None:
1157 date_str
= date_str
.replace(',', ' ')
1158 # Remove AM/PM + timezone
1159 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1160 _
, date_str
= extract_timezone(date_str
)
1162 for expression
in date_formats(day_first
):
1164 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
1167 if upload_date
is None:
1168 timetuple
= email
.utils
.parsedate_tz(date_str
)
1171 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
1174 if upload_date
is not None:
1175 return compat_str(upload_date
)
1178 def unified_timestamp(date_str
, day_first
=True):
1179 if date_str
is None:
1182 date_str
= date_str
.replace(',', ' ')
1184 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
1185 timezone
, date_str
= extract_timezone(date_str
)
1187 # Remove AM/PM + timezone
1188 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1190 for expression
in date_formats(day_first
):
1192 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
1193 return calendar
.timegm(dt
.timetuple())
1196 timetuple
= email
.utils
.parsedate_tz(date_str
)
1198 return calendar
.timegm(timetuple
) + pm_delta
* 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to `default_ext`."""
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle filename: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            # Open-ended on the left: earliest representable date.
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            # Open-ended on the right: latest representable date.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        # Python 2 may hand back a byte string in the locale encoding.
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,  # stdout -> STD_OUTPUT_HANDLE
        2: -12,  # stderr -> STD_ERROR_HANDLE
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid or not an interactive console.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 BMP chars at a time; a lone non-BMP char is
        # written as its two UTF-16 surrogates (count of 2).
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
def write_string(s, out=None, encoding=None):
    """Write a text string to `out` (default sys.stderr), robust to
    byte-mode streams, Windows consoles, and missing .buffer attributes."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
def bytes_to_intlist(bs):
    """Convert a bytes/str object into a list of byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of byte values back into a bytes object."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper that holds an advisory lock for the life of a
    `with` block (shared for 'r', exclusive for 'a'/'w')."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Filesystem encoding, defaulting to UTF-8 when Python reports None."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
def shell_quote(args):
    """Quote a list of arguments for display as a shell command line."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge with any data already smuggled into the URL.
    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (clean_url, data); data is `default`
    when nothing was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string like '1.00KiB'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' using `unit_table` multipliers; None on no match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept ',' as a decimal separator too.
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    """Parse a human file size ('1.2MiB', '5 GB', ...) into bytes."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
def parse_count(s):
    """Parse a human count ('1.2k', '3M', '1,234') into an int."""
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process title via glibc prctl(PR_SET_NAME)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip a leading `start` prefix from s when present; None passes through."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip a trailing `end` suffix from s when present; None passes through."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of a URL."""
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]
def base_url(url):
    """Return the URL up to (and including) the last path slash."""
    return re.match(r'https?://[^?#&]+/', url).group()
def urljoin(base, path):
    """Join `base` and `path` into an absolute URL; None when impossible."""
    if not isinstance(path, compat_str) or not path:
        return None
    # Already absolute (possibly protocol-relative).
    if re.match(r'^(?:https?:)?//', path):
        return path
    if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that always issues a HEAD."""
    def get_method(self):
        return 'HEAD'
class PUTRequest(compat_urllib_request.Request):
    """Request subclass that always issues a PUT."""
    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int (optionally via getattr and scaling); `default` on failure."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Stringify v; `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and a leading '+'.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float with optional scaling; `default` on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def strip_or_none(v):
    """str.strip() that tolerates None."""
    if v is None:
        return None
    return v.strip()
def parse_duration(s):
    """Parse a duration string ('1:23:45', '3 min', '1h2m3s', ...) to seconds."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension (when it matches
    `expected_real_ext`, if given), else append it."""
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file extension with `ext`; if `expected_real_ext` is
    given and does not match, append instead of replacing."""
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version number from --version output; `unrecognized`
    when no version can be found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Base class for lazily paged result lists; subclasses implement getslice()."""
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via `pagefunc(pagenum)`."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences found in `s`."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences found in `s`."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Keep RFC 3986 reserved characters unescaped.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # IDNA-encode the host, percent-escape everything else.
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping comments and a UTF-8 BOM."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """urlencode and ASCII-encode for use as POST data."""
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
def update_url_query(url, query):
    """Merge `query` dict into the query string of `url`."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Rebuild a urllib Request with updated url/data/headers/query,
    preserving the original HTTP method (HEAD/PUT/GET...)."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """d.get() that accepts a tuple/list of candidate keys, returning the
    first usable value (falsy values skipped unless skip_false_values=False)."""
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def try_get(src, getter, expected_type=None):
    """Apply getter(src), swallowing common lookup errors; optionally
    require the result to be of `expected_type`."""
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        pass
    else:
        if expected_type is None or isinstance(v, expected_type):
            return v
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Decode `string` to compat_str unless it already is one."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2116 TV_PARENTAL_GUIDELINES
= {
def parse_age_limit(s):
    """Normalize an age limit (int, 'NN+', US rating, or TV guideline) to an int."""
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, leaving the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text."""
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Re-quote single-quoted strings and normalize escapes.
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a version string on '.' or '-' into a tuple of ints."""
    return tuple(int(e) for e in re.split(r'[-.]', v))
def is_outdated_version(version, limit, assume_new=True):
    """True when `version` < `limit`; unparsable input yields `not assume_new`."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zip bundle or a frozen executable.
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Return a text (unicode) representation of an exception."""
    # Restored truncated str() conversion and return from the transcription.
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    # Map a MIME type to a filename extension.
    # NOTE(review): this transcription is incomplete — the None guard, the
    # full-MIME-type lookup table around 'audio/mpeg' and most entries of the
    # subtype table below are missing from the visible text; compare against
    # the upstream file before relying on this block.
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    # Keep only the subtype ('video/mp4' -> 'mp4') and drop ';'-parameters.
    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()
        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    # Split an RFC 6381 codecs string (e.g. 'avc1.64001f, mp4a.40.2') into a
    # dict with 'vcodec'/'acodec' entries.
    # NOTE(review): this transcription is incomplete — the empty-input guard,
    # the vcodec/acodec assignments inside the branches and the return
    # statements are missing from the visible text.
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # The codec family is everything before the first '.'
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Nothing recognized: fall back on positional heuristics.
        if len(splited_codecs) == 2:
        elif len(splited_codecs) == 1:
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
def urlhandle_detect_ext(url_handle):
    # Guess a file extension from an HTTP response: prefer the filename in
    # the Content-Disposition header, fall back to the Content-Type MIME type.
    # NOTE(review): this transcription is incomplete — the guards between
    # these statements (checking cd/m/e for None before use) are missing from
    # the visible text.
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)
    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 ``data:`` URI for the given bytes and MIME type."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Restored truncated 'return False' after the first guard.
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    # Blocked when the viewer's limit is below the content's requirement.
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Byte-order marks to try, longest first so UTF-32 wins over UTF-16.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Restored truncated break/else control flow of the BOM scan.
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for an info dict.

    An explicit 'protocol' entry wins; otherwise the URL scheme/extension
    decides. Truncated return statements from the transcription restored.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    # Manifest extensions imply their own protocols.
    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell per column determines that column's width.
    max_lens = [max(len(compat_str(cell)) for cell in column)
                for column in zip(*table)]
    # Left-align every column but let the last one run free.
    col_formats = ['%-' + compat_str(width + 1) + 's' for width in max_lens[:-1]]
    format_str = ' '.join(col_formats) + '%s'
    rendered_rows = [format_str % tuple(row) for row in table]
    return '\n'.join(rendered_rows)
def _match_one(filter_part, dct):
    # Evaluate one '<key><op><value>' (or unary '<op><key>') filter expression
    # against dct.
    # NOTE(review): this transcription is incomplete — the comparison-operator
    # table entries, the key group of the first regex, the 'if m:' guards, the
    # UNARY_OPERATORS dict header and the ValueError raises are missing from
    # the visible text; compare with the upstream file before relying on it.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = COMPARISON_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    if (m.group('quotedstrval') is not None or
        m.group('strval') is not None or
        # If the original field is a string and matching comparisonvalue is
        # a number we should respect the origin of the original field
        # and process comparison value as a string (see
        # https://github.com/rg3/youtube-dl/issues/11082).
        actual_value is not None and m.group('intval') is not None and
        isinstance(actual_value, compat_str)):
        # String comparisons only support equality operators.
        if m.group('op') not in ('=', '!='):
            'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
        quote = m.group('quote')
        if quote is not None:
            # Unescape the quote character inside quoted values.
            comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        comparison_value = int(m.group('intval'))
        # Fall back to filesize suffix parsing ('500k', '1.2MiB', ...).
        comparison_value = parse_filesize(m.group('intval'))
        if comparison_value is None:
            comparison_value = parse_filesize(m.group('intval') + 'B')
        if comparison_value is None:
            'Invalid integer value %r in filter part %r' % (
                m.group('intval'), filter_part))
    if actual_value is None:
        # Missing keys match only when the '?' (none-inclusive) flag was used.
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)

    # Unary existence operators: '' -> key set, '!' -> key unset.
    '': lambda v: v is not None,
    '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Restored truncated 'return all(' — every '&'-separated clause must match.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callback.

    The returned function yields None when the info dict passes the filter,
    otherwise a human-readable skip message. Truncated returns restored.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None.

    Supports plain offsets ('12.3' / '12.3s') and clock times ('HH:MM:SS.f'
    or 'HH:MM:SS:frames'). Truncated guards from the transcription restored.
    """
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # ':' between seconds and fraction is treated as a decimal point.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a float second count as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    # Convert DFXP/TTML subtitle markup into SRT text.
    # NOTE(review): this transcription is incomplete — parts of
    # TTMLPElementParser, the output accumulator, the default begin/end
    # handling and the final return are missing from the visible text.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    class TTMLPElementParser(object):
        def start(self, tag, attrib):
            # <br/> elements (in any supported namespace) become newlines.
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        # Serialize the element and re-parse it through the text-extracting
        # target to obtain plain text with <br/> handled.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Try each known TTML namespace before falling back to un-namespaced <p>.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
    raise ValueError('Invalid dfxp/TTML subtitle')
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
        # Missing 'end' is derived from 'begin' + 'dur'.
        end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Map params[param] to ['<option>', value] CLI args, or [] when unset."""
    param = params.get(param)
    # Restored truncated guard: only truthy values are stringified.
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Map a boolean params[param] to CLI args.

    With `separator`, emits a single 'option<sep>value' token; otherwise the
    option and value are separate list items. Truncated guard restored.
    """
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the list stored at params[param], or `default` when absent.

    NOTE: `default` is a shared mutable default; it is returned as-is and
    never mutated here, but callers should not modify the result in place.
    """
    # Restored truncated None guard and returns from the transcription.
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
2581 class ISO639Utils(object):
2582 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant ('en-US' -> 'en').
        # NOTE(review): the @classmethod decorator is not visible in this
        # transcription — confirm it is present in the actual file.
        return cls._lang_map.get(code[:2])
2776 def long2short(cls, code):
2777 """Convert language code from ISO 639-2/T to ISO 639-1"""
2778 for short_name, long_name in cls._lang_map.items():
2779 if long_name == code:
2783 class ISO3166Utils(object):
2784 # From http://data.okfn.org/data/core/country-list
2786 'AF': 'Afghanistan',
2787 'AX': 'Åland Islands',
2790 'AS': 'American Samoa',
2795 'AG': 'Antigua and Barbuda',
2812 'BO': 'Bolivia, Plurinational State of',
2813 'BQ': 'Bonaire, Sint Eustatius and Saba',
2814 'BA': 'Bosnia and Herzegovina',
2816 'BV': 'Bouvet Island',
2818 'IO': 'British Indian Ocean Territory',
2819 'BN': 'Brunei Darussalam',
2821 'BF': 'Burkina Faso',
2827 'KY': 'Cayman Islands',
2828 'CF': 'Central African Republic',
2832 'CX': 'Christmas Island',
2833 'CC': 'Cocos (Keeling) Islands',
2837 'CD': 'Congo, the Democratic Republic of the',
2838 'CK': 'Cook Islands',
2840 'CI': 'Côte d\'Ivoire',
2845 'CZ': 'Czech Republic',
2849 'DO': 'Dominican Republic',
2852 'SV': 'El Salvador',
2853 'GQ': 'Equatorial Guinea',
2857 'FK': 'Falkland Islands (Malvinas)',
2858 'FO': 'Faroe Islands',
2862 'GF': 'French Guiana',
2863 'PF': 'French Polynesia',
2864 'TF': 'French Southern Territories',
2879 'GW': 'Guinea-Bissau',
2882 'HM': 'Heard Island and McDonald Islands',
2883 'VA': 'Holy See (Vatican City State)',
2890 'IR': 'Iran, Islamic Republic of',
2893 'IM': 'Isle of Man',
2903 'KP': 'Korea, Democratic People\'s Republic of',
2904 'KR': 'Korea, Republic of',
2907 'LA': 'Lao People\'s Democratic Republic',
2913 'LI': 'Liechtenstein',
2917 'MK': 'Macedonia, the Former Yugoslav Republic of',
2924 'MH': 'Marshall Islands',
2930 'FM': 'Micronesia, Federated States of',
2931 'MD': 'Moldova, Republic of',
2942 'NL': 'Netherlands',
2943 'NC': 'New Caledonia',
2944 'NZ': 'New Zealand',
2949 'NF': 'Norfolk Island',
2950 'MP': 'Northern Mariana Islands',
2955 'PS': 'Palestine, State of',
2957 'PG': 'Papua New Guinea',
2960 'PH': 'Philippines',
2964 'PR': 'Puerto Rico',
2968 'RU': 'Russian Federation',
2970 'BL': 'Saint Barthélemy',
2971 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2972 'KN': 'Saint Kitts and Nevis',
2973 'LC': 'Saint Lucia',
2974 'MF': 'Saint Martin (French part)',
2975 'PM': 'Saint Pierre and Miquelon',
2976 'VC': 'Saint Vincent and the Grenadines',
2979 'ST': 'Sao Tome and Principe',
2980 'SA': 'Saudi Arabia',
2984 'SL': 'Sierra Leone',
2986 'SX': 'Sint Maarten (Dutch part)',
2989 'SB': 'Solomon Islands',
2991 'ZA': 'South Africa',
2992 'GS': 'South Georgia and the South Sandwich Islands',
2993 'SS': 'South Sudan',
2998 'SJ': 'Svalbard and Jan Mayen',
3001 'CH': 'Switzerland',
3002 'SY': 'Syrian Arab Republic',
3003 'TW': 'Taiwan, Province of China',
3005 'TZ': 'Tanzania, United Republic of',
3007 'TL': 'Timor-Leste',
3011 'TT': 'Trinidad and Tobago',
3014 'TM': 'Turkmenistan',
3015 'TC': 'Turks and Caicos Islands',
3019 'AE': 'United Arab Emirates',
3020 'GB': 'United Kingdom',
3021 'US': 'United States',
3022 'UM': 'United States Minor Outlying Islands',
3026 'VE': 'Venezuela, Bolivarian Republic of',
3028 'VG': 'Virgin Islands, British',
3029 'VI': 'Virgin Islands, U.S.',
3030 'WF': 'Wallis and Futuna',
3031 'EH': 'Western Sahara',
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive: normalize to upper case before the table lookup;
        # returns None for unknown codes.
        return cls._country_map.get(code.upper())
3043 class GeoUtils(object):
3044 # Major IPv4 address blocks per country
3046 'AD': '85.94.160.0/19',
3047 'AE': '94.200.0.0/13',
3048 'AF': '149.54.0.0/17',
3049 'AG': '209.59.64.0/18',
3050 'AI': '204.14.248.0/21',
3051 'AL': '46.99.0.0/16',
3052 'AM': '46.70.0.0/15',
3053 'AO': '105.168.0.0/13',
3054 'AP': '159.117.192.0/21',
3055 'AR': '181.0.0.0/12',
3056 'AS': '202.70.112.0/20',
3057 'AT': '84.112.0.0/13',
3058 'AU': '1.128.0.0/11',
3059 'AW': '181.41.0.0/18',
3060 'AZ': '5.191.0.0/16',
3061 'BA': '31.176.128.0/17',
3062 'BB': '65.48.128.0/17',
3063 'BD': '114.130.0.0/16',
3065 'BF': '129.45.128.0/17',
3066 'BG': '95.42.0.0/15',
3067 'BH': '37.131.0.0/17',
3068 'BI': '154.117.192.0/18',
3069 'BJ': '137.255.0.0/16',
3070 'BL': '192.131.134.0/24',
3071 'BM': '196.12.64.0/18',
3072 'BN': '156.31.0.0/16',
3073 'BO': '161.56.0.0/16',
3074 'BQ': '161.0.80.0/20',
3075 'BR': '152.240.0.0/12',
3076 'BS': '24.51.64.0/18',
3077 'BT': '119.2.96.0/19',
3078 'BW': '168.167.0.0/16',
3079 'BY': '178.120.0.0/13',
3080 'BZ': '179.42.192.0/18',
3081 'CA': '99.224.0.0/11',
3082 'CD': '41.243.0.0/16',
3083 'CF': '196.32.200.0/21',
3084 'CG': '197.214.128.0/17',
3085 'CH': '85.0.0.0/13',
3086 'CI': '154.232.0.0/14',
3087 'CK': '202.65.32.0/19',
3088 'CL': '152.172.0.0/14',
3089 'CM': '165.210.0.0/15',
3090 'CN': '36.128.0.0/10',
3091 'CO': '181.240.0.0/12',
3092 'CR': '201.192.0.0/12',
3093 'CU': '152.206.0.0/15',
3094 'CV': '165.90.96.0/19',
3095 'CW': '190.88.128.0/17',
3096 'CY': '46.198.0.0/15',
3097 'CZ': '88.100.0.0/14',
3099 'DJ': '197.241.0.0/17',
3100 'DK': '87.48.0.0/12',
3101 'DM': '192.243.48.0/20',
3102 'DO': '152.166.0.0/15',
3103 'DZ': '41.96.0.0/12',
3104 'EC': '186.68.0.0/15',
3105 'EE': '90.190.0.0/15',
3106 'EG': '156.160.0.0/11',
3107 'ER': '196.200.96.0/20',
3108 'ES': '88.0.0.0/11',
3109 'ET': '196.188.0.0/14',
3110 'EU': '2.16.0.0/13',
3111 'FI': '91.152.0.0/13',
3112 'FJ': '144.120.0.0/16',
3113 'FM': '119.252.112.0/20',
3114 'FO': '88.85.32.0/19',
3116 'GA': '41.158.0.0/15',
3118 'GD': '74.122.88.0/21',
3119 'GE': '31.146.0.0/16',
3120 'GF': '161.22.64.0/18',
3121 'GG': '62.68.160.0/19',
3122 'GH': '45.208.0.0/14',
3123 'GI': '85.115.128.0/19',
3124 'GL': '88.83.0.0/19',
3125 'GM': '160.182.0.0/15',
3126 'GN': '197.149.192.0/18',
3127 'GP': '104.250.0.0/19',
3128 'GQ': '105.235.224.0/20',
3129 'GR': '94.64.0.0/13',
3130 'GT': '168.234.0.0/16',
3131 'GU': '168.123.0.0/16',
3132 'GW': '197.214.80.0/20',
3133 'GY': '181.41.64.0/18',
3134 'HK': '113.252.0.0/14',
3135 'HN': '181.210.0.0/16',
3136 'HR': '93.136.0.0/13',
3137 'HT': '148.102.128.0/17',
3138 'HU': '84.0.0.0/14',
3139 'ID': '39.192.0.0/10',
3140 'IE': '87.32.0.0/12',
3141 'IL': '79.176.0.0/13',
3142 'IM': '5.62.80.0/20',
3143 'IN': '117.192.0.0/10',
3144 'IO': '203.83.48.0/21',
3145 'IQ': '37.236.0.0/14',
3146 'IR': '2.176.0.0/12',
3147 'IS': '82.221.0.0/16',
3148 'IT': '79.0.0.0/10',
3149 'JE': '87.244.64.0/18',
3150 'JM': '72.27.0.0/17',
3151 'JO': '176.29.0.0/16',
3152 'JP': '126.0.0.0/8',
3153 'KE': '105.48.0.0/12',
3154 'KG': '158.181.128.0/17',
3155 'KH': '36.37.128.0/17',
3156 'KI': '103.25.140.0/22',
3157 'KM': '197.255.224.0/20',
3158 'KN': '198.32.32.0/19',
3159 'KP': '175.45.176.0/22',
3160 'KR': '175.192.0.0/10',
3161 'KW': '37.36.0.0/14',
3162 'KY': '64.96.0.0/15',
3163 'KZ': '2.72.0.0/13',
3164 'LA': '115.84.64.0/18',
3165 'LB': '178.135.0.0/16',
3166 'LC': '192.147.231.0/24',
3167 'LI': '82.117.0.0/19',
3168 'LK': '112.134.0.0/15',
3169 'LR': '41.86.0.0/19',
3170 'LS': '129.232.0.0/17',
3171 'LT': '78.56.0.0/13',
3172 'LU': '188.42.0.0/16',
3173 'LV': '46.109.0.0/16',
3174 'LY': '41.252.0.0/14',
3175 'MA': '105.128.0.0/11',
3176 'MC': '88.209.64.0/18',
3177 'MD': '37.246.0.0/16',
3178 'ME': '178.175.0.0/17',
3179 'MF': '74.112.232.0/21',
3180 'MG': '154.126.0.0/17',
3181 'MH': '117.103.88.0/21',
3182 'MK': '77.28.0.0/15',
3183 'ML': '154.118.128.0/18',
3184 'MM': '37.111.0.0/17',
3185 'MN': '49.0.128.0/17',
3186 'MO': '60.246.0.0/16',
3187 'MP': '202.88.64.0/20',
3188 'MQ': '109.203.224.0/19',
3189 'MR': '41.188.64.0/18',
3190 'MS': '208.90.112.0/22',
3191 'MT': '46.11.0.0/16',
3192 'MU': '105.16.0.0/12',
3193 'MV': '27.114.128.0/18',
3194 'MW': '105.234.0.0/16',
3195 'MX': '187.192.0.0/11',
3196 'MY': '175.136.0.0/13',
3197 'MZ': '197.218.0.0/15',
3198 'NA': '41.182.0.0/16',
3199 'NC': '101.101.0.0/18',
3200 'NE': '197.214.0.0/18',
3201 'NF': '203.17.240.0/22',
3202 'NG': '105.112.0.0/12',
3203 'NI': '186.76.0.0/15',
3204 'NL': '145.96.0.0/11',
3205 'NO': '84.208.0.0/13',
3206 'NP': '36.252.0.0/15',
3207 'NR': '203.98.224.0/19',
3208 'NU': '49.156.48.0/22',
3209 'NZ': '49.224.0.0/14',
3210 'OM': '5.36.0.0/15',
3211 'PA': '186.72.0.0/15',
3212 'PE': '186.160.0.0/14',
3213 'PF': '123.50.64.0/18',
3214 'PG': '124.240.192.0/19',
3215 'PH': '49.144.0.0/13',
3216 'PK': '39.32.0.0/11',
3217 'PL': '83.0.0.0/11',
3218 'PM': '70.36.0.0/20',
3219 'PR': '66.50.0.0/16',
3220 'PS': '188.161.0.0/16',
3221 'PT': '85.240.0.0/13',
3222 'PW': '202.124.224.0/20',
3223 'PY': '181.120.0.0/14',
3224 'QA': '37.210.0.0/15',
3225 'RE': '139.26.0.0/16',
3226 'RO': '79.112.0.0/13',
3227 'RS': '178.220.0.0/14',
3228 'RU': '5.136.0.0/13',
3229 'RW': '105.178.0.0/15',
3230 'SA': '188.48.0.0/13',
3231 'SB': '202.1.160.0/19',
3232 'SC': '154.192.0.0/11',
3233 'SD': '154.96.0.0/13',
3234 'SE': '78.64.0.0/12',
3235 'SG': '152.56.0.0/14',
3236 'SI': '188.196.0.0/14',
3237 'SK': '78.98.0.0/15',
3238 'SL': '197.215.0.0/17',
3239 'SM': '89.186.32.0/19',
3240 'SN': '41.82.0.0/15',
3241 'SO': '197.220.64.0/19',
3242 'SR': '186.179.128.0/17',
3243 'SS': '105.235.208.0/21',
3244 'ST': '197.159.160.0/19',
3245 'SV': '168.243.0.0/16',
3246 'SX': '190.102.0.0/20',
3248 'SZ': '41.84.224.0/19',
3249 'TC': '65.255.48.0/20',
3250 'TD': '154.68.128.0/19',
3251 'TG': '196.168.0.0/14',
3252 'TH': '171.96.0.0/13',
3253 'TJ': '85.9.128.0/18',
3254 'TK': '27.96.24.0/21',
3255 'TL': '180.189.160.0/20',
3256 'TM': '95.85.96.0/19',
3257 'TN': '197.0.0.0/11',
3258 'TO': '175.176.144.0/21',
3259 'TR': '78.160.0.0/11',
3260 'TT': '186.44.0.0/15',
3261 'TV': '202.2.96.0/19',
3262 'TW': '120.96.0.0/11',
3263 'TZ': '156.156.0.0/14',
3264 'UA': '93.72.0.0/13',
3265 'UG': '154.224.0.0/13',
3267 'UY': '167.56.0.0/13',
3268 'UZ': '82.215.64.0/18',
3269 'VA': '212.77.0.0/19',
3270 'VC': '24.92.144.0/20',
3271 'VE': '186.88.0.0/13',
3272 'VG': '172.103.64.0/18',
3273 'VI': '146.226.0.0/16',
3274 'VN': '14.160.0.0/11',
3275 'VU': '202.80.32.0/20',
3276 'WF': '117.20.32.0/21',
3277 'WS': '202.4.32.0/19',
3278 'YE': '134.35.0.0/16',
3279 'YT': '41.242.116.0/22',
3280 'ZA': '41.0.0.0/11',
3281 'ZM': '165.56.0.0/13',
3282 'ZW': '41.85.192.0/19',
3286 def random_ipv4(cls, code):
3287 block = cls._country_ip_map.get(code.upper())
3290 addr, preflen = block.split('/')
3291 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3292 addr_max = addr_min | (0xffffffff >> int(preflen))
3293 return compat_str(socket.inet_ntoa(
3294 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Restored truncated assignment: the per-request proxy overrides
            # the handler-level one.
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # Interpret the reversed bytes as an integer (little-endian payload),
    # then apply textbook RSA (m^e mod N) and render as lowercase hex.
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Encode non-negative integer `num` in base `n` using `table` as digits.

    Without `table`, digits come from 0-9a-zA-Z (so n may be at most 62).
    Raises ValueError when the base exceeds the table length. Truncated
    guards and the accumulation loop restored from the transcription.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """De-obfuscate JavaScript packed with the p.a.c.k.e.r scheme.

    Builds a base-N symbol table from the packed payload and substitutes
    every word token of the obfuscated code. Truncated bookkeeping
    statements restored from the transcription.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfucasted_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        # Empty symbol slots map back to their own base-N name.
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfucasted_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=val,KEY2="quoted"') into a dict.

    Quoted values keep embedded commas and lose their surrounding quotes.
    Truncated accumulator and return restored from the transcription.
    """
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift (matches JavaScript's >>> operator)."""
    if val >= 0:
        return val >> n
    # Map the negative value onto its 32-bit two's-complement image first.
    return (val + 0x100000000) >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    # Decode a PNG byte string into (width, height, pixels), applying the
    # per-scanline filters by hand.
    # NOTE(review): this transcription is incomplete — the chunk-collection
    # loop scaffolding, the chunks/pixels/stride setup, _get_pixel's body and
    # the Paeth predictor arithmetic (a/b/c, pa/pb/pc) are missing from the
    # visible text.
    header = png_data[8:]
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')
    # Big-endian integer readers keyed by byte width.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
    length = unpack_integer(header[:4])
    chunk_type = header[:4]
    chunk_data = header[:length]
    header = header[length:]
    header = header[4:]  # Skip CRC
    # IHDR is always the first chunk and carries the image dimensions.
    ihdr = chunks[0]['data']
    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']
    raise IOError('Unable to read PNG data.')
    decompressed_data = bytearray(zlib.decompress(idat))
    def _get_pixel(idx):
    for y in range(height):
        # Each scanline is one filter-type byte followed by `stride` bytes.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]
        pixels.append(current_row)
        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = _get_pixel(basex - 3)
            up = _get_pixel(basex - stride)
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                c = _get_pixel(basex - stride - 3)
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                    color = (color + b) & 0xff
                    color = (color + c) & 0xff
            current_row.append(color)
    return width, height, pixels
def write_xattr(path, key, value):
    # Write an extended attribute `key`=`value` (bytes) on the file at `path`.
    # This mess below finds the best xattr tool for the job
    # NOTE(review): this transcription is incomplete — the try/except around
    # the pyxattr import, the Windows ADS write body and several control-flow
    # lines are missing from the visible text.
    # try the pyxattr module...
    if hasattr(xattr, 'set'):  # pyxattr
        # Unicode arguments are not supported in python-pyxattr until
        # See https://github.com/rg3/youtube-dl/issues/5498
        pyxattr_required_version = '0.5.0'
        if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
            # TODO: fallback to CLI tools
            raise XAttrUnavailableError(
                'python-pyxattr is detected but is too old. '
                'youtube-dl requires %s or above while your version is %s. '
                'Falling back to other xattr implementations' % (
                    pyxattr_required_version, xattr.__version__))
        setxattr = xattr.set
    setxattr = xattr.setxattr
    setxattr(path, key, value)
    except EnvironmentError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if compat_os_name == 'nt':
        # Write xattrs to NTFS Alternate Data Streams:
        # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
        assert ':' not in key
        assert os.path.exists(path)
        ads_fn = path + ':' + key
        with open(ads_fn, 'wb') as f:
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
    # Fall back to the setfattr / xattr command-line tools.
    user_has_setfattr = check_executable('setfattr', ['--version'])
    user_has_xattr = check_executable('xattr', ['-h'])
    if user_has_setfattr or user_has_xattr:
        value = value.decode('utf-8')
        if user_has_setfattr:
            executable = 'setfattr'
            opts = ['-n', key, '-v', value]
        elif user_has_xattr:
            executable = 'xattr'
            opts = ['-w', key, value]
        cmd = ([encodeFilename(executable, True)] +
               [encodeArgument(o) for o in opts] +
               [encodeFilename(path, True)])
        p = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        stdout, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)
    # On Unix, and can't find pyxattr, setfattr, or xattr.
    if sys.platform.startswith('linux'):
        raise XAttrUnavailableError(
            "Couldn't find a tool to set the xattrs. "
            "Install either the python 'pyxattr' or 'xattr' "
            "modules, or the GNU 'attr' package "
            "(which contains the 'setfattr' tool).")
    raise XAttrUnavailableError(
        "Couldn't find a tool to set the xattrs. "
        "Install either the python 'xattr' module, "
        "or the 'xattr' binary.")