# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import base64
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import pipes
import platform
import re
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
    compat_html_entities,
    compat_http_client,
    compat_kwargs,
    compat_parse_qs,
    compat_socket_create_connection,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
    shlex_quote,
)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non-ASCII characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise

if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val:
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

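# Usage sketch (illustrative only; the document and attribute values below
# are made up for the example):
#
#   doc = compat_etree_fromstring('<root><media id="1"/><media id="2"/></root>')
#   find_xpath_attr(doc, './/media', 'id', '2')  # -> the second <media> element
#   find_xpath_attr(doc, './/media', 'lang')     # -> None (no such attribute)
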
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

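# For example (this mirrors how dfxp2srt below builds its helper):
#
#   _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
#   _x('.//ttml:p')  # -> './/{http://www.w3.org/ns/ttml}p'
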
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        if sys.version_info < (2, 7):  # Crazy 2.6
            xpath = xpath.encode('ascii')
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]

def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute("id", id, html)


def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

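# A couple of worked examples (comments only; the results follow from the
# replace_insane table above):
#
#   sanitize_filename('Foo: bar/baz?')                   # -> 'Foo - bar_baz'
#   sanitize_filename('Foo: bar/baz?', restricted=True)  # -> 'Foo_-_bar_baz'
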
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res

def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)

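# For instance:
#
#   unescapeHTML('&amp;')   # -> '&'  (named entity)
#   unescapeHTML('&#x61;')  # -> 'a'  (hexadecimal numeric entity)
#   unescapeHTML('&#97;')   # -> 'a'  (decimal numeric entity)
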
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs

def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg

class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected

def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc

class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response

class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response

def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())

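# Example: an ISO 8601 timestamp with a +02:00 offset converts to the
# corresponding UTC UNIX timestamp:
#
#   parse_iso8601('2015-09-29T18:27:31+02:00')  # -> 1443544051
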
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)

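# Several textual date styles normalize to the same YYYYMMDD string, e.g.:
#
#   unified_strdate('December 21, 2012')          # -> '20121221'
#   unified_strdate('2012-12-21T14:00:00+02:00')  # -> '20121221'
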
def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format

def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())

def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res

def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True

def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()

def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)

# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)

class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'

def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)

def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    sdata = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data

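# Round-trip example (comments only; the URL is made up):
#
#   u = smuggle_url('http://example.com/video', {'referrer': 'webpage'})
#   unsmuggle_url(u)  # -> ('http://example.com/video', {'referrer': 'webpage'})
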
def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)

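# For instance:
#
#   format_bytes(1536)       # -> '1.50KiB'
#   format_bytes(1024 ** 3)  # -> '1.00GiB'
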
def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)

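# Sample conversions (assuming the unit table above):
#
#   parse_filesize('1.2MiB')  # -> 1258291  (1.2 * 1024 ** 2, truncated)
#   parse_filesize('500 kB')  # -> 512000   (note the quirky lower-case units)
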
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)


def setproctitle(title):
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this

def remove_start(s, start):
    if s.startswith(start):
        return s[len(start):]
    return s


def remove_end(s, end):
    if s.endswith(end):
        return s[:-len(end)]
    return s

def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]

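# For example (query strings are dropped by urlparse):
#
#   url_basename('http://example.com/a/b.mp4?t=1')  # -> 'b.mp4'
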
class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'HEAD'


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    else:
        return int(v) * invscale // scale


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    else:
        return float(v) * invscale / scale

def parse_duration(s):
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None
    res = 0
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    if m.group('secs'):
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    if m.group('mins'):
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    if m.group('days'):
        res += int(m.group('days')) * 24 * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res

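# A few inputs and the seconds they yield:
#
#   parse_duration('01:30:05')  # -> 5405
#   parse_duration('2.5 mins')  # -> 150.0 (via the only_mins branch)
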
def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)

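# For example:
#
#   prepend_extension('video.mp4', 'temp')  # -> 'video.temp.mp4'
#   replace_extension('video.mp4', 'mkv')   # -> 'video.mkv'
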
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe


def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)


def detect_exe_version(output, version_re=None, unrecognized='present'):
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized

class PagedList(object):
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", i.e. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res


class InAdvancePagedList(PagedList):
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res

def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def lowercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")


def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()

try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack

def read_batch_urls(batch_fd):
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]


def urlencode_postdata(*args, **kargs):
    return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')


def encode_dict(d, encoding='utf-8'):
    return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())

US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)


def strip_jsonp(code):
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)

def js_to_json(code):
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res

def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q


DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'


def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s


def version_tuple(v):
    return tuple(int(e) for e in re.split(r'[-.]', v))


def is_outdated_version(version, limit, assume_new=True):
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new


def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')

def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(shlex_quote(a) for a in args)


def mimetype2ext(mt):
    _, _, res = mt.rpartition('/')

    return {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }.get(res, res)

def urlhandle_detect_ext(url_handle):
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))


def encode_data_uri(data, mime_type):
    return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))

def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)

def determine_protocol(info_dict):
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme

def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)

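# A small illustration of the column sizing:
#
#   render_table(['format', 'note'], [['mp4', 'best']])
#   # -> 'format note\nmp4    best'
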
def _match_one(filter_part, dct):
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)


def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))


def match_filter_func(filter_str):
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func

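# The filter grammar in action (comments only; keys are illustrative):
#
#   match_str('duration < 60', {'duration': 30})           # -> True
#   match_str('like_count > 100 & title', {'title': 'x'})  # -> False (like_count missing)
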
def parse_dfxp_time_expr(time_expr):
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))


def srt_subtitles_timecode(seconds):
    return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)


def dfxp2srt(dfxp_data):
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
            else:
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)

def cli_option(params, command_option, param):
    param = params.get(param)
    return [command_option, param] if param is not None else []


def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]


def cli_valueless_option(params, command_option, param, expected_value=True):
    param = params.get(param)
    return [command_option] if param == expected_value else []


def cli_configuration_args(params, param, default=[]):
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args

class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        # ISO 639-1 -> ISO 639-2/T pairs; the full table (several hundred
        # entries, 'aa': 'aar' through 'zu': 'zul') is elided in this excerpt
        'en': 'eng',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name

class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())

class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)