2 # -*- coding: utf-8 -*-
28 import xml
.etree
.ElementTree
32 import urllib
.request
as compat_urllib_request
33 except ImportError: # Python 2
34 import urllib2
as compat_urllib_request
37 import urllib
.error
as compat_urllib_error
38 except ImportError: # Python 2
39 import urllib2
as compat_urllib_error
42 import urllib
.parse
as compat_urllib_parse
43 except ImportError: # Python 2
44 import urllib
as compat_urllib_parse
47 from urllib
.parse
import urlparse
as compat_urllib_parse_urlparse
48 except ImportError: # Python 2
49 from urlparse
import urlparse
as compat_urllib_parse_urlparse
52 import urllib
.parse
as compat_urlparse
53 except ImportError: # Python 2
54 import urlparse
as compat_urlparse
57 import http
.cookiejar
as compat_cookiejar
58 except ImportError: # Python 2
59 import cookielib
as compat_cookiejar
62 import html
.entities
as compat_html_entities
63 except ImportError: # Python 2
64 import htmlentitydefs
as compat_html_entities
67 import html
.parser
as compat_html_parser
68 except ImportError: # Python 2
69 import HTMLParser
as compat_html_parser
72 import http
.client
as compat_http_client
73 except ImportError: # Python 2
74 import httplib
as compat_http_client
77 from urllib
.error
import HTTPError
as compat_HTTPError
78 except ImportError: # Python 2
79 from urllib2
import HTTPError
as compat_HTTPError
82 from urllib
.request
import urlretrieve
as compat_urlretrieve
83 except ImportError: # Python 2
84 from urllib
import urlretrieve
as compat_urlretrieve
88 from subprocess
import DEVNULL
89 compat_subprocess_get_DEVNULL
= lambda: DEVNULL
91 compat_subprocess_get_DEVNULL
= lambda: open(os
.path
.devnull
, 'w')
94 from urllib
.parse
import parse_qs
as compat_parse_qs
95 except ImportError: # Python 2
96 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
97 # Python 2's version is apparently totally broken
98 def _unquote(string
, encoding
='utf-8', errors
='replace'):
101 res
= string
.split('%')
108 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
115 pct_sequence
+= item
[:2].decode('hex')
118 # This segment was just a single percent-encoded character.
119 # May be part of a sequence of code units, so delay decoding.
120 # (Stored in pct_sequence).
124 # Encountered non-percent-encoded characters. Flush the current
126 string
+= pct_sequence
.decode(encoding
, errors
) + rest
129 # Flush the final pct_sequence
130 string
+= pct_sequence
.decode(encoding
, errors
)
133 def _parse_qsl(qs
, keep_blank_values
=False, strict_parsing
=False,
134 encoding
='utf-8', errors
='replace'):
135 qs
, _coerce_result
= qs
, unicode
136 pairs
= [s2
for s1
in qs
.split('&') for s2
in s1
.split(';')]
138 for name_value
in pairs
:
139 if not name_value
and not strict_parsing
:
141 nv
= name_value
.split('=', 1)
144 raise ValueError("bad query field: %r" % (name_value
,))
145 # Handle case of a control-name with no equal sign
146 if keep_blank_values
:
150 if len(nv
[1]) or keep_blank_values
:
151 name
= nv
[0].replace('+', ' ')
152 name
= _unquote(name
, encoding
=encoding
, errors
=errors
)
153 name
= _coerce_result(name
)
154 value
= nv
[1].replace('+', ' ')
155 value
= _unquote(value
, encoding
=encoding
, errors
=errors
)
156 value
= _coerce_result(value
)
157 r
.append((name
, value
))
160 def compat_parse_qs(qs
, keep_blank_values
=False, strict_parsing
=False,
161 encoding
='utf-8', errors
='replace'):
163 pairs
= _parse_qsl(qs
, keep_blank_values
, strict_parsing
,
164 encoding
=encoding
, errors
=errors
)
165 for name
, value
in pairs
:
166 if name
in parsed_result
:
167 parsed_result
[name
].append(value
)
169 parsed_result
[name
] = [value
]
173 compat_str
= unicode # Python 2
178 compat_chr
= unichr # Python 2
183 from xml
.etree
.ElementTree
import ParseError
as compat_xml_parse_error
184 except ImportError: # Python 2.6
185 from xml
.parsers
.expat
import ExpatError
as compat_xml_parse_error
188 if type(c
) is int: return c
191 # This is not clearly defined otherwise
192 compiled_regex_type
= type(re
.compile(''))
195 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
196 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
197 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
198 'Accept-Encoding': 'gzip, deflate',
199 'Accept-Language': 'en-us,en;q=0.5',
202 def preferredencoding():
203 """Get preferred encoding.
205 Returns the best encoding scheme for the system, based on
206 locale.getpreferredencoding() and some further tweaks.
209 pref
= locale
.getpreferredencoding()
216 if sys
.version_info
< (3,0):
218 print(s
.encode(preferredencoding(), 'xmlcharrefreplace'))
221 assert type(s
) == type(u
'')
224 # In Python 2.x, json.dump expects a bytestream.
225 # In Python 3.x, it writes to a character stream
226 if sys
.version_info
< (3,0):
227 def write_json_file(obj
, fn
):
228 with open(fn
, 'wb') as f
:
231 def write_json_file(obj
, fn
):
232 with open(fn
, 'w', encoding
='utf-8') as f
:
235 if sys
.version_info
>= (2,7):
236 def find_xpath_attr(node
, xpath
, key
, val
):
237 """ Find the xpath xpath[@key=val] """
238 assert re
.match(r
'^[a-zA-Z]+$', key
)
239 assert re
.match(r
'^[a-zA-Z0-9@\s:._]*$', val
)
240 expr
= xpath
+ u
"[@%s='%s']" % (key
, val
)
241 return node
.find(expr
)
243 def find_xpath_attr(node
, xpath
, key
, val
):
244 for f
in node
.findall(xpath
):
245 if f
.attrib
.get(key
) == val
:
249 # On python2.6 the xml.etree.ElementTree.Element methods don't support
250 # the namespace parameter
251 def xpath_with_ns(path
, ns_map
):
252 components
= [c
.split(':') for c
in path
.split('/')]
256 replaced
.append(c
[0])
259 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
260 return '/'.join(replaced
)
262 def htmlentity_transform(matchobj
):
263 """Transforms an HTML entity to a character.
265 This function receives a match object and is intended to be used with
266 the re.sub() function.
268 entity
= matchobj
.group(1)
270 # Known non-numeric HTML entity
271 if entity
in compat_html_entities
.name2codepoint
:
272 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
274 mobj
= re
.match(u
'(?u)#(x?\\d+)', entity
)
276 numstr
= mobj
.group(1)
277 if numstr
.startswith(u
'x'):
279 numstr
= u
'0%s' % numstr
282 return compat_chr(int(numstr
, base
))
284 # Unknown entity in name, return its literal representation
285 return (u
'&%s;' % entity
)
287 compat_html_parser
.locatestarttagend
= re
.compile(r
"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re
.VERBOSE
) # backport bugfix
288 class BaseHTMLParser(compat_html_parser
.HTMLParser
):
290 compat_html_parser
.HTMLParser
.__init
__(self
)
293 def loads(self
, html
):
298 class AttrParser(BaseHTMLParser
):
299 """Modified HTMLParser that isolates a tag with the specified attribute"""
300 def __init__(self
, attribute
, value
):
301 self
.attribute
= attribute
306 self
.watch_startpos
= False
308 BaseHTMLParser
.__init
__(self
)
310 def error(self
, message
):
311 if self
.error_count
> 10 or self
.started
:
312 raise compat_html_parser
.HTMLParseError(message
, self
.getpos())
313 self
.rawdata
= '\n'.join(self
.html
.split('\n')[self
.getpos()[0]:]) # skip one line
314 self
.error_count
+= 1
317 def handle_starttag(self
, tag
, attrs
):
320 self
.find_startpos(None)
321 if self
.attribute
in attrs
and attrs
[self
.attribute
] == self
.value
:
324 self
.watch_startpos
= True
326 if not tag
in self
.depth
: self
.depth
[tag
] = 0
329 def handle_endtag(self
, tag
):
331 if tag
in self
.depth
: self
.depth
[tag
] -= 1
332 if self
.depth
[self
.result
[0]] == 0:
334 self
.result
.append(self
.getpos())
336 def find_startpos(self
, x
):
337 """Needed to put the start position of the result (self.result[1])
338 after the opening tag with the requested id"""
339 if self
.watch_startpos
:
340 self
.watch_startpos
= False
341 self
.result
.append(self
.getpos())
342 handle_entityref
= handle_charref
= handle_data
= handle_comment
= \
343 handle_decl
= handle_pi
= unknown_decl
= find_startpos
345 def get_result(self
):
346 if self
.result
is None:
348 if len(self
.result
) != 3:
350 lines
= self
.html
.split('\n')
351 lines
= lines
[self
.result
[1][0]-1:self
.result
[2][0]]
352 lines
[0] = lines
[0][self
.result
[1][1]:]
354 lines
[-1] = lines
[-1][:self
.result
[2][1]-self
.result
[1][1]]
355 lines
[-1] = lines
[-1][:self
.result
[2][1]]
356 return '\n'.join(lines
).strip()
357 # Hack for https://github.com/rg3/youtube-dl/issues/662
358 if sys
.version_info
< (2, 7, 3):
359 AttrParser
.parse_endtag
= (lambda self
, i
:
360 i
+ len("</scr'+'ipt>")
361 if self
.rawdata
[i
:].startswith("</scr'+'ipt>")
362 else compat_html_parser
.HTMLParser
.parse_endtag(self
, i
))
def get_element_by_id(id, html):
    """Return the content of the tag whose ``id`` attribute equals *id*.

    Convenience wrapper: the lookup itself is delegated to
    get_element_by_attribute() with the attribute name fixed to "id", and
    its result is returned unchanged.
    """
    # NOTE: the parameter name shadows the builtin ``id``; it is kept because
    # it is part of the public signature.
    element_content = get_element_by_attribute("id", id, html)
    return element_content
368 def get_element_by_attribute(attribute
, value
, html
):
369 """Return the content of the tag with the specified attribute in the passed HTML document"""
370 parser
= AttrParser(attribute
, value
)
373 except compat_html_parser
.HTMLParseError
:
375 return parser
.get_result()
377 class MetaParser(BaseHTMLParser
):
379 Modified HTMLParser that isolates a meta tag with the specified name
382 def __init__(self
, name
):
383 BaseHTMLParser
.__init
__(self
)
388 def handle_starttag(self
, tag
, attrs
):
392 if attrs
.get('name') == self
.name
:
393 self
.result
= attrs
.get('content')
395 def get_result(self
):
398 def get_meta_content(name
, html
):
400 Return the content attribute from the meta tag with the given name attribute.
402 parser
= MetaParser(name
)
405 except compat_html_parser
.HTMLParseError
:
407 return parser
.get_result()
410 def clean_html(html
):
411 """Clean an HTML snippet into a readable string"""
413 html
= html
.replace('\n', ' ')
414 html
= re
.sub(r
'\s*<\s*br\s*/?\s*>\s*', '\n', html
)
415 html
= re
.sub(r
'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html
)
417 html
= re
.sub('<.*?>', '', html
)
418 # Replace html entities
419 html
= unescapeHTML(html
)
423 def sanitize_open(filename
, open_mode
):
424 """Try to open the given filename, and slightly tweak it if this fails.
426 Attempts to open the given filename. If this fails, it tries to change
427 the filename slightly, step by step, until it's either able to open it
428 or it fails and raises a final exception, like the standard open()
431 It returns the tuple (stream, definitive_file_name).
435 if sys
.platform
== 'win32':
437 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
438 return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
)
439 stream
= open(encodeFilename(filename
), open_mode
)
440 return (stream
, filename
)
441 except (IOError, OSError) as err
:
442 if err
.errno
in (errno
.EACCES
,):
445 # In case of error, try to remove win32 forbidden chars
446 alt_filename
= os
.path
.join(
447 re
.sub(u
'[/<>:"\\|\\\\?\\*]', u
'#', path_part
)
448 for path_part
in os
.path
.split(filename
)
450 if alt_filename
== filename
:
453 # An exception here should be caught in the caller
454 stream
= open(encodeFilename(filename
), open_mode
)
455 return (stream
, alt_filename
)
458 def timeconvert(timestr
):
459 """Convert RFC 2822 defined time string into system timestamp"""
461 timetuple
= email
.utils
.parsedate_tz(timestr
)
462 if timetuple
is not None:
463 timestamp
= email
.utils
.mktime_tz(timetuple
)
466 def sanitize_filename(s
, restricted
=False, is_id
=False):
467 """Sanitizes a string so it could be used as part of a filename.
468 If restricted is set, use a stricter subset of allowed characters.
469 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
471 def replace_insane(char
):
472 if char
== '?' or ord(char
) < 32 or ord(char
) == 127:
475 return '' if restricted
else '\''
477 return '_-' if restricted
else ' -'
478 elif char
in '\\/|*<>':
480 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace()):
482 if restricted
and ord(char
) > 127:
486 result
= u
''.join(map(replace_insane
, s
))
488 while '__' in result
:
489 result
= result
.replace('__', '_')
490 result
= result
.strip('_')
491 # Common case of "Foreign band name - English song title"
492 if restricted
and result
.startswith('-_'):
498 def orderedSet(iterable
):
499 """ Remove all duplicates from the input iterable """
510 assert type(s
) == compat_str
512 result
= re
.sub(r
'(?u)&(.+?);', htmlentity_transform
, s
)
516 def encodeFilename(s
, for_subprocess
=False):
518 @param s The name of the file
521 assert type(s
) == compat_str
523 # Python 3 has a Unicode API
524 if sys
.version_info
>= (3, 0):
527 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
528 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
529 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
530 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
531 if not for_subprocess
:
534 # For subprocess calls, encode with locale encoding
535 # Refer to http://stackoverflow.com/a/9951851/35070
536 encoding
= preferredencoding()
538 encoding
= sys
.getfilesystemencoding()
541 return s
.encode(encoding
, 'ignore')
def encodeArgument(s):
    """Encode a single command-line argument for a subprocess call.

    Text strings (compat_str) are passed through as they are; anything else
    is treated as a legacy byte string and decoded as ASCII first. The
    resulting text is handed to encodeFilename() with for_subprocess=True.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
553 def decodeOption(optval
):
556 if isinstance(optval
, bytes):
557 optval
= optval
.decode(preferredencoding())
559 assert isinstance(optval
, compat_str
)
562 def formatSeconds(secs
):
564 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
566 return '%d:%02d' % (secs
// 60, secs
% 60)
571 def make_HTTPS_handler(opts_no_check_certificate
, **kwargs
):
572 if sys
.version_info
< (3, 2):
575 class HTTPSConnectionV3(httplib
.HTTPSConnection
):
576 def __init__(self
, *args
, **kwargs
):
577 httplib
.HTTPSConnection
.__init
__(self
, *args
, **kwargs
)
580 sock
= socket
.create_connection((self
.host
, self
.port
), self
.timeout
)
581 if getattr(self
, '_tunnel_host', False):
585 self
.sock
= ssl
.wrap_socket(sock
, self
.key_file
, self
.cert_file
, ssl_version
=ssl
.PROTOCOL_SSLv3
)
587 self
.sock
= ssl
.wrap_socket(sock
, self
.key_file
, self
.cert_file
, ssl_version
=ssl
.PROTOCOL_SSLv23
)
589 class HTTPSHandlerV3(compat_urllib_request
.HTTPSHandler
):
590 def https_open(self
, req
):
591 return self
.do_open(HTTPSConnectionV3
, req
)
592 return HTTPSHandlerV3(**kwargs
)
594 context
= ssl
.SSLContext(ssl
.PROTOCOL_SSLv3
)
595 context
.verify_mode
= (ssl
.CERT_NONE
596 if opts_no_check_certificate
597 else ssl
.CERT_REQUIRED
)
598 context
.set_default_verify_paths()
600 context
.load_default_certs()
601 except AttributeError:
603 return compat_urllib_request
.HTTPSHandler(context
=context
, **kwargs
)
605 class ExtractorError(Exception):
606 """Error during info extraction."""
607 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
608 """ tb, if given, is the original traceback (so that it can be printed out).
609 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
612 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
614 if video_id
is not None:
615 msg
= video_id
+ ': ' + msg
617 msg
= msg
+ u
'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
618 super(ExtractorError
, self
).__init
__(msg
)
621 self
.exc_info
= sys
.exc_info() # preserve original exception
623 self
.video_id
= video_id
625 def format_traceback(self
):
626 if self
.traceback
is None:
628 return u
''.join(traceback
.format_tb(self
.traceback
))
631 class RegexNotFoundError(ExtractorError
):
632 """Error when a regex didn't match"""
class DownloadError(Exception):
    """Error raised when a download fails.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. The message is stored the usual
    Exception way; the originating exception details, when supplied, are
    kept in the ``exc_info`` attribute.
    """

    def __init__(self, msg, exc_info=None):
        """Create the error.

        msg      -- human-readable error message.
        exc_info -- optional sys.exc_info() tuple of the exception that
                    caused the trouble; stored as-is (None when omitted).
        """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
649 class SameFileError(Exception):
650 """Same File exception.
652 This exception will be thrown by FileDownloader objects if they detect
653 multiple files would have to be downloaded to the same file on disk.
658 class PostProcessingError(Exception):
659 """Post Processing exception.
661 This exception may be raised by PostProcessor's .run() method to
662 indicate an error in the postprocessing task.
664 def __init__(self
, msg
):
667 class MaxDownloadsReached(Exception):
668 """ --max-downloads limit has been reached. """
672 class UnavailableVideoError(Exception):
673 """Unavailable Format exception.
675 This exception will be thrown when a video is requested
676 in a format that is not available for that video.
681 class ContentTooShortError(Exception):
682 """Content Too Short exception.
684 This exception may be raised by FileDownloader objects when a file they
685 download is too small for what the server announced first, indicating
686 the connection was probably interrupted.
692 def __init__(self
, downloaded
, expected
):
693 self
.downloaded
= downloaded
694 self
.expected
= expected
696 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
697 """Handler for HTTP requests and responses.
699 This class, when installed with an OpenerDirector, automatically adds
700 the standard headers to every HTTP request and handles gzipped and
701 deflated responses from web servers. If compression is to be avoided in
702 a particular request, the original request in the program code only has
703 to include the HTTP header "Youtubedl-No-Compression", which will be
704 removed before making the real request.
706 Part of this code was copied from:
708 http://techknack.net/python-urllib2-handlers/
710 Andrew Rowls, the author of that code, agreed to release it to the
717 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
719 return zlib
.decompress(data
)
722 def addinfourl_wrapper(stream
, headers
, url
, code
):
723 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
724 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
725 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
729 def http_request(self
, req
):
730 for h
,v
in std_headers
.items():
734 if 'Youtubedl-no-compression' in req
.headers
:
735 if 'Accept-encoding' in req
.headers
:
736 del req
.headers
['Accept-encoding']
737 del req
.headers
['Youtubedl-no-compression']
738 if 'Youtubedl-user-agent' in req
.headers
:
739 if 'User-agent' in req
.headers
:
740 del req
.headers
['User-agent']
741 req
.headers
['User-agent'] = req
.headers
['Youtubedl-user-agent']
742 del req
.headers
['Youtubedl-user-agent']
745 def http_response(self
, req
, resp
):
748 if resp
.headers
.get('Content-encoding', '') == 'gzip':
749 content
= resp
.read()
750 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
752 uncompressed
= io
.BytesIO(gz
.read())
753 except IOError as original_ioerror
:
754 # There may be junk at the end of the file
755 # See http://stackoverflow.com/q/4928560/35070 for details
756 for i
in range(1, 1024):
758 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
759 uncompressed
= io
.BytesIO(gz
.read())
764 raise original_ioerror
765 resp
= self
.addinfourl_wrapper(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
766 resp
.msg
= old_resp
.msg
768 if resp
.headers
.get('Content-encoding', '') == 'deflate':
769 gz
= io
.BytesIO(self
.deflate(resp
.read()))
770 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
771 resp
.msg
= old_resp
.msg
774 https_request
= http_request
775 https_response
= http_response
778 def parse_iso8601(date_str
, delimiter
='T'):
779 """ Return a UNIX timestamp from the given date """
785 r
'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
788 timezone
= datetime
.timedelta()
790 date_str
= date_str
[:-len(m
.group(0))]
791 if not m
.group('sign'):
792 timezone
= datetime
.timedelta()
794 sign
= 1 if m
.group('sign') == '+' else -1
795 timezone
= datetime
.timedelta(
796 hours
=sign
* int(m
.group('hours')),
797 minutes
=sign
* int(m
.group('minutes')))
798 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
799 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
800 return calendar
.timegm(dt
.timetuple())
803 def unified_strdate(date_str
):
804 """Return a string with the date in the format YYYYMMDD"""
811 date_str
= date_str
.replace(',', ' ')
812 # %z (UTC offset) is only supported in python>=3.2
813 date_str
= re
.sub(r
' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str
)
814 format_expressions
= [
819 '%b %dst %Y %I:%M%p',
820 '%b %dnd %Y %I:%M%p',
821 '%b %dth %Y %I:%M%p',
829 '%Y-%m-%dT%H:%M:%SZ',
830 '%Y-%m-%dT%H:%M:%S.%fZ',
831 '%Y-%m-%dT%H:%M:%S.%f0Z',
833 '%Y-%m-%dT%H:%M:%S.%f',
836 for expression
in format_expressions
:
838 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
841 if upload_date
is None:
842 timetuple
= email
.utils
.parsedate_tz(date_str
)
844 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
847 def determine_ext(url
, default_ext
=u
'unknown_video'):
848 guess
= url
.partition(u
'?')[0].rpartition(u
'.')[2]
849 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
def subtitles_filename(filename, sub_lang, sub_format):
    """Return the subtitle file name that belongs to the media *filename*.

    The extension of *filename* (if it has one) is replaced by
    ``.<sub_lang>.<sub_format>``; a name without extension simply gets that
    suffix appended.

    filename   -- path of the media file.
    sub_lang   -- subtitle language code, e.g. 'en'.
    sub_format -- subtitle format / extension, e.g. 'srt'.
    """
    # Local import keeps this function self-contained.
    import os.path

    # splitext() only looks at the last path component, so a dot inside a
    # directory name ('dir.v1/video') is no longer mistaken for the start of
    # the extension, which a plain rsplit('.', 1) would do.
    root, _ext = os.path.splitext(filename)
    return root + u'.' + sub_lang + u'.' + sub_format
857 def date_from_str(date_str
):
859 Return a datetime object from a string in the format YYYYMMDD or
860 (now|today)[+-][0-9](day|week|month|year)(s)?"""
861 today
= datetime
.date
.today()
862 if date_str
== 'now'or date_str
== 'today':
864 match
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
865 if match
is not None:
866 sign
= match
.group('sign')
867 time
= int(match
.group('time'))
870 unit
= match
.group('unit')
879 delta
= datetime
.timedelta(**{unit
: time
})
881 return datetime
.datetime
.strptime(date_str
, "%Y%m%d").date()
883 def hyphenate_date(date_str
):
885 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
886 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
887 if match
is not None:
888 return '-'.join(match
.groups())
892 class DateRange(object):
893 """Represents a time interval between two dates"""
894 def __init__(self
, start
=None, end
=None):
895 """start and end must be strings in the format accepted by date"""
896 if start
is not None:
897 self
.start
= date_from_str(start
)
899 self
.start
= datetime
.datetime
.min.date()
901 self
.end
= date_from_str(end
)
903 self
.end
= datetime
.datetime
.max.date()
904 if self
.start
> self
.end
:
905 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
908 """Returns a range that only contains the given day"""
910 def __contains__(self
, date
):
911 """Check if the date is in the range"""
912 if not isinstance(date
, datetime
.date
):
913 date
= date_from_str(date
)
914 return self
.start
<= date
<= self
.end
916 return '%s - %s' % ( self
.start
.isoformat(), self
.end
.isoformat())
920 """ Returns the platform name as a compat_str """
921 res
= platform
.platform()
922 if isinstance(res
, bytes):
923 res
= res
.decode(preferredencoding())
925 assert isinstance(res
, compat_str
)
929 def _windows_write_string(s
, out
):
930 """ Returns True if the string was written using special methods,
931 False if it has yet to be written out."""
932 # Adapted from http://stackoverflow.com/a/3259271/35070
935 import ctypes
.wintypes
943 fileno
= out
.fileno()
944 except AttributeError:
945 # If the output stream doesn't have a fileno, it's virtual
947 if fileno
not in WIN_OUTPUT_IDS
:
950 GetStdHandle
= ctypes
.WINFUNCTYPE(
951 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
952 ("GetStdHandle", ctypes
.windll
.kernel32
))
953 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
955 WriteConsoleW
= ctypes
.WINFUNCTYPE(
956 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
957 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
958 ctypes
.wintypes
.LPVOID
)(("WriteConsoleW", ctypes
.windll
.kernel32
))
959 written
= ctypes
.wintypes
.DWORD(0)
961 GetFileType
= ctypes
.WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(("GetFileType", ctypes
.windll
.kernel32
))
962 FILE_TYPE_CHAR
= 0x0002
963 FILE_TYPE_REMOTE
= 0x8000
964 GetConsoleMode
= ctypes
.WINFUNCTYPE(
965 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
966 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
967 ("GetConsoleMode", ctypes
.windll
.kernel32
))
968 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
970 def not_a_console(handle
):
971 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
973 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
974 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
979 def next_nonbmp_pos(s
):
981 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
982 except StopIteration:
986 count
= min(next_nonbmp_pos(s
), 1024)
989 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
991 raise OSError('Failed to write string')
992 if not count
: # We just wrote a non-BMP character
993 assert written
.value
== 2
996 assert written
.value
> 0
997 s
= s
[written
.value
:]
1001 def write_string(s
, out
=None, encoding
=None):
1004 assert type(s
) == compat_str
1006 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
1007 if _windows_write_string(s
, out
):
1010 if ('b' in getattr(out
, 'mode', '') or
1011 sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
1012 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
1014 elif hasattr(out
, 'buffer'):
1015 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
1016 byt
= s
.encode(enc
, 'ignore')
1017 out
.buffer.write(byt
)
1023 def bytes_to_intlist(bs
):
1026 if isinstance(bs
[0], int): # Python 3
1029 return [ord(c
) for c
in bs
]
1032 def intlist_to_bytes(xs
):
1035 if isinstance(chr(0), bytes): # Python 2
1036 return ''.join([chr(x
) for x
in xs
])
def get_cachedir(params=None):
    """Return the directory used for the youtube-dl cache.

    params -- optional mapping of options. When it contains a 'cachedir'
              key, that value is returned unchanged (even if it is None).

    Otherwise the result is ``<cache root>/youtube-dl`` where the cache root
    is $XDG_CACHE_HOME, falling back to ``~/.cache`` when the variable is
    unset or empty.
    """
    # Avoid a shared mutable default argument.
    if params is None:
        params = {}
    # An empty XDG_CACHE_HOME must be treated like an unset one; otherwise
    # the cache would end up in a path relative to the current directory.
    cache_root = (os.environ.get('XDG_CACHE_HOME')
                  or os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1047 # Cross-platform file locking
1048 if sys
.platform
== 'win32':
1049 import ctypes
.wintypes
1052 class OVERLAPPED(ctypes
.Structure
):
1054 ('Internal', ctypes
.wintypes
.LPVOID
),
1055 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
1056 ('Offset', ctypes
.wintypes
.DWORD
),
1057 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
1058 ('hEvent', ctypes
.wintypes
.HANDLE
),
1061 kernel32
= ctypes
.windll
.kernel32
1062 LockFileEx
= kernel32
.LockFileEx
1063 LockFileEx
.argtypes
= [
1064 ctypes
.wintypes
.HANDLE
, # hFile
1065 ctypes
.wintypes
.DWORD
, # dwFlags
1066 ctypes
.wintypes
.DWORD
, # dwReserved
1067 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1068 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1069 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1071 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
1072 UnlockFileEx
= kernel32
.UnlockFileEx
1073 UnlockFileEx
.argtypes
= [
1074 ctypes
.wintypes
.HANDLE
, # hFile
1075 ctypes
.wintypes
.DWORD
, # dwReserved
1076 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1077 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1078 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1080 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
1081 whole_low
= 0xffffffff
1082 whole_high
= 0x7fffffff
1084 def _lock_file(f
, exclusive
):
1085 overlapped
= OVERLAPPED()
1086 overlapped
.Offset
= 0
1087 overlapped
.OffsetHigh
= 0
1088 overlapped
.hEvent
= 0
1089 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
1090 handle
= msvcrt
.get_osfhandle(f
.fileno())
1091 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
1092 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1093 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
1095 def _unlock_file(f
):
1096 assert f
._lock
_file
_overlapped
_p
1097 handle
= msvcrt
.get_osfhandle(f
.fileno())
1098 if not UnlockFileEx(handle
, 0,
1099 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1100 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
1105 def _lock_file(f
, exclusive
):
1106 fcntl
.lockf(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
1108 def _unlock_file(f
):
1109 fcntl
.lockf(f
, fcntl
.LOCK_UN
)
1112 class locked_file(object):
1113 def __init__(self
, filename
, mode
, encoding
=None):
1114 assert mode
in ['r', 'a', 'w']
1115 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
1118 def __enter__(self
):
1119 exclusive
= self
.mode
!= 'r'
1121 _lock_file(self
.f
, exclusive
)
1127 def __exit__(self
, etype
, value
, traceback
):
1129 _unlock_file(self
.f
)
1136 def write(self
, *args
):
1137 return self
.f
.write(*args
)
1139 def read(self
, *args
):
1140 return self
.f
.read(*args
)
1143 def shell_quote(args
):
1145 encoding
= sys
.getfilesystemencoding()
1146 if encoding
is None:
1149 if isinstance(a
, bytes):
1150 # We may get a filename encoded with 'encodeFilename'
1151 a
= a
.decode(encoding
)
1152 quoted_args
.append(pipes
.quote(a
))
1153 return u
' '.join(quoted_args
)
1156 def takewhile_inclusive(pred
, seq
):
1157 """ Like itertools.takewhile, but include the latest evaluated element
1158 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Attach internal-use *data* to *url*.

    *data* is serialized with json.dumps(), URL-encoded under the key
    '__youtubedl_smuggle', and appended to *url* after a '#'.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1173 def unsmuggle_url(smug_url
, default
=None):
1174 if not '#__youtubedl_smuggle' in smug_url
:
1175 return smug_url
, default
1176 url
, _
, sdata
= smug_url
.rpartition(u
'#')
1177 jsond
= compat_parse_qs(sdata
)[u
'__youtubedl_smuggle'][0]
1178 data
= json
.loads(jsond
)
1182 def format_bytes(bytes):
1185 if type(bytes) is str:
1186 bytes = float(bytes)
1190 exponent
= int(math
.log(bytes, 1024.0))
1191 suffix
= [u
'B', u
'KiB', u
'MiB', u
'GiB', u
'TiB', u
'PiB', u
'EiB', u
'ZiB', u
'YiB'][exponent
]
1192 converted
= float(bytes) / float(1024 ** exponent
)
1193 return u
'%.2f%s' % (converted
, suffix
)
1196 def str_to_int(int_str
):
1197 int_str
= re
.sub(r
'[,\.]', u
'', int_str
)
1201 def get_term_width():
1202 columns
= os
.environ
.get('COLUMNS', None)
1207 sp
= subprocess
.Popen(
1209 stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
)
1210 out
, err
= sp
.communicate()
1211 return int(out
.split()[1])
1217 def month_by_name(name
):
1218 """ Return the number of a month by (locale-independently) English name """
1221 u
'January', u
'February', u
'March', u
'April', u
'May', u
'June',
1222 u
'July', u
'August', u
'September', u
'October', u
'November', u
'December']
1224 return ENGLISH_NAMES
.index(name
) + 1
1229 def fix_xml_ampersands(xml_str
):
1230 """Replace all the '&' by '&amp;' in XML"""
1232 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1237 def setproctitle(title
):
1238 assert isinstance(title
, compat_str
)
1240 libc
= ctypes
.cdll
.LoadLibrary("libc.so.6")
1243 title_bytes
= title
.encode('utf-8')
1244 buf
= ctypes
.create_string_buffer(len(title_bytes
))
1245 buf
.value
= title_bytes
1247 libc
.prctl(15, buf
, 0, 0, 0)
1248 except AttributeError:
1249 return # Strange libc, just skip this
1252 def remove_start(s
, start
):
1253 if s
.startswith(start
):
1254 return s
[len(start
):]
def url_basename(url):
    """Return the last segment of the path component of *url*.

    Only the path as reported by urlparse() is considered; leading and
    trailing slashes are stripped before the path is split on '/'.
    """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
1263 class HEADRequest(compat_urllib_request
.Request
):
1264 def get_method(self
):
1268 def int_or_none(v
, scale
=1, default
=None, get_attr
=None):
1271 v
= getattr(v
, get_attr
, None)
1272 return default
if v
is None else (int(v
) // scale
)
def float_or_none(v, scale=1, default=None):
    """Convert *v* to a float divided by *scale*, passing None through.

    v       -- value accepted by float(), or None.
    scale   -- divisor applied to the converted value.
    default -- returned instead when *v* is None.
    """
    if v is None:
        return default
    return float(v) / scale
1279 def parse_duration(s
):
1284 r
'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s
)
1287 res
= int(m
.group('secs'))
1289 res
+= int(m
.group('mins')) * 60
1290 if m
.group('hours'):
1291 res
+= int(m
.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert *ext* in front of the existing extension of *filename*.

    The name is split with os.path.splitext(); *ext* (given without a
    leading dot) is placed between the stem and the original extension.
    """
    parts = os.path.splitext(filename)
    stem = parts[0]
    original_ext = parts[1]
    return u'{0}.{1}{2}'.format(stem, ext, original_ext)
1300 def check_executable(exe
, args
=[]):
1301 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1302 args can be a list of arguments for a short output (like -version) """
1304 subprocess
.Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate()
1310 class PagedList(object):
1311 def __init__(self
, pagefunc
, pagesize
):
1312 self
._pagefunc
= pagefunc
1313 self
._pagesize
= pagesize
1316 # This is only useful for tests
1317 return len(self
.getslice())
1319 def getslice(self
, start
=0, end
=None):
1321 for pagenum
in itertools
.count(start
// self
._pagesize
):
1322 firstid
= pagenum
* self
._pagesize
1323 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
1324 if start
>= nextfirstid
:
1327 page_results
= list(self
._pagefunc
(pagenum
))
1330 start
% self
._pagesize
1331 if firstid
<= start
< nextfirstid
1335 ((end
- 1) % self
._pagesize
) + 1
1336 if (end
is not None and firstid
<= end
<= nextfirstid
)
1339 if startv
!= 0 or endv
is not None:
1340 page_results
= page_results
[startv
:endv
]
1341 res
.extend(page_results
)
1343 # A little optimization - if current page is not "full", ie. does
1344 # not contain page_size videos then we can assume that this page
1345 # is the last one - there are no more ids on further pages -
1346 # i.e. no need to query again.
1347 if len(page_results
) + startv
< self
._pagesize
:
1350 # If we got the whole page, but the next page is not interesting,
1351 # break out early as well
1352 if end
== nextfirstid
:
1357 def uppercase_escape(s
):
1358 unicode_escape
= codecs
.getdecoder('unicode_escape')
1360 r
'\\U[0-9a-fA-F]{8}',
1361 lambda m
: unicode_escape(m
.group(0))[0],
1365 struct
.pack(u
'!I', 0)
1367 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
    """ Call struct.pack, converting a compat_str format spec to ASCII
    bytes first; any other spec is handed over untouched. """
    fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
    return struct.pack(fmt, *args)
def struct_unpack(spec, *args):
    """ Call struct.unpack, converting a compat_str format spec to ASCII
    bytes first; any other spec is handed over untouched. """
    fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
    return struct.unpack(fmt, *args)
1378 struct_pack
= struct
.pack
1379 struct_unpack
= struct
.unpack
1382 def read_batch_urls(batch_fd
):
1384 if not isinstance(url
, compat_str
):
1385 url
= url
.decode('utf-8', 'replace')
1386 BOM_UTF8
= u
'\xef\xbb\xbf'
1387 if url
.startswith(BOM_UTF8
):
1388 url
= url
[len(BOM_UTF8
):]
1390 if url
.startswith(('#', ';', ']')):
1394 with contextlib
.closing(batch_fd
) as fd
:
1395 return [url
for url
in map(fixup
, fd
) if url
]
def urlencode_postdata(*args, **kargs):
    """ URL-encode the given arguments with compat_urllib_parse.urlencode
    and return the result as ASCII-encoded bytes.

    All positional and keyword arguments are forwarded unchanged. """
    query_string = compat_urllib_parse.urlencode(*args, **kargs)
    return query_string.encode('ascii')
1403 class TreeBuilder(xml
.etree
.ElementTree
.TreeBuilder
):
1404 def doctype(self
, name
, pubid
, system
):
1405 pass # Ignore doctypes
1407 parser
= xml
.etree
.ElementTree
.XMLParser(target
=TreeBuilder())
1408 kwargs
= {'parser': parser
} if sys
.version_info
>= (2, 7) else {}
1409 return xml
.etree
.ElementTree
.XML(s
.encode('utf-8'), **kwargs
)
1412 if sys
.version_info
< (3, 0) and sys
.platform
== 'win32':
def compat_getpass(prompt, *args, **kwargs):
    """ Call getpass.getpass, first encoding a compat_str prompt with
    preferredencoding(); any other prompt is passed through as given.

    Remaining positional and keyword arguments are forwarded unchanged. """
    if isinstance(prompt, compat_str):
        encoded_prompt = prompt.encode(preferredencoding())
    else:
        encoded_prompt = prompt
    return getpass.getpass(encoded_prompt, *args, **kwargs)
1418 compat_getpass
= getpass
.getpass
def strip_jsonp(code):
    """ Strip a JSONP wrapper and return the payload between the parentheses.

    code is expected to look like ``callback(<payload>);`` where the
    trailing semicolon and surrounding whitespace are optional. The callback
    name may contain letters, digits, '_', '.' and '$', so namespaced
    callbacks such as ``jQuery.cb`` or ``$cb`` are recognised as well.
    Input that does not match this shape is returned unchanged. """
    return re.sub(
        # (?s) lets the payload span several lines; the greedy group runs up
        # to the last ')' that is followed only by ';' and whitespace.
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*$', r'\1', code)
1434 def qualities(quality_ids
):
1435 """ Get a numeric quality value out of a list of possible values """
1438 return quality_ids
.index(qid
)
1444 DEFAULT_OUTTMPL
= '%(title)s-%(id)s.%(ext)s'
1447 subprocess_check_output
= subprocess
.check_output
1448 except AttributeError:
1449 def subprocess_check_output(*args
, **kwargs
):
1450 assert 'input' not in kwargs
1451 p
= subprocess
.Popen(*args
, stdout
=subprocess
.PIPE
, **kwargs
)
1452 output
, _
= p
.communicate()
1455 raise subprocess
.CalledProcessError(ret
, p
.args
, output
=output
)