2 # -*- coding: utf-8 -*-
28 import xml
.etree
.ElementTree
32 import urllib
.request
as compat_urllib_request
33 except ImportError: # Python 2
34 import urllib2
as compat_urllib_request
37 import urllib
.error
as compat_urllib_error
38 except ImportError: # Python 2
39 import urllib2
as compat_urllib_error
42 import urllib
.parse
as compat_urllib_parse
43 except ImportError: # Python 2
44 import urllib
as compat_urllib_parse
47 from urllib
.parse
import urlparse
as compat_urllib_parse_urlparse
48 except ImportError: # Python 2
49 from urlparse
import urlparse
as compat_urllib_parse_urlparse
52 import urllib
.parse
as compat_urlparse
53 except ImportError: # Python 2
54 import urlparse
as compat_urlparse
57 import http
.cookiejar
as compat_cookiejar
58 except ImportError: # Python 2
59 import cookielib
as compat_cookiejar
62 import html
.entities
as compat_html_entities
63 except ImportError: # Python 2
64 import htmlentitydefs
as compat_html_entities
67 import html
.parser
as compat_html_parser
68 except ImportError: # Python 2
69 import HTMLParser
as compat_html_parser
72 import http
.client
as compat_http_client
73 except ImportError: # Python 2
74 import httplib
as compat_http_client
77 from urllib
.error
import HTTPError
as compat_HTTPError
78 except ImportError: # Python 2
79 from urllib2
import HTTPError
as compat_HTTPError
82 from urllib
.request
import urlretrieve
as compat_urlretrieve
83 except ImportError: # Python 2
84 from urllib
import urlretrieve
as compat_urlretrieve
88 from subprocess
import DEVNULL
89 compat_subprocess_get_DEVNULL
= lambda: DEVNULL
91 compat_subprocess_get_DEVNULL
= lambda: open(os
.path
.devnull
, 'w')
94 from urllib
.parse
import unquote
as compat_urllib_parse_unquote
96 def compat_urllib_parse_unquote(string
, encoding
='utf-8', errors
='replace'):
99 res
= string
.split('%')
106 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
113 pct_sequence
+= item
[:2].decode('hex')
116 # This segment was just a single percent-encoded character.
117 # May be part of a sequence of code units, so delay decoding.
118 # (Stored in pct_sequence).
122 # Encountered non-percent-encoded characters. Flush the current
124 string
+= pct_sequence
.decode(encoding
, errors
) + rest
127 # Flush the final pct_sequence
128 string
+= pct_sequence
.decode(encoding
, errors
)
133 from urllib
.parse
import parse_qs
as compat_parse_qs
134 except ImportError: # Python 2
135 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
136 # Python 2's version is apparently totally broken
138 def _parse_qsl(qs
, keep_blank_values
=False, strict_parsing
=False,
139 encoding
='utf-8', errors
='replace'):
140 qs
, _coerce_result
= qs
, unicode
141 pairs
= [s2
for s1
in qs
.split('&') for s2
in s1
.split(';')]
143 for name_value
in pairs
:
144 if not name_value
and not strict_parsing
:
146 nv
= name_value
.split('=', 1)
149 raise ValueError("bad query field: %r" % (name_value
,))
150 # Handle case of a control-name with no equal sign
151 if keep_blank_values
:
155 if len(nv
[1]) or keep_blank_values
:
156 name
= nv
[0].replace('+', ' ')
157 name
= compat_urllib_parse_unquote(
158 name
, encoding
=encoding
, errors
=errors
)
159 name
= _coerce_result(name
)
160 value
= nv
[1].replace('+', ' ')
161 value
= compat_urllib_parse_unquote(
162 value
, encoding
=encoding
, errors
=errors
)
163 value
= _coerce_result(value
)
164 r
.append((name
, value
))
167 def compat_parse_qs(qs
, keep_blank_values
=False, strict_parsing
=False,
168 encoding
='utf-8', errors
='replace'):
170 pairs
= _parse_qsl(qs
, keep_blank_values
, strict_parsing
,
171 encoding
=encoding
, errors
=errors
)
172 for name
, value
in pairs
:
173 if name
in parsed_result
:
174 parsed_result
[name
].append(value
)
176 parsed_result
[name
] = [value
]
180 compat_str
= unicode # Python 2
185 compat_chr
= unichr # Python 2
190 from xml
.etree
.ElementTree
import ParseError
as compat_xml_parse_error
191 except ImportError: # Python 2.6
192 from xml
.parsers
.expat
import ExpatError
as compat_xml_parse_error
195 if type(c
) is int: return c
198 # This is not clearly defined otherwise
199 compiled_regex_type
= type(re
.compile(''))
202 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
203 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
204 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
205 'Accept-Encoding': 'gzip, deflate',
206 'Accept-Language': 'en-us,en;q=0.5',
209 def preferredencoding():
210 """Get preferred encoding.
212 Returns the best encoding scheme for the system, based on
213 locale.getpreferredencoding() and some further tweaks.
216 pref
= locale
.getpreferredencoding()
223 if sys
.version_info
< (3,0):
225 print(s
.encode(preferredencoding(), 'xmlcharrefreplace'))
228 assert type(s
) == type(u
'')
231 # In Python 2.x, json.dump expects a bytestream.
232 # In Python 3.x, it writes to a character stream
233 if sys
.version_info
< (3,0):
234 def write_json_file(obj
, fn
):
235 with open(fn
, 'wb') as f
:
238 def write_json_file(obj
, fn
):
239 with open(fn
, 'w', encoding
='utf-8') as f
:
242 if sys
.version_info
>= (2,7):
def find_xpath_attr(node, xpath, key, val):
    """ Find the xpath xpath[@key=val] """
    # Only plain attribute names and simple values are supported here
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    predicate = u"[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
250 def find_xpath_attr(node
, xpath
, key
, val
):
251 for f
in node
.findall(xpath
):
252 if f
.attrib
.get(key
) == val
:
256 # On python2.6 the xml.etree.ElementTree.Element methods don't support
257 # the namespace parameter
258 def xpath_with_ns(path
, ns_map
):
259 components
= [c
.split(':') for c
in path
.split('/')]
263 replaced
.append(c
[0])
266 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
267 return '/'.join(replaced
)
269 def htmlentity_transform(matchobj
):
270 """Transforms an HTML entity to a character.
272 This function receives a match object and is intended to be used with
273 the re.sub() function.
275 entity
= matchobj
.group(1)
277 # Known non-numeric HTML entity
278 if entity
in compat_html_entities
.name2codepoint
:
279 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
281 mobj
= re
.match(u
'(?u)#(x?\\d+)', entity
)
283 numstr
= mobj
.group(1)
284 if numstr
.startswith(u
'x'):
286 numstr
= u
'0%s' % numstr
289 return compat_chr(int(numstr
, base
))
291 # Unknown entity in name, return its literal representation
292 return (u
'&%s;' % entity
)
294 compat_html_parser
.locatestarttagend
= re
.compile(r
"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re
.VERBOSE
) # backport bugfix
295 class BaseHTMLParser(compat_html_parser
.HTMLParser
):
297 compat_html_parser
.HTMLParser
.__init
__(self
)
300 def loads(self
, html
):
305 class AttrParser(BaseHTMLParser
):
306 """Modified HTMLParser that isolates a tag with the specified attribute"""
307 def __init__(self
, attribute
, value
):
308 self
.attribute
= attribute
313 self
.watch_startpos
= False
315 BaseHTMLParser
.__init
__(self
)
317 def error(self
, message
):
318 if self
.error_count
> 10 or self
.started
:
319 raise compat_html_parser
.HTMLParseError(message
, self
.getpos())
320 self
.rawdata
= '\n'.join(self
.html
.split('\n')[self
.getpos()[0]:]) # skip one line
321 self
.error_count
+= 1
324 def handle_starttag(self
, tag
, attrs
):
327 self
.find_startpos(None)
328 if self
.attribute
in attrs
and attrs
[self
.attribute
] == self
.value
:
331 self
.watch_startpos
= True
333 if not tag
in self
.depth
: self
.depth
[tag
] = 0
336 def handle_endtag(self
, tag
):
338 if tag
in self
.depth
: self
.depth
[tag
] -= 1
339 if self
.depth
[self
.result
[0]] == 0:
341 self
.result
.append(self
.getpos())
343 def find_startpos(self
, x
):
344 """Needed to put the start position of the result (self.result[1])
345 after the opening tag with the requested id"""
346 if self
.watch_startpos
:
347 self
.watch_startpos
= False
348 self
.result
.append(self
.getpos())
349 handle_entityref
= handle_charref
= handle_data
= handle_comment
= \
350 handle_decl
= handle_pi
= unknown_decl
= find_startpos
352 def get_result(self
):
353 if self
.result
is None:
355 if len(self
.result
) != 3:
357 lines
= self
.html
.split('\n')
358 lines
= lines
[self
.result
[1][0]-1:self
.result
[2][0]]
359 lines
[0] = lines
[0][self
.result
[1][1]:]
361 lines
[-1] = lines
[-1][:self
.result
[2][1]-self
.result
[1][1]]
362 lines
[-1] = lines
[-1][:self
.result
[2][1]]
363 return '\n'.join(lines
).strip()
364 # Hack for https://github.com/rg3/youtube-dl/issues/662
365 if sys
.version_info
< (2, 7, 3):
366 AttrParser
.parse_endtag
= (lambda self
, i
:
367 i
+ len("</scr'+'ipt>")
368 if self
.rawdata
[i
:].startswith("</scr'+'ipt>")
369 else compat_html_parser
.HTMLParser
.parse_endtag(self
, i
))
371 def get_element_by_id(id, html
):
372 """Return the content of the tag with the specified ID in the passed HTML document"""
373 return get_element_by_attribute("id", id, html
)
375 def get_element_by_attribute(attribute
, value
, html
):
376 """Return the content of the tag with the specified attribute in the passed HTML document"""
377 parser
= AttrParser(attribute
, value
)
380 except compat_html_parser
.HTMLParseError
:
382 return parser
.get_result()
384 class MetaParser(BaseHTMLParser
):
386 Modified HTMLParser that isolates a meta tag with the specified name
389 def __init__(self
, name
):
390 BaseHTMLParser
.__init
__(self
)
395 def handle_starttag(self
, tag
, attrs
):
399 if attrs
.get('name') == self
.name
:
400 self
.result
= attrs
.get('content')
402 def get_result(self
):
405 def get_meta_content(name
, html
):
407 Return the content attribute from the meta tag with the given name attribute.
409 parser
= MetaParser(name
)
412 except compat_html_parser
.HTMLParseError
:
414 return parser
.get_result()
417 def clean_html(html
):
418 """Clean an HTML snippet into a readable string"""
420 html
= html
.replace('\n', ' ')
421 html
= re
.sub(r
'\s*<\s*br\s*/?\s*>\s*', '\n', html
)
422 html
= re
.sub(r
'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html
)
424 html
= re
.sub('<.*?>', '', html
)
425 # Replace html entities
426 html
= unescapeHTML(html
)
430 def sanitize_open(filename
, open_mode
):
431 """Try to open the given filename, and slightly tweak it if this fails.
433 Attempts to open the given filename. If this fails, it tries to change
434 the filename slightly, step by step, until it's either able to open it
435 or it fails and raises a final exception, like the standard open()
438 It returns the tuple (stream, definitive_file_name).
442 if sys
.platform
== 'win32':
444 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
445 return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
)
446 stream
= open(encodeFilename(filename
), open_mode
)
447 return (stream
, filename
)
448 except (IOError, OSError) as err
:
449 if err
.errno
in (errno
.EACCES
,):
452 # In case of error, try to remove win32 forbidden chars
453 alt_filename
= os
.path
.join(
454 re
.sub(u
'[/<>:"\\|\\\\?\\*]', u
'#', path_part
)
455 for path_part
in os
.path
.split(filename
)
457 if alt_filename
== filename
:
460 # An exception here should be caught in the caller
461 stream
= open(encodeFilename(filename
), open_mode
)
462 return (stream
, alt_filename
)
465 def timeconvert(timestr
):
466 """Convert RFC 2822 defined time string into system timestamp"""
468 timetuple
= email
.utils
.parsedate_tz(timestr
)
469 if timetuple
is not None:
470 timestamp
= email
.utils
.mktime_tz(timetuple
)
473 def sanitize_filename(s
, restricted
=False, is_id
=False):
474 """Sanitizes a string so it could be used as part of a filename.
475 If restricted is set, use a stricter subset of allowed characters.
476 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
478 def replace_insane(char
):
479 if char
== '?' or ord(char
) < 32 or ord(char
) == 127:
482 return '' if restricted
else '\''
484 return '_-' if restricted
else ' -'
485 elif char
in '\\/|*<>':
487 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace()):
489 if restricted
and ord(char
) > 127:
493 result
= u
''.join(map(replace_insane
, s
))
495 while '__' in result
:
496 result
= result
.replace('__', '_')
497 result
= result
.strip('_')
498 # Common case of "Foreign band name - English song title"
499 if restricted
and result
.startswith('-_'):
505 def orderedSet(iterable
):
506 """ Remove all duplicates from the input iterable """
517 assert type(s
) == compat_str
519 result
= re
.sub(r
'(?u)&(.+?);', htmlentity_transform
, s
)
523 def encodeFilename(s
, for_subprocess
=False):
525 @param s The name of the file
528 assert type(s
) == compat_str
530 # Python 3 has a Unicode API
531 if sys
.version_info
>= (3, 0):
534 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
535 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
536 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
537 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
538 if not for_subprocess
:
541 # For subprocess calls, encode with locale encoding
542 # Refer to http://stackoverflow.com/a/9951851/35070
543 encoding
= preferredencoding()
545 encoding
= sys
.getfilesystemencoding()
548 return s
.encode(encoding
, 'ignore')
def encodeArgument(s):
    """Encode a single subprocess argument, tolerating legacy byte strings."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
560 def decodeOption(optval
):
563 if isinstance(optval
, bytes):
564 optval
= optval
.decode(preferredencoding())
566 assert isinstance(optval
, compat_str
)
569 def formatSeconds(secs
):
571 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
573 return '%d:%02d' % (secs
// 60, secs
% 60)
578 def make_HTTPS_handler(opts_no_check_certificate
, **kwargs
):
579 if sys
.version_info
< (3, 2):
582 class HTTPSConnectionV3(httplib
.HTTPSConnection
):
583 def __init__(self
, *args
, **kwargs
):
584 httplib
.HTTPSConnection
.__init
__(self
, *args
, **kwargs
)
587 sock
= socket
.create_connection((self
.host
, self
.port
), self
.timeout
)
588 if getattr(self
, '_tunnel_host', False):
592 self
.sock
= ssl
.wrap_socket(sock
, self
.key_file
, self
.cert_file
, ssl_version
=ssl
.PROTOCOL_SSLv3
)
594 self
.sock
= ssl
.wrap_socket(sock
, self
.key_file
, self
.cert_file
, ssl_version
=ssl
.PROTOCOL_SSLv23
)
596 class HTTPSHandlerV3(compat_urllib_request
.HTTPSHandler
):
597 def https_open(self
, req
):
598 return self
.do_open(HTTPSConnectionV3
, req
)
599 return HTTPSHandlerV3(**kwargs
)
601 context
= ssl
.SSLContext(ssl
.PROTOCOL_SSLv3
)
602 context
.verify_mode
= (ssl
.CERT_NONE
603 if opts_no_check_certificate
604 else ssl
.CERT_REQUIRED
)
605 context
.set_default_verify_paths()
607 context
.load_default_certs()
608 except AttributeError:
610 return compat_urllib_request
.HTTPSHandler(context
=context
, **kwargs
)
612 class ExtractorError(Exception):
613 """Error during info extraction."""
614 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
615 """ tb, if given, is the original traceback (so that it can be printed out).
616 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
619 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
621 if video_id
is not None:
622 msg
= video_id
+ ': ' + msg
624 msg
= msg
+ u
'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
625 super(ExtractorError
, self
).__init
__(msg
)
628 self
.exc_info
= sys
.exc_info() # preserve original exception
630 self
.video_id
= video_id
632 def format_traceback(self
):
633 if self
.traceback
is None:
635 return u
''.join(traceback
.format_tb(self
.traceback
))
638 class RegexNotFoundError(ExtractorError
):
639 """Error when a regex didn't match"""
643 class DownloadError(Exception):
644 """Download Error exception.
646 This exception may be thrown by FileDownloader objects if they are not
647 configured to continue on errors. They will contain the appropriate
650 def __init__(self
, msg
, exc_info
=None):
651 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
652 super(DownloadError
, self
).__init
__(msg
)
653 self
.exc_info
= exc_info
656 class SameFileError(Exception):
657 """Same File exception.
659 This exception will be thrown by FileDownloader objects if they detect
660 multiple files would have to be downloaded to the same file on disk.
665 class PostProcessingError(Exception):
666 """Post Processing exception.
668 This exception may be raised by PostProcessor's .run() method to
669 indicate an error in the postprocessing task.
671 def __init__(self
, msg
):
674 class MaxDownloadsReached(Exception):
675 """ --max-downloads limit has been reached. """
679 class UnavailableVideoError(Exception):
680 """Unavailable Format exception.
682 This exception will be thrown when a video is requested
683 in a format that is not available for that video.
688 class ContentTooShortError(Exception):
689 """Content Too Short exception.
691 This exception may be raised by FileDownloader objects when a file they
692 download is too small for what the server announced first, indicating
693 the connection was probably interrupted.
699 def __init__(self
, downloaded
, expected
):
700 self
.downloaded
= downloaded
701 self
.expected
= expected
703 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
704 """Handler for HTTP requests and responses.
706 This class, when installed with an OpenerDirector, automatically adds
707 the standard headers to every HTTP request and handles gzipped and
708 deflated responses from web servers. If compression is to be avoided in
709 a particular request, the original request in the program code only has
710 to include the HTTP header "Youtubedl-No-Compression", which will be
711 removed before making the real request.
713 Part of this code was copied from:
715 http://techknack.net/python-urllib2-handlers/
717 Andrew Rowls, the author of that code, agreed to release it to the
724 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
726 return zlib
.decompress(data
)
729 def addinfourl_wrapper(stream
, headers
, url
, code
):
730 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
731 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
732 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
736 def http_request(self
, req
):
737 for h
,v
in std_headers
.items():
741 if 'Youtubedl-no-compression' in req
.headers
:
742 if 'Accept-encoding' in req
.headers
:
743 del req
.headers
['Accept-encoding']
744 del req
.headers
['Youtubedl-no-compression']
745 if 'Youtubedl-user-agent' in req
.headers
:
746 if 'User-agent' in req
.headers
:
747 del req
.headers
['User-agent']
748 req
.headers
['User-agent'] = req
.headers
['Youtubedl-user-agent']
749 del req
.headers
['Youtubedl-user-agent']
752 def http_response(self
, req
, resp
):
755 if resp
.headers
.get('Content-encoding', '') == 'gzip':
756 content
= resp
.read()
757 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
759 uncompressed
= io
.BytesIO(gz
.read())
760 except IOError as original_ioerror
:
761 # There may be junk add the end of the file
762 # See http://stackoverflow.com/q/4928560/35070 for details
763 for i
in range(1, 1024):
765 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
766 uncompressed
= io
.BytesIO(gz
.read())
771 raise original_ioerror
772 resp
= self
.addinfourl_wrapper(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
773 resp
.msg
= old_resp
.msg
775 if resp
.headers
.get('Content-encoding', '') == 'deflate':
776 gz
= io
.BytesIO(self
.deflate(resp
.read()))
777 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
778 resp
.msg
= old_resp
.msg
781 https_request
= http_request
782 https_response
= http_response
785 def parse_iso8601(date_str
, delimiter
='T'):
786 """ Return a UNIX timestamp from the given date """
792 r
'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
795 timezone
= datetime
.timedelta()
797 date_str
= date_str
[:-len(m
.group(0))]
798 if not m
.group('sign'):
799 timezone
= datetime
.timedelta()
801 sign
= 1 if m
.group('sign') == '+' else -1
802 timezone
= datetime
.timedelta(
803 hours
=sign
* int(m
.group('hours')),
804 minutes
=sign
* int(m
.group('minutes')))
805 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
806 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
807 return calendar
.timegm(dt
.timetuple())
810 def unified_strdate(date_str
):
811 """Return a string with the date in the format YYYYMMDD"""
818 date_str
= date_str
.replace(',', ' ')
819 # %z (UTC offset) is only supported in python>=3.2
820 date_str
= re
.sub(r
' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str
)
821 format_expressions
= [
826 '%b %dst %Y %I:%M%p',
827 '%b %dnd %Y %I:%M%p',
828 '%b %dth %Y %I:%M%p',
836 '%Y-%m-%dT%H:%M:%SZ',
837 '%Y-%m-%dT%H:%M:%S.%fZ',
838 '%Y-%m-%dT%H:%M:%S.%f0Z',
840 '%Y-%m-%dT%H:%M:%S.%f',
843 for expression
in format_expressions
:
845 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
848 if upload_date
is None:
849 timetuple
= email
.utils
.parsedate_tz(date_str
)
851 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
854 def determine_ext(url
, default_ext
=u
'unknown_video'):
857 guess
= url
.partition(u
'?')[0].rpartition(u
'.')[2]
858 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name: strip the media extension, then append
    the language code and the subtitle format."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
866 def date_from_str(date_str
):
868 Return a datetime object from a string in the format YYYYMMDD or
869 (now|today)[+-][0-9](day|week|month|year)(s)?"""
870 today
= datetime
.date
.today()
871 if date_str
== 'now'or date_str
== 'today':
873 match
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
874 if match
is not None:
875 sign
= match
.group('sign')
876 time
= int(match
.group('time'))
879 unit
= match
.group('unit')
888 delta
= datetime
.timedelta(**{unit
: time
})
890 return datetime
.datetime
.strptime(date_str
, "%Y%m%d").date()
892 def hyphenate_date(date_str
):
894 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
895 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
896 if match
is not None:
897 return '-'.join(match
.groups())
901 class DateRange(object):
902 """Represents a time interval between two dates"""
903 def __init__(self
, start
=None, end
=None):
904 """start and end must be strings in the format accepted by date"""
905 if start
is not None:
906 self
.start
= date_from_str(start
)
908 self
.start
= datetime
.datetime
.min.date()
910 self
.end
= date_from_str(end
)
912 self
.end
= datetime
.datetime
.max.date()
913 if self
.start
> self
.end
:
914 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
917 """Returns a range that only contains the given day"""
919 def __contains__(self
, date
):
920 """Check if the date is in the range"""
921 if not isinstance(date
, datetime
.date
):
922 date
= date_from_str(date
)
923 return self
.start
<= date
<= self
.end
925 return '%s - %s' % ( self
.start
.isoformat(), self
.end
.isoformat())
929 """ Returns the platform name as a compat_str """
930 res
= platform
.platform()
931 if isinstance(res
, bytes):
932 res
= res
.decode(preferredencoding())
934 assert isinstance(res
, compat_str
)
938 def _windows_write_string(s
, out
):
939 """ Returns True if the string was written using special methods,
940 False if it has yet to be written out."""
941 # Adapted from http://stackoverflow.com/a/3259271/35070
944 import ctypes
.wintypes
952 fileno
= out
.fileno()
953 except AttributeError:
954 # If the output stream doesn't have a fileno, it's virtual
956 if fileno
not in WIN_OUTPUT_IDS
:
959 GetStdHandle
= ctypes
.WINFUNCTYPE(
960 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
961 ("GetStdHandle", ctypes
.windll
.kernel32
))
962 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
964 WriteConsoleW
= ctypes
.WINFUNCTYPE(
965 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
966 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
967 ctypes
.wintypes
.LPVOID
)(("WriteConsoleW", ctypes
.windll
.kernel32
))
968 written
= ctypes
.wintypes
.DWORD(0)
970 GetFileType
= ctypes
.WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(("GetFileType", ctypes
.windll
.kernel32
))
971 FILE_TYPE_CHAR
= 0x0002
972 FILE_TYPE_REMOTE
= 0x8000
973 GetConsoleMode
= ctypes
.WINFUNCTYPE(
974 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
975 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
976 ("GetConsoleMode", ctypes
.windll
.kernel32
))
977 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
979 def not_a_console(handle
):
980 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
982 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
983 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
988 def next_nonbmp_pos(s
):
990 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
991 except StopIteration:
995 count
= min(next_nonbmp_pos(s
), 1024)
998 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
1000 raise OSError('Failed to write string')
1001 if not count
: # We just wrote a non-BMP character
1002 assert written
.value
== 2
1005 assert written
.value
> 0
1006 s
= s
[written
.value
:]
1010 def write_string(s
, out
=None, encoding
=None):
1013 assert type(s
) == compat_str
1015 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
1016 if _windows_write_string(s
, out
):
1019 if ('b' in getattr(out
, 'mode', '') or
1020 sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
1021 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
1023 elif hasattr(out
, 'buffer'):
1024 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
1025 byt
= s
.encode(enc
, 'ignore')
1026 out
.buffer.write(byt
)
1032 def bytes_to_intlist(bs
):
1035 if isinstance(bs
[0], int): # Python 3
1038 return [ord(c
) for c
in bs
]
1041 def intlist_to_bytes(xs
):
1044 if isinstance(chr(0), bytes): # Python 2
1045 return ''.join([chr(x
) for x
in xs
])
def get_cachedir(params=None):
    """Return the youtube-dl cache directory.

    Uses params['cachedir'] when present; otherwise falls back to
    $XDG_CACHE_HOME/youtube-dl (or ~/.cache/youtube-dl).
    """
    # Fix: avoid a mutable default argument; None behaves like the old {}.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1056 # Cross-platform file locking
1057 if sys
.platform
== 'win32':
1058 import ctypes
.wintypes
1061 class OVERLAPPED(ctypes
.Structure
):
1063 ('Internal', ctypes
.wintypes
.LPVOID
),
1064 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
1065 ('Offset', ctypes
.wintypes
.DWORD
),
1066 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
1067 ('hEvent', ctypes
.wintypes
.HANDLE
),
1070 kernel32
= ctypes
.windll
.kernel32
1071 LockFileEx
= kernel32
.LockFileEx
1072 LockFileEx
.argtypes
= [
1073 ctypes
.wintypes
.HANDLE
, # hFile
1074 ctypes
.wintypes
.DWORD
, # dwFlags
1075 ctypes
.wintypes
.DWORD
, # dwReserved
1076 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1077 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1078 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1080 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
1081 UnlockFileEx
= kernel32
.UnlockFileEx
1082 UnlockFileEx
.argtypes
= [
1083 ctypes
.wintypes
.HANDLE
, # hFile
1084 ctypes
.wintypes
.DWORD
, # dwReserved
1085 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1086 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1087 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1089 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
1090 whole_low
= 0xffffffff
1091 whole_high
= 0x7fffffff
1093 def _lock_file(f
, exclusive
):
1094 overlapped
= OVERLAPPED()
1095 overlapped
.Offset
= 0
1096 overlapped
.OffsetHigh
= 0
1097 overlapped
.hEvent
= 0
1098 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
1099 handle
= msvcrt
.get_osfhandle(f
.fileno())
1100 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
1101 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1102 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
1104 def _unlock_file(f
):
1105 assert f
._lock
_file
_overlapped
_p
1106 handle
= msvcrt
.get_osfhandle(f
.fileno())
1107 if not UnlockFileEx(handle
, 0,
1108 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1109 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
1114 def _lock_file(f
, exclusive
):
1115 fcntl
.lockf(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
1117 def _unlock_file(f
):
1118 fcntl
.lockf(f
, fcntl
.LOCK_UN
)
1121 class locked_file(object):
1122 def __init__(self
, filename
, mode
, encoding
=None):
1123 assert mode
in ['r', 'a', 'w']
1124 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
1127 def __enter__(self
):
1128 exclusive
= self
.mode
!= 'r'
1130 _lock_file(self
.f
, exclusive
)
1136 def __exit__(self
, etype
, value
, traceback
):
1138 _unlock_file(self
.f
)
1145 def write(self
, *args
):
1146 return self
.f
.write(*args
)
1148 def read(self
, *args
):
1149 return self
.f
.read(*args
)
1152 def shell_quote(args
):
1154 encoding
= sys
.getfilesystemencoding()
1155 if encoding
is None:
1158 if isinstance(a
, bytes):
1159 # We may get a filename encoded with 'encodeFilename'
1160 a
= a
.decode(encoding
)
1161 quoted_args
.append(pipes
.quote(a
))
1162 return u
' '.join(quoted_args
)
1165 def takewhile_inclusive(pred
, seq
):
1166 """ Like itertools.takewhile, but include the latest evaluated element
1167 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    smuggled = json.dumps(data)
    sdata = compat_urllib_parse.urlencode({u'__youtubedl_smuggle': smuggled})
    return u'#'.join((url, sdata))
1182 def unsmuggle_url(smug_url
, default
=None):
1183 if not '#__youtubedl_smuggle' in smug_url
:
1184 return smug_url
, default
1185 url
, _
, sdata
= smug_url
.rpartition(u
'#')
1186 jsond
= compat_parse_qs(sdata
)[u
'__youtubedl_smuggle'][0]
1187 data
= json
.loads(jsond
)
1191 def format_bytes(bytes):
1194 if type(bytes) is str:
1195 bytes = float(bytes)
1199 exponent
= int(math
.log(bytes, 1024.0))
1200 suffix
= [u
'B', u
'KiB', u
'MiB', u
'GiB', u
'TiB', u
'PiB', u
'EiB', u
'ZiB', u
'YiB'][exponent
]
1201 converted
= float(bytes) / float(1024 ** exponent
)
1202 return u
'%.2f%s' % (converted
, suffix
)
1205 def get_term_width():
1206 columns
= os
.environ
.get('COLUMNS', None)
1211 sp
= subprocess
.Popen(
1213 stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
)
1214 out
, err
= sp
.communicate()
1215 return int(out
.split()[1])
1221 def month_by_name(name
):
1222 """ Return the number of a month by (locale-independently) English name """
1225 u
'January', u
'February', u
'March', u
'April', u
'May', u
'June',
1226 u
'July', u
'August', u
'September', u
'October', u
'November', u
'December']
1228 return ENGLISH_NAMES
.index(name
) + 1
1233 def fix_xml_ampersands(xml_str
):
1234 """Replace all the '&' by '&' in XML"""
1236 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1241 def setproctitle(title
):
1242 assert isinstance(title
, compat_str
)
1244 libc
= ctypes
.cdll
.LoadLibrary("libc.so.6")
1247 title_bytes
= title
.encode('utf-8')
1248 buf
= ctypes
.create_string_buffer(len(title_bytes
))
1249 buf
.value
= title_bytes
1251 libc
.prctl(15, buf
, 0, 0, 0)
1252 except AttributeError:
1253 return # Strange libc, just skip this
1256 def remove_start(s
, start
):
1257 if s
.startswith(start
):
1258 return s
[len(start
):]
def url_basename(url):
    """Return the final path component of *url* (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip(u'/').split(u'/')
    return components[-1]
1267 class HEADRequest(compat_urllib_request
.Request
):
1268 def get_method(self
):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to an int, returning default when v is (or becomes) None.

    get_attr: if given, read that attribute from v first - guarded so a
    None v or a missing attribute degrades to default instead of raising.
    The result is (int(v) * invscale) // scale.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_to_int(int_str):
    """Parse an integer from a string, ignoring ',' and '.' group separators.

    Returns None when int_str is None.
    """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.]', u'', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float as float(v) * invscale / scale; default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string like u'12s', u'1:30' or u'01:02:03' into seconds.

    Returns None for None input or an unrecognized format.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    return res
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('a.mp4', 'tmp') -> 'a.tmp.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, real_ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # None sentinel instead of a shared mutable [] default list.
    if args is None:
        args = []
    try:
        subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable.
        return False
    return exe
class PagedList(object):
    """Lazily concatenated list built from a page-fetching callback.

    pagefunc(pagenum) must return the (iterable) contents of page
    pagenum; pagesize is the nominal number of entries per page.  A
    short (not full) page is taken to be the last one.
    """
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return entries [start:end] as a list, fetching only the needed pages."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the requested slice within this page (0 unless
            # the slice starts mid-page).
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive end offset within this page, or None to take it all.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
try:
    # Probe whether struct accepts a unicode format string.
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file object, one URL per line.

    Bytes lines are decoded as UTF-8 (with replacement); a leading UTF-8
    BOM and surrounding whitespace are stripped; blank lines and comment
    lines (starting with '#', ';' or ']') are dropped.  batch_fd is
    closed when done.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            # Comment line: filtered out by the falsy check below.
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def parse_xml(s):
    """Parse the XML document in the unicode string s, ignoring any doctype."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python 2.6's ElementTree.XML() does not accept a parser= argument.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
if sys.version_info < (3, 0) and sys.platform == 'win32':
    # getpass on Windows Python 2 chokes on unicode prompts; pre-encode
    # them with the preferred locale encoding first.
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parens, trailing ';') from code."""
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_re, r'\1', code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown qualities sort below every known one.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1458 subprocess_check_output
= subprocess
.check_output
1459 except AttributeError:
1460 def subprocess_check_output(*args
, **kwargs
):
1461 assert 'input' not in kwargs
1462 p
= subprocess
.Popen(*args
, stdout
=subprocess
.PIPE
, **kwargs
)
1463 output
, _
= p
.communicate()
1466 raise subprocess
.CalledProcessError(ret
, p
.args
, output
=output
)