2 # -*- coding: utf-8 -*-
20 import urllib
.request
as compat_urllib_request
21 except ImportError: # Python 2
22 import urllib2
as compat_urllib_request
25 import urllib
.error
as compat_urllib_error
26 except ImportError: # Python 2
27 import urllib2
as compat_urllib_error
30 import urllib
.parse
as compat_urllib_parse
31 except ImportError: # Python 2
32 import urllib
as compat_urllib_parse
35 from urllib
.parse
import urlparse
as compat_urllib_parse_urlparse
36 except ImportError: # Python 2
37 from urlparse
import urlparse
as compat_urllib_parse_urlparse
40 import urllib
.parse
as compat_urlparse
41 except ImportError: # Python 2
42 import urlparse
as compat_urlparse
45 import http
.cookiejar
as compat_cookiejar
46 except ImportError: # Python 2
47 import cookielib
as compat_cookiejar
50 import html
.entities
as compat_html_entities
51 except ImportError: # Python 2
52 import htmlentitydefs
as compat_html_entities
55 import html
.parser
as compat_html_parser
56 except ImportError: # Python 2
57 import HTMLParser
as compat_html_parser
60 import http
.client
as compat_http_client
61 except ImportError: # Python 2
62 import httplib
as compat_http_client
65 from urllib
.error
import HTTPError
as compat_HTTPError
66 except ImportError: # Python 2
67 from urllib2
import HTTPError
as compat_HTTPError
70 from urllib
.request
import urlretrieve
as compat_urlretrieve
71 except ImportError: # Python 2
72 from urllib
import urlretrieve
as compat_urlretrieve
76 from subprocess
import DEVNULL
77 compat_subprocess_get_DEVNULL
= lambda: DEVNULL
79 compat_subprocess_get_DEVNULL
= lambda: open(os
.path
.devnull
, 'w')
82 from urllib
.parse
import parse_qs
as compat_parse_qs
83 except ImportError: # Python 2
84 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
85 # Python 2's version is apparently totally broken
86 def _unquote(string
, encoding
='utf-8', errors
='replace'):
89 res
= string
.split('%')
96 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
103 pct_sequence
+= item
[:2].decode('hex')
106 # This segment was just a single percent-encoded character.
107 # May be part of a sequence of code units, so delay decoding.
108 # (Stored in pct_sequence).
112 # Encountered non-percent-encoded characters. Flush the current
114 string
+= pct_sequence
.decode(encoding
, errors
) + rest
117 # Flush the final pct_sequence
118 string
+= pct_sequence
.decode(encoding
, errors
)
121 def _parse_qsl(qs
, keep_blank_values
=False, strict_parsing
=False,
122 encoding
='utf-8', errors
='replace'):
123 qs
, _coerce_result
= qs
, unicode
124 pairs
= [s2
for s1
in qs
.split('&') for s2
in s1
.split(';')]
126 for name_value
in pairs
:
127 if not name_value
and not strict_parsing
:
129 nv
= name_value
.split('=', 1)
132 raise ValueError("bad query field: %r" % (name_value
,))
133 # Handle case of a control-name with no equal sign
134 if keep_blank_values
:
138 if len(nv
[1]) or keep_blank_values
:
139 name
= nv
[0].replace('+', ' ')
140 name
= _unquote(name
, encoding
=encoding
, errors
=errors
)
141 name
= _coerce_result(name
)
142 value
= nv
[1].replace('+', ' ')
143 value
= _unquote(value
, encoding
=encoding
, errors
=errors
)
144 value
= _coerce_result(value
)
145 r
.append((name
, value
))
148 def compat_parse_qs(qs
, keep_blank_values
=False, strict_parsing
=False,
149 encoding
='utf-8', errors
='replace'):
151 pairs
= _parse_qsl(qs
, keep_blank_values
, strict_parsing
,
152 encoding
=encoding
, errors
=errors
)
153 for name
, value
in pairs
:
154 if name
in parsed_result
:
155 parsed_result
[name
].append(value
)
157 parsed_result
[name
] = [value
]
161 compat_str
= unicode # Python 2
166 compat_chr
= unichr # Python 2
171 if type(c
) is int: return c
174 # This is not clearly defined otherwise
175 compiled_regex_type
= type(re
.compile(''))
178 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
179 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
180 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
181 'Accept-Encoding': 'gzip, deflate',
182 'Accept-Language': 'en-us,en;q=0.5',
185 def preferredencoding():
186 """Get preferred encoding.
188 Returns the best encoding scheme for the system, based on
189 locale.getpreferredencoding() and some further tweaks.
192 pref
= locale
.getpreferredencoding()
199 if sys
.version_info
< (3,0):
201 print(s
.encode(preferredencoding(), 'xmlcharrefreplace'))
204 assert type(s
) == type(u
'')
207 # In Python 2.x, json.dump expects a bytestream.
208 # In Python 3.x, it writes to a character stream
209 if sys
.version_info
< (3,0):
210 def write_json_file(obj
, fn
):
211 with open(fn
, 'wb') as f
:
214 def write_json_file(obj
, fn
):
215 with open(fn
, 'w', encoding
='utf-8') as f
:
218 if sys
.version_info
>= (2,7):
def find_xpath_attr(node, xpath, key, val):
    """Find the first element matching the xpath ``xpath[@key='val']``.

    node  -- object whose find() method is called with the built expression
    xpath -- path prefix the attribute predicate is appended to
    key   -- attribute name; must consist of ASCII letters only
    val   -- attribute value; only ASCII letters, digits, '@' and
             whitespace are accepted (may be empty)

    Returns whatever node.find() returns for the built expression.
    Raises AssertionError if key or val contain any other character.
    """
    # key and val are interpolated into the expression verbatim, so they are
    # validated first.  The checks raise explicitly instead of relying on the
    # assert statement, which is removed when Python runs with -O.
    if not re.match(r'^[a-zA-Z]+$', key):
        raise AssertionError('invalid attribute name: %r' % (key,))
    if not re.match(r'^[a-zA-Z0-9@\s]*$', val):
        raise AssertionError('invalid attribute value: %r' % (val,))
    expr = xpath + u"[@%s='%s']" % (key, val)
    return node.find(expr)
226 def find_xpath_attr(node
, xpath
, key
, val
):
227 for f
in node
.findall(xpath
):
228 if f
.attrib
.get(key
) == val
:
232 def htmlentity_transform(matchobj
):
233 """Transforms an HTML entity to a character.
235 This function receives a match object and is intended to be used with
236 the re.sub() function.
238 entity
= matchobj
.group(1)
240 # Known non-numeric HTML entity
241 if entity
in compat_html_entities
.name2codepoint
:
242 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
244 mobj
= re
.match(u
'(?u)#(x?\\d+)', entity
)
246 numstr
= mobj
.group(1)
247 if numstr
.startswith(u
'x'):
249 numstr
= u
'0%s' % numstr
252 return compat_chr(int(numstr
, base
))
254 # Unknown entity in name, return its literal representation
255 return (u
'&%s;' % entity
)
257 compat_html_parser
.locatestarttagend
= re
.compile(r
"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re
.VERBOSE
) # backport bugfix
258 class BaseHTMLParser(compat_html_parser
.HTMLParser
):
260 compat_html_parser
.HTMLParser
.__init
__(self
)
263 def loads(self
, html
):
268 class AttrParser(BaseHTMLParser
):
269 """Modified HTMLParser that isolates a tag with the specified attribute"""
270 def __init__(self
, attribute
, value
):
271 self
.attribute
= attribute
276 self
.watch_startpos
= False
278 BaseHTMLParser
.__init
__(self
)
280 def error(self
, message
):
281 if self
.error_count
> 10 or self
.started
:
282 raise compat_html_parser
.HTMLParseError(message
, self
.getpos())
283 self
.rawdata
= '\n'.join(self
.html
.split('\n')[self
.getpos()[0]:]) # skip one line
284 self
.error_count
+= 1
287 def handle_starttag(self
, tag
, attrs
):
290 self
.find_startpos(None)
291 if self
.attribute
in attrs
and attrs
[self
.attribute
] == self
.value
:
294 self
.watch_startpos
= True
296 if not tag
in self
.depth
: self
.depth
[tag
] = 0
299 def handle_endtag(self
, tag
):
301 if tag
in self
.depth
: self
.depth
[tag
] -= 1
302 if self
.depth
[self
.result
[0]] == 0:
304 self
.result
.append(self
.getpos())
306 def find_startpos(self
, x
):
307 """Needed to put the start position of the result (self.result[1])
308 after the opening tag with the requested id"""
309 if self
.watch_startpos
:
310 self
.watch_startpos
= False
311 self
.result
.append(self
.getpos())
312 handle_entityref
= handle_charref
= handle_data
= handle_comment
= \
313 handle_decl
= handle_pi
= unknown_decl
= find_startpos
315 def get_result(self
):
316 if self
.result
is None:
318 if len(self
.result
) != 3:
320 lines
= self
.html
.split('\n')
321 lines
= lines
[self
.result
[1][0]-1:self
.result
[2][0]]
322 lines
[0] = lines
[0][self
.result
[1][1]:]
324 lines
[-1] = lines
[-1][:self
.result
[2][1]-self
.result
[1][1]]
325 lines
[-1] = lines
[-1][:self
.result
[2][1]]
326 return '\n'.join(lines
).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Only applied on interpreters older than 2.7.3: AttrParser.parse_endtag is
# replaced so that the literal text </scr'+'ipt> at offset i is stepped over
# (the offset just past it is returned) instead of being handed to the stock
# HTMLParser.parse_endtag; every other position is delegated unchanged.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (
        lambda self, i:
            compat_html_parser.HTMLParser.parse_endtag(self, i)
            if not self.rawdata[i:].startswith("</scr'+'ipt>")
            else i + len("</scr'+'ipt>"))
def get_element_by_id(id, html):
    """Return the content of the tag in *html* whose "id" attribute is *id*.

    Convenience wrapper: the lookup itself is delegated to
    get_element_by_attribute with the attribute name fixed to "id", and its
    result is returned unchanged.
    """
    element_content = get_element_by_attribute("id", id, html)
    return element_content
338 def get_element_by_attribute(attribute
, value
, html
):
339 """Return the content of the tag with the specified attribute in the passed HTML document"""
340 parser
= AttrParser(attribute
, value
)
343 except compat_html_parser
.HTMLParseError
:
345 return parser
.get_result()
347 class MetaParser(BaseHTMLParser
):
349 Modified HTMLParser that isolates a meta tag with the specified name
352 def __init__(self
, name
):
353 BaseHTMLParser
.__init
__(self
)
358 def handle_starttag(self
, tag
, attrs
):
362 if attrs
.get('name') == self
.name
:
363 self
.result
= attrs
.get('content')
365 def get_result(self
):
368 def get_meta_content(name
, html
):
370 Return the content attribute from the meta tag with the given name attribute.
372 parser
= MetaParser(name
)
375 except compat_html_parser
.HTMLParseError
:
377 return parser
.get_result()
380 def clean_html(html
):
381 """Clean an HTML snippet into a readable string"""
383 html
= html
.replace('\n', ' ')
384 html
= re
.sub(r
'\s*<\s*br\s*/?\s*>\s*', '\n', html
)
385 html
= re
.sub(r
'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html
)
387 html
= re
.sub('<.*?>', '', html
)
388 # Replace html entities
389 html
= unescapeHTML(html
)
393 def sanitize_open(filename
, open_mode
):
394 """Try to open the given filename, and slightly tweak it if this fails.
396 Attempts to open the given filename. If this fails, it tries to change
397 the filename slightly, step by step, until it's either able to open it
398 or it fails and raises a final exception, like the standard open()
401 It returns the tuple (stream, definitive_file_name).
405 if sys
.platform
== 'win32':
407 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
408 return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
)
409 stream
= open(encodeFilename(filename
), open_mode
)
410 return (stream
, filename
)
411 except (IOError, OSError) as err
:
412 if err
.errno
in (errno
.EACCES
,):
415 # In case of error, try to remove win32 forbidden chars
416 alt_filename
= os
.path
.join(
417 re
.sub(u
'[/<>:"\\|\\\\?\\*]', u
'#', path_part
)
418 for path_part
in os
.path
.split(filename
)
420 if alt_filename
== filename
:
423 # An exception here should be caught in the caller
424 stream
= open(encodeFilename(filename
), open_mode
)
425 return (stream
, alt_filename
)
428 def timeconvert(timestr
):
429 """Convert RFC 2822 defined time string into system timestamp"""
431 timetuple
= email
.utils
.parsedate_tz(timestr
)
432 if timetuple
is not None:
433 timestamp
= email
.utils
.mktime_tz(timetuple
)
436 def sanitize_filename(s
, restricted
=False, is_id
=False):
437 """Sanitizes a string so it could be used as part of a filename.
438 If restricted is set, use a stricter subset of allowed characters.
439 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
441 def replace_insane(char
):
442 if char
== '?' or ord(char
) < 32 or ord(char
) == 127:
445 return '' if restricted
else '\''
447 return '_-' if restricted
else ' -'
448 elif char
in '\\/|*<>':
450 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace()):
452 if restricted
and ord(char
) > 127:
456 result
= u
''.join(map(replace_insane
, s
))
458 while '__' in result
:
459 result
= result
.replace('__', '_')
460 result
= result
.strip('_')
461 # Common case of "Foreign band name - English song title"
462 if restricted
and result
.startswith('-_'):
468 def orderedSet(iterable
):
469 """ Remove all duplicates from the input iterable """
480 assert type(s
) == type(u
'')
482 result
= re
.sub(u
'(?u)&(.+?);', htmlentity_transform
, s
)
485 def encodeFilename(s
):
487 @param s The name of the file
490 assert type(s
) == type(u
'')
492 # Python 3 has a Unicode API
493 if sys
.version_info
>= (3, 0):
496 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
497 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
498 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
499 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
502 encoding
= sys
.getfilesystemencoding()
505 return s
.encode(encoding
, 'ignore')
507 def decodeOption(optval
):
510 if isinstance(optval
, bytes):
511 optval
= optval
.decode(preferredencoding())
513 assert isinstance(optval
, compat_str
)
516 def formatSeconds(secs
):
518 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
520 return '%d:%02d' % (secs
// 60, secs
% 60)
524 def make_HTTPS_handler(opts
):
525 if sys
.version_info
< (3,2):
526 # Python's 2.x handler is very simplistic
527 return compat_urllib_request
.HTTPSHandler()
530 context
= ssl
.SSLContext(ssl
.PROTOCOL_SSLv23
)
531 context
.set_default_verify_paths()
533 context
.verify_mode
= (ssl
.CERT_NONE
534 if opts
.no_check_certificate
535 else ssl
.CERT_REQUIRED
)
536 return compat_urllib_request
.HTTPSHandler(context
=context
)
538 class ExtractorError(Exception):
539 """Error during info extraction."""
540 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None):
541 """ tb, if given, is the original traceback (so that it can be printed out).
542 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
545 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
548 msg
= msg
+ u
'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
549 super(ExtractorError
, self
).__init
__(msg
)
552 self
.exc_info
= sys
.exc_info() # preserve original exception
555 def format_traceback(self
):
556 if self
.traceback
is None:
558 return u
''.join(traceback
.format_tb(self
.traceback
))
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors.  The message passed to the
    constructor becomes the exception message.
    """

    def __init__(self, msg, exc_info=None):
        """Create the error.

        msg      -- message forwarded to the Exception constructor
        exc_info -- optional; the original exception that caused the trouble,
                    as returned by sys.exc_info().  Stored as self.exc_info.
        """
        # Keep the originating exception info available to whoever catches us.
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
574 class SameFileError(Exception):
575 """Same File exception.
577 This exception will be thrown by FileDownloader objects if they detect
578 multiple files would have to be downloaded to the same file on disk.
583 class PostProcessingError(Exception):
584 """Post Processing exception.
586 This exception may be raised by PostProcessor's .run() method to
587 indicate an error in the postprocessing task.
589 def __init__(self
, msg
):
592 class MaxDownloadsReached(Exception):
593 """ --max-downloads limit has been reached. """
597 class UnavailableVideoError(Exception):
598 """Unavailable Format exception.
600 This exception will be thrown when a video is requested
601 in a format that is not available for that video.
606 class ContentTooShortError(Exception):
607 """Content Too Short exception.
609 This exception may be raised by FileDownloader objects when a file they
610 download is too small for what the server announced first, indicating
611 the connection was probably interrupted.
617 def __init__(self
, downloaded
, expected
):
618 self
.downloaded
= downloaded
619 self
.expected
= expected
621 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
622 """Handler for HTTP requests and responses.
624 This class, when installed with an OpenerDirector, automatically adds
625 the standard headers to every HTTP request and handles gzipped and
626 deflated responses from web servers. If compression is to be avoided in
627 a particular request, the original request in the program code only has
628 to include the HTTP header "Youtubedl-No-Compression", which will be
629 removed before making the real request.
631 Part of this code was copied from:
633 http://techknack.net/python-urllib2-handlers/
635 Andrew Rowls, the author of that code, agreed to release it to the
642 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
644 return zlib
.decompress(data
)
647 def addinfourl_wrapper(stream
, headers
, url
, code
):
648 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
649 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
650 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
654 def http_request(self
, req
):
655 for h
,v
in std_headers
.items():
659 if 'Youtubedl-no-compression' in req
.headers
:
660 if 'Accept-encoding' in req
.headers
:
661 del req
.headers
['Accept-encoding']
662 del req
.headers
['Youtubedl-no-compression']
663 if 'Youtubedl-user-agent' in req
.headers
:
664 if 'User-agent' in req
.headers
:
665 del req
.headers
['User-agent']
666 req
.headers
['User-agent'] = req
.headers
['Youtubedl-user-agent']
667 del req
.headers
['Youtubedl-user-agent']
670 def http_response(self
, req
, resp
):
673 if resp
.headers
.get('Content-encoding', '') == 'gzip':
674 content
= resp
.read()
675 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
677 uncompressed
= io
.BytesIO(gz
.read())
678 except IOError as original_ioerror
:
679 # There may be junk at the end of the file
680 # See http://stackoverflow.com/q/4928560/35070 for details
681 for i
in range(1, 1024):
683 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
684 uncompressed
= io
.BytesIO(gz
.read())
689 raise original_ioerror
690 resp
= self
.addinfourl_wrapper(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
691 resp
.msg
= old_resp
.msg
693 if resp
.headers
.get('Content-encoding', '') == 'deflate':
694 gz
= io
.BytesIO(self
.deflate(resp
.read()))
695 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
696 resp
.msg
= old_resp
.msg
699 https_request
= http_request
700 https_response
= http_response
702 def unified_strdate(date_str
):
703 """Return a string with the date in the format YYYYMMDD"""
706 date_str
= date_str
.replace(',',' ')
707 # %z (UTC offset) is only supported in python>=3.2
708 date_str
= re
.sub(r
' (\+|-)[\d]*$', '', date_str
)
709 format_expressions
= [
717 '%Y-%m-%dT%H:%M:%SZ',
719 for expression
in format_expressions
:
721 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
726 def determine_ext(url
, default_ext
=u
'unknown_video'):
727 guess
= url
.partition(u
'?')[0].rpartition(u
'.')[2]
728 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
def subtitles_filename(filename, sub_lang, sub_format):
    """Return the subtitle file name that belongs to *filename*.

    The extension of filename (if there is one) is replaced by
    ``.<sub_lang>.<sub_format>``; without an extension the suffix is simply
    appended.

    filename   -- name or path of the media file
    sub_lang   -- subtitle language code inserted before the format
    sub_format -- subtitle format, used as the new extension
    """
    # os.path.splitext only inspects the last path component.  A plain
    # rsplit('.', 1) would cut at a dot inside a directory name whenever the
    # file itself has no extension (e.g. '/a.b/video' -> '/a').
    base = os.path.splitext(filename)[0]
    return base + u'.' + sub_lang + u'.' + sub_format
736 def date_from_str(date_str
):
738 Return a datetime object from a string in the format YYYYMMDD or
739 (now|today)[+-][0-9](day|week|month|year)(s)?"""
740 today
= datetime
.date
.today()
741 if date_str
== 'now'or date_str
== 'today':
743 match
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
744 if match
is not None:
745 sign
= match
.group('sign')
746 time
= int(match
.group('time'))
749 unit
= match
.group('unit')
758 delta
= datetime
.timedelta(**{unit
: time
})
760 return datetime
.datetime
.strptime(date_str
, "%Y%m%d").date()
762 class DateRange(object):
763 """Represents a time interval between two dates"""
764 def __init__(self
, start
=None, end
=None):
765 """start and end must be strings in the format accepted by date_from_str"""
766 if start
is not None:
767 self
.start
= date_from_str(start
)
769 self
.start
= datetime
.datetime
.min.date()
771 self
.end
= date_from_str(end
)
773 self
.end
= datetime
.datetime
.max.date()
774 if self
.start
> self
.end
:
775 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
778 """Returns a range that only contains the given day"""
780 def __contains__(self
, date
):
781 """Check if the date is in the range"""
782 if not isinstance(date
, datetime
.date
):
783 date
= date_from_str(date
)
784 return self
.start
<= date
<= self
.end
786 return '%s - %s' % ( self
.start
.isoformat(), self
.end
.isoformat())
790 """ Returns the platform name as a compat_str """
791 res
= platform
.platform()
792 if isinstance(res
, bytes):
793 res
= res
.decode(preferredencoding())
795 assert isinstance(res
, compat_str
)
799 def write_string(s
, out
=None):
802 assert type(s
) == type(u
'')
804 if ('b' in getattr(out
, 'mode', '') or
805 sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
806 s
= s
.encode(preferredencoding(), 'ignore')
811 def bytes_to_intlist(bs
):
814 if isinstance(bs
[0], int): # Python 3
817 return [ord(c
) for c
in bs
]
820 def intlist_to_bytes(xs
):
823 if isinstance(chr(0), bytes): # Python 2
824 return ''.join([chr(x
) for x
in xs
])