# -*- coding: utf-8 -*-

import datetime
import email.utils
import errno
import gzip
import io
import json
import locale
import os
import platform
import re
import socket
import sys
import traceback
import zlib

try:
    import urllib.request as compat_urllib_request
except ImportError: # Python 2
    import urllib2 as compat_urllib_request

try:
    import urllib.error as compat_urllib_error
except ImportError: # Python 2
    import urllib2 as compat_urllib_error

try:
    import urllib.parse as compat_urllib_parse
except ImportError: # Python 2
    import urllib as compat_urllib_parse

try:
    from urllib.parse import urlparse as compat_urllib_parse_urlparse
except ImportError: # Python 2
    from urlparse import urlparse as compat_urllib_parse_urlparse

try:
    import urllib.parse as compat_urlparse
except ImportError: # Python 2
    import urlparse as compat_urlparse

try:
    import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
    import cookielib as compat_cookiejar

try:
    import html.entities as compat_html_entities
except ImportError: # Python 2
    import htmlentitydefs as compat_html_entities

try:
    import html.parser as compat_html_parser
except ImportError: # Python 2
    import HTMLParser as compat_html_parser

try:
    import http.client as compat_http_client
except ImportError: # Python 2
    import httplib as compat_http_client

try:
    from urllib.error import HTTPError as compat_HTTPError
except ImportError: # Python 2
    from urllib2 import HTTPError as compat_HTTPError

try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')

try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result

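# Illustrative usage (added; not part of the original module). This assumes compat_parse_qs
# behaves like Python 3's urllib.parse.parse_qs: every name maps to a list of values and
# repeated names accumulate. The query string below is invented for demonstration.
def _example_compat_parse_qs():
    parsed = compat_parse_qs(u'v=abc123&fmt=18&fmt=22')
    assert parsed == {u'v': [u'abc123'], u'fmt': [u'18', u'22']}
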
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

def compat_ord(c):
    if type(c) is int: return c
    else: return ord(c)

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        u'TEST'.encode(pref)
    except:
        pref = 'UTF-8'

    return pref

if sys.version_info < (3,0):
    def compat_print(s):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        assert type(s) == type(u'')
        print(s)

# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)

if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None

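# Minimal sketch (added for illustration; not from the original source) of how
# find_xpath_attr is meant to be called: look up a child node carrying a given
# attribute value in an ElementTree document. The XML snippet is invented.
def _example_find_xpath_attr():
    import xml.etree.ElementTree as ET
    doc = ET.fromstring(u'<root><item id="a"/><item id="b"/></root>')
    node = find_xpath_attr(doc, u'item', u'id', u'b')
    assert node is not None and node.attrib[u'id'] == u'b'
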
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(u'(?u)#(x?\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)

compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None
        self.started = False
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()

# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))

def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute("id", id, html)

def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass
    return parser.get_result()

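# Hedged usage sketch (added; not part of the original file): extracting the inner content
# of a tag by its id attribute with get_element_by_id. The HTML snippet is invented, and
# the check is kept loose since the exact whitespace handling depends on the parser.
def _example_get_element_by_id():
    html = u'<html><body><div id="player">Hello <b>world</b></div></body></html>'
    content = get_element_by_id(u'player', html)
    assert content is not None and u'world' in content
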
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

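# Illustrative example (added): clean_html strips tags, turns <br> and paragraph breaks
# into newlines, and resolves entities via unescapeHTML. The snippet below is invented.
def _example_clean_html():
    text = clean_html(u'<p>Tom &amp; Jerry</p><p>Season&nbsp;1<br/>Episode 2</p>')
    assert u'Tom & Jerry' in text and u'\n' in text
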
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = os.path.join(
                        re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
                        for path_part in os.path.split(filename)
                       )
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(filename), open_mode)
            return (stream, alt_filename)

def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

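# Small illustration (added; not from the original source): timeconvert turns an RFC 2822
# date string into a Unix timestamp and returns None for strings it cannot parse.
def _example_timeconvert():
    assert timeconvert(u'Wed, 02 Oct 2002 13:00:00 GMT') == 1033563600
    assert timeconvert(u'not a date') is None
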
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result

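# Hedged usage sketch (added, not original): sanitize_filename in normal and restricted
# mode. The input title is invented for demonstration.
def _example_sanitize_filename():
    assert sanitize_filename(u'AC/DC: Live?') == u'AC_DC - Live'
    assert sanitize_filename(u'AC/DC: Live?', restricted=True) == u'AC_DC_-_Live'
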
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res

def unescapeHTML(s):
    assert type(s) == type(u'')

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result

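# Illustrative example (added): unescapeHTML resolves named and numeric entities through
# htmlentity_transform, while unknown entities are kept literally.
def _example_unescapeHTML():
    assert unescapeHTML(u'a &amp; b &#39;c&#39; &foobar;') == u"a & b 'c' &foobar;"
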
def encodeFilename(s):
    """
    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        encoding = sys.getfilesystemencoding()
        if encoding is None:
            encoding = 'utf-8'
        return s.encode(encoding, 'ignore')

def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval

def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs

def make_HTTPS_handler(opts):
    if sys.version_info < (3,2):
        # Python's 2.x handler is very simplistic
        return compat_urllib_request.HTTPSHandler()
    else:
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        context.verify_mode = (ssl.CERT_NONE
                               if opts.no_check_certificate
                               else ssl.CERT_REQUIRED)
        return compat_urllib_request.HTTPSHandler(context=context)

class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))

class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info

class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected

class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response

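# Minimal sketch (added; not part of the original module) of how YoutubeDLHandler is meant
# to be installed: build an opener with it so every request gets std_headers and gzip or
# deflate responses are decoded transparently. The URL in the comment is a placeholder.
def _example_build_opener():
    opener = compat_urllib_request.build_opener(YoutubeDLHandler())
    # opener.open('http://example.com/') would now send the standard headers.
    return opener
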
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    return upload_date

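# Illustrative example (added): the same upload date in two of the accepted input formats
# normalizes to YYYYMMDD.
def _example_unified_strdate():
    assert unified_strdate(u'December 21, 2010') == '20101221'
    assert unified_strdate(u'2010-12-21') == '20101221'
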
def determine_ext(url, default_ext=u'unknown_video'):
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext

def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format

def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A rough approximation: months and years are converted to days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()

class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)
    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end
    def __str__(self):
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())

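# Hedged usage example (added): DateRange accepts the same strings as date_from_str and
# supports membership tests, which is how date-based filtering is typically applied.
def _example_daterange():
    span = DateRange(u'20130101', u'20131231')
    assert u'20130615' in span
    assert datetime.date(2014, 1, 1) not in span
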
739 """ Returns the platform name as a compat_str """
740 res
= platform
.platform()
741 if isinstance(res
, bytes):
742 res
= res
.decode(preferredencoding())
744 assert isinstance(res
, compat_str
)
def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int): # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    if isinstance(chr(0), bytes): # Python 2
        return ''.join([chr(x) for x in xs])
    else:
        return bytes(xs)
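
# Illustrative round trip (added): bytes_to_intlist and intlist_to_bytes are inverses,
# which is useful when byte strings have to be manipulated as plain integer lists.
def _example_intlist_roundtrip():
    data = b'\x00\x10\xff'
    ints = bytes_to_intlist(data)
    assert ints == [0, 16, 255]
    assert intlist_to_bytes(ints) == data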