2 # -*- coding: utf-8 -*-
19 import urllib
.request
as compat_urllib_request
20 except ImportError: # Python 2
21 import urllib2
as compat_urllib_request
24 import urllib
.error
as compat_urllib_error
25 except ImportError: # Python 2
26 import urllib2
as compat_urllib_error
29 import urllib
.parse
as compat_urllib_parse
30 except ImportError: # Python 2
31 import urllib
as compat_urllib_parse
34 from urllib
.parse
import urlparse
as compat_urllib_parse_urlparse
35 except ImportError: # Python 2
36 from urlparse
import urlparse
as compat_urllib_parse_urlparse
39 import http
.cookiejar
as compat_cookiejar
40 except ImportError: # Python 2
41 import cookielib
as compat_cookiejar
44 import html
.entities
as compat_html_entities
45 except ImportError: # Python 2
46 import htmlentitydefs
as compat_html_entities
49 import html
.parser
as compat_html_parser
50 except ImportError: # Python 2
51 import HTMLParser
as compat_html_parser
54 import http
.client
as compat_http_client
55 except ImportError: # Python 2
56 import httplib
as compat_http_client
59 from subprocess
import DEVNULL
60 compat_subprocess_get_DEVNULL
= lambda: DEVNULL
62 compat_subprocess_get_DEVNULL
= lambda: open(os
.path
.devnull
, 'w')
65 from urllib
.parse
import parse_qs
as compat_parse_qs
66 except ImportError: # Python 2
67 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
68 # Python 2's version is apparently totally broken
69 def _unquote(string
, encoding
='utf-8', errors
='replace'):
72 res
= string
.split('%')
79 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
86 pct_sequence
+= item
[:2].decode('hex')
89 # This segment was just a single percent-encoded character.
90 # May be part of a sequence of code units, so delay decoding.
91 # (Stored in pct_sequence).
95 # Encountered non-percent-encoded characters. Flush the current
97 string
+= pct_sequence
.decode(encoding
, errors
) + rest
100 # Flush the final pct_sequence
101 string
+= pct_sequence
.decode(encoding
, errors
)
104 def _parse_qsl(qs
, keep_blank_values
=False, strict_parsing
=False,
105 encoding
='utf-8', errors
='replace'):
106 qs
, _coerce_result
= qs
, unicode
107 pairs
= [s2
for s1
in qs
.split('&') for s2
in s1
.split(';')]
109 for name_value
in pairs
:
110 if not name_value
and not strict_parsing
:
112 nv
= name_value
.split('=', 1)
115 raise ValueError("bad query field: %r" % (name_value
,))
116 # Handle case of a control-name with no equal sign
117 if keep_blank_values
:
121 if len(nv
[1]) or keep_blank_values
:
122 name
= nv
[0].replace('+', ' ')
123 name
= _unquote(name
, encoding
=encoding
, errors
=errors
)
124 name
= _coerce_result(name
)
125 value
= nv
[1].replace('+', ' ')
126 value
= _unquote(value
, encoding
=encoding
, errors
=errors
)
127 value
= _coerce_result(value
)
128 r
.append((name
, value
))
131 def compat_parse_qs(qs
, keep_blank_values
=False, strict_parsing
=False,
132 encoding
='utf-8', errors
='replace'):
134 pairs
= _parse_qsl(qs
, keep_blank_values
, strict_parsing
,
135 encoding
=encoding
, errors
=errors
)
136 for name
, value
in pairs
:
137 if name
in parsed_result
:
138 parsed_result
[name
].append(value
)
140 parsed_result
[name
] = [value
]
144 compat_str
= unicode # Python 2
149 compat_chr
= unichr # Python 2
154 if type(c
) is int: return c
157 # This is not clearly defined otherwise
158 compiled_regex_type
= type(re
.compile(''))
161 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
162 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
163 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
164 'Accept-Encoding': 'gzip, deflate',
165 'Accept-Language': 'en-us,en;q=0.5',
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually exists; some platforms
        # return names the codec registry does not know.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
182 if sys
.version_info
< (3,0):
184 print(s
.encode(preferredencoding(), 'xmlcharrefreplace'))
187 assert type(s
) == type(u
'')
190 # In Python 2.x, json.dump expects a bytestream.
191 # In Python 3.x, it writes to a character stream
192 if sys
.version_info
< (3,0):
193 def write_json_file(obj
, fn
):
194 with open(fn
, 'wb') as f
:
197 def write_json_file(obj
, fn
):
198 with open(fn
, 'w', encoding
='utf-8') as f
:
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (#160) or hexadecimal (#x30).
    mobj = re.match(u'(?u)#(x?\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            # Prefix with '0' so int() sees a valid '0x...' literal.
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
226 compat_html_parser
.locatestarttagend
= re
.compile(r
"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re
.VERBOSE
) # backport bugfix
227 class AttrParser(compat_html_parser
.HTMLParser
):
228 """Modified HTMLParser that isolates a tag with the specified attribute"""
229 def __init__(self
, attribute
, value
):
230 self
.attribute
= attribute
236 self
.watch_startpos
= False
238 compat_html_parser
.HTMLParser
.__init
__(self
)
240 def error(self
, message
):
241 if self
.error_count
> 10 or self
.started
:
242 raise compat_html_parser
.HTMLParseError(message
, self
.getpos())
243 self
.rawdata
= '\n'.join(self
.html
.split('\n')[self
.getpos()[0]:]) # skip one line
244 self
.error_count
+= 1
247 def loads(self
, html
):
252 def handle_starttag(self
, tag
, attrs
):
255 self
.find_startpos(None)
256 if self
.attribute
in attrs
and attrs
[self
.attribute
] == self
.value
:
259 self
.watch_startpos
= True
261 if not tag
in self
.depth
: self
.depth
[tag
] = 0
264 def handle_endtag(self
, tag
):
266 if tag
in self
.depth
: self
.depth
[tag
] -= 1
267 if self
.depth
[self
.result
[0]] == 0:
269 self
.result
.append(self
.getpos())
271 def find_startpos(self
, x
):
272 """Needed to put the start position of the result (self.result[1])
273 after the opening tag with the requested id"""
274 if self
.watch_startpos
:
275 self
.watch_startpos
= False
276 self
.result
.append(self
.getpos())
277 handle_entityref
= handle_charref
= handle_data
= handle_comment
= \
278 handle_decl
= handle_pi
= unknown_decl
= find_startpos
280 def get_result(self
):
281 if self
.result
is None:
283 if len(self
.result
) != 3:
285 lines
= self
.html
.split('\n')
286 lines
= lines
[self
.result
[1][0]-1:self
.result
[2][0]]
287 lines
[0] = lines
[0][self
.result
[1][1]:]
289 lines
[-1] = lines
[-1][:self
.result
[2][1]-self
.result
[1][1]]
290 lines
[-1] = lines
[-1][:self
.result
[2][1]]
291 return '\n'.join(lines
).strip()
292 # Hack for https://github.com/rg3/youtube-dl/issues/662
293 if sys
.version_info
< (2, 7, 3):
294 AttrParser
.parse_endtag
= (lambda self
, i
:
295 i
+ len("</scr'+'ipt>")
296 if self
.rawdata
[i
:].startswith("</scr'+'ipt>")
297 else compat_html_parser
.HTMLParser
.parse_endtag(self
, i
))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NOTE(review): `id` shadows the builtin of the same name; kept as-is
    # for caller compatibility (callers may pass it as a keyword argument).
    return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best-effort: a malformed document may still have produced a
        # usable (partial) result by the time parsing failed.
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Paragraph breaks become newlines.
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip remaining html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
326 def sanitize_open(filename
, open_mode
):
327 """Try to open the given filename, and slightly tweak it if this fails.
329 Attempts to open the given filename. If this fails, it tries to change
330 the filename slightly, step by step, until it's either able to open it
331 or it fails and raises a final exception, like the standard open()
334 It returns the tuple (stream, definitive_file_name).
338 if sys
.platform
== 'win32':
340 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
341 return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
)
342 stream
= open(encodeFilename(filename
), open_mode
)
343 return (stream
, filename
)
344 except (IOError, OSError) as err
:
345 if err
.errno
in (errno
.EACCES
,):
348 # In case of error, try to remove win32 forbidden chars
349 alt_filename
= os
.path
.join(
350 re
.sub(u
'[/<>:"\\|\\\\?\\*]', u
'#', path_part
)
351 for path_part
in os
.path
.split(filename
)
353 if alt_filename
== filename
:
356 # An exception here should be caught in the caller
357 stream
= open(encodeFilename(filename
), open_mode
)
358 return (stream
, alt_filename
)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the POSIX timestamp as an int/float, or None if *timestr*
    cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
369 def sanitize_filename(s
, restricted
=False, is_id
=False):
370 """Sanitizes a string so it could be used as part of a filename.
371 If restricted is set, use a stricter subset of allowed characters.
372 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
374 def replace_insane(char
):
375 if char
== '?' or ord(char
) < 32 or ord(char
) == 127:
378 return '' if restricted
else '\''
380 return '_-' if restricted
else ' -'
381 elif char
in '\\/|*<>':
383 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace()):
385 if restricted
and ord(char
) > 127:
389 result
= u
''.join(map(replace_insane
, s
))
391 while '__' in result
:
392 result
= result
.replace('__', '_')
393 result
= result
.strip('_')
394 # Common case of "Foreign band name - English song title"
395 if restricted
and result
.startswith('-_'):
401 def orderedSet(iterable
):
402 """ Remove all duplicates from the input iterable """
def unescapeHTML(s):
    """Replace HTML entities (named and numeric) in *s* with their characters.

    @param s a unicode string
    """
    # Only unicode strings are supported; bytes would break the regex below.
    assert type(s) == type(u'')

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
418 def encodeFilename(s
):
420 @param s The name of the file
423 assert type(s
) == type(u
'')
425 # Python 3 has a Unicode API
426 if sys
.version_info
>= (3, 0):
429 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
430 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
431 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
432 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
435 encoding
= sys
.getfilesystemencoding()
438 return s
.encode(encoding
, 'ignore')
def decodeOption(optval):
    """Decode a command-line option value to unicode (compat_str).

    None passes through unchanged; bytes are decoded using the system's
    preferred encoding.
    """
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    Uses >= comparisons so that exactly one hour renders as '1:00:00'
    and exactly one minute as '1:00' (a strict '>' comparison rendered
    3600 as '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(opts):
    """Build an HTTPSHandler, honouring opts.no_check_certificate."""
    if sys.version_info < (3, 2):
        # Python's 2.x handler is very simplistic: no certificate checks.
        return compat_urllib_request.HTTPSHandler()
    else:
        # PROTOCOL_SSLv23 negotiates the highest protocol both sides support.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        context.verify_mode = (ssl.CERT_NONE
                               if opts.no_check_certificate
                               else ssl.CERT_REQUIRED)
        return compat_urllib_request.HTTPSHandler(context=context)
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None):
        """ tb, if given, is the original traceback (so that it can be printed out). """
        # Only ask for a bug report when the failure is not one of the
        # expected network/availability error types.
        if not sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            msg = msg + u'; please report this issue on GitHub.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if absent."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Pass msg to the base class too, so str(err) carries the message
        # (previously only self.msg was set and str(err) was empty).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # downloaded/expected are byte counts.
        self.downloaded = downloaded
        self.expected = expected
549 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
550 """Handler for HTTP requests and responses.
552 This class, when installed with an OpenerDirector, automatically adds
553 the standard headers to every HTTP request and handles gzipped and
554 deflated responses from web servers. If compression is to be avoided in
555 a particular request, the original request in the program code only has
556 to include the HTTP header "Youtubedl-No-Compression", which will be
557 removed before making the real request.
559 Part of this code was copied from:
561 http://techknack.net/python-urllib2-handlers/
563 Andrew Rowls, the author of that code, agreed to release it to the
570 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
572 return zlib
.decompress(data
)
575 def addinfourl_wrapper(stream
, headers
, url
, code
):
576 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
577 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
578 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
582 def http_request(self
, req
):
583 for h
,v
in std_headers
.items():
587 if 'Youtubedl-no-compression' in req
.headers
:
588 if 'Accept-encoding' in req
.headers
:
589 del req
.headers
['Accept-encoding']
590 del req
.headers
['Youtubedl-no-compression']
591 if 'Youtubedl-user-agent' in req
.headers
:
592 if 'User-agent' in req
.headers
:
593 del req
.headers
['User-agent']
594 req
.headers
['User-agent'] = req
.headers
['Youtubedl-user-agent']
595 del req
.headers
['Youtubedl-user-agent']
598 def http_response(self
, req
, resp
):
601 if resp
.headers
.get('Content-encoding', '') == 'gzip':
602 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(resp
.read()), mode
='r')
603 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
604 resp
.msg
= old_resp
.msg
606 if resp
.headers
.get('Content-encoding', '') == 'deflate':
607 gz
= io
.BytesIO(self
.deflate(resp
.read()))
608 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
609 resp
.msg
= old_resp
.msg
612 https_request
= http_request
613 https_response
= http_response
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD, or None if
    no known format matches."""
    upload_date = None
    # Commas separate date parts in some formats; normalize them away.
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Not this format; try the next one.
            pass
    return upload_date
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # Rough approximation: months and years are converted to days.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'  # datetime.timedelta keyword arguments are plural
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            # No lower bound: open the range at the earliest representable date.
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            # No upper bound: close the range at the latest representable date.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())