# -*- coding: utf-8 -*-
try:
    import urllib.request as compat_urllib_request
except ImportError: # Python 2
    import urllib2 as compat_urllib_request
try:
    import urllib.error as compat_urllib_error
except ImportError: # Python 2
    import urllib2 as compat_urllib_error
try:
    import urllib.parse as compat_urllib_parse
except ImportError: # Python 2
    import urllib as compat_urllib_parse
try:
    from urllib.parse import urlparse as compat_urllib_parse_urlparse
except ImportError: # Python 2
    from urlparse import urlparse as compat_urllib_parse_urlparse
try:
    import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
    import cookielib as compat_cookiejar
try:
    import html.entities as compat_html_entities
except ImportError: # Python 2
    import htmlentitydefs as compat_html_entities
try:
    import html.parser as compat_html_parser
except ImportError: # Python 2
    import HTMLParser as compat_html_parser
try:
    import http.client as compat_http_client
except ImportError: # Python 2
    import httplib as compat_http_client
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
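# Illustrative usage sketch (not part of the original module): compat_parse_qs
# behaves like urllib.parse.parse_qs, mapping each key to a list of values.
# The query string below is an assumption for demonstration only.
def _example_compat_parse_qs():
    parsed = compat_parse_qs('v=dQw4w9WgXcQ&t=43s&t=1m')
    # Repeated keys accumulate, so parsed['t'] should be ['43s', '1m'].
    return parsed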
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
def compat_ord(c):
    if type(c) is int: return c
    else: return ord(c)
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
if sys.version_info < (3,0):
    def compat_print(s):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        assert type(s) == type(u'')
        print(s)
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
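# Illustrative usage sketch (not part of the original module): the file name
# and dictionary below are assumptions; obj can be anything json.dump accepts.
def _example_write_json_file():
    write_json_file({'id': u'abc123', 'title': u'Test video'}, 'info.json')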
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(u'(?u)#(x?\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None
        self.started = False
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)
    def error(self, message):
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)
    def loads(self, html):
        self.html = html
        self.feed(html)
        self.close()
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1
    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())
    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos
    def get_result(self):
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
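# Illustrative usage sketch (not part of the original module): turning an HTML
# description into readable plain text. The sample snippet is an assumption.
def _example_clean_html():
    snippet = u'<p>First line<br/>Second line &amp; more</p>'
    # Tags are stripped, <br/> becomes a newline and &amp; is unescaped,
    # so this should yield u'First line\nSecond line & more'.
    return clean_html(snippet)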
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        # (arguments unpacked so os.path.join sees each path part)
        alt_filename = os.path.join(*(
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
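# Illustrative usage sketch (not part of the original module): opening an
# output file whose name may contain characters the filesystem rejects.
# The file name is an assumption; calling this actually creates a file.
def _example_sanitize_open():
    stream, real_name = sanitize_open(u'clip: part 1.mp4', 'wb')
    stream.close()
    # On filesystems that reject ':' the returned name may differ from the input.
    return real_name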
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
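# Illustrative usage sketch (not part of the original module): converting an
# RFC 2822 date string (e.g. from a Last-Modified header) to a Unix timestamp.
def _example_timeconvert():
    # Returns a numeric timestamp, or None if the string cannot be parsed.
    return timeconvert(u'Wed, 02 Oct 2002 13:00:00 GMT')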
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
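# Illustrative usage sketch (not part of the original module): the outputs
# noted below follow from the replacement rules above but are assumptions,
# not documented guarantees.
def _example_sanitize_filename():
    # '/' and ':' are never allowed; restricted mode also replaces spaces.
    plain = sanitize_filename(u'AC/DC: Back in Black')                    # u'AC_DC - Back in Black'
    strict = sanitize_filename(u'AC/DC: Back in Black', restricted=True)  # u'AC_DC_-_Back_in_Black'
    return plain, strict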
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def unescapeHTML(s):
    """
    @param s a string
    """
    assert type(s) == type(u'')

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
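# Illustrative usage sketch (not part of the original module): both named and
# numeric entities are resolved through htmlentity_transform above.
def _example_unescapeHTML():
    # Expected to return u'Tom & Jerry "live"' (named and decimal entities).
    return unescapeHTML(u'Tom &amp; Jerry &#34;live&#34;')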
def encodeFilename(s):
    """
    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        encoding = sys.getfilesystemencoding()
        if encoding is None:
            encoding = 'utf-8'
        return s.encode(encoding, 'ignore')
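# Illustrative usage sketch (not part of the original module): on Python 3 the
# name passes through unchanged; on Python 2 it is encoded for the filesystem.
def _example_encodeFilename():
    return encodeFilename(u'café.mp4')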
def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
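# Illustrative usage sketch (not part of the original module): durations are
# rendered with only as many fields as needed.
def _example_formatSeconds():
    # 45 -> '45', 130 -> '2:10', 3725 -> '1:02:05'
    return [formatSeconds(t) for t in (45, 130, 3725)]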
def make_HTTPS_handler(opts):
    if sys.version_info < (3,2):
        # Python's 2.x handler is very simplistic
        return compat_urllib_request.HTTPSHandler()
    else:
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        context.verify_mode = (ssl.CERT_NONE
                               if opts.no_check_certificate
                               else ssl.CERT_REQUIRED)
        return compat_urllib_request.HTTPSHandler(context=context)
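# Illustrative usage sketch (not part of the original module): the handler is
# meant to be installed into an OpenerDirector. The opts object only needs a
# no_check_certificate attribute; the namedtuple here is an assumption used
# purely for demonstration.
def _example_make_HTTPS_handler():
    import collections
    FakeOpts = collections.namedtuple('FakeOpts', ['no_check_certificate'])
    https_handler = make_HTTPS_handler(FakeOpts(no_check_certificate=False))
    return compat_urllib_request.build_opener(https_handler)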
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None):
        """ tb, if given, is the original traceback (so that it can be printed out). """
        super(ExtractorError, self).__init__(msg)
        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception

    def format_traceback(self):
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        self.msg = msg
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """
    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)
    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret
    def http_request(self, req):
        for h, v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
    https_request = http_request
    https_response = http_response
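# Illustrative usage sketch (not part of the original module): installing the
# handler and opting a single request out of compression via the
# "Youtubedl-No-Compression" header described in the class docstring. The URL
# is an assumption; the request is built but not actually opened here.
def _example_YoutubeDLHandler():
    opener = compat_urllib_request.build_opener(YoutubeDLHandler())
    req = compat_urllib_request.Request(
        'http://example.com/video.mp4',
        headers={'Youtubedl-No-Compression': 'True'})
    return opener, req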
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    return upload_date
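# Illustrative usage sketch (not part of the original module): several common
# date spellings normalize to the same YYYYMMDD string.
def _example_unified_strdate():
    # Both inputs are expected to yield '20121225' ('%d %B %Y' and '%Y-%m-%d').
    return unified_strdate(u'25 December 2012'), unified_strdate(u'2012-12-25')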
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # Months and years are approximated in days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s", the start date must be before the end date' % self)
    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)
    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end
    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
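# Illustrative usage sketch (not part of the original module): DateRange
# supports membership tests with either date objects or YYYYMMDD strings.
def _example_DateRange():
    january_2013 = DateRange('20130101', '20130131')
    # '20130115' in january_2013 is expected to be True.
    return '20130115' in january_2013, DateRange.day('20130101')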