# -*- coding: utf-8 -*-

import datetime
import email.utils
import errno
import gzip
import io
import json
import locale
import os
import re
import sys
import traceback
import zlib

try:
    import urllib.request as compat_urllib_request
except ImportError: # Python 2
    import urllib2 as compat_urllib_request

try:
    import urllib.error as compat_urllib_error
except ImportError: # Python 2
    import urllib2 as compat_urllib_error

try:
    import urllib.parse as compat_urllib_parse
except ImportError: # Python 2
    import urllib as compat_urllib_parse

try:
    from urllib.parse import urlparse as compat_urllib_parse_urlparse
except ImportError: # Python 2
    from urlparse import urlparse as compat_urllib_parse_urlparse

try:
    import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
    import cookielib as compat_cookiejar

try:
    import html.entities as compat_html_entities
except ImportError: # Python 2
    import htmlentitydefs as compat_html_entities

try:
    import html.parser as compat_html_parser
except ImportError: # Python 2
    import HTMLParser as compat_html_parser

try:
    import http.client as compat_http_client
except ImportError: # Python 2
    import httplib as compat_http_client

try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')

try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result

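# Illustrative usage (not part of the original module): on either Python
# version compat_parse_qs should behave like the stdlib parse_qs, e.g.
#   compat_parse_qs('a=1&a=2&b=')                  -> {'a': ['1', '2']}  (empty 'b' dropped)
#   compat_parse_qs('b=', keep_blank_values=True)  -> {'b': ['']}
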
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        u'TEST'.encode(pref)
    except:
        pref = 'UTF-8'

    return pref

if sys.version_info < (3,0):
    def compat_print(s):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        assert type(s) == type(u'')
        print(s)

# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)

def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(u'(?u)#(x?\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)

compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix

class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None
        self.started = False
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()

# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))

def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute("id", id, html)

def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass
    return parser.get_result()

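# Illustrative usage (not part of the original module): given the AttrParser
# sketch above, something like
#   get_element_by_id('main', u'<div id="main">Hello <b>world</b></div>')
# would be expected to return the inner HTML, u'Hello <b>world</b>'.
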
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

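# Illustrative usage (not part of the original module):
#   clean_html(u'<p>First</p><p>Second<br/>line &amp; more</p>')
# turns <br/> and </p><p> into newlines, strips the remaining tags and
# unescapes entities, yielding roughly u'First\nSecond\nline & more'.
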
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)

def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = u'_'
    return result

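# Illustrative usage (not part of the original module): with restricted=True,
# slashes, colons and spaces are mapped to safe characters, e.g.
#   sanitize_filename(u'AC/DC: Back in Black', restricted=True)
# would be expected to yield u'AC_DC_-_Back_in_Black'.
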
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res

def unescapeHTML(s):
    """
    @param s a string
    """
    assert type(s) == type(u'')

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result

def encodeFilename(s):
    """
    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        encoding = sys.getfilesystemencoding()
        if encoding is None:
            encoding = 'utf-8'
        return s.encode(encoding, 'ignore')

def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval

def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs

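# Illustrative usage (not part of the original module; the > 3600 and > 60
# thresholds above are reconstructed):
#   formatSeconds(3725) -> '1:02:05', formatSeconds(125) -> '2:05',
#   formatSeconds(42)   -> '42'
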
def make_HTTPS_handler(opts):
    if sys.version_info < (3,2):
        # Python's 2.x handler is very simplistic
        return compat_urllib_request.HTTPSHandler()
    else:
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        context.verify_mode = (ssl.CERT_NONE
                               if opts.no_check_certificate
                               else ssl.CERT_REQUIRED)
        return compat_urllib_request.HTTPSHandler(context=context)

class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None):
        """ tb, if given, is the original traceback (so that it can be printed out). """
        super(ExtractorError, self).__init__(msg)
        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception

    def format_traceback(self):
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))

class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected

class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h, v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response

def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except:
            pass
    return upload_date

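# Illustrative usage (not part of the original module):
#   unified_strdate(u'December 21, 2012')  -> '20121221'  (via '%B %d %Y')
#   unified_strdate(u'2012-12-21')         -> '20121221'  (via '%Y-%m-%d')
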
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A rough approximation for months and years
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()

class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)
    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end
    def __str__(self):
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
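
# Illustrative usage (not part of the original module): restrict to one year
#   year_2012 = DateRange('20120101', '20121231')
#   '20120505' in year_2012    -> True
#   DateRange.day('20120505')  -> a range containing only that day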