2 # -*- coding: utf-8 -*-
15 import urllib
.request
as compat_urllib_request
16 except ImportError: # Python 2
17 import urllib2
as compat_urllib_request
20 import urllib
.error
as compat_urllib_error
21 except ImportError: # Python 2
22 import urllib2
as compat_urllib_error
25 import urllib
.parse
as compat_urllib_parse
26 except ImportError: # Python 2
27 import urllib
as compat_urllib_parse
30 from urllib
.parse
import urlparse
as compat_urllib_parse_urlparse
31 except ImportError: # Python 2
32 from urlparse
import urlparse
as compat_urllib_parse_urlparse
35 import http
.cookiejar
as compat_cookiejar
36 except ImportError: # Python 2
37 import cookielib
as compat_cookiejar
40 import html
.entities
as compat_html_entities
41 except ImportError: # Python 2
42 import htmlentitydefs
as compat_html_entities
45 import html
.parser
as compat_html_parser
46 except ImportError: # Python 2
47 import HTMLParser
as compat_html_parser
50 import http
.client
as compat_http_client
51 except ImportError: # Python 2
52 import httplib
as compat_http_client
55 from urllib
.parse
import parse_qs
as compat_parse_qs
56 except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in *string* with their decoded characters.

        Backport of Python 3's urllib.parse.unquote.
        NOTE(review): this branch only runs on Python 2 — it relies on
        str.decode('hex'), which does not exist on Python 3.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No percent-escapes at all: nothing to do.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Each split item starts with the two hex digits of one
                # percent-escaped byte.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Backport of Python 3's urllib.parse.parse_qsl; Python 2 only
        (uses the `unicode` builtin).
        """
        # Coerce all results to unicode text on Python 2.
        qs, _coerce_result = qs, unicode
        # Field separators are '&' and (historically) ';'.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' encodes a space in query strings; decode before unquoting.
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r
121 def compat_parse_qs(qs
, keep_blank_values
=False, strict_parsing
=False,
122 encoding
='utf-8', errors
='replace'):
124 pairs
= _parse_qsl(qs
, keep_blank_values
, strict_parsing
,
125 encoding
=encoding
, errors
=errors
)
126 for name
, value
in pairs
:
127 if name
in parsed_result
:
128 parsed_result
[name
].append(value
)
130 parsed_result
[name
] = [value
]
# On Python 2 the native string type is not Unicode; expose single names
# for "text string" and "char from codepoint" that work on both versions.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Default headers added to every HTTP request (see YoutubeDLHandler).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works before trusting it.
        u'TEST'.encode(pref)
    except Exception:
        # FIX: was a bare `except:`, which would also swallow SystemExit
        # and KeyboardInterrupt. Fall back to UTF-8 when the locale is
        # mis-configured or reports an unusable codec.
        pref = 'UTF-8'

    return pref
# Version-dependent print helper for unicode text.
if sys.version_info < (3,0):
    def compat_print(s):
        # Python 2: stdout expects bytes; encode with the locale's
        # preferred encoding, escaping unencodable chars as &#...; refs.
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        # Python 3: print accepts text directly; enforce unicode input.
        assert type(s) == type(u'')
        print(s)
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#160;) or hex (&#xA0;).
    # FIX: the previous pattern u'(?u)#(x?\\d+)' could never match the
    # hex digits a-f, so entities like &#xe9; incorrectly fell through
    # to the literal-representation branch below.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            # '0x...' form so int() parses it as hexadecimal.
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Monkey-patch HTMLParser's start-tag regex with the fixed version from a
# later CPython release so IDParser below behaves consistently across
# interpreter versions.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id"""
    def __init__(self, id):
        # id: value of the id attribute to search for.
        self.id = id
        self.result = None        # becomes [tag, startpos, endpos] when found
        self.started = False      # True while inside the target element
        self.depth = {}           # open-tag depth counter per tag name
        self.html = None          # raw document text, set by loads()
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Tolerate up to 10 parse errors before the target element has
        # been entered; afterwards (or beyond the limit) re-raise.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        # Resume parsing from the line after the offending one.
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        # Keep the raw text so get_result() can slice it by position.
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Any event after the opening tag fixes the content start.
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element ends when its own tag's depth drops to zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every parser event that can occur right after the opening tag marks
    # the start of the element's content.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Return the text between start and end positions, or None when
        # the element was not found or its span is incomplete.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (1-based line, 0-based column) pairs from getpos().
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: end column is relative to
            # the already-trimmed start column.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through and return whatever was isolated
        # before the parse error occurred.
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # FIX: use raw strings for the regex literals — '\s' inside a plain
    # string literal is an invalid escape sequence (DeprecationWarning
    # since Python 3.6, SyntaxWarning since 3.12).
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        # u'-' is the conventional name for standard output.
        if filename == u'-':
            if sys.platform == 'win32':
                # Switch stdout to binary mode so downloaded bytes are
                # not mangled by newline translation on Windows.
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz returns None for strings it cannot parse.
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _clean_char(char):
        # Drop '?' and control characters outright.
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    result = u''.join(_clean_char(c) for c in s)
    if is_id:
        # IDs are kept as-is apart from character replacement.
        return result
    # Collapse runs of underscores introduced by the replacements above.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if not result:
        result = '_'
    return result
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Keep the first occurrence of each element, preserving order.
    # A list (not a set) is used so unhashable elements also work.
    seen = []
    for item in iterable:
        if item not in seen:
            seen.append(item)
    return seen
def unescapeHTML(s):
    """Replace all HTML entities in *s* with their characters.

    @param s a string
    """
    # Input must be a unicode text string.
    assert type(s) == type(u'')

    # Every &name; / &#nnn; entity is handled by htmlentity_transform.
    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """Encode a unicode filename for the current platform/interpreter.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    if sys.version_info >= (3, 0):
        # Python 3 has a Unicode API
        return s

    windows_unicode_api = (sys.platform == 'win32'
                           and sys.getwindowsversion()[0] >= 5)
    if windows_unicode_api:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    downloaded = None  # bytes actually received
    expected = None    # bytes announced by the server

    def __init__(self, downloaded, expected):
        Exception.__init__(self)
        self.downloaded = downloaded
        self.expected = expected
class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Servers disagree on whether "deflate" means raw deflate or
        # zlib-wrapped deflate; try raw first, then fall back.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older urllib2 versions have no 'code' constructor argument on
        # addinfourl; set the attribute after construction instead.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Install the standard headers, replacing any caller-set value
        # with the same (differently-cased) name.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # The marker header opts this one request out of compression and
        # must not leak to the server.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip: wrap the body in a GzipFile that decompresses lazily.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate: decompress eagerly via the tolerant helper above.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS requests/responses get the same treatment.
    https_request = http_request
    https_response = http_response