]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
922e17eccfac611a1d90bf83e913383c9afce30d
2 # -*- coding: utf-8 -*-
17 import cStringIO
as StringIO
22 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
23 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
25 'Accept-Encoding': 'gzip, deflate',
26 'Accept-Language': 'en-us,en;q=0.5',
29 def preferredencoding():
30 """Get preferred encoding.
32 Returns the best encoding scheme for the system, based on
33 locale.getpreferredencoding() and some further tweaks.
35 def yield_preferredencoding():
37 pref
= locale
.getpreferredencoding()
43 return yield_preferredencoding().next()
46 def htmlentity_transform(matchobj
):
47 """Transforms an HTML entity to a Unicode character.
49 This function receives a match object and is intended to be used with
50 the re.sub() function.
52 entity
= matchobj
.group(1)
54 # Known non-numeric HTML entity
55 if entity
in htmlentitydefs
.name2codepoint
:
56 return unichr(htmlentitydefs
.name2codepoint
[entity
])
59 mobj
= re
.match(ur
'(?u)#(x?\d+)', entity
)
61 numstr
= mobj
.group(1)
62 if numstr
.startswith(u
'x'):
64 numstr
= u
'0%s' % numstr
67 return unichr(long(numstr
, base
))
69 # Unknown entity in name, return its literal representation
70 return (u
'&%s;' % entity
)
72 HTMLParser
.locatestarttagend
= re
.compile(r
"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re
.VERBOSE
) # backport bugfix
73 class IDParser(HTMLParser
.HTMLParser
):
74 """Modified HTMLParser that isolates a tag with the specified id"""
75 def __init__(self
, id):
81 self
.watch_startpos
= False
83 HTMLParser
.HTMLParser
.__init
__(self
)
85 def error(self
, message
):
86 print >> sys
.stderr
, self
.getpos()
87 if self
.error_count
> 10 or self
.started
:
88 raise HTMLParser
.HTMLParseError(message
, self
.getpos())
89 self
.rawdata
= '\n'.join(self
.html
.split('\n')[self
.getpos()[0]:]) # skip one line
93 def loads(self
, html
):
98 def handle_starttag(self
, tag
, attrs
):
101 self
.find_startpos(None)
102 if 'id' in attrs
and attrs
['id'] == self
.id:
105 self
.watch_startpos
= True
107 if not tag
in self
.depth
: self
.depth
[tag
] = 0
110 def handle_endtag(self
, tag
):
112 if tag
in self
.depth
: self
.depth
[tag
] -= 1
113 if self
.depth
[self
.result
[0]] == 0:
115 self
.result
.append(self
.getpos())
def find_startpos(self, x):
    """Record the position at which the matched element's content starts.

    When ``self.watch_startpos`` is set, the flag is cleared and the
    parser's current position (``self.getpos()``) is appended to
    ``self.result``; ``get_result`` later reads that entry as
    ``self.result[1]``.  When the flag is not set, nothing happens.

    The argument ``x`` is ignored.  It is accepted only so that this
    method can double as the one-argument handler callbacks aliased to
    it below.
    """
    if not self.watch_startpos:
        return
    # Clear the flag before recording, so the position is stored only once.
    self.watch_startpos = False
    self.result.append(self.getpos())

# Every content callback is routed to find_startpos.
handle_entityref = find_startpos
handle_charref = find_startpos
handle_data = find_startpos
handle_comment = find_startpos
handle_decl = find_startpos
handle_pi = find_startpos
unknown_decl = find_startpos
126 def get_result(self
):
127 if self
.result
== None: return None
128 if len(self
.result
) != 3: return None
129 lines
= self
.html
.split('\n')
130 lines
= lines
[self
.result
[1][0]-1:self
.result
[2][0]]
131 lines
[0] = lines
[0][self
.result
[1][1]:]
133 lines
[-1] = lines
[-1][:self
.result
[2][1]-self
.result
[1][1]]
134 lines
[-1] = lines
[-1][:self
.result
[2][1]]
135 return '\n'.join(lines
).strip()
137 def get_element_by_id(id, html
):
138 """Return the content of the tag with the specified id in the passed HTML document"""
139 parser
= IDParser(id)
142 except HTMLParser
.HTMLParseError
:
144 return parser
.get_result()
147 def clean_html(html
):
148 """Clean an HTML snippet into a readable string"""
150 html
= html
.replace('\n', ' ')
151 html
= re
.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html
)
153 html
= re
.sub('<.*?>', '', html
)
154 # Replace html entities
155 html
= unescapeHTML(html
)
159 def sanitize_open(filename
, open_mode
):
160 """Try to open the given filename, and slightly tweak it if this fails.
162 Attempts to open the given filename. If this fails, it tries to change
163 the filename slightly, step by step, until it's either able to open it
164 or it fails and raises a final exception, like the standard open()
167 It returns the tuple (stream, definitive_file_name).
171 if sys
.platform
== 'win32':
173 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
174 return (sys
.stdout
, filename
)
175 stream
= open(encodeFilename(filename
), open_mode
)
176 return (stream
, filename
)
177 except (IOError, OSError), err
:
178 # In case of error, try to remove win32 forbidden chars
179 filename
= re
.sub(ur
'[/<>:"\|\?\*]', u
'#', filename
)
181 # An exception here should be caught in the caller
182 stream
= open(encodeFilename(filename
), open_mode
)
183 return (stream
, filename
)
186 def timeconvert(timestr
):
187 """Convert RFC 2822 defined time string into system timestamp"""
189 timetuple
= email
.utils
.parsedate_tz(timestr
)
190 if timetuple
is not None:
191 timestamp
= email
.utils
.mktime_tz(timetuple
)
194 def sanitize_filename(s
):
195 """Sanitizes a string so it could be used as part of a filename."""
196 def replace_insane(char
):
197 if char
in u
' .\\/|?*<>:"' or ord(char
) < 32:
200 return u
''.join(map(replace_insane
, s
)).strip('_')
202 def orderedSet(iterable
):
203 """ Remove all duplicates from the input iterable """
212 @param s a string (of type unicode)
214 assert type(s
) == type(u
'')
216 result
= re
.sub(ur
'(?u)&(.+?);', htmlentity_transform
, s
)
219 def encodeFilename(s
):
221 @param s The name of the file (of type unicode)
224 assert type(s
) == type(u
'')
226 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
227 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
228 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
229 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
232 return s
.encode(sys
.getfilesystemencoding(), 'ignore')
234 class DownloadError(Exception):
235 """Download Error exception.
237 This exception may be thrown by FileDownloader objects if they are not
238 configured to continue on errors. They will contain the appropriate
244 class SameFileError(Exception):
245 """Same File exception.
247 This exception will be thrown by FileDownloader objects if they detect
248 multiple files would have to be downloaded to the same file on disk.
253 class PostProcessingError(Exception):
254 """Post Processing exception.
256 This exception may be raised by PostProcessor's .run() method to
257 indicate an error in the postprocessing task.
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
266 class UnavailableVideoError(Exception):
267 """Unavailable Format exception.
269 This exception will be thrown when a video is requested
270 in a format that is not available for that video.
275 class ContentTooShortError(Exception):
276 """Content Too Short exception.
278 This exception may be raised by FileDownloader objects when a file they
279 download is too small for what the server announced first, indicating
280 the connection was probably interrupted.
286 def __init__(self
, downloaded
, expected
):
287 self
.downloaded
= downloaded
288 self
.expected
= expected
291 class Trouble(Exception):
292 """Trouble helper exception
294 This is an exception to be handled with
295 FileDownloader.trouble
298 class YoutubeDLHandler(urllib2
.HTTPHandler
):
299 """Handler for HTTP requests and responses.
301 This class, when installed with an OpenerDirector, automatically adds
302 the standard headers to every HTTP request and handles gzipped and
303 deflated responses from web servers. If compression is to be avoided in
304 a particular request, the original request in the program code only has
305 to include the HTTP header "Youtubedl-No-Compression", which will be
306 removed before making the real request.
308 Part of this code was copied from:
310 http://techknack.net/python-urllib2-handlers/
312 Andrew Rowls, the author of that code, agreed to release it to the
319 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
321 return zlib
.decompress(data
)
324 def addinfourl_wrapper(stream
, headers
, url
, code
):
325 if hasattr(urllib2
.addinfourl
, 'getcode'):
326 return urllib2
.addinfourl(stream
, headers
, url
, code
)
327 ret
= urllib2
.addinfourl(stream
, headers
, url
)
331 def http_request(self
, req
):
332 for h
in std_headers
:
335 req
.add_header(h
, std_headers
[h
])
336 if 'Youtubedl-no-compression' in req
.headers
:
337 if 'Accept-encoding' in req
.headers
:
338 del req
.headers
['Accept-encoding']
339 del req
.headers
['Youtubedl-no-compression']
342 def http_response(self
, req
, resp
):
345 if resp
.headers
.get('Content-encoding', '') == 'gzip':
346 gz
= gzip
.GzipFile(fileobj
=StringIO
.StringIO(resp
.read()), mode
='r')
347 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
348 resp
.msg
= old_resp
.msg
350 if resp
.headers
.get('Content-encoding', '') == 'deflate':
351 gz
= StringIO
.StringIO(self
.deflate(resp
.read()))
352 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
353 resp
.msg
= old_resp
.msg