Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
Imported Upstream version 2012.02.27+gita171dbf
[youtubedl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import gzip
5 import htmlentitydefs
6 import HTMLParser
7 import locale
8 import os
9 import re
10 import sys
11 import zlib
12 import urllib2
13 import email.utils
14 import json
15
16 try:
17 import cStringIO as StringIO
18 except ImportError:
19 import StringIO
20
# Default HTTP request headers, keyed by header name.
# Within this file they are applied to every request by
# YoutubeDLHandler.http_request().
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # Compressed responses are decoded by YoutubeDLHandler.http_response()
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
28
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    reports an encoding that cannot actually be used to encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here
        u'TEST'.encode(pref)
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit are deliberately allowed to propagate.
        pref = 'UTF-8'
    return pref
44
45
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Returns the decoded character for known named entities and for valid
    decimal ('#233') or hexadecimal ('#xE9') character references; any
    other entity is returned unchanged in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. Hexadecimal digits must be accepted
    # after the 'x' prefix (a decimal-only pattern would stop at the first
    # a-f digit), and the whole entity has to match, not just a prefix.
    mobj = re.match(r'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hex_digits, dec_digits = mobj.groups()
        if hex_digits is not None:
            codepoint = int(hex_digits, 16)
        else:
            codepoint = int(dec_digits, 10)
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Code point outside the range the interpreter can represent:
            # fall through and keep the entity as written.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
71
# Replace HTMLParser's module-level start-tag regex with the pattern below
# (the original author marks this as a backported bug fix).
HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(HTMLParser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id

    Feed a document with loads(), then call get_result() to obtain the
    text found between the opening tag carrying the requested id and its
    matching closing tag.
    """
    def __init__(self, id):
        # Value of the "id" attribute to look for
        self.id = id
        # None until the tag is found; then grows to
        # [tag name, start position, end position] (positions from getpos())
        self.result = None
        # True while the parser is inside the matched element
        self.started = False
        # Number of currently open tags per tag name, counted only while started
        self.depth = {}
        # The complete document, kept for error recovery and get_result()
        self.html = None
        # True between seeing the matched opening tag and recording the
        # position of the first thing that follows it
        self.watch_startpos = False
        # Number of parse errors recovered from so far
        self.error_count = 0
        HTMLParser.HTMLParser.__init__(self)

    def error(self, message):
        """Report a parse error and try to resume parsing after it.

        Raises HTMLParser.HTMLParseError once more than 10 errors have been
        recovered from, or if the error happens inside the matched element.
        """
        print >> sys.stderr, self.getpos()
        if self.error_count > 10 or self.started:
            raise HTMLParser.HTMLParseError(message, self.getpos())
        # Restart from the lines following the current position
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document given in html."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag may be the first content after the matched opening tag
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete once every tag with the same name as
            # the matched one has been closed again
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser callback reuses find_startpos, so whichever kind of
    # content comes first after the matched opening tag records the start.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text of the matched element.

        Returns None when no tag was matched or when the start and end
        positions were not both recorded.
        """
        if self.result == None: return None
        if len(self.result) != 3: return None
        lines = self.html.split('\n')
        # Keep the lines from the start position up to the end position
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        # Drop everything before the start column on the first line
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end column has to be
            # shifted by the characters already removed from the front
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
136
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document

    Returns None when IDParser.get_result() has nothing to report.
    """
    finder = IDParser(id)
    try:
        finder.loads(html)
    except HTMLParser.HTMLParseError:
        # Best effort: use whatever the parser captured before the error
        pass
    return finder.get_result()
145
146
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Existing newlines become spaces, <br> tags become newlines, all other
    tags are removed and HTML entities are decoded with unescapeHTML().
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw strings keep the regex escapes (\s) out of Python's own
    # string-escape processing
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
157
158
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    The name u'-' selects standard output. Any other name is opened as
    given; if that fails, the characters forbidden on win32 are replaced
    with '#' and the open is retried once. A failure of the retry is left
    for the caller to handle, like with the standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename != u'-':
            return (open(encodeFilename(filename), open_mode), filename)
        if sys.platform == 'win32':
            # Put stdout into binary mode before handing it out
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        return (open(encodeFilename(filename), open_mode), filename)
184
185
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
193
def sanitize_filename(s):
    """Sanitizes a string so it could be used as part of a filename.

    Spaces, dots, path separators, the characters | ? * < > : " and all
    control characters (code below 32) become underscores; leading and
    trailing underscores are then removed.
    """
    forbidden = u' .\\/|?*<>:"'
    pieces = []
    for char in s:
        if char in forbidden or ord(char) < 32:
            pieces.append('_')
        else:
            pieces.append(char)
    return u''.join(pieces).strip('_')
201
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    order of appearance. Elements are compared with ==, so they do not
    need to be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
209
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in s using htmlentity_transform().

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
218
def encodeFilename(s):
    """
    Prepare a file name for use with the file system functions.

    @param s The name of the file (of type unicode)

    Returns s unchanged on Windows 2000 and up, otherwise s encoded with
    the file system encoding; characters that cannot be encoded are dropped.
    """

    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        # getfilesystemencoding() may return None; encode() would then
        # raise TypeError, so fall back to a fixed encoding.
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
233
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. The exception carries the
    appropriate error message.
    """
242
243
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
251
252
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to
    indicate an error in the postprocessing task.
    """
260
class MaxDownloadsReached(Exception):
    """Raised when the --max-downloads limit has been reached."""
264
265
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not
    available for that video.
    """
273
274
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    Attributes:
        downloaded -- number of bytes actually received
        expected   -- number of bytes announced by the server
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Initialise the base class so that .args (and therefore repr())
        # reflects the two byte counts instead of being empty.
        Exception.__init__(self, downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected

    def __str__(self):
        return 'content too short (downloaded %s bytes, expected %s bytes)' % (
            self.downloaded, self.expected)
289
290
class Trouble(Exception):
    """Trouble helper exception

    Exceptions of this type are meant to be handled with
    FileDownloader.trouble
    """
297
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class puts the standard
    headers on every HTTP request and transparently decodes gzipped and
    deflated responses from web servers. To avoid compression for one
    particular request, the calling code only has to add the HTTP header
    "Youtubedl-No-Compression" to it; that header is removed again before
    the real request is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded data."""
        # Try a raw deflate stream first, then a zlib-wrapped one
        try:
            inflated = zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            inflated = zlib.decompress(data)
        return inflated

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response object that carries a status code."""
        if not hasattr(urllib2.addinfourl, 'getcode'):
            # This addinfourl takes no code argument: attach it afterwards
            wrapped = urllib2.addinfourl(stream, headers, url)
            wrapped.code = code
            return wrapped
        return urllib2.addinfourl(stream, headers, url, code)

    def http_request(self, req):
        """Apply the standard headers and honour the no-compression marker."""
        for name in std_headers:
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, std_headers[name])
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return resp, re-wrapped around decoded content if it was compressed."""
        original = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            unzipped = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(unzipped, original.headers, original.url, original.code)
            resp.msg = original.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp
352 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
353 resp.msg = old_resp.msg
354 return resp