]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/utils.py
   2 # -*- coding: utf-8 -*- 
  17         import cStringIO 
as StringIO
 
  22         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1', 
  23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 
  24         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
  25         'Accept-Encoding': 'gzip, deflate', 
  26         'Accept-Language': 'en-us,en;q=0.5', 
  29 def preferredencoding(): 
  30         """Get preferred encoding. 
  32         Returns the best encoding scheme for the system, based on 
  33         locale.getpreferredencoding() and some further tweaks. 
  35         def yield_preferredencoding(): 
  37                         pref 
= locale
.getpreferredencoding() 
  43         return yield_preferredencoding().next() 
  46 def htmlentity_transform(matchobj
): 
  47         """Transforms an HTML entity to a Unicode character. 
  49         This function receives a match object and is intended to be used with 
  50         the re.sub() function. 
  52         entity 
= matchobj
.group(1) 
  54         # Known non-numeric HTML entity 
  55         if entity 
in htmlentitydefs
.name2codepoint
: 
  56                 return unichr(htmlentitydefs
.name2codepoint
[entity
]) 
  59         mobj 
= re
.match(ur
'(?u)#(x?\d+)', entity
) 
  61                 numstr 
= mobj
.group(1) 
  62                 if numstr
.startswith(u
'x'): 
  64                         numstr 
= u
'0%s' % numstr
 
  67                 return unichr(long(numstr
, base
)) 
  69         # Unknown entity in name, return its literal representation 
  70         return (u
'&%s;' % entity
) 
  72 HTMLParser
.locatestarttagend 
= re
.compile(r
"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re
.VERBOSE
) # backport bugfix 
  73 class IDParser(HTMLParser
.HTMLParser
): 
  74         """Modified HTMLParser that isolates a tag with the specified id""" 
  75         def __init__(self
, id): 
  81                 self
.watch_startpos 
= False 
  83                 HTMLParser
.HTMLParser
.__init
__(self
) 
  85         def error(self
, message
): 
  86                 print >> sys
.stderr
, self
.getpos() 
  87                 if self
.error_count 
> 10 or self
.started
: 
  88                         raise HTMLParser
.HTMLParseError(message
, self
.getpos()) 
  89                 self
.rawdata 
= '\n'.join(self
.html
.split('\n')[self
.getpos()[0]:]) # skip one line 
  93         def loads(self
, html
): 
  98         def handle_starttag(self
, tag
, attrs
): 
 101                         self
.find_startpos(None) 
 102                 if 'id' in attrs 
and attrs
['id'] == self
.id: 
 105                         self
.watch_startpos 
= True 
 107                         if not tag 
in self
.depth
: self
.depth
[tag
] = 0 
 110         def handle_endtag(self
, tag
): 
 112                         if tag 
in self
.depth
: self
.depth
[tag
] -= 1 
 113                         if self
.depth
[self
.result
[0]] == 0: 
 115                                 self
.result
.append(self
.getpos()) 
 117         def find_startpos(self
, x
): 
 118                 """Needed to put the start position of the result (self.result[1]) 
 119                 after the opening tag with the requested id""" 
 120                 if self
.watch_startpos
: 
 121                         self
.watch_startpos 
= False 
 122                         self
.result
.append(self
.getpos()) 
 123         handle_entityref 
= handle_charref 
= handle_data 
= handle_comment 
= \
 
 124         handle_decl 
= handle_pi 
= unknown_decl 
= find_startpos
 
 126         def get_result(self
): 
 127                 if self
.result 
== None: return None 
 128                 if len(self
.result
) != 3: return None 
 129                 lines 
= self
.html
.split('\n') 
 130                 lines 
= lines
[self
.result
[1][0]-1:self
.result
[2][0]] 
 131                 lines
[0] = lines
[0][self
.result
[1][1]:] 
 133                         lines
[-1] = lines
[-1][:self
.result
[2][1]-self
.result
[1][1]] 
 134                 lines
[-1] = lines
[-1][:self
.result
[2][1]] 
 135                 return '\n'.join(lines
).strip() 
 137 def get_element_by_id(id, html
): 
 138         """Return the content of the tag with the specified id in the passed HTML document""" 
 139         parser 
= IDParser(id) 
 142         except HTMLParser
.HTMLParseError
: 
 144         return parser
.get_result() 
 147 def clean_html(html
): 
 148         """Clean an HTML snippet into a readable string""" 
 150         html 
= html
.replace('\n', ' ') 
 151         html 
= re
.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html
) 
 153         html 
= re
.sub('<.*?>', '', html
) 
 154         # Replace html entities 
 155         html 
= unescapeHTML(html
) 
 159 def sanitize_open(filename
, open_mode
): 
 160         """Try to open the given filename, and slightly tweak it if this fails. 
 162         Attempts to open the given filename. If this fails, it tries to change 
 163         the filename slightly, step by step, until it's either able to open it 
 164         or it fails and raises a final exception, like the standard open() 
 167         It returns the tuple (stream, definitive_file_name). 
 171                         if sys
.platform 
== 'win32': 
 173                                 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
) 
 174                         return (sys
.stdout
, filename
) 
 175                 stream 
= open(encodeFilename(filename
), open_mode
) 
 176                 return (stream
, filename
) 
 177         except (IOError, OSError), err
: 
 178                 # In case of error, try to remove win32 forbidden chars 
 179                 filename 
= re
.sub(ur
'[/<>:"\|\?\*]', u
'#', filename
) 
 181                 # An exception here should be caught in the caller 
 182                 stream 
= open(encodeFilename(filename
), open_mode
) 
 183                 return (stream
, filename
) 
 186 def timeconvert(timestr
): 
 187         """Convert RFC 2822 defined time string into system timestamp""" 
 189         timetuple 
= email
.utils
.parsedate_tz(timestr
) 
 190         if timetuple 
is not None: 
 191                 timestamp 
= email
.utils
.mktime_tz(timetuple
) 
 194 def sanitize_filename(s
): 
 195         """Sanitizes a string so it could be used as part of a filename.""" 
 196         def replace_insane(char
): 
 197                 if char 
in u
' .\\/|?*<>:"' or ord(char
) < 32: 
 200         return u
''.join(map(replace_insane
, s
)).strip('_') 
 202 def orderedSet(iterable
): 
 203         """ Remove all duplicates from the input iterable """ 
 212         @param s a string (of type unicode) 
 214         assert type(s
) == type(u
'') 
 216         result 
= re
.sub(ur
'(?u)&(.+?);', htmlentity_transform
, s
) 
 219 def encodeFilename(s
): 
 221         @param s The name of the file (of type unicode) 
 224         assert type(s
) == type(u
'') 
 226         if sys
.platform 
== 'win32' and sys
.getwindowsversion()[0] >= 5: 
 227                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up 
 228                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would 
 229                 # match Windows 9x series as well. Besides, NT 4 is obsolete.) 
 232                 return s
.encode(sys
.getfilesystemencoding(), 'ignore') 
 234 class DownloadError(Exception): 
 235         """Download Error exception. 
 237         This exception may be thrown by FileDownloader objects if they are not 
 238         configured to continue on errors. They will contain the appropriate 
 244 class SameFileError(Exception): 
 245         """Same File exception. 
 247         This exception will be thrown by FileDownloader objects if they detect 
 248         multiple files would have to be downloaded to the same file on disk. 
 253 class PostProcessingError(Exception): 
 254         """Post Processing exception. 
 256         This exception may be raised by PostProcessor's .run() method to 
 257         indicate an error in the postprocessing task. 
 261 class MaxDownloadsReached(Exception): 
 262         """ --max-downloads limit has been reached. """ 
 266 class UnavailableVideoError(Exception): 
 267         """Unavailable Format exception. 
 269         This exception will be thrown when a video is requested 
 270         in a format that is not available for that video. 
 275 class ContentTooShortError(Exception): 
 276         """Content Too Short exception. 
 278         This exception may be raised by FileDownloader objects when a file they 
 279         download is too small for what the server announced first, indicating 
 280         the connection was probably interrupted. 
 286         def __init__(self
, downloaded
, expected
): 
 287                 self
.downloaded 
= downloaded
 
 288                 self
.expected 
= expected
 
 291 class Trouble(Exception): 
 292         """Trouble helper exception 
 294         This is an exception to be handled with 
 295         FileDownloader.trouble 
 298 class YoutubeDLHandler(urllib2
.HTTPHandler
): 
 299         """Handler for HTTP requests and responses. 
 301         This class, when installed with an OpenerDirector, automatically adds 
 302         the standard headers to every HTTP request and handles gzipped and 
 303         deflated responses from web servers. If compression is to be avoided in 
 304         a particular request, the original request in the program code only has 
 305         to include the HTTP header "Youtubedl-No-Compression", which will be 
 306         removed before making the real request. 
 308         Part of this code was copied from: 
 310         http://techknack.net/python-urllib2-handlers/ 
 312         Andrew Rowls, the author of that code, agreed to release it to the 
 319                         return zlib
.decompress(data
, -zlib
.MAX_WBITS
) 
 321                         return zlib
.decompress(data
) 
 324         def addinfourl_wrapper(stream
, headers
, url
, code
): 
 325                 if hasattr(urllib2
.addinfourl
, 'getcode'): 
 326                         return urllib2
.addinfourl(stream
, headers
, url
, code
) 
 327                 ret 
= urllib2
.addinfourl(stream
, headers
, url
) 
 331         def http_request(self
, req
): 
 332                 for h 
in std_headers
: 
 335                         req
.add_header(h
, std_headers
[h
]) 
 336                 if 'Youtubedl-no-compression' in req
.headers
: 
 337                         if 'Accept-encoding' in req
.headers
: 
 338                                 del req
.headers
['Accept-encoding'] 
 339                         del req
.headers
['Youtubedl-no-compression'] 
 342         def http_response(self
, req
, resp
): 
 345                 if resp
.headers
.get('Content-encoding', '') == 'gzip': 
 346                         gz 
= gzip
.GzipFile(fileobj
=StringIO
.StringIO(resp
.read()), mode
='r') 
 347                         resp 
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
) 
 348                         resp
.msg 
= old_resp
.msg
 
 350                 if resp
.headers
.get('Content-encoding', '') == 'deflate': 
 351                         gz 
= StringIO
.StringIO(self
.deflate(resp
.read())) 
 352                         resp 
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
) 
 353                         resp
.msg 
= old_resp
.msg