2 # -*- coding: utf-8 -*- 
  15     import urllib
.request 
as compat_urllib_request
 
  16 except ImportError: # Python 2 
  17     import urllib2 
as compat_urllib_request
 
  20     import urllib
.error 
as compat_urllib_error
 
  21 except ImportError: # Python 2 
  22     import urllib2 
as compat_urllib_error
 
  25     import urllib
.parse 
as compat_urllib_parse
 
  26 except ImportError: # Python 2 
  27     import urllib 
as compat_urllib_parse
 
  30     from urllib
.parse 
import urlparse 
as compat_urllib_parse_urlparse
 
  31 except ImportError: # Python 2 
  32     from urlparse 
import urlparse 
as compat_urllib_parse_urlparse
 
  35     import http
.cookiejar 
as compat_cookiejar
 
  36 except ImportError: # Python 2 
  37     import cookielib 
as compat_cookiejar
 
  40     import html
.entities 
as compat_html_entities
 
  41 except ImportError: # Python 2 
  42     import htmlentitydefs 
as compat_html_entities
 
  45     import html
.parser 
as compat_html_parser
 
  46 except ImportError: # Python 2 
  47     import HTMLParser 
as compat_html_parser
 
  50     import http
.client 
as compat_http_client
 
  51 except ImportError: # Python 2 
  52     import httplib 
as compat_http_client
 
  55     from urllib
.parse 
import parse_qs 
as compat_parse_qs
 
  56 except ImportError: # Python 2 
  57     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. 
  58     # Python 2's version is apparently totally broken 
  59     def _unquote(string
, encoding
='utf-8', errors
='replace'): 
  62         res 
= string
.split('%') 
  69         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded 
  76                 pct_sequence 
+= item
[:2].decode('hex') 
  79                     # This segment was just a single percent-encoded character. 
  80                     # May be part of a sequence of code units, so delay decoding. 
  81                     # (Stored in pct_sequence). 
  85             # Encountered non-percent-encoded characters. Flush the current 
  87             string 
+= pct_sequence
.decode(encoding
, errors
) + rest
 
  90             # Flush the final pct_sequence 
  91             string 
+= pct_sequence
.decode(encoding
, errors
) 
  94     def _parse_qsl(qs
, keep_blank_values
=False, strict_parsing
=False, 
  95                 encoding
='utf-8', errors
='replace'): 
  96         qs
, _coerce_result 
= qs
, unicode 
  97         pairs 
= [s2 
for s1 
in qs
.split('&') for s2 
in s1
.split(';')] 
  99         for name_value 
in pairs
: 
 100             if not name_value 
and not strict_parsing
: 
 102             nv 
= name_value
.split('=', 1) 
 105                     raise ValueError("bad query field: %r" % (name_value
,)) 
 106                 # Handle case of a control-name with no equal sign 
 107                 if keep_blank_values
: 
 111             if len(nv
[1]) or keep_blank_values
: 
 112                 name 
= nv
[0].replace('+', ' ') 
 113                 name 
= _unquote(name
, encoding
=encoding
, errors
=errors
) 
 114                 name 
= _coerce_result(name
) 
 115                 value 
= nv
[1].replace('+', ' ') 
 116                 value 
= _unquote(value
, encoding
=encoding
, errors
=errors
) 
 117                 value 
= _coerce_result(value
) 
 118                 r
.append((name
, value
)) 
 121     def compat_parse_qs(qs
, keep_blank_values
=False, strict_parsing
=False, 
 122                 encoding
='utf-8', errors
='replace'): 
 124         pairs 
= _parse_qsl(qs
, keep_blank_values
, strict_parsing
, 
 125                         encoding
=encoding
, errors
=errors
) 
 126         for name
, value 
in pairs
: 
 127             if name 
in parsed_result
: 
 128                 parsed_result
[name
].append(value
) 
 130                 parsed_result
[name
] = [value
] 
 134     compat_str 
= unicode # Python 2 
 139     compat_chr 
= unichr # Python 2 
 144     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', 
 145     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 
 146     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
 147     'Accept-Encoding': 'gzip, deflate', 
 148     'Accept-Language': 'en-us,en;q=0.5', 
 150 def preferredencoding(): 
 151     """Get preferred encoding. 
 153     Returns the best encoding scheme for the system, based on 
 154     locale.getpreferredencoding() and some further tweaks. 
 157         pref 
= locale
.getpreferredencoding() 
 164 if sys
.version_info 
< (3,0): 
 166         print(s
.encode(preferredencoding(), 'xmlcharrefreplace')) 
 169         assert type(s
) == type(u
'') 
 172 def htmlentity_transform(matchobj
): 
 173     """Transforms an HTML entity to a character. 
 175     This function receives a match object and is intended to be used with 
 176     the re.sub() function. 
 178     entity 
= matchobj
.group(1) 
 180     # Known non-numeric HTML entity 
 181     if entity 
in compat_html_entities
.name2codepoint
: 
 182         return compat_chr(compat_html_entities
.name2codepoint
[entity
]) 
 184     mobj 
= re
.match(u
'(?u)#(x?\\d+)', entity
) 
 186         numstr 
= mobj
.group(1) 
 187         if numstr
.startswith(u
'x'): 
 189             numstr 
= u
'0%s' % numstr
 
 192         return compat_chr(int(numstr
, base
)) 
 194     # Unknown entity in name, return its literal representation 
 195     return (u
'&%s;' % entity
) 
 197 compat_html_parser
.locatestarttagend 
= re
.compile(r
"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re
.VERBOSE
) # backport bugfix 
 198 class IDParser(compat_html_parser
.HTMLParser
): 
 199     """Modified HTMLParser that isolates a tag with the specified id""" 
 200     def __init__(self
, id): 
 206         self
.watch_startpos 
= False 
 208         compat_html_parser
.HTMLParser
.__init
__(self
) 
 210     def error(self
, message
): 
 211         if self
.error_count 
> 10 or self
.started
: 
 212             raise compat_html_parser
.HTMLParseError(message
, self
.getpos()) 
 213         self
.rawdata 
= '\n'.join(self
.html
.split('\n')[self
.getpos()[0]:]) # skip one line 
 214         self
.error_count 
+= 1 
 217     def loads(self
, html
): 
 222     def handle_starttag(self
, tag
, attrs
): 
 225             self
.find_startpos(None) 
 226         if 'id' in attrs 
and attrs
['id'] == self
.id: 
 229             self
.watch_startpos 
= True 
 231             if not tag 
in self
.depth
: self
.depth
[tag
] = 0 
 234     def handle_endtag(self
, tag
): 
 236             if tag 
in self
.depth
: self
.depth
[tag
] -= 1 
 237             if self
.depth
[self
.result
[0]] == 0: 
 239                 self
.result
.append(self
.getpos()) 
 241     def find_startpos(self
, x
): 
 242         """Needed to put the start position of the result (self.result[1]) 
 243         after the opening tag with the requested id""" 
 244         if self
.watch_startpos
: 
 245             self
.watch_startpos 
= False 
 246             self
.result
.append(self
.getpos()) 
 247     handle_entityref 
= handle_charref 
= handle_data 
= handle_comment 
= \
 
 248     handle_decl 
= handle_pi 
= unknown_decl 
= find_startpos
 
 250     def get_result(self
): 
 251         if self
.result 
is None: 
 253         if len(self
.result
) != 3: 
 255         lines 
= self
.html
.split('\n') 
 256         lines 
= lines
[self
.result
[1][0]-1:self
.result
[2][0]] 
 257         lines
[0] = lines
[0][self
.result
[1][1]:] 
 259             lines
[-1] = lines
[-1][:self
.result
[2][1]-self
.result
[1][1]] 
 260         lines
[-1] = lines
[-1][:self
.result
[2][1]] 
 261         return '\n'.join(lines
).strip() 
 263 def get_element_by_id(id, html
): 
 264     """Return the content of the tag with the specified id in the passed HTML document""" 
 265     parser 
= IDParser(id) 
 268     except compat_html_parser
.HTMLParseError
: 
 270     return parser
.get_result() 
 273 def clean_html(html
): 
 274     """Clean an HTML snippet into a readable string""" 
 276     html 
= html
.replace('\n', ' ') 
 277     html 
= re
.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html
) 
 279     html 
= re
.sub('<.*?>', '', html
) 
 280     # Replace html entities 
 281     html 
= unescapeHTML(html
) 
 285 def sanitize_open(filename
, open_mode
): 
 286     """Try to open the given filename, and slightly tweak it if this fails. 
 288     Attempts to open the given filename. If this fails, it tries to change 
 289     the filename slightly, step by step, until it's either able to open it 
 290     or it fails and raises a final exception, like the standard open() 
 293     It returns the tuple (stream, definitive_file_name). 
 297             if sys
.platform 
== 'win32': 
 299                 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
) 
 300             return (sys
.stdout
, filename
) 
 301         stream 
= open(encodeFilename(filename
), open_mode
) 
 302         return (stream
, filename
) 
 303     except (IOError, OSError) as err
: 
 304         # In case of error, try to remove win32 forbidden chars 
 305         filename 
= re
.sub(u
'[/<>:"\\|\\\\?\\*]', u
'#', filename
) 
 307         # An exception here should be caught in the caller 
 308         stream 
= open(encodeFilename(filename
), open_mode
) 
 309         return (stream
, filename
) 
 312 def timeconvert(timestr
): 
 313     """Convert RFC 2822 defined time string into system timestamp""" 
 315     timetuple 
= email
.utils
.parsedate_tz(timestr
) 
 316     if timetuple 
is not None: 
 317         timestamp 
= email
.utils
.mktime_tz(timetuple
) 
 320 def sanitize_filename(s
, restricted
=False, is_id
=False): 
 321     """Sanitizes a string so it could be used as part of a filename. 
 322     If restricted is set, use a stricter subset of allowed characters. 
 323     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible 
 325     def replace_insane(char
): 
 326         if char 
== '?' or ord(char
) < 32 or ord(char
) == 127: 
 329             return '' if restricted 
else '\'' 
 331             return '_-' if restricted 
else ' -' 
 332         elif char 
in '\\/|*<>': 
 334         if restricted 
and (char 
in '!&\'()[]{}$;`^,#' or char
.isspace()): 
 336         if restricted 
and ord(char
) > 127: 
 340     result 
= u
''.join(map(replace_insane
, s
)) 
 342         while '__' in result
: 
 343             result 
= result
.replace('__', '_') 
 344         result 
= result
.strip('_') 
 345         # Common case of "Foreign band name - English song title" 
 346         if restricted 
and result
.startswith('-_'): 
 352 def orderedSet(iterable
): 
 353     """ Remove all duplicates from the input iterable """ 
 364     assert type(s
) == type(u
'') 
 366     result 
= re
.sub(u
'(?u)&(.+?);', htmlentity_transform
, s
) 
 369 def encodeFilename(s
): 
 371     @param s The name of the file 
 374     assert type(s
) == type(u
'') 
 376     # Python 3 has a Unicode API 
 377     if sys
.version_info 
>= (3, 0): 
 380     if sys
.platform 
== 'win32' and sys
.getwindowsversion()[0] >= 5: 
 381         # Pass u'' directly to use Unicode APIs on Windows 2000 and up 
 382         # (Detecting Windows NT 4 is tricky because 'major >= 4' would 
 383         # match Windows 9x series as well. Besides, NT 4 is obsolete.) 
 386         return s
.encode(sys
.getfilesystemencoding(), 'ignore') 
 388 class DownloadError(Exception): 
 389     """Download Error exception. 
 391     This exception may be thrown by FileDownloader objects if they are not 
 392     configured to continue on errors. They will contain the appropriate 
 398 class SameFileError(Exception): 
 399     """Same File exception. 
 401     This exception will be thrown by FileDownloader objects if they detect 
 402     multiple files would have to be downloaded to the same file on disk. 
 407 class PostProcessingError(Exception): 
 408     """Post Processing exception. 
 410     This exception may be raised by PostProcessor's .run() method to 
 411     indicate an error in the postprocessing task. 
 415 class MaxDownloadsReached(Exception): 
 416     """ --max-downloads limit has been reached. """ 
 420 class UnavailableVideoError(Exception): 
 421     """Unavailable Format exception. 
 423     This exception will be thrown when a video is requested 
 424     in a format that is not available for that video. 
 429 class ContentTooShortError(Exception): 
 430     """Content Too Short exception. 
 432     This exception may be raised by FileDownloader objects when a file they 
 433     download is too small for what the server announced first, indicating 
 434     the connection was probably interrupted. 
    def __init__(self, downloaded, expected):
        """Store the byte counts describing the short download.

        downloaded: bytes actually received.
        expected: bytes announced by the server (per the class docstring).
        """
        self.downloaded = downloaded
        self.expected = expected
 445 class Trouble(Exception): 
 446     """Trouble helper exception 
 448     This is an exception to be handled with 
 449     FileDownloader.trouble 
 452 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
): 
 453     """Handler for HTTP requests and responses. 
 455     This class, when installed with an OpenerDirector, automatically adds 
 456     the standard headers to every HTTP request and handles gzipped and 
 457     deflated responses from web servers. If compression is to be avoided in 
 458     a particular request, the original request in the program code only has 
 459     to include the HTTP header "Youtubedl-No-Compression", which will be 
 460     removed before making the real request. 
 462     Part of this code was copied from: 
 464     http://techknack.net/python-urllib2-handlers/ 
 466     Andrew Rowls, the author of that code, agreed to release it to the 
 473             return zlib
.decompress(data
, -zlib
.MAX_WBITS
) 
 475             return zlib
.decompress(data
) 
 478     def addinfourl_wrapper(stream
, headers
, url
, code
): 
 479         if hasattr(compat_urllib_request
.addinfourl
, 'getcode'): 
 480             return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
) 
 481         ret 
= compat_urllib_request
.addinfourl(stream
, headers
, url
) 
 485     def http_request(self
, req
): 
 486         for h 
in std_headers
: 
 489             req
.add_header(h
, std_headers
[h
]) 
 490         if 'Youtubedl-no-compression' in req
.headers
: 
 491             if 'Accept-encoding' in req
.headers
: 
 492                 del req
.headers
['Accept-encoding'] 
 493             del req
.headers
['Youtubedl-no-compression'] 
 496     def http_response(self
, req
, resp
): 
 499         if resp
.headers
.get('Content-encoding', '') == 'gzip': 
 500             gz 
= gzip
.GzipFile(fileobj
=io
.BytesIO(resp
.read()), mode
='r') 
 501             resp 
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
) 
 502             resp
.msg 
= old_resp
.msg
 
 504         if resp
.headers
.get('Content-encoding', '') == 'deflate': 
 505             gz 
= io
.BytesIO(self
.deflate(resp
.read())) 
 506             resp 
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
) 
 507             resp
.msg 
= old_resp
.msg
 
 510     https_request 
= http_request
 
 511     https_response 
= http_response