2 # -*- coding: utf-8 -*- 
  19     import urllib
.request 
as compat_urllib_request
 
  20 except ImportError: # Python 2 
  21     import urllib2 
as compat_urllib_request
 
  24     import urllib
.error 
as compat_urllib_error
 
  25 except ImportError: # Python 2 
  26     import urllib2 
as compat_urllib_error
 
  29     import urllib
.parse 
as compat_urllib_parse
 
  30 except ImportError: # Python 2 
  31     import urllib 
as compat_urllib_parse
 
  34     from urllib
.parse 
import urlparse 
as compat_urllib_parse_urlparse
 
  35 except ImportError: # Python 2 
  36     from urlparse 
import urlparse 
as compat_urllib_parse_urlparse
 
  39     import http
.cookiejar 
as compat_cookiejar
 
  40 except ImportError: # Python 2 
  41     import cookielib 
as compat_cookiejar
 
  44     import html
.entities 
as compat_html_entities
 
  45 except ImportError: # Python 2 
  46     import htmlentitydefs 
as compat_html_entities
 
  49     import html
.parser 
as compat_html_parser
 
  50 except ImportError: # Python 2 
  51     import HTMLParser 
as compat_html_parser
 
  54     import http
.client 
as compat_http_client
 
  55 except ImportError: # Python 2 
  56     import httplib 
as compat_http_client
 
  59     from subprocess 
import DEVNULL
 
  60     compat_subprocess_get_DEVNULL 
= lambda: DEVNULL
 
  62     compat_subprocess_get_DEVNULL 
= lambda: open(os
.path
.devnull
, 'w') 
  65     from urllib
.parse 
import parse_qs 
as compat_parse_qs
 
  66 except ImportError: # Python 2 
  67     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. 
  68     # Python 2's version is apparently totally broken 
  69     def _unquote(string
, encoding
='utf-8', errors
='replace'): 
  72         res 
= string
.split('%') 
  79         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded 
  86                 pct_sequence 
+= item
[:2].decode('hex') 
  89                     # This segment was just a single percent-encoded character. 
  90                     # May be part of a sequence of code units, so delay decoding. 
  91                     # (Stored in pct_sequence). 
  95             # Encountered non-percent-encoded characters. Flush the current 
  97             string 
+= pct_sequence
.decode(encoding
, errors
) + rest
 
 100             # Flush the final pct_sequence 
 101             string 
+= pct_sequence
.decode(encoding
, errors
) 
 104     def _parse_qsl(qs
, keep_blank_values
=False, strict_parsing
=False, 
 105                 encoding
='utf-8', errors
='replace'): 
 106         qs
, _coerce_result 
= qs
, unicode 
 107         pairs 
= [s2 
for s1 
in qs
.split('&') for s2 
in s1
.split(';')] 
 109         for name_value 
in pairs
: 
 110             if not name_value 
and not strict_parsing
: 
 112             nv 
= name_value
.split('=', 1) 
 115                     raise ValueError("bad query field: %r" % (name_value
,)) 
 116                 # Handle case of a control-name with no equal sign 
 117                 if keep_blank_values
: 
 121             if len(nv
[1]) or keep_blank_values
: 
 122                 name 
= nv
[0].replace('+', ' ') 
 123                 name 
= _unquote(name
, encoding
=encoding
, errors
=errors
) 
 124                 name 
= _coerce_result(name
) 
 125                 value 
= nv
[1].replace('+', ' ') 
 126                 value 
= _unquote(value
, encoding
=encoding
, errors
=errors
) 
 127                 value 
= _coerce_result(value
) 
 128                 r
.append((name
, value
)) 
 131     def compat_parse_qs(qs
, keep_blank_values
=False, strict_parsing
=False, 
 132                 encoding
='utf-8', errors
='replace'): 
 134         pairs 
= _parse_qsl(qs
, keep_blank_values
, strict_parsing
, 
 135                         encoding
=encoding
, errors
=errors
) 
 136         for name
, value 
in pairs
: 
 137             if name 
in parsed_result
: 
 138                 parsed_result
[name
].append(value
) 
 140                 parsed_result
[name
] = [value
] 
 144     compat_str 
= unicode # Python 2 
 149     compat_chr 
= unichr # Python 2 
 154     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', 
 155     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 
 156     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
 157     'Accept-Encoding': 'gzip, deflate', 
 158     'Accept-Language': 'en-us,en;q=0.5', 
 161 def preferredencoding(): 
 162     """Get preferred encoding. 
 164     Returns the best encoding scheme for the system, based on 
 165     locale.getpreferredencoding() and some further tweaks. 
 168         pref 
= locale
.getpreferredencoding() 
 175 if sys
.version_info 
< (3,0): 
 177         print(s
.encode(preferredencoding(), 'xmlcharrefreplace')) 
 180         assert type(s
) == type(u
'') 
 183 # In Python 2.x, json.dump expects a bytestream. 
 184 # In Python 3.x, it writes to a character stream 
 185 if sys
.version_info 
< (3,0): 
 186     def write_json_file(obj
, fn
): 
 187         with open(fn
, 'wb') as f
: 
 190     def write_json_file(obj
, fn
): 
 191         with open(fn
, 'w', encoding
='utf-8') as f
: 
 194 def htmlentity_transform(matchobj
): 
 195     """Transforms an HTML entity to a character. 
 197     This function receives a match object and is intended to be used with 
 198     the re.sub() function. 
 200     entity 
= matchobj
.group(1) 
 202     # Known non-numeric HTML entity 
 203     if entity 
in compat_html_entities
.name2codepoint
: 
 204         return compat_chr(compat_html_entities
.name2codepoint
[entity
]) 
 206     mobj 
= re
.match(u
'(?u)#(x?\\d+)', entity
) 
 208         numstr 
= mobj
.group(1) 
 209         if numstr
.startswith(u
'x'): 
 211             numstr 
= u
'0%s' % numstr
 
 214         return compat_chr(int(numstr
, base
)) 
 216     # Unknown entity in name, return its literal representation 
 217     return (u
'&%s;' % entity
) 
 219 compat_html_parser
.locatestarttagend 
= re
.compile(r
"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re
.VERBOSE
) # backport bugfix 
 220 class AttrParser(compat_html_parser
.HTMLParser
): 
 221     """Modified HTMLParser that isolates a tag with the specified attribute""" 
 222     def __init__(self
, attribute
, value
): 
 223         self
.attribute 
= attribute
 
 229         self
.watch_startpos 
= False 
 231         compat_html_parser
.HTMLParser
.__init
__(self
) 
 233     def error(self
, message
): 
 234         if self
.error_count 
> 10 or self
.started
: 
 235             raise compat_html_parser
.HTMLParseError(message
, self
.getpos()) 
 236         self
.rawdata 
= '\n'.join(self
.html
.split('\n')[self
.getpos()[0]:]) # skip one line 
 237         self
.error_count 
+= 1 
 240     def loads(self
, html
): 
 245     def handle_starttag(self
, tag
, attrs
): 
 248             self
.find_startpos(None) 
 249         if self
.attribute 
in attrs 
and attrs
[self
.attribute
] == self
.value
: 
 252             self
.watch_startpos 
= True 
 254             if not tag 
in self
.depth
: self
.depth
[tag
] = 0 
 257     def handle_endtag(self
, tag
): 
 259             if tag 
in self
.depth
: self
.depth
[tag
] -= 1 
 260             if self
.depth
[self
.result
[0]] == 0: 
 262                 self
.result
.append(self
.getpos()) 
 264     def find_startpos(self
, x
): 
 265         """Needed to put the start position of the result (self.result[1]) 
 266         after the opening tag with the requested id""" 
 267         if self
.watch_startpos
: 
 268             self
.watch_startpos 
= False 
 269             self
.result
.append(self
.getpos()) 
 270     handle_entityref 
= handle_charref 
= handle_data 
= handle_comment 
= \
 
 271     handle_decl 
= handle_pi 
= unknown_decl 
= find_startpos
 
 273     def get_result(self
): 
 274         if self
.result 
is None: 
 276         if len(self
.result
) != 3: 
 278         lines 
= self
.html
.split('\n') 
 279         lines 
= lines
[self
.result
[1][0]-1:self
.result
[2][0]] 
 280         lines
[0] = lines
[0][self
.result
[1][1]:] 
 282             lines
[-1] = lines
[-1][:self
.result
[2][1]-self
.result
[1][1]] 
 283         lines
[-1] = lines
[-1][:self
.result
[2][1]] 
 284         return '\n'.join(lines
).strip() 
 285 # Hack for https://github.com/rg3/youtube-dl/issues/662 
 286 if sys
.version_info 
< (2, 7, 3): 
 287     AttrParser
.parse_endtag 
= (lambda self
, i
: 
 288         i 
+ len("</scr'+'ipt>") 
 289         if self
.rawdata
[i
:].startswith("</scr'+'ipt>") 
 290         else compat_html_parser
.HTMLParser
.parse_endtag(self
, i
)) 
 292 def get_element_by_id(id, html
): 
 293     """Return the content of the tag with the specified ID in the passed HTML document""" 
 294     return get_element_by_attribute("id", id, html
) 
 296 def get_element_by_attribute(attribute
, value
, html
): 
 297     """Return the content of the tag with the specified attribute in the passed HTML document""" 
 298     parser 
= AttrParser(attribute
, value
) 
 301     except compat_html_parser
.HTMLParseError
: 
 303     return parser
.get_result() 
 306 def clean_html(html
): 
 307     """Clean an HTML snippet into a readable string""" 
 309     html 
= html
.replace('\n', ' ') 
 310     html 
= re
.sub(r
'\s*<\s*br\s*/?\s*>\s*', '\n', html
) 
 311     html 
= re
.sub(r
'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html
) 
 313     html 
= re
.sub('<.*?>', '', html
) 
 314     # Replace html entities 
 315     html 
= unescapeHTML(html
) 
 319 def sanitize_open(filename
, open_mode
): 
 320     """Try to open the given filename, and slightly tweak it if this fails. 
 322     Attempts to open the given filename. If this fails, it tries to change 
 323     the filename slightly, step by step, until it's either able to open it 
 324     or it fails and raises a final exception, like the standard open() 
 327     It returns the tuple (stream, definitive_file_name). 
 331             if sys
.platform 
== 'win32': 
 333                 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
) 
 334             return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
) 
 335         stream 
= open(encodeFilename(filename
), open_mode
) 
 336         return (stream
, filename
) 
 337     except (IOError, OSError) as err
: 
 338         if err
.errno 
in (errno
.EACCES
,): 
 341         # In case of error, try to remove win32 forbidden chars 
 342         alt_filename 
= os
.path
.join( 
 343                         re
.sub(u
'[/<>:"\\|\\\\?\\*]', u
'#', path_part
) 
 344                         for path_part 
in os
.path
.split(filename
) 
 346         if alt_filename 
== filename
: 
 349             # An exception here should be caught in the caller 
 350             stream 
= open(encodeFilename(filename
), open_mode
) 
 351             return (stream
, alt_filename
) 
 354 def timeconvert(timestr
): 
 355     """Convert RFC 2822 defined time string into system timestamp""" 
 357     timetuple 
= email
.utils
.parsedate_tz(timestr
) 
 358     if timetuple 
is not None: 
 359         timestamp 
= email
.utils
.mktime_tz(timetuple
) 
 362 def sanitize_filename(s
, restricted
=False, is_id
=False): 
 363     """Sanitizes a string so it could be used as part of a filename. 
 364     If restricted is set, use a stricter subset of allowed characters. 
 365     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible 
 367     def replace_insane(char
): 
 368         if char 
== '?' or ord(char
) < 32 or ord(char
) == 127: 
 371             return '' if restricted 
else '\'' 
 373             return '_-' if restricted 
else ' -' 
 374         elif char 
in '\\/|*<>': 
 376         if restricted 
and (char 
in '!&\'()[]{}$;`^,#' or char
.isspace()): 
 378         if restricted 
and ord(char
) > 127: 
 382     result 
= u
''.join(map(replace_insane
, s
)) 
 384         while '__' in result
: 
 385             result 
= result
.replace('__', '_') 
 386         result 
= result
.strip('_') 
 387         # Common case of "Foreign band name - English song title" 
 388         if restricted 
and result
.startswith('-_'): 
 394 def orderedSet(iterable
): 
 395     """ Remove all duplicates from the input iterable """ 
 406     assert type(s
) == type(u
'') 
 408     result 
= re
.sub(u
'(?u)&(.+?);', htmlentity_transform
, s
) 
 411 def encodeFilename(s
): 
 413     @param s The name of the file 
 416     assert type(s
) == type(u
'') 
 418     # Python 3 has a Unicode API 
 419     if sys
.version_info 
>= (3, 0): 
 422     if sys
.platform 
== 'win32' and sys
.getwindowsversion()[0] >= 5: 
 423         # Pass u'' directly to use Unicode APIs on Windows 2000 and up 
 424         # (Detecting Windows NT 4 is tricky because 'major >= 4' would 
 425         # match Windows 9x series as well. Besides, NT 4 is obsolete.) 
 428         encoding 
= sys
.getfilesystemencoding() 
 431         return s
.encode(encoding
, 'ignore') 
 433 def decodeOption(optval
): 
 436     if isinstance(optval
, bytes): 
 437         optval 
= optval
.decode(preferredencoding()) 
 439     assert isinstance(optval
, compat_str
) 
 442 def formatSeconds(secs
): 
 444         return '%d:%02d:%02d' % (secs 
// 3600, (secs 
% 3600) // 60, secs 
% 60) 
 446         return '%d:%02d' % (secs 
// 60, secs 
% 60) 
 450 def make_HTTPS_handler(opts
): 
 451     if sys
.version_info 
< (3,2): 
 452         # Python's 2.x handler is very simplistic 
 453         return compat_urllib_request
.HTTPSHandler() 
 456         context 
= ssl
.SSLContext(ssl
.PROTOCOL_SSLv23
) 
 457         context
.set_default_verify_paths() 
 459         context
.verify_mode 
= (ssl
.CERT_NONE
 
 460                                if opts
.no_check_certificate
 
 461                                else ssl
.CERT_REQUIRED
) 
 462         return compat_urllib_request
.HTTPSHandler(context
=context
) 
 464 class ExtractorError(Exception): 
 465     """Error during info extraction.""" 
 466     def __init__(self
, msg
, tb
=None): 
 467         """ tb, if given, is the original traceback (so that it can be printed out). """ 
 468         super(ExtractorError
, self
).__init
__(msg
) 
 470         self
.exc_info 
= sys
.exc_info()  # preserve original exception 
 472     def format_traceback(self
): 
 473         if self
.traceback 
is None: 
 475         return u
''.join(traceback
.format_tb(self
.traceback
)) 
 478 class DownloadError(Exception): 
 479     """Download Error exception. 
 481     This exception may be thrown by FileDownloader objects if they are not 
 482     configured to continue on errors. They will contain the appropriate 
 485     def __init__(self
, msg
, exc_info
=None): 
 486         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ 
 487         super(DownloadError
, self
).__init
__(msg
) 
 488         self
.exc_info 
= exc_info
 
 491 class SameFileError(Exception): 
 492     """Same File exception. 
 494     This exception will be thrown by FileDownloader objects if they detect 
 495     multiple files would have to be downloaded to the same file on disk. 
 500 class PostProcessingError(Exception): 
 501     """Post Processing exception. 
 503     This exception may be raised by PostProcessor's .run() method to 
 504     indicate an error in the postprocessing task. 
 506     def __init__(self
, msg
): 
 509 class MaxDownloadsReached(Exception): 
 510     """ --max-downloads limit has been reached. """ 
 514 class UnavailableVideoError(Exception): 
 515     """Unavailable Format exception. 
 517     This exception will be thrown when a video is requested 
 518     in a format that is not available for that video. 
 523 class ContentTooShortError(Exception): 
 524     """Content Too Short exception. 
 526     This exception may be raised by FileDownloader objects when a file they 
 527     download is too small for what the server announced first, indicating 
 528     the connection was probably interrupted. 
 534     def __init__(self
, downloaded
, expected
): 
 535         self
.downloaded 
= downloaded
 
 536         self
.expected 
= expected
 
 538 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
): 
 539     """Handler for HTTP requests and responses. 
 541     This class, when installed with an OpenerDirector, automatically adds 
 542     the standard headers to every HTTP request and handles gzipped and 
 543     deflated responses from web servers. If compression is to be avoided in 
 544     a particular request, the original request in the program code only has 
 545     to include the HTTP header "Youtubedl-No-Compression", which will be 
 546     removed before making the real request. 
 548     Part of this code was copied from: 
 550     http://techknack.net/python-urllib2-handlers/ 
 552     Andrew Rowls, the author of that code, agreed to release it to the 
 559             return zlib
.decompress(data
, -zlib
.MAX_WBITS
) 
 561             return zlib
.decompress(data
) 
 564     def addinfourl_wrapper(stream
, headers
, url
, code
): 
 565         if hasattr(compat_urllib_request
.addinfourl
, 'getcode'): 
 566             return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
) 
 567         ret 
= compat_urllib_request
.addinfourl(stream
, headers
, url
) 
 571     def http_request(self
, req
): 
 572         for h
,v 
in std_headers
.items(): 
 576         if 'Youtubedl-no-compression' in req
.headers
: 
 577             if 'Accept-encoding' in req
.headers
: 
 578                 del req
.headers
['Accept-encoding'] 
 579             del req
.headers
['Youtubedl-no-compression'] 
 580         if 'Youtubedl-user-agent' in req
.headers
: 
 581             if 'User-agent' in req
.headers
: 
 582                 del req
.headers
['User-agent'] 
 583             req
.headers
['User-agent'] = req
.headers
['Youtubedl-user-agent'] 
 584             del req
.headers
['Youtubedl-user-agent'] 
 587     def http_response(self
, req
, resp
): 
 590         if resp
.headers
.get('Content-encoding', '') == 'gzip': 
 591             gz 
= gzip
.GzipFile(fileobj
=io
.BytesIO(resp
.read()), mode
='r') 
 592             resp 
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
) 
 593             resp
.msg 
= old_resp
.msg
 
 595         if resp
.headers
.get('Content-encoding', '') == 'deflate': 
 596             gz 
= io
.BytesIO(self
.deflate(resp
.read())) 
 597             resp 
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
) 
 598             resp
.msg 
= old_resp
.msg
 
 601     https_request 
= http_request
 
 602     https_response 
= http_response
 
 604 def unified_strdate(date_str
): 
 605     """Return a string with the date in the format YYYYMMDD""" 
 608     date_str 
= date_str
.replace(',',' ') 
 609     # %z (UTC offset) is only supported in python>=3.2 
 610     date_str 
= re
.sub(r
' (\+|-)[\d]*$', '', date_str
) 
 611     format_expressions 
= ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S'] 
 612     for expression 
in format_expressions
: 
 614             upload_date 
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d') 
 619 def date_from_str(date_str
): 
 621     Return a datetime object from a string in the format YYYYMMDD or 
 622     (now|today)[+-][0-9](day|week|month|year)(s)?""" 
 623     today 
= datetime
.date
.today() 
 624     if date_str 
== 'now'or date_str 
== 'today': 
 626     match 
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
) 
 627     if match 
is not None: 
 628         sign 
= match
.group('sign') 
 629         time 
= int(match
.group('time')) 
 632         unit 
= match
.group('unit') 
 641         delta 
= datetime
.timedelta(**{unit
: time
}) 
 643     return datetime
.datetime
.strptime(date_str
, "%Y%m%d").date() 
 645 class DateRange(object): 
 646     """Represents a time interval between two dates""" 
 647     def __init__(self
, start
=None, end
=None): 
 648         """start and end must be strings in the format accepted by date""" 
 649         if start 
is not None: 
 650             self
.start 
= date_from_str(start
) 
 652             self
.start 
= datetime
.datetime
.min.date() 
 654             self
.end 
= date_from_str(end
) 
 656             self
.end 
= datetime
.datetime
.max.date() 
 657         if self
.start 
> self
.end
: 
 658             raise ValueError('Date range: "%s" , the start date must be before the end date' % self
) 
 661         """Returns a range that only contains the given day""" 
 663     def __contains__(self
, date
): 
 664         """Check if the date is in the range""" 
 665         if not isinstance(date
, datetime
.date
): 
 666             date 
= date_from_str(date
) 
 667         return self
.start 
<= date 
<= self
.end
 
 669         return '%s - %s' % ( self
.start
.isoformat(), self
.end
.isoformat())