2 # -*- coding: utf-8 -*- 
  19     import urllib
.request 
as compat_urllib_request
 
  20 except ImportError: # Python 2 
  21     import urllib2 
as compat_urllib_request
 
  24     import urllib
.error 
as compat_urllib_error
 
  25 except ImportError: # Python 2 
  26     import urllib2 
as compat_urllib_error
 
  29     import urllib
.parse 
as compat_urllib_parse
 
  30 except ImportError: # Python 2 
  31     import urllib 
as compat_urllib_parse
 
  34     from urllib
.parse 
import urlparse 
as compat_urllib_parse_urlparse
 
  35 except ImportError: # Python 2 
  36     from urlparse 
import urlparse 
as compat_urllib_parse_urlparse
 
  39     import urllib
.parse 
as compat_urlparse
 
  40 except ImportError: # Python 2 
  41     import urlparse 
as compat_urlparse
 
  44     import http
.cookiejar 
as compat_cookiejar
 
  45 except ImportError: # Python 2 
  46     import cookielib 
as compat_cookiejar
 
  49     import html
.entities 
as compat_html_entities
 
  50 except ImportError: # Python 2 
  51     import htmlentitydefs 
as compat_html_entities
 
  54     import html
.parser 
as compat_html_parser
 
  55 except ImportError: # Python 2 
  56     import HTMLParser 
as compat_html_parser
 
  59     import http
.client 
as compat_http_client
 
  60 except ImportError: # Python 2 
  61     import httplib 
as compat_http_client
 
  64     from subprocess 
import DEVNULL
 
  65     compat_subprocess_get_DEVNULL 
= lambda: DEVNULL
 
  67     compat_subprocess_get_DEVNULL 
= lambda: open(os
.path
.devnull
, 'w') 
  70     from urllib
.parse 
import parse_qs 
as compat_parse_qs
 
  71 except ImportError: # Python 2 
  72     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. 
  73     # Python 2's version is apparently totally broken 
  74     def _unquote(string
, encoding
='utf-8', errors
='replace'): 
  77         res 
= string
.split('%') 
  84         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded 
  91                 pct_sequence 
+= item
[:2].decode('hex') 
  94                     # This segment was just a single percent-encoded character. 
  95                     # May be part of a sequence of code units, so delay decoding. 
  96                     # (Stored in pct_sequence). 
 100             # Encountered non-percent-encoded characters. Flush the current 
 102             string 
+= pct_sequence
.decode(encoding
, errors
) + rest
 
 105             # Flush the final pct_sequence 
 106             string 
+= pct_sequence
.decode(encoding
, errors
) 
 109     def _parse_qsl(qs
, keep_blank_values
=False, strict_parsing
=False, 
 110                 encoding
='utf-8', errors
='replace'): 
 111         qs
, _coerce_result 
= qs
, unicode 
 112         pairs 
= [s2 
for s1 
in qs
.split('&') for s2 
in s1
.split(';')] 
 114         for name_value 
in pairs
: 
 115             if not name_value 
and not strict_parsing
: 
 117             nv 
= name_value
.split('=', 1) 
 120                     raise ValueError("bad query field: %r" % (name_value
,)) 
 121                 # Handle case of a control-name with no equal sign 
 122                 if keep_blank_values
: 
 126             if len(nv
[1]) or keep_blank_values
: 
 127                 name 
= nv
[0].replace('+', ' ') 
 128                 name 
= _unquote(name
, encoding
=encoding
, errors
=errors
) 
 129                 name 
= _coerce_result(name
) 
 130                 value 
= nv
[1].replace('+', ' ') 
 131                 value 
= _unquote(value
, encoding
=encoding
, errors
=errors
) 
 132                 value 
= _coerce_result(value
) 
 133                 r
.append((name
, value
)) 
 136     def compat_parse_qs(qs
, keep_blank_values
=False, strict_parsing
=False, 
 137                 encoding
='utf-8', errors
='replace'): 
 139         pairs 
= _parse_qsl(qs
, keep_blank_values
, strict_parsing
, 
 140                         encoding
=encoding
, errors
=errors
) 
 141         for name
, value 
in pairs
: 
 142             if name 
in parsed_result
: 
 143                 parsed_result
[name
].append(value
) 
 145                 parsed_result
[name
] = [value
] 
 149     compat_str 
= unicode # Python 2 
 154     compat_chr 
= unichr # Python 2 
 159     if type(c
) is int: return c
 
 162 # This is not clearly defined otherwise 
 163 compiled_regex_type 
= type(re
.compile('')) 
 166     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', 
 167     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 
 168     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
 169     'Accept-Encoding': 'gzip, deflate', 
 170     'Accept-Language': 'en-us,en;q=0.5', 
 173 def preferredencoding(): 
 174     """Get preferred encoding. 
 176     Returns the best encoding scheme for the system, based on 
 177     locale.getpreferredencoding() and some further tweaks. 
 180         pref 
= locale
.getpreferredencoding() 
 187 if sys
.version_info 
< (3,0): 
 189         print(s
.encode(preferredencoding(), 'xmlcharrefreplace')) 
 192         assert type(s
) == type(u
'') 
 195 # In Python 2.x, json.dump expects a bytestream. 
 196 # In Python 3.x, it writes to a character stream 
 197 if sys
.version_info 
< (3,0): 
 198     def write_json_file(obj
, fn
): 
 199         with open(fn
, 'wb') as f
: 
 202     def write_json_file(obj
, fn
): 
 203         with open(fn
, 'w', encoding
='utf-8') as f
: 
 206 if sys
.version_info 
>= (2,7): 
 207     def find_xpath_attr(node
, xpath
, key
, val
): 
 208         """ Find the xpath xpath[@key=val] """ 
 209         assert re
.match(r
'^[a-zA-Z]+$', key
) 
 210         assert re
.match(r
'^[a-zA-Z@\s]*$', val
) 
 211         expr 
= xpath 
+ u
"[@%s='%s']" % (key
, val
) 
 212         return node
.find(expr
) 
 214     def find_xpath_attr(node
, xpath
, key
, val
): 
 215         for f 
in node
.findall(xpath
): 
 216             if f
.attrib
.get(key
) == val
: 
 220 def htmlentity_transform(matchobj
): 
 221     """Transforms an HTML entity to a character. 
 223     This function receives a match object and is intended to be used with 
 224     the re.sub() function. 
 226     entity 
= matchobj
.group(1) 
 228     # Known non-numeric HTML entity 
 229     if entity 
in compat_html_entities
.name2codepoint
: 
 230         return compat_chr(compat_html_entities
.name2codepoint
[entity
]) 
 232     mobj 
= re
.match(u
'(?u)#(x?\\d+)', entity
) 
 234         numstr 
= mobj
.group(1) 
 235         if numstr
.startswith(u
'x'): 
 237             numstr 
= u
'0%s' % numstr
 
 240         return compat_chr(int(numstr
, base
)) 
 242     # Unknown entity in name, return its literal representation 
 243     return (u
'&%s;' % entity
) 
# Replace the module-level pattern that the HTML parser module uses to locate
# the end of a start tag with the version below (a backported bugfix). The
# assignment patches the parser module itself, not just AttrParser.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 246 class AttrParser(compat_html_parser
.HTMLParser
): 
 247     """Modified HTMLParser that isolates a tag with the specified attribute""" 
 248     def __init__(self
, attribute
, value
): 
 249         self
.attribute 
= attribute
 
 255         self
.watch_startpos 
= False 
 257         compat_html_parser
.HTMLParser
.__init
__(self
) 
 259     def error(self
, message
): 
 260         if self
.error_count 
> 10 or self
.started
: 
 261             raise compat_html_parser
.HTMLParseError(message
, self
.getpos()) 
 262         self
.rawdata 
= '\n'.join(self
.html
.split('\n')[self
.getpos()[0]:]) # skip one line 
 263         self
.error_count 
+= 1 
 266     def loads(self
, html
): 
 271     def handle_starttag(self
, tag
, attrs
): 
 274             self
.find_startpos(None) 
 275         if self
.attribute 
in attrs 
and attrs
[self
.attribute
] == self
.value
: 
 278             self
.watch_startpos 
= True 
 280             if not tag 
in self
.depth
: self
.depth
[tag
] = 0 
 283     def handle_endtag(self
, tag
): 
 285             if tag 
in self
.depth
: self
.depth
[tag
] -= 1 
 286             if self
.depth
[self
.result
[0]] == 0: 
 288                 self
.result
.append(self
.getpos()) 
 290     def find_startpos(self
, x
): 
 291         """Needed to put the start position of the result (self.result[1]) 
 292         after the opening tag with the requested id""" 
 293         if self
.watch_startpos
: 
 294             self
.watch_startpos 
= False 
 295             self
.result
.append(self
.getpos()) 
 296     handle_entityref 
= handle_charref 
= handle_data 
= handle_comment 
= \
 
 297     handle_decl 
= handle_pi 
= unknown_decl 
= find_startpos
 
 299     def get_result(self
): 
 300         if self
.result 
is None: 
 302         if len(self
.result
) != 3: 
 304         lines 
= self
.html
.split('\n') 
 305         lines 
= lines
[self
.result
[1][0]-1:self
.result
[2][0]] 
 306         lines
[0] = lines
[0][self
.result
[1][1]:] 
 308             lines
[-1] = lines
[-1][:self
.result
[2][1]-self
.result
[1][1]] 
 309         lines
[-1] = lines
[-1][:self
.result
[2][1]] 
 310         return '\n'.join(lines
).strip() 
 311 # Hack for https://github.com/rg3/youtube-dl/issues/662 
 312 if sys
.version_info 
< (2, 7, 3): 
 313     AttrParser
.parse_endtag 
= (lambda self
, i
: 
 314         i 
+ len("</scr'+'ipt>") 
 315         if self
.rawdata
[i
:].startswith("</scr'+'ipt>") 
 316         else compat_html_parser
.HTMLParser
.parse_endtag(self
, i
)) 
 318 def get_element_by_id(id, html
): 
 319     """Return the content of the tag with the specified ID in the passed HTML document""" 
 320     return get_element_by_attribute("id", id, html
) 
 322 def get_element_by_attribute(attribute
, value
, html
): 
 323     """Return the content of the tag with the specified attribute in the passed HTML document""" 
 324     parser 
= AttrParser(attribute
, value
) 
 327     except compat_html_parser
.HTMLParseError
: 
 329     return parser
.get_result() 
 332 def clean_html(html
): 
 333     """Clean an HTML snippet into a readable string""" 
 335     html 
= html
.replace('\n', ' ') 
 336     html 
= re
.sub(r
'\s*<\s*br\s*/?\s*>\s*', '\n', html
) 
 337     html 
= re
.sub(r
'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html
) 
 339     html 
= re
.sub('<.*?>', '', html
) 
 340     # Replace html entities 
 341     html 
= unescapeHTML(html
) 
 345 def sanitize_open(filename
, open_mode
): 
 346     """Try to open the given filename, and slightly tweak it if this fails. 
 348     Attempts to open the given filename. If this fails, it tries to change 
 349     the filename slightly, step by step, until it's either able to open it 
 350     or it fails and raises a final exception, like the standard open() 
 353     It returns the tuple (stream, definitive_file_name). 
 357             if sys
.platform 
== 'win32': 
 359                 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
) 
 360             return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
) 
 361         stream 
= open(encodeFilename(filename
), open_mode
) 
 362         return (stream
, filename
) 
 363     except (IOError, OSError) as err
: 
 364         if err
.errno 
in (errno
.EACCES
,): 
 367         # In case of error, try to remove win32 forbidden chars 
 368         alt_filename 
= os
.path
.join( 
 369                         re
.sub(u
'[/<>:"\\|\\\\?\\*]', u
'#', path_part
) 
 370                         for path_part 
in os
.path
.split(filename
) 
 372         if alt_filename 
== filename
: 
 375             # An exception here should be caught in the caller 
 376             stream 
= open(encodeFilename(filename
), open_mode
) 
 377             return (stream
, alt_filename
) 
 380 def timeconvert(timestr
): 
 381     """Convert RFC 2822 defined time string into system timestamp""" 
 383     timetuple 
= email
.utils
.parsedate_tz(timestr
) 
 384     if timetuple 
is not None: 
 385         timestamp 
= email
.utils
.mktime_tz(timetuple
) 
 388 def sanitize_filename(s
, restricted
=False, is_id
=False): 
 389     """Sanitizes a string so it could be used as part of a filename. 
 390     If restricted is set, use a stricter subset of allowed characters. 
 391     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible 
 393     def replace_insane(char
): 
 394         if char 
== '?' or ord(char
) < 32 or ord(char
) == 127: 
 397             return '' if restricted 
else '\'' 
 399             return '_-' if restricted 
else ' -' 
 400         elif char 
in '\\/|*<>': 
 402         if restricted 
and (char 
in '!&\'()[]{}$;`^,#' or char
.isspace()): 
 404         if restricted 
and ord(char
) > 127: 
 408     result 
= u
''.join(map(replace_insane
, s
)) 
 410         while '__' in result
: 
 411             result 
= result
.replace('__', '_') 
 412         result 
= result
.strip('_') 
 413         # Common case of "Foreign band name - English song title" 
 414         if restricted 
and result
.startswith('-_'): 
 420 def orderedSet(iterable
): 
 421     """ Remove all duplicates from the input iterable """ 
 432     assert type(s
) == type(u
'') 
 434     result 
= re
.sub(u
'(?u)&(.+?);', htmlentity_transform
, s
) 
 437 def encodeFilename(s
): 
 439     @param s The name of the file 
 442     assert type(s
) == type(u
'') 
 444     # Python 3 has a Unicode API 
 445     if sys
.version_info 
>= (3, 0): 
 448     if sys
.platform 
== 'win32' and sys
.getwindowsversion()[0] >= 5: 
 449         # Pass u'' directly to use Unicode APIs on Windows 2000 and up 
 450         # (Detecting Windows NT 4 is tricky because 'major >= 4' would 
 451         # match Windows 9x series as well. Besides, NT 4 is obsolete.) 
 454         encoding 
= sys
.getfilesystemencoding() 
 457         return s
.encode(encoding
, 'ignore') 
 459 def decodeOption(optval
): 
 462     if isinstance(optval
, bytes): 
 463         optval 
= optval
.decode(preferredencoding()) 
 465     assert isinstance(optval
, compat_str
) 
 468 def formatSeconds(secs
): 
 470         return '%d:%02d:%02d' % (secs 
// 3600, (secs 
% 3600) // 60, secs 
% 60) 
 472         return '%d:%02d' % (secs 
// 60, secs 
% 60) 
 476 def make_HTTPS_handler(opts
): 
 477     if sys
.version_info 
< (3,2): 
 478         # Python's 2.x handler is very simplistic 
 479         return compat_urllib_request
.HTTPSHandler() 
 482         context 
= ssl
.SSLContext(ssl
.PROTOCOL_SSLv23
) 
 483         context
.set_default_verify_paths() 
 485         context
.verify_mode 
= (ssl
.CERT_NONE
 
 486                                if opts
.no_check_certificate
 
 487                                else ssl
.CERT_REQUIRED
) 
 488         return compat_urllib_request
.HTTPSHandler(context
=context
) 
 490 class ExtractorError(Exception): 
 491     """Error during info extraction.""" 
 492     def __init__(self
, msg
, tb
=None, expected
=False): 
 493         """ tb, if given, is the original traceback (so that it can be printed out). 
 494         If expected is set, this is a normal error message and most likely not a bug in youtube-dl. 
 497         if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
): 
 500             msg 
= msg 
+ u
'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.' 
 501         super(ExtractorError
, self
).__init
__(msg
) 
 504         self
.exc_info 
= sys
.exc_info()  # preserve original exception 
 506     def format_traceback(self
): 
 507         if self
.traceback 
is None: 
 509         return u
''.join(traceback
.format_tb(self
.traceback
)) 
 512 class DownloadError(Exception): 
 513     """Download Error exception. 
 515     This exception may be thrown by FileDownloader objects if they are not 
 516     configured to continue on errors. They will contain the appropriate 
 519     def __init__(self
, msg
, exc_info
=None): 
 520         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ 
 521         super(DownloadError
, self
).__init
__(msg
) 
 522         self
.exc_info 
= exc_info
 
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    that multiple files would have to be downloaded to the same file on disk.
    """
 534 class PostProcessingError(Exception): 
 535     """Post Processing exception. 
 537     This exception may be raised by PostProcessor's .run() method to 
 538     indicate an error in the postprocessing task. 
 540     def __init__(self
, msg
): 
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
 557 class ContentTooShortError(Exception): 
 558     """Content Too Short exception. 
 560     This exception may be raised by FileDownloader objects when a file they 
 561     download is too small for what the server announced first, indicating 
 562     the connection was probably interrupted. 
 568     def __init__(self
, downloaded
, expected
): 
 569         self
.downloaded 
= downloaded
 
 570         self
.expected 
= expected
 
 572 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
): 
 573     """Handler for HTTP requests and responses. 
 575     This class, when installed with an OpenerDirector, automatically adds 
 576     the standard headers to every HTTP request and handles gzipped and 
 577     deflated responses from web servers. If compression is to be avoided in 
 578     a particular request, the original request in the program code only has 
 579     to include the HTTP header "Youtubedl-No-Compression", which will be 
 580     removed before making the real request. 
 582     Part of this code was copied from: 
 584     http://techknack.net/python-urllib2-handlers/ 
 586     Andrew Rowls, the author of that code, agreed to release it to the 
 593             return zlib
.decompress(data
, -zlib
.MAX_WBITS
) 
 595             return zlib
.decompress(data
) 
 598     def addinfourl_wrapper(stream
, headers
, url
, code
): 
 599         if hasattr(compat_urllib_request
.addinfourl
, 'getcode'): 
 600             return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
) 
 601         ret 
= compat_urllib_request
.addinfourl(stream
, headers
, url
) 
 605     def http_request(self
, req
): 
 606         for h
,v 
in std_headers
.items(): 
 610         if 'Youtubedl-no-compression' in req
.headers
: 
 611             if 'Accept-encoding' in req
.headers
: 
 612                 del req
.headers
['Accept-encoding'] 
 613             del req
.headers
['Youtubedl-no-compression'] 
 614         if 'Youtubedl-user-agent' in req
.headers
: 
 615             if 'User-agent' in req
.headers
: 
 616                 del req
.headers
['User-agent'] 
 617             req
.headers
['User-agent'] = req
.headers
['Youtubedl-user-agent'] 
 618             del req
.headers
['Youtubedl-user-agent'] 
 621     def http_response(self
, req
, resp
): 
 624         if resp
.headers
.get('Content-encoding', '') == 'gzip': 
 625             gz 
= gzip
.GzipFile(fileobj
=io
.BytesIO(resp
.read()), mode
='r') 
 626             resp 
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
) 
 627             resp
.msg 
= old_resp
.msg
 
 629         if resp
.headers
.get('Content-encoding', '') == 'deflate': 
 630             gz 
= io
.BytesIO(self
.deflate(resp
.read())) 
 631             resp 
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
) 
 632             resp
.msg 
= old_resp
.msg
 
 635     https_request 
= http_request
 
 636     https_response 
= http_response
 
 638 def unified_strdate(date_str
): 
 639     """Return a string with the date in the format YYYYMMDD""" 
 642     date_str 
= date_str
.replace(',',' ') 
 643     # %z (UTC offset) is only supported in python>=3.2 
 644     date_str 
= re
.sub(r
' (\+|-)[\d]*$', '', date_str
) 
 645     format_expressions 
= ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] 
 646     for expression 
in format_expressions
: 
 648             upload_date 
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d') 
 653 def determine_ext(url
, default_ext
=u
'unknown_video'): 
 654     guess 
= url
.partition(u
'?')[0].rpartition(u
'.')[2] 
 655     if re
.match(r
'^[A-Za-z0-9]+$', guess
): 
 660 def date_from_str(date_str
): 
 662     Return a datetime object from a string in the format YYYYMMDD or 
 663     (now|today)[+-][0-9](day|week|month|year)(s)?""" 
 664     today 
= datetime
.date
.today() 
 665     if date_str 
== 'now'or date_str 
== 'today': 
 667     match 
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
) 
 668     if match 
is not None: 
 669         sign 
= match
.group('sign') 
 670         time 
= int(match
.group('time')) 
 673         unit 
= match
.group('unit') 
 682         delta 
= datetime
.timedelta(**{unit
: time
}) 
 684     return datetime
.datetime
.strptime(date_str
, "%Y%m%d").date() 
 686 class DateRange(object): 
 687     """Represents a time interval between two dates""" 
 688     def __init__(self
, start
=None, end
=None): 
 689         """start and end must be strings in the format accepted by date""" 
 690         if start 
is not None: 
 691             self
.start 
= date_from_str(start
) 
 693             self
.start 
= datetime
.datetime
.min.date() 
 695             self
.end 
= date_from_str(end
) 
 697             self
.end 
= datetime
.datetime
.max.date() 
 698         if self
.start 
> self
.end
: 
 699             raise ValueError('Date range: "%s" , the start date must be before the end date' % self
) 
 702         """Returns a range that only contains the given day""" 
 704     def __contains__(self
, date
): 
 705         """Check if the date is in the range""" 
 706         if not isinstance(date
, datetime
.date
): 
 707             date 
= date_from_str(date
) 
 708         return self
.start 
<= date 
<= self
.end
 
 710         return '%s - %s' % ( self
.start
.isoformat(), self
.end
.isoformat())