2 # -*- coding: utf-8 -*-
19 import urllib
.request
as compat_urllib_request
20 except ImportError: # Python 2
21 import urllib2
as compat_urllib_request
24 import urllib
.error
as compat_urllib_error
25 except ImportError: # Python 2
26 import urllib2
as compat_urllib_error
29 import urllib
.parse
as compat_urllib_parse
30 except ImportError: # Python 2
31 import urllib
as compat_urllib_parse
34 from urllib
.parse
import urlparse
as compat_urllib_parse_urlparse
35 except ImportError: # Python 2
36 from urlparse
import urlparse
as compat_urllib_parse_urlparse
39 import urllib
.parse
as compat_urlparse
40 except ImportError: # Python 2
41 import urlparse
as compat_urlparse
44 import http
.cookiejar
as compat_cookiejar
45 except ImportError: # Python 2
46 import cookielib
as compat_cookiejar
49 import html
.entities
as compat_html_entities
50 except ImportError: # Python 2
51 import htmlentitydefs
as compat_html_entities
54 import html
.parser
as compat_html_parser
55 except ImportError: # Python 2
56 import HTMLParser
as compat_html_parser
59 import http
.client
as compat_http_client
60 except ImportError: # Python 2
61 import httplib
as compat_http_client
64 from subprocess
import DEVNULL
65 compat_subprocess_get_DEVNULL
= lambda: DEVNULL
67 compat_subprocess_get_DEVNULL
= lambda: open(os
.path
.devnull
, 'w')
70 from urllib
.parse
import parse_qs
as compat_parse_qs
71 except ImportError: # Python 2
72 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
73 # Python 2's version is apparently totally broken
74 def _unquote(string
, encoding
='utf-8', errors
='replace'):
77 res
= string
.split('%')
84 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
91 pct_sequence
+= item
[:2].decode('hex')
94 # This segment was just a single percent-encoded character.
95 # May be part of a sequence of code units, so delay decoding.
96 # (Stored in pct_sequence).
100 # Encountered non-percent-encoded characters. Flush the current
102 string
+= pct_sequence
.decode(encoding
, errors
) + rest
105 # Flush the final pct_sequence
106 string
+= pct_sequence
.decode(encoding
, errors
)
109 def _parse_qsl(qs
, keep_blank_values
=False, strict_parsing
=False,
110 encoding
='utf-8', errors
='replace'):
111 qs
, _coerce_result
= qs
, unicode
112 pairs
= [s2
for s1
in qs
.split('&') for s2
in s1
.split(';')]
114 for name_value
in pairs
:
115 if not name_value
and not strict_parsing
:
117 nv
= name_value
.split('=', 1)
120 raise ValueError("bad query field: %r" % (name_value
,))
121 # Handle case of a control-name with no equal sign
122 if keep_blank_values
:
126 if len(nv
[1]) or keep_blank_values
:
127 name
= nv
[0].replace('+', ' ')
128 name
= _unquote(name
, encoding
=encoding
, errors
=errors
)
129 name
= _coerce_result(name
)
130 value
= nv
[1].replace('+', ' ')
131 value
= _unquote(value
, encoding
=encoding
, errors
=errors
)
132 value
= _coerce_result(value
)
133 r
.append((name
, value
))
136 def compat_parse_qs(qs
, keep_blank_values
=False, strict_parsing
=False,
137 encoding
='utf-8', errors
='replace'):
139 pairs
= _parse_qsl(qs
, keep_blank_values
, strict_parsing
,
140 encoding
=encoding
, errors
=errors
)
141 for name
, value
in pairs
:
142 if name
in parsed_result
:
143 parsed_result
[name
].append(value
)
145 parsed_result
[name
] = [value
]
149 compat_str
= unicode # Python 2
154 compat_chr
= unichr # Python 2
159 if type(c
) is int: return c
162 # This is not clearly defined otherwise
163 compiled_regex_type
= type(re
.compile(''))
166 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
167 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
168 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
169 'Accept-Encoding': 'gzip, deflate',
170 'Accept-Language': 'en-us,en;q=0.5',
173 def preferredencoding():
174 """Get preferred encoding.
176 Returns the best encoding scheme for the system, based on
177 locale.getpreferredencoding() and some further tweaks.
180 pref
= locale
.getpreferredencoding()
187 if sys
.version_info
< (3,0):
189 print(s
.encode(preferredencoding(), 'xmlcharrefreplace'))
192 assert type(s
) == type(u
'')
195 # In Python 2.x, json.dump expects a bytestream.
196 # In Python 3.x, it writes to a character stream
197 if sys
.version_info
< (3,0):
198 def write_json_file(obj
, fn
):
199 with open(fn
, 'wb') as f
:
202 def write_json_file(obj
, fn
):
203 with open(fn
, 'w', encoding
='utf-8') as f
:
def find_xpath_attr(node, xpath, key, val):
    """Find the first element matching xpath whose attribute key equals val.

    This gives the same result as evaluating ``xpath[@key='val']`` on node,
    but the attribute is compared in Python instead of being interpolated
    into the path expression.  Consequences:

    * one implementation serves every interpreter version (no switch on
      sys.version_info is needed, findall() is all that is required);
    * values containing quotes, digits or other characters cannot produce a
      malformed expression;
    * correctness does not depend on assert statements, which are removed
      when Python runs with -O.

    Returns the matching element, or None if no element matches.
    """
    for candidate in node.findall(xpath):
        if candidate.attrib.get(key) == val:
            return candidate
    return None
220 def htmlentity_transform(matchobj
):
221 """Transforms an HTML entity to a character.
223 This function receives a match object and is intended to be used with
224 the re.sub() function.
226 entity
= matchobj
.group(1)
228 # Known non-numeric HTML entity
229 if entity
in compat_html_entities
.name2codepoint
:
230 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
232 mobj
= re
.match(u
'(?u)#(x?\\d+)', entity
)
234 numstr
= mobj
.group(1)
235 if numstr
.startswith(u
'x'):
237 numstr
= u
'0%s' % numstr
240 return compat_chr(int(numstr
, base
))
242 # Unknown entity in name, return its literal representation
243 return (u
'&%s;' % entity
)
245 compat_html_parser
.locatestarttagend
= re
.compile(r
"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re
.VERBOSE
) # backport bugfix
246 class AttrParser(compat_html_parser
.HTMLParser
):
247 """Modified HTMLParser that isolates a tag with the specified attribute"""
248 def __init__(self
, attribute
, value
):
249 self
.attribute
= attribute
255 self
.watch_startpos
= False
257 compat_html_parser
.HTMLParser
.__init
__(self
)
259 def error(self
, message
):
260 if self
.error_count
> 10 or self
.started
:
261 raise compat_html_parser
.HTMLParseError(message
, self
.getpos())
262 self
.rawdata
= '\n'.join(self
.html
.split('\n')[self
.getpos()[0]:]) # skip one line
263 self
.error_count
+= 1
266 def loads(self
, html
):
271 def handle_starttag(self
, tag
, attrs
):
274 self
.find_startpos(None)
275 if self
.attribute
in attrs
and attrs
[self
.attribute
] == self
.value
:
278 self
.watch_startpos
= True
280 if not tag
in self
.depth
: self
.depth
[tag
] = 0
283 def handle_endtag(self
, tag
):
285 if tag
in self
.depth
: self
.depth
[tag
] -= 1
286 if self
.depth
[self
.result
[0]] == 0:
288 self
.result
.append(self
.getpos())
290 def find_startpos(self
, x
):
291 """Needed to put the start position of the result (self.result[1])
292 after the opening tag with the requested id"""
293 if self
.watch_startpos
:
294 self
.watch_startpos
= False
295 self
.result
.append(self
.getpos())
296 handle_entityref
= handle_charref
= handle_data
= handle_comment
= \
297 handle_decl
= handle_pi
= unknown_decl
= find_startpos
299 def get_result(self
):
300 if self
.result
is None:
302 if len(self
.result
) != 3:
304 lines
= self
.html
.split('\n')
305 lines
= lines
[self
.result
[1][0]-1:self
.result
[2][0]]
306 lines
[0] = lines
[0][self
.result
[1][1]:]
308 lines
[-1] = lines
[-1][:self
.result
[2][1]-self
.result
[1][1]]
309 lines
[-1] = lines
[-1][:self
.result
[2][1]]
310 return '\n'.join(lines
).strip()
311 # Hack for https://github.com/rg3/youtube-dl/issues/662
312 if sys
.version_info
< (2, 7, 3):
313 AttrParser
.parse_endtag
= (lambda self
, i
:
314 i
+ len("</scr'+'ipt>")
315 if self
.rawdata
[i
:].startswith("</scr'+'ipt>")
316 else compat_html_parser
.HTMLParser
.parse_endtag(self
, i
))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # The parameter name shadows the builtin id(); it is kept so that callers
    # passing it by keyword keep working.
    return get_element_by_attribute("id", id, html)
322 def get_element_by_attribute(attribute
, value
, html
):
323 """Return the content of the tag with the specified attribute in the passed HTML document"""
324 parser
= AttrParser(attribute
, value
)
327 except compat_html_parser
.HTMLParseError
:
329 return parser
.get_result()
332 def clean_html(html
):
333 """Clean an HTML snippet into a readable string"""
335 html
= html
.replace('\n', ' ')
336 html
= re
.sub(r
'\s*<\s*br\s*/?\s*>\s*', '\n', html
)
337 html
= re
.sub(r
'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html
)
339 html
= re
.sub('<.*?>', '', html
)
340 # Replace html entities
341 html
= unescapeHTML(html
)
345 def sanitize_open(filename
, open_mode
):
346 """Try to open the given filename, and slightly tweak it if this fails.
348 Attempts to open the given filename. If this fails, it tries to change
349 the filename slightly, step by step, until it's either able to open it
350 or it fails and raises a final exception, like the standard open()
353 It returns the tuple (stream, definitive_file_name).
357 if sys
.platform
== 'win32':
359 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
360 return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
)
361 stream
= open(encodeFilename(filename
), open_mode
)
362 return (stream
, filename
)
363 except (IOError, OSError) as err
:
364 if err
.errno
in (errno
.EACCES
,):
367 # In case of error, try to remove win32 forbidden chars
368 alt_filename
= os
.path
.join(
369 re
.sub(u
'[/<>:"\\|\\\\?\\*]', u
'#', path_part
)
370 for path_part
in os
.path
.split(filename
)
372 if alt_filename
== filename
:
375 # An exception here should be caught in the caller
376 stream
= open(encodeFilename(filename
), open_mode
)
377 return (stream
, alt_filename
)
380 def timeconvert(timestr
):
381 """Convert RFC 2822 defined time string into system timestamp"""
383 timetuple
= email
.utils
.parsedate_tz(timestr
)
384 if timetuple
is not None:
385 timestamp
= email
.utils
.mktime_tz(timetuple
)
388 def sanitize_filename(s
, restricted
=False, is_id
=False):
389 """Sanitizes a string so it could be used as part of a filename.
390 If restricted is set, use a stricter subset of allowed characters.
391 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
393 def replace_insane(char
):
394 if char
== '?' or ord(char
) < 32 or ord(char
) == 127:
397 return '' if restricted
else '\''
399 return '_-' if restricted
else ' -'
400 elif char
in '\\/|*<>':
402 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace()):
404 if restricted
and ord(char
) > 127:
408 result
= u
''.join(map(replace_insane
, s
))
410 while '__' in result
:
411 result
= result
.replace('__', '_')
412 result
= result
.strip('_')
413 # Common case of "Foreign band name - English song title"
414 if restricted
and result
.startswith('-_'):
420 def orderedSet(iterable
):
421 """ Remove all duplicates from the input iterable """
432 assert type(s
) == type(u
'')
434 result
= re
.sub(u
'(?u)&(.+?);', htmlentity_transform
, s
)
437 def encodeFilename(s
):
439 @param s The name of the file
442 assert type(s
) == type(u
'')
444 # Python 3 has a Unicode API
445 if sys
.version_info
>= (3, 0):
448 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
449 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
450 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
451 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
454 encoding
= sys
.getfilesystemencoding()
457 return s
.encode(encoding
, 'ignore')
459 def decodeOption(optval
):
462 if isinstance(optval
, bytes):
463 optval
= optval
.decode(preferredencoding())
465 assert isinstance(optval
, compat_str
)
468 def formatSeconds(secs
):
470 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
472 return '%d:%02d' % (secs
// 60, secs
% 60)
def make_HTTPS_handler(opts):
    """Create the HTTPS handler to install in the URL opener.

    opts must provide a no_check_certificate attribute; when it is true the
    server certificate is not verified, otherwise verification is required.
    """
    if sys.version_info < (3,2):
        # Python's 2.x handler is very simplistic
        return compat_urllib_request.HTTPSHandler()

    # Imported here because the module is only needed on this code path.
    import ssl
    context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
    context.set_default_verify_paths()
    if opts.no_check_certificate:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    return compat_urllib_request.HTTPSHandler(context=context)
490 class ExtractorError(Exception):
491 """Error during info extraction."""
492 def __init__(self
, msg
, tb
=None, expected
=False):
493 """ tb, if given, is the original traceback (so that it can be printed out).
494 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
497 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
500 msg
= msg
+ u
'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
501 super(ExtractorError
, self
).__init
__(msg
)
504 self
.exc_info
= sys
.exc_info() # preserve original exception
506 def format_traceback(self
):
507 if self
.traceback
is None:
509 return u
''.join(traceback
.format_tb(self
.traceback
))
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. The message given to the constructor
    is the exception's message.

    Attributes:
        exc_info: the sys.exc_info() value describing the original exception
            that caused the trouble, or None if none was supplied.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
534 class PostProcessingError(Exception):
535 """Post Processing exception.
537 This exception may be raised by PostProcessor's .run() method to
538 indicate an error in the postprocessing task.
540 def __init__(self
, msg
):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
557 class ContentTooShortError(Exception):
558 """Content Too Short exception.
560 This exception may be raised by FileDownloader objects when a file they
561 download is too small for what the server announced first, indicating
562 the connection was probably interrupted.
568 def __init__(self
, downloaded
, expected
):
569 self
.downloaded
= downloaded
570 self
.expected
= expected
572 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
573 """Handler for HTTP requests and responses.
575 This class, when installed with an OpenerDirector, automatically adds
576 the standard headers to every HTTP request and handles gzipped and
577 deflated responses from web servers. If compression is to be avoided in
578 a particular request, the original request in the program code only has
579 to include the HTTP header "Youtubedl-No-Compression", which will be
580 removed before making the real request.
582 Part of this code was copied from:
584 http://techknack.net/python-urllib2-handlers/
586 Andrew Rowls, the author of that code, agreed to release it to the
593 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
595 return zlib
.decompress(data
)
598 def addinfourl_wrapper(stream
, headers
, url
, code
):
599 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
600 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
601 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
605 def http_request(self
, req
):
606 for h
,v
in std_headers
.items():
610 if 'Youtubedl-no-compression' in req
.headers
:
611 if 'Accept-encoding' in req
.headers
:
612 del req
.headers
['Accept-encoding']
613 del req
.headers
['Youtubedl-no-compression']
614 if 'Youtubedl-user-agent' in req
.headers
:
615 if 'User-agent' in req
.headers
:
616 del req
.headers
['User-agent']
617 req
.headers
['User-agent'] = req
.headers
['Youtubedl-user-agent']
618 del req
.headers
['Youtubedl-user-agent']
621 def http_response(self
, req
, resp
):
624 if resp
.headers
.get('Content-encoding', '') == 'gzip':
625 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(resp
.read()), mode
='r')
626 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
627 resp
.msg
= old_resp
.msg
629 if resp
.headers
.get('Content-encoding', '') == 'deflate':
630 gz
= io
.BytesIO(self
.deflate(resp
.read()))
631 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
632 resp
.msg
= old_resp
.msg
635 https_request
= http_request
636 https_response
= http_response
638 def unified_strdate(date_str
):
639 """Return a string with the date in the format YYYYMMDD"""
642 date_str
= date_str
.replace(',',' ')
643 # %z (UTC offset) is only supported in python>=3.2
644 date_str
= re
.sub(r
' (\+|-)[\d]*$', '', date_str
)
645 format_expressions
= ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
646 for expression
in format_expressions
:
648 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from a URL.

    Everything from the first '?' on is dropped, and the text after the last
    '.' of what remains is taken as the candidate extension. The candidate
    is returned only if it consists solely of ASCII letters and digits;
    otherwise default_ext is returned. default_ext is also returned when url
    is None, so callers do not have to special-case a missing URL.
    """
    if url is None:
        return default_ext
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    return default_ext
660 def date_from_str(date_str
):
662 Return a datetime object from a string in the format YYYYMMDD or
663 (now|today)[+-][0-9](day|week|month|year)(s)?"""
664 today
= datetime
.date
.today()
665 if date_str
== 'now'or date_str
== 'today':
667 match
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
668 if match
is not None:
669 sign
= match
.group('sign')
670 time
= int(match
.group('time'))
673 unit
= match
.group('unit')
682 delta
= datetime
.timedelta(**{unit
: time
})
684 return datetime
.datetime
.strptime(date_str
, "%Y%m%d").date()
686 class DateRange(object):
687 """Represents a time interval between two dates"""
688 def __init__(self
, start
=None, end
=None):
689 """start and end must be strings in the format accepted by date"""
690 if start
is not None:
691 self
.start
= date_from_str(start
)
693 self
.start
= datetime
.datetime
.min.date()
695 self
.end
= date_from_str(end
)
697 self
.end
= datetime
.datetime
.max.date()
698 if self
.start
> self
.end
:
699 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
702 """Returns a range that only contains the given day"""
704 def __contains__(self
, date
):
705 """Check if the date is in the range"""
706 if not isinstance(date
, datetime
.date
):
707 date
= date_from_str(date
)
708 return self
.start
<= date
<= self
.end
710 return '%s - %s' % ( self
.start
.isoformat(), self
.end
.isoformat())