2 # -*- coding: utf-8 -*-
19 import urllib
.request
as compat_urllib_request
20 except ImportError: # Python 2
21 import urllib2
as compat_urllib_request
24 import urllib
.error
as compat_urllib_error
25 except ImportError: # Python 2
26 import urllib2
as compat_urllib_error
29 import urllib
.parse
as compat_urllib_parse
30 except ImportError: # Python 2
31 import urllib
as compat_urllib_parse
34 from urllib
.parse
import urlparse
as compat_urllib_parse_urlparse
35 except ImportError: # Python 2
36 from urlparse
import urlparse
as compat_urllib_parse_urlparse
39 import http
.cookiejar
as compat_cookiejar
40 except ImportError: # Python 2
41 import cookielib
as compat_cookiejar
44 import html
.entities
as compat_html_entities
45 except ImportError: # Python 2
46 import htmlentitydefs
as compat_html_entities
49 import html
.parser
as compat_html_parser
50 except ImportError: # Python 2
51 import HTMLParser
as compat_html_parser
54 import http
.client
as compat_http_client
55 except ImportError: # Python 2
56 import httplib
as compat_http_client
59 from subprocess
import DEVNULL
60 compat_subprocess_get_DEVNULL
= lambda: DEVNULL
62 compat_subprocess_get_DEVNULL
= lambda: open(os
.path
.devnull
, 'w')
65 from urllib
.parse
import parse_qs
as compat_parse_qs
66 except ImportError: # Python 2
67 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
68 # Python 2's version is apparently totally broken
69 def _unquote(string
, encoding
='utf-8', errors
='replace'):
72 res
= string
.split('%')
79 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
86 pct_sequence
+= item
[:2].decode('hex')
89 # This segment was just a single percent-encoded character.
90 # May be part of a sequence of code units, so delay decoding.
91 # (Stored in pct_sequence).
95 # Encountered non-percent-encoded characters. Flush the current
97 string
+= pct_sequence
.decode(encoding
, errors
) + rest
100 # Flush the final pct_sequence
101 string
+= pct_sequence
.decode(encoding
, errors
)
104 def _parse_qsl(qs
, keep_blank_values
=False, strict_parsing
=False,
105 encoding
='utf-8', errors
='replace'):
106 qs
, _coerce_result
= qs
, unicode
107 pairs
= [s2
for s1
in qs
.split('&') for s2
in s1
.split(';')]
109 for name_value
in pairs
:
110 if not name_value
and not strict_parsing
:
112 nv
= name_value
.split('=', 1)
115 raise ValueError("bad query field: %r" % (name_value
,))
116 # Handle case of a control-name with no equal sign
117 if keep_blank_values
:
121 if len(nv
[1]) or keep_blank_values
:
122 name
= nv
[0].replace('+', ' ')
123 name
= _unquote(name
, encoding
=encoding
, errors
=errors
)
124 name
= _coerce_result(name
)
125 value
= nv
[1].replace('+', ' ')
126 value
= _unquote(value
, encoding
=encoding
, errors
=errors
)
127 value
= _coerce_result(value
)
128 r
.append((name
, value
))
131 def compat_parse_qs(qs
, keep_blank_values
=False, strict_parsing
=False,
132 encoding
='utf-8', errors
='replace'):
134 pairs
= _parse_qsl(qs
, keep_blank_values
, strict_parsing
,
135 encoding
=encoding
, errors
=errors
)
136 for name
, value
in pairs
:
137 if name
in parsed_result
:
138 parsed_result
[name
].append(value
)
140 parsed_result
[name
] = [value
]
144 compat_str
= unicode # Python 2
149 compat_chr
= unichr # Python 2
154 if type(c
) is int: return c
157 # This is not clearly defined otherwise
158 compiled_regex_type
= type(re
.compile(''))
161 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
162 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
163 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
164 'Accept-Encoding': 'gzip, deflate',
165 'Accept-Language': 'en-us,en;q=0.5',
168 def preferredencoding():
169 """Get preferred encoding.
171 Returns the best encoding scheme for the system, based on
172 locale.getpreferredencoding() and some further tweaks.
175 pref
= locale
.getpreferredencoding()
182 if sys
.version_info
< (3,0):
184 print(s
.encode(preferredencoding(), 'xmlcharrefreplace'))
187 assert type(s
) == type(u
'')
190 # In Python 2.x, json.dump expects a bytestream.
191 # In Python 3.x, it writes to a character stream
192 if sys
.version_info
< (3,0):
193 def write_json_file(obj
, fn
):
194 with open(fn
, 'wb') as f
:
197 def write_json_file(obj
, fn
):
198 with open(fn
, 'w', encoding
='utf-8') as f
:
def htmlentity_transform(matchobj):
    """Transform an HTML entity into the character it stands for.

    This function receives a match object and is intended to be used with
    the re.sub() function; group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities are looked up in compat_html_entities.name2codepoint.
    Numeric references are accepted in decimal ('#38') and hexadecimal
    ('#x2F') form. Anything that cannot be resolved is returned in its
    literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference. The hexadecimal alternative has to accept
    # the digits a-f as well; otherwise '#x2F' is cut short to '#x2' and
    # decodes to the wrong character.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            # int() wants the full '0x...' prefix when base is 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # The number is not a code point the interpreter can represent;
            # fall through and hand the entity back untouched.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
226 compat_html_parser
.locatestarttagend
= re
.compile(r
"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re
.VERBOSE
) # backport bugfix
227 class AttrParser(compat_html_parser
.HTMLParser
):
228 """Modified HTMLParser that isolates a tag with the specified attribute"""
229 def __init__(self
, attribute
, value
):
230 self
.attribute
= attribute
236 self
.watch_startpos
= False
238 compat_html_parser
.HTMLParser
.__init
__(self
)
240 def error(self
, message
):
241 if self
.error_count
> 10 or self
.started
:
242 raise compat_html_parser
.HTMLParseError(message
, self
.getpos())
243 self
.rawdata
= '\n'.join(self
.html
.split('\n')[self
.getpos()[0]:]) # skip one line
244 self
.error_count
+= 1
247 def loads(self
, html
):
252 def handle_starttag(self
, tag
, attrs
):
255 self
.find_startpos(None)
256 if self
.attribute
in attrs
and attrs
[self
.attribute
] == self
.value
:
259 self
.watch_startpos
= True
261 if not tag
in self
.depth
: self
.depth
[tag
] = 0
264 def handle_endtag(self
, tag
):
266 if tag
in self
.depth
: self
.depth
[tag
] -= 1
267 if self
.depth
[self
.result
[0]] == 0:
269 self
.result
.append(self
.getpos())
271 def find_startpos(self
, x
):
272 """Needed to put the start position of the result (self.result[1])
273 after the opening tag with the requested id"""
274 if self
.watch_startpos
:
275 self
.watch_startpos
= False
276 self
.result
.append(self
.getpos())
277 handle_entityref
= handle_charref
= handle_data
= handle_comment
= \
278 handle_decl
= handle_pi
= unknown_decl
= find_startpos
280 def get_result(self
):
281 if self
.result
is None:
283 if len(self
.result
) != 3:
285 lines
= self
.html
.split('\n')
286 lines
= lines
[self
.result
[1][0]-1:self
.result
[2][0]]
287 lines
[0] = lines
[0][self
.result
[1][1]:]
289 lines
[-1] = lines
[-1][:self
.result
[2][1]-self
.result
[1][1]]
290 lines
[-1] = lines
[-1][:self
.result
[2][1]]
291 return '\n'.join(lines
).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _parse_endtag_compat(self, i):
        # A literal "</scr'+'ipt>" at position i is stepped over as a whole;
        # everything else goes to the stock HTMLParser implementation.
        needle = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(needle):
            return i + len(needle)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _parse_endtag_compat
    # Keep the module namespace free of the temporary helper name.
    del _parse_endtag_compat
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals the given ID.

    Thin wrapper: the lookup in the passed HTML document is delegated to
    get_element_by_attribute() with the attribute name fixed to "id".
    """
    attribute_name = "id"
    return get_element_by_attribute(attribute_name, id, html)
303 def get_element_by_attribute(attribute
, value
, html
):
304 """Return the content of the tag with the specified attribute in the passed HTML document"""
305 parser
= AttrParser(attribute
, value
)
308 except compat_html_parser
.HTMLParseError
:
310 return parser
.get_result()
313 def clean_html(html
):
314 """Clean an HTML snippet into a readable string"""
316 html
= html
.replace('\n', ' ')
317 html
= re
.sub(r
'\s*<\s*br\s*/?\s*>\s*', '\n', html
)
318 html
= re
.sub(r
'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html
)
320 html
= re
.sub('<.*?>', '', html
)
321 # Replace html entities
322 html
= unescapeHTML(html
)
326 def sanitize_open(filename
, open_mode
):
327 """Try to open the given filename, and slightly tweak it if this fails.
329 Attempts to open the given filename. If this fails, it tries to change
330 the filename slightly, step by step, until it's either able to open it
331 or it fails and raises a final exception, like the standard open()
334 It returns the tuple (stream, definitive_file_name).
338 if sys
.platform
== 'win32':
340 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
341 return (sys
.stdout
.buffer if hasattr(sys
.stdout
, 'buffer') else sys
.stdout
, filename
)
342 stream
= open(encodeFilename(filename
), open_mode
)
343 return (stream
, filename
)
344 except (IOError, OSError) as err
:
345 if err
.errno
in (errno
.EACCES
,):
348 # In case of error, try to remove win32 forbidden chars
349 alt_filename
= os
.path
.join(
350 re
.sub(u
'[/<>:"\\|\\\\?\\*]', u
'#', path_part
)
351 for path_part
in os
.path
.split(filename
)
353 if alt_filename
== filename
:
356 # An exception here should be caught in the caller
357 stream
= open(encodeFilename(filename
), open_mode
)
358 return (stream
, alt_filename
)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the timestamp computed by email.utils.mktime_tz(), or None when
    email.utils.parsedate_tz() cannot parse the string.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
369 def sanitize_filename(s
, restricted
=False, is_id
=False):
370 """Sanitizes a string so it could be used as part of a filename.
371 If restricted is set, use a stricter subset of allowed characters.
372 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
374 def replace_insane(char
):
375 if char
== '?' or ord(char
) < 32 or ord(char
) == 127:
378 return '' if restricted
else '\''
380 return '_-' if restricted
else ' -'
381 elif char
in '\\/|*<>':
383 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace()):
385 if restricted
and ord(char
) > 127:
389 result
= u
''.join(map(replace_insane
, s
))
391 while '__' in result
:
392 result
= result
.replace('__', '_')
393 result
= result
.strip('_')
394 # Common case of "Foreign band name - English song title"
395 if restricted
and result
.startswith('-_'):
401 def orderedSet(iterable
):
402 """ Remove all duplicates from the input iterable """
413 assert type(s
) == type(u
'')
415 result
= re
.sub(u
'(?u)&(.+?);', htmlentity_transform
, s
)
418 def encodeFilename(s
):
420 @param s The name of the file
423 assert type(s
) == type(u
'')
425 # Python 3 has a Unicode API
426 if sys
.version_info
>= (3, 0):
429 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
430 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
431 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
432 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
435 encoding
= sys
.getfilesystemencoding()
438 return s
.encode(encoding
, 'ignore')
440 def decodeOption(optval
):
443 if isinstance(optval
, bytes):
444 optval
= optval
.decode(preferredencoding())
446 assert isinstance(optval
, compat_str
)
449 def formatSeconds(secs
):
451 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
453 return '%d:%02d' % (secs
// 60, secs
% 60)
def make_HTTPS_handler(opts):
    """Build the HTTPS handler to install in the URL opener.

    opts must expose a no_check_certificate attribute. When it is true the
    SSL context is created with certificate verification switched off
    (ssl.CERT_NONE); otherwise verification is required (ssl.CERT_REQUIRED).
    Interpreters older than 3.2 get a plain HTTPSHandler without a context.
    """
    if sys.version_info >= (3, 2):
        import ssl

        ssl_context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        ssl_context.set_default_verify_paths()

        if opts.no_check_certificate:
            ssl_context.verify_mode = ssl.CERT_NONE
        else:
            ssl_context.verify_mode = ssl.CERT_REQUIRED
        return compat_urllib_request.HTTPSHandler(context=ssl_context)

    # Python's 2.x handler is very simplistic
    return compat_urllib_request.HTTPSHandler()
471 class ExtractorError(Exception):
472 """Error during info extraction."""
473 def __init__(self
, msg
, tb
=None, expected
=False):
474 """ tb, if given, is the original traceback (so that it can be printed out).
475 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
478 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
481 msg
= msg
+ u
'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output.'
482 super(ExtractorError
, self
).__init
__(msg
)
485 self
.exc_info
= sys
.exc_info() # preserve original exception
487 def format_traceback(self
):
488 if self
.traceback
is None:
490 return u
''.join(traceback
.format_tb(self
.traceback
))
class DownloadError(Exception):
    """Error raised when a download fails.

    FileDownloader objects may throw this exception if they are not
    configured to continue on errors. The instance carries the error
    message and, optionally, the exception information that caused it.
    """

    def __init__(self, msg, exc_info=None):
        """Store the message and the originating exception, if any.

        exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info()).
        """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
506 class SameFileError(Exception):
507 """Same File exception.
509 This exception will be thrown by FileDownloader objects if they detect
510 multiple files would have to be downloaded to the same file on disk.
515 class PostProcessingError(Exception):
516 """Post Processing exception.
518 This exception may be raised by PostProcessor's .run() method to
519 indicate an error in the postprocessing task.
521 def __init__(self
, msg
):
524 class MaxDownloadsReached(Exception):
525 """ --max-downloads limit has been reached. """
529 class UnavailableVideoError(Exception):
530 """Unavailable Format exception.
532 This exception will be thrown when a video is requested
533 in a format that is not available for that video.
538 class ContentTooShortError(Exception):
539 """Content Too Short exception.
541 This exception may be raised by FileDownloader objects when a file they
542 download is too small for what the server announced first, indicating
543 the connection was probably interrupted.
549 def __init__(self
, downloaded
, expected
):
550 self
.downloaded
= downloaded
551 self
.expected
= expected
553 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
554 """Handler for HTTP requests and responses.
556 This class, when installed with an OpenerDirector, automatically adds
557 the standard headers to every HTTP request and handles gzipped and
558 deflated responses from web servers. If compression is to be avoided in
559 a particular request, the original request in the program code only has
560 to include the HTTP header "Youtubedl-No-Compression", which will be
561 removed before making the real request.
563 Part of this code was copied from:
565 http://techknack.net/python-urllib2-handlers/
567 Andrew Rowls, the author of that code, agreed to release it to the
574 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
576 return zlib
.decompress(data
)
579 def addinfourl_wrapper(stream
, headers
, url
, code
):
580 if hasattr(compat_urllib_request
.addinfourl
, 'getcode'):
581 return compat_urllib_request
.addinfourl(stream
, headers
, url
, code
)
582 ret
= compat_urllib_request
.addinfourl(stream
, headers
, url
)
586 def http_request(self
, req
):
587 for h
,v
in std_headers
.items():
591 if 'Youtubedl-no-compression' in req
.headers
:
592 if 'Accept-encoding' in req
.headers
:
593 del req
.headers
['Accept-encoding']
594 del req
.headers
['Youtubedl-no-compression']
595 if 'Youtubedl-user-agent' in req
.headers
:
596 if 'User-agent' in req
.headers
:
597 del req
.headers
['User-agent']
598 req
.headers
['User-agent'] = req
.headers
['Youtubedl-user-agent']
599 del req
.headers
['Youtubedl-user-agent']
602 def http_response(self
, req
, resp
):
605 if resp
.headers
.get('Content-encoding', '') == 'gzip':
606 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(resp
.read()), mode
='r')
607 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
608 resp
.msg
= old_resp
.msg
610 if resp
.headers
.get('Content-encoding', '') == 'deflate':
611 gz
= io
.BytesIO(self
.deflate(resp
.read()))
612 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
613 resp
.msg
= old_resp
.msg
616 https_request
= http_request
617 https_response
= http_response
619 def unified_strdate(date_str
):
620 """Return a string with the date in the format YYYYMMDD"""
623 date_str
= date_str
.replace(',',' ')
624 # %z (UTC offset) is only supported in python>=3.2
625 date_str
= re
.sub(r
' (\+|-)[\d]*$', '', date_str
)
626 format_expressions
= ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
627 for expression
in format_expressions
:
629 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
def determine_ext(url):
    """Guess the file extension from a URL.

    Everything from the first '?' onwards is ignored, and the text after the
    last '.' of what remains is taken as the candidate. The candidate is
    returned only when it is purely alphanumeric; otherwise
    u'unknown_video' is returned.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate) is None:
        return u'unknown_video'
    return candidate
641 def date_from_str(date_str
):
643 Return a datetime object from a string in the format YYYYMMDD or
644 (now|today)[+-][0-9](day|week|month|year)(s)?"""
645 today
= datetime
.date
.today()
646 if date_str
== 'now'or date_str
== 'today':
648 match
= re
.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str
)
649 if match
is not None:
650 sign
= match
.group('sign')
651 time
= int(match
.group('time'))
654 unit
= match
.group('unit')
663 delta
= datetime
.timedelta(**{unit
: time
})
665 return datetime
.datetime
.strptime(date_str
, "%Y%m%d").date()
667 class DateRange(object):
668 """Represents a time interval between two dates"""
669 def __init__(self
, start
=None, end
=None):
670 """start and end must be strings in the format accepted by date"""
671 if start
is not None:
672 self
.start
= date_from_str(start
)
674 self
.start
= datetime
.datetime
.min.date()
676 self
.end
= date_from_str(end
)
678 self
.end
= datetime
.datetime
.max.date()
679 if self
.start
> self
.end
:
680 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
683 """Returns a range that only contains the given day"""
685 def __contains__(self
, date
):
686 """Check if the date is in the range"""
687 if not isinstance(date
, datetime
.date
):
688 date
= date_from_str(date
)
689 return self
.start
<= date
<= self
.end
691 return '%s - %s' % ( self
.start
.isoformat(), self
.end
.isoformat())