X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/de5b9c36a58e493e6f8c8c3694370e477da74f94..e2d44fac99aa7cfc6e1d379794b5bda76347965d:/youtube-dl?ds=sidebyside diff --git a/youtube-dl b/youtube-dl index a8e3bd3..818a3a7 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1,13 +1,32 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Author: Ricardo Garcia Gonzalez -# Author: Danny Colligan -# Author: Benjamin Johnson -# Author: Vasyl' Vavrychuk -# License: Public domain code + +__authors__ = ( + 'Ricardo Garcia Gonzalez', + 'Danny Colligan', + 'Benjamin Johnson', + 'Vasyl\' Vavrychuk', + 'Witold Baryluk', + 'PaweŠPaprota', + 'Gergely Imreh', + 'Rogério Brito', + 'Philipp Hagemeister', + 'Sören Schulze', + 'Kevin Ngo', + 'Ori Avtalion', + 'shizeeg', + ) + +__license__ = 'Public Domain' +__version__ = '2012.01.05' + +UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' + import cookielib import datetime +import gzip import htmlentitydefs +import HTMLParser import httplib import locale import math @@ -22,6 +41,20 @@ import sys import time import urllib import urllib2 +import warnings +import zlib + +if os.name == 'nt': + import ctypes + +try: + import email.utils +except ImportError: # Python 2.4 + import email.Utils +try: + import cStringIO as StringIO +except ImportError: + import StringIO # parse_qs was moved from the cgi module to the urlparse module recently. try: @@ -29,14 +62,136 @@ try: except ImportError: from cgi import parse_qs +try: + import lxml.etree +except ImportError: + pass # Handled below + +try: + import xml.etree.ElementTree +except ImportError: # Python<2.5: Not officially supported, but let it slip + warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.') + std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-us,en;q=0.5', } -simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii') +try: + import json +except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson): + import re + class json(object): + @staticmethod + def loads(s): + s = s.decode('UTF-8') + def raiseError(msg, i): + raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:])) + def skipSpace(i, expectMore=True): + while i < len(s) and s[i] in ' \t\r\n': + i += 1 + if expectMore: + if i >= len(s): + raiseError('Premature end', i) + return i + def decodeEscape(match): + esc = match.group(1) + _STATIC = { + '"': '"', + '\\': '\\', + '/': '/', + 'b': unichr(0x8), + 'f': unichr(0xc), + 'n': '\n', + 'r': '\r', + 't': '\t', + } + if esc in _STATIC: + return _STATIC[esc] + if esc[0] == 'u': + if len(esc) == 1+4: + return unichr(int(esc[1:5], 16)) + if len(esc) == 5+6 and esc[5:7] == '\\u': + hi = int(esc[1:5], 16) + low = int(esc[7:11], 16) + return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000) + raise ValueError('Unknown escape ' + str(esc)) + def parseString(i): + i += 1 + e = i + while True: + e = s.index('"', e) + bslashes = 0 + while s[e-bslashes-1] == '\\': + bslashes += 1 + if bslashes % 2 == 1: + e += 1 + continue + break + rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)') + stri = rexp.sub(decodeEscape, s[i:e]) + return (e+1,stri) + def parseObj(i): + i += 1 + res = {} + i = skipSpace(i) + if s[i] == '}': # Empty dictionary + return (i+1,res) + while True: + if s[i] != '"': + raiseError('Expected a string object key', i) + i,key = parseString(i) + i = skipSpace(i) + if i >= len(s) or s[i] != ':': + raiseError('Expected a colon', i) + i,val = parse(i+1) + res[key] = val + i = skipSpace(i) + if s[i] == '}': + return (i+1, res) + if s[i] != ',': + raiseError('Expected comma or closing curly brace', i) + i = skipSpace(i+1) + def parseArray(i): + res = [] + i = skipSpace(i+1) + if s[i] == ']': # Empty array + return (i+1,res) + while True: + i,val = parse(i) + res.append(val) + i = skipSpace(i) # Raise exception if premature end + if s[i] == ']': + return (i+1, res) + if s[i] != ',': + raiseError('Expected a comma or closing bracket', i) + i = skipSpace(i+1) + def parseDiscrete(i): + for k,v in {'true': True, 'false': False, 'null': None}.items(): + if s.startswith(k, i): + return (i+len(k), v) + raiseError('Not a boolean (or null)', i) + def parseNumber(i): + mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:]) + if mobj is None: + raiseError('Not a number', i) + nums = mobj.group(1) + if '.' in nums or 'e' in nums or 'E' in nums: + return (i+len(nums), float(nums)) + return (i+len(nums), int(nums)) + CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete} + def parse(i): + i = skipSpace(i) + i,res = CHARMAP.get(s[i], parseNumber)(i) + i = skipSpace(i, False) + return (i,res) + i,res = parse(0) + if i < len(s): + raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')') + return res def preferredencoding(): """Get preferred encoding. @@ -54,9 +209,10 @@ def preferredencoding(): yield pref return yield_preferredencoding().next() + def htmlentity_transform(matchobj): """Transforms an HTML entity to a Unicode character. - + This function receives a match object and is intended to be used with the re.sub() function. """ @@ -80,11 +236,13 @@ def htmlentity_transform(matchobj): # Unknown entity in name, return its literal representation return (u'&%s;' % entity) + def sanitize_title(utitle): """Sanitizes a video title so it could be used as part of a filename.""" utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle) return utitle.replace(unicode(os.sep), u'%') + def sanitize_open(filename, open_mode): """Try to open the given filename, and slightly tweak it if this fails. @@ -111,15 +269,46 @@ def sanitize_open(filename, open_mode): stream = open(filename, open_mode) return (stream, filename) + +def timeconvert(timestr): + """Convert RFC 2822 defined time string into system timestamp""" + timestamp = None + timetuple = email.utils.parsedate_tz(timestr) + if timetuple is not None: + timestamp = email.utils.mktime_tz(timetuple) + return timestamp + +def _simplify_title(title): + expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE) + return expr.sub(u'_', title).strip(u'_') + +def _orderedSet(iterable): + """ Remove all duplicates from the input iterable """ + res = [] + for el in iterable: + if el not in res: + res.append(el) + return res + +def _unescapeHTML(s): + """ + @param s a string (of type unicode) + """ + assert type(s) == type(u'') + + htmlParser = HTMLParser.HTMLParser() + return htmlParser.unescape(s) + class DownloadError(Exception): """Download Error exception. - + This exception may be thrown by FileDownloader objects if they are not configured to continue on errors. They will contain the appropriate error message. """ pass + class SameFileError(Exception): """Same File exception. @@ -128,6 +317,7 @@ class SameFileError(Exception): """ pass + class PostProcessingError(Exception): """Post Processing exception. @@ -136,6 +326,11 @@ class PostProcessingError(Exception): """ pass +class MaxDownloadsReached(Exception): + """ --max-downloads limit has been reached. """ + pass + + class UnavailableVideoError(Exception): """Unavailable Format exception. @@ -144,6 +339,7 @@ class UnavailableVideoError(Exception): """ pass + class ContentTooShortError(Exception): """Content Too Short exception. @@ -159,6 +355,66 @@ class ContentTooShortError(Exception): self.downloaded = downloaded self.expected = expected + +class YoutubeDLHandler(urllib2.HTTPHandler): + """Handler for HTTP requests and responses. + + This class, when installed with an OpenerDirector, automatically adds + the standard headers to every HTTP request and handles gzipped and + deflated responses from web servers. If compression is to be avoided in + a particular request, the original request in the program code only has + to include the HTTP header "Youtubedl-No-Compression", which will be + removed before making the real request. + + Part of this code was copied from: + + http://techknack.net/python-urllib2-handlers/ + + Andrew Rowls, the author of that code, agreed to release it to the + public domain. + """ + + @staticmethod + def deflate(data): + try: + return zlib.decompress(data, -zlib.MAX_WBITS) + except zlib.error: + return zlib.decompress(data) + + @staticmethod + def addinfourl_wrapper(stream, headers, url, code): + if hasattr(urllib2.addinfourl, 'getcode'): + return urllib2.addinfourl(stream, headers, url, code) + ret = urllib2.addinfourl(stream, headers, url) + ret.code = code + return ret + + def http_request(self, req): + for h in std_headers: + if h in req.headers: + del req.headers[h] + req.add_header(h, std_headers[h]) + if 'Youtubedl-no-compression' in req.headers: + if 'Accept-encoding' in req.headers: + del req.headers['Accept-encoding'] + del req.headers['Youtubedl-no-compression'] + return req + + def http_response(self, req, resp): + old_resp = resp + # gzip + if resp.headers.get('Content-encoding', '') == 'gzip': + gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r') + resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + # deflate + if resp.headers.get('Content-encoding', '') == 'deflate': + gz = StringIO.StringIO(self.deflate(resp.read())) + resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + return resp + + class FileDownloader(object): """File Downloader class. @@ -194,6 +450,7 @@ class FileDownloader(object): forcetitle: Force printing title. forcethumbnail: Force printing thumbnail URL. forcedescription: Force printing description. + forcefilename: Force printing final filename. simulate: Do not download the video files. format: Video format code. format_limit: Highest quality format to try. @@ -206,7 +463,14 @@ class FileDownloader(object): noprogress: Do not print the progress bar. playliststart: Playlist item to start at. playlistend: Playlist item to end at. + matchtitle: Download only matching titles. + rejecttitle: Reject downloads for matching titles. logtostderr: Log messages to stderr instead of stdout. + consoletitle: Display progress in console window's titlebar. + nopart: Do not use temporary .part files. + updatetime: Use the Last-modified header to set output file timestamps. + writedescription: Write the video description to a .description file + writeinfojson: Write the video description to a .info.json file """ params = None @@ -224,24 +488,7 @@ class FileDownloader(object): self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] self.params = params - - @staticmethod - def pmkdir(filename): - """Create directory components in filename. Similar to Unix "mkdir -p".""" - components = filename.split(os.sep) - aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))] - aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator - for dir in aggregate: - if not os.path.exists(dir): - os.mkdir(dir) - - @staticmethod - def temp_name(filename): - """Returns a temporary filename for the given filename.""" - if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)): - return filename - return filename + u'.part' - + @staticmethod def format_bytes(bytes): if bytes is None: @@ -253,7 +500,7 @@ class FileDownloader(object): else: exponent = long(math.log(bytes, 1024.0)) suffix = 'bkMGTPEZY'[exponent] - converted = float(bytes) / float(1024**exponent) + converted = float(bytes) / float(1024 ** exponent) return '%.2f%s' % (converted, suffix) @staticmethod @@ -310,12 +557,12 @@ class FileDownloader(object): """Add an InfoExtractor object to the end of the list.""" self._ies.append(ie) ie.set_downloader(self) - + def add_post_processor(self, pp): """Add a PostProcessor object to the end of the chain.""" self._pps.append(pp) pp.set_downloader(self) - + def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False): """Print message to stdout if not in quiet mode.""" try: @@ -326,11 +573,22 @@ class FileDownloader(object): except (UnicodeEncodeError), err: if not ignore_encoding_errors: raise - + def to_stderr(self, message): """Print message to stderr.""" print >>sys.stderr, message.encode(preferredencoding()) - + + def to_cons_title(self, message): + """Set console/terminal window title to message.""" + if not self.params.get('consoletitle', False): + return + if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): + # c_wchar_p() might not be necessary if `message` is + # already of type unicode() + ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) + elif 'TERM' in os.environ: + sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding())) + def fixed_template(self): """Checks if the output template is fixed.""" return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None) @@ -360,7 +618,19 @@ class FileDownloader(object): speed = float(byte_counter) / elapsed if speed > rate_limit: time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit) - + + def temp_name(self, filename): + """Returns a temporary filename for the given filename.""" + if self.params.get('nopart', False) or filename == u'-' or \ + (os.path.exists(filename) and not os.path.isfile(filename)): + return filename + return filename + u'.part' + + def undo_temp_name(self, filename): + if filename.endswith(u'.part'): + return filename[:-len(u'.part')] + return filename + def try_rename(self, old_filename, new_filename): try: if old_filename == new_filename: @@ -369,98 +639,196 @@ class FileDownloader(object): except (IOError, OSError), err: self.trouble(u'ERROR: unable to rename file') + def try_utime(self, filename, last_modified_hdr): + """Try to set the last-modified time of the given file.""" + if last_modified_hdr is None: + return + if not os.path.isfile(filename): + return + timestr = last_modified_hdr + if timestr is None: + return + filetime = timeconvert(timestr) + if filetime is None: + return filetime + try: + os.utime(filename, (time.time(), filetime)) + except: + pass + return filetime + + def report_writedescription(self, descfn): + """ Report that the description file is being written """ + self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True) + + def report_writeinfojson(self, infofn): + """ Report that the metadata file has been written """ + self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True) + def report_destination(self, filename): """Report destination filename.""" self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True) - + def report_progress(self, percent_str, data_len_str, speed_str, eta_str): """Report download progress.""" if self.params.get('noprogress', False): return self.to_screen(u'\r[download] %s of %s at %s ETA %s' % (percent_str, data_len_str, speed_str, eta_str), skip_eol=True) + self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' % + (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip())) def report_resuming_byte(self, resume_len): """Report attempt to resume at given byte.""" self.to_screen(u'[download] Resuming download at byte %s' % resume_len) - + def report_retry(self, count, retries): """Report retry in case of HTTP error 5xx""" self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)) - + def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" try: self.to_screen(u'[download] %s has already been downloaded' % file_name) except (UnicodeEncodeError), err: self.to_screen(u'[download] The file has already been downloaded') - + def report_unable_to_resume(self): """Report it was impossible to resume download.""" self.to_screen(u'[download] Unable to resume') - + def report_finish(self): """Report download finished.""" if self.params.get('noprogress', False): self.to_screen(u'[download] Download completed') else: self.to_screen(u'') - + def increment_downloads(self): """Increment the ordinal that assigns a number to each file.""" self._num_downloads += 1 - def process_info(self, info_dict): - """Process a single dictionary returned by an InfoExtractor.""" - # Do nothing else if in simulate mode - if self.params.get('simulate', False): - # Forced printings - if self.params.get('forcetitle', False): - print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') - if self.params.get('forceurl', False): - print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace') - if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: - print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace') - if self.params.get('forcedescription', False) and 'description' in info_dict: - print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace') - - return - + def prepare_filename(self, info_dict): + """Generate the output filename.""" try: template_dict = dict(info_dict) template_dict['epoch'] = unicode(long(time.time())) template_dict['autonumber'] = unicode('%05d' % self._num_downloads) filename = self.params['outtmpl'] % template_dict + return filename except (ValueError, KeyError), err: self.trouble(u'ERROR: invalid system charset or erroneous output template') + return None + + def _match_entry(self, info_dict): + """ Returns None iff the file should be downloaded """ + + title = info_dict['title'] + matchtitle = self.params.get('matchtitle', False) + if matchtitle and not re.search(matchtitle, title, re.IGNORECASE): + return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' + rejecttitle = self.params.get('rejecttitle', False) + if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE): + return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' + return None + + def process_info(self, info_dict): + """Process a single dictionary returned by an InfoExtractor.""" + + reason = self._match_entry(info_dict) + if reason is not None: + self.to_screen(u'[download] ' + reason) return - if self.params.get('nooverwrites', False) and os.path.exists(filename): - self.to_stderr(u'WARNING: file exists and will be skipped') + + max_downloads = self.params.get('max_downloads') + if max_downloads is not None: + if self._num_downloads > int(max_downloads): + raise MaxDownloadsReached() + + filename = self.prepare_filename(info_dict) + + # Forced printings + if self.params.get('forcetitle', False): + print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forceurl', False): + print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: + print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcedescription', False) and 'description' in info_dict: + print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcefilename', False) and filename is not None: + print filename.encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forceformat', False): + print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace') + + # Do nothing else if in simulate mode + if self.params.get('simulate', False): return - try: - self.pmkdir(filename) - except (OSError, IOError), err: - self.trouble(u'ERROR: unable to create directories: %s' % str(err)) + if filename is None: return try: - success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None)) + dn = os.path.dirname(filename) + if dn != '' and not os.path.exists(dn): + os.makedirs(dn) except (OSError, IOError), err: - raise UnavailableVideoError - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self.trouble(u'ERROR: unable to download video data: %s' % str(err)) - return - except (ContentTooShortError, ), err: - self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) + self.trouble(u'ERROR: unable to create directory ' + unicode(err)) return - if success: + if self.params.get('writedescription', False): + try: + descfn = filename + '.description' + self.report_writedescription(descfn) + descfile = open(descfn, 'wb') + try: + descfile.write(info_dict['description'].encode('utf-8')) + finally: + descfile.close() + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write description file ' + descfn) + return + + if self.params.get('writeinfojson', False): + infofn = filename + '.info.json' + self.report_writeinfojson(infofn) try: - self.post_process(filename, info_dict) - except (PostProcessingError), err: - self.trouble(u'ERROR: postprocessing: %s' % str(err)) + json.dump + except (NameError,AttributeError): + self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.') return + try: + infof = open(infofn, 'wb') + try: + json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',)) + json.dump(json_info_dict, infof) + finally: + infof.close() + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn) + return + + if not self.params.get('skip_download', False): + if self.params.get('nooverwrites', False) and os.path.exists(filename): + success = True + else: + try: + success = self._do_download(filename, info_dict) + except (OSError, IOError), err: + raise UnavailableVideoError + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self.trouble(u'ERROR: unable to download video data: %s' % str(err)) + return + except (ContentTooShortError, ), err: + self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) + return + + if success: + try: + self.post_process(filename, info_dict) + except (PostProcessingError), err: + self.trouble(u'ERROR: postprocessing: %s' % str(err)) + return def download(self, url_list): """Download a given list of URLs.""" @@ -496,7 +864,7 @@ class FileDownloader(object): info = pp.run(info) if info is None: break - + def _download_with_rtmpdump(self, filename, url, player_url): self.report_destination(filename) tmpfilename = self.temp_name(filename) @@ -521,6 +889,11 @@ class FileDownloader(object): cursize = os.path.getsize(tmpfilename) if prevsize == cursize and retval == 1: break + # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those + if prevsize == cursize and retval == 2 and cursize > 1024: + self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.') + retval = 0 + break if retval == 0: self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename)) self.try_rename(tmpfilename, filename) @@ -529,9 +902,12 @@ class FileDownloader(object): self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval) return False - def _do_download(self, filename, url, player_url): + def _do_download(self, filename, info_dict): + url = info_dict['url'] + player_url = info_dict.get('player_url', None) + # Check file already present - if self.params.get('continuedl', False) and os.path.isfile(filename): + if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False): self.report_file_already_downloaded(filename) return True @@ -541,9 +917,11 @@ class FileDownloader(object): tmpfilename = self.temp_name(filename) stream = None - open_mode = 'wb' - basic_request = urllib2.Request(url, None, std_headers) - request = urllib2.Request(url, None, std_headers) + + # Do not include the Accept-Encoding header + headers = {'Youtubedl-no-compression': 'True'} + basic_request = urllib2.Request(url, None, headers) + request = urllib2.Request(url, None, headers) # Establish possible resume length if os.path.isfile(tmpfilename): @@ -551,17 +929,22 @@ class FileDownloader(object): else: resume_len = 0 - # Request parameters in case of being able to resume - if self.params.get('continuedl', False) and resume_len != 0: - self.report_resuming_byte(resume_len) - request.add_header('Range','bytes=%d-' % resume_len) - open_mode = 'ab' + open_mode = 'wb' + if resume_len != 0: + if self.params.get('continuedl', False): + self.report_resuming_byte(resume_len) + request.add_header('Range','bytes=%d-' % resume_len) + open_mode = 'ab' + else: + resume_len = 0 count = 0 retries = self.params.get('retries', 0) while count <= retries: # Establish connection try: + if count == 0 and 'urlhandle' in info_dict: + data = info_dict['urlhandle'] data = urllib2.urlopen(request) break except (urllib2.HTTPError, ), err: @@ -580,7 +963,7 @@ class FileDownloader(object): else: # Examine the reported length if (content_length is not None and - (resume_len - 100 < long(content_length) < resume_len + 100)): + (resume_len - 100 < long(content_length) < resume_len + 100)): # The file had already been fully downloaded. # Explanation to the above condition: in issue #175 it was revealed that # YouTube sometimes adds or removes a few bytes from the end of the file, @@ -606,8 +989,10 @@ class FileDownloader(object): return False data_len = data.info().get('Content-length', None) + if data_len is not None: + data_len = long(data_len) + resume_len data_len_str = self.format_bytes(data_len) - byte_counter = 0 + byte_counter = 0 + resume_len block_size = 1024 start = time.time() while True: @@ -615,15 +1000,16 @@ class FileDownloader(object): before = time.time() data_block = data.read(block_size) after = time.time() - data_block_len = len(data_block) - if data_block_len == 0: + if len(data_block) == 0: break - byte_counter += data_block_len + byte_counter += len(data_block) # Open file just in time if stream is None: try: (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) + assert stream is not None + filename = self.undo_temp_name(tmpfilename) self.report_destination(filename) except (OSError, IOError), err: self.trouble(u'ERROR: unable to open for writing: %s' % str(err)) @@ -633,24 +1019,36 @@ class FileDownloader(object): except (IOError, OSError), err: self.trouble(u'\nERROR: unable to write data: %s' % str(err)) return False - block_size = self.best_block_size(after - before, data_block_len) + block_size = self.best_block_size(after - before, len(data_block)) # Progress message - percent_str = self.calc_percent(byte_counter, data_len) - eta_str = self.calc_eta(start, time.time(), data_len, byte_counter) - speed_str = self.calc_speed(start, time.time(), byte_counter) - self.report_progress(percent_str, data_len_str, speed_str, eta_str) + speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len) + if data_len is None: + self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') + else: + percent_str = self.calc_percent(byte_counter, data_len) + eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + self.report_progress(percent_str, data_len_str, speed_str, eta_str) # Apply rate limit - self.slow_down(start, byte_counter) + self.slow_down(start, byte_counter - resume_len) + if stream is None: + self.trouble(u'\nERROR: Did not get any data blocks') + return False stream.close() self.report_finish() - if data_len is not None and str(byte_counter) != data_len: + if data_len is not None and byte_counter != data_len: raise ContentTooShortError(byte_counter, long(data_len)) self.try_rename(tmpfilename, filename) + + # Update file modification time + if self.params.get('updatetime', True): + info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) + return True + class InfoExtractor(object): """Information Extractor class. @@ -681,9 +1079,8 @@ class InfoExtractor(object): description: One-line video description. Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods, as well as the suitable() static method. - Probably, they should also be instantiated and added to the main - downloader. + _real_extract() methods and define a _VALID_URL regexp. + Probably, they should also be added to the list of extractors. """ _ready = False @@ -694,10 +1091,9 @@ class InfoExtractor(object): self._ready = False self.set_downloader(downloader) - @staticmethod - def suitable(url): + def suitable(self, url): """Receives a URL and returns True if suitable for this IE.""" - return False + return re.match(self._VALID_URL, url) is not None def initialize(self): """Initializes an instance (authentication, etc).""" @@ -713,7 +1109,7 @@ class InfoExtractor(object): def set_downloader(self, downloader): """Sets the downloader for this IE.""" self._downloader = downloader - + def _real_initialize(self): """Real initialization process. Redefine in subclasses.""" pass @@ -722,16 +1118,18 @@ class InfoExtractor(object): """Real extraction process. Redefine in subclasses.""" pass + class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" - _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$' + _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NETRC_MACHINE = 'youtube' # Listed in order of quality - _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13'] + _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] + _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] _video_extensions = { '13': '3gp', '17': 'mp4', @@ -740,12 +1138,25 @@ class YoutubeIE(InfoExtractor): '37': 'mp4', '38': 'video', # You actually don't know if this will be MOV, AVI or whatever '43': 'webm', + '44': 'webm', '45': 'webm', } - - @staticmethod - def suitable(url): - return (re.match(YoutubeIE._VALID_URL, url) is not None) + _video_dimensions = { + '5': '240x400', + '6': '???', + '13': '???', + '17': '144x176', + '18': '360x640', + '22': '720x1280', + '34': '360x640', + '35': '480x854', + '37': '1080x1920', + '38': '3072x4096', + '43': '360x640', + '44': '480x854', + '45': '720x1280', + } + IE_NAME = u'youtube' def report_lang(self): """Report attempt to set language.""" @@ -754,31 +1165,36 @@ class YoutubeIE(InfoExtractor): def report_login(self): """Report attempt to log in.""" self._downloader.to_screen(u'[youtube] Logging in') - + def report_age_confirmation(self): """Report attempt to confirm age.""" self._downloader.to_screen(u'[youtube] Confirming age') - + def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id) - + def report_video_info_webpage_download(self, video_id): """Report attempt to download video info webpage.""" self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) - + def report_information_extraction(self, video_id): """Report attempt to extract video information.""" self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) - + def report_unavailable_format(self, video_id, format): """Report extracted video URL.""" self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format)) - + def report_rtmp_download(self): """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') - + + def _print_formats(self, formats): + print 'Available formats:' + for x in formats: + print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')) + def _real_initialize(self): if self._downloader is None: return @@ -804,7 +1220,7 @@ class YoutubeIE(InfoExtractor): return # Set language - request = urllib2.Request(self._LANG_URL, None, std_headers) + request = urllib2.Request(self._LANG_URL) try: self.report_lang() urllib2.urlopen(request).read() @@ -824,7 +1240,7 @@ class YoutubeIE(InfoExtractor): 'username': username, 'password': password, } - request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers) + request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form)) try: self.report_login() login_results = urllib2.urlopen(request).read() @@ -834,13 +1250,13 @@ class YoutubeIE(InfoExtractor): except (urllib2.URLError, httplib.HTTPException, socket.error), err: self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err)) return - + # Confirm age age_form = { 'next_url': '/', 'action_confirm': 'Confirm', } - request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers) + request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form)) try: self.report_age_confirmation() age_results = urllib2.urlopen(request).read() @@ -858,7 +1274,7 @@ class YoutubeIE(InfoExtractor): # Get video webpage self.report_video_webpage_download(video_id) - request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id, None, std_headers) + request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) try: video_webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: @@ -876,8 +1292,8 @@ class YoutubeIE(InfoExtractor): self.report_video_info_webpage_download(video_id) for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (video_id, el_type)) - request = urllib2.Request(video_info_url, None, std_headers) + % (video_id, el_type)) + request = urllib2.Request(video_info_url) try: video_info_webpage = urllib2.urlopen(request).read() video_info = parse_qs(video_info_webpage) @@ -911,8 +1327,7 @@ class YoutubeIE(InfoExtractor): video_title = sanitize_title(video_title) # simplified title - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) - simple_title = simple_title.strip(ur'_') + simple_title = _simplify_title(video_title) # thumbnail image if 'thumbnail_url' not in video_info: @@ -923,10 +1338,10 @@ class YoutubeIE(InfoExtractor): # upload date upload_date = u'NA' - mobj = re.search(r'id="eow-date".*?>(.*?)', video_webpage, re.DOTALL) + mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) if mobj is not None: upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) - format_expressions = ['%d %B %Y', '%B %d %Y'] + format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y'] for expression in format_expressions: try: upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d') @@ -934,46 +1349,68 @@ class YoutubeIE(InfoExtractor): pass # description - video_description = 'No description available.' - if self._downloader.params.get('forcedescription', False): - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_description = mobj.group(1) + try: + lxml.etree + except NameError: + video_description = u'No description available.' + if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False): + mobj = re.search(r'', video_webpage) + if mobj is not None: + video_description = mobj.group(1).decode('utf-8') + else: + html_parser = lxml.etree.HTMLParser(encoding='utf-8') + vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) + video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) + # TODO use another parser # token video_token = urllib.unquote_plus(video_info['token'][0]) # Decide which formats to download req_format = self._downloader.params.get('format', None) - get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token) - if 'fmt_url_map' in video_info: - url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(',')) + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): + self.report_rtmp_download() + video_url_list = [(None, video_info['conn'][0])] + elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: + url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',') + url_data = [parse_qs(uds) for uds in url_data_strs] + url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data) + url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data) + format_limit = self._downloader.params.get('format_limit', None) - if format_limit is not None and format_limit in self._available_formats: - format_list = self._available_formats[self._available_formats.index(format_limit):] + available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats + if format_limit is not None and format_limit in available_formats: + format_list = available_formats[available_formats.index(format_limit):] else: - format_list = self._available_formats + format_list = available_formats existing_formats = [x for x in format_list if x in url_map] if len(existing_formats) == 0: self._downloader.trouble(u'ERROR: no known formats available for video') return - if req_format is None: + if self._downloader.params.get('listformats', None): + self._print_formats(existing_formats) + return + if req_format is None or req_format == 'best': video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality - elif req_format == '-1': + elif req_format == 'worst': + video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality + elif req_format in ('-1', 'all'): video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats else: - if req_format in url_map: - video_url_list = [(req_format, url_map[req_format])] # Specific format - else: - video_url_list = [(req_format, get_video_template % req_format)] # Specific format - - elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - video_url_list = [(None, video_info['conn'][0])] - + # Specific formats. We pick the first in a slash-delimeted sequence. + # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + req_formats = req_format.split('/') + video_url_list = None + for rf in req_formats: + if rf in url_map: + video_url_list = [(rf, url_map[rf])] + break + if video_url_list is None: + self._downloader.trouble(u'ERROR: requested format not available') + return else: - self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info') + self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info') return for format_param, video_real_url in video_url_list: @@ -983,7 +1420,6 @@ class YoutubeIE(InfoExtractor): # Extension video_extension = self._video_extensions.get(format_param, 'flv') - # Find the video URL in fmt_url_map or conn paramters try: # Process video information self._downloader.process_info({ @@ -996,11 +1432,11 @@ class YoutubeIE(InfoExtractor): 'ext': video_extension.decode('utf-8'), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'thumbnail': video_thumbnail.decode('utf-8'), - 'description': video_description.decode('utf-8'), + 'description': video_description, 'player_url': player_url, }) except UnavailableVideoError, err: - self._downloader.trouble(u'ERROR: unable to download video (format may not be available)') + self._downloader.trouble(u'\nERROR: unable to download video') class MetacafeIE(InfoExtractor): @@ -1010,15 +1446,12 @@ class MetacafeIE(InfoExtractor): _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' _youtube_ie = None + IE_NAME = u'metacafe' def __init__(self, youtube_ie, downloader=None): InfoExtractor.__init__(self, downloader) self._youtube_ie = youtube_ie - @staticmethod - def suitable(url): - return (re.match(MetacafeIE._VALID_URL, url) is not None) - def report_disclaimer(self): """Report disclaimer retrieval.""" self._downloader.to_screen(u'[metacafe] Retrieving disclaimer') @@ -1026,18 +1459,18 @@ class MetacafeIE(InfoExtractor): def report_age_confirmation(self): """Report attempt to confirm age.""" self._downloader.to_screen(u'[metacafe] Confirming age') - + def report_download_webpage(self, video_id): """Report webpage download.""" self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id) - + def report_extraction(self, video_id): """Report information extraction.""" self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id) def _real_initialize(self): # Retrieve disclaimer - request = urllib2.Request(self._DISCLAIMER, None, std_headers) + request = urllib2.Request(self._DISCLAIMER) try: self.report_disclaimer() disclaimer = urllib2.urlopen(request).read() @@ -1050,14 +1483,14 @@ class MetacafeIE(InfoExtractor): 'filters': '0', 'submit': "Continue - I'm over 18", } - request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers) + request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form)) try: self.report_age_confirmation() disclaimer = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err)) return - + def _real_extract(self, url): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) @@ -1093,7 +1526,7 @@ class MetacafeIE(InfoExtractor): if mobj is not None: mediaURL = urllib.unquote(mobj.group(1)) video_extension = mediaURL[-3:] - + # Extract gdaKey if available mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) if mobj is None: @@ -1145,32 +1578,26 @@ class MetacafeIE(InfoExtractor): 'player_url': None, }) except UnavailableVideoError: - self._downloader.trouble(u'ERROR: unable to download video') + self._downloader.trouble(u'\nERROR: unable to download video') class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' + IE_NAME = u'dailymotion' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - @staticmethod - def suitable(url): - return (re.match(DailymotionIE._VALID_URL, url) is not None) - def report_download_webpage(self, video_id): """Report webpage download.""" self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id) - + def report_extraction(self, video_id): """Report information extraction.""" self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id) - def _real_initialize(self): - return - def _real_extract(self, url): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) @@ -1182,11 +1609,11 @@ class DailymotionIE(InfoExtractor): self._downloader.increment_downloads() video_id = mobj.group(1) - simple_title = mobj.group(2).decode('utf-8') video_extension = 'flv' # Retrieve video webpage to extract further information request = urllib2.Request(url) + request.add_header('Cookie', 'family_filter=off') try: self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() @@ -1196,25 +1623,30 @@ class DailymotionIE(InfoExtractor): # Extract URL, uploader and title from webpage self.report_extraction(video_id) - mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage) + mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract media URL') return - mediaURL = urllib.unquote(mobj.group(1)) + sequence = urllib.unquote(mobj.group(1)) + mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract media URL') + return + mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '') # if needed add http://www.dailymotion.com/ if relative URL video_url = mediaURL - # '' - mobj = re.search(r'(?im)