X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/04a301d947124b5d27a90db6313acc99ad63fbb1..db19c7e19146d9f8d98b42966d120e519620f35d:/youtube-dl diff --git a/youtube-dl b/youtube-dl index 3cd8bfb..818a3a7 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1,272 +1,4523 @@ #!/usr/bin/env python -# -# Copyright (c) 2006 Ricardo Garcia Gonzalez -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. -# -# Except as contained in this notice, the name(s) of the above copyright -# holders shall not be used in advertising or otherwise to promote the -# sale, use or other dealings in this Software without prior written -# authorization. -# -import sys -import optparse +# -*- coding: utf-8 -*- + +__authors__ = ( + 'Ricardo Garcia Gonzalez', + 'Danny Colligan', + 'Benjamin Johnson', + 'Vasyl\' Vavrychuk', + 'Witold Baryluk', + 'PaweŠPaprota', + 'Gergely Imreh', + 'Rogério Brito', + 'Philipp Hagemeister', + 'Sören Schulze', + 'Kevin Ngo', + 'Ori Avtalion', + 'shizeeg', + ) + +__license__ = 'Public Domain' +__version__ = '2012.01.05' + +UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' + +import cookielib +import datetime +import gzip +import htmlentitydefs +import HTMLParser import httplib -import urllib2 +import locale +import math +import netrc +import os +import os.path import re +import socket import string -import os +import subprocess +import sys +import time +import urllib +import urllib2 +import warnings +import zlib + +if os.name == 'nt': + import ctypes + +try: + import email.utils +except ImportError: # Python 2.4 + import email.Utils +try: + import cStringIO as StringIO +except ImportError: + import StringIO + +# parse_qs was moved from the cgi module to the urlparse module recently. +try: + from urlparse import parse_qs +except ImportError: + from cgi import parse_qs + +try: + import lxml.etree +except ImportError: + pass # Handled below + +try: + import xml.etree.ElementTree +except ImportError: # Python<2.5: Not officially supported, but let it slip + warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.') + +std_headers = { + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1', + 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate', + 'Accept-Language': 'en-us,en;q=0.5', +} + +try: + import json +except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson): + import re + class json(object): + @staticmethod + def loads(s): + s = s.decode('UTF-8') + def raiseError(msg, i): + raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:])) + def skipSpace(i, expectMore=True): + while i < len(s) and s[i] in ' \t\r\n': + i += 1 + if expectMore: + if i >= len(s): + raiseError('Premature end', i) + return i + def decodeEscape(match): + esc = match.group(1) + _STATIC = { + '"': '"', + '\\': '\\', + '/': '/', + 'b': unichr(0x8), + 'f': unichr(0xc), + 'n': '\n', + 'r': '\r', + 't': '\t', + } + if esc in _STATIC: + return _STATIC[esc] + if esc[0] == 'u': + if len(esc) == 1+4: + return unichr(int(esc[1:5], 16)) + if len(esc) == 5+6 and esc[5:7] == '\\u': + hi = int(esc[1:5], 16) + low = int(esc[7:11], 16) + return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000) + raise ValueError('Unknown escape ' + str(esc)) + def parseString(i): + i += 1 + e = i + while True: + e = s.index('"', e) + bslashes = 0 + while s[e-bslashes-1] == '\\': + bslashes += 1 + if bslashes % 2 == 1: + e += 1 + continue + break + rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)') + stri = rexp.sub(decodeEscape, s[i:e]) + return (e+1,stri) + def parseObj(i): + i += 1 + res = {} + i = skipSpace(i) + if s[i] == '}': # Empty dictionary + return (i+1,res) + while True: + if s[i] != '"': + raiseError('Expected a string object key', i) + i,key = parseString(i) + i = skipSpace(i) + if i >= len(s) or s[i] != ':': + raiseError('Expected a colon', i) + i,val = parse(i+1) + res[key] = val + i = skipSpace(i) + if s[i] == '}': + return (i+1, res) + if s[i] != ',': + raiseError('Expected comma or closing curly brace', i) + i = skipSpace(i+1) + def parseArray(i): + res = [] + i = skipSpace(i+1) + if s[i] == ']': # Empty array + return (i+1,res) + while True: + i,val = parse(i) + res.append(val) + i = skipSpace(i) # Raise exception if premature end + if s[i] == ']': + return (i+1, res) + if s[i] != ',': + raiseError('Expected a comma or closing bracket', i) + i = skipSpace(i+1) + def parseDiscrete(i): + for k,v in {'true': True, 'false': False, 'null': None}.items(): + if s.startswith(k, i): + return (i+len(k), v) + raiseError('Not a boolean (or null)', i) + def parseNumber(i): + mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:]) + if mobj is None: + raiseError('Not a number', i) + nums = mobj.group(1) + if '.' in nums or 'e' in nums or 'E' in nums: + return (i+len(nums), float(nums)) + return (i+len(nums), int(nums)) + CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete} + def parse(i): + i = skipSpace(i) + i,res = CHARMAP.get(s[i], parseNumber)(i) + i = skipSpace(i, False) + return (i,res) + i,res = parse(0) + if i < len(s): + raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')') + return res + +def preferredencoding(): + """Get preferred encoding. + + Returns the best encoding scheme for the system, based on + locale.getpreferredencoding() and some further tweaks. + """ + def yield_preferredencoding(): + try: + pref = locale.getpreferredencoding() + u'TEST'.encode(pref) + except: + pref = 'UTF-8' + while True: + yield pref + return yield_preferredencoding().next() + + +def htmlentity_transform(matchobj): + """Transforms an HTML entity to a Unicode character. + + This function receives a match object and is intended to be used with + the re.sub() function. + """ + entity = matchobj.group(1) + + # Known non-numeric HTML entity + if entity in htmlentitydefs.name2codepoint: + return unichr(htmlentitydefs.name2codepoint[entity]) + + # Unicode character + mobj = re.match(ur'(?u)#(x?\d+)', entity) + if mobj is not None: + numstr = mobj.group(1) + if numstr.startswith(u'x'): + base = 16 + numstr = u'0%s' % numstr + else: + base = 10 + return unichr(long(numstr, base)) + + # Unknown entity in name, return its literal representation + return (u'&%s;' % entity) + + +def sanitize_title(utitle): + """Sanitizes a video title so it could be used as part of a filename.""" + utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle) + return utitle.replace(unicode(os.sep), u'%') + + +def sanitize_open(filename, open_mode): + """Try to open the given filename, and slightly tweak it if this fails. + + Attempts to open the given filename. If this fails, it tries to change + the filename slightly, step by step, until it's either able to open it + or it fails and raises a final exception, like the standard open() + function. + + It returns the tuple (stream, definitive_file_name). + """ + try: + if filename == u'-': + if sys.platform == 'win32': + import msvcrt + msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) + return (sys.stdout, filename) + stream = open(filename, open_mode) + return (stream, filename) + except (IOError, OSError), err: + # In case of error, try to remove win32 forbidden chars + filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename) + + # An exception here should be caught in the caller + stream = open(filename, open_mode) + return (stream, filename) + + +def timeconvert(timestr): + """Convert RFC 2822 defined time string into system timestamp""" + timestamp = None + timetuple = email.utils.parsedate_tz(timestr) + if timetuple is not None: + timestamp = email.utils.mktime_tz(timetuple) + return timestamp + +def _simplify_title(title): + expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE) + return expr.sub(u'_', title).strip(u'_') + +def _orderedSet(iterable): + """ Remove all duplicates from the input iterable """ + res = [] + for el in iterable: + if el not in res: + res.append(el) + return res + +def _unescapeHTML(s): + """ + @param s a string (of type unicode) + """ + assert type(s) == type(u'') + + htmlParser = HTMLParser.HTMLParser() + return htmlParser.unescape(s) + +class DownloadError(Exception): + """Download Error exception. + + This exception may be thrown by FileDownloader objects if they are not + configured to continue on errors. They will contain the appropriate + error message. + """ + pass + + +class SameFileError(Exception): + """Same File exception. + + This exception will be thrown by FileDownloader objects if they detect + multiple files would have to be downloaded to the same file on disk. + """ + pass + + +class PostProcessingError(Exception): + """Post Processing exception. + + This exception may be raised by PostProcessor's .run() method to + indicate an error in the postprocessing task. + """ + pass + +class MaxDownloadsReached(Exception): + """ --max-downloads limit has been reached. """ + pass + + +class UnavailableVideoError(Exception): + """Unavailable Format exception. + + This exception will be thrown when a video is requested + in a format that is not available for that video. + """ + pass + + +class ContentTooShortError(Exception): + """Content Too Short exception. + + This exception may be raised by FileDownloader objects when a file they + download is too small for what the server announced first, indicating + the connection was probably interrupted. + """ + # Both in bytes + downloaded = None + expected = None + + def __init__(self, downloaded, expected): + self.downloaded = downloaded + self.expected = expected + + +class YoutubeDLHandler(urllib2.HTTPHandler): + """Handler for HTTP requests and responses. + + This class, when installed with an OpenerDirector, automatically adds + the standard headers to every HTTP request and handles gzipped and + deflated responses from web servers. If compression is to be avoided in + a particular request, the original request in the program code only has + to include the HTTP header "Youtubedl-No-Compression", which will be + removed before making the real request. + + Part of this code was copied from: + + http://techknack.net/python-urllib2-handlers/ + + Andrew Rowls, the author of that code, agreed to release it to the + public domain. + """ + + @staticmethod + def deflate(data): + try: + return zlib.decompress(data, -zlib.MAX_WBITS) + except zlib.error: + return zlib.decompress(data) + + @staticmethod + def addinfourl_wrapper(stream, headers, url, code): + if hasattr(urllib2.addinfourl, 'getcode'): + return urllib2.addinfourl(stream, headers, url, code) + ret = urllib2.addinfourl(stream, headers, url) + ret.code = code + return ret + + def http_request(self, req): + for h in std_headers: + if h in req.headers: + del req.headers[h] + req.add_header(h, std_headers[h]) + if 'Youtubedl-no-compression' in req.headers: + if 'Accept-encoding' in req.headers: + del req.headers['Accept-encoding'] + del req.headers['Youtubedl-no-compression'] + return req + + def http_response(self, req, resp): + old_resp = resp + # gzip + if resp.headers.get('Content-encoding', '') == 'gzip': + gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r') + resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + # deflate + if resp.headers.get('Content-encoding', '') == 'deflate': + gz = StringIO.StringIO(self.deflate(resp.read())) + resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + return resp + + +class FileDownloader(object): + """File Downloader class. + + File downloader objects are the ones responsible of downloading the + actual video file and writing it to disk if the user has requested + it, among some other tasks. In most cases there should be one per + program. As, given a video URL, the downloader doesn't know how to + extract all the needed information, task that InfoExtractors do, it + has to pass the URL to one of them. + + For this, file downloader objects have a method that allows + InfoExtractors to be registered in a given order. When it is passed + a URL, the file downloader handles it to the first InfoExtractor it + finds that reports being able to handle it. The InfoExtractor extracts + all the information about the video or videos the URL refers to, and + asks the FileDownloader to process the video information, possibly + downloading the video. + + File downloaders accept a lot of parameters. In order not to saturate + the object constructor with arguments, it receives a dictionary of + options instead. These options are available through the params + attribute for the InfoExtractors to use. The FileDownloader also + registers itself as the downloader in charge for the InfoExtractors + that are added to it, so this is a "mutual registration". + + Available options: + + username: Username for authentication purposes. + password: Password for authentication purposes. + usenetrc: Use netrc for authentication instead. + quiet: Do not print messages to stdout. + forceurl: Force printing final URL. + forcetitle: Force printing title. + forcethumbnail: Force printing thumbnail URL. + forcedescription: Force printing description. + forcefilename: Force printing final filename. + simulate: Do not download the video files. + format: Video format code. + format_limit: Highest quality format to try. + outtmpl: Template for output names. + ignoreerrors: Do not stop on download errors. + ratelimit: Download speed limit, in bytes/sec. + nooverwrites: Prevent overwriting files. + retries: Number of times to retry for HTTP error 5xx + continuedl: Try to continue downloads if possible. + noprogress: Do not print the progress bar. + playliststart: Playlist item to start at. + playlistend: Playlist item to end at. + matchtitle: Download only matching titles. + rejecttitle: Reject downloads for matching titles. + logtostderr: Log messages to stderr instead of stdout. + consoletitle: Display progress in console window's titlebar. + nopart: Do not use temporary .part files. + updatetime: Use the Last-modified header to set output file timestamps. + writedescription: Write the video description to a .description file + writeinfojson: Write the video description to a .info.json file + """ + + params = None + _ies = [] + _pps = [] + _download_retcode = None + _num_downloads = None + _screen_file = None + + def __init__(self, params): + """Create a FileDownloader object with the given options.""" + self._ies = [] + self._pps = [] + self._download_retcode = 0 + self._num_downloads = 0 + self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] + self.params = params + + @staticmethod + def format_bytes(bytes): + if bytes is None: + return 'N/A' + if type(bytes) is str: + bytes = float(bytes) + if bytes == 0.0: + exponent = 0 + else: + exponent = long(math.log(bytes, 1024.0)) + suffix = 'bkMGTPEZY'[exponent] + converted = float(bytes) / float(1024 ** exponent) + return '%.2f%s' % (converted, suffix) + + @staticmethod + def calc_percent(byte_counter, data_len): + if data_len is None: + return '---.-%' + return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0)) + + @staticmethod + def calc_eta(start, now, total, current): + if total is None: + return '--:--' + dif = now - start + if current == 0 or dif < 0.001: # One millisecond + return '--:--' + rate = float(current) / dif + eta = long((float(total) - float(current)) / rate) + (eta_mins, eta_secs) = divmod(eta, 60) + if eta_mins > 99: + return '--:--' + return '%02d:%02d' % (eta_mins, eta_secs) + + @staticmethod + def calc_speed(start, now, bytes): + dif = now - start + if bytes == 0 or dif < 0.001: # One millisecond + return '%10s' % '---b/s' + return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif)) + + @staticmethod + def best_block_size(elapsed_time, bytes): + new_min = max(bytes / 2.0, 1.0) + new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB + if elapsed_time < 0.001: + return long(new_max) + rate = bytes / elapsed_time + if rate > new_max: + return long(new_max) + if rate < new_min: + return long(new_min) + return long(rate) + + @staticmethod + def parse_bytes(bytestr): + """Parse a string indicating a byte quantity into a long integer.""" + matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) + if matchobj is None: + return None + number = float(matchobj.group(1)) + multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) + return long(round(number * multiplier)) + + def add_info_extractor(self, ie): + """Add an InfoExtractor object to the end of the list.""" + self._ies.append(ie) + ie.set_downloader(self) + + def add_post_processor(self, pp): + """Add a PostProcessor object to the end of the chain.""" + self._pps.append(pp) + pp.set_downloader(self) + + def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False): + """Print message to stdout if not in quiet mode.""" + try: + if not self.params.get('quiet', False): + terminator = [u'\n', u''][skip_eol] + print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()), + self._screen_file.flush() + except (UnicodeEncodeError), err: + if not ignore_encoding_errors: + raise + + def to_stderr(self, message): + """Print message to stderr.""" + print >>sys.stderr, message.encode(preferredencoding()) + + def to_cons_title(self, message): + """Set console/terminal window title to message.""" + if not self.params.get('consoletitle', False): + return + if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): + # c_wchar_p() might not be necessary if `message` is + # already of type unicode() + ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) + elif 'TERM' in os.environ: + sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding())) + + def fixed_template(self): + """Checks if the output template is fixed.""" + return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None) + + def trouble(self, message=None): + """Determine action to take when a download problem appears. + + Depending on if the downloader has been configured to ignore + download errors or not, this method may throw an exception or + not when errors are found, after printing the message. + """ + if message is not None: + self.to_stderr(message) + if not self.params.get('ignoreerrors', False): + raise DownloadError(message) + self._download_retcode = 1 + + def slow_down(self, start_time, byte_counter): + """Sleep if the download speed is over the rate limit.""" + rate_limit = self.params.get('ratelimit', None) + if rate_limit is None or byte_counter == 0: + return + now = time.time() + elapsed = now - start_time + if elapsed <= 0.0: + return + speed = float(byte_counter) / elapsed + if speed > rate_limit: + time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit) + + def temp_name(self, filename): + """Returns a temporary filename for the given filename.""" + if self.params.get('nopart', False) or filename == u'-' or \ + (os.path.exists(filename) and not os.path.isfile(filename)): + return filename + return filename + u'.part' + + def undo_temp_name(self, filename): + if filename.endswith(u'.part'): + return filename[:-len(u'.part')] + return filename + + def try_rename(self, old_filename, new_filename): + try: + if old_filename == new_filename: + return + os.rename(old_filename, new_filename) + except (IOError, OSError), err: + self.trouble(u'ERROR: unable to rename file') + + def try_utime(self, filename, last_modified_hdr): + """Try to set the last-modified time of the given file.""" + if last_modified_hdr is None: + return + if not os.path.isfile(filename): + return + timestr = last_modified_hdr + if timestr is None: + return + filetime = timeconvert(timestr) + if filetime is None: + return filetime + try: + os.utime(filename, (time.time(), filetime)) + except: + pass + return filetime + + def report_writedescription(self, descfn): + """ Report that the description file is being written """ + self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True) + + def report_writeinfojson(self, infofn): + """ Report that the metadata file has been written """ + self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True) + + def report_destination(self, filename): + """Report destination filename.""" + self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True) + + def report_progress(self, percent_str, data_len_str, speed_str, eta_str): + """Report download progress.""" + if self.params.get('noprogress', False): + return + self.to_screen(u'\r[download] %s of %s at %s ETA %s' % + (percent_str, data_len_str, speed_str, eta_str), skip_eol=True) + self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' % + (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip())) + + def report_resuming_byte(self, resume_len): + """Report attempt to resume at given byte.""" + self.to_screen(u'[download] Resuming download at byte %s' % resume_len) + + def report_retry(self, count, retries): + """Report retry in case of HTTP error 5xx""" + self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)) + + def report_file_already_downloaded(self, file_name): + """Report file has already been fully downloaded.""" + try: + self.to_screen(u'[download] %s has already been downloaded' % file_name) + except (UnicodeEncodeError), err: + self.to_screen(u'[download] The file has already been downloaded') + + def report_unable_to_resume(self): + """Report it was impossible to resume download.""" + self.to_screen(u'[download] Unable to resume') + + def report_finish(self): + """Report download finished.""" + if self.params.get('noprogress', False): + self.to_screen(u'[download] Download completed') + else: + self.to_screen(u'') + + def increment_downloads(self): + """Increment the ordinal that assigns a number to each file.""" + self._num_downloads += 1 + + def prepare_filename(self, info_dict): + """Generate the output filename.""" + try: + template_dict = dict(info_dict) + template_dict['epoch'] = unicode(long(time.time())) + template_dict['autonumber'] = unicode('%05d' % self._num_downloads) + filename = self.params['outtmpl'] % template_dict + return filename + except (ValueError, KeyError), err: + self.trouble(u'ERROR: invalid system charset or erroneous output template') + return None + + def _match_entry(self, info_dict): + """ Returns None iff the file should be downloaded """ + + title = info_dict['title'] + matchtitle = self.params.get('matchtitle', False) + if matchtitle and not re.search(matchtitle, title, re.IGNORECASE): + return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' + rejecttitle = self.params.get('rejecttitle', False) + if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE): + return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' + return None + + def process_info(self, info_dict): + """Process a single dictionary returned by an InfoExtractor.""" + + reason = self._match_entry(info_dict) + if reason is not None: + self.to_screen(u'[download] ' + reason) + return + + max_downloads = self.params.get('max_downloads') + if max_downloads is not None: + if self._num_downloads > int(max_downloads): + raise MaxDownloadsReached() + + filename = self.prepare_filename(info_dict) + + # Forced printings + if self.params.get('forcetitle', False): + print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forceurl', False): + print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: + print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcedescription', False) and 'description' in info_dict: + print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcefilename', False) and filename is not None: + print filename.encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forceformat', False): + print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace') + + # Do nothing else if in simulate mode + if self.params.get('simulate', False): + return + + if filename is None: + return + + try: + dn = os.path.dirname(filename) + if dn != '' and not os.path.exists(dn): + os.makedirs(dn) + except (OSError, IOError), err: + self.trouble(u'ERROR: unable to create directory ' + unicode(err)) + return + + if self.params.get('writedescription', False): + try: + descfn = filename + '.description' + self.report_writedescription(descfn) + descfile = open(descfn, 'wb') + try: + descfile.write(info_dict['description'].encode('utf-8')) + finally: + descfile.close() + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write description file ' + descfn) + return + + if self.params.get('writeinfojson', False): + infofn = filename + '.info.json' + self.report_writeinfojson(infofn) + try: + json.dump + except (NameError,AttributeError): + self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.') + return + try: + infof = open(infofn, 'wb') + try: + json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',)) + json.dump(json_info_dict, infof) + finally: + infof.close() + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn) + return + + if not self.params.get('skip_download', False): + if self.params.get('nooverwrites', False) and os.path.exists(filename): + success = True + else: + try: + success = self._do_download(filename, info_dict) + except (OSError, IOError), err: + raise UnavailableVideoError + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self.trouble(u'ERROR: unable to download video data: %s' % str(err)) + return + except (ContentTooShortError, ), err: + self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) + return + + if success: + try: + self.post_process(filename, info_dict) + except (PostProcessingError), err: + self.trouble(u'ERROR: postprocessing: %s' % str(err)) + return + + def download(self, url_list): + """Download a given list of URLs.""" + if len(url_list) > 1 and self.fixed_template(): + raise SameFileError(self.params['outtmpl']) + + for url in url_list: + suitable_found = False + for ie in self._ies: + # Go to next InfoExtractor if not suitable + if not ie.suitable(url): + continue + + # Suitable InfoExtractor found + suitable_found = True + + # Extract information from URL and process it + ie.extract(url) + + # Suitable InfoExtractor had been found; go to next URL + break + + if not suitable_found: + self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url) + + return self._download_retcode + + def post_process(self, filename, ie_info): + """Run the postprocessing chain on the given file.""" + info = dict(ie_info) + info['filepath'] = filename + for pp in self._pps: + info = pp.run(info) + if info is None: + break + + def _download_with_rtmpdump(self, filename, url, player_url): + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + + # Check for rtmpdump first + try: + subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT) + except (OSError, IOError): + self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run') + return False + + # Download using rtmpdump. rtmpdump returns exit code 2 when + # the connection was interrumpted and resuming appears to be + # possible. This is part of rtmpdump's normal usage, AFAIK. + basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename] + retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]) + while retval == 2 or retval == 1: + prevsize = os.path.getsize(tmpfilename) + self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) + time.sleep(5.0) # This seems to be needed + retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) + cursize = os.path.getsize(tmpfilename) + if prevsize == cursize and retval == 1: + break + # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those + if prevsize == cursize and retval == 2 and cursize > 1024: + self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.') + retval = 0 + break + if retval == 0: + self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename)) + self.try_rename(tmpfilename, filename) + return True + else: + self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval) + return False + + def _do_download(self, filename, info_dict): + url = info_dict['url'] + player_url = info_dict.get('player_url', None) + + # Check file already present + if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False): + self.report_file_already_downloaded(filename) + return True + + # Attempt to download using rtmpdump + if url.startswith('rtmp'): + return self._download_with_rtmpdump(filename, url, player_url) + + tmpfilename = self.temp_name(filename) + stream = None + + # Do not include the Accept-Encoding header + headers = {'Youtubedl-no-compression': 'True'} + basic_request = urllib2.Request(url, None, headers) + request = urllib2.Request(url, None, headers) + + # Establish possible resume length + if os.path.isfile(tmpfilename): + resume_len = os.path.getsize(tmpfilename) + else: + resume_len = 0 + + open_mode = 'wb' + if resume_len != 0: + if self.params.get('continuedl', False): + self.report_resuming_byte(resume_len) + request.add_header('Range','bytes=%d-' % resume_len) + open_mode = 'ab' + else: + resume_len = 0 + + count = 0 + retries = self.params.get('retries', 0) + while count <= retries: + # Establish connection + try: + if count == 0 and 'urlhandle' in info_dict: + data = info_dict['urlhandle'] + data = urllib2.urlopen(request) + break + except (urllib2.HTTPError, ), err: + if (err.code < 500 or err.code >= 600) and err.code != 416: + # Unexpected HTTP error + raise + elif err.code == 416: + # Unable to resume (requested range not satisfiable) + try: + # Open the connection again without the range header + data = urllib2.urlopen(basic_request) + content_length = data.info()['Content-Length'] + except (urllib2.HTTPError, ), err: + if err.code < 500 or err.code >= 600: + raise + else: + # Examine the reported length + if (content_length is not None and + (resume_len - 100 < long(content_length) < resume_len + 100)): + # The file had already been fully downloaded. + # Explanation to the above condition: in issue #175 it was revealed that + # YouTube sometimes adds or removes a few bytes from the end of the file, + # changing the file size slightly and causing problems for some users. So + # I decided to implement a suggested change and consider the file + # completely downloaded if the file size differs less than 100 bytes from + # the one in the hard drive. + self.report_file_already_downloaded(filename) + self.try_rename(tmpfilename, filename) + return True + else: + # The length does not match, we start the download over + self.report_unable_to_resume() + open_mode = 'wb' + break + # Retry + count += 1 + if count <= retries: + self.report_retry(count, retries) + + if count > retries: + self.trouble(u'ERROR: giving up after %s retries' % retries) + return False + + data_len = data.info().get('Content-length', None) + if data_len is not None: + data_len = long(data_len) + resume_len + data_len_str = self.format_bytes(data_len) + byte_counter = 0 + resume_len + block_size = 1024 + start = time.time() + while True: + # Download and write + before = time.time() + data_block = data.read(block_size) + after = time.time() + if len(data_block) == 0: + break + byte_counter += len(data_block) + + # Open file just in time + if stream is None: + try: + (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) + assert stream is not None + filename = self.undo_temp_name(tmpfilename) + self.report_destination(filename) + except (OSError, IOError), err: + self.trouble(u'ERROR: unable to open for writing: %s' % str(err)) + return False + try: + stream.write(data_block) + except (IOError, OSError), err: + self.trouble(u'\nERROR: unable to write data: %s' % str(err)) + return False + block_size = self.best_block_size(after - before, len(data_block)) + + # Progress message + speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len) + if data_len is None: + self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') + else: + percent_str = self.calc_percent(byte_counter, data_len) + eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + self.report_progress(percent_str, data_len_str, speed_str, eta_str) + + # Apply rate limit + self.slow_down(start, byte_counter - resume_len) + + if stream is None: + self.trouble(u'\nERROR: Did not get any data blocks') + return False + stream.close() + self.report_finish() + if data_len is not None and byte_counter != data_len: + raise ContentTooShortError(byte_counter, long(data_len)) + self.try_rename(tmpfilename, filename) + + # Update file modification time + if self.params.get('updatetime', True): + info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) + + return True + + +class InfoExtractor(object): + """Information Extractor class. + + Information extractors are the classes that, given a URL, extract + information from the video (or videos) the URL refers to. This + information includes the real video URL, the video title and simplified + title, author and others. The information is stored in a dictionary + which is then passed to the FileDownloader. The FileDownloader + processes this information possibly downloading the video to the file + system, among other possible outcomes. The dictionaries must include + the following fields: + + id: Video identifier. + url: Final video URL. + uploader: Nickname of the video uploader. + title: Literal title. + stitle: Simplified title. + ext: Video filename extension. + format: Video format. + player_url: SWF Player URL (may be None). + + The following fields are optional. Their primary purpose is to allow + youtube-dl to serve as the backend for a video search function, such + as the one in youtube2mp3. They are only used when their respective + forced printing functions are called: + + thumbnail: Full URL to a video thumbnail image. + description: One-line video description. + + Subclasses of this one should re-define the _real_initialize() and + _real_extract() methods and define a _VALID_URL regexp. + Probably, they should also be added to the list of extractors. + """ + + _ready = False + _downloader = None + + def __init__(self, downloader=None): + """Constructor. Receives an optional downloader.""" + self._ready = False + self.set_downloader(downloader) + + def suitable(self, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(self._VALID_URL, url) is not None + + def initialize(self): + """Initializes an instance (authentication, etc).""" + if not self._ready: + self._real_initialize() + self._ready = True + + def extract(self, url): + """Extracts URL information and returns it in list of dicts.""" + self.initialize() + return self._real_extract(url) + + def set_downloader(self, downloader): + """Sets the downloader for this IE.""" + self._downloader = downloader + + def _real_initialize(self): + """Real initialization process. Redefine in subclasses.""" + pass + + def _real_extract(self, url): + """Real extraction process. Redefine in subclasses.""" + pass + + +class YoutubeIE(InfoExtractor): + """Information extractor for youtube.com.""" + + _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' + _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' + _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' + _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' + _NETRC_MACHINE = 'youtube' + # Listed in order of quality + _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] + _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] + _video_extensions = { + '13': '3gp', + '17': 'mp4', + '18': 'mp4', + '22': 'mp4', + '37': 'mp4', + '38': 'video', # You actually don't know if this will be MOV, AVI or whatever + '43': 'webm', + '44': 'webm', + '45': 'webm', + } + _video_dimensions = { + '5': '240x400', + '6': '???', + '13': '???', + '17': '144x176', + '18': '360x640', + '22': '720x1280', + '34': '360x640', + '35': '480x854', + '37': '1080x1920', + '38': '3072x4096', + '43': '360x640', + '44': '480x854', + '45': '720x1280', + } + IE_NAME = u'youtube' + + def report_lang(self): + """Report attempt to set language.""" + self._downloader.to_screen(u'[youtube] Setting language') + + def report_login(self): + """Report attempt to log in.""" + self._downloader.to_screen(u'[youtube] Logging in') + + def report_age_confirmation(self): + """Report attempt to confirm age.""" + self._downloader.to_screen(u'[youtube] Confirming age') + + def report_video_webpage_download(self, video_id): + """Report attempt to download video webpage.""" + self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id) + + def report_video_info_webpage_download(self, video_id): + """Report attempt to download video info webpage.""" + self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) + + def report_information_extraction(self, video_id): + """Report attempt to extract video information.""" + self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) + + def report_unavailable_format(self, video_id, format): + """Report extracted video URL.""" + self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format)) + + def report_rtmp_download(self): + """Indicate the download will use the RTMP protocol.""" + self._downloader.to_screen(u'[youtube] RTMP download detected') + + def _print_formats(self, formats): + print 'Available formats:' + for x in formats: + print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')) + + def _real_initialize(self): + if self._downloader is None: + return + + username = None + password = None + downloader_params = self._downloader.params + + # Attempt to use provided username and password or .netrc data + if downloader_params.get('username', None) is not None: + username = downloader_params['username'] + password = downloader_params['password'] + elif downloader_params.get('usenetrc', False): + try: + info = netrc.netrc().authenticators(self._NETRC_MACHINE) + if info is not None: + username = info[0] + password = info[2] + else: + raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) + except (IOError, netrc.NetrcParseError), err: + self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err)) + return + + # Set language + request = urllib2.Request(self._LANG_URL) + try: + self.report_lang() + urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err)) + return + + # No authentication to be performed + if username is None: + return + + # Log in + login_form = { + 'current_form': 'loginForm', + 'next': '/', + 'action_login': 'Log In', + 'username': username, + 'password': password, + } + request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form)) + try: + self.report_login() + login_results = urllib2.urlopen(request).read() + if re.search(r'(?i)