Raphaël G. Git Repositories - youtubedl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # Author: Vasyl' Vavrychuk
   7 # License: Public domain code
   8 import cookielib
   9 import datetime
  10 import htmlentitydefs
  11 import httplib
  12 import locale
  13 import math
  14 import netrc
  15 import os
  16 import os.path
  17 import re
  18 import socket
  19 import string
  20 import subprocess
  21 import sys
  22 import time
  23 import urllib
  24 import urllib2
  25
  26 # parse_qs was moved from the cgi module to the urlparse module recently.
  27 try:
  28         from urlparse import parse_qs
  29 except ImportError:
  30         from cgi import parse_qs
  31
  32 std_headers = {
  33         'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
  34         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  35         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  36         'Accept-Language': 'en-us,en;q=0.5',
  37 }
  38
  39 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  40
  41 def preferredencoding():
  42         """Get preferred encoding.
  43
  44         Returns the best encoding scheme for the system, based on
  45         locale.getpreferredencoding() and some further tweaks.
  46         """
  47         def yield_preferredencoding():
  48                 try:
  49                         pref = locale.getpreferredencoding()
  50                         u'TEST'.encode(pref)
  51                 except:
  52                         pref = 'UTF-8'
  53                 while True:
  54                         yield pref
  55         return yield_preferredencoding().next()
  56
  57 def htmlentity_transform(matchobj):
  58         """Transforms an HTML entity to a Unicode character.
  59
  60         This function receives a match object and is intended to be used with
  61         the re.sub() function.
  62         """
  63         entity = matchobj.group(1)
  64
  65         # Known non-numeric HTML entity
  66         if entity in htmlentitydefs.name2codepoint:
  67                 return unichr(htmlentitydefs.name2codepoint[entity])
  68
  69         # Unicode character
  70         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  71         if mobj is not None:
  72                 numstr = mobj.group(1)
  73                 if numstr.startswith(u'x'):
  74                         base = 16
  75                         numstr = u'0%s' % numstr
  76                 else:
  77                         base = 10
  78                 return unichr(long(numstr, base))
  79
  80         # Unknown entity in name, return its literal representation
  81         return (u'&%s;' % entity)
  82
  83 def sanitize_title(utitle):
  84         """Sanitizes a video title so it could be used as part of a filename."""
  85         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  86         return utitle.replace(unicode(os.sep), u'%')
  87
  88 def sanitize_open(filename, open_mode):
  89         """Try to open the given filename, and slightly tweak it if this fails.
  90
  91         Attempts to open the given filename. If this fails, it tries to change
  92         the filename slightly, step by step, until it's either able to open it
  93         or it fails and raises a final exception, like the standard open()
  94         function.
  95
  96         It returns the tuple (stream, definitive_file_name).
  97         """
  98         try:
  99                 if filename == u'-':
 100                         if sys.platform == 'win32':
 101                                 import msvcrt
 102                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 103                         return (sys.stdout, filename)
 104                 stream = open(filename, open_mode)
 105                 return (stream, filename)
 106         except (IOError, OSError), err:
 107                 # In case of error, try to remove win32 forbidden chars
 108                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 109
 110                 # An exception here should be caught in the caller
 111                 stream = open(filename, open_mode)
 112                 return (stream, filename)
 113
 114 class DownloadError(Exception):
 115         """Download Error exception.
 116
 117         This exception may be thrown by FileDownloader objects if they are not
 118         configured to continue on errors. They will contain the appropriate
 119         error message.
 120         """
 121         pass
 122
 123 class SameFileError(Exception):
 124         """Same File exception.
 125
 126         This exception will be thrown by FileDownloader objects if they detect
 127         multiple files would have to be downloaded to the same file on disk.
 128         """
 129         pass
 130
 131 class PostProcessingError(Exception):
 132         """Post Processing exception.
 133
 134         This exception may be raised by PostProcessor's .run() method to
 135         indicate an error in the postprocessing task.
 136         """
 137         pass
 138
 139 class UnavailableVideoError(Exception):
 140         """Unavailable Format exception.
 141
 142         This exception will be thrown when a video is requested
 143         in a format that is not available for that video.
 144         """
 145         pass
 146
 147 class ContentTooShortError(Exception):
 148         """Content Too Short exception.
 149
 150         This exception may be raised by FileDownloader objects when a file they
 151         download is too small for what the server announced first, indicating
 152         the connection was probably interrupted.
 153         """
 154         # Both in bytes
 155         downloaded = None
 156         expected = None
 157
 158         def __init__(self, downloaded, expected):
 159                 self.downloaded = downloaded
 160                 self.expected = expected
 161
 162 class FileDownloader(object):
 163         """File Downloader class.
 164
 165         File downloader objects are the ones responsible of downloading the
 166         actual video file and writing it to disk if the user has requested
 167         it, among some other tasks. In most cases there should be one per
 168         program. As, given a video URL, the downloader doesn't know how to
 169         extract all the needed information, task that InfoExtractors do, it
 170         has to pass the URL to one of them.
 171
 172         For this, file downloader objects have a method that allows
 173         InfoExtractors to be registered in a given order. When it is passed
 174         a URL, the file downloader handles it to the first InfoExtractor it
 175         finds that reports being able to handle it. The InfoExtractor extracts
 176         all the information about the video or videos the URL refers to, and
 177         asks the FileDownloader to process the video information, possibly
 178         downloading the video.
 179
 180         File downloaders accept a lot of parameters. In order not to saturate
 181         the object constructor with arguments, it receives a dictionary of
 182         options instead. These options are available through the params
 183         attribute for the InfoExtractors to use. The FileDownloader also
 184         registers itself as the downloader in charge for the InfoExtractors
 185         that are added to it, so this is a "mutual registration".
 186
 187         Available options:
 188
 189         username:         Username for authentication purposes.
 190         password:         Password for authentication purposes.
 191         usenetrc:         Use netrc for authentication instead.
 192         quiet:            Do not print messages to stdout.
 193         forceurl:         Force printing final URL.
 194         forcetitle:       Force printing title.
 195         forcethumbnail:   Force printing thumbnail URL.
 196         forcedescription: Force printing description.
 197         simulate:         Do not download the video files.
 198         format:           Video format code.
 199         format_limit:     Highest quality format to try.
 200         outtmpl:          Template for output names.
 201         ignoreerrors:     Do not stop on download errors.
 202         ratelimit:        Download speed limit, in bytes/sec.
 203         nooverwrites:     Prevent overwriting files.
 204         retries:          Number of times to retry for HTTP error 5xx
 205         continuedl:       Try to continue downloads if possible.
 206         noprogress:       Do not print the progress bar.
 207         playliststart:    Playlist item to start at.
 208         playlistend:      Playlist item to end at.
 209         logtostderr:      Log messages to stderr instead of stdout.
 210         """
 211
 212         params = None
 213         _ies = []
 214         _pps = []
 215         _download_retcode = None
 216         _num_downloads = None
 217         _screen_file = None
 218
 219         def __init__(self, params):
 220                 """Create a FileDownloader object with the given options."""
 221                 self._ies = []
 222                 self._pps = []
 223                 self._download_retcode = 0
 224                 self._num_downloads = 0
 225                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 226                 self.params = params
 227
 228         @staticmethod
 229         def pmkdir(filename):
 230                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 231                 components = filename.split(os.sep)
 232                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 233                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 234                 for dir in aggregate:
 235                         if not os.path.exists(dir):
 236                                 os.mkdir(dir)
 237
 238         @staticmethod
 239         def temp_name(filename):
 240                 """Returns a temporary filename for the given filename."""
 241                 if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)):
 242                         return filename
 243                 return filename + u'.part'
 244
 245         @staticmethod
 246         def format_bytes(bytes):
 247                 if bytes is None:
 248                         return 'N/A'
 249                 if type(bytes) is str:
 250                         bytes = float(bytes)
 251                 if bytes == 0.0:
 252                         exponent = 0
 253                 else:
 254                         exponent = long(math.log(bytes, 1024.0))
 255                 suffix = 'bkMGTPEZY'[exponent]
 256                 converted = float(bytes) / float(1024**exponent)
 257                 return '%.2f%s' % (converted, suffix)
 258
 259         @staticmethod
 260         def calc_percent(byte_counter, data_len):
 261                 if data_len is None:
 262                         return '---.-%'
 263                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 264
 265         @staticmethod
 266         def calc_eta(start, now, total, current):
 267                 if total is None:
 268                         return '--:--'
 269                 dif = now - start
 270                 if current == 0 or dif < 0.001: # One millisecond
 271                         return '--:--'
 272                 rate = float(current) / dif
 273                 eta = long((float(total) - float(current)) / rate)
 274                 (eta_mins, eta_secs) = divmod(eta, 60)
 275                 if eta_mins > 99:
 276                         return '--:--'
 277                 return '%02d:%02d' % (eta_mins, eta_secs)
 278
 279         @staticmethod
 280         def calc_speed(start, now, bytes):
 281                 dif = now - start
 282                 if bytes == 0 or dif < 0.001: # One millisecond
 283                         return '%10s' % '---b/s'
 284                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 285
 286         @staticmethod
 287         def best_block_size(elapsed_time, bytes):
 288                 new_min = max(bytes / 2.0, 1.0)
 289                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 290                 if elapsed_time < 0.001:
 291                         return long(new_max)
 292                 rate = bytes / elapsed_time
 293                 if rate > new_max:
 294                         return long(new_max)
 295                 if rate < new_min:
 296                         return long(new_min)
 297                 return long(rate)
 298
 299         @staticmethod
 300         def parse_bytes(bytestr):
 301                 """Parse a string indicating a byte quantity into a long integer."""
 302                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 303                 if matchobj is None:
 304                         return None
 305                 number = float(matchobj.group(1))
 306                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 307                 return long(round(number * multiplier))
 308
 309         def add_info_extractor(self, ie):
 310                 """Add an InfoExtractor object to the end of the list."""
 311                 self._ies.append(ie)
 312                 ie.set_downloader(self)
 313
 314         def add_post_processor(self, pp):
 315                 """Add a PostProcessor object to the end of the chain."""
 316                 self._pps.append(pp)
 317                 pp.set_downloader(self)
 318
 319         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 320                 """Print message to stdout if not in quiet mode."""
 321                 try:
 322                         if not self.params.get('quiet', False):
 323                                 terminator = [u'\n', u''][skip_eol]
 324                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 325                         self._screen_file.flush()
 326                 except (UnicodeEncodeError), err:
 327                         if not ignore_encoding_errors:
 328                                 raise
 329
 330         def to_stderr(self, message):
 331                 """Print message to stderr."""
 332                 print >>sys.stderr, message.encode(preferredencoding())
 333
 334         def fixed_template(self):
 335                 """Checks if the output template is fixed."""
 336                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 337
 338         def trouble(self, message=None):
 339                 """Determine action to take when a download problem appears.
 340
 341                 Depending on if the downloader has been configured to ignore
 342                 download errors or not, this method may throw an exception or
 343                 not when errors are found, after printing the message.
 344                 """
 345                 if message is not None:
 346                         self.to_stderr(message)
 347                 if not self.params.get('ignoreerrors', False):
 348                         raise DownloadError(message)
 349                 self._download_retcode = 1
 350
 351         def slow_down(self, start_time, byte_counter):
 352                 """Sleep if the download speed is over the rate limit."""
 353                 rate_limit = self.params.get('ratelimit', None)
 354                 if rate_limit is None or byte_counter == 0:
 355                         return
 356                 now = time.time()
 357                 elapsed = now - start_time
 358                 if elapsed <= 0.0:
 359                         return
 360                 speed = float(byte_counter) / elapsed
 361                 if speed > rate_limit:
 362                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 363
 364         def try_rename(self, old_filename, new_filename):
 365                 try:
 366                         if old_filename == new_filename:
 367                                 return
 368                         os.rename(old_filename, new_filename)
 369                 except (IOError, OSError), err:
 370                         self.trouble(u'ERROR: unable to rename file')
 371
 372         def report_destination(self, filename):
 373                 """Report destination filename."""
 374                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 375
 376         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 377                 """Report download progress."""
 378                 if self.params.get('noprogress', False):
 379                         return
 380                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 381                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 382
 383         def report_resuming_byte(self, resume_len):
 384                 """Report attempt to resume at given byte."""
 385                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 386
 387         def report_retry(self, count, retries):
 388                 """Report retry in case of HTTP error 5xx"""
 389                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 390
 391         def report_file_already_downloaded(self, file_name):
 392                 """Report file has already been fully downloaded."""
 393                 try:
 394                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 395                 except (UnicodeEncodeError), err:
 396                         self.to_screen(u'[download] The file has already been downloaded')
 397
 398         def report_unable_to_resume(self):
 399                 """Report it was impossible to resume download."""
 400                 self.to_screen(u'[download] Unable to resume')
 401
 402         def report_finish(self):
 403                 """Report download finished."""
 404                 if self.params.get('noprogress', False):
 405                         self.to_screen(u'[download] Download completed')
 406                 else:
 407                         self.to_screen(u'')
 408
 409         def increment_downloads(self):
 410                 """Increment the ordinal that assigns a number to each file."""
 411                 self._num_downloads += 1
 412
 413         def process_info(self, info_dict):
 414                 """Process a single dictionary returned by an InfoExtractor."""
 415                 # Do nothing else if in simulate mode
 416                 if self.params.get('simulate', False):
 417                         # Forced printings
 418                         if self.params.get('forcetitle', False):
 419                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 420                         if self.params.get('forceurl', False):
 421                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 422                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 423                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 424                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 425                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 426
 427                         return
 428
 429                 try:
 430                         template_dict = dict(info_dict)
 431                         template_dict['epoch'] = unicode(long(time.time()))
 432                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 433                         filename = self.params['outtmpl'] % template_dict
 434                 except (ValueError, KeyError), err:
 435                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 436                         return
 437                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 438                         self.to_stderr(u'WARNING: file exists and will be skipped')
 439                         return
 440
 441                 try:
 442                         self.pmkdir(filename)
 443                 except (OSError, IOError), err:
 444                         self.trouble(u'ERROR: unable to create directories: %s' % str(err))
 445                         return
 446
 447                 try:
 448                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
 449                 except (OSError, IOError), err:
 450                         raise UnavailableVideoError
 451                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 452                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 453                         return
 454                 except (ContentTooShortError, ), err:
 455                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 456                         return
 457
 458                 if success:
 459                         try:
 460                                 self.post_process(filename, info_dict)
 461                         except (PostProcessingError), err:
 462                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
 463                                 return
 464
 465         def download(self, url_list):
 466                 """Download a given list of URLs."""
 467                 if len(url_list) > 1 and self.fixed_template():
 468                         raise SameFileError(self.params['outtmpl'])
 469
 470                 for url in url_list:
 471                         suitable_found = False
 472                         for ie in self._ies:
 473                                 # Go to next InfoExtractor if not suitable
 474                                 if not ie.suitable(url):
 475                                         continue
 476
 477                                 # Suitable InfoExtractor found
 478                                 suitable_found = True
 479
 480                                 # Extract information from URL and process it
 481                                 ie.extract(url)
 482
 483                                 # Suitable InfoExtractor had been found; go to next URL
 484                                 break
 485
 486                         if not suitable_found:
 487                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 488
 489                 return self._download_retcode
 490
 491         def post_process(self, filename, ie_info):
 492                 """Run the postprocessing chain on the given file."""
 493                 info = dict(ie_info)
 494                 info['filepath'] = filename
 495                 for pp in self._pps:
 496                         info = pp.run(info)
 497                         if info is None:
 498                                 break
 499
 500         def _download_with_rtmpdump(self, filename, url, player_url):
 501                 self.report_destination(filename)
 502                 tmpfilename = self.temp_name(filename)
 503
 504                 # Check for rtmpdump first
 505                 try:
 506                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 507                 except (OSError, IOError):
 508                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 509                         return False
 510
 511                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 512                 # the connection was interrumpted and resuming appears to be
 513                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 514                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 515                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 516                 while retval == 2 or retval == 1:
 517                         prevsize = os.path.getsize(tmpfilename)
 518                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 519                         time.sleep(5.0) # This seems to be needed
 520                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 521                         cursize = os.path.getsize(tmpfilename)
 522                         if prevsize == cursize and retval == 1:
 523                                 break
 524                 if retval == 0:
 525                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 526                         self.try_rename(tmpfilename, filename)
 527                         return True
 528                 else:
 529                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 530                         return False
 531
 532         def _do_download(self, filename, url, player_url):
 533                 # Check file already present
 534                 if self.params.get('continuedl', False) and os.path.isfile(filename):
 535                         self.report_file_already_downloaded(filename)
 536                         return True
 537
 538                 # Attempt to download using rtmpdump
 539                 if url.startswith('rtmp'):
 540                         return self._download_with_rtmpdump(filename, url, player_url)
 541
 542                 tmpfilename = self.temp_name(filename)
 543                 stream = None
 544                 open_mode = 'wb'
 545                 basic_request = urllib2.Request(url, None, std_headers)
 546                 request = urllib2.Request(url, None, std_headers)
 547
 548                 # Establish possible resume length
 549                 if os.path.isfile(tmpfilename):
 550                         resume_len = os.path.getsize(tmpfilename)
 551                 else:
 552                         resume_len = 0
 553
 554                 # Request parameters in case of being able to resume
 555                 if self.params.get('continuedl', False) and resume_len != 0:
 556                         self.report_resuming_byte(resume_len)
 557                         request.add_header('Range','bytes=%d-' % resume_len)
 558                         open_mode = 'ab'
 559
 560                 count = 0
 561                 retries = self.params.get('retries', 0)
 562                 while count <= retries:
 563                         # Establish connection
 564                         try:
 565                                 data = urllib2.urlopen(request)
 566                                 break
 567                         except (urllib2.HTTPError, ), err:
 568                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 569                                         # Unexpected HTTP error
 570                                         raise
 571                                 elif err.code == 416:
 572                                         # Unable to resume (requested range not satisfiable)
 573                                         try:
 574                                                 # Open the connection again without the range header
 575                                                 data = urllib2.urlopen(basic_request)
 576                                                 content_length = data.info()['Content-Length']
 577                                         except (urllib2.HTTPError, ), err:
 578                                                 if err.code < 500 or err.code >= 600:
 579                                                         raise
 580                                         else:
 581                                                 # Examine the reported length
 582                                                 if (content_length is not None and
 583                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
 584                                                         # The file had already been fully downloaded.
 585                                                         # Explanation to the above condition: in issue #175 it was revealed that
 586                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 587                                                         # changing the file size slightly and causing problems for some users. So
 588                                                         # I decided to implement a suggested change and consider the file
 589                                                         # completely downloaded if the file size differs less than 100 bytes from
 590                                                         # the one in the hard drive.
 591                                                         self.report_file_already_downloaded(filename)
 592                                                         self.try_rename(tmpfilename, filename)
 593                                                         return True
 594                                                 else:
 595                                                         # The length does not match, we start the download over
 596                                                         self.report_unable_to_resume()
 597                                                         open_mode = 'wb'
 598                                                         break
 599                         # Retry
 600                         count += 1
 601                         if count <= retries:
 602                                 self.report_retry(count, retries)
 603
 604                 if count > retries:
 605                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 606                         return False
 607
 608                 data_len = data.info().get('Content-length', None)
 609                 data_len_str = self.format_bytes(data_len)
 610                 byte_counter = 0
 611                 block_size = 1024
 612                 start = time.time()
 613                 while True:
 614                         # Download and write
 615                         before = time.time()
 616                         data_block = data.read(block_size)
 617                         after = time.time()
 618                         data_block_len = len(data_block)
 619                         if data_block_len == 0:
 620                                 break
 621                         byte_counter += data_block_len
 622
 623                         # Open file just in time
 624                         if stream is None:
 625                                 try:
 626                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 627                                         self.report_destination(filename)
 628                                 except (OSError, IOError), err:
 629                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 630                                         return False
 631                         try:
 632                                 stream.write(data_block)
 633                         except (IOError, OSError), err:
 634                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 635                                 return False
 636                         block_size = self.best_block_size(after - before, data_block_len)
 637
 638                         # Progress message
 639                         percent_str = self.calc_percent(byte_counter, data_len)
 640                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 641                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 642                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 643
 644                         # Apply rate limit
 645                         self.slow_down(start, byte_counter)
 646
 647                 stream.close()
 648                 self.report_finish()
 649                 if data_len is not None and str(byte_counter) != data_len:
 650                         raise ContentTooShortError(byte_counter, long(data_len))
 651                 self.try_rename(tmpfilename, filename)
 652                 return True
 653
 654 class InfoExtractor(object):
 655         """Information Extractor class.
 656
 657         Information extractors are the classes that, given a URL, extract
 658         information from the video (or videos) the URL refers to. This
 659         information includes the real video URL, the video title and simplified
 660         title, author and others. The information is stored in a dictionary
 661         which is then passed to the FileDownloader. The FileDownloader
 662         processes this information possibly downloading the video to the file
 663         system, among other possible outcomes. The dictionaries must include
 664         the following fields:
 665
 666         id:             Video identifier.
 667         url:            Final video URL.
 668         uploader:       Nickname of the video uploader.
 669         title:          Literal title.
 670         stitle:         Simplified title.
 671         ext:            Video filename extension.
 672         format:         Video format.
 673         player_url:     SWF Player URL (may be None).
 674
 675         The following fields are optional. Their primary purpose is to allow
 676         youtube-dl to serve as the backend for a video search function, such
 677         as the one in youtube2mp3.  They are only used when their respective
 678         forced printing functions are called:
 679
 680         thumbnail:      Full URL to a video thumbnail image.
 681         description:    One-line video description.
 682
 683         Subclasses of this one should re-define the _real_initialize() and
 684         _real_extract() methods, as well as the suitable() static method.
 685         Probably, they should also be instantiated and added to the main
 686         downloader.
 687         """
 688
 689         _ready = False
 690         _downloader = None
 691
 692         def __init__(self, downloader=None):
 693                 """Constructor. Receives an optional downloader."""
 694                 self._ready = False
 695                 self.set_downloader(downloader)
 696
 697         @staticmethod
 698         def suitable(url):
 699                 """Receives a URL and returns True if suitable for this IE."""
 700                 return False
 701
 702         def initialize(self):
 703                 """Initializes an instance (authentication, etc)."""
 704                 if not self._ready:
 705                         self._real_initialize()
 706                         self._ready = True
 707
 708         def extract(self, url):
 709                 """Extracts URL information and returns it in list of dicts."""
 710                 self.initialize()
 711                 return self._real_extract(url)
 712
 713         def set_downloader(self, downloader):
 714                 """Sets the downloader for this IE."""
 715                 self._downloader = downloader
 716
 717         def _real_initialize(self):
 718                 """Real initialization process. Redefine in subclasses."""
 719                 pass
 720
 721         def _real_extract(self, url):
 722                 """Real extraction process. Redefine in subclasses."""
 723                 pass
 724
 725 class YoutubeIE(InfoExtractor):
 726         """Information extractor for youtube.com."""
 727
 728         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
 729         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 730         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 731         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 732         _NETRC_MACHINE = 'youtube'
 733         # Listed in order of quality
 734         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
 735         _video_extensions = {
 736                 '13': '3gp',
 737                 '17': 'mp4',
 738                 '18': 'mp4',
 739                 '22': 'mp4',
 740                 '37': 'mp4',
 741                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 742                 '43': 'webm',
 743                 '45': 'webm',
 744         }
 745
 746         @staticmethod
 747         def suitable(url):
 748                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 749
 750         def report_lang(self):
 751                 """Report attempt to set language."""
 752                 self._downloader.to_screen(u'[youtube] Setting language')
 753
 754         def report_login(self):
 755                 """Report attempt to log in."""
 756                 self._downloader.to_screen(u'[youtube] Logging in')
 757
 758         def report_age_confirmation(self):
 759                 """Report attempt to confirm age."""
 760                 self._downloader.to_screen(u'[youtube] Confirming age')
 761
 762         def report_video_webpage_download(self, video_id):
 763                 """Report attempt to download video webpage."""
 764                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 765
 766         def report_video_info_webpage_download(self, video_id):
 767                 """Report attempt to download video info webpage."""
 768                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 769
 770         def report_information_extraction(self, video_id):
 771                 """Report attempt to extract video information."""
 772                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 773
 774         def report_unavailable_format(self, video_id, format):
 775                 """Report extracted video URL."""
 776                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 777
 778         def report_rtmp_download(self):
 779                 """Indicate the download will use the RTMP protocol."""
 780                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 781
 782         def _real_initialize(self):
 783                 if self._downloader is None:
 784                         return
 785
 786                 username = None
 787                 password = None
 788                 downloader_params = self._downloader.params
 789
 790                 # Attempt to use provided username and password or .netrc data
 791                 if downloader_params.get('username', None) is not None:
 792                         username = downloader_params['username']
 793                         password = downloader_params['password']
 794                 elif downloader_params.get('usenetrc', False):
 795                         try:
 796                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 797                                 if info is not None:
 798                                         username = info[0]
 799                                         password = info[2]
 800                                 else:
 801                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 802                         except (IOError, netrc.NetrcParseError), err:
 803                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 804                                 return
 805
 806                 # Set language
 807                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 808                 try:
 809                         self.report_lang()
 810                         urllib2.urlopen(request).read()
 811                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 812                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 813                         return
 814
 815                 # No authentication to be performed
 816                 if username is None:
 817                         return
 818
 819                 # Log in
 820                 login_form = {
 821                                 'current_form': 'loginForm',
 822                                 'next':         '/',
 823                                 'action_login': 'Log In',
 824                                 'username':     username,
 825                                 'password':     password,
 826                                 }
 827                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 828                 try:
 829                         self.report_login()
 830                         login_results = urllib2.urlopen(request).read()
 831                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 832                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 833                                 return
 834                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 835                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 836                         return
 837
 838                 # Confirm age
 839                 age_form = {
 840                                 'next_url':             '/',
 841                                 'action_confirm':       'Confirm',
 842                                 }
 843                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 844                 try:
 845                         self.report_age_confirmation()
 846                         age_results = urllib2.urlopen(request).read()
 847                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 848                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 849                         return
 850
 851         def _real_extract(self, url):
 852                 # Extract video id from URL
 853                 mobj = re.match(self._VALID_URL, url)
 854                 if mobj is None:
 855                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 856                         return
 857                 video_id = mobj.group(2)
 858
 859                 # Get video webpage
 860                 self.report_video_webpage_download(video_id)
 861                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id, None, std_headers)
 862                 try:
 863                         video_webpage = urllib2.urlopen(request).read()
 864                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 865                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 866                         return
 867
 868                 # Attempt to extract SWF player URL
 869                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 870                 if mobj is not None:
 871                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 872                 else:
 873                         player_url = None
 874
 875                 # Get video info
 876                 self.report_video_info_webpage_download(video_id)
 877                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 878                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 879                                            % (video_id, el_type))
 880                         request = urllib2.Request(video_info_url, None, std_headers)
 881                         try:
 882                                 video_info_webpage = urllib2.urlopen(request).read()
 883                                 video_info = parse_qs(video_info_webpage)
 884                                 if 'token' in video_info:
 885                                         break
 886                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 887                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 888                                 return
 889                 if 'token' not in video_info:
 890                         if 'reason' in video_info:
 891                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 892                         else:
 893                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 894                         return
 895
 896                 # Start extracting information
 897                 self.report_information_extraction(video_id)
 898
 899                 # uploader
 900                 if 'author' not in video_info:
 901                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 902                         return
 903                 video_uploader = urllib.unquote_plus(video_info['author'][0])
 904
 905                 # title
 906                 if 'title' not in video_info:
 907                         self._downloader.trouble(u'ERROR: unable to extract video title')
 908                         return
 909                 video_title = urllib.unquote_plus(video_info['title'][0])
 910                 video_title = video_title.decode('utf-8')
 911                 video_title = sanitize_title(video_title)
 912
 913                 # simplified title
 914                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 915                 simple_title = simple_title.strip(ur'_')
 916
 917                 # thumbnail image
 918                 if 'thumbnail_url' not in video_info:
 919                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 920                         video_thumbnail = ''
 921                 else:   # don't panic if we can't find it
 922                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 923
 924                 # upload date
 925                 upload_date = u'NA'
 926                 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
 927                 if mobj is not None:
 928                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 929                         format_expressions = ['%d %B %Y', '%B %d %Y']
 930                         for expression in format_expressions:
 931                                 try:
 932                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 933                                 except:
 934                                         pass
 935
 936                 # description
 937                 video_description = 'No description available.'
 938                 if self._downloader.params.get('forcedescription', False):
 939                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
 940                         if mobj is not None:
 941                                 video_description = mobj.group(1)
 942
 943                 # token
 944                 video_token = urllib.unquote_plus(video_info['token'][0])
 945
 946                 # Decide which formats to download
 947                 req_format = self._downloader.params.get('format', None)
 948                 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
 949
 950                 if 'fmt_url_map' in video_info:
 951                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
 952                         format_limit = self._downloader.params.get('format_limit', None)
 953                         if format_limit is not None and format_limit in self._available_formats:
 954                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
 955                         else:
 956                                 format_list = self._available_formats
 957                         existing_formats = [x for x in format_list if x in url_map]
 958                         if len(existing_formats) == 0:
 959                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 960                                 return
 961                         if req_format is None:
 962                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 963                         elif req_format == '-1':
 964                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 965                         else:
 966                                 if req_format in url_map:
 967                                         video_url_list = [(req_format, url_map[req_format])] # Specific format
 968                                 else:
 969                                         video_url_list = [(req_format, get_video_template % req_format)] # Specific format
 970
 971                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 972                         self.report_rtmp_download()
 973                         video_url_list = [(None, video_info['conn'][0])]
 974
 975                 else:
 976                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
 977                         return
 978
 979                 for format_param, video_real_url in video_url_list:
 980                         # At this point we have a new video
 981                         self._downloader.increment_downloads()
 982
 983                         # Extension
 984                         video_extension = self._video_extensions.get(format_param, 'flv')
 985
 986                         # Find the video URL in fmt_url_map or conn paramters
 987                         try:
 988                                 # Process video information
 989                                 self._downloader.process_info({
 990                                         'id':           video_id.decode('utf-8'),
 991                                         'url':          video_real_url.decode('utf-8'),
 992                                         'uploader':     video_uploader.decode('utf-8'),
 993                                         'upload_date':  upload_date,
 994                                         'title':        video_title,
 995                                         'stitle':       simple_title,
 996                                         'ext':          video_extension.decode('utf-8'),
 997                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 998                                         'thumbnail':    video_thumbnail.decode('utf-8'),
 999                                         'description':  video_description.decode('utf-8'),
1000                                         'player_url':   player_url,
1001                                 })
1002                         except UnavailableVideoError, err:
1003                                 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
1004
1005
1006 class MetacafeIE(InfoExtractor):
1007         """Information Extractor for metacafe.com."""
1008
1009         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1010         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1011         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1012         _youtube_ie = None
1013
1014         def __init__(self, youtube_ie, downloader=None):
1015                 InfoExtractor.__init__(self, downloader)
1016                 self._youtube_ie = youtube_ie
1017
1018         @staticmethod
1019         def suitable(url):
1020                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1021
1022         def report_disclaimer(self):
1023                 """Report disclaimer retrieval."""
1024                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1025
1026         def report_age_confirmation(self):
1027                 """Report attempt to confirm age."""
1028                 self._downloader.to_screen(u'[metacafe] Confirming age')
1029
1030         def report_download_webpage(self, video_id):
1031                 """Report webpage download."""
1032                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1033
1034         def report_extraction(self, video_id):
1035                 """Report information extraction."""
1036                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1037
1038         def _real_initialize(self):
1039                 # Retrieve disclaimer
1040                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1041                 try:
1042                         self.report_disclaimer()
1043                         disclaimer = urllib2.urlopen(request).read()
1044                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1045                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1046                         return
1047
1048                 # Confirm age
1049                 disclaimer_form = {
1050                         'filters': '0',
1051                         'submit': "Continue - I'm over 18",
1052                         }
1053                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1054                 try:
1055                         self.report_age_confirmation()
1056                         disclaimer = urllib2.urlopen(request).read()
1057                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1058                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1059                         return
1060
1061         def _real_extract(self, url):
1062                 # Extract id and simplified title from URL
1063                 mobj = re.match(self._VALID_URL, url)
1064                 if mobj is None:
1065                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1066                         return
1067
1068                 video_id = mobj.group(1)
1069
1070                 # Check if video comes from YouTube
1071                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1072                 if mobj2 is not None:
1073                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1074                         return
1075
1076                 # At this point we have a new video
1077                 self._downloader.increment_downloads()
1078
1079                 simple_title = mobj.group(2).decode('utf-8')
1080
1081                 # Retrieve video webpage to extract further information
1082                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1083                 try:
1084                         self.report_download_webpage(video_id)
1085                         webpage = urllib2.urlopen(request).read()
1086                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1087                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1088                         return
1089
1090                 # Extract URL, uploader and title from webpage
1091                 self.report_extraction(video_id)
1092                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1093                 if mobj is not None:
1094                         mediaURL = urllib.unquote(mobj.group(1))
1095                         video_extension = mediaURL[-3:]
1096
1097                         # Extract gdaKey if available
1098                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1099                         if mobj is None:
1100                                 video_url = mediaURL
1101                         else:
1102                                 gdaKey = mobj.group(1)
1103                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1104                 else:
1105                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1106                         if mobj is None:
1107                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1108                                 return
1109                         vardict = parse_qs(mobj.group(1))
1110                         if 'mediaData' not in vardict:
1111                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1112                                 return
1113                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1114                         if mobj is None:
1115                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1116                                 return
1117                         mediaURL = mobj.group(1).replace('\\/', '/')
1118                         video_extension = mediaURL[-3:]
1119                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1120
1121                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1122                 if mobj is None:
1123                         self._downloader.trouble(u'ERROR: unable to extract title')
1124                         return
1125                 video_title = mobj.group(1).decode('utf-8')
1126                 video_title = sanitize_title(video_title)
1127
1128                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1129                 if mobj is None:
1130                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1131                         return
1132                 video_uploader = mobj.group(1)
1133
1134                 try:
1135                         # Process video information
1136                         self._downloader.process_info({
1137                                 'id':           video_id.decode('utf-8'),
1138                                 'url':          video_url.decode('utf-8'),
1139                                 'uploader':     video_uploader.decode('utf-8'),
1140                                 'upload_date':  u'NA',
1141                                 'title':        video_title,
1142                                 'stitle':       simple_title,
1143                                 'ext':          video_extension.decode('utf-8'),
1144                                 'format':       u'NA',
1145                                 'player_url':   None,
1146                         })
1147                 except UnavailableVideoError:
1148                         self._downloader.trouble(u'ERROR: unable to download video')
1149
1150
1151 class DailymotionIE(InfoExtractor):
1152         """Information Extractor for Dailymotion"""
1153
1154         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1155
1156         def __init__(self, downloader=None):
1157                 InfoExtractor.__init__(self, downloader)
1158
1159         @staticmethod
1160         def suitable(url):
1161                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1162
1163         def report_download_webpage(self, video_id):
1164                 """Report webpage download."""
1165                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1166
1167         def report_extraction(self, video_id):
1168                 """Report information extraction."""
1169                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1170
1171         def _real_initialize(self):
1172                 return
1173
1174         def _real_extract(self, url):
1175                 # Extract id and simplified title from URL
1176                 mobj = re.match(self._VALID_URL, url)
1177                 if mobj is None:
1178                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1179                         return
1180
1181                 # At this point we have a new video
1182                 self._downloader.increment_downloads()
1183                 video_id = mobj.group(1)
1184
1185                 simple_title = mobj.group(2).decode('utf-8')
1186                 video_extension = 'flv'
1187
1188                 # Retrieve video webpage to extract further information
1189                 request = urllib2.Request(url)
1190                 try:
1191                         self.report_download_webpage(video_id)
1192                         webpage = urllib2.urlopen(request).read()
1193                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1194                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1195                         return
1196
1197                 # Extract URL, uploader and title from webpage
1198                 self.report_extraction(video_id)
1199                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1200                 if mobj is None:
1201                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1202                         return
1203                 mediaURL = urllib.unquote(mobj.group(1))
1204
1205                 # if needed add http://www.dailymotion.com/ if relative URL
1206
1207                 video_url = mediaURL
1208
1209                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1210                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1211                 if mobj is None:
1212                         self._downloader.trouble(u'ERROR: unable to extract title')
1213                         return
1214                 video_title = mobj.group(1).decode('utf-8')
1215                 video_title = sanitize_title(video_title)
1216
1217                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1218                 if mobj is None:
1219                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1220                         return
1221                 video_uploader = mobj.group(1)
1222
1223                 try:
1224                         # Process video information
1225                         self._downloader.process_info({
1226                                 'id':           video_id.decode('utf-8'),
1227                                 'url':          video_url.decode('utf-8'),
1228                                 'uploader':     video_uploader.decode('utf-8'),
1229                                 'upload_date':  u'NA',
1230                                 'title':        video_title,
1231                                 'stitle':       simple_title,
1232                                 'ext':          video_extension.decode('utf-8'),
1233                                 'format':       u'NA',
1234                                 'player_url':   None,
1235                         })
1236                 except UnavailableVideoError:
1237                         self._downloader.trouble(u'ERROR: unable to download video')
1238
1239 class GoogleIE(InfoExtractor):
1240         """Information extractor for video.google.com."""
1241
1242         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1243
1244         def __init__(self, downloader=None):
1245                 InfoExtractor.__init__(self, downloader)
1246
1247         @staticmethod
1248         def suitable(url):
1249                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1250
1251         def report_download_webpage(self, video_id):
1252                 """Report webpage download."""
1253                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1254
1255         def report_extraction(self, video_id):
1256                 """Report information extraction."""
1257                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1258
1259         def _real_initialize(self):
1260                 return
1261
1262         def _real_extract(self, url):
1263                 # Extract id from URL
1264                 mobj = re.match(self._VALID_URL, url)
1265                 if mobj is None:
1266                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1267                         return
1268
1269                 # At this point we have a new video
1270                 self._downloader.increment_downloads()
1271                 video_id = mobj.group(1)
1272
1273                 video_extension = 'mp4'
1274
1275                 # Retrieve video webpage to extract further information
1276                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1277                 try:
1278                         self.report_download_webpage(video_id)
1279                         webpage = urllib2.urlopen(request).read()
1280                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1281                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1282                         return
1283
1284                 # Extract URL, uploader, and title from webpage
1285                 self.report_extraction(video_id)
1286                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1287                 if mobj is None:
1288                         video_extension = 'flv'
1289                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1290                 if mobj is None:
1291                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1292                         return
1293                 mediaURL = urllib.unquote(mobj.group(1))
1294                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1295                 mediaURL = mediaURL.replace('\\x26', '\x26')
1296
1297                 video_url = mediaURL
1298
1299                 mobj = re.search(r'<title>(.*)</title>', webpage)
1300                 if mobj is None:
1301                         self._downloader.trouble(u'ERROR: unable to extract title')
1302                         return
1303                 video_title = mobj.group(1).decode('utf-8')
1304                 video_title = sanitize_title(video_title)
1305                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1306
1307                 # Extract video description
1308                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1309                 if mobj is None:
1310                         self._downloader.trouble(u'ERROR: unable to extract video description')
1311                         return
1312                 video_description = mobj.group(1).decode('utf-8')
1313                 if not video_description:
1314                         video_description = 'No description available.'
1315
1316                 # Extract video thumbnail
1317                 if self._downloader.params.get('forcethumbnail', False):
1318                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1319                         try:
1320                                 webpage = urllib2.urlopen(request).read()
1321                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1322                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1323                                 return
1324                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1325                         if mobj is None:
1326                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1327                                 return
1328                         video_thumbnail = mobj.group(1)
1329                 else:   # we need something to pass to process_info
1330                         video_thumbnail = ''
1331
1332
1333                 try:
1334                         # Process video information
1335                         self._downloader.process_info({
1336                                 'id':           video_id.decode('utf-8'),
1337                                 'url':          video_url.decode('utf-8'),
1338                                 'uploader':     u'NA',
1339                                 'upload_date':  u'NA',
1340                                 'title':        video_title,
1341                                 'stitle':       simple_title,
1342                                 'ext':          video_extension.decode('utf-8'),
1343                                 'format':       u'NA',
1344                                 'player_url':   None,
1345                         })
1346                 except UnavailableVideoError:
1347                         self._downloader.trouble(u'ERROR: unable to download video')
1348
1349
1350 class PhotobucketIE(InfoExtractor):
1351         """Information extractor for photobucket.com."""
1352
1353         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1354
1355         def __init__(self, downloader=None):
1356                 InfoExtractor.__init__(self, downloader)
1357
1358         @staticmethod
1359         def suitable(url):
1360                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1361
1362         def report_download_webpage(self, video_id):
1363                 """Report webpage download."""
1364                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1365
1366         def report_extraction(self, video_id):
1367                 """Report information extraction."""
1368                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1369
1370         def _real_initialize(self):
1371                 return
1372
1373         def _real_extract(self, url):
1374                 # Extract id from URL
1375                 mobj = re.match(self._VALID_URL, url)
1376                 if mobj is None:
1377                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1378                         return
1379
1380                 # At this point we have a new video
1381                 self._downloader.increment_downloads()
1382                 video_id = mobj.group(1)
1383
1384                 video_extension = 'flv'
1385
1386                 # Retrieve video webpage to extract further information
1387                 request = urllib2.Request(url)
1388                 try:
1389                         self.report_download_webpage(video_id)
1390                         webpage = urllib2.urlopen(request).read()
1391                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1392                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1393                         return
1394
1395                 # Extract URL, uploader, and title from webpage
1396                 self.report_extraction(video_id)
1397                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1398                 if mobj is None:
1399                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1400                         return
1401                 mediaURL = urllib.unquote(mobj.group(1))
1402
1403                 video_url = mediaURL
1404
1405                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1406                 if mobj is None:
1407                         self._downloader.trouble(u'ERROR: unable to extract title')
1408                         return
1409                 video_title = mobj.group(1).decode('utf-8')
1410                 video_title = sanitize_title(video_title)
1411                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1412
1413                 video_uploader = mobj.group(2).decode('utf-8')
1414
1415                 try:
1416                         # Process video information
1417                         self._downloader.process_info({
1418                                 'id':           video_id.decode('utf-8'),
1419                                 'url':          video_url.decode('utf-8'),
1420                                 'uploader':     video_uploader,
1421                                 'upload_date':  u'NA',
1422                                 'title':        video_title,
1423                                 'stitle':       simple_title,
1424                                 'ext':          video_extension.decode('utf-8'),
1425                                 'format':       u'NA',
1426                                 'player_url':   None,
1427                         })
1428                 except UnavailableVideoError:
1429                         self._downloader.trouble(u'ERROR: unable to download video')
1430
1431
1432 class YahooIE(InfoExtractor):
1433         """Information extractor for video.yahoo.com."""
1434
1435         # _VALID_URL matches all Yahoo! Video URLs
1436         # _VPAGE_URL matches only the extractable '/watch/' URLs
1437         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1438         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1439
1440         def __init__(self, downloader=None):
1441                 InfoExtractor.__init__(self, downloader)
1442
1443         @staticmethod
1444         def suitable(url):
1445                 return (re.match(YahooIE._VALID_URL, url) is not None)
1446
1447         def report_download_webpage(self, video_id):
1448                 """Report webpage download."""
1449                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1450
1451         def report_extraction(self, video_id):
1452                 """Report information extraction."""
1453                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1454
1455         def _real_initialize(self):
1456                 return
1457
1458         def _real_extract(self, url, new_video=True):
1459                 # Extract ID from URL
1460                 mobj = re.match(self._VALID_URL, url)
1461                 if mobj is None:
1462                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1463                         return
1464
1465                 # At this point we have a new video
1466                 self._downloader.increment_downloads()
1467                 video_id = mobj.group(2)
1468                 video_extension = 'flv'
1469
1470                 # Rewrite valid but non-extractable URLs as
1471                 # extractable English language /watch/ URLs
1472                 if re.match(self._VPAGE_URL, url) is None:
1473                         request = urllib2.Request(url)
1474                         try:
1475                                 webpage = urllib2.urlopen(request).read()
1476                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1477                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1478                                 return
1479
1480                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1481                         if mobj is None:
1482                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1483                                 return
1484                         yahoo_id = mobj.group(1)
1485
1486                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1487                         if mobj is None:
1488                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1489                                 return
1490                         yahoo_vid = mobj.group(1)
1491
1492                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1493                         return self._real_extract(url, new_video=False)
1494
1495                 # Retrieve video webpage to extract further information
1496                 request = urllib2.Request(url)
1497                 try:
1498                         self.report_download_webpage(video_id)
1499                         webpage = urllib2.urlopen(request).read()
1500                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1501                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1502                         return
1503
1504                 # Extract uploader and title from webpage
1505                 self.report_extraction(video_id)
1506                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1507                 if mobj is None:
1508                         self._downloader.trouble(u'ERROR: unable to extract video title')
1509                         return
1510                 video_title = mobj.group(1).decode('utf-8')
1511                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1512
1513                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1514                 if mobj is None:
1515                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1516                         return
1517                 video_uploader = mobj.group(1).decode('utf-8')
1518
1519                 # Extract video thumbnail
1520                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1521                 if mobj is None:
1522                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1523                         return
1524                 video_thumbnail = mobj.group(1).decode('utf-8')
1525
1526                 # Extract video description
1527                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1528                 if mobj is None:
1529                         self._downloader.trouble(u'ERROR: unable to extract video description')
1530                         return
1531                 video_description = mobj.group(1).decode('utf-8')
1532                 if not video_description: video_description = 'No description available.'
1533
1534                 # Extract video height and width
1535                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1536                 if mobj is None:
1537                         self._downloader.trouble(u'ERROR: unable to extract video height')
1538                         return
1539                 yv_video_height = mobj.group(1)
1540
1541                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1542                 if mobj is None:
1543                         self._downloader.trouble(u'ERROR: unable to extract video width')
1544                         return
1545                 yv_video_width = mobj.group(1)
1546
1547                 # Retrieve video playlist to extract media URL
1548                 # I'm not completely sure what all these options are, but we
1549                 # seem to need most of them, otherwise the server sends a 401.
1550                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1551                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1552                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1553                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1554                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1555                 try:
1556                         self.report_download_webpage(video_id)
1557                         webpage = urllib2.urlopen(request).read()
1558                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1559                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1560                         return
1561
1562                 # Extract media URL from playlist XML
1563                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1564                 if mobj is None:
1565                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1566                         return
1567                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1568                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1569
1570                 try:
1571                         # Process video information
1572                         self._downloader.process_info({
1573                                 'id':           video_id.decode('utf-8'),
1574                                 'url':          video_url,
1575                                 'uploader':     video_uploader,
1576                                 'upload_date':  u'NA',
1577                                 'title':        video_title,
1578                                 'stitle':       simple_title,
1579                                 'ext':          video_extension.decode('utf-8'),
1580                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1581                                 'description':  video_description,
1582                                 'thumbnail':    video_thumbnail,
1583                                 'description':  video_description,
1584                                 'player_url':   None,
1585                         })
1586                 except UnavailableVideoError:
1587                         self._downloader.trouble(u'ERROR: unable to download video')
1588
1589
1590 class GenericIE(InfoExtractor):
1591         """Generic last-resort information extractor."""
1592
1593         def __init__(self, downloader=None):
1594                 InfoExtractor.__init__(self, downloader)
1595
1596         @staticmethod
1597         def suitable(url):
1598                 return True
1599
1600         def report_download_webpage(self, video_id):
1601                 """Report webpage download."""
1602                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1603                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1604
1605         def report_extraction(self, video_id):
1606                 """Report information extraction."""
1607                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1608
1609         def _real_initialize(self):
1610                 return
1611
1612         def _real_extract(self, url):
1613                 # At this point we have a new video
1614                 self._downloader.increment_downloads()
1615
1616                 video_id = url.split('/')[-1]
1617                 request = urllib2.Request(url)
1618                 try:
1619                         self.report_download_webpage(video_id)
1620                         webpage = urllib2.urlopen(request).read()
1621                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1622                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1623                         return
1624                 except ValueError, err:
1625                         # since this is the last-resort InfoExtractor, if
1626                         # this error is thrown, it'll be thrown here
1627                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1628                         return
1629
1630                 self.report_extraction(video_id)
1631                 # Start with something easy: JW Player in SWFObject
1632                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1633                 if mobj is None:
1634                         # Broaden the search a little bit
1635                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1636                 if mobj is None:
1637                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1638                         return
1639
1640                 # It's possible that one of the regexes
1641                 # matched, but returned an empty group:
1642                 if mobj.group(1) is None:
1643                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1644                         return
1645
1646                 video_url = urllib.unquote(mobj.group(1))
1647                 video_id  = os.path.basename(video_url)
1648
1649                 # here's a fun little line of code for you:
1650                 video_extension = os.path.splitext(video_id)[1][1:]
1651                 video_id        = os.path.splitext(video_id)[0]
1652
1653                 # it's tempting to parse this further, but you would
1654                 # have to take into account all the variations like
1655                 #   Video Title - Site Name
1656                 #   Site Name | Video Title
1657                 #   Video Title - Tagline | Site Name
1658                 # and so on and so forth; it's just not practical
1659                 mobj = re.search(r'<title>(.*)</title>', webpage)
1660                 if mobj is None:
1661                         self._downloader.trouble(u'ERROR: unable to extract title')
1662                         return
1663                 video_title = mobj.group(1).decode('utf-8')
1664                 video_title = sanitize_title(video_title)
1665                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1666
1667                 # video uploader is domain name
1668                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1669                 if mobj is None:
1670                         self._downloader.trouble(u'ERROR: unable to extract title')
1671                         return
1672                 video_uploader = mobj.group(1).decode('utf-8')
1673
1674                 try:
1675                         # Process video information
1676                         self._downloader.process_info({
1677                                 'id':           video_id.decode('utf-8'),
1678                                 'url':          video_url.decode('utf-8'),
1679                                 'uploader':     video_uploader,
1680                                 'upload_date':  u'NA',
1681                                 'title':        video_title,
1682                                 'stitle':       simple_title,
1683                                 'ext':          video_extension.decode('utf-8'),
1684                                 'format':       u'NA',
1685                                 'player_url':   None,
1686                         })
1687                 except UnavailableVideoError, err:
1688                         self._downloader.trouble(u'ERROR: unable to download video')
1689
1690
1691 class YoutubeSearchIE(InfoExtractor):
1692         """Information Extractor for YouTube search queries."""
1693         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1694         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1695         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1696         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1697         _youtube_ie = None
1698         _max_youtube_results = 1000
1699
1700         def __init__(self, youtube_ie, downloader=None):
1701                 InfoExtractor.__init__(self, downloader)
1702                 self._youtube_ie = youtube_ie
1703
1704         @staticmethod
1705         def suitable(url):
1706                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1707
1708         def report_download_page(self, query, pagenum):
1709                 """Report attempt to download playlist page with given number."""
1710                 query = query.decode(preferredencoding())
1711                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1712
1713         def _real_initialize(self):
1714                 self._youtube_ie.initialize()
1715
1716         def _real_extract(self, query):
1717                 mobj = re.match(self._VALID_QUERY, query)
1718                 if mobj is None:
1719                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1720                         return
1721
1722                 prefix, query = query.split(':')
1723                 prefix = prefix[8:]
1724                 query  = query.encode('utf-8')
1725                 if prefix == '':
1726                         self._download_n_results(query, 1)
1727                         return
1728                 elif prefix == 'all':
1729                         self._download_n_results(query, self._max_youtube_results)
1730                         return
1731                 else:
1732                         try:
1733                                 n = long(prefix)
1734                                 if n <= 0:
1735                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1736                                         return
1737                                 elif n > self._max_youtube_results:
1738                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1739                                         n = self._max_youtube_results
1740                                 self._download_n_results(query, n)
1741                                 return
1742                         except ValueError: # parsing prefix as integer fails
1743                                 self._download_n_results(query, 1)
1744                                 return
1745
1746         def _download_n_results(self, query, n):
1747                 """Downloads a specified number of results for a query"""
1748
1749                 video_ids = []
1750                 already_seen = set()
1751                 pagenum = 1
1752
1753                 while True:
1754                         self.report_download_page(query, pagenum)
1755                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1756                         request = urllib2.Request(result_url, None, std_headers)
1757                         try:
1758                                 page = urllib2.urlopen(request).read()
1759                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1760                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1761                                 return
1762
1763                         # Extract video identifiers
1764                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1765                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1766                                 if video_id not in already_seen:
1767                                         video_ids.append(video_id)
1768                                         already_seen.add(video_id)
1769                                         if len(video_ids) == n:
1770                                                 # Specified n videos reached
1771                                                 for id in video_ids:
1772                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1773                                                 return
1774
1775                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1776                                 for id in video_ids:
1777                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1778                                 return
1779
1780                         pagenum = pagenum + 1
1781
1782 class GoogleSearchIE(InfoExtractor):
1783         """Information Extractor for Google Video search queries."""
1784         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1785         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1786         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1787         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1788         _google_ie = None
1789         _max_google_results = 1000
1790
1791         def __init__(self, google_ie, downloader=None):
1792                 InfoExtractor.__init__(self, downloader)
1793                 self._google_ie = google_ie
1794
1795         @staticmethod
1796         def suitable(url):
1797                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1798
1799         def report_download_page(self, query, pagenum):
1800                 """Report attempt to download playlist page with given number."""
1801                 query = query.decode(preferredencoding())
1802                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1803
1804         def _real_initialize(self):
1805                 self._google_ie.initialize()
1806
1807         def _real_extract(self, query):
1808                 mobj = re.match(self._VALID_QUERY, query)
1809                 if mobj is None:
1810                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1811                         return
1812
1813                 prefix, query = query.split(':')
1814                 prefix = prefix[8:]
1815                 query  = query.encode('utf-8')
1816                 if prefix == '':
1817                         self._download_n_results(query, 1)
1818                         return
1819                 elif prefix == 'all':
1820                         self._download_n_results(query, self._max_google_results)
1821                         return
1822                 else:
1823                         try:
1824                                 n = long(prefix)
1825                                 if n <= 0:
1826                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1827                                         return
1828                                 elif n > self._max_google_results:
1829                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1830                                         n = self._max_google_results
1831                                 self._download_n_results(query, n)
1832                                 return
1833                         except ValueError: # parsing prefix as integer fails
1834                                 self._download_n_results(query, 1)
1835                                 return
1836
1837         def _download_n_results(self, query, n):
1838                 """Downloads a specified number of results for a query"""
1839
1840                 video_ids = []
1841                 already_seen = set()
1842                 pagenum = 1
1843
1844                 while True:
1845                         self.report_download_page(query, pagenum)
1846                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1847                         request = urllib2.Request(result_url, None, std_headers)
1848                         try:
1849                                 page = urllib2.urlopen(request).read()
1850                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1851                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1852                                 return
1853
1854                         # Extract video identifiers
1855                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1856                                 video_id = mobj.group(1)
1857                                 if video_id not in already_seen:
1858                                         video_ids.append(video_id)
1859                                         already_seen.add(video_id)
1860                                         if len(video_ids) == n:
1861                                                 # Specified n videos reached
1862                                                 for id in video_ids:
1863                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1864                                                 return
1865
1866                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1867                                 for id in video_ids:
1868                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1869                                 return
1870
1871                         pagenum = pagenum + 1
1872
1873 class YahooSearchIE(InfoExtractor):
1874         """Information Extractor for Yahoo! Video search queries."""
1875         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1876         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1877         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1878         _MORE_PAGES_INDICATOR = r'\s*Next'
1879         _yahoo_ie = None
1880         _max_yahoo_results = 1000
1881
1882         def __init__(self, yahoo_ie, downloader=None):
1883                 InfoExtractor.__init__(self, downloader)
1884                 self._yahoo_ie = yahoo_ie
1885
1886         @staticmethod
1887         def suitable(url):
1888                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1889
1890         def report_download_page(self, query, pagenum):
1891                 """Report attempt to download playlist page with given number."""
1892                 query = query.decode(preferredencoding())
1893                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1894
1895         def _real_initialize(self):
1896                 self._yahoo_ie.initialize()
1897
1898         def _real_extract(self, query):
1899                 mobj = re.match(self._VALID_QUERY, query)
1900                 if mobj is None:
1901                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1902                         return
1903
1904                 prefix, query = query.split(':')
1905                 prefix = prefix[8:]
1906                 query  = query.encode('utf-8')
1907                 if prefix == '':
1908                         self._download_n_results(query, 1)
1909                         return
1910                 elif prefix == 'all':
1911                         self._download_n_results(query, self._max_yahoo_results)
1912                         return
1913                 else:
1914                         try:
1915                                 n = long(prefix)
1916                                 if n <= 0:
1917                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1918                                         return
1919                                 elif n > self._max_yahoo_results:
1920                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1921                                         n = self._max_yahoo_results
1922                                 self._download_n_results(query, n)
1923                                 return
1924                         except ValueError: # parsing prefix as integer fails
1925                                 self._download_n_results(query, 1)
1926                                 return
1927
1928         def _download_n_results(self, query, n):
1929                 """Downloads a specified number of results for a query"""
1930
1931                 video_ids = []
1932                 already_seen = set()
1933                 pagenum = 1
1934
1935                 while True:
1936                         self.report_download_page(query, pagenum)
1937                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1938                         request = urllib2.Request(result_url, None, std_headers)
1939                         try:
1940                                 page = urllib2.urlopen(request).read()
1941                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1942                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1943                                 return
1944
1945                         # Extract video identifiers
1946                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1947                                 video_id = mobj.group(1)
1948                                 if video_id not in already_seen:
1949                                         video_ids.append(video_id)
1950                                         already_seen.add(video_id)
1951                                         if len(video_ids) == n:
1952                                                 # Specified n videos reached
1953                                                 for id in video_ids:
1954                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1955                                                 return
1956
1957                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1958                                 for id in video_ids:
1959                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1960                                 return
1961
1962                         pagenum = pagenum + 1
1963
1964 class YoutubePlaylistIE(InfoExtractor):
1965         """Information Extractor for YouTube playlists."""
1966
1967         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1968         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1969         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1970         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1971         _youtube_ie = None
1972
1973         def __init__(self, youtube_ie, downloader=None):
1974                 InfoExtractor.__init__(self, downloader)
1975                 self._youtube_ie = youtube_ie
1976
1977         @staticmethod
1978         def suitable(url):
1979                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1980
1981         def report_download_page(self, playlist_id, pagenum):
1982                 """Report attempt to download playlist page with given number."""
1983                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1984
1985         def _real_initialize(self):
1986                 self._youtube_ie.initialize()
1987
1988         def _real_extract(self, url):
1989                 # Extract playlist id
1990                 mobj = re.match(self._VALID_URL, url)
1991                 if mobj is None:
1992                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1993                         return
1994
1995                 # Download playlist pages
1996                 playlist_id = mobj.group(1)
1997                 video_ids = []
1998                 pagenum = 1
1999
2000                 while True:
2001                         self.report_download_page(playlist_id, pagenum)
2002                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
2003                         try:
2004                                 page = urllib2.urlopen(request).read()
2005                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2006                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2007                                 return
2008
2009                         # Extract video identifiers
2010                         ids_in_page = []
2011                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2012                                 if mobj.group(1) not in ids_in_page:
2013                                         ids_in_page.append(mobj.group(1))
2014                         video_ids.extend(ids_in_page)
2015
2016                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2017                                 break
2018                         pagenum = pagenum + 1
2019
2020                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2021                 playlistend = self._downloader.params.get('playlistend', -1)
2022                 video_ids = video_ids[playliststart:playlistend]
2023
2024                 for id in video_ids:
2025                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2026                 return
2027
2028 class YoutubeUserIE(InfoExtractor):
2029         """Information Extractor for YouTube users."""
2030
2031         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2032         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2033         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2034         _youtube_ie = None
2035
2036         def __init__(self, youtube_ie, downloader=None):
2037                 InfoExtractor.__init__(self, downloader)
2038                 self._youtube_ie = youtube_ie
2039
2040         @staticmethod
2041         def suitable(url):
2042                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2043
2044         def report_download_page(self, username):
2045                 """Report attempt to download user page."""
2046                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2047
2048         def _real_initialize(self):
2049                 self._youtube_ie.initialize()
2050
2051         def _real_extract(self, url):
2052                 # Extract username
2053                 mobj = re.match(self._VALID_URL, url)
2054                 if mobj is None:
2055                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2056                         return
2057
2058                 # Download user page
2059                 username = mobj.group(1)
2060                 video_ids = []
2061                 pagenum = 1
2062
2063                 self.report_download_page(username)
2064                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2065                 try:
2066                         page = urllib2.urlopen(request).read()
2067                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2068                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2069                         return
2070
2071                 # Extract video identifiers
2072                 ids_in_page = []
2073
2074                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2075                         if mobj.group(1) not in ids_in_page:
2076                                 ids_in_page.append(mobj.group(1))
2077                 video_ids.extend(ids_in_page)
2078
2079                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2080                 playlistend = self._downloader.params.get('playlistend', -1)
2081                 video_ids = video_ids[playliststart:playlistend]
2082
2083                 for id in video_ids:
2084                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2085                 return
2086
2087 class DepositFilesIE(InfoExtractor):
2088         """Information extractor for depositfiles.com"""
2089
2090         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2091
2092         def __init__(self, downloader=None):
2093                 InfoExtractor.__init__(self, downloader)
2094
2095         @staticmethod
2096         def suitable(url):
2097                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2098
2099         def report_download_webpage(self, file_id):
2100                 """Report webpage download."""
2101                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2102
2103         def report_extraction(self, file_id):
2104                 """Report information extraction."""
2105                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2106
2107         def _real_initialize(self):
2108                 return
2109
2110         def _real_extract(self, url):
2111                 # At this point we have a new file
2112                 self._downloader.increment_downloads()
2113
2114                 file_id = url.split('/')[-1]
2115                 # Rebuild url in english locale
2116                 url = 'http://depositfiles.com/en/files/' + file_id
2117
2118                 # Retrieve file webpage with 'Free download' button pressed
2119                 free_download_indication = { 'gateway_result' : '1' }
2120                 request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
2121                 try:
2122                         self.report_download_webpage(file_id)
2123                         webpage = urllib2.urlopen(request).read()
2124                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2125                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2126                         return
2127
2128                 # Search for the real file URL
2129                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2130                 if (mobj is None) or (mobj.group(1) is None):
2131                         # Try to figure out reason of the error.
2132                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2133                         if (mobj is not None) and (mobj.group(1) is not None):
2134                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2135                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2136                         else:
2137                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2138                         return
2139
2140                 file_url = mobj.group(1)
2141                 file_extension = os.path.splitext(file_url)[1][1:]
2142
2143                 # Search for file title
2144                 mobj = re.search(r'<b title="(.*?)">', webpage)
2145                 if mobj is None:
2146                         self._downloader.trouble(u'ERROR: unable to extract title')
2147                         return
2148                 file_title = mobj.group(1).decode('utf-8')
2149
2150                 try:
2151                         # Process file information
2152                         self._downloader.process_info({
2153                                 'id':           file_id.decode('utf-8'),
2154                                 'url':          file_url.decode('utf-8'),
2155                                 'uploader':     u'NA',
2156                                 'upload_date':  u'NA',
2157                                 'title':        file_title,
2158                                 'stitle':       file_title,
2159                                 'ext':          file_extension.decode('utf-8'),
2160                                 'format':       u'NA',
2161                                 'player_url':   None,
2162                         })
2163                 except UnavailableVideoError, err:
2164                         self._downloader.trouble(u'ERROR: unable to download file')
2165
2166 class PostProcessor(object):
2167         """Post Processor class.
2168
2169         PostProcessor objects can be added to downloaders with their
2170         add_post_processor() method. When the downloader has finished a
2171         successful download, it will take its internal chain of PostProcessors
2172         and start calling the run() method on each one of them, first with
2173         an initial argument and then with the returned value of the previous
2174         PostProcessor.
2175
2176         The chain will be stopped if one of them ever returns None or the end
2177         of the chain is reached.
2178
2179         PostProcessor objects follow a "mutual registration" process similar
2180         to InfoExtractor objects.
2181         """
2182
2183         _downloader = None
2184
2185         def __init__(self, downloader=None):
2186                 self._downloader = downloader
2187
2188         def set_downloader(self, downloader):
2189                 """Sets the downloader for this PP."""
2190                 self._downloader = downloader
2191
2192         def run(self, information):
2193                 """Run the PostProcessor.
2194
2195                 The "information" argument is a dictionary like the ones
2196                 composed by InfoExtractors. The only difference is that this
2197                 one has an extra field called "filepath" that points to the
2198                 downloaded file.
2199
2200                 When this method returns None, the postprocessing chain is
2201                 stopped. However, this method may return an information
2202                 dictionary that will be passed to the next postprocessing
2203                 object in the chain. It can be the one it received after
2204                 changing some fields.
2205
2206                 In addition, this method may raise a PostProcessingError
2207                 exception that will be taken into account by the downloader
2208                 it was called from.
2209                 """
2210                 return information # by default, do nothing
2211
2212 ### MAIN PROGRAM ###
2213 if __name__ == '__main__':
2214         try:
2215                 # Modules needed only when running the main program
2216                 import getpass
2217                 import optparse
2218
2219                 # Function to update the program file with the latest version from bitbucket.org
2220                 def update_self(downloader, filename):
2221                         # Note: downloader only used for options
2222                         if not os.access (filename, os.W_OK):
2223                                 sys.exit('ERROR: no write permissions on %s' % filename)
2224
2225                         downloader.to_screen('Updating to latest stable version...')
2226                         latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2227                         latest_version = urllib.urlopen(latest_url).read().strip()
2228                         prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2229                         newcontent = urllib.urlopen(prog_url).read()
2230                         stream = open(filename, 'w')
2231                         stream.write(newcontent)
2232                         stream.close()
2233                         downloader.to_screen('Updated to version %s' % latest_version)
2234
2235                 # Parse command line
2236                 parser = optparse.OptionParser(
2237                         usage='Usage: %prog [options] url...',
2238                         version='2010.12.09',
2239                         conflict_handler='resolve',
2240                 )
2241
2242                 parser.add_option('-h', '--help',
2243                                 action='help', help='print this help text and exit')
2244                 parser.add_option('-v', '--version',
2245                                 action='version', help='print program version and exit')
2246                 parser.add_option('-U', '--update',
2247                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2248                 parser.add_option('-i', '--ignore-errors',
2249                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2250                 parser.add_option('-r', '--rate-limit',
2251                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2252                 parser.add_option('-R', '--retries',
2253                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2254                 parser.add_option('--playlist-start',
2255                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2256                 parser.add_option('--playlist-end',
2257                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2258
2259                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2260                 authentication.add_option('-u', '--username',
2261                                 dest='username', metavar='USERNAME', help='account username')
2262                 authentication.add_option('-p', '--password',
2263                                 dest='password', metavar='PASSWORD', help='account password')
2264                 authentication.add_option('-n', '--netrc',
2265                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2266                 parser.add_option_group(authentication)
2267
2268                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2269                 video_format.add_option('-f', '--format',
2270                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2271                 video_format.add_option('-m', '--mobile-version',
2272                                 action='store_const', dest='format', help='alias for -f 17', const='17')
2273                 video_format.add_option('--all-formats',
2274                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2275                 video_format.add_option('--max-quality',
2276                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2277                 video_format.add_option('-b', '--best-quality',
2278                                 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2279                 parser.add_option_group(video_format)
2280
2281                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2282                 verbosity.add_option('-q', '--quiet',
2283                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2284                 verbosity.add_option('-s', '--simulate',
2285                                 action='store_true', dest='simulate', help='do not download video', default=False)
2286                 verbosity.add_option('-g', '--get-url',
2287                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2288                 verbosity.add_option('-e', '--get-title',
2289                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2290                 verbosity.add_option('--get-thumbnail',
2291                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2292                 verbosity.add_option('--get-description',
2293                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2294                 verbosity.add_option('--no-progress',
2295                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2296                 parser.add_option_group(verbosity)
2297
2298                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2299                 filesystem.add_option('-t', '--title',
2300                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2301                 filesystem.add_option('-l', '--literal',
2302                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2303                 filesystem.add_option('-A', '--auto-number',
2304                                 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2305                 filesystem.add_option('-o', '--output',
2306                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2307                 filesystem.add_option('-a', '--batch-file',
2308                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2309                 filesystem.add_option('-w', '--no-overwrites',
2310                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2311                 filesystem.add_option('-c', '--continue',
2312                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2313                 filesystem.add_option('--cookies',
2314                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2315                 parser.add_option_group(filesystem)
2316
2317                 (opts, args) = parser.parse_args()
2318
2319                 # Open appropriate CookieJar
2320                 if opts.cookiefile is None:
2321                         jar = cookielib.CookieJar()
2322                 else:
2323                         try:
2324                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2325                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2326                                         jar.load()
2327                         except (IOError, OSError), err:
2328                                 sys.exit(u'ERROR: unable to open cookie file')
2329
2330                 # General configuration
2331                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2332                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2333                 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2334                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2335
2336                 # Batch file verification
2337                 batchurls = []
2338                 if opts.batchfile is not None:
2339                         try:
2340                                 if opts.batchfile == '-':
2341                                         batchfd = sys.stdin
2342                                 else:
2343                                         batchfd = open(opts.batchfile, 'r')
2344                                 batchurls = batchfd.readlines()
2345                                 batchurls = [x.strip() for x in batchurls]
2346                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2347                         except IOError:
2348                                 sys.exit(u'ERROR: batch file could not be read')
2349                 all_urls = batchurls + args
2350
2351                 # Conflicting, missing and erroneous options
2352                 if opts.bestquality:
2353                         print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2354                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2355                         parser.error(u'using .netrc conflicts with giving username/password')
2356                 if opts.password is not None and opts.username is None:
2357                         parser.error(u'account username missing')
2358                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2359                         parser.error(u'using output template conflicts with using title, literal title or auto number')
2360                 if opts.usetitle and opts.useliteral:
2361                         parser.error(u'using title conflicts with using literal title')
2362                 if opts.username is not None and opts.password is None:
2363                         opts.password = getpass.getpass(u'Type account password and press return:')
2364                 if opts.ratelimit is not None:
2365                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2366                         if numeric_limit is None:
2367                                 parser.error(u'invalid rate limit specified')
2368                         opts.ratelimit = numeric_limit
2369                 if opts.retries is not None:
2370                         try:
2371                                 opts.retries = long(opts.retries)
2372                         except (TypeError, ValueError), err:
2373                                 parser.error(u'invalid retry count specified')
2374                 try:
2375                         opts.playliststart = long(opts.playliststart)
2376                         if opts.playliststart <= 0:
2377                                 raise ValueError
2378                 except (TypeError, ValueError), err:
2379                         parser.error(u'invalid playlist start number specified')
2380                 try:
2381                         opts.playlistend = long(opts.playlistend)
2382                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2383                                 raise ValueError
2384                 except (TypeError, ValueError), err:
2385                         parser.error(u'invalid playlist end number specified')
2386
2387                 # Information extractors
2388                 youtube_ie = YoutubeIE()
2389                 metacafe_ie = MetacafeIE(youtube_ie)
2390                 dailymotion_ie = DailymotionIE()
2391                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2392                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2393                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2394                 google_ie = GoogleIE()
2395                 google_search_ie = GoogleSearchIE(google_ie)
2396                 photobucket_ie = PhotobucketIE()
2397                 yahoo_ie = YahooIE()
2398                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2399                 deposit_files_ie = DepositFilesIE()
2400                 generic_ie = GenericIE()
2401
2402                 # File downloader
2403                 fd = FileDownloader({
2404                         'usenetrc': opts.usenetrc,
2405                         'username': opts.username,
2406                         'password': opts.password,
2407                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2408                         'forceurl': opts.geturl,
2409                         'forcetitle': opts.gettitle,
2410                         'forcethumbnail': opts.getthumbnail,
2411                         'forcedescription': opts.getdescription,
2412                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2413                         'format': opts.format,
2414                         'format_limit': opts.format_limit,
2415                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2416                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2417                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2418                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2419                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2420                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2421                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2422                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2423                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2424                                 or u'%(id)s.%(ext)s'),
2425                         'ignoreerrors': opts.ignoreerrors,
2426                         'ratelimit': opts.ratelimit,
2427                         'nooverwrites': opts.nooverwrites,
2428                         'retries': opts.retries,
2429                         'continuedl': opts.continue_dl,
2430                         'noprogress': opts.noprogress,
2431                         'playliststart': opts.playliststart,
2432                         'playlistend': opts.playlistend,
2433                         'logtostderr': opts.outtmpl == '-',
2434                         })
2435                 fd.add_info_extractor(youtube_search_ie)
2436                 fd.add_info_extractor(youtube_pl_ie)
2437                 fd.add_info_extractor(youtube_user_ie)
2438                 fd.add_info_extractor(metacafe_ie)
2439                 fd.add_info_extractor(dailymotion_ie)
2440                 fd.add_info_extractor(youtube_ie)
2441                 fd.add_info_extractor(google_ie)
2442                 fd.add_info_extractor(google_search_ie)
2443                 fd.add_info_extractor(photobucket_ie)
2444                 fd.add_info_extractor(yahoo_ie)
2445                 fd.add_info_extractor(yahoo_search_ie)
2446                 fd.add_info_extractor(deposit_files_ie)
2447
2448                 # This must come last since it's the
2449                 # fallback if none of the others work
2450                 fd.add_info_extractor(generic_ie)
2451
2452                 # Update version
2453                 if opts.update_self:
2454                         update_self(fd, sys.argv[0])
2455
2456                 # Maybe do nothing
2457                 if len(all_urls) < 1:
2458                         if not opts.update_self:
2459                                 parser.error(u'you must provide at least one URL')
2460                         else:
2461                                 sys.exit()
2462                 retcode = fd.download(all_urls)
2463
2464                 # Dump cookie jar if requested
2465                 if opts.cookiefile is not None:
2466                         try:
2467                                 jar.save()
2468                         except (IOError, OSError), err:
2469                                 sys.exit(u'ERROR: unable to save cookie jar')
2470
2471                 sys.exit(retcode)
2472
2473         except DownloadError:
2474                 sys.exit(1)
2475         except SameFileError:
2476                 sys.exit(u'ERROR: fixed output name but more than one file to download')
2477         except KeyboardInterrupt:
2478                 sys.exit(u'\nERROR: Interrupted by user')