# Raphaël G. Git Repositories - youtubedl/blob - youtube-dl
# Imported Upstream version 2010.01.19
# [youtubedl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25 from urlparse import parse_qs
26 except ImportError:
27 from cgi import parse_qs
28
# Default HTTP headers sent with every request. The User-Agent mimics a
# real browser because some sites serve different (or no) content to
# obvious scripts.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}
35
# Characters allowed in "simplified" titles: ASCII letters and digits,
# promoted to unicode via the Python-2-only str.decode('ascii').
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this in a one-shot generator and called
	# .next() on it, which added nothing; a plain try/except is
	# equivalent and clearer. The broad except mirrors the original
	# bare "except:" intent: any failure means "use UTF-8".
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported encoding is actually usable before trusting it.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
53
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects when a download problem occurs and
	the downloader is not configured to continue on errors. The
	exception carries the appropriate error message.
	"""
	pass
62
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that several
	requested downloads would all be written to the same file on disk.
	"""
	pass
70
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised from a PostProcessor's .run() method to signal that the
	postprocessing task failed.
	"""
	pass
78
class UnavailableFormatError(Exception):
	"""Unavailable Format exception.

	Raised when a video is requested in a format that the site does not
	offer for that particular video.
	"""
	pass
86
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# Chain to the base class with both values so the exception has
		# proper args (needed e.g. for pickling); the original skipped
		# this call entirely.
		Exception.__init__(self, downloaded, expected)
		self.downloaded = downloaded
		self.expected = expected
101
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:      Username for authentication purposes.
	password:      Password for authentication purposes.
	usenetrc:      Use netrc for authentication instead.
	quiet:         Do not print messages to stdout.
	forceurl:      Force printing final URL.
	forcetitle:    Force printing title.
	simulate:      Do not download the video files.
	format:        Video format code.
	outtmpl:       Template for output names.
	ignoreerrors:  Do not stop on download errors.
	ratelimit:     Download speed limit, in bytes/sec.
	nooverwrites:  Prevent overwriting files.
	continuedl:    Try to continue downloads if possible.
	"""

	# Class-level defaults only; real per-instance values are set in
	# __init__ so instances never share the mutable lists.
	params = None
	_ies = []
	_pps = []
	_download_retcode = None

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build every directory prefix of the path; the last component
		# (the file itself) is deliberately excluded by the range.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string, e.g. '1.00k'."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# log base 1024 selects the unit (b, k, M, ...).
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return download progress as a fixed-width percentage string."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate remaining time as 'MM:SS', or '--:--' when unknown."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		# The display field only has room for two minute digits.
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return average download speed as a fixed-width string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Pick the next read size, adapting to the measured transfer rate."""
		# Clamp between half and double the previous block, never above 4 MB.
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# An empty suffix gives index('') == 0, i.e. a multiplier of 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		# Reading a single byte proves the server actually serves content.
		data.read(1)
		url = data.geturl()
		data.close()
		return url

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False):
		"""Print message to stdout if not in quiet mode."""
		if not self.params.get('quiet', False):
			# Trailing comma suppresses print's own newline; the newline
			# (or nothing, when skip_eol) is appended explicitly above it.
			print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed (contains no %(...)s fields)."""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough that the average speed drops back
			# to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		self.to_stdout(u'[download] %s has already been downloaded' % file_name)

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		self.to_stdout(u'')

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise UnavailableFormatError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding())
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding())

			return

		try:
			template_dict = dict(info_dict)
			# 'epoch' lets output templates embed the download timestamp.
			template_dict['epoch'] = unicode(long(time.time()))
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
			# NOTE(review): there is no "return" here, so if trouble() does
			# not raise (ignoreerrors mode), "filename" is unset and the
			# next line raises NameError — confirm whether this is intended.
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'))
		except (OSError, IOError), err:
			# Treated as a bad format so the caller can retry another one.
			raise UnavailableFormatError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		# A fixed template can only ever name one file.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			# A postprocessor returning None stops the chain.
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url):
		"""Download an rtmp:// URL by shelling out to the rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
		# When continuedl is set, add '-e -k 1' (resume) on the first call.
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
			time.sleep(2.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('ERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url):
		"""Download url to filename over HTTP (or delegate rtmp URLs)."""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url)

		stream = None
		open_mode = 'wb'
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		# Establish connection
		try:
			data = urllib2.urlopen(request)
		except (urllib2.HTTPError, ), err:
			if err.code != 416: # 416 is 'Requested range not satisfiable'
				raise
			# Unable to resume; retry without the Range header.
			data = urllib2.urlopen(basic_request)
			content_length = data.info()['Content-Length']

			if content_length is not None and long(content_length) == resume_len:
				# Because the file had already been fully downloaded
				self.report_file_already_downloaded(filename)
				return True
			else:
				# Because the server didn't let us
				self.report_unable_to_resume()
				open_mode = 'wb'

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time, so nothing is created if the first
			# read already failed.
			if stream is None:
				try:
					stream = open(filename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			stream.write(data_block)
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# data_len is the raw header string, so compare string forms.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
509
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL and pulls out everything the
	FileDownloader needs to know about the video (or videos) that URL
	refers to: the real media URL, the title and simplified title, the
	uploader, and so on. The result is handed back as a dictionary,
	which the FileDownloader then processes, possibly downloading the
	video to the file system. Each dictionary must provide the
	following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.

	Concrete extractors re-define _real_initialize(), _real_extract()
	and the suitable() static method, and are normally instantiated and
	registered with the main downloader.
	"""

	# One-time-initialization flag and the owning downloader.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return False

	def initialize(self):
		"""Perform one-time initialization (authentication, etc.)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract information for the URL."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
570
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 matches the site prefix (optional for bare IDs); group 2 is
	# the video id. The conditional "(?(1).+)?" only allows trailing text
	# when a full URL was given.
	_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	_available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
	# Maps format codes to file extensions; anything else defaults to flv.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
	}

	@staticmethod
	def suitable(url):
		"""Return True if the URL looks like a YouTube video URL."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	@staticmethod
	def htmlentity_transform(matchobj):
		"""Transforms an HTML entity to a Unicode character."""
		entity = matchobj.group(1)

		# Known non-numeric HTML entity
		if entity in htmlentitydefs.name2codepoint:
			return unichr(htmlentitydefs.name2codepoint[entity])

		# Unicode character
		# NOTE(review): \d+ only matches decimal digits, so hex entities
		# containing a-f (e.g. &#x1f;) will not match — confirm intended.
		mobj = re.match(ur'(?u)#(x?\d+)', entity)
		if mobj is not None:
			numstr = mobj.group(1)
			if numstr.startswith(u'x'):
				base = 16
				numstr = u'0%s' % numstr
			else:
				base = 10
			return unichr(long(numstr, base))

		# Unknown entity in name, return its literal representation
		return (u'&%s;' % entity)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is unavailable."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set site language and, when credentials exist, log in and confirm age."""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
			'current_form': 'loginForm',
			'next': '/',
			'action_login': 'Log In',
			'username': username,
			'password': password,
		}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# A login form in the response means we are still logged out.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
			'next_url': '/',
			'action_confirm': 'Confirm',
		}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video info, retrying lower-quality formats when -b is set."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Downloader parameters
		best_quality = False
		format_param = None
		quality_index = 0
		if self._downloader is not None:
			params = self._downloader.params
			format_param = params.get('format', None)
			# Format '0' means "best available": start at the top of the
			# priority list and fall back on UnavailableFormatError.
			if format_param == '0':
				format_param = self._available_formats[quality_index]
				best_quality = True

		while True:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Get video info
			video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				self.report_video_info_webpage_download(video_id)
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
			self.report_information_extraction(video_id)

			# "t" param
			if 'token' not in video_info:
				# Attempt to see if YouTube has issued an error message
				if 'reason' not in video_info:
					self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
					# Dump the raw response so the user can attach it to a bug report.
					stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
					stream.write(video_info_webpage)
					stream.close()
				else:
					reason = urllib.unquote_plus(video_info['reason'][0])
					self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
				return
			token = urllib.unquote_plus(video_info['token'][0])
			video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
			if format_param is not None:
				video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

			# Check possible RTMP download
			if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
				self.report_rtmp_download()
				video_real_url = video_info['conn'][0]

			# uploader
			if 'author' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
				return
			video_uploader = urllib.unquote_plus(video_info['author'][0])

			# title
			if 'title' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract video title')
				return
			video_title = urllib.unquote_plus(video_info['title'][0])
			video_title = video_title.decode('utf-8')
			video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
			# The path separator cannot appear in a filename component.
			video_title = video_title.replace(os.sep, u'%')

			# simplified title
			simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
			simple_title = simple_title.strip(ur'_')

			try:
				# Process video information
				self._downloader.process_info({
					'id': video_id.decode('utf-8'),
					'url': video_real_url.decode('utf-8'),
					'uploader': video_uploader.decode('utf-8'),
					'title': video_title,
					'stitle': simple_title,
					'ext': video_extension.decode('utf-8'),
				})

				return

			except UnavailableFormatError, err:
				if best_quality:
					if quality_index == len(self._available_formats) - 1:
						# I don't ever expect this to happen
						self._downloader.trouble(u'ERROR: no known formats available for video')
						return
					else:
						self.report_unavailable_format(video_id, format_param)
						quality_index += 1
						format_param = self._available_formats[quality_index]
						continue
				else:
					self._downloader.trouble('ERROR: format not available for video')
					return
815
816
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 the simplified title from the URL.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# Delegate for yt-prefixed ids, which are hosted on YouTube.
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Receives the YoutubeIE to delegate to and an optional downloader."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if the URL looks like a Metacafe watch URL."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page and confirm age."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER, None, std_headers)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
		}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video info from a Metacafe watch page."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		# Kept commented out upstream: the gdaKey parameter no longer
		# seems to be required for the download URL.
		#mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
		#if mobj is None:
		#	self._downloader.trouble(u'ERROR: unable to extract gdaKey')
		#	return
		#gdaKey = mobj.group(1)
		#
		#video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

		video_url = mediaURL

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
			})
		except UnavailableFormatError:
			self._downloader.trouble(u'ERROR: format not available for video')
941
942
943 class GoogleIE(InfoExtractor):
944 """Information extractor for video.google.com."""
945
946 _VALID_URL = r'(?:http://)?video\.google\.com/videoplay\?docid=([^\&]+).*'
947
948 def __init__(self, downloader=None):
949 InfoExtractor.__init__(self, downloader)
950
951 @staticmethod
952 def suitable(url):
953 return (re.match(GoogleIE._VALID_URL, url) is not None)
954
955 def report_download_webpage(self, video_id):
956 """Report webpage download."""
957 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
958
959 def report_extraction(self, video_id):
960 """Report information extraction."""
961 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
962
963 def _real_initialize(self):
964 return
965
966 def _real_extract(self, url):
967 # Extract id from URL
968 mobj = re.match(self._VALID_URL, url)
969 if mobj is None:
970 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
971 return
972
973 video_id = mobj.group(1)
974
975 video_extension = 'mp4'
976
977 # Retrieve video webpage to extract further information
978 request = urllib2.Request('http://video.google.com/videoplay?docid=%s' % video_id)
979 try:
980 self.report_download_webpage(video_id)
981 webpage = urllib2.urlopen(request).read()
982 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
983 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
984 return
985
986 # Extract URL, uploader, and title from webpage
987 self.report_extraction(video_id)
988 mobj = re.search(r"download_url:'(.*)'", webpage)
989 if mobj is None:
990 self._downloader.trouble(u'ERROR: unable to extract media URL')
991 return
992 mediaURL = urllib.unquote(mobj.group(1))
993 mediaURL = mediaURL.replace('\\x3d', '\x3d')
994 mediaURL = mediaURL.replace('\\x26', '\x26')
995
996 video_url = mediaURL
997
998 mobj = re.search(r'<title>(.*)</title>', webpage)
999 if mobj is None:
1000 self._downloader.trouble(u'ERROR: unable to extract title')
1001 return
1002 video_title = mobj.group(1).decode('utf-8')
1003
1004 # Google Video doesn't show uploader nicknames?
1005 video_uploader = 'uploader'
1006
1007 try:
1008 # Process video information
1009 self._downloader.process_info({
1010 'id': video_id.decode('utf-8'),
1011 'url': video_url.decode('utf-8'),
1012 'uploader': video_uploader.decode('utf-8'),
1013 'title': video_title.decode('utf-8'),
1014 'stitle': video_title.decode('utf-8'),
1015 'ext': video_extension.decode('utf-8'),
1016 })
1017 except UnavailableFormatError:
1018 self._downloader.trouble(u'ERROR: format not available for video')
1019
1020
1021 class PhotobucketIE(InfoExtractor):
1022 """Information extractor for photobucket.com."""
1023
1024 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1025
1026 def __init__(self, downloader=None):
1027 InfoExtractor.__init__(self, downloader)
1028
1029 @staticmethod
1030 def suitable(url):
1031 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1032
1033 def report_download_webpage(self, video_id):
1034 """Report webpage download."""
1035 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1036
1037 def report_extraction(self, video_id):
1038 """Report information extraction."""
1039 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1040
1041 def _real_initialize(self):
1042 return
1043
1044 def _real_extract(self, url):
1045 # Extract id from URL
1046 mobj = re.match(self._VALID_URL, url)
1047 if mobj is None:
1048 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1049 return
1050
1051 video_id = mobj.group(1)
1052
1053 video_extension = 'flv'
1054
1055 # Retrieve video webpage to extract further information
1056 request = urllib2.Request(url)
1057 try:
1058 self.report_download_webpage(video_id)
1059 webpage = urllib2.urlopen(request).read()
1060 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1061 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1062 return
1063
1064 # Extract URL, uploader, and title from webpage
1065 self.report_extraction(video_id)
1066 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1067 if mobj is None:
1068 self._downloader.trouble(u'ERROR: unable to extract media URL')
1069 return
1070 mediaURL = urllib.unquote(mobj.group(1))
1071
1072 video_url = mediaURL
1073
1074 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1075 if mobj is None:
1076 self._downloader.trouble(u'ERROR: unable to extract title')
1077 return
1078 video_title = mobj.group(1).decode('utf-8')
1079
1080 video_uploader = mobj.group(2).decode('utf-8')
1081
1082 try:
1083 # Process video information
1084 self._downloader.process_info({
1085 'id': video_id.decode('utf-8'),
1086 'url': video_url.decode('utf-8'),
1087 'uploader': video_uploader.decode('utf-8'),
1088 'title': video_title.decode('utf-8'),
1089 'stitle': video_title.decode('utf-8'),
1090 'ext': video_extension.decode('utf-8'),
1091 })
1092 except UnavailableFormatError:
1093 self._downloader.trouble(u'ERROR: format not available for video')
1094
1095
1096 class YoutubeSearchIE(InfoExtractor):
1097 """Information Extractor for YouTube search queries."""
1098 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1099 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1100 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1101 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1102 _youtube_ie = None
1103 _max_youtube_results = 1000
1104
1105 def __init__(self, youtube_ie, downloader=None):
1106 InfoExtractor.__init__(self, downloader)
1107 self._youtube_ie = youtube_ie
1108
1109 @staticmethod
1110 def suitable(url):
1111 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1112
1113 def report_download_page(self, query, pagenum):
1114 """Report attempt to download playlist page with given number."""
1115 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1116
1117 def _real_initialize(self):
1118 self._youtube_ie.initialize()
1119
1120 def _real_extract(self, query):
1121 mobj = re.match(self._VALID_QUERY, query)
1122 if mobj is None:
1123 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1124 return
1125
1126 prefix, query = query.split(':')
1127 prefix = prefix[8:]
1128 if prefix == '':
1129 self._download_n_results(query, 1)
1130 return
1131 elif prefix == 'all':
1132 self._download_n_results(query, self._max_youtube_results)
1133 return
1134 else:
1135 try:
1136 n = long(prefix)
1137 if n <= 0:
1138 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1139 return
1140 elif n > self._max_youtube_results:
1141 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1142 n = self._max_youtube_results
1143 self._download_n_results(query, n)
1144 return
1145 except ValueError: # parsing prefix as integer fails
1146 self._download_n_results(query, 1)
1147 return
1148
1149 def _download_n_results(self, query, n):
1150 """Downloads a specified number of results for a query"""
1151
1152 video_ids = []
1153 already_seen = set()
1154 pagenum = 1
1155
1156 while True:
1157 self.report_download_page(query, pagenum)
1158 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1159 request = urllib2.Request(result_url, None, std_headers)
1160 try:
1161 page = urllib2.urlopen(request).read()
1162 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1163 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1164 return
1165
1166 # Extract video identifiers
1167 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1168 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1169 if video_id not in already_seen:
1170 video_ids.append(video_id)
1171 already_seen.add(video_id)
1172 if len(video_ids) == n:
1173 # Specified n videos reached
1174 for id in video_ids:
1175 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1176 return
1177
1178 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1179 for id in video_ids:
1180 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1181 return
1182
1183 pagenum = pagenum + 1
1184
1185 class YoutubePlaylistIE(InfoExtractor):
1186 """Information Extractor for YouTube playlists."""
1187
1188 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
1189 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1190 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1191 _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
1192 _youtube_ie = None
1193
1194 def __init__(self, youtube_ie, downloader=None):
1195 InfoExtractor.__init__(self, downloader)
1196 self._youtube_ie = youtube_ie
1197
1198 @staticmethod
1199 def suitable(url):
1200 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1201
1202 def report_download_page(self, playlist_id, pagenum):
1203 """Report attempt to download playlist page with given number."""
1204 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1205
1206 def _real_initialize(self):
1207 self._youtube_ie.initialize()
1208
1209 def _real_extract(self, url):
1210 # Extract playlist id
1211 mobj = re.match(self._VALID_URL, url)
1212 if mobj is None:
1213 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1214 return
1215
1216 # Download playlist pages
1217 playlist_id = mobj.group(1)
1218 video_ids = []
1219 pagenum = 1
1220
1221 while True:
1222 self.report_download_page(playlist_id, pagenum)
1223 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1224 try:
1225 page = urllib2.urlopen(request).read()
1226 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1227 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1228 return
1229
1230 # Extract video identifiers
1231 ids_in_page = []
1232 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1233 if mobj.group(1) not in ids_in_page:
1234 ids_in_page.append(mobj.group(1))
1235 video_ids.extend(ids_in_page)
1236
1237 if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1238 break
1239 pagenum = pagenum + 1
1240
1241 for id in video_ids:
1242 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1243 return
1244
1245 class YoutubeUserIE(InfoExtractor):
1246 """Information Extractor for YouTube users."""
1247
1248 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1249 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1250 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1251 _youtube_ie = None
1252
1253 def __init__(self, youtube_ie, downloader=None):
1254 InfoExtractor.__init__(self, downloader)
1255 self._youtube_ie = youtube_ie
1256
1257 @staticmethod
1258 def suitable(url):
1259 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1260
1261 def report_download_page(self, username):
1262 """Report attempt to download user page."""
1263 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1264
1265 def _real_initialize(self):
1266 self._youtube_ie.initialize()
1267
1268 def _real_extract(self, url):
1269 # Extract username
1270 mobj = re.match(self._VALID_URL, url)
1271 if mobj is None:
1272 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1273 return
1274
1275 # Download user page
1276 username = mobj.group(1)
1277 video_ids = []
1278 pagenum = 1
1279
1280 self.report_download_page(username)
1281 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1282 try:
1283 page = urllib2.urlopen(request).read()
1284 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1285 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1286 return
1287
1288 # Extract video identifiers
1289 ids_in_page = []
1290
1291 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1292 if mobj.group(1) not in ids_in_page:
1293 ids_in_page.append(mobj.group(1))
1294 video_ids.extend(ids_in_page)
1295
1296 for id in video_ids:
1297 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1298 return
1299
class PostProcessor(object):
	"""Base class for steps that run after a successful download.

	Instances are registered on a downloader via add_post_processor().
	Once a download finishes, the downloader invokes run() on each
	registered PostProcessor in order, feeding each one the dictionary
	returned by the previous one; a None return value stops the chain.

	Like InfoExtractor, this class participates in mutual registration:
	the downloader calls set_downloader() so the PP can report back.
	"""

	# Downloader this PP reports to; None until registration.
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this PP belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary extended with
		a "filepath" key naming the file on disk. Return None to stop
		the postprocessing chain, or an (optionally modified) dictionary
		to pass along to the next PP. May raise PostProcessingError to
		signal failure to the calling downloader.
		"""
		# Base implementation: pass the data through untouched.
		return information
1345
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			"""Overwrite this script in place with the latest released version."""
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		# NOTE(review): install_opener() replaces the global opener, so this
		# second call discards the proxy handler installed just above and
		# leaves only the cookie processor active — confirm this is intended.
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.01.19',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
			action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
			action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
			dest='username', metavar='UN', help='account username')
		authentication.add_option('-p', '--password',
			dest='password', metavar='PW', help='account password')
		authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FMT', help='video format code')
		# -b/-m/-d are aliases that preset specific format codes.
		video_format.add_option('-b', '--best-quality',
			action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
			action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
			action='store_const', dest='format', help='alias for -f 22', const='22')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TPL', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='F', help='file containing URLs to download')
		filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = open(opts.batchfile, 'r').readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop blank lines
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Prompt interactively rather than requiring the password on the command line.
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		photobucket_ie = PhotobucketIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			# Output template priority: explicit -o, then -t (sanitized
			# title), then -l (literal title), then plain "<id>.<ext>".
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'continuedl': opts.continue_dl,
			})
		# Registration order matters: more specific extractors (search,
		# playlist, user, metacafe) are consulted before the generic ones.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(photobucket_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')