Raphaël G. Git Repositories - youtubedl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # License: Public domain code
   7 import htmlentitydefs
   8 import httplib
   9 import locale
  10 import math
  11 import netrc
  12 import os
  13 import os.path
  14 import re
  15 import socket
  16 import string
  17 import subprocess
  18 import sys
  19 import time
  20 import urllib
  21 import urllib2
  22
  23 # parse_qs was moved from the cgi module to the urlparse module recently.
  24 try:
  25         from urlparse import parse_qs
  26 except ImportError:
  27         from cgi import parse_qs
  28
# Default HTTP headers handed to the urllib2.Request objects built in this
# file. The User-Agent identifies the client as Firefox 3.6 on Windows.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# ASCII letters followed by ASCII digits, as a unicode string.
# NOTE(review): presumably the characters kept in simplified titles; the
# usage is outside this part of the file -- confirm.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  37
  38 def preferredencoding():
  39         """Get preferred encoding.
  40
  41         Returns the best encoding scheme for the system, based on
  42         locale.getpreferredencoding() and some further tweaks.
  43         """
  44         def yield_preferredencoding():
  45                 try:
  46                         pref = locale.getpreferredencoding()
  47                         u'TEST'.encode(pref)
  48                 except:
  49                         pref = 'UTF-8'
  50                 while True:
  51                         yield pref
  52         return yield_preferredencoding().next()
  53
  54 def htmlentity_transform(matchobj):
  55         """Transforms an HTML entity to a Unicode character.
  56
  57         This function receives a match object and is intended to be used with
  58         the re.sub() function.
  59         """
  60         entity = matchobj.group(1)
  61
  62         # Known non-numeric HTML entity
  63         if entity in htmlentitydefs.name2codepoint:
  64                 return unichr(htmlentitydefs.name2codepoint[entity])
  65
  66         # Unicode character
  67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  68         if mobj is not None:
  69                 numstr = mobj.group(1)
  70                 if numstr.startswith(u'x'):
  71                         base = 16
  72                         numstr = u'0%s' % numstr
  73                 else:
  74                         base = 10
  75                 return unichr(long(numstr, base))
  76
  77         # Unknown entity in name, return its literal representation
  78         return (u'&%s;' % entity)
  79
  80 def sanitize_title(utitle):
  81         """Sanitizes a video title so it could be used as part of a filename."""
  82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  83         return utitle.replace(unicode(os.sep), u'%')
  84
  85 def sanitize_open(filename, open_mode):
  86         """Try to open the given filename, and slightly tweak it if this fails.
  87
  88         Attempts to open the given filename. If this fails, it tries to change
  89         the filename slightly, step by step, until it's either able to open it
  90         or it fails and raises a final exception, like the standard open()
  91         function.
  92
  93         It returns the tuple (stream, definitive_file_name).
  94         """
  95         try:
  96                 stream = open(filename, open_mode)
  97                 return (stream, filename)
  98         except (IOError, OSError), err:
  99                 # In case of error, try to remove win32 forbidden chars
 100                 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
 101
 102                 # An exception here should be caught in the caller
 103                 stream = open(filename, open_mode)
 104                 return (stream, filename)
 105
 106
 107 class DownloadError(Exception):
 108         """Download Error exception.
 109
 110         This exception may be thrown by FileDownloader objects if they are not
 111         configured to continue on errors. They will contain the appropriate
 112         error message.
 113         """
 114         pass
 115
 116 class SameFileError(Exception):
 117         """Same File exception.
 118
 119         This exception will be thrown by FileDownloader objects if they detect
 120         multiple files would have to be downloaded to the same file on disk.
 121         """
 122         pass
 123
 124 class PostProcessingError(Exception):
 125         """Post Processing exception.
 126
 127         This exception may be raised by PostProcessor's .run() method to
 128         indicate an error in the postprocessing task.
 129         """
 130         pass
 131
 132 class UnavailableFormatError(Exception):
 133         """Unavailable Format exception.
 134
 135         This exception will be thrown when a video is requested
 136         in a format that is not available for that video.
 137         """
 138         pass
 139
 140 class ContentTooShortError(Exception):
 141         """Content Too Short exception.
 142
 143         This exception may be raised by FileDownloader objects when a file they
 144         download is too small for what the server announced first, indicating
 145         the connection was probably interrupted.
 146         """
 147         # Both in bytes
 148         downloaded = None
 149         expected = None
 150
 151         def __init__(self, downloaded, expected):
 152                 self.downloaded = downloaded
 153                 self.expected = expected
 154
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, a task that InfoExtractors do,
        it has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        forceurl:       Force printing final URL.
        forcetitle:     Force printing title.
        simulate:       Do not download the video files.
        format:         Video format code.
        outtmpl:        Template for output names.
        ignoreerrors:   Do not stop on download errors.
        ratelimit:      Download speed limit, in bytes/sec.
        nooverwrites:   Prevent overwriting files.
        continuedl:     Try to continue downloads if possible.
        noprogress:     Do not print the progress bar.
        """

        # Class-level defaults; __init__ rebinds all four on every instance,
        # so the two lists below are not shared between instances.
        params = None
        _ies = []
        _pps = []
        _download_retcode = None
 202
 203         def __init__(self, params):
 204                 """Create a FileDownloader object with the given options."""
 205                 self._ies = []
 206                 self._pps = []
 207                 self._download_retcode = 0
 208                 self.params = params
 209
 210         @staticmethod
 211         def pmkdir(filename):
 212                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 213                 components = filename.split(os.sep)
 214                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 215                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 216                 for dir in aggregate:
 217                         if not os.path.exists(dir):
 218                                 os.mkdir(dir)
 219
 220         @staticmethod
 221         def format_bytes(bytes):
 222                 if bytes is None:
 223                         return 'N/A'
 224                 if type(bytes) is str:
 225                         bytes = float(bytes)
 226                 if bytes == 0.0:
 227                         exponent = 0
 228                 else:
 229                         exponent = long(math.log(bytes, 1024.0))
 230                 suffix = 'bkMGTPEZY'[exponent]
 231                 converted = float(bytes) / float(1024**exponent)
 232                 return '%.2f%s' % (converted, suffix)
 233
 234         @staticmethod
 235         def calc_percent(byte_counter, data_len):
 236                 if data_len is None:
 237                         return '---.-%'
 238                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 239
 240         @staticmethod
 241         def calc_eta(start, now, total, current):
 242                 if total is None:
 243                         return '--:--'
 244                 dif = now - start
 245                 if current == 0 or dif < 0.001: # One millisecond
 246                         return '--:--'
 247                 rate = float(current) / dif
 248                 eta = long((float(total) - float(current)) / rate)
 249                 (eta_mins, eta_secs) = divmod(eta, 60)
 250                 if eta_mins > 99:
 251                         return '--:--'
 252                 return '%02d:%02d' % (eta_mins, eta_secs)
 253
 254         @staticmethod
 255         def calc_speed(start, now, bytes):
 256                 dif = now - start
 257                 if bytes == 0 or dif < 0.001: # One millisecond
 258                         return '%10s' % '---b/s'
 259                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 260
 261         @staticmethod
 262         def best_block_size(elapsed_time, bytes):
 263                 new_min = max(bytes / 2.0, 1.0)
 264                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 265                 if elapsed_time < 0.001:
 266                         return long(new_max)
 267                 rate = bytes / elapsed_time
 268                 if rate > new_max:
 269                         return long(new_max)
 270                 if rate < new_min:
 271                         return long(new_min)
 272                 return long(rate)
 273
 274         @staticmethod
 275         def parse_bytes(bytestr):
 276                 """Parse a string indicating a byte quantity into a long integer."""
 277                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 278                 if matchobj is None:
 279                         return None
 280                 number = float(matchobj.group(1))
 281                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 282                 return long(round(number * multiplier))
 283
 284         @staticmethod
 285         def verify_url(url):
 286                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
 287                 request = urllib2.Request(url, None, std_headers)
 288                 data = urllib2.urlopen(request)
 289                 data.read(1)
 290                 url = data.geturl()
 291                 data.close()
 292                 return url
 293
 294         def add_info_extractor(self, ie):
 295                 """Add an InfoExtractor object to the end of the list."""
 296                 self._ies.append(ie)
 297                 ie.set_downloader(self)
 298
 299         def add_post_processor(self, pp):
 300                 """Add a PostProcessor object to the end of the chain."""
 301                 self._pps.append(pp)
 302                 pp.set_downloader(self)
 303
 304         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
 305                 """Print message to stdout if not in quiet mode."""
 306                 try:
 307                         if not self.params.get('quiet', False):
 308                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
 309                         sys.stdout.flush()
 310                 except (UnicodeEncodeError), err:
 311                         if not ignore_encoding_errors:
 312                                 raise
 313
 314         def to_stderr(self, message):
 315                 """Print message to stderr."""
 316                 print >>sys.stderr, message.encode(preferredencoding())
 317
 318         def fixed_template(self):
 319                 """Checks if the output template is fixed."""
 320                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 321
 322         def trouble(self, message=None):
 323                 """Determine action to take when a download problem appears.
 324
 325                 Depending on if the downloader has been configured to ignore
 326                 download errors or not, this method may throw an exception or
 327                 not when errors are found, after printing the message.
 328                 """
 329                 if message is not None:
 330                         self.to_stderr(message)
 331                 if not self.params.get('ignoreerrors', False):
 332                         raise DownloadError(message)
 333                 self._download_retcode = 1
 334
 335         def slow_down(self, start_time, byte_counter):
 336                 """Sleep if the download speed is over the rate limit."""
 337                 rate_limit = self.params.get('ratelimit', None)
 338                 if rate_limit is None or byte_counter == 0:
 339                         return
 340                 now = time.time()
 341                 elapsed = now - start_time
 342                 if elapsed <= 0.0:
 343                         return
 344                 speed = float(byte_counter) / elapsed
 345                 if speed > rate_limit:
 346                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 347
 348         def report_destination(self, filename):
 349                 """Report destination filename."""
 350                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 351
 352         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 353                 """Report download progress."""
 354                 if self.params.get('noprogress', False):
 355                         return
 356                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 357                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 358
 359         def report_resuming_byte(self, resume_len):
 360                 """Report attemtp to resume at given byte."""
 361                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
 362
 363         def report_file_already_downloaded(self, file_name):
 364                 """Report file has already been fully downloaded."""
 365                 try:
 366                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
 367                 except (UnicodeEncodeError), err:
 368                         self.to_stdout(u'[download] The file has already been downloaded')
 369
 370         def report_unable_to_resume(self):
 371                 """Report it was impossible to resume download."""
 372                 self.to_stdout(u'[download] Unable to resume')
 373
 374         def report_finish(self):
 375                 """Report download finished."""
 376                 if self.params.get('noprogress', False):
 377                         self.to_stdout(u'[download] Download completed')
 378                 else:
 379                         self.to_stdout(u'')
 380
 381         def process_info(self, info_dict):
 382                 """Process a single dictionary returned by an InfoExtractor."""
 383                 # Do nothing else if in simulate mode
 384                 if self.params.get('simulate', False):
 385                         # Verify URL if it's an HTTP one
 386                         if info_dict['url'].startswith('http'):
 387                                 try:
 388                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
 389                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
 390                                         raise UnavailableFormatError
 391
 392                         # Forced printings
 393                         if self.params.get('forcetitle', False):
 394                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 395                         if self.params.get('forceurl', False):
 396                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 397
 398                         return
 399
 400                 try:
 401                         template_dict = dict(info_dict)
 402                         template_dict['epoch'] = unicode(long(time.time()))
 403                         filename = self.params['outtmpl'] % template_dict
 404                 except (ValueError, KeyError), err:
 405                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 406                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 407                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
 408                         return
 409
 410                 try:
 411                         self.pmkdir(filename)
 412                 except (OSError, IOError), err:
 413                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 414                         return
 415
 416                 try:
 417                         success = self._do_download(filename, info_dict['url'].encode('utf-8'))
 418                 except (OSError, IOError), err:
 419                         raise UnavailableFormatError
 420                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 421                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 422                         return
 423                 except (ContentTooShortError, ), err:
 424                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 425                         return
 426
 427                 if success:
 428                         try:
 429                                 self.post_process(filename, info_dict)
 430                         except (PostProcessingError), err:
 431                                 self.trouble('ERROR: postprocessing: %s' % str(err))
 432                                 return
 433
 434         def download(self, url_list):
 435                 """Download a given list of URLs."""
 436                 if len(url_list) > 1 and self.fixed_template():
 437                         raise SameFileError(self.params['outtmpl'])
 438
 439                 for url in url_list:
 440                         suitable_found = False
 441                         for ie in self._ies:
 442                                 # Go to next InfoExtractor if not suitable
 443                                 if not ie.suitable(url):
 444                                         continue
 445
 446                                 # Suitable InfoExtractor found
 447                                 suitable_found = True
 448
 449                                 # Extract information from URL and process it
 450                                 ie.extract(url)
 451
 452                                 # Suitable InfoExtractor had been found; go to next URL
 453                                 break
 454
 455                         if not suitable_found:
 456                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 457
 458                 return self._download_retcode
 459
 460         def post_process(self, filename, ie_info):
 461                 """Run the postprocessing chain on the given file."""
 462                 info = dict(ie_info)
 463                 info['filepath'] = filename
 464                 for pp in self._pps:
 465                         info = pp.run(info)
 466                         if info is None:
 467                                 break
 468
 469         def _download_with_rtmpdump(self, filename, url):
 470                 self.report_destination(filename)
 471
 472                 # Check for rtmpdump first
 473                 try:
 474                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 475                 except (OSError, IOError):
 476                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 477                         return False
 478
 479                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 480                 # the connection was interrumpted and resuming appears to be
 481                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 482                 basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
 483                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 484                 while retval == 2 or retval == 1:
 485                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
 486                         time.sleep(2.0) # This seems to be needed
 487                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 488                 if retval == 0:
 489                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
 490                         return True
 491                 else:
 492                         self.trouble('ERROR: rtmpdump exited with code %d' % retval)
 493                         return False
 494
 495         def _do_download(self, filename, url):
 496                 # Attempt to download using rtmpdump
 497                 if url.startswith('rtmp'):
 498                         return self._download_with_rtmpdump(filename, url)
 499
 500                 stream = None
 501                 open_mode = 'wb'
 502                 basic_request = urllib2.Request(url, None, std_headers)
 503                 request = urllib2.Request(url, None, std_headers)
 504
 505                 # Establish possible resume length
 506                 if os.path.isfile(filename):
 507                         resume_len = os.path.getsize(filename)
 508                 else:
 509                         resume_len = 0
 510
 511                 # Request parameters in case of being able to resume
 512                 if self.params.get('continuedl', False) and resume_len != 0:
 513                         self.report_resuming_byte(resume_len)
 514                         request.add_header('Range','bytes=%d-' % resume_len)
 515                         open_mode = 'ab'
 516
 517                 # Establish connection
 518                 try:
 519                         data = urllib2.urlopen(request)
 520                 except (urllib2.HTTPError, ), err:
 521                         if err.code != 416: #  416 is 'Requested range not satisfiable'
 522                                 raise
 523                         # Unable to resume
 524                         data = urllib2.urlopen(basic_request)
 525                         content_length = data.info()['Content-Length']
 526
 527                         if content_length is not None and long(content_length) == resume_len:
 528                                 # Because the file had already been fully downloaded
 529                                 self.report_file_already_downloaded(filename)
 530                                 return True
 531                         else:
 532                                 # Because the server didn't let us
 533                                 self.report_unable_to_resume()
 534                                 open_mode = 'wb'
 535
 536                 data_len = data.info().get('Content-length', None)
 537                 data_len_str = self.format_bytes(data_len)
 538                 byte_counter = 0
 539                 block_size = 1024
 540                 start = time.time()
 541                 while True:
 542                         # Download and write
 543                         before = time.time()
 544                         data_block = data.read(block_size)
 545                         after = time.time()
 546                         data_block_len = len(data_block)
 547                         if data_block_len == 0:
 548                                 break
 549                         byte_counter += data_block_len
 550
 551                         # Open file just in time
 552                         if stream is None:
 553                                 try:
 554                                         (stream, filename) = sanitize_open(filename, open_mode)
 555                                         self.report_destination(filename)
 556                                 except (OSError, IOError), err:
 557                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 558                                         return False
 559                         stream.write(data_block)
 560                         block_size = self.best_block_size(after - before, data_block_len)
 561
 562                         # Progress message
 563                         percent_str = self.calc_percent(byte_counter, data_len)
 564                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 565                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 566                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 567
 568                         # Apply rate limit
 569                         self.slow_down(start, byte_counter)
 570
 571                 self.report_finish()
 572                 if data_len is not None and str(byte_counter) != data_len:
 573                         raise ContentTooShortError(byte_counter, long(data_len))
 574                 return True
 575
 576 class InfoExtractor(object):
 577         """Information Extractor class.
 578
 579         Information extractors are the classes that, given a URL, extract
 580         information from the video (or videos) the URL refers to. This
 581         information includes the real video URL, the video title and simplified
 582         title, author and others. The information is stored in a dictionary
 583         which is then passed to the FileDownloader. The FileDownloader
 584         processes this information possibly downloading the video to the file
 585         system, among other possible outcomes. The dictionaries must include
 586         the following fields:
 587
 588         id:             Video identifier.
 589         url:            Final video URL.
 590         uploader:       Nickname of the video uploader.
 591         title:          Literal title.
 592         stitle:         Simplified title.
 593         ext:            Video filename extension.
 594
 595         Subclasses of this one should re-define the _real_initialize() and
 596         _real_extract() methods, as well as the suitable() static method.
 597         Probably, they should also be instantiated and added to the main
 598         downloader.
 599         """
 600
 601         _ready = False
 602         _downloader = None
 603
 604         def __init__(self, downloader=None):
 605                 """Constructor. Receives an optional downloader."""
 606                 self._ready = False
 607                 self.set_downloader(downloader)
 608
 609         @staticmethod
 610         def suitable(url):
 611                 """Receives a URL and returns True if suitable for this IE."""
 612                 return False
 613
 614         def initialize(self):
 615                 """Initializes an instance (authentication, etc)."""
 616                 if not self._ready:
 617                         self._real_initialize()
 618                         self._ready = True
 619
 620         def extract(self, url):
 621                 """Extracts URL information and returns it in list of dicts."""
 622                 self.initialize()
 623                 return self._real_extract(url)
 624
 625         def set_downloader(self, downloader):
 626                 """Sets the downloader for this IE."""
 627                 self._downloader = downloader
 628
 629         def _real_initialize(self):
 630                 """Real initialization process. Redefine in subclasses."""
 631                 pass
 632
 633         def _real_extract(self, url):
 634                 """Real extraction process. Redefine in subclasses."""
 635                 pass
 636
 637 class YoutubeIE(InfoExtractor):
 638         """Information extractor for youtube.com."""
 639
        # Pattern tested by suitable(). Group 1 is an optional youtube.com
        # URL prefix ending in "v/" or "v=", group 2 is the run of
        # [0-9A-Za-z_-] characters that follows it (or the whole string when
        # there is no prefix). Trailing text is only allowed after a prefix.
        _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        # NOTE(review): the three URLs below are presumably requested to set
        # the language, log in and confirm age; their use is outside this
        # part of the file -- confirm.
        _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in the user's netrc data
        _NETRC_MACHINE = 'youtube'
        _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
        # File extension for some of the format codes above
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
        }
 653
 654         @staticmethod
 655         def suitable(url):
 656                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 657
 658         def report_lang(self):
 659                 """Report attempt to set language."""
 660                 self._downloader.to_stdout(u'[youtube] Setting language')
 661
 662         def report_login(self):
 663                 """Report attempt to log in."""
 664                 self._downloader.to_stdout(u'[youtube] Logging in')
 665
 666         def report_age_confirmation(self):
 667                 """Report attempt to confirm age."""
 668                 self._downloader.to_stdout(u'[youtube] Confirming age')
 669
 670         def report_video_info_webpage_download(self, video_id):
 671                 """Report attempt to download video info webpage."""
 672                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
 673
 674         def report_information_extraction(self, video_id):
 675                 """Report attempt to extract video information."""
 676                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 677
 678         def report_unavailable_format(self, video_id, format):
 679                 """Report extracted video URL."""
 680                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 681
 682         def report_rtmp_download(self):
 683                 """Indicate the download will use the RTMP protocol."""
 684                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
 685
 686         def _real_initialize(self):
 687                 if self._downloader is None:
 688                         return
 689
 690                 username = None
 691                 password = None
 692                 downloader_params = self._downloader.params
 693
 694                 # Attempt to use provided username and password or .netrc data
 695                 if downloader_params.get('username', None) is not None:
 696                         username = downloader_params['username']
 697                         password = downloader_params['password']
 698                 elif downloader_params.get('usenetrc', False):
 699                         try:
 700                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 701                                 if info is not None:
 702                                         username = info[0]
 703                                         password = info[2]
 704                                 else:
 705                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 706                         except (IOError, netrc.NetrcParseError), err:
 707                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 708                                 return
 709
 710                 # Set language
 711                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 712                 try:
 713                         self.report_lang()
 714                         urllib2.urlopen(request).read()
 715                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 716                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 717                         return
 718
 719                 # No authentication to be performed
 720                 if username is None:
 721                         return
 722
 723                 # Log in
 724                 login_form = {
 725                                 'current_form': 'loginForm',
 726                                 'next':         '/',
 727                                 'action_login': 'Log In',
 728                                 'username':     username,
 729                                 'password':     password,
 730                                 }
 731                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 732                 try:
 733                         self.report_login()
 734                         login_results = urllib2.urlopen(request).read()
 735                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 736                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 737                                 return
 738                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 739                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 740                         return
 741
 742                 # Confirm age
 743                 age_form = {
 744                                 'next_url':             '/',
 745                                 'action_confirm':       'Confirm',
 746                                 }
 747                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 748                 try:
 749                         self.report_age_confirmation()
 750                         age_results = urllib2.urlopen(request).read()
 751                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 752                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 753                         return
 754
 755         def _real_extract(self, url):
 756                 # Extract video id from URL
 757                 mobj = re.match(self._VALID_URL, url)
 758                 if mobj is None:
 759                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 760                         return
 761                 video_id = mobj.group(2)
 762
 763                 # Downloader parameters
 764                 best_quality = False
 765                 format_param = None
 766                 quality_index = 0
 767                 if self._downloader is not None:
 768                         params = self._downloader.params
 769                         format_param = params.get('format', None)
 770                         if format_param == '0':
 771                                 format_param = self._available_formats[quality_index]
 772                                 best_quality = True
 773
 774                 while True:
 775                         # Extension
 776                         video_extension = self._video_extensions.get(format_param, 'flv')
 777
 778                         # Get video info
 779                         video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
 780                         request = urllib2.Request(video_info_url, None, std_headers)
 781                         try:
 782                                 self.report_video_info_webpage_download(video_id)
 783                                 video_info_webpage = urllib2.urlopen(request).read()
 784                                 video_info = parse_qs(video_info_webpage)
 785                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 786                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 787                                 return
 788                         self.report_information_extraction(video_id)
 789
 790                         # "t" param
 791                         if 'token' not in video_info:
 792                                 # Attempt to see if YouTube has issued an error message
 793                                 if 'reason' not in video_info:
 794                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
 795                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
 796                                         stream.write(video_info_webpage)
 797                                         stream.close()
 798                                 else:
 799                                         reason = urllib.unquote_plus(video_info['reason'][0])
 800                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
 801                                 return
 802                         token = urllib.unquote_plus(video_info['token'][0])
 803                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
 804                         if format_param is not None:
 805                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 806
 807                         # Check possible RTMP download
 808                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 809                                 self.report_rtmp_download()
 810                                 video_real_url = video_info['conn'][0]
 811
 812                         # uploader
 813                         if 'author' not in video_info:
 814                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 815                                 return
 816                         video_uploader = urllib.unquote_plus(video_info['author'][0])
 817
 818                         # title
 819                         if 'title' not in video_info:
 820                                 self._downloader.trouble(u'ERROR: unable to extract video title')
 821                                 return
 822                         video_title = urllib.unquote_plus(video_info['title'][0])
 823                         video_title = video_title.decode('utf-8')
 824                         video_title = sanitize_title(video_title)
 825
 826                         # simplified title
 827                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 828                         simple_title = simple_title.strip(ur'_')
 829
 830                         try:
 831                                 # Process video information
 832                                 self._downloader.process_info({
 833                                         'id':           video_id.decode('utf-8'),
 834                                         'url':          video_real_url.decode('utf-8'),
 835                                         'uploader':     video_uploader.decode('utf-8'),
 836                                         'title':        video_title,
 837                                         'stitle':       simple_title,
 838                                         'ext':          video_extension.decode('utf-8'),
 839                                 })
 840
 841                                 return
 842
 843                         except UnavailableFormatError, err:
 844                                 if best_quality:
 845                                         if quality_index == len(self._available_formats) - 1:
 846                                                 # I don't ever expect this to happen
 847                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 848                                                 return
 849                                         else:
 850                                                 self.report_unavailable_format(video_id, format_param)
 851                                                 quality_index += 1
 852                                                 format_param = self._available_formats[quality_index]
 853                                                 continue
 854                                 else:
 855                                         self._downloader.trouble('ERROR: format not available for video')
 856                                         return
 857
 858
 859 class MetacafeIE(InfoExtractor):
 860         """Information Extractor for metacafe.com."""
 861
 862         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 863         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 864         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 865         _youtube_ie = None
 866
 867         def __init__(self, youtube_ie, downloader=None):
 868                 InfoExtractor.__init__(self, downloader)
 869                 self._youtube_ie = youtube_ie
 870
 871         @staticmethod
 872         def suitable(url):
 873                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 874
 875         def report_disclaimer(self):
 876                 """Report disclaimer retrieval."""
 877                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 878
 879         def report_age_confirmation(self):
 880                 """Report attempt to confirm age."""
 881                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 882
 883         def report_download_webpage(self, video_id):
 884                 """Report webpage download."""
 885                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 886
 887         def report_extraction(self, video_id):
 888                 """Report information extraction."""
 889                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 890
 891         def _real_initialize(self):
 892                 # Retrieve disclaimer
 893                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 894                 try:
 895                         self.report_disclaimer()
 896                         disclaimer = urllib2.urlopen(request).read()
 897                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 898                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 899                         return
 900
 901                 # Confirm age
 902                 disclaimer_form = {
 903                         'filters': '0',
 904                         'submit': "Continue - I'm over 18",
 905                         }
 906                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
 907                 try:
 908                         self.report_age_confirmation()
 909                         disclaimer = urllib2.urlopen(request).read()
 910                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 911                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 912                         return
 913
 914         def _real_extract(self, url):
 915                 # Extract id and simplified title from URL
 916                 mobj = re.match(self._VALID_URL, url)
 917                 if mobj is None:
 918                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 919                         return
 920
 921                 video_id = mobj.group(1)
 922
 923                 # Check if video comes from YouTube
 924                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 925                 if mobj2 is not None:
 926                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 927                         return
 928
 929                 simple_title = mobj.group(2).decode('utf-8')
 930                 video_extension = 'flv'
 931
 932                 # Retrieve video webpage to extract further information
 933                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 934                 try:
 935                         self.report_download_webpage(video_id)
 936                         webpage = urllib2.urlopen(request).read()
 937                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 938                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 939                         return
 940
 941                 # Extract URL, uploader and title from webpage
 942                 self.report_extraction(video_id)
 943                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 944                 if mobj is None:
 945                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 946                         return
 947                 mediaURL = urllib.unquote(mobj.group(1))
 948
 949                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 950                 #if mobj is None:
 951                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
 952                 #       return
 953                 #gdaKey = mobj.group(1)
 954                 #
 955                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 956
 957                 video_url = mediaURL
 958
 959                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 960                 if mobj is None:
 961                         self._downloader.trouble(u'ERROR: unable to extract title')
 962                         return
 963                 video_title = mobj.group(1).decode('utf-8')
 964                 video_title = sanitize_title(video_title)
 965
 966                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 967                 if mobj is None:
 968                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 969                         return
 970                 video_uploader = mobj.group(1)
 971
 972                 try:
 973                         # Process video information
 974                         self._downloader.process_info({
 975                                 'id':           video_id.decode('utf-8'),
 976                                 'url':          video_url.decode('utf-8'),
 977                                 'uploader':     video_uploader.decode('utf-8'),
 978                                 'title':        video_title,
 979                                 'stitle':       simple_title,
 980                                 'ext':          video_extension.decode('utf-8'),
 981                         })
 982                 except UnavailableFormatError:
 983                         self._downloader.trouble(u'ERROR: format not available for video')
 984
 985
 986 class GoogleIE(InfoExtractor):
 987         """Information extractor for video.google.com."""
 988
 989         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 990
 991         def __init__(self, downloader=None):
 992                 InfoExtractor.__init__(self, downloader)
 993
 994         @staticmethod
 995         def suitable(url):
 996                 return (re.match(GoogleIE._VALID_URL, url) is not None)
 997
 998         def report_download_webpage(self, video_id):
 999                 """Report webpage download."""
1000                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1001
1002         def report_extraction(self, video_id):
1003                 """Report information extraction."""
1004                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1005
1006         def _real_initialize(self):
1007                 return
1008
1009         def _real_extract(self, url):
1010                 # Extract id from URL
1011                 mobj = re.match(self._VALID_URL, url)
1012                 if mobj is None:
1013                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1014                         return
1015
1016                 video_id = mobj.group(1)
1017
1018                 video_extension = 'mp4'
1019
1020                 # Retrieve video webpage to extract further information
1021                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1022                 try:
1023                         self.report_download_webpage(video_id)
1024                         webpage = urllib2.urlopen(request).read()
1025                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1026                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1027                         return
1028
1029                 # Extract URL, uploader, and title from webpage
1030                 self.report_extraction(video_id)
1031                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1032                 if mobj is None:
1033                         video_extension = 'flv'
1034                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1035                 if mobj is None:
1036                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1037                         return
1038                 mediaURL = urllib.unquote(mobj.group(1))
1039                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1040                 mediaURL = mediaURL.replace('\\x26', '\x26')
1041
1042                 video_url = mediaURL
1043
1044                 mobj = re.search(r'<title>(.*)</title>', webpage)
1045                 if mobj is None:
1046                         self._downloader.trouble(u'ERROR: unable to extract title')
1047                         return
1048                 video_title = mobj.group(1).decode('utf-8')
1049                 video_title = sanitize_title(video_title)
1050                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1051
1052                 # Google Video doesn't show uploader nicknames?
1053                 video_uploader = 'NA'
1054
1055                 try:
1056                         # Process video information
1057                         self._downloader.process_info({
1058                                 'id':           video_id.decode('utf-8'),
1059                                 'url':          video_url.decode('utf-8'),
1060                                 'uploader':     video_uploader.decode('utf-8'),
1061                                 'title':        video_title,
1062                                 'stitle':       simple_title,
1063                                 'ext':          video_extension.decode('utf-8'),
1064                         })
1065                 except UnavailableFormatError:
1066                         self._downloader.trouble(u'ERROR: format not available for video')
1067
1068
1069 class PhotobucketIE(InfoExtractor):
1070         """Information extractor for photobucket.com."""
1071
1072         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1073
1074         def __init__(self, downloader=None):
1075                 InfoExtractor.__init__(self, downloader)
1076
1077         @staticmethod
1078         def suitable(url):
1079                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1080
1081         def report_download_webpage(self, video_id):
1082                 """Report webpage download."""
1083                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1084
1085         def report_extraction(self, video_id):
1086                 """Report information extraction."""
1087                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1088
1089         def _real_initialize(self):
1090                 return
1091
1092         def _real_extract(self, url):
1093                 # Extract id from URL
1094                 mobj = re.match(self._VALID_URL, url)
1095                 if mobj is None:
1096                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1097                         return
1098
1099                 video_id = mobj.group(1)
1100
1101                 video_extension = 'flv'
1102
1103                 # Retrieve video webpage to extract further information
1104                 request = urllib2.Request(url)
1105                 try:
1106                         self.report_download_webpage(video_id)
1107                         webpage = urllib2.urlopen(request).read()
1108                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1109                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1110                         return
1111
1112                 # Extract URL, uploader, and title from webpage
1113                 self.report_extraction(video_id)
1114                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1115                 if mobj is None:
1116                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1117                         return
1118                 mediaURL = urllib.unquote(mobj.group(1))
1119
1120                 video_url = mediaURL
1121
1122                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1123                 if mobj is None:
1124                         self._downloader.trouble(u'ERROR: unable to extract title')
1125                         return
1126                 video_title = mobj.group(1).decode('utf-8')
1127                 video_title = sanitize_title(video_title)
1128                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1129
1130                 video_uploader = mobj.group(2).decode('utf-8')
1131
1132                 try:
1133                         # Process video information
1134                         self._downloader.process_info({
1135                                 'id':           video_id.decode('utf-8'),
1136                                 'url':          video_url.decode('utf-8'),
1137                                 'uploader':     video_uploader,
1138                                 'title':        video_title,
1139                                 'stitle':       simple_title,
1140                                 'ext':          video_extension.decode('utf-8'),
1141                         })
1142                 except UnavailableFormatError:
1143                         self._downloader.trouble(u'ERROR: format not available for video')
1144
1145
1146 class GenericIE(InfoExtractor):
1147         """Generic last-resort information extractor."""
1148
1149         def __init__(self, downloader=None):
1150                 InfoExtractor.__init__(self, downloader)
1151
1152         @staticmethod
1153         def suitable(url):
1154                 return True
1155
1156         def report_download_webpage(self, video_id):
1157                 """Report webpage download."""
1158                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1159                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1160
1161         def report_extraction(self, video_id):
1162                 """Report information extraction."""
1163                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1164
1165         def _real_initialize(self):
1166                 return
1167
1168         def _real_extract(self, url):
1169                 video_id = url.split('/')[-1]
1170                 request = urllib2.Request(url)
1171                 try:
1172                         self.report_download_webpage(video_id)
1173                         webpage = urllib2.urlopen(request).read()
1174                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1175                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1176                         return
1177                 except ValueError, err:
1178                         # since this is the last-resort InfoExtractor, if
1179                         # this error is thrown, it'll be thrown here
1180                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1181                         return
1182
1183                 # Start with something easy: JW Player in SWFObject
1184                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1185                 if mobj is None:
1186                         # Broaden the search a little bit
1187                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1188                 if mobj is None:
1189                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1190                         return
1191
1192                 # It's possible that one of the regexes
1193                 # matched, but returned an empty group:
1194                 if mobj.group(1) is None:
1195                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1196                         return
1197
1198                 video_url = urllib.unquote(mobj.group(1))
1199                 video_id  = os.path.basename(video_url)
1200
1201                 # here's a fun little line of code for you:
1202                 video_extension = os.path.splitext(video_id)[1][1:]
1203                 video_id        = os.path.splitext(video_id)[0]
1204
1205                 # it's tempting to parse this further, but you would
1206                 # have to take into account all the variations like
1207                 #   Video Title - Site Name
1208                 #   Site Name | Video Title
1209                 #   Video Title - Tagline | Site Name
1210                 # and so on and so forth; it's just not practical
1211                 mobj = re.search(r'<title>(.*)</title>', webpage)
1212                 if mobj is None:
1213                         self._downloader.trouble(u'ERROR: unable to extract title')
1214                         return
1215                 video_title = mobj.group(1).decode('utf-8')
1216                 video_title = sanitize_title(video_title)
1217                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1218
1219                 # video uploader is domain name
1220                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1221                 if mobj is None:
1222                         self._downloader.trouble(u'ERROR: unable to extract title')
1223                         return
1224                 video_uploader = mobj.group(1).decode('utf-8')
1225
1226                 try:
1227                         # Process video information
1228                         self._downloader.process_info({
1229                                 'id':           video_id.decode('utf-8'),
1230                                 'url':          video_url.decode('utf-8'),
1231                                 'uploader':     video_uploader,
1232                                 'title':        video_title,
1233                                 'stitle':       simple_title,
1234                                 'ext':          video_extension.decode('utf-8'),
1235                         })
1236                 except UnavailableFormatError:
1237                         self._downloader.trouble(u'ERROR: format not available for video')
1238
1239
1240 class YoutubeSearchIE(InfoExtractor):
1241         """Information Extractor for YouTube search queries."""
1242         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1243         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1244         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1245         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1246         _youtube_ie = None
1247         _max_youtube_results = 1000
1248
1249         def __init__(self, youtube_ie, downloader=None):
1250                 InfoExtractor.__init__(self, downloader)
1251                 self._youtube_ie = youtube_ie
1252
1253         @staticmethod
1254         def suitable(url):
1255                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1256
1257         def report_download_page(self, query, pagenum):
1258                 """Report attempt to download playlist page with given number."""
1259                 query = query.decode(preferredencoding())
1260                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1261
1262         def _real_initialize(self):
1263                 self._youtube_ie.initialize()
1264
1265         def _real_extract(self, query):
1266                 mobj = re.match(self._VALID_QUERY, query)
1267                 if mobj is None:
1268                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1269                         return
1270
1271                 prefix, query = query.split(':')
1272                 prefix = prefix[8:]
1273                 query  = query.encode('utf-8')
1274                 if prefix == '':
1275                         self._download_n_results(query, 1)
1276                         return
1277                 elif prefix == 'all':
1278                         self._download_n_results(query, self._max_youtube_results)
1279                         return
1280                 else:
1281                         try:
1282                                 n = long(prefix)
1283                                 if n <= 0:
1284                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1285                                         return
1286                                 elif n > self._max_youtube_results:
1287                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1288                                         n = self._max_youtube_results
1289                                 self._download_n_results(query, n)
1290                                 return
1291                         except ValueError: # parsing prefix as integer fails
1292                                 self._download_n_results(query, 1)
1293                                 return
1294
1295         def _download_n_results(self, query, n):
1296                 """Downloads a specified number of results for a query"""
1297
1298                 video_ids = []
1299                 already_seen = set()
1300                 pagenum = 1
1301
1302                 while True:
1303                         self.report_download_page(query, pagenum)
1304                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1305                         request = urllib2.Request(result_url, None, std_headers)
1306                         try:
1307                                 page = urllib2.urlopen(request).read()
1308                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1309                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1310                                 return
1311
1312                         # Extract video identifiers
1313                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1314                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1315                                 if video_id not in already_seen:
1316                                         video_ids.append(video_id)
1317                                         already_seen.add(video_id)
1318                                         if len(video_ids) == n:
1319                                                 # Specified n videos reached
1320                                                 for id in video_ids:
1321                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1322                                                 return
1323
1324                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1325                                 for id in video_ids:
1326                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1327                                 return
1328
1329                         pagenum = pagenum + 1
1330
1331 class YoutubePlaylistIE(InfoExtractor):
1332         """Information Extractor for YouTube playlists."""
1333
1334         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
1335         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1336         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1337         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
1338         _youtube_ie = None
1339
1340         def __init__(self, youtube_ie, downloader=None):
1341                 InfoExtractor.__init__(self, downloader)
1342                 self._youtube_ie = youtube_ie
1343
1344         @staticmethod
1345         def suitable(url):
1346                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1347
1348         def report_download_page(self, playlist_id, pagenum):
1349                 """Report attempt to download playlist page with given number."""
1350                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1351
1352         def _real_initialize(self):
1353                 self._youtube_ie.initialize()
1354
1355         def _real_extract(self, url):
1356                 # Extract playlist id
1357                 mobj = re.match(self._VALID_URL, url)
1358                 if mobj is None:
1359                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1360                         return
1361
1362                 # Download playlist pages
1363                 playlist_id = mobj.group(1)
1364                 video_ids = []
1365                 pagenum = 1
1366
1367                 while True:
1368                         self.report_download_page(playlist_id, pagenum)
1369                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1370                         try:
1371                                 page = urllib2.urlopen(request).read()
1372                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1373                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1374                                 return
1375
1376                         # Extract video identifiers
1377                         ids_in_page = []
1378                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1379                                 if mobj.group(1) not in ids_in_page:
1380                                         ids_in_page.append(mobj.group(1))
1381                         video_ids.extend(ids_in_page)
1382
1383                         if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1384                                 break
1385                         pagenum = pagenum + 1
1386
1387                 for id in video_ids:
1388                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1389                 return
1390
1391 class YoutubeUserIE(InfoExtractor):
1392         """Information Extractor for YouTube users."""
1393
1394         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1395         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1396         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1397         _youtube_ie = None
1398
1399         def __init__(self, youtube_ie, downloader=None):
1400                 InfoExtractor.__init__(self, downloader)
1401                 self._youtube_ie = youtube_ie
1402
1403         @staticmethod
1404         def suitable(url):
1405                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1406
1407         def report_download_page(self, username):
1408                 """Report attempt to download user page."""
1409                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1410
1411         def _real_initialize(self):
1412                 self._youtube_ie.initialize()
1413
1414         def _real_extract(self, url):
1415                 # Extract username
1416                 mobj = re.match(self._VALID_URL, url)
1417                 if mobj is None:
1418                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1419                         return
1420
1421                 # Download user page
1422                 username = mobj.group(1)
1423                 video_ids = []
1424                 pagenum = 1
1425
1426                 self.report_download_page(username)
1427                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1428                 try:
1429                         page = urllib2.urlopen(request).read()
1430                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1431                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1432                         return
1433
1434                 # Extract video identifiers
1435                 ids_in_page = []
1436
1437                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1438                         if mobj.group(1) not in ids_in_page:
1439                                 ids_in_page.append(mobj.group(1))
1440                 video_ids.extend(ids_in_page)
1441
1442                 for id in video_ids:
1443                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1444                 return
1445
class PostProcessor(object):
        """Base class for post processors.

        A downloader keeps a chain of PostProcessor objects, registered
        through its add_post_processor() method. After a successful
        download it calls run() on each of them in turn: the first one
        receives the initial argument and every following one receives
        whatever the previous run() returned.

        Processing stops as soon as a run() call returns None, or when
        there are no more objects left in the chain.

        Just like InfoExtractor objects, PostProcessor objects take part in
        a "mutual registration" process with their downloader.
        """

        _downloader = None

        def __init__(self, downloader=None):
                """Create the PP, optionally bound to a downloader."""
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Bind this PP to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Process a finished download.

                "information" is a dictionary like those built by the
                InfoExtractors, with one additional "filepath" field that
                points to the downloaded file.

                Return None to stop the postprocessing chain, or return an
                information dictionary (possibly the received one with
                some fields modified) to have it passed to the next object
                in the chain.

                A PostProcessingError exception may also be raised; the
                calling downloader takes it into account.
                """
                # The base implementation hands the dictionary back untouched
                return information
1491
### MAIN PROGRAM ###
# Command line entry point: parses the options, builds the information
# extractors and the file downloader, optionally self-updates, and finally
# downloads every URL given on the command line or in the batch file.
if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # Function to update the program file with the latest version from bitbucket.org
                def update_self(downloader, filename):
                        """Overwrite filename with the latest published program version.

                        Exits the program with an error message if filename is
                        not writable.
                        """
                        # Note: downloader is only used to print progress messages
                        if not os.access (filename, os.W_OK):
                                sys.exit('ERROR: no write permissions on %s' % filename)

                        downloader.to_stdout('Updating to latest stable version...')
                        latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
                        latest_version = urllib.urlopen(latest_url).read().strip()
                        prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
                        newcontent = urllib.urlopen(prog_url).read()
                        stream = open(filename, 'w')
                        try:
                                stream.write(newcontent)
                        finally:
                                # Release the file handle even if the write fails
                                stream.close()
                        downloader.to_stdout('Updated to version %s' % latest_version)

                # General configuration
                # A single opener with both handlers: installing two openers in a
                # row would simply discard the first one.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
                        version='2010.03.07',
                        conflict_handler='resolve',
                )

                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-U', '--update',
                                action='store_true', dest='update_self', help='update this program to latest stable version')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

                authentication = optparse.OptionGroup(parser, 'Authentication Options')
                authentication.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                authentication.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                authentication.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option_group(authentication)

                video_format = optparse.OptionGroup(parser, 'Video Format Options')
                video_format.add_option('-f', '--format',
                                action='store', dest='format', metavar='FMT', help='video format code')
                video_format.add_option('-b', '--best-quality',
                                action='store_const', dest='format', help='download the best quality video possible', const='0')
                video_format.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
                parser.add_option_group(video_format)

                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
                verbosity.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                verbosity.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                verbosity.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                verbosity.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                verbosity.add_option('--no-progress',
                                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
                parser.add_option_group(verbosity)

                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
                filesystem.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                filesystem.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                filesystem.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                filesystem.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='F', help='file containing URLs to download')
                filesystem.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                filesystem.add_option('-c', '--continue',
                                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
                parser.add_option_group(filesystem)

                (opts, args) = parser.parse_args()

                # Batch file verification
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                batchfd = open(opts.batchfile, 'r')
                                try:
                                        batchurls = batchfd.readlines()
                                finally:
                                        # Do not leave the batch file open
                                        batchfd.close()
                                batchurls = [x.strip() for x in batchurls]
                                batchurls = [x for x in batchurls if len(x) > 0]
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                # URLs from the batch file are processed before the ones given as arguments
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        parser.error(u'using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        parser.error(u'account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        parser.error(u'using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        parser.error(u'using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                parser.error(u'invalid rate limit specified')
                        opts.ratelimit = numeric_limit

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_user_ie = YoutubeUserIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)
                google_ie = GoogleIE()
                photobucket_ie = PhotobucketIE()
                generic_ie = GenericIE()

                # File downloader
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': opts.format,
                        # First applicable choice wins: explicit template, simplified
                        # title, literal title, and finally the bare video id.
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        'continuedl': opts.continue_dl,
                        'noprogress': opts.noprogress,
                        })
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(youtube_user_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                fd.add_info_extractor(google_ie)
                fd.add_info_extractor(photobucket_ie)

                # This must come last since it's the
                # fallback if none of the others work
                fd.add_info_extractor(generic_ie)

                # Update version
                if opts.update_self:
                        update_self(fd, sys.argv[0])

                # Maybe do nothing
                if len(all_urls) < 1:
                        if not opts.update_self:
                                parser.error(u'you must provide at least one URL')
                        else:
                                sys.exit()
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')