Raphaël G. Git Repositories - youtubedl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
# Tuple of contributor names.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        )

__license__ = 'Public Domain'
# The version string has the form YYYY.MM.DD.
__version__ = '2011.09.14'

# URL of the raw script on the master branch.
# NOTE(review): presumably fetched by a self-update feature -- confirm against
# where UPDATE_URL is used further down the file.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
  21
  22 import cookielib
  23 import datetime
  24 import gzip
  25 import htmlentitydefs
  26 import httplib
  27 import locale
  28 import math
  29 import netrc
  30 import os
  31 import os.path
  32 import re
  33 import socket
  34 import string
  35 import subprocess
  36 import sys
  37 import time
  38 import urllib
  39 import urllib2
  40 import warnings
  41 import zlib
  42
  43 if os.name == 'nt':
  44         import ctypes
  45
  46 try:
  47         import email.utils
  48 except ImportError: # Python 2.4
  49         import email.Utils
  50 try:
  51         import cStringIO as StringIO
  52 except ImportError:
  53         import StringIO
  54
  55 # parse_qs was moved from the cgi module to the urlparse module recently.
  56 try:
  57         from urlparse import parse_qs
  58 except ImportError:
  59         from cgi import parse_qs
  60
  61 try:
  62         import lxml.etree
  63 except ImportError:
  64         pass # Handled below
  65
  66 try:
  67         import xml.etree.ElementTree
  68 except ImportError: # Python<2.5
  69         pass # Not officially supported, but let it slip
  70
# Default HTTP headers. YoutubeDLHandler.http_request() sets every one of
# these on each outgoing request, replacing any value the request already had.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string made of the ASCII letters followed by the decimal digits.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  80
# Use the standard json module when it exists; otherwise define a minimal
# stand-in class named `json` that only offers loads().
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                @staticmethod
                def loads(s):
                        # Decode a UTF-8 encoded JSON document and return the
                        # corresponding Python value. Raises ValueError on
                        # malformed input or trailing data.
                        #
                        # Every parse* helper below takes the index `i` at which
                        # its value starts and returns a tuple
                        # (index just past the value, parsed value).
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Always raises; the message includes the position
                                # and the unparsed remainder of the input.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past whitespace. With expectMore set,
                                # reaching the end of the input is an error.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # re.sub() callback: group(1) is the text that
                                # follows the backslash of an escape sequence.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        # Single \uXXXX escape.
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # Surrogate pair \uXXXX\uXXXX: combine the
                                        # high and low halves into one code point.
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # `i` points at the opening quote.
                                i += 1
                                e = i
                                # Find the closing quote: a quote preceded by an
                                # odd number of backslashes is escaped, so the
                                # search continues after it.
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # Alternatives are tried in order: surrogate pair,
                                # single \u escape, any other character, or a
                                # dangling backslash at the end of the string.
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # `i` points at the opening curly brace.
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # `i` points at the opening bracket.
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # Literals true, false and null.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or an exponent makes the value a float.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; anything
                        # not listed here is handed to parseNumber.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                # Parse one value of any type, skipping the
                                # whitespace on both sides of it.
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        # Anything left after the top-level value is an error.
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
 193
 194 def preferredencoding():
 195         """Get preferred encoding.
 196
 197         Returns the best encoding scheme for the system, based on
 198         locale.getpreferredencoding() and some further tweaks.
 199         """
 200         def yield_preferredencoding():
 201                 try:
 202                         pref = locale.getpreferredencoding()
 203                         u'TEST'.encode(pref)
 204                 except:
 205                         pref = 'UTF-8'
 206                 while True:
 207                         yield pref
 208         return yield_preferredencoding().next()
 209
 210
 211 def htmlentity_transform(matchobj):
 212         """Transforms an HTML entity to a Unicode character.
 213
 214         This function receives a match object and is intended to be used with
 215         the re.sub() function.
 216         """
 217         entity = matchobj.group(1)
 218
 219         # Known non-numeric HTML entity
 220         if entity in htmlentitydefs.name2codepoint:
 221                 return unichr(htmlentitydefs.name2codepoint[entity])
 222
 223         # Unicode character
 224         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 225         if mobj is not None:
 226                 numstr = mobj.group(1)
 227                 if numstr.startswith(u'x'):
 228                         base = 16
 229                         numstr = u'0%s' % numstr
 230                 else:
 231                         base = 10
 232                 return unichr(long(numstr, base))
 233
 234         # Unknown entity in name, return its literal representation
 235         return (u'&%s;' % entity)
 236
 237
 238 def sanitize_title(utitle):
 239         """Sanitizes a video title so it could be used as part of a filename."""
 240         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 241         return utitle.replace(unicode(os.sep), u'%')
 242
 243
 244 def sanitize_open(filename, open_mode):
 245         """Try to open the given filename, and slightly tweak it if this fails.
 246
 247         Attempts to open the given filename. If this fails, it tries to change
 248         the filename slightly, step by step, until it's either able to open it
 249         or it fails and raises a final exception, like the standard open()
 250         function.
 251
 252         It returns the tuple (stream, definitive_file_name).
 253         """
 254         try:
 255                 if filename == u'-':
 256                         if sys.platform == 'win32':
 257                                 import msvcrt
 258                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 259                         return (sys.stdout, filename)
 260                 stream = open(filename, open_mode)
 261                 return (stream, filename)
 262         except (IOError, OSError), err:
 263                 # In case of error, try to remove win32 forbidden chars
 264                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 265
 266                 # An exception here should be caught in the caller
 267                 stream = open(filename, open_mode)
 268                 return (stream, filename)
 269
 270
 271 def timeconvert(timestr):
 272         """Convert RFC 2822 defined time string into system timestamp"""
 273         timestamp = None
 274         timetuple = email.utils.parsedate_tz(timestr)
 275         if timetuple is not None:
 276                 timestamp = email.utils.mktime_tz(timetuple)
 277         return timestamp
 278
 279
 280 class DownloadError(Exception):
 281         """Download Error exception.
 282
 283         This exception may be thrown by FileDownloader objects if they are not
 284         configured to continue on errors. They will contain the appropriate
 285         error message.
 286         """
 287         pass
 288
 289
 290 class SameFileError(Exception):
 291         """Same File exception.
 292
 293         This exception will be thrown by FileDownloader objects if they detect
 294         multiple files would have to be downloaded to the same file on disk.
 295         """
 296         pass
 297
 298
 299 class PostProcessingError(Exception):
 300         """Post Processing exception.
 301
 302         This exception may be raised by PostProcessor's .run() method to
 303         indicate an error in the postprocessing task.
 304         """
 305         pass
 306
 307
 308 class UnavailableVideoError(Exception):
 309         """Unavailable Format exception.
 310
 311         This exception will be thrown when a video is requested
 312         in a format that is not available for that video.
 313         """
 314         pass
 315
 316
 317 class ContentTooShortError(Exception):
 318         """Content Too Short exception.
 319
 320         This exception may be raised by FileDownloader objects when a file they
 321         download is too small for what the server announced first, indicating
 322         the connection was probably interrupted.
 323         """
 324         # Both in bytes
 325         downloaded = None
 326         expected = None
 327
 328         def __init__(self, downloaded, expected):
 329                 self.downloaded = downloaded
 330                 self.expected = expected
 331
 332
 333 class YoutubeDLHandler(urllib2.HTTPHandler):
 334         """Handler for HTTP requests and responses.
 335
 336         This class, when installed with an OpenerDirector, automatically adds
 337         the standard headers to every HTTP request and handles gzipped and
 338         deflated responses from web servers. If compression is to be avoided in
 339         a particular request, the original request in the program code only has
 340         to include the HTTP header "Youtubedl-No-Compression", which will be
 341         removed before making the real request.
 342
 343         Part of this code was copied from:
 344
 345         http://techknack.net/python-urllib2-handlers/
 346
 347         Andrew Rowls, the author of that code, agreed to release it to the
 348         public domain.
 349         """
 350
 351         @staticmethod
 352         def deflate(data):
 353                 try:
 354                         return zlib.decompress(data, -zlib.MAX_WBITS)
 355                 except zlib.error:
 356                         return zlib.decompress(data)
 357
 358         @staticmethod
 359         def addinfourl_wrapper(stream, headers, url, code):
 360                 if hasattr(urllib2.addinfourl, 'getcode'):
 361                         return urllib2.addinfourl(stream, headers, url, code)
 362                 ret = urllib2.addinfourl(stream, headers, url)
 363                 ret.code = code
 364                 return ret
 365
 366         def http_request(self, req):
 367                 for h in std_headers:
 368                         if h in req.headers:
 369                                 del req.headers[h]
 370                         req.add_header(h, std_headers[h])
 371                 if 'Youtubedl-no-compression' in req.headers:
 372                         if 'Accept-encoding' in req.headers:
 373                                 del req.headers['Accept-encoding']
 374                         del req.headers['Youtubedl-no-compression']
 375                 return req
 376
 377         def http_response(self, req, resp):
 378                 old_resp = resp
 379                 # gzip
 380                 if resp.headers.get('Content-encoding', '') == 'gzip':
 381                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 382                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 383                         resp.msg = old_resp.msg
 384                 # deflate
 385                 if resp.headers.get('Content-encoding', '') == 'deflate':
 386                         gz = StringIO.StringIO(self.deflate(resp.read()))
 387                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 388                         resp.msg = old_resp.msg
 389                 return resp
 390
 391
 392 class FileDownloader(object):
 393         """File Downloader class.
 394
 395         File downloader objects are the ones responsible of downloading the
 396         actual video file and writing it to disk if the user has requested
 397         it, among some other tasks. In most cases there should be one per
 398         program. As, given a video URL, the downloader doesn't know how to
 399         extract all the needed information, task that InfoExtractors do, it
 400         has to pass the URL to one of them.
 401
 402         For this, file downloader objects have a method that allows
 403         InfoExtractors to be registered in a given order. When it is passed
 404         a URL, the file downloader handles it to the first InfoExtractor it
 405         finds that reports being able to handle it. The InfoExtractor extracts
 406         all the information about the video or videos the URL refers to, and
 407         asks the FileDownloader to process the video information, possibly
 408         downloading the video.
 409
 410         File downloaders accept a lot of parameters. In order not to saturate
 411         the object constructor with arguments, it receives a dictionary of
 412         options instead. These options are available through the params
 413         attribute for the InfoExtractors to use. The FileDownloader also
 414         registers itself as the downloader in charge for the InfoExtractors
 415         that are added to it, so this is a "mutual registration".
 416
 417         Available options:
 418
 419         username:         Username for authentication purposes.
 420         password:         Password for authentication purposes.
 421         usenetrc:         Use netrc for authentication instead.
 422         quiet:            Do not print messages to stdout.
 423         forceurl:         Force printing final URL.
 424         forcetitle:       Force printing title.
 425         forcethumbnail:   Force printing thumbnail URL.
 426         forcedescription: Force printing description.
 427         forcefilename:    Force printing final filename.
 428         simulate:         Do not download the video files.
 429         format:           Video format code.
 430         format_limit:     Highest quality format to try.
 431         outtmpl:          Template for output names.
 432         ignoreerrors:     Do not stop on download errors.
 433         ratelimit:        Download speed limit, in bytes/sec.
 434         nooverwrites:     Prevent overwriting files.
 435         retries:          Number of times to retry for HTTP error 5xx
 436         continuedl:       Try to continue downloads if possible.
 437         noprogress:       Do not print the progress bar.
 438         playliststart:    Playlist item to start at.
 439         playlistend:      Playlist item to end at.
 440         logtostderr:      Log messages to stderr instead of stdout.
 441         consoletitle:     Display progress in console window's titlebar.
 442         nopart:           Do not use temporary .part files.
 443         updatetime:       Use the Last-modified header to set output file timestamps.
 444         writedescription: Write the video description to a .description file
 445         writeinfojson:    Write the video description to a .info.json file
 446         """
 447
 448         params = None
 449         _ies = []
 450         _pps = []
 451         _download_retcode = None
 452         _num_downloads = None
 453         _screen_file = None
 454
 455         def __init__(self, params):
 456                 """Create a FileDownloader object with the given options."""
 457                 self._ies = []
 458                 self._pps = []
 459                 self._download_retcode = 0
 460                 self._num_downloads = 0
 461                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 462                 self.params = params
 463
 464         @staticmethod
 465         def format_bytes(bytes):
 466                 if bytes is None:
 467                         return 'N/A'
 468                 if type(bytes) is str:
 469                         bytes = float(bytes)
 470                 if bytes == 0.0:
 471                         exponent = 0
 472                 else:
 473                         exponent = long(math.log(bytes, 1024.0))
 474                 suffix = 'bkMGTPEZY'[exponent]
 475                 converted = float(bytes) / float(1024 ** exponent)
 476                 return '%.2f%s' % (converted, suffix)
 477
 478         @staticmethod
 479         def calc_percent(byte_counter, data_len):
 480                 if data_len is None:
 481                         return '---.-%'
 482                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 483
 484         @staticmethod
 485         def calc_eta(start, now, total, current):
 486                 if total is None:
 487                         return '--:--'
 488                 dif = now - start
 489                 if current == 0 or dif < 0.001: # One millisecond
 490                         return '--:--'
 491                 rate = float(current) / dif
 492                 eta = long((float(total) - float(current)) / rate)
 493                 (eta_mins, eta_secs) = divmod(eta, 60)
 494                 if eta_mins > 99:
 495                         return '--:--'
 496                 return '%02d:%02d' % (eta_mins, eta_secs)
 497
 498         @staticmethod
 499         def calc_speed(start, now, bytes):
 500                 dif = now - start
 501                 if bytes == 0 or dif < 0.001: # One millisecond
 502                         return '%10s' % '---b/s'
 503                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 504
 505         @staticmethod
 506         def best_block_size(elapsed_time, bytes):
 507                 new_min = max(bytes / 2.0, 1.0)
 508                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 509                 if elapsed_time < 0.001:
 510                         return long(new_max)
 511                 rate = bytes / elapsed_time
 512                 if rate > new_max:
 513                         return long(new_max)
 514                 if rate < new_min:
 515                         return long(new_min)
 516                 return long(rate)
 517
 518         @staticmethod
 519         def parse_bytes(bytestr):
 520                 """Parse a string indicating a byte quantity into a long integer."""
 521                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 522                 if matchobj is None:
 523                         return None
 524                 number = float(matchobj.group(1))
 525                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 526                 return long(round(number * multiplier))
 527
 528         def add_info_extractor(self, ie):
 529                 """Add an InfoExtractor object to the end of the list."""
 530                 self._ies.append(ie)
 531                 ie.set_downloader(self)
 532
 533         def add_post_processor(self, pp):
 534                 """Add a PostProcessor object to the end of the chain."""
 535                 self._pps.append(pp)
 536                 pp.set_downloader(self)
 537
 538         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 539                 """Print message to stdout if not in quiet mode."""
 540                 try:
 541                         if not self.params.get('quiet', False):
 542                                 terminator = [u'\n', u''][skip_eol]
 543                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 544                         self._screen_file.flush()
 545                 except (UnicodeEncodeError), err:
 546                         if not ignore_encoding_errors:
 547                                 raise
 548
 549         def to_stderr(self, message):
 550                 """Print message to stderr."""
 551                 print >>sys.stderr, message.encode(preferredencoding())
 552
 553         def to_cons_title(self, message):
 554                 """Set console/terminal window title to message."""
 555                 if not self.params.get('consoletitle', False):
 556                         return
 557                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 558                         # c_wchar_p() might not be necessary if `message` is
 559                         # already of type unicode()
 560                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 561                 elif 'TERM' in os.environ:
 562                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 563
 564         def fixed_template(self):
 565                 """Checks if the output template is fixed."""
 566                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 567
 568         def trouble(self, message=None):
 569                 """Determine action to take when a download problem appears.
 570
 571                 Depending on if the downloader has been configured to ignore
 572                 download errors or not, this method may throw an exception or
 573                 not when errors are found, after printing the message.
 574                 """
 575                 if message is not None:
 576                         self.to_stderr(message)
 577                 if not self.params.get('ignoreerrors', False):
 578                         raise DownloadError(message)
 579                 self._download_retcode = 1
 580
 581         def slow_down(self, start_time, byte_counter):
 582                 """Sleep if the download speed is over the rate limit."""
 583                 rate_limit = self.params.get('ratelimit', None)
 584                 if rate_limit is None or byte_counter == 0:
 585                         return
 586                 now = time.time()
 587                 elapsed = now - start_time
 588                 if elapsed <= 0.0:
 589                         return
 590                 speed = float(byte_counter) / elapsed
 591                 if speed > rate_limit:
 592                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 593
 594         def temp_name(self, filename):
 595                 """Returns a temporary filename for the given filename."""
 596                 if self.params.get('nopart', False) or filename == u'-' or \
 597                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 598                         return filename
 599                 return filename + u'.part'
 600
 601         def undo_temp_name(self, filename):
 602                 if filename.endswith(u'.part'):
 603                         return filename[:-len(u'.part')]
 604                 return filename
 605
 606         def try_rename(self, old_filename, new_filename):
 607                 try:
 608                         if old_filename == new_filename:
 609                                 return
 610                         os.rename(old_filename, new_filename)
 611                 except (IOError, OSError), err:
 612                         self.trouble(u'ERROR: unable to rename file')
 613
 614         def try_utime(self, filename, last_modified_hdr):
 615                 """Try to set the last-modified time of the given file."""
 616                 if last_modified_hdr is None:
 617                         return
 618                 if not os.path.isfile(filename):
 619                         return
 620                 timestr = last_modified_hdr
 621                 if timestr is None:
 622                         return
 623                 filetime = timeconvert(timestr)
 624                 if filetime is None:
 625                         return
 626                 try:
 627                         os.utime(filename, (time.time(), filetime))
 628                 except:
 629                         pass
 630
 631         def report_writedescription(self, descfn):
 632                 """ Report that the description file is being written """
 633                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
 634
 635         def report_writeinfojson(self, infofn):
 636                 """ Report that the metadata file has been written """
 637                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
 638
 639         def report_destination(self, filename):
 640                 """Report destination filename."""
 641                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 642
 643         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 644                 """Report download progress."""
 645                 if self.params.get('noprogress', False):
 646                         return
 647                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 648                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 649                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 650                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 651
 652         def report_resuming_byte(self, resume_len):
 653                 """Report attempt to resume at given byte."""
 654                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 655
 656         def report_retry(self, count, retries):
 657                 """Report retry in case of HTTP error 5xx"""
 658                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 659
 660         def report_file_already_downloaded(self, file_name):
 661                 """Report file has already been fully downloaded."""
 662                 try:
 663                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 664                 except (UnicodeEncodeError), err:
 665                         self.to_screen(u'[download] The file has already been downloaded')
 666
 667         def report_unable_to_resume(self):
 668                 """Report it was impossible to resume download."""
 669                 self.to_screen(u'[download] Unable to resume')
 670
 671         def report_finish(self):
 672                 """Report download finished."""
 673                 if self.params.get('noprogress', False):
 674                         self.to_screen(u'[download] Download completed')
 675                 else:
 676                         self.to_screen(u'')
 677
 678         def increment_downloads(self):
 679                 """Increment the ordinal that assigns a number to each file."""
 680                 self._num_downloads += 1
 681
 682         def prepare_filename(self, info_dict):
 683                 """Generate the output filename."""
 684                 try:
 685                         template_dict = dict(info_dict)
 686                         template_dict['epoch'] = unicode(long(time.time()))
 687                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 688                         filename = self.params['outtmpl'] % template_dict
 689                         return filename
 690                 except (ValueError, KeyError), err:
 691                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 692                         return None
 693
 694         def process_info(self, info_dict):
 695                 """Process a single dictionary returned by an InfoExtractor."""
 696                 filename = self.prepare_filename(info_dict)
 697                 # Do nothing else if in simulate mode
 698                 if self.params.get('simulate', False):
 699                         # Forced printings
 700                         if self.params.get('forcetitle', False):
 701                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 702                         if self.params.get('forceurl', False):
 703                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 704                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 705                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 706                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 707                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 708                         if self.params.get('forcefilename', False) and filename is not None:
 709                                 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 710
 711                         return
 712
 713                 if filename is None:
 714                         return
 715                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 716                         self.to_stderr(u'WARNING: file exists and will be skipped')
 717                         return
 718
 719                 try:
 720                         dn = os.path.dirname(filename)
 721                         if dn != '' and not os.path.exists(dn):
 722                                 os.makedirs(dn)
 723                 except (OSError, IOError), err:
 724                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
 725                         return
 726
 727                 if self.params.get('writedescription', False):
 728                         try:
 729                                 descfn = filename + '.description'
 730                                 self.report_writedescription(descfn)
 731                                 descfile = open(descfn, 'wb')
 732                                 try:
 733                                         descfile.write(info_dict['description'].encode('utf-8'))
 734                                 finally:
 735                                         descfile.close()
 736                         except (OSError, IOError):
 737                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
 738                                 return
 739
 740                 if self.params.get('writeinfojson', False):
 741                         infofn = filename + '.info.json'
 742                         self.report_writeinfojson(infofn)
 743                         try:
 744                                 json.dump
 745                         except (NameError,AttributeError):
 746                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
 747                                 return
 748                         try:
 749                                 infof = open(infofn, 'wb')
 750                                 try:
 751                                         json.dump(info_dict, infof)
 752                                 finally:
 753                                         infof.close()
 754                         except (OSError, IOError):
 755                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
 756                                 return
 757
 758                 try:
 759                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
 760                 except (OSError, IOError), err:
 761                         raise UnavailableVideoError
 762                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 763                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 764                         return
 765                 except (ContentTooShortError, ), err:
 766                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 767                         return
 768
 769                 if success:
 770                         try:
 771                                 self.post_process(filename, info_dict)
 772                         except (PostProcessingError), err:
 773                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
 774                                 return
 775
 776         def download(self, url_list):
 777                 """Download a given list of URLs."""
 778                 if len(url_list) > 1 and self.fixed_template():
 779                         raise SameFileError(self.params['outtmpl'])
 780
 781                 for url in url_list:
 782                         suitable_found = False
 783                         for ie in self._ies:
 784                                 # Go to next InfoExtractor if not suitable
 785                                 if not ie.suitable(url):
 786                                         continue
 787
 788                                 # Suitable InfoExtractor found
 789                                 suitable_found = True
 790
 791                                 # Extract information from URL and process it
 792                                 ie.extract(url)
 793
 794                                 # Suitable InfoExtractor had been found; go to next URL
 795                                 break
 796
 797                         if not suitable_found:
 798                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 799
 800                 return self._download_retcode
 801
 802         def post_process(self, filename, ie_info):
 803                 """Run the postprocessing chain on the given file."""
 804                 info = dict(ie_info)
 805                 info['filepath'] = filename
 806                 for pp in self._pps:
 807                         info = pp.run(info)
 808                         if info is None:
 809                                 break
 810
 811         def _download_with_rtmpdump(self, filename, url, player_url):
 812                 self.report_destination(filename)
 813                 tmpfilename = self.temp_name(filename)
 814
 815                 # Check for rtmpdump first
 816                 try:
 817                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 818                 except (OSError, IOError):
 819                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 820                         return False
 821
 822                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 823                 # the connection was interrumpted and resuming appears to be
 824                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 825                 basic_args = ['rtmpdump'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 826                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 827                 while retval == 2 or retval == 1:
 828                         prevsize = os.path.getsize(tmpfilename)
 829                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 830                         time.sleep(5.0) # This seems to be needed
 831                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 832                         cursize = os.path.getsize(tmpfilename)
 833                         if prevsize == cursize and retval == 1:
 834                                 break
 835                 if retval == 0:
 836                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 837                         self.try_rename(tmpfilename, filename)
 838                         return True
 839                 else:
 840                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 841                         return False
 842
 843         def _do_download(self, filename, url, player_url):
 844                 # Check file already present
 845                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 846                         self.report_file_already_downloaded(filename)
 847                         return True
 848
 849                 # Attempt to download using rtmpdump
 850                 if url.startswith('rtmp'):
 851                         return self._download_with_rtmpdump(filename, url, player_url)
 852
 853                 tmpfilename = self.temp_name(filename)
 854                 stream = None
 855                 open_mode = 'wb'
 856
 857                 # Do not include the Accept-Encoding header
 858                 headers = {'Youtubedl-no-compression': 'True'}
 859                 basic_request = urllib2.Request(url, None, headers)
 860                 request = urllib2.Request(url, None, headers)
 861
 862                 # Establish possible resume length
 863                 if os.path.isfile(tmpfilename):
 864                         resume_len = os.path.getsize(tmpfilename)
 865                 else:
 866                         resume_len = 0
 867
 868                 # Request parameters in case of being able to resume
 869                 if self.params.get('continuedl', False) and resume_len != 0:
 870                         self.report_resuming_byte(resume_len)
 871                         request.add_header('Range', 'bytes=%d-' % resume_len)
 872                         open_mode = 'ab'
 873
 874                 count = 0
 875                 retries = self.params.get('retries', 0)
 876                 while count <= retries:
 877                         # Establish connection
 878                         try:
 879                                 data = urllib2.urlopen(request)
 880                                 break
 881                         except (urllib2.HTTPError, ), err:
 882                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 883                                         # Unexpected HTTP error
 884                                         raise
 885                                 elif err.code == 416:
 886                                         # Unable to resume (requested range not satisfiable)
 887                                         try:
 888                                                 # Open the connection again without the range header
 889                                                 data = urllib2.urlopen(basic_request)
 890                                                 content_length = data.info()['Content-Length']
 891                                         except (urllib2.HTTPError, ), err:
 892                                                 if err.code < 500 or err.code >= 600:
 893                                                         raise
 894                                         else:
 895                                                 # Examine the reported length
 896                                                 if (content_length is not None and
 897                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
 898                                                         # The file had already been fully downloaded.
 899                                                         # Explanation to the above condition: in issue #175 it was revealed that
 900                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 901                                                         # changing the file size slightly and causing problems for some users. So
 902                                                         # I decided to implement a suggested change and consider the file
 903                                                         # completely downloaded if the file size differs less than 100 bytes from
 904                                                         # the one in the hard drive.
 905                                                         self.report_file_already_downloaded(filename)
 906                                                         self.try_rename(tmpfilename, filename)
 907                                                         return True
 908                                                 else:
 909                                                         # The length does not match, we start the download over
 910                                                         self.report_unable_to_resume()
 911                                                         open_mode = 'wb'
 912                                                         break
 913                         # Retry
 914                         count += 1
 915                         if count <= retries:
 916                                 self.report_retry(count, retries)
 917
 918                 if count > retries:
 919                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 920                         return False
 921
 922                 data_len = data.info().get('Content-length', None)
 923                 if data_len is not None:
 924                         data_len = long(data_len) + resume_len
 925                 data_len_str = self.format_bytes(data_len)
 926                 byte_counter = 0 + resume_len
 927                 block_size = 1024
 928                 start = time.time()
 929                 while True:
 930                         # Download and write
 931                         before = time.time()
 932                         data_block = data.read(block_size)
 933                         after = time.time()
 934                         if len(data_block) == 0:
 935                                 break
 936                         byte_counter += len(data_block)
 937
 938                         # Open file just in time
 939                         if stream is None:
 940                                 try:
 941                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 942                                         assert stream is not None
 943                                         filename = self.undo_temp_name(tmpfilename)
 944                                         self.report_destination(filename)
 945                                 except (OSError, IOError), err:
 946                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 947                                         return False
 948                         try:
 949                                 stream.write(data_block)
 950                         except (IOError, OSError), err:
 951                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 952                                 return False
 953                         block_size = self.best_block_size(after - before, len(data_block))
 954
 955                         # Progress message
 956                         percent_str = self.calc_percent(byte_counter, data_len)
 957                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 958                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 959                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 960
 961                         # Apply rate limit
 962                         self.slow_down(start, byte_counter - resume_len)
 963
 964                 if stream is None:
 965                         self.trouble(u'\nERROR: Did not get any data blocks')
 966                         return False
 967                 stream.close()
 968                 self.report_finish()
 969                 if data_len is not None and byte_counter != data_len:
 970                         raise ContentTooShortError(byte_counter, long(data_len))
 971                 self.try_rename(tmpfilename, filename)
 972
 973                 # Update file modification time
 974                 if self.params.get('updatetime', True):
 975                         self.try_utime(filename, data.info().get('last-modified', None))
 976
 977                 return True
 978
 979
 980 class InfoExtractor(object):
 981         """Information Extractor class.
 982
 983         Information extractors are the classes that, given a URL, extract
 984         information from the video (or videos) the URL refers to. This
 985         information includes the real video URL, the video title and simplified
 986         title, author and others. The information is stored in a dictionary
 987         which is then passed to the FileDownloader. The FileDownloader
 988         processes this information possibly downloading the video to the file
 989         system, among other possible outcomes. The dictionaries must include
 990         the following fields:
 991
 992         id:             Video identifier.
 993         url:            Final video URL.
 994         uploader:       Nickname of the video uploader.
 995         title:          Literal title.
 996         stitle:         Simplified title.
 997         ext:            Video filename extension.
 998         format:         Video format.
 999         player_url:     SWF Player URL (may be None).
1000
1001         The following fields are optional. Their primary purpose is to allow
1002         youtube-dl to serve as the backend for a video search function, such
1003         as the one in youtube2mp3.  They are only used when their respective
1004         forced printing functions are called:
1005
1006         thumbnail:      Full URL to a video thumbnail image.
1007         description:    One-line video description.
1008
1009         Subclasses of this one should re-define the _real_initialize() and
1010         _real_extract() methods, as well as the suitable() static method.
1011         Probably, they should also be instantiated and added to the main
1012         downloader.
1013         """
1014
1015         _ready = False
1016         _downloader = None
1017
1018         def __init__(self, downloader=None):
1019                 """Constructor. Receives an optional downloader."""
1020                 self._ready = False
1021                 self.set_downloader(downloader)
1022
1023         @staticmethod
1024         def suitable(url):
1025                 """Receives a URL and returns True if suitable for this IE."""
1026                 return False
1027
1028         def initialize(self):
1029                 """Initializes an instance (authentication, etc)."""
1030                 if not self._ready:
1031                         self._real_initialize()
1032                         self._ready = True
1033
1034         def extract(self, url):
1035                 """Extracts URL information and returns it in list of dicts."""
1036                 self.initialize()
1037                 return self._real_extract(url)
1038
1039         def set_downloader(self, downloader):
1040                 """Sets the downloader for this IE."""
1041                 self._downloader = downloader
1042
1043         def _real_initialize(self):
1044                 """Real initialization process. Redefine in subclasses."""
1045                 pass
1046
1047         def _real_extract(self, url):
1048                 """Real extraction process. Redefine in subclasses."""
1049                 pass
1050
1051
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        # URL pattern checked by suitable(); group 2 of a match is taken as the
        # video identifier by _real_extract().
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Requested by _real_initialize() to set the language.
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        # Target of the login form posted by _real_initialize().
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        # Target of the age confirmation form posted by _real_initialize().
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in .netrc when the 'usenetrc' parameter is set.
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
        # Presumably keyed by the format codes above; several of those codes
        # have no entry here -- TODO confirm the fallback used by the callers.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '45': 'webm',
        }
1072
1073         @staticmethod
1074         def suitable(url):
1075                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1076
1077         def report_lang(self):
1078                 """Report attempt to set language."""
1079                 self._downloader.to_screen(u'[youtube] Setting language')
1080
1081         def report_login(self):
1082                 """Report attempt to log in."""
1083                 self._downloader.to_screen(u'[youtube] Logging in')
1084
1085         def report_age_confirmation(self):
1086                 """Report attempt to confirm age."""
1087                 self._downloader.to_screen(u'[youtube] Confirming age')
1088
1089         def report_video_webpage_download(self, video_id):
1090                 """Report attempt to download video webpage."""
1091                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1092
1093         def report_video_info_webpage_download(self, video_id):
1094                 """Report attempt to download video info webpage."""
1095                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1096
1097         def report_information_extraction(self, video_id):
1098                 """Report attempt to extract video information."""
1099                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1100
1101         def report_unavailable_format(self, video_id, format):
1102                 """Report extracted video URL."""
1103                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1104
1105         def report_rtmp_download(self):
1106                 """Indicate the download will use the RTMP protocol."""
1107                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1108
1109         def _real_initialize(self):
1110                 if self._downloader is None:
1111                         return
1112
1113                 username = None
1114                 password = None
1115                 downloader_params = self._downloader.params
1116
1117                 # Attempt to use provided username and password or .netrc data
1118                 if downloader_params.get('username', None) is not None:
1119                         username = downloader_params['username']
1120                         password = downloader_params['password']
1121                 elif downloader_params.get('usenetrc', False):
1122                         try:
1123                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1124                                 if info is not None:
1125                                         username = info[0]
1126                                         password = info[2]
1127                                 else:
1128                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1129                         except (IOError, netrc.NetrcParseError), err:
1130                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1131                                 return
1132
1133                 # Set language
1134                 request = urllib2.Request(self._LANG_URL)
1135                 try:
1136                         self.report_lang()
1137                         urllib2.urlopen(request).read()
1138                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1139                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1140                         return
1141
1142                 # No authentication to be performed
1143                 if username is None:
1144                         return
1145
1146                 # Log in
1147                 login_form = {
1148                                 'current_form': 'loginForm',
1149                                 'next':         '/',
1150                                 'action_login': 'Log In',
1151                                 'username':     username,
1152                                 'password':     password,
1153                                 }
1154                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1155                 try:
1156                         self.report_login()
1157                         login_results = urllib2.urlopen(request).read()
1158                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1159                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1160                                 return
1161                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1162                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1163                         return
1164
1165                 # Confirm age
1166                 age_form = {
1167                                 'next_url':             '/',
1168                                 'action_confirm':       'Confirm',
1169                                 }
1170                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1171                 try:
1172                         self.report_age_confirmation()
1173                         age_results = urllib2.urlopen(request).read()
1174                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1175                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1176                         return
1177
1178         def _real_extract(self, url):
1179                 # Extract video id from URL
1180                 mobj = re.match(self._VALID_URL, url)
1181                 if mobj is None:
1182                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1183                         return
1184                 video_id = mobj.group(2)
1185
1186                 # Get video webpage
1187                 self.report_video_webpage_download(video_id)
1188                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1189                 try:
1190                         video_webpage = urllib2.urlopen(request).read()
1191                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1192                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1193                         return
1194
1195                 # Attempt to extract SWF player URL
1196                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1197                 if mobj is not None:
1198                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1199                 else:
1200                         player_url = None
1201
1202                 # Get video info
1203                 self.report_video_info_webpage_download(video_id)
1204                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1205                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1206                                         % (video_id, el_type))
1207                         request = urllib2.Request(video_info_url)
1208                         try:
1209                                 video_info_webpage = urllib2.urlopen(request).read()
1210                                 video_info = parse_qs(video_info_webpage)
1211                                 if 'token' in video_info:
1212                                         break
1213                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1214                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1215                                 return
1216                 if 'token' not in video_info:
1217                         if 'reason' in video_info:
1218                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1219                         else:
1220                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1221                         return
1222
1223                 # Start extracting information
1224                 self.report_information_extraction(video_id)
1225
1226                 # uploader
1227                 if 'author' not in video_info:
1228                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1229                         return
1230                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1231
1232                 # title
1233                 if 'title' not in video_info:
1234                         self._downloader.trouble(u'ERROR: unable to extract video title')
1235                         return
1236                 video_title = urllib.unquote_plus(video_info['title'][0])
1237                 video_title = video_title.decode('utf-8')
1238                 video_title = sanitize_title(video_title)
1239
1240                 # simplified title
1241                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1242                 simple_title = simple_title.strip(ur'_')
1243
1244                 # thumbnail image
1245                 if 'thumbnail_url' not in video_info:
1246                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1247                         video_thumbnail = ''
1248                 else:   # don't panic if we can't find it
1249                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1250
1251                 # upload date
1252                 upload_date = u'NA'
1253                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1254                 if mobj is not None:
1255                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1256                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1257                         for expression in format_expressions:
1258                                 try:
1259                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1260                                 except:
1261                                         pass
1262
1263                 # description
1264                 try:
1265                         lxml.etree
1266                 except NameError:
1267                         video_description = u'No description available.'
1268                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1269                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1270                                 if mobj is not None:
1271                                         video_description = mobj.group(1).decode('utf-8')
1272                 else:
1273                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1274                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1275                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1276                         # TODO use another parser
1277
1278                 # token
1279                 video_token = urllib.unquote_plus(video_info['token'][0])
1280
1281                 # Decide which formats to download
1282                 req_format = self._downloader.params.get('format', None)
1283
1284                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1285                         self.report_rtmp_download()
1286                         video_url_list = [(None, video_info['conn'][0])]
1287                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1288                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1289                         url_data = [parse_qs(uds) for uds in url_data_strs]
1290                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1291                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1292
1293                         format_limit = self._downloader.params.get('format_limit', None)
1294                         if format_limit is not None and format_limit in self._available_formats:
1295                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1296                         else:
1297                                 format_list = self._available_formats
1298                         existing_formats = [x for x in format_list if x in url_map]
1299                         if len(existing_formats) == 0:
1300                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1301                                 return
1302                         if req_format is None:
1303                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1304                         elif req_format == '-1':
1305                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1306                         else:
1307                                 # Specific format
1308                                 if req_format not in url_map:
1309                                         self._downloader.trouble(u'ERROR: requested format not available')
1310                                         return
1311                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1312                 else:
1313                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1314                         return
1315
1316                 for format_param, video_real_url in video_url_list:
1317                         # At this point we have a new video
1318                         self._downloader.increment_downloads()
1319
1320                         # Extension
1321                         video_extension = self._video_extensions.get(format_param, 'flv')
1322
1323                         try:
1324                                 # Process video information
1325                                 self._downloader.process_info({
1326                                         'id':           video_id.decode('utf-8'),
1327                                         'url':          video_real_url.decode('utf-8'),
1328                                         'uploader':     video_uploader.decode('utf-8'),
1329                                         'upload_date':  upload_date,
1330                                         'title':        video_title,
1331                                         'stitle':       simple_title,
1332                                         'ext':          video_extension.decode('utf-8'),
1333                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1334                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1335                                         'description':  video_description,
1336                                         'player_url':   player_url,
1337                                 })
1338                         except UnavailableVideoError, err:
1339                                 self._downloader.trouble(u'\nERROR: unable to download video')
1340
1341
1342 class MetacafeIE(InfoExtractor):
1343         """Information Extractor for metacafe.com."""
1344
1345         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1346         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1347         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1348         _youtube_ie = None
1349
1350         def __init__(self, youtube_ie, downloader=None):
1351                 InfoExtractor.__init__(self, downloader)
1352                 self._youtube_ie = youtube_ie
1353
1354         @staticmethod
1355         def suitable(url):
1356                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1357
1358         def report_disclaimer(self):
1359                 """Report disclaimer retrieval."""
1360                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1361
1362         def report_age_confirmation(self):
1363                 """Report attempt to confirm age."""
1364                 self._downloader.to_screen(u'[metacafe] Confirming age')
1365
1366         def report_download_webpage(self, video_id):
1367                 """Report webpage download."""
1368                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1369
1370         def report_extraction(self, video_id):
1371                 """Report information extraction."""
1372                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1373
1374         def _real_initialize(self):
1375                 # Retrieve disclaimer
1376                 request = urllib2.Request(self._DISCLAIMER)
1377                 try:
1378                         self.report_disclaimer()
1379                         disclaimer = urllib2.urlopen(request).read()
1380                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1381                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1382                         return
1383
1384                 # Confirm age
1385                 disclaimer_form = {
1386                         'filters': '0',
1387                         'submit': "Continue - I'm over 18",
1388                         }
1389                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1390                 try:
1391                         self.report_age_confirmation()
1392                         disclaimer = urllib2.urlopen(request).read()
1393                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1394                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1395                         return
1396
1397         def _real_extract(self, url):
1398                 # Extract id and simplified title from URL
1399                 mobj = re.match(self._VALID_URL, url)
1400                 if mobj is None:
1401                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1402                         return
1403
1404                 video_id = mobj.group(1)
1405
1406                 # Check if video comes from YouTube
1407                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1408                 if mobj2 is not None:
1409                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1410                         return
1411
1412                 # At this point we have a new video
1413                 self._downloader.increment_downloads()
1414
1415                 simple_title = mobj.group(2).decode('utf-8')
1416
1417                 # Retrieve video webpage to extract further information
1418                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1419                 try:
1420                         self.report_download_webpage(video_id)
1421                         webpage = urllib2.urlopen(request).read()
1422                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1423                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1424                         return
1425
1426                 # Extract URL, uploader and title from webpage
1427                 self.report_extraction(video_id)
1428                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1429                 if mobj is not None:
1430                         mediaURL = urllib.unquote(mobj.group(1))
1431                         video_extension = mediaURL[-3:]
1432
1433                         # Extract gdaKey if available
1434                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1435                         if mobj is None:
1436                                 video_url = mediaURL
1437                         else:
1438                                 gdaKey = mobj.group(1)
1439                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1440                 else:
1441                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1442                         if mobj is None:
1443                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1444                                 return
1445                         vardict = parse_qs(mobj.group(1))
1446                         if 'mediaData' not in vardict:
1447                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1448                                 return
1449                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1450                         if mobj is None:
1451                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1452                                 return
1453                         mediaURL = mobj.group(1).replace('\\/', '/')
1454                         video_extension = mediaURL[-3:]
1455                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1456
1457                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1458                 if mobj is None:
1459                         self._downloader.trouble(u'ERROR: unable to extract title')
1460                         return
1461                 video_title = mobj.group(1).decode('utf-8')
1462                 video_title = sanitize_title(video_title)
1463
1464                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1465                 if mobj is None:
1466                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1467                         return
1468                 video_uploader = mobj.group(1)
1469
1470                 try:
1471                         # Process video information
1472                         self._downloader.process_info({
1473                                 'id':           video_id.decode('utf-8'),
1474                                 'url':          video_url.decode('utf-8'),
1475                                 'uploader':     video_uploader.decode('utf-8'),
1476                                 'upload_date':  u'NA',
1477                                 'title':        video_title,
1478                                 'stitle':       simple_title,
1479                                 'ext':          video_extension.decode('utf-8'),
1480                                 'format':       u'NA',
1481                                 'player_url':   None,
1482                         })
1483                 except UnavailableVideoError:
1484                         self._downloader.trouble(u'\nERROR: unable to download video')
1485
1486
1487 class DailymotionIE(InfoExtractor):
1488         """Information Extractor for Dailymotion"""
1489
1490         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1491
1492         def __init__(self, downloader=None):
1493                 InfoExtractor.__init__(self, downloader)
1494
1495         @staticmethod
1496         def suitable(url):
1497                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1498
1499         def report_download_webpage(self, video_id):
1500                 """Report webpage download."""
1501                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1502
1503         def report_extraction(self, video_id):
1504                 """Report information extraction."""
1505                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1506
1507         def _real_initialize(self):
1508                 return
1509
1510         def _real_extract(self, url):
1511                 # Extract id and simplified title from URL
1512                 mobj = re.match(self._VALID_URL, url)
1513                 if mobj is None:
1514                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1515                         return
1516
1517                 # At this point we have a new video
1518                 self._downloader.increment_downloads()
1519                 video_id = mobj.group(1)
1520
1521                 simple_title = mobj.group(2).decode('utf-8')
1522                 video_extension = 'flv'
1523
1524                 # Retrieve video webpage to extract further information
1525                 request = urllib2.Request(url)
1526                 request.add_header('Cookie', 'family_filter=off')
1527                 try:
1528                         self.report_download_webpage(video_id)
1529                         webpage = urllib2.urlopen(request).read()
1530                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1531                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1532                         return
1533
1534                 # Extract URL, uploader and title from webpage
1535                 self.report_extraction(video_id)
1536                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1537                 if mobj is None:
1538                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1539                         return
1540                 sequence = urllib.unquote(mobj.group(1))
1541                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1542                 if mobj is None:
1543                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1544                         return
1545                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1546
1547                 # if needed add http://www.dailymotion.com/ if relative URL
1548
1549                 video_url = mediaURL
1550
1551                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1552                 if mobj is None:
1553                         self._downloader.trouble(u'ERROR: unable to extract title')
1554                         return
1555                 video_title = mobj.group(1).decode('utf-8')
1556                 video_title = sanitize_title(video_title)
1557
1558                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1559                 if mobj is None:
1560                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1561                         return
1562                 video_uploader = mobj.group(1)
1563
1564                 try:
1565                         # Process video information
1566                         self._downloader.process_info({
1567                                 'id':           video_id.decode('utf-8'),
1568                                 'url':          video_url.decode('utf-8'),
1569                                 'uploader':     video_uploader.decode('utf-8'),
1570                                 'upload_date':  u'NA',
1571                                 'title':        video_title,
1572                                 'stitle':       simple_title,
1573                                 'ext':          video_extension.decode('utf-8'),
1574                                 'format':       u'NA',
1575                                 'player_url':   None,
1576                         })
1577                 except UnavailableVideoError:
1578                         self._downloader.trouble(u'\nERROR: unable to download video')
1579
1580
1581 class GoogleIE(InfoExtractor):
1582         """Information extractor for video.google.com."""
1583
1584         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1585
1586         def __init__(self, downloader=None):
1587                 InfoExtractor.__init__(self, downloader)
1588
1589         @staticmethod
1590         def suitable(url):
1591                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1592
1593         def report_download_webpage(self, video_id):
1594                 """Report webpage download."""
1595                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1596
1597         def report_extraction(self, video_id):
1598                 """Report information extraction."""
1599                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1600
1601         def _real_initialize(self):
1602                 return
1603
1604         def _real_extract(self, url):
1605                 # Extract id from URL
1606                 mobj = re.match(self._VALID_URL, url)
1607                 if mobj is None:
1608                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1609                         return
1610
1611                 # At this point we have a new video
1612                 self._downloader.increment_downloads()
1613                 video_id = mobj.group(1)
1614
1615                 video_extension = 'mp4'
1616
1617                 # Retrieve video webpage to extract further information
1618                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1619                 try:
1620                         self.report_download_webpage(video_id)
1621                         webpage = urllib2.urlopen(request).read()
1622                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1623                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1624                         return
1625
1626                 # Extract URL, uploader, and title from webpage
1627                 self.report_extraction(video_id)
1628                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1629                 if mobj is None:
1630                         video_extension = 'flv'
1631                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1632                 if mobj is None:
1633                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1634                         return
1635                 mediaURL = urllib.unquote(mobj.group(1))
1636                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1637                 mediaURL = mediaURL.replace('\\x26', '\x26')
1638
1639                 video_url = mediaURL
1640
1641                 mobj = re.search(r'<title>(.*)</title>', webpage)
1642                 if mobj is None:
1643                         self._downloader.trouble(u'ERROR: unable to extract title')
1644                         return
1645                 video_title = mobj.group(1).decode('utf-8')
1646                 video_title = sanitize_title(video_title)
1647                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1648
1649                 # Extract video description
1650                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1651                 if mobj is None:
1652                         self._downloader.trouble(u'ERROR: unable to extract video description')
1653                         return
1654                 video_description = mobj.group(1).decode('utf-8')
1655                 if not video_description:
1656                         video_description = 'No description available.'
1657
1658                 # Extract video thumbnail
1659                 if self._downloader.params.get('forcethumbnail', False):
1660                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1661                         try:
1662                                 webpage = urllib2.urlopen(request).read()
1663                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1664                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1665                                 return
1666                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1667                         if mobj is None:
1668                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1669                                 return
1670                         video_thumbnail = mobj.group(1)
1671                 else:   # we need something to pass to process_info
1672                         video_thumbnail = ''
1673
1674                 try:
1675                         # Process video information
1676                         self._downloader.process_info({
1677                                 'id':           video_id.decode('utf-8'),
1678                                 'url':          video_url.decode('utf-8'),
1679                                 'uploader':     u'NA',
1680                                 'upload_date':  u'NA',
1681                                 'title':        video_title,
1682                                 'stitle':       simple_title,
1683                                 'ext':          video_extension.decode('utf-8'),
1684                                 'format':       u'NA',
1685                                 'player_url':   None,
1686                         })
1687                 except UnavailableVideoError:
1688                         self._downloader.trouble(u'\nERROR: unable to download video')
1689
1690
1691 class PhotobucketIE(InfoExtractor):
1692         """Information extractor for photobucket.com."""
1693
1694         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1695
1696         def __init__(self, downloader=None):
1697                 InfoExtractor.__init__(self, downloader)
1698
1699         @staticmethod
1700         def suitable(url):
1701                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1702
1703         def report_download_webpage(self, video_id):
1704                 """Report webpage download."""
1705                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1706
1707         def report_extraction(self, video_id):
1708                 """Report information extraction."""
1709                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1710
1711         def _real_initialize(self):
1712                 return
1713
1714         def _real_extract(self, url):
1715                 # Extract id from URL
1716                 mobj = re.match(self._VALID_URL, url)
1717                 if mobj is None:
1718                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1719                         return
1720
1721                 # At this point we have a new video
1722                 self._downloader.increment_downloads()
1723                 video_id = mobj.group(1)
1724
1725                 video_extension = 'flv'
1726
1727                 # Retrieve video webpage to extract further information
1728                 request = urllib2.Request(url)
1729                 try:
1730                         self.report_download_webpage(video_id)
1731                         webpage = urllib2.urlopen(request).read()
1732                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1733                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1734                         return
1735
1736                 # Extract URL, uploader, and title from webpage
1737                 self.report_extraction(video_id)
1738                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1739                 if mobj is None:
1740                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1741                         return
1742                 mediaURL = urllib.unquote(mobj.group(1))
1743
1744                 video_url = mediaURL
1745
1746                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1747                 if mobj is None:
1748                         self._downloader.trouble(u'ERROR: unable to extract title')
1749                         return
1750                 video_title = mobj.group(1).decode('utf-8')
1751                 video_title = sanitize_title(video_title)
1752                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1753
1754                 video_uploader = mobj.group(2).decode('utf-8')
1755
1756                 try:
1757                         # Process video information
1758                         self._downloader.process_info({
1759                                 'id':           video_id.decode('utf-8'),
1760                                 'url':          video_url.decode('utf-8'),
1761                                 'uploader':     video_uploader,
1762                                 'upload_date':  u'NA',
1763                                 'title':        video_title,
1764                                 'stitle':       simple_title,
1765                                 'ext':          video_extension.decode('utf-8'),
1766                                 'format':       u'NA',
1767                                 'player_url':   None,
1768                         })
1769                 except UnavailableVideoError:
1770                         self._downloader.trouble(u'\nERROR: unable to download video')
1771
1772
1773 class YahooIE(InfoExtractor):
1774         """Information extractor for video.yahoo.com."""
1775
1776         # _VALID_URL matches all Yahoo! Video URLs
1777         # _VPAGE_URL matches only the extractable '/watch/' URLs
1778         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1779         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1780
1781         def __init__(self, downloader=None):
1782                 InfoExtractor.__init__(self, downloader)
1783
1784         @staticmethod
1785         def suitable(url):
1786                 return (re.match(YahooIE._VALID_URL, url) is not None)
1787
1788         def report_download_webpage(self, video_id):
1789                 """Report webpage download."""
1790                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1791
1792         def report_extraction(self, video_id):
1793                 """Report information extraction."""
1794                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1795
1796         def _real_initialize(self):
1797                 return
1798
1799         def _real_extract(self, url, new_video=True):
1800                 # Extract ID from URL
1801                 mobj = re.match(self._VALID_URL, url)
1802                 if mobj is None:
1803                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1804                         return
1805
1806                 # At this point we have a new video
1807                 self._downloader.increment_downloads()
1808                 video_id = mobj.group(2)
1809                 video_extension = 'flv'
1810
1811                 # Rewrite valid but non-extractable URLs as
1812                 # extractable English language /watch/ URLs
1813                 if re.match(self._VPAGE_URL, url) is None:
1814                         request = urllib2.Request(url)
1815                         try:
1816                                 webpage = urllib2.urlopen(request).read()
1817                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1818                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1819                                 return
1820
1821                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1822                         if mobj is None:
1823                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1824                                 return
1825                         yahoo_id = mobj.group(1)
1826
1827                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1828                         if mobj is None:
1829                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1830                                 return
1831                         yahoo_vid = mobj.group(1)
1832
1833                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1834                         return self._real_extract(url, new_video=False)
1835
1836                 # Retrieve video webpage to extract further information
1837                 request = urllib2.Request(url)
1838                 try:
1839                         self.report_download_webpage(video_id)
1840                         webpage = urllib2.urlopen(request).read()
1841                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1842                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1843                         return
1844
1845                 # Extract uploader and title from webpage
1846                 self.report_extraction(video_id)
1847                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1848                 if mobj is None:
1849                         self._downloader.trouble(u'ERROR: unable to extract video title')
1850                         return
1851                 video_title = mobj.group(1).decode('utf-8')
1852                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1853
1854                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1855                 if mobj is None:
1856                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1857                         return
1858                 video_uploader = mobj.group(1).decode('utf-8')
1859
1860                 # Extract video thumbnail
1861                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1862                 if mobj is None:
1863                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1864                         return
1865                 video_thumbnail = mobj.group(1).decode('utf-8')
1866
1867                 # Extract video description
1868                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1869                 if mobj is None:
1870                         self._downloader.trouble(u'ERROR: unable to extract video description')
1871                         return
1872                 video_description = mobj.group(1).decode('utf-8')
1873                 if not video_description:
1874                         video_description = 'No description available.'
1875
1876                 # Extract video height and width
1877                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1878                 if mobj is None:
1879                         self._downloader.trouble(u'ERROR: unable to extract video height')
1880                         return
1881                 yv_video_height = mobj.group(1)
1882
1883                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1884                 if mobj is None:
1885                         self._downloader.trouble(u'ERROR: unable to extract video width')
1886                         return
1887                 yv_video_width = mobj.group(1)
1888
1889                 # Retrieve video playlist to extract media URL
1890                 # I'm not completely sure what all these options are, but we
1891                 # seem to need most of them, otherwise the server sends a 401.
1892                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1893                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1894                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1895                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1896                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1897                 try:
1898                         self.report_download_webpage(video_id)
1899                         webpage = urllib2.urlopen(request).read()
1900                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1901                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1902                         return
1903
1904                 # Extract media URL from playlist XML
1905                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1906                 if mobj is None:
1907                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1908                         return
1909                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1910                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1911
1912                 try:
1913                         # Process video information
1914                         self._downloader.process_info({
1915                                 'id':           video_id.decode('utf-8'),
1916                                 'url':          video_url,
1917                                 'uploader':     video_uploader,
1918                                 'upload_date':  u'NA',
1919                                 'title':        video_title,
1920                                 'stitle':       simple_title,
1921                                 'ext':          video_extension.decode('utf-8'),
1922                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1923                                 'description':  video_description,
1924                                 'thumbnail':    video_thumbnail,
1925                                 'player_url':   None,
1926                         })
1927                 except UnavailableVideoError:
1928                         self._downloader.trouble(u'\nERROR: unable to download video')
1929
1930
1931 class VimeoIE(InfoExtractor):
1932         """Information extractor for vimeo.com."""
1933
1934         # _VALID_URL matches Vimeo URLs
1935         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1936
1937         def __init__(self, downloader=None):
1938                 InfoExtractor.__init__(self, downloader)
1939
1940         @staticmethod
1941         def suitable(url):
1942                 return (re.match(VimeoIE._VALID_URL, url) is not None)
1943
1944         def report_download_webpage(self, video_id):
1945                 """Report webpage download."""
1946                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1947
1948         def report_extraction(self, video_id):
1949                 """Report information extraction."""
1950                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1951
1952         def _real_initialize(self):
1953                 return
1954
1955         def _real_extract(self, url, new_video=True):
1956                 # Extract ID from URL
1957                 mobj = re.match(self._VALID_URL, url)
1958                 if mobj is None:
1959                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1960                         return
1961
1962                 # At this point we have a new video
1963                 self._downloader.increment_downloads()
1964                 video_id = mobj.group(1)
1965
1966                 # Retrieve video webpage to extract further information
1967                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1968                 try:
1969                         self.report_download_webpage(video_id)
1970                         webpage = urllib2.urlopen(request).read()
1971                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1972                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1973                         return
1974
1975                 # Now we begin extracting as much information as we can from what we
1976                 # retrieved. First we extract the information common to all extractors,
1977                 # and latter we extract those that are Vimeo specific.
1978                 self.report_extraction(video_id)
1979
1980                 # Extract title
1981                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1982                 if mobj is None:
1983                         self._downloader.trouble(u'ERROR: unable to extract video title')
1984                         return
1985                 video_title = mobj.group(1).decode('utf-8')
1986                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1987
1988                 # Extract uploader
1989                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1990                 if mobj is None:
1991                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1992                         return
1993                 video_uploader = mobj.group(1).decode('utf-8')
1994
1995                 # Extract video thumbnail
1996                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1997                 if mobj is None:
1998                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1999                         return
2000                 video_thumbnail = mobj.group(1).decode('utf-8')
2001
2002                 # # Extract video description
2003                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2004                 # if mobj is None:
2005                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2006                 #       return
2007                 # video_description = mobj.group(1).decode('utf-8')
2008                 # if not video_description: video_description = 'No description available.'
2009                 video_description = 'Foo.'
2010
2011                 # Vimeo specific: extract request signature
2012                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2013                 if mobj is None:
2014                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2015                         return
2016                 sig = mobj.group(1).decode('utf-8')
2017
2018                 # Vimeo specific: Extract request signature expiration
2019                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2020                 if mobj is None:
2021                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2022                         return
2023                 sig_exp = mobj.group(1).decode('utf-8')
2024
2025                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2026
2027                 try:
2028                         # Process video information
2029                         self._downloader.process_info({
2030                                 'id':           video_id.decode('utf-8'),
2031                                 'url':          video_url,
2032                                 'uploader':     video_uploader,
2033                                 'upload_date':  u'NA',
2034                                 'title':        video_title,
2035                                 'stitle':       simple_title,
2036                                 'ext':          u'mp4',
2037                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2038                                 'description':  video_description,
2039                                 'thumbnail':    video_thumbnail,
2040                                 'description':  video_description,
2041                                 'player_url':   None,
2042                         })
2043                 except UnavailableVideoError:
2044                         self._downloader.trouble(u'ERROR: unable to download video')
2045
2046
2047 class GenericIE(InfoExtractor):
2048         """Generic last-resort information extractor."""
2049
2050         def __init__(self, downloader=None):
2051                 InfoExtractor.__init__(self, downloader)
2052
2053         @staticmethod
2054         def suitable(url):
2055                 return True
2056
2057         def report_download_webpage(self, video_id):
2058                 """Report webpage download."""
2059                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2060                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2061
2062         def report_extraction(self, video_id):
2063                 """Report information extraction."""
2064                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2065
2066         def _real_initialize(self):
2067                 return
2068
2069         def _real_extract(self, url):
2070                 # At this point we have a new video
2071                 self._downloader.increment_downloads()
2072
2073                 video_id = url.split('/')[-1]
2074                 request = urllib2.Request(url)
2075                 try:
2076                         self.report_download_webpage(video_id)
2077                         webpage = urllib2.urlopen(request).read()
2078                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2079                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2080                         return
2081                 except ValueError, err:
2082                         # since this is the last-resort InfoExtractor, if
2083                         # this error is thrown, it'll be thrown here
2084                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2085                         return
2086
2087                 self.report_extraction(video_id)
2088                 # Start with something easy: JW Player in SWFObject
2089                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2090                 if mobj is None:
2091                         # Broaden the search a little bit
2092                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2093                 if mobj is None:
2094                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2095                         return
2096
2097                 # It's possible that one of the regexes
2098                 # matched, but returned an empty group:
2099                 if mobj.group(1) is None:
2100                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2101                         return
2102
2103                 video_url = urllib.unquote(mobj.group(1))
2104                 video_id = os.path.basename(video_url)
2105
2106                 # here's a fun little line of code for you:
2107                 video_extension = os.path.splitext(video_id)[1][1:]
2108                 video_id = os.path.splitext(video_id)[0]
2109
2110                 # it's tempting to parse this further, but you would
2111                 # have to take into account all the variations like
2112                 #   Video Title - Site Name
2113                 #   Site Name | Video Title
2114                 #   Video Title - Tagline | Site Name
2115                 # and so on and so forth; it's just not practical
2116                 mobj = re.search(r'<title>(.*)</title>', webpage)
2117                 if mobj is None:
2118                         self._downloader.trouble(u'ERROR: unable to extract title')
2119                         return
2120                 video_title = mobj.group(1).decode('utf-8')
2121                 video_title = sanitize_title(video_title)
2122                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2123
2124                 # video uploader is domain name
2125                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2126                 if mobj is None:
2127                         self._downloader.trouble(u'ERROR: unable to extract title')
2128                         return
2129                 video_uploader = mobj.group(1).decode('utf-8')
2130
2131                 try:
2132                         # Process video information
2133                         self._downloader.process_info({
2134                                 'id':           video_id.decode('utf-8'),
2135                                 'url':          video_url.decode('utf-8'),
2136                                 'uploader':     video_uploader,
2137                                 'upload_date':  u'NA',
2138                                 'title':        video_title,
2139                                 'stitle':       simple_title,
2140                                 'ext':          video_extension.decode('utf-8'),
2141                                 'format':       u'NA',
2142                                 'player_url':   None,
2143                         })
2144                 except UnavailableVideoError, err:
2145                         self._downloader.trouble(u'\nERROR: unable to download video')
2146
2147
2148 class YoutubeSearchIE(InfoExtractor):
2149         """Information Extractor for YouTube search queries."""
2150         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2151         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2152         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2153         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2154         _youtube_ie = None
2155         _max_youtube_results = 1000
2156
2157         def __init__(self, youtube_ie, downloader=None):
2158                 InfoExtractor.__init__(self, downloader)
2159                 self._youtube_ie = youtube_ie
2160
2161         @staticmethod
2162         def suitable(url):
2163                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2164
2165         def report_download_page(self, query, pagenum):
2166                 """Report attempt to download playlist page with given number."""
2167                 query = query.decode(preferredencoding())
2168                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2169
2170         def _real_initialize(self):
2171                 self._youtube_ie.initialize()
2172
2173         def _real_extract(self, query):
2174                 mobj = re.match(self._VALID_QUERY, query)
2175                 if mobj is None:
2176                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2177                         return
2178
2179                 prefix, query = query.split(':')
2180                 prefix = prefix[8:]
2181                 query = query.encode('utf-8')
2182                 if prefix == '':
2183                         self._download_n_results(query, 1)
2184                         return
2185                 elif prefix == 'all':
2186                         self._download_n_results(query, self._max_youtube_results)
2187                         return
2188                 else:
2189                         try:
2190                                 n = long(prefix)
2191                                 if n <= 0:
2192                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2193                                         return
2194                                 elif n > self._max_youtube_results:
2195                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2196                                         n = self._max_youtube_results
2197                                 self._download_n_results(query, n)
2198                                 return
2199                         except ValueError: # parsing prefix as integer fails
2200                                 self._download_n_results(query, 1)
2201                                 return
2202
2203         def _download_n_results(self, query, n):
2204                 """Downloads a specified number of results for a query"""
2205
2206                 video_ids = []
2207                 already_seen = set()
2208                 pagenum = 1
2209
2210                 while True:
2211                         self.report_download_page(query, pagenum)
2212                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2213                         request = urllib2.Request(result_url)
2214                         try:
2215                                 page = urllib2.urlopen(request).read()
2216                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2217                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2218                                 return
2219
2220                         # Extract video identifiers
2221                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2222                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2223                                 if video_id not in already_seen:
2224                                         video_ids.append(video_id)
2225                                         already_seen.add(video_id)
2226                                         if len(video_ids) == n:
2227                                                 # Specified n videos reached
2228                                                 for id in video_ids:
2229                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2230                                                 return
2231
2232                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2233                                 for id in video_ids:
2234                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2235                                 return
2236
2237                         pagenum = pagenum + 1
2238
2239
2240 class GoogleSearchIE(InfoExtractor):
2241         """Information Extractor for Google Video search queries."""
2242         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2243         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2244         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2245         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2246         _google_ie = None
2247         _max_google_results = 1000
2248
2249         def __init__(self, google_ie, downloader=None):
2250                 InfoExtractor.__init__(self, downloader)
2251                 self._google_ie = google_ie
2252
2253         @staticmethod
2254         def suitable(url):
2255                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2256
2257         def report_download_page(self, query, pagenum):
2258                 """Report attempt to download playlist page with given number."""
2259                 query = query.decode(preferredencoding())
2260                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2261
2262         def _real_initialize(self):
2263                 self._google_ie.initialize()
2264
2265         def _real_extract(self, query):
2266                 mobj = re.match(self._VALID_QUERY, query)
2267                 if mobj is None:
2268                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2269                         return
2270
2271                 prefix, query = query.split(':')
2272                 prefix = prefix[8:]
2273                 query = query.encode('utf-8')
2274                 if prefix == '':
2275                         self._download_n_results(query, 1)
2276                         return
2277                 elif prefix == 'all':
2278                         self._download_n_results(query, self._max_google_results)
2279                         return
2280                 else:
2281                         try:
2282                                 n = long(prefix)
2283                                 if n <= 0:
2284                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2285                                         return
2286                                 elif n > self._max_google_results:
2287                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2288                                         n = self._max_google_results
2289                                 self._download_n_results(query, n)
2290                                 return
2291                         except ValueError: # parsing prefix as integer fails
2292                                 self._download_n_results(query, 1)
2293                                 return
2294
2295         def _download_n_results(self, query, n):
2296                 """Downloads a specified number of results for a query"""
2297
2298                 video_ids = []
2299                 already_seen = set()
2300                 pagenum = 1
2301
2302                 while True:
2303                         self.report_download_page(query, pagenum)
2304                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2305                         request = urllib2.Request(result_url)
2306                         try:
2307                                 page = urllib2.urlopen(request).read()
2308                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2309                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2310                                 return
2311
2312                         # Extract video identifiers
2313                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2314                                 video_id = mobj.group(1)
2315                                 if video_id not in already_seen:
2316                                         video_ids.append(video_id)
2317                                         already_seen.add(video_id)
2318                                         if len(video_ids) == n:
2319                                                 # Specified n videos reached
2320                                                 for id in video_ids:
2321                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2322                                                 return
2323
2324                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2325                                 for id in video_ids:
2326                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2327                                 return
2328
2329                         pagenum = pagenum + 1
2330
2331
2332 class YahooSearchIE(InfoExtractor):
2333         """Information Extractor for Yahoo! Video search queries."""
2334         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2335         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2336         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2337         _MORE_PAGES_INDICATOR = r'\s*Next'
2338         _yahoo_ie = None
2339         _max_yahoo_results = 1000
2340
2341         def __init__(self, yahoo_ie, downloader=None):
2342                 InfoExtractor.__init__(self, downloader)
2343                 self._yahoo_ie = yahoo_ie
2344
2345         @staticmethod
2346         def suitable(url):
2347                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2348
2349         def report_download_page(self, query, pagenum):
2350                 """Report attempt to download playlist page with given number."""
2351                 query = query.decode(preferredencoding())
2352                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2353
2354         def _real_initialize(self):
2355                 self._yahoo_ie.initialize()
2356
2357         def _real_extract(self, query):
2358                 mobj = re.match(self._VALID_QUERY, query)
2359                 if mobj is None:
2360                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2361                         return
2362
2363                 prefix, query = query.split(':')
2364                 prefix = prefix[8:]
2365                 query = query.encode('utf-8')
2366                 if prefix == '':
2367                         self._download_n_results(query, 1)
2368                         return
2369                 elif prefix == 'all':
2370                         self._download_n_results(query, self._max_yahoo_results)
2371                         return
2372                 else:
2373                         try:
2374                                 n = long(prefix)
2375                                 if n <= 0:
2376                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2377                                         return
2378                                 elif n > self._max_yahoo_results:
2379                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2380                                         n = self._max_yahoo_results
2381                                 self._download_n_results(query, n)
2382                                 return
2383                         except ValueError: # parsing prefix as integer fails
2384                                 self._download_n_results(query, 1)
2385                                 return
2386
2387         def _download_n_results(self, query, n):
2388                 """Downloads a specified number of results for a query"""
2389
2390                 video_ids = []
2391                 already_seen = set()
2392                 pagenum = 1
2393
2394                 while True:
2395                         self.report_download_page(query, pagenum)
2396                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2397                         request = urllib2.Request(result_url)
2398                         try:
2399                                 page = urllib2.urlopen(request).read()
2400                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2401                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2402                                 return
2403
2404                         # Extract video identifiers
2405                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2406                                 video_id = mobj.group(1)
2407                                 if video_id not in already_seen:
2408                                         video_ids.append(video_id)
2409                                         already_seen.add(video_id)
2410                                         if len(video_ids) == n:
2411                                                 # Specified n videos reached
2412                                                 for id in video_ids:
2413                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2414                                                 return
2415
2416                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2417                                 for id in video_ids:
2418                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2419                                 return
2420
2421                         pagenum = pagenum + 1
2422
2423
2424 class YoutubePlaylistIE(InfoExtractor):
2425         """Information Extractor for YouTube playlists."""
2426
2427         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2428         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2429         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2430         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2431         _youtube_ie = None
2432
2433         def __init__(self, youtube_ie, downloader=None):
2434                 InfoExtractor.__init__(self, downloader)
2435                 self._youtube_ie = youtube_ie
2436
2437         @staticmethod
2438         def suitable(url):
2439                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2440
2441         def report_download_page(self, playlist_id, pagenum):
2442                 """Report attempt to download playlist page with given number."""
2443                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2444
2445         def _real_initialize(self):
2446                 self._youtube_ie.initialize()
2447
2448         def _real_extract(self, url):
2449                 # Extract playlist id
2450                 mobj = re.match(self._VALID_URL, url)
2451                 if mobj is None:
2452                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2453                         return
2454
2455                 # Single video case
2456                 if mobj.group(3) is not None:
2457                         self._youtube_ie.extract(mobj.group(3))
2458                         return
2459
2460                 # Download playlist pages
2461                 # prefix is 'p' as default for playlists but there are other types that need extra care
2462                 playlist_prefix = mobj.group(1)
2463                 if playlist_prefix == 'a':
2464                         playlist_access = 'artist'
2465                 else:
2466                         playlist_prefix = 'p'
2467                         playlist_access = 'view_play_list'
2468                 playlist_id = mobj.group(2)
2469                 video_ids = []
2470                 pagenum = 1
2471
2472                 while True:
2473                         self.report_download_page(playlist_id, pagenum)
2474                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2475                         try:
2476                                 page = urllib2.urlopen(request).read()
2477                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2478                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2479                                 return
2480
2481                         # Extract video identifiers
2482                         ids_in_page = []
2483                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2484                                 if mobj.group(1) not in ids_in_page:
2485                                         ids_in_page.append(mobj.group(1))
2486                         video_ids.extend(ids_in_page)
2487
2488                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2489                                 break
2490                         pagenum = pagenum + 1
2491
2492                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2493                 playlistend = self._downloader.params.get('playlistend', -1)
2494                 video_ids = video_ids[playliststart:playlistend]
2495
2496                 for id in video_ids:
2497                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2498                 return
2499
2500
2501 class YoutubeUserIE(InfoExtractor):
2502         """Information Extractor for YouTube users."""
2503
2504         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2505         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2506         _GDATA_PAGE_SIZE = 50
2507         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2508         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2509         _youtube_ie = None
2510
2511         def __init__(self, youtube_ie, downloader=None):
2512                 InfoExtractor.__init__(self, downloader)
2513                 self._youtube_ie = youtube_ie
2514
2515         @staticmethod
2516         def suitable(url):
2517                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2518
2519         def report_download_page(self, username, start_index):
2520                 """Report attempt to download user page."""
2521                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2522                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2523
2524         def _real_initialize(self):
2525                 self._youtube_ie.initialize()
2526
2527         def _real_extract(self, url):
2528                 # Extract username
2529                 mobj = re.match(self._VALID_URL, url)
2530                 if mobj is None:
2531                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2532                         return
2533
2534                 username = mobj.group(1)
2535
2536                 # Download video ids using YouTube Data API. Result size per
2537                 # query is limited (currently to 50 videos) so we need to query
2538                 # page by page until there are no video ids - it means we got
2539                 # all of them.
2540
2541                 video_ids = []
2542                 pagenum = 0
2543
2544                 while True:
2545                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2546                         self.report_download_page(username, start_index)
2547
2548                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2549
2550                         try:
2551                                 page = urllib2.urlopen(request).read()
2552                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2553                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2554                                 return
2555
2556                         # Extract video identifiers
2557                         ids_in_page = []
2558
2559                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2560                                 if mobj.group(1) not in ids_in_page:
2561                                         ids_in_page.append(mobj.group(1))
2562
2563                         video_ids.extend(ids_in_page)
2564
2565                         # A little optimization - if current page is not
2566                         # "full", ie. does not contain PAGE_SIZE video ids then
2567                         # we can assume that this page is the last one - there
2568                         # are no more ids on further pages - no need to query
2569                         # again.
2570
2571                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2572                                 break
2573
2574                         pagenum += 1
2575
2576                 all_ids_count = len(video_ids)
2577                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2578                 playlistend = self._downloader.params.get('playlistend', -1)
2579
2580                 if playlistend == -1:
2581                         video_ids = video_ids[playliststart:]
2582                 else:
2583                         video_ids = video_ids[playliststart:playlistend]
2584
2585                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2586                                 (username, all_ids_count, len(video_ids)))
2587
2588                 for video_id in video_ids:
2589                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2590
2591
2592 class DepositFilesIE(InfoExtractor):
2593         """Information extractor for depositfiles.com"""
2594
2595         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2596
2597         def __init__(self, downloader=None):
2598                 InfoExtractor.__init__(self, downloader)
2599
2600         @staticmethod
2601         def suitable(url):
2602                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2603
2604         def report_download_webpage(self, file_id):
2605                 """Report webpage download."""
2606                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2607
2608         def report_extraction(self, file_id):
2609                 """Report information extraction."""
2610                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2611
2612         def _real_initialize(self):
2613                 return
2614
2615         def _real_extract(self, url):
2616                 # At this point we have a new file
2617                 self._downloader.increment_downloads()
2618
2619                 file_id = url.split('/')[-1]
2620                 # Rebuild url in english locale
2621                 url = 'http://depositfiles.com/en/files/' + file_id
2622
2623                 # Retrieve file webpage with 'Free download' button pressed
2624                 free_download_indication = { 'gateway_result' : '1' }
2625                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2626                 try:
2627                         self.report_download_webpage(file_id)
2628                         webpage = urllib2.urlopen(request).read()
2629                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2630                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2631                         return
2632
2633                 # Search for the real file URL
2634                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2635                 if (mobj is None) or (mobj.group(1) is None):
2636                         # Try to figure out reason of the error.
2637                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2638                         if (mobj is not None) and (mobj.group(1) is not None):
2639                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2640                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2641                         else:
2642                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2643                         return
2644
2645                 file_url = mobj.group(1)
2646                 file_extension = os.path.splitext(file_url)[1][1:]
2647
2648                 # Search for file title
2649                 mobj = re.search(r'<b title="(.*?)">', webpage)
2650                 if mobj is None:
2651                         self._downloader.trouble(u'ERROR: unable to extract title')
2652                         return
2653                 file_title = mobj.group(1).decode('utf-8')
2654
2655                 try:
2656                         # Process file information
2657                         self._downloader.process_info({
2658                                 'id':           file_id.decode('utf-8'),
2659                                 'url':          file_url.decode('utf-8'),
2660                                 'uploader':     u'NA',
2661                                 'upload_date':  u'NA',
2662                                 'title':        file_title,
2663                                 'stitle':       file_title,
2664                                 'ext':          file_extension.decode('utf-8'),
2665                                 'format':       u'NA',
2666                                 'player_url':   None,
2667                         })
2668                 except UnavailableVideoError, err:
2669                         self._downloader.trouble(u'ERROR: unable to download file')
2670
2671
2672 class FacebookIE(InfoExtractor):
2673         """Information Extractor for Facebook"""
2674
2675         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2676         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2677         _NETRC_MACHINE = 'facebook'
2678         _available_formats = ['highqual', 'lowqual']
2679         _video_extensions = {
2680                 'highqual': 'mp4',
2681                 'lowqual': 'mp4',
2682         }
2683
2684         def __init__(self, downloader=None):
2685                 InfoExtractor.__init__(self, downloader)
2686
2687         @staticmethod
2688         def suitable(url):
2689                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2690
2691         def _reporter(self, message):
2692                 """Add header and report message."""
2693                 self._downloader.to_screen(u'[facebook] %s' % message)
2694
2695         def report_login(self):
2696                 """Report attempt to log in."""
2697                 self._reporter(u'Logging in')
2698
2699         def report_video_webpage_download(self, video_id):
2700                 """Report attempt to download video webpage."""
2701                 self._reporter(u'%s: Downloading video webpage' % video_id)
2702
2703         def report_information_extraction(self, video_id):
2704                 """Report attempt to extract video information."""
2705                 self._reporter(u'%s: Extracting video information' % video_id)
2706
2707         def _parse_page(self, video_webpage):
2708                 """Extract video information from page"""
2709                 # General data
2710                 data = {'title': r'class="video_title datawrap">(.*?)</',
2711                         'description': r'<div class="datawrap">(.*?)</div>',
2712                         'owner': r'\("video_owner_name", "(.*?)"\)',
2713                         'upload_date': r'data-date="(.*?)"',
2714                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2715                         }
2716                 video_info = {}
2717                 for piece in data.keys():
2718                         mobj = re.search(data[piece], video_webpage)
2719                         if mobj is not None:
2720                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2721
2722                 # Video urls
2723                 video_urls = {}
2724                 for fmt in self._available_formats:
2725                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2726                         if mobj is not None:
2727                                 # URL is in a Javascript segment inside an escaped Unicode format within
2728                                 # the generally utf-8 page
2729                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2730                 video_info['video_urls'] = video_urls
2731
2732                 return video_info
2733
2734         def _real_initialize(self):
2735                 if self._downloader is None:
2736                         return
2737
2738                 useremail = None
2739                 password = None
2740                 downloader_params = self._downloader.params
2741
2742                 # Attempt to use provided username and password or .netrc data
2743                 if downloader_params.get('username', None) is not None:
2744                         useremail = downloader_params['username']
2745                         password = downloader_params['password']
2746                 elif downloader_params.get('usenetrc', False):
2747                         try:
2748                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2749                                 if info is not None:
2750                                         useremail = info[0]
2751                                         password = info[2]
2752                                 else:
2753                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2754                         except (IOError, netrc.NetrcParseError), err:
2755                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2756                                 return
2757
2758                 if useremail is None:
2759                         return
2760
2761                 # Log in
2762                 login_form = {
2763                         'email': useremail,
2764                         'pass': password,
2765                         'login': 'Log+In'
2766                         }
2767                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2768                 try:
2769                         self.report_login()
2770                         login_results = urllib2.urlopen(request).read()
2771                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2772                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2773                                 return
2774                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2775                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2776                         return
2777
2778         def _real_extract(self, url):
2779                 mobj = re.match(self._VALID_URL, url)
2780                 if mobj is None:
2781                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2782                         return
2783                 video_id = mobj.group('ID')
2784
2785                 # Get video webpage
2786                 self.report_video_webpage_download(video_id)
2787                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2788                 try:
2789                         page = urllib2.urlopen(request)
2790                         video_webpage = page.read()
2791                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2792                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2793                         return
2794
2795                 # Start extracting information
2796                 self.report_information_extraction(video_id)
2797
2798                 # Extract information
2799                 video_info = self._parse_page(video_webpage)
2800
2801                 # uploader
2802                 if 'owner' not in video_info:
2803                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2804                         return
2805                 video_uploader = video_info['owner']
2806
2807                 # title
2808                 if 'title' not in video_info:
2809                         self._downloader.trouble(u'ERROR: unable to extract video title')
2810                         return
2811                 video_title = video_info['title']
2812                 video_title = video_title.decode('utf-8')
2813                 video_title = sanitize_title(video_title)
2814
2815                 # simplified title
2816                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2817                 simple_title = simple_title.strip(ur'_')
2818
2819                 # thumbnail image
2820                 if 'thumbnail' not in video_info:
2821                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2822                         video_thumbnail = ''
2823                 else:
2824                         video_thumbnail = video_info['thumbnail']
2825
2826                 # upload date
2827                 upload_date = u'NA'
2828                 if 'upload_date' in video_info:
2829                         upload_time = video_info['upload_date']
2830                         timetuple = email.utils.parsedate_tz(upload_time)
2831                         if timetuple is not None:
2832                                 try:
2833                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2834                                 except:
2835                                         pass
2836
2837                 # description
2838                 video_description = video_info.get('description', 'No description available.')
2839
2840                 url_map = video_info['video_urls']
2841                 if len(url_map.keys()) > 0:
2842                         # Decide which formats to download
2843                         req_format = self._downloader.params.get('format', None)
2844                         format_limit = self._downloader.params.get('format_limit', None)
2845
2846                         if format_limit is not None and format_limit in self._available_formats:
2847                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2848                         else:
2849                                 format_list = self._available_formats
2850                         existing_formats = [x for x in format_list if x in url_map]
2851                         if len(existing_formats) == 0:
2852                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2853                                 return
2854                         if req_format is None:
2855                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2856                         elif req_format == '-1':
2857                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2858                         else:
2859                                 # Specific format
2860                                 if req_format not in url_map:
2861                                         self._downloader.trouble(u'ERROR: requested format not available')
2862                                         return
2863                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2864
2865                 for format_param, video_real_url in video_url_list:
2866
2867                         # At this point we have a new video
2868                         self._downloader.increment_downloads()
2869
2870                         # Extension
2871                         video_extension = self._video_extensions.get(format_param, 'mp4')
2872
2873                         try:
2874                                 # Process video information
2875                                 self._downloader.process_info({
2876                                         'id':           video_id.decode('utf-8'),
2877                                         'url':          video_real_url.decode('utf-8'),
2878                                         'uploader':     video_uploader.decode('utf-8'),
2879                                         'upload_date':  upload_date,
2880                                         'title':        video_title,
2881                                         'stitle':       simple_title,
2882                                         'ext':          video_extension.decode('utf-8'),
2883                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2884                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2885                                         'description':  video_description.decode('utf-8'),
2886                                         'player_url':   None,
2887                                 })
2888                         except UnavailableVideoError, err:
2889                                 self._downloader.trouble(u'\nERROR: unable to download video')
2890
2891 class BlipTVIE(InfoExtractor):
2892         """Information extractor for blip.tv"""
2893
2894         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2895         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2896
2897         @staticmethod
2898         def suitable(url):
2899                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2900
2901         def report_extraction(self, file_id):
2902                 """Report information extraction."""
2903                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2904
2905         def _simplify_title(self, title):
2906                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2907                 res = res.strip(ur'_')
2908                 return res
2909
2910         def _real_extract(self, url):
2911                 mobj = re.match(self._VALID_URL, url)
2912                 if mobj is None:
2913                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2914                         return
2915
2916                 if '?' in url:
2917                         cchar = '&'
2918                 else:
2919                         cchar = '?'
2920                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2921                 request = urllib2.Request(json_url)
2922                 self.report_extraction(mobj.group(1))
2923                 try:
2924                         json_code = urllib2.urlopen(request).read()
2925                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2926                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2927                         return
2928                 try:
2929                         json_data = json.loads(json_code)
2930                         if 'Post' in json_data:
2931                                 data = json_data['Post']
2932                         else:
2933                                 data = json_data
2934
2935                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2936                         video_url = data['media']['url']
2937                         umobj = re.match(self._URL_EXT, video_url)
2938                         if umobj is None:
2939                                 raise ValueError('Can not determine filename extension')
2940                         ext = umobj.group(1)
2941
2942                         self._downloader.increment_downloads()
2943
2944                         info = {
2945                                 'id': data['item_id'],
2946                                 'url': video_url,
2947                                 'uploader': data['display_name'],
2948                                 'upload_date': upload_date,
2949                                 'title': data['title'],
2950                                 'stitle': self._simplify_title(data['title']),
2951                                 'ext': ext,
2952                                 'format': data['media']['mimeType'],
2953                                 'thumbnail': data['thumbnailUrl'],
2954                                 'description': data['description'],
2955                                 'player_url': data['embedUrl']
2956                         }
2957                 except (ValueError,KeyError), err:
2958                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2959                         return
2960
2961                 try:
2962                         self._downloader.process_info(info)
2963                 except UnavailableVideoError, err:
2964                         self._downloader.trouble(u'\nERROR: unable to download video')
2965
2966
2967 class MyVideoIE(InfoExtractor):
2968         """Information Extractor for myvideo.de."""
2969
2970         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2971
2972         def __init__(self, downloader=None):
2973                 InfoExtractor.__init__(self, downloader)
2974
2975         @staticmethod
2976         def suitable(url):
2977                 return (re.match(MyVideoIE._VALID_URL, url) is not None)
2978
2979         def report_download_webpage(self, video_id):
2980                 """Report webpage download."""
2981                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2982
2983         def report_extraction(self, video_id):
2984                 """Report information extraction."""
2985                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2986
2987         def _real_initialize(self):
2988                 return
2989
2990         def _real_extract(self,url):
2991                 mobj = re.match(self._VALID_URL, url)
2992                 if mobj is None:
2993                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2994                         return
2995
2996                 video_id = mobj.group(1)
2997                 simple_title = mobj.group(2).decode('utf-8')
2998                 # should actually not be necessary
2999                 simple_title = sanitize_title(simple_title)
3000                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3001
3002                 # Get video webpage
3003                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3004                 try:
3005                         self.report_download_webpage(video_id)
3006                         webpage = urllib2.urlopen(request).read()
3007                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3008                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3009                         return
3010
3011                 self.report_extraction(video_id)
3012                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3013                                  webpage)
3014                 if mobj is None:
3015                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3016                         return
3017                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3018
3019                 mobj = re.search('<title>([^<]+)</title>', webpage)
3020                 if mobj is None:
3021                         self._downloader.trouble(u'ERROR: unable to extract title')
3022                         return
3023
3024                 video_title = mobj.group(1)
3025                 video_title = sanitize_title(video_title)
3026
3027                 try:
3028                         print(video_url)
3029                         self._downloader.process_info({
3030                                 'id':           video_id,
3031                                 'url':          video_url,
3032                                 'uploader':     u'NA',
3033                                 'upload_date':  u'NA',
3034                                 'title':        video_title,
3035                                 'stitle':       simple_title,
3036                                 'ext':          u'flv',
3037                                 'format':       u'NA',
3038                                 'player_url':   None,
3039                         })
3040                 except UnavailableVideoError:
3041                         self._downloader.trouble(u'\nERROR: Unable to download video')
3042
3043 class ComedyCentralIE(InfoExtractor):
3044         """Information extractor for The Daily Show and Colbert Report """
3045
3046         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3047
3048         @staticmethod
3049         def suitable(url):
3050                 return (re.match(ComedyCentralIE._VALID_URL, url) is not None)
3051
3052         def report_extraction(self, episode_id):
3053                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3054
3055         def report_config_download(self, episode_id):
3056                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3057
3058         def report_player_url(self, episode_id):
3059                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3060
3061         def _simplify_title(self, title):
3062                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3063                 res = res.strip(ur'_')
3064                 return res
3065
3066         def _real_extract(self, url):
3067                 mobj = re.match(self._VALID_URL, url)
3068                 if mobj is None:
3069                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3070                         return
3071
3072                 if mobj.group('shortname'):
3073                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3074                                 url = 'http://www.thedailyshow.com/full-episodes/'
3075                         else:
3076                                 url = 'http://www.colbertnation.com/full-episodes/'
3077                         mobj = re.match(self._VALID_URL, url)
3078                         assert mobj is not None
3079
3080                 dlNewest = not mobj.group('episode')
3081                 if dlNewest:
3082                         epTitle = mobj.group('showname')
3083                 else:
3084                         epTitle = mobj.group('episode')
3085
3086                 req = urllib2.Request(url)
3087                 self.report_extraction(epTitle)
3088                 try:
3089                         htmlHandle = urllib2.urlopen(req)
3090                         html = htmlHandle.read()
3091                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3092                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3093                         return
3094                 if dlNewest:
3095                         url = htmlHandle.geturl()
3096                         mobj = re.match(self._VALID_URL, url)
3097                         if mobj is None:
3098                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3099                                 return
3100                         if mobj.group('episode') == '':
3101                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3102                                 return
3103                         epTitle = mobj.group('episode')
3104
3105                 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/(.*?:episode:([^:]*):)(.*?))"/>', html)
3106                 if len(mMovieParams) == 0:
3107                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3108                         return
3109                 show_id = mMovieParams[0][2]
3110                 ACT_COUNT = { # TODO: Detect this dynamically
3111                         'thedailyshow.com': 4,
3112                         'colbertnation.com': 3,
3113                 }.get(show_id, 4)
3114                 OFFSET = {
3115                         'thedailyshow.com': 1,
3116                         'colbertnation.com': 1,
3117                 }.get(show_id, 1)
3118
3119                 first_player_url = mMovieParams[0][0]
3120                 startMediaNum = int(mMovieParams[0][3]) + OFFSET
3121                 movieId = mMovieParams[0][1]
3122
3123                 playerReq = urllib2.Request(first_player_url)
3124                 self.report_player_url(epTitle)
3125                 try:
3126                         playerResponse = urllib2.urlopen(playerReq)
3127                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3128                         self._downloader.trouble(u'ERROR: unable to download player: %s' % unicode(err))
3129                         return
3130                 player_url = playerResponse.geturl()
3131
3132                 for actNum in range(ACT_COUNT):
3133                         mediaNum = startMediaNum + actNum
3134                         mediaId = movieId + str(mediaNum)
3135                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3136                                                 urllib.urlencode({'uri': mediaId}))
3137                         configReq = urllib2.Request(configUrl)
3138                         self.report_config_download(epTitle)
3139                         try:
3140                                 configXml = urllib2.urlopen(configReq).read()
3141                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3142                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3143                                 return
3144
3145                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3146                         turls = []
3147                         for rendition in cdoc.findall('.//rendition'):
3148                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3149                                 turls.append(finfo)
3150
3151                         if len(turls) == 0:
3152                                 self._downloader.trouble(u'\nERROR: unable to download ' + str(mediaNum) + ': No videos found')
3153                                 continue
3154
3155                         # For now, just pick the highest bitrate
3156                         format,video_url = turls[-1]
3157
3158                         self._downloader.increment_downloads()
3159
3160                         effTitle = show_id.replace('.com', '') + '-' + epTitle
3161                         info = {
3162                                 'id': str(mediaNum),
3163                                 'url': video_url,
3164                                 'uploader': show_id,
3165                                 'upload_date': 'NA',
3166                                 'title': effTitle,
3167                                 'stitle': self._simplify_title(effTitle),
3168                                 'ext': 'mp4',
3169                                 'format': format,
3170                                 'thumbnail': None,
3171                                 'description': 'TODO: Not yet supported',
3172                                 'player_url': player_url
3173                         }
3174
3175                         try:
3176                                 self._downloader.process_info(info)
3177                         except UnavailableVideoError, err:
3178                                 self._downloader.trouble(u'\nERROR: unable to download ' + str(mediaNum))
3179                                 continue
3180
3181
class PostProcessor(object):
        """Base class for post-processing steps.

        A downloader keeps a chain of PostProcessor objects, registered
        through its add_post_processor() method. After a successful
        download it calls run() on each object in turn: the first one
        receives an initial argument, every later one receives whatever
        the previous run() returned.

        Processing ends when the chain is exhausted or as soon as one
        run() call returns None.

        Like InfoExtractor objects, PostProcessor objects take part in a
        "mutual registration" process with their downloader.
        """

        # Downloader this post processor is attached to (None until set).
        _downloader = None

        def __init__(self, downloader=None):
                """Create the post processor, optionally bound to a downloader."""
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach this post processor to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Execute this post-processing step.

                "information" is a dictionary like those built by
                InfoExtractors, with one additional "filepath" entry that
                points to the downloaded file.

                Return None to stop the post-processing chain, or return an
                information dictionary (possibly the received one with some
                fields changed) to hand it on to the next object in the
                chain.

                The method may also raise PostProcessingError, which the
                calling downloader takes into account.
                """
                # The base implementation is a pass-through.
                return information
3227
3228
class FFmpegExtractAudioPP(PostProcessor):
        """Post processor that turns a downloaded video into an audio-only file.

        It probes the audio codec with ffprobe, runs ffmpeg to copy or
        re-encode the audio track, and removes the original file afterwards.
        """

        def __init__(self, downloader=None, preferredcodec=None):
                """preferredcodec is 'best', 'aac' or 'mp3'; None means 'best'."""
                PostProcessor.__init__(self, downloader)
                if preferredcodec is None:
                        preferredcodec = 'best'
                self._preferredcodec = preferredcodec

        @staticmethod
        def get_audio_codec(path):
                """Return the audio codec name reported by ffprobe for path.

                Returns None if ffprobe cannot be run, exits with a non-zero
                status, or reports no audio stream.
                """
                try:
                        devnull = open(os.path.devnull, 'w')
                        try:
                                cmd = ['ffprobe', '-show_streams', '--', path]
                                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                                output = handle.communicate()[0]
                                if handle.wait() != 0:
                                        return None
                        finally:
                                # Do not leak the null-device handle.
                                devnull.close()
                except (IOError, OSError):
                        return None
                # Remember the latest codec_name and return it once the same
                # stream turns out to be an audio stream.
                audio_codec = None
                for line in output.split('\n'):
                        if line.startswith('codec_name='):
                                audio_codec = line.split('=')[1].strip()
                        elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                                return audio_codec
                return None

        @staticmethod
        def run_ffmpeg(path, out_path, codec, more_opts):
                """Run ffmpeg to write the audio of path to out_path.

                codec is passed to -acodec and more_opts is a list of extra
                ffmpeg arguments. Returns True if ffmpeg exited with status 0,
                False otherwise (including when it could not be started).
                """
                try:
                        devnull = open(os.path.devnull, 'w')
                        try:
                                cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
                                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
                                return (ret == 0)
                        finally:
                                # Do not leak the null-device handle.
                                devnull.close()
                except (IOError, OSError):
                        return False

        def run(self, information):
                """Convert information['filepath'] to an audio-only file.

                Returns the information dictionary with 'filepath' pointing to
                the audio file, or None (stopping the chain) if the codec
                cannot be determined, ffmpeg fails or the original file cannot
                be removed.
                """
                path = information['filepath']

                filecodec = self.get_audio_codec(path)
                if filecodec is None:
                        self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
                        return None

                more_opts = []
                if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
                        if filecodec == 'aac' or filecodec == 'mp3':
                                # Lossless if possible
                                acodec = 'copy'
                                extension = filecodec
                                if filecodec == 'aac':
                                        more_opts = ['-f', 'adts']
                        else:
                                # MP3 otherwise.
                                acodec = 'libmp3lame'
                                extension = 'mp3'
                                more_opts = ['-ab', '128k']
                else:
                        # We convert the audio (lossy)
                        acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
                        extension = self._preferredcodec
                        more_opts = ['-ab', '128k']
                        if self._preferredcodec == 'aac':
                                more_opts += ['-f', 'adts']

                prefix = os.path.splitext(path)[0]
                new_path = prefix + '.' + extension
                if new_path == path:
                        # ffmpeg cannot convert a file onto itself, and removing
                        # "path" afterwards would delete the only copy. Leave the
                        # file untouched and let the chain continue.
                        self._downloader.to_screen(u'[ffmpeg] %s already has the target extension, skipping' % path)
                        return information

                self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
                status = self.run_ffmpeg(path, new_path, acodec, more_opts)

                if not status:
                        self._downloader.to_stderr(u'WARNING: error running ffmpeg')
                        return None

                try:
                        os.remove(path)
                except (IOError, OSError):
                        self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                        return None

                information['filepath'] = new_path
                return information
3310
3311
def updateSelf(downloader, filename):
        ''' Update the program file with the latest version from the repository

        Downloads UPDATE_URL and overwrites filename with its contents.
        Exits the program via sys.exit() with an error message if filename is
        not writable, the download fails or the file cannot be written.
        The downloader is only used to print status messages.
        '''
        if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

        downloader.to_screen('Updating to latest version...')

        try:
                # Open outside the try/finally: if urlopen itself fails there is
                # no handle to close, and the error must reach the handler below.
                urlh = urllib.urlopen(UPDATE_URL)
                try:
                        newcontent = urlh.read()
                finally:
                        urlh.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to download latest version')

        try:
                outf = open(filename, 'wb')
                try:
                        outf.write(newcontent)
                finally:
                        outf.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to overwrite current version')

        downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3339
def parseOpts():
        """Build the command line parser and parse the program arguments.

        Returns a (parser, opts, args) tuple: the optparse.OptionParser
        instance, the parsed option values and the remaining positional
        arguments.
        """
        # Deferred imports
        # NOTE(review): main() calls getpass.getpass(), but this import is local
        # to parseOpts -- confirm getpass is also imported where main() can see it.
        import getpass
        import optparse

        def _format_option_string(option):
                ''' ('-o', '--option') -> -o, --format METAVAR'''

                opts = []

                if option._short_opts: opts.append(option._short_opts[0])
                if option._long_opts: opts.append(option._long_opts[0])
                if len(opts) > 1: opts.insert(1, ', ')

                if option.takes_value(): opts.append(' %s' % option.metavar)

                return "".join(opts)

        def _find_term_columns():
                ''' Return the terminal width in columns, or None if unknown '''
                columns = os.environ.get('COLUMNS', None)
                if columns:
                        try:
                                return int(columns)
                        except ValueError:
                                pass # Malformed COLUMNS value; ask stty instead

                try:
                        sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        out,err = sp.communicate()
                        return int(out.split()[1])
                except (EnvironmentError, ValueError, IndexError):
                        # stty missing or unusable output: width stays unknown.
                        pass
                return None

        max_width = 80
        max_help_position = 80

        # No need to wrap help messages if we're on a wide console
        columns = _find_term_columns()
        if columns: max_width = columns

        fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
        fmt.format_option_strings = _format_option_string

        kw = {
                'version'   : __version__,
                'formatter' : fmt,
                'usage' : '%prog [options] url...',
                'conflict_handler' : 'resolve',
        }

        parser = optparse.OptionParser(**kw)

        # option groups
        general        = optparse.OptionGroup(parser, 'General Options')
        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        video_format   = optparse.OptionGroup(parser, 'Video Format Options')
        postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
        filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
        verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

        general.add_option('-h', '--help',
                        action='help', help='print this help text and exit')
        general.add_option('-v', '--version',
                        action='version', help='print program version and exit')
        general.add_option('-U', '--update',
                        action='store_true', dest='update_self', help='update this program to latest version')
        general.add_option('-i', '--ignore-errors',
                        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        general.add_option('-r', '--rate-limit',
                        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        general.add_option('-R', '--retries',
                        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        general.add_option('--playlist-start',
                        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
        general.add_option('--playlist-end',
                        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
        general.add_option('--dump-user-agent',
                        action='store_true', dest='dump_user_agent',
                        help='display the current browser identification', default=False)

        authentication.add_option('-u', '--username',
                        dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                        dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


        video_format.add_option('-f', '--format',
                        action='store', dest='format', metavar='FORMAT', help='video format code')
        video_format.add_option('--all-formats',
                        action='store_const', dest='format', help='download all available video formats', const='-1')
        video_format.add_option('--max-quality',
                        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


        verbosity.add_option('-q', '--quiet',
                        action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                        action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                        action='store_true', dest='getthumbnail',
                        help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                        action='store_true', dest='getdescription',
                        help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--get-filename',
                        action='store_true', dest='getfilename',
                        help='simulate, quiet but print output filename', default=False)
        verbosity.add_option('--no-progress',
                        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        verbosity.add_option('--console-title',
                        action='store_true', dest='consoletitle',
                        help='display progress in console titlebar', default=False)


        filesystem.add_option('-t', '--title',
                        action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-A', '--auto-number',
                        action='store_true', dest='autonumber',
                        help='number downloaded files starting from 00000', default=False)
        filesystem.add_option('-o', '--output',
                        dest='outtmpl', metavar='TEMPLATE', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        filesystem.add_option('--cookies',
                        dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
        filesystem.add_option('--no-part',
                        action='store_true', dest='nopart', help='do not use .part files', default=False)
        filesystem.add_option('--no-mtime',
                        action='store_false', dest='updatetime',
                        help='do not use the Last-modified header to set the file modification time', default=True)
        filesystem.add_option('--write-description',
                        action='store_true', dest='writedescription',
                        help='write video description to a .description file', default=False)
        filesystem.add_option('--write-info-json',
                        action='store_true', dest='writeinfojson',
                        help='write video metadata to a .info.json file', default=False)


        postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
                        help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
        postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
                        help='"best", "aac" or "mp3"; best by default')


        # The order below is the order of the groups in the help output.
        parser.add_option_group(general)
        parser.add_option_group(filesystem)
        parser.add_option_group(verbosity)
        parser.add_option_group(video_format)
        parser.add_option_group(authentication)
        parser.add_option_group(postproc)

        opts, args = parser.parse_args()

        return parser, opts, args
3504
3505 def main():
3506         parser, opts, args = parseOpts()
3507
3508         # Open appropriate CookieJar
3509         if opts.cookiefile is None:
3510                 jar = cookielib.CookieJar()
3511         else:
3512                 try:
3513                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3514                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3515                                 jar.load()
3516                 except (IOError, OSError), err:
3517                         sys.exit(u'ERROR: unable to open cookie file')
3518
3519         # Dump user agent
3520         if opts.dump_user_agent:
3521                 print std_headers['User-Agent']
3522                 sys.exit(0)
3523
3524         # General configuration
3525         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3526         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3527         urllib2.install_opener(opener)
3528         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3529
3530         # Batch file verification
3531         batchurls = []
3532         if opts.batchfile is not None:
3533                 try:
3534                         if opts.batchfile == '-':
3535                                 batchfd = sys.stdin
3536                         else:
3537                                 batchfd = open(opts.batchfile, 'r')
3538                         batchurls = batchfd.readlines()
3539                         batchurls = [x.strip() for x in batchurls]
3540                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3541                 except IOError:
3542                         sys.exit(u'ERROR: batch file could not be read')
3543         all_urls = batchurls + args
3544
3545         # Conflicting, missing and erroneous options
3546         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3547                 parser.error(u'using .netrc conflicts with giving username/password')
3548         if opts.password is not None and opts.username is None:
3549                 parser.error(u'account username missing')
3550         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3551                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3552         if opts.usetitle and opts.useliteral:
3553                 parser.error(u'using title conflicts with using literal title')
3554         if opts.username is not None and opts.password is None:
3555                 opts.password = getpass.getpass(u'Type account password and press return:')
3556         if opts.ratelimit is not None:
3557                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3558                 if numeric_limit is None:
3559                         parser.error(u'invalid rate limit specified')
3560                 opts.ratelimit = numeric_limit
3561         if opts.retries is not None:
3562                 try:
3563                         opts.retries = long(opts.retries)
3564                 except (TypeError, ValueError), err:
3565                         parser.error(u'invalid retry count specified')
3566         try:
3567                 opts.playliststart = int(opts.playliststart)
3568                 if opts.playliststart <= 0:
3569                         raise ValueError(u'Playlist start must be positive')
3570         except (TypeError, ValueError), err:
3571                 parser.error(u'invalid playlist start number specified')
3572         try:
3573                 opts.playlistend = int(opts.playlistend)
3574                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3575                         raise ValueError(u'Playlist end must be greater than playlist start')
3576         except (TypeError, ValueError), err:
3577                 parser.error(u'invalid playlist end number specified')
3578         if opts.extractaudio:
3579                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3580                         parser.error(u'invalid audio format specified')
3581
3582         # Information extractors
3583         youtube_ie = YoutubeIE()
3584         metacafe_ie = MetacafeIE(youtube_ie)
3585         dailymotion_ie = DailymotionIE()
3586         youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3587         youtube_user_ie = YoutubeUserIE(youtube_ie)
3588         youtube_search_ie = YoutubeSearchIE(youtube_ie)
3589         google_ie = GoogleIE()
3590         google_search_ie = GoogleSearchIE(google_ie)
3591         photobucket_ie = PhotobucketIE()
3592         yahoo_ie = YahooIE()
3593         yahoo_search_ie = YahooSearchIE(yahoo_ie)
3594         deposit_files_ie = DepositFilesIE()
3595         facebook_ie = FacebookIE()
3596         bliptv_ie = BlipTVIE()
3597         vimeo_ie = VimeoIE()
3598         myvideo_ie = MyVideoIE()
3599         comedycentral_ie = ComedyCentralIE()
3600
3601         generic_ie = GenericIE()
3602
3603         # File downloader
3604         fd = FileDownloader({
3605                 'usenetrc': opts.usenetrc,
3606                 'username': opts.username,
3607                 'password': opts.password,
3608                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3609                 'forceurl': opts.geturl,
3610                 'forcetitle': opts.gettitle,
3611                 'forcethumbnail': opts.getthumbnail,
3612                 'forcedescription': opts.getdescription,
3613                 'forcefilename': opts.getfilename,
3614                 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3615                 'format': opts.format,
3616                 'format_limit': opts.format_limit,
3617                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3618                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3619                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3620                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3621                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3622                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3623                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3624                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3625                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3626                         or u'%(id)s.%(ext)s'),
3627                 'ignoreerrors': opts.ignoreerrors,
3628                 'ratelimit': opts.ratelimit,
3629                 'nooverwrites': opts.nooverwrites,
3630                 'retries': opts.retries,
3631                 'continuedl': opts.continue_dl,
3632                 'noprogress': opts.noprogress,
3633                 'playliststart': opts.playliststart,
3634                 'playlistend': opts.playlistend,
3635                 'logtostderr': opts.outtmpl == '-',
3636                 'consoletitle': opts.consoletitle,
3637                 'nopart': opts.nopart,
3638                 'updatetime': opts.updatetime,
3639                 'writedescription': opts.writedescription,
3640                 'writeinfojson': opts.writeinfojson,
3641                 })
3642         fd.add_info_extractor(youtube_search_ie)
3643         fd.add_info_extractor(youtube_pl_ie)
3644         fd.add_info_extractor(youtube_user_ie)
3645         fd.add_info_extractor(metacafe_ie)
3646         fd.add_info_extractor(dailymotion_ie)
3647         fd.add_info_extractor(youtube_ie)
3648         fd.add_info_extractor(google_ie)
3649         fd.add_info_extractor(google_search_ie)
3650         fd.add_info_extractor(photobucket_ie)
3651         fd.add_info_extractor(yahoo_ie)
3652         fd.add_info_extractor(yahoo_search_ie)
3653         fd.add_info_extractor(deposit_files_ie)
3654         fd.add_info_extractor(facebook_ie)
3655         fd.add_info_extractor(bliptv_ie)
3656         fd.add_info_extractor(vimeo_ie)
3657         fd.add_info_extractor(myvideo_ie)
3658         fd.add_info_extractor(comedycentral_ie)
3659
3660         # This must come last since it's the
3661         # fallback if none of the others work
3662         fd.add_info_extractor(generic_ie)
3663
3664         # PostProcessors
3665         if opts.extractaudio:
3666                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3667
3668         # Update version
3669         if opts.update_self:
3670                 updateSelf(fd, sys.argv[0])
3671
3672         # Maybe do nothing
3673         if len(all_urls) < 1:
3674                 if not opts.update_self:
3675                         parser.error(u'you must provide at least one URL')
3676                 else:
3677                         sys.exit()
3678         retcode = fd.download(all_urls)
3679
3680         # Dump cookie jar if requested
3681         if opts.cookiefile is not None:
3682                 try:
3683                         jar.save()
3684                 except (IOError, OSError), err:
3685                         sys.exit(u'ERROR: unable to save cookie jar')
3686
3687         sys.exit(retcode)
3688
3689
3690 if __name__ == '__main__':  # Script entry point: run main() and map known failures to exit statuses.
3691         try:
3692                 main()  # main() finishes with sys.exit(retcode); SystemExit is not caught by the handlers below.
3693         except DownloadError:  # Exit with status 1 and no extra message (presumably already reported elsewhere -- confirm).
3694                 sys.exit(1)
3695         except SameFileError:  # A string passed to sys.exit() is printed to stderr and gives exit status 1.
3696                 sys.exit(u'ERROR: fixed output name but more than one file to download')
3697         except KeyboardInterrupt:  # Ctrl-C: the leading newline presumably ends a partially drawn console line -- confirm.
3698                 sys.exit(u'\nERROR: Interrupted by user')
3699
3700 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: