Imported Upstream version 2010.01.19

[youtubedl] / youtube-dl
diff --git a/youtube-dl b/youtube-dl

index 5afff4e61879526235ff5f262ce4be188c122725..9e2862123c124ba82e0e235181048fbf983b3dfc 100755 (executable)
--- a/youtube-dl
+++ b/youtube-dl
@@ -2,6 +2,7 @@
  # -*- coding: utf-8 -*-
  # Author: Ricardo Garcia Gonzalez
  # Author: Danny Colligan
+# Author: Benjamin Johnson
  # License: Public domain code
  import htmlentitydefs
  import httplib
@@ -13,11 +14,18 @@ import os.path
  import re
  import socket
  import string
+import subprocess
  import sys
  import time
  import urllib
  import urllib2
  
+# parse_qs was added to the urlparse module in Python 2.6; older versions only provide it in cgi.
+try:
+       from urlparse import parse_qs
+except ImportError:
+       from cgi import parse_qs
+
  std_headers = {
         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
@@ -33,15 +41,15 @@ def preferredencoding():
         Returns the best encoding scheme for the system, based on
         locale.getpreferredencoding() and some further tweaks.
         """
-       try:
-               pref = locale.getpreferredencoding()
-               # Mac OSX systems have this problem sometimes
-               if pref == '':
-                       return 'UTF-8'
-               return pref
-       except:
-               sys.stderr.write('WARNING: problem obtaining preferred encoding. Falling back to UTF-8.\n')
-               return 'UTF-8'
+       def yield_preferredencoding():
+               try:
+                       pref = locale.getpreferredencoding()
+                       u'TEST'.encode(pref)
+               except:
+                       pref = 'UTF-8'
+               while True:
+                       yield pref
+       return yield_preferredencoding().next()
  
  class DownloadError(Exception):
         """Download Error exception.
@@ -308,10 +316,12 @@ class FileDownloader(object):
                 """Process a single dictionary returned by an InfoExtractor."""
                 # Do nothing else if in simulate mode
                 if self.params.get('simulate', False):
-                       try:
-                               info_dict['url'] = self.verify_url(info_dict['url'])
-                       except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
-                               raise UnavailableFormatError
+                       # Verify URL if it's an HTTP one
+                       if info_dict['url'].startswith('http'):
+                               try:
+                                       info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
+                               except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
+                                       raise UnavailableFormatError
  
                         # Forced printings
                         if self.params.get('forcetitle', False):
@@ -327,7 +337,7 @@ class FileDownloader(object):
                         filename = self.params['outtmpl'] % template_dict
                 except (ValueError, KeyError), err:
                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
-               if self.params['nooverwrites'] and os.path.exists(filename):
+               if self.params.get('nooverwrites', False) and os.path.exists(filename):
                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
                         return
  
@@ -338,7 +348,7 @@ class FileDownloader(object):
                         return
  
                 try:
-                       success = self._do_download(filename, info_dict['url'])
+                       success = self._do_download(filename, info_dict['url'].encode('utf-8'))
                 except (OSError, IOError), err:
                         raise UnavailableFormatError
                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@@ -390,21 +400,53 @@ class FileDownloader(object):
                         if info is None:
                                 break
         
+       def _download_with_rtmpdump(self, filename, url):
+               """Download an RTMP stream to filename using the external rtmpdump tool.
+
+               Returns True when rtmpdump finishes with exit code 0. Otherwise the
+               problem is reported through self.trouble() and False is returned.
+               """
+               self.report_destination(filename)
+
+               # Check for rtmpdump first. The null device handle is closed
+               # afterwards instead of being leaked.
+               devnull = None
+               try:
+                       try:
+                               devnull = open(os.path.devnull, 'w')
+                               subprocess.call(['rtmpdump', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
+                       except (OSError, IOError):
+                               self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
+                               return False
+               finally:
+                       if devnull is not None:
+                               devnull.close()
+
+               # Download using rtmpdump. rtmpdump returns exit code 2 when
+               # the connection was interrupted and resuming appears to be
+               # possible. This is part of rtmpdump's normal usage, AFAIK.
+               basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
+               first_args = list(basic_args)
+               if self.params.get('continuedl', False):
+                       first_args += ['-e', '-k', '1']
+               retval = subprocess.call(first_args)
+               while retval == 2 or retval == 1:
+                       prevsize = os.path.getsize(filename)
+                       self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
+                       time.sleep(2.0) # This seems to be needed
+                       resume_args = basic_args + ['-e']
+                       if retval == 1:
+                               resume_args += ['-k', '1']
+                       retval = subprocess.call(resume_args)
+                       # Exit code 1 again with no new data means the retry made
+                       # no progress; give up instead of looping forever.
+                       if retval == 1 and os.path.getsize(filename) == prevsize:
+                               break
+               if retval == 0:
+                       self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
+                       return True
+               else:
+                       self.trouble('ERROR: rtmpdump exited with code %d' % retval)
+                       return False
+
         def _do_download(self, filename, url):
-               stream = None
-               open_mode = 'ab'
+               # Attempt to download using rtmpdump
+               if url.startswith('rtmp'):
+                       return self._download_with_rtmpdump(filename, url)
  
+               stream = None
+               open_mode = 'wb'
                 basic_request = urllib2.Request(url, None, std_headers)
                 request = urllib2.Request(url, None, std_headers)
  
-               # Attempt to resume download with "continuedl" option
+               # Establish possible resume length
                 if os.path.isfile(filename):
                         resume_len = os.path.getsize(filename)
                 else:
                         resume_len = 0
-               if self.params['continuedl'] and resume_len != 0:
+
+               # Request parameters in case of being able to resume
+               if self.params.get('continuedl', False) and resume_len != 0:
                         self.report_resuming_byte(resume_len)
                         request.add_header('Range','bytes=%d-' % resume_len)
+                       open_mode = 'ab'
  
                 # Establish connection
                 try:
@@ -412,12 +454,16 @@ class FileDownloader(object):
                 except (urllib2.HTTPError, ), err:
                         if err.code != 416: #  416 is 'Requested range not satisfiable'
                                 raise
+                       # Unable to resume
                         data = urllib2.urlopen(basic_request)
                         content_length = data.info()['Content-Length']
+
                         if content_length is not None and long(content_length) == resume_len:
+                               # Because the file had already been fully downloaded
                                 self.report_file_already_downloaded(filename)
                                 return True
                         else:
+                               # Because the server didn't let us
                                 self.report_unable_to_resume()
                                 open_mode = 'wb'
  
@@ -530,12 +576,13 @@ class YoutubeIE(InfoExtractor):
         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
         _NETRC_MACHINE = 'youtube'
-       _available_formats = ['22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
+       _available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
         _video_extensions = {
                 '13': '3gp',
                 '17': 'mp4',
                 '18': 'mp4',
                 '22': 'mp4',
+               '37': 'mp4',
         }
  
         @staticmethod
@@ -589,6 +636,10 @@ class YoutubeIE(InfoExtractor):
                 """Report extracted video URL."""
                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
         
+       def report_rtmp_download(self):
+               """Tell the user that this video will be fetched over RTMP."""
+               message = u'[youtube] RTMP download detected'
+               self._downloader.to_stdout(message)
+       
         def _real_initialize(self):
                 if self._downloader is None:
                         return
@@ -687,43 +738,45 @@ class YoutubeIE(InfoExtractor):
                         try:
                                 self.report_video_info_webpage_download(video_id)
                                 video_info_webpage = urllib2.urlopen(request).read()
+                               video_info = parse_qs(video_info_webpage)
                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                                 return
                         self.report_information_extraction(video_id)
  
                         # "t" param
-                       mobj = re.search(r'(?m)&token=([^&]+)(?:&|$)', video_info_webpage)
-                       if mobj is None:
+                       if 'token' not in video_info:
                                 # Attempt to see if YouTube has issued an error message
-                               mobj = re.search(r'(?m)&reason=([^&]+)(?:&|$)', video_info_webpage)
-                               if mobj is None:
+                               if 'reason' not in video_info:
                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
                                         stream.write(video_info_webpage)
                                         stream.close()
                                 else:
-                                       reason = urllib.unquote_plus(mobj.group(1))
+                                       reason = urllib.unquote_plus(video_info['reason'][0])
                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
                                 return
-                       token = urllib.unquote(mobj.group(1))
+                       token = urllib.unquote_plus(video_info['token'][0])
                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
                         if format_param is not None:
                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
  
+                       # Check possible RTMP download
+                       if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
+                               self.report_rtmp_download()
+                               video_real_url = video_info['conn'][0]
+
                         # uploader
-                       mobj = re.search(r'(?m)&author=([^&]+)(?:&|$)', video_info_webpage)
-                       if mobj is None:
+                       if 'author' not in video_info:
                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                                 return
-                       video_uploader = urllib.unquote(mobj.group(1))
+                       video_uploader = urllib.unquote_plus(video_info['author'][0])
  
                         # title
-                       mobj = re.search(r'(?m)&title=([^&]+)(?:&|$)', video_info_webpage)
-                       if mobj is None:
+                       if 'title' not in video_info:
                                 self._downloader.trouble(u'ERROR: unable to extract video title')
                                 return
-                       video_title = urllib.unquote(mobj.group(1))
+                       video_title = urllib.unquote_plus(video_info['title'][0])
                         video_title = video_title.decode('utf-8')
                         video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
                         video_title = video_title.replace(os.sep, u'%')
@@ -867,7 +920,7 @@ class MetacafeIE(InfoExtractor):
                         return
                 video_title = mobj.group(1).decode('utf-8')
  
-               mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
+               mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
                 if mobj is None:
                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                         return
@@ -887,6 +940,159 @@ class MetacafeIE(InfoExtractor):
                         self._downloader.trouble(u'ERROR: format not available for video')
  
  
+class GoogleIE(InfoExtractor):
+       """Information extractor for video.google.com."""
+
+       _VALID_URL = r'(?:http://)?video\.google\.com/videoplay\?docid=([^\&]+).*'
+
+       def __init__(self, downloader=None):
+               InfoExtractor.__init__(self, downloader)
+
+       @staticmethod
+       def suitable(url):
+               """Return True if url matches the Google Video URL pattern."""
+               return (re.match(GoogleIE._VALID_URL, url) is not None)
+
+       def report_download_webpage(self, video_id):
+               """Report webpage download."""
+               self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
+
+       def report_extraction(self, video_id):
+               """Report information extraction."""
+               self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
+
+       def _real_initialize(self):
+               return
+
+       def _real_extract(self, url):
+               """Extract the video information for url and hand it to the downloader.
+
+               Problems are reported through self._downloader.trouble() and end
+               the extraction early.
+               """
+               # Extract id from URL
+               mobj = re.match(self._VALID_URL, url)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+                       return
+
+               video_id = mobj.group(1)
+
+               video_extension = 'mp4'
+
+               # Retrieve video webpage to extract further information
+               request = urllib2.Request('http://video.google.com/videoplay?docid=%s' % video_id)
+               try:
+                       self.report_download_webpage(video_id)
+                       webpage = urllib2.urlopen(request).read()
+               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                       self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+                       return
+
+               # Extract URL, uploader, and title from webpage
+               self.report_extraction(video_id)
+               mobj = re.search(r"download_url:'(.*)'", webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract media URL')
+                       return
+               mediaURL = urllib.unquote(mobj.group(1))
+               # Undo the escaped '=' and '&' characters found in the page source
+               mediaURL = mediaURL.replace('\\x3d', '\x3d')
+               mediaURL = mediaURL.replace('\\x26', '\x26')
+
+               video_url = mediaURL
+
+               mobj = re.search(r'<title>(.*)</title>', webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract title')
+                       return
+               video_title = mobj.group(1).decode('utf-8')
+
+               # Google Video doesn't show uploader nicknames?
+               video_uploader = 'uploader'
+
+               try:
+                       # Process video information. video_title is already a
+                       # unicode object: decoding it a second time would make
+                       # Python 2 encode it to ASCII first and fail on any
+                       # non-ASCII title, so it is passed through unchanged.
+                       self._downloader.process_info({
+                               'id':           video_id.decode('utf-8'),
+                               'url':          video_url.decode('utf-8'),
+                               'uploader':     video_uploader.decode('utf-8'),
+                               'title':        video_title,
+                               'stitle':       video_title,
+                               'ext':          video_extension.decode('utf-8'),
+                       })
+               except UnavailableFormatError:
+                       self._downloader.trouble(u'ERROR: format not available for video')
+
+
+class PhotobucketIE(InfoExtractor):
+       """Information extractor for photobucket.com."""
+
+       _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
+
+       def __init__(self, downloader=None):
+               InfoExtractor.__init__(self, downloader)
+
+       @staticmethod
+       def suitable(url):
+               """Return True if url matches the Photobucket video URL pattern."""
+               return (re.match(PhotobucketIE._VALID_URL, url) is not None)
+
+       def report_download_webpage(self, video_id):
+               """Report webpage download."""
+               self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
+
+       def report_extraction(self, video_id):
+               """Report information extraction."""
+               self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
+
+       def _real_initialize(self):
+               return
+
+       def _real_extract(self, url):
+               """Extract the video information for url and hand it to the downloader.
+
+               Problems are reported through self._downloader.trouble() and end
+               the extraction early.
+               """
+               # Extract id from URL
+               mobj = re.match(self._VALID_URL, url)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+                       return
+
+               video_id = mobj.group(1)
+
+               video_extension = 'flv'
+
+               # Retrieve video webpage to extract further information
+               request = urllib2.Request(url)
+               try:
+                       self.report_download_webpage(video_id)
+                       webpage = urllib2.urlopen(request).read()
+               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                       self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+                       return
+
+               # Extract URL, uploader, and title from webpage
+               self.report_extraction(video_id)
+               mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract media URL')
+                       return
+               mediaURL = urllib.unquote(mobj.group(1))
+
+               video_url = mediaURL
+
+               mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract title')
+                       return
+               video_title = mobj.group(1).decode('utf-8')
+
+               video_uploader = mobj.group(2).decode('utf-8')
+
+               try:
+                       # Process video information. video_title and video_uploader
+                       # are already unicode objects: decoding them a second time
+                       # would make Python 2 encode them to ASCII first and fail
+                       # on any non-ASCII text, so they are passed through unchanged.
+                       self._downloader.process_info({
+                               'id':           video_id.decode('utf-8'),
+                               'url':          video_url.decode('utf-8'),
+                               'uploader':     video_uploader,
+                               'title':        video_title,
+                               'stitle':       video_title,
+                               'ext':          video_extension.decode('utf-8'),
+                       })
+               except UnavailableFormatError:
+                       self._downloader.trouble(u'ERROR: format not available for video')
+
+
  class YoutubeSearchIE(InfoExtractor):
         """Information Extractor for YouTube search queries."""
         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
@@ -1036,6 +1242,61 @@ class YoutubePlaylistIE(InfoExtractor):
                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                 return
  
+class YoutubeUserIE(InfoExtractor):
+       """Information Extractor for YouTube users."""
+
+       # The dot in the host name is escaped so it only matches a literal '.',
+       # and the user name stops at '/', '?' or '#' so trailing path, query or
+       # fragment text does not end up in the feed URL.
+       _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/user/([^/?#]+)'
+       _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
+       _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
+       _youtube_ie = None
+
+       def __init__(self, youtube_ie, downloader=None):
+               InfoExtractor.__init__(self, downloader)
+               self._youtube_ie = youtube_ie
+
+       @staticmethod
+       def suitable(url):
+               """Return True if url matches the YouTube user page URL pattern."""
+               return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
+
+       def report_download_page(self, username):
+               """Report attempt to download user page."""
+               self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
+
+       def _real_initialize(self):
+               self._youtube_ie.initialize()
+
+       def _real_extract(self, url):
+               """Download the user's feed and extract every video referenced in it.
+
+               Each video found is passed to the wrapped YouTube extractor.
+               Problems are reported through self._downloader.trouble().
+               """
+               # Extract username
+               mobj = re.match(self._VALID_URL, url)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: invalid url: %s' % url)
+                       return
+
+               # Download user page
+               username = mobj.group(1)
+
+               self.report_download_page(username)
+               request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
+               try:
+                       page = urllib2.urlopen(request).read()
+               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                       self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
+                       return
+
+               # Extract video identifiers, keeping first-seen order and
+               # dropping duplicates
+               video_ids = []
+               for mobj in re.finditer(self._VIDEO_INDICATOR, page):
+                       if mobj.group(1) not in video_ids:
+                               video_ids.append(mobj.group(1))
+
+               for video_id in video_ids:
+                       self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
+               return
+
  class PostProcessor(object):
         """Post Processor class.
  
@@ -1089,6 +1350,22 @@ if __name__ == '__main__':
                 import getpass
                 import optparse
  
+               # Function to update the program file with the latest version from bitbucket.org
+               def update_self(downloader, filename):
+                       """Replace the program file with the latest stable version.
+
+                       downloader is only used to print status messages through
+                       its to_stdout() method; filename is the path of the program
+                       file to overwrite. The program exits with an error message
+                       if the file is not writable, the download fails or comes
+                       back empty, or the file cannot be written.
+                       """
+                       if not os.access(filename, os.W_OK):
+                               sys.exit('ERROR: no write permissions on %s' % filename)
+
+                       downloader.to_stdout('Updating to latest stable version...')
+                       # Fetch everything before touching the file so that a failed
+                       # download never leaves a truncated program behind.
+                       try:
+                               latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
+                               latest_version = urllib.urlopen(latest_url).read().strip()
+                               prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
+                               newcontent = urllib.urlopen(prog_url).read()
+                       except (IOError, OSError), err:
+                               sys.exit('ERROR: unable to download latest version: %s' % str(err))
+                       if not latest_version or not newcontent:
+                               sys.exit('ERROR: unable to download latest version: empty response')
+
+                       try:
+                               # Binary mode writes the downloaded bytes unchanged,
+                               # without any newline translation.
+                               stream = open(filename, 'wb')
+                               try:
+                                       stream.write(newcontent)
+                               finally:
+                                       stream.close()
+                       except (IOError, OSError), err:
+                               sys.exit('ERROR: unable to overwrite current version: %s' % str(err))
+                       downloader.to_stdout('Updated to version %s' % latest_version)
+
                 # General configuration
                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
@@ -1097,7 +1374,7 @@ if __name__ == '__main__':
                 # Parse command line
                 parser = optparse.OptionParser(
                         usage='Usage: %prog [options] url...',
-                       version='2009.09.13',
+                       version='2010.01.19',
                         conflict_handler='resolve',
                 )
  
@@ -1105,6 +1382,8 @@ if __name__ == '__main__':
                                 action='help', help='print this help text and exit')
                 parser.add_option('-v', '--version',
                                 action='version', help='print program version and exit')
+               parser.add_option('-U', '--update',
+                               action='store_true', dest='update_self', help='update this program to latest stable version')
                 parser.add_option('-i', '--ignore-errors',
                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                 parser.add_option('-r', '--rate-limit',
@@ -1157,7 +1436,7 @@ if __name__ == '__main__':
                 parser.add_option_group(filesystem)
  
                 (opts, args) = parser.parse_args()
-
+        
                 # Batch file verification
                 batchurls = []
                 if opts.batchfile is not None:
@@ -1170,8 +1449,6 @@ if __name__ == '__main__':
                 all_urls = batchurls + args
  
                 # Conflicting, missing and erroneous options
-               if len(all_urls) < 1:
-                       parser.error(u'you must provide at least one URL')
                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
                         parser.error(u'using .netrc conflicts with giving username/password')
                 if opts.password is not None and opts.username is None:
@@ -1192,7 +1469,10 @@ if __name__ == '__main__':
                 youtube_ie = YoutubeIE()
                 metacafe_ie = MetacafeIE(youtube_ie)
                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
+               youtube_user_ie = YoutubeUserIE(youtube_ie)
                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
+               google_ie = GoogleIE()
+               photobucket_ie = PhotobucketIE()
  
                 # File downloader
                 fd = FileDownloader({
@@ -1215,8 +1495,22 @@ if __name__ == '__main__':
                         })
                 fd.add_info_extractor(youtube_search_ie)
                 fd.add_info_extractor(youtube_pl_ie)
+               fd.add_info_extractor(youtube_user_ie)
                 fd.add_info_extractor(metacafe_ie)
                 fd.add_info_extractor(youtube_ie)
+               fd.add_info_extractor(google_ie)
+               fd.add_info_extractor(photobucket_ie)
+
+               # Update version
+               if opts.update_self:
+                       update_self(fd, sys.argv[0])
+
+               # Maybe do nothing
+               if len(all_urls) < 1:
+                       if not opts.update_self:
+                               parser.error(u'you must provide at least one URL')
+                       else:
+                               sys.exit()
                 retcode = fd.download(all_urls)
                 sys.exit(retcode)