Raphaël G. Git Repositories - youtubedl/commitdiff
Imported Upstream version 2010.04.04
author Rogério Brito <rbrito@ime.usp.br>
Sat, 18 Jun 2011 05:16:52 +0000 (02:16 -0300)
committer Rogério Brito <rbrito@ime.usp.br>
Sat, 18 Jun 2011 05:16:52 +0000 (02:16 -0300)
.hg_archival.txt
.hgtags
LATEST_VERSION
youtube-dl

index 8e181469f5833b5d2593fa78643537ffa4c1362b..be1abdffba66563ea0315f1f4dea8bd36cde4a21 100644 (file)
@@ -1,2 +1,2 @@
 repo: f87cf8f2527c4adda57b14bd79a354f815164a41
-node: af285beaf15be8c4800d7bfdd868dd6058b14d65
+node: 1d3bca5d5e5dd4dc014c3d2681b7a429be9edd2f
diff --git a/.hgtags b/.hgtags
index cfad66b80782814ee2d9cd736473459ccf61a035..4697dc59e0bc95581dcd050fdc8e4f6396c17de6 100644 (file)
--- a/.hgtags
+++ b/.hgtags
@@ -28,3 +28,10 @@ eabc9bc8ab3ca52115a13036a57135d6df128f6f 2009.12.26
 9ade3c8f7a53748c3e3edd2009585b8fbbe1f561 2010.01.19
 93ff6dceb3da30991d4e7c7bdf7bbf5ed7baea61 2010.02.13
 c69858fd48506de909499f59ea65d982df93b103 2010.03.07
+af285beaf15be8c4800d7bfdd868dd6058b14d65 2010.03.13
+af285beaf15be8c4800d7bfdd868dd6058b14d65 2010.03.13
+0000000000000000000000000000000000000000 2010.03.13
+0000000000000000000000000000000000000000 2010.03.13
+eadec6d49e8a8d266c2ac6674aef2b415ccd3424 2010.03.13
+a40f32f2978ab2c20450da2a708b8ece806fe147 2010.04.02
+d6f421afc953fcc2d7336e7e4d6248b4d015e360 2010.04.03
index 69e947131039cfe334f7f952fbedb98db6c4a811..013f034f30e7e771a9e68de3dc6677ff9d75e007 100644 (file)
@@ -1 +1 @@
-2010.03.07
+2010.04.04
index 4530c5da2132340a9ca6d415c65fc004e217ecd7..43566b6153e880200267d0a4291f6984b526a1c4 100755 (executable)
@@ -93,6 +93,8 @@ def sanitize_open(filename, open_mode):
        It returns the tuple (stream, definitive_file_name).
        """
        try:
+               if filename == u'-':
+                       return (sys.stdout, filename)
                stream = open(filename, open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
@@ -199,12 +201,14 @@ class FileDownloader(object):
        _ies = []
        _pps = []
        _download_retcode = None
+       _num_downloads = None
 
        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self._pps = []
                self._download_retcode = 0
+               self._num_downloads = 0
                self.params = params
        
        @staticmethod
@@ -400,6 +404,7 @@ class FileDownloader(object):
                try:
                        template_dict = dict(info_dict)
                        template_dict['epoch'] = unicode(long(time.time()))
+                       template_dict['ord'] = unicode('%05d' % self._num_downloads)
                        filename = self.params['outtmpl'] % template_dict
                except (ValueError, KeyError), err:
                        self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
@@ -489,7 +494,7 @@ class FileDownloader(object):
                        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
                        return True
                else:
-                       self.trouble('ERROR: rtmpdump exited with code %d' % retval)
+                       self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
                        return False
 
        def _do_download(self, filename, url):
@@ -553,6 +558,7 @@ class FileDownloader(object):
                                try:
                                        (stream, filename) = sanitize_open(filename, open_mode)
                                        self.report_destination(filename)
+                                       self._num_downloads += 1
                                except (OSError, IOError), err:
                                        self.trouble('ERROR: unable to open for writing: %s' % str(err))
                                        return False
@@ -591,6 +597,7 @@ class InfoExtractor(object):
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
+       format:         Video format.
 
        Subclasses of this one should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method.
@@ -762,6 +769,7 @@ class YoutubeIE(InfoExtractor):
 
                # Downloader parameters
                best_quality = False
+               all_formats = False
                format_param = None
                quality_index = 0
                if self._downloader is not None:
@@ -770,21 +778,28 @@ class YoutubeIE(InfoExtractor):
                        if format_param == '0':
                                format_param = self._available_formats[quality_index]
                                best_quality = True
+                       elif format_param == '-1':
+                               format_param = self._available_formats[quality_index]
+                               all_formats = True
 
                while True:
                        # Extension
                        video_extension = self._video_extensions.get(format_param, 'flv')
 
                        # Get video info
-                       video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
-                       request = urllib2.Request(video_info_url, None, std_headers)
-                       try:
-                               self.report_video_info_webpage_download(video_id)
-                               video_info_webpage = urllib2.urlopen(request).read()
-                               video_info = parse_qs(video_info_webpage)
-                       except (urllib2.URLError, httplib.HTTPException, socket.error), err:
-                               self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
-                               return
+                       self.report_video_info_webpage_download(video_id)
+                       for el_type in ['embedded', 'detailpage', 'vevo']:
+                               video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s&el=%s&ps=default&eurl=&gl=US&hl=en'
+                                                  % (video_id, el_type))
+                               request = urllib2.Request(video_info_url, None, std_headers)
+                               try:
+                                       video_info_webpage = urllib2.urlopen(request).read()
+                                       video_info = parse_qs(video_info_webpage)
+                                       if 'token' in video_info:
+                                               break
+                               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                                       self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
+                                       return
                        self.report_information_extraction(video_id)
 
                        # "t" param
@@ -836,20 +851,35 @@ class YoutubeIE(InfoExtractor):
                                        'title':        video_title,
                                        'stitle':       simple_title,
                                        'ext':          video_extension.decode('utf-8'),
+                                       'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
                                })
 
+                               if all_formats:
+                                       if quality_index == len(self._available_formats) - 1:
+                                               # None left to get
+                                               return
+                                       else:
+                                               quality_index += 1
+                                               format_param = self._available_formats[quality_index]
+                                               if format_param == None:
+                                                       return
+                                               continue
+
                                return
 
                        except UnavailableFormatError, err:
-                               if best_quality:
+                               if best_quality or all_formats:
                                        if quality_index == len(self._available_formats) - 1:
                                                # I don't ever expect this to happen
-                                               self._downloader.trouble(u'ERROR: no known formats available for video')
+                                               if not all_formats:
+                                                       self._downloader.trouble(u'ERROR: no known formats available for video')
                                                return
                                        else:
                                                self.report_unavailable_format(video_id, format_param)
                                                quality_index += 1
                                                format_param = self._available_formats[quality_index]
+                                               if format_param == None:
+                                                       return
                                                continue
                                else: 
                                        self._downloader.trouble('ERROR: format not available for video')
@@ -978,6 +1008,7 @@ class MetacafeIE(InfoExtractor):
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
+                               'format':       u'NA',
                        })
                except UnavailableFormatError:
                        self._downloader.trouble(u'ERROR: format not available for video')
@@ -1049,18 +1080,16 @@ class GoogleIE(InfoExtractor):
                video_title = sanitize_title(video_title)
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 
-               # Google Video doesn't show uploader nicknames?
-               video_uploader = 'NA'
-
                try:
                        # Process video information
                        self._downloader.process_info({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_url.decode('utf-8'),
-                               'uploader':     video_uploader.decode('utf-8'),
+                               'uploader':     u'NA',
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
+                               'format':       u'NA',
                        })
                except UnavailableFormatError:
                        self._downloader.trouble(u'ERROR: format not available for video')
@@ -1138,6 +1167,142 @@ class PhotobucketIE(InfoExtractor):
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
+                               'format':       u'NA',
+                       })
+               except UnavailableFormatError:
+                       self._downloader.trouble(u'ERROR: format not available for video')
+
+
+class YahooIE(InfoExtractor):
+       """Information extractor for video.yahoo.com."""
+
+       # _VALID_URL matches all Yahoo! Video URLs
+       # _VPAGE_URL matches only the extractable '/watch/' URLs
+       _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
+       _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
+
+       def __init__(self, downloader=None):
+               InfoExtractor.__init__(self, downloader)
+
+       @staticmethod
+       def suitable(url):
+               return (re.match(YahooIE._VALID_URL, url) is not None)
+
+       def report_download_webpage(self, video_id):
+               """Report webpage download."""
+               self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
+
+       def report_extraction(self, video_id):
+               """Report information extraction."""
+               self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
+
+       def _real_initialize(self):
+               return
+
+       def _real_extract(self, url):
+               """Extract video info from a Yahoo! Video URL and pass it to the downloader."""
+               mobj = re.match(self._VALID_URL, url)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+                       return
+
+               video_id = mobj.group(2)
+               video_extension = 'flv'
+
+               # Rewrite valid but non-extractable URLs as extractable English language /watch/ URLs
+               if re.match(self._VPAGE_URL, url) is None:
+                       request = urllib2.Request(url)
+                       try:
+                               webpage = urllib2.urlopen(request).read()
+                       except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                               self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+                               return
+
+                       mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
+                       if mobj is None:
+                               self._downloader.trouble(u'ERROR: Unable to extract id field')
+                               return
+                       yahoo_id = mobj.group(1)
+
+                       mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
+                       if mobj is None:
+                               self._downloader.trouble(u'ERROR: Unable to extract vid field')
+                               return
+                       yahoo_vid = mobj.group(1)
+
+                       url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
+                       return self._real_extract(url)
+
+               # Retrieve video webpage to extract further information
+               request = urllib2.Request(url)
+               try:
+                       self.report_download_webpage(video_id)
+                       webpage = urllib2.urlopen(request).read()
+               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                       self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+                       return
+
+               # Extract uploader and title from webpage
+               self.report_extraction(video_id)
+               mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract video title')
+                       return
+               video_title = mobj.group(1).decode('utf-8')
+               simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
+
+               mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract video uploader')
+                       return
+               video_uploader = mobj.group(2).decode('utf-8')  # group(1) is only the "people"/"profile" path segment
+
+               # Extract video height and width
+               mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract video height')
+                       return
+               yv_video_height = mobj.group(1)
+
+               mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract video width')
+                       return
+               yv_video_width = mobj.group(1)
+
+               # Retrieve video playlist to extract media URL
+               # I'm not completely sure what all these options are, but we
+               # seem to need most of them, otherwise the server sends a 401.
+               yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
+               yv_bitrate = '700'  # according to Wikipedia this is hard-coded
+               request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
+                                         '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
+                                         '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
+               try:
+                       self.report_download_webpage(video_id)
+                       webpage = urllib2.urlopen(request).read()
+               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                       self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+                       return
+
+               # Extract media URL from playlist XML
+               mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: Unable to extract media URL')
+                       return
+               video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
+               video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
+
+               try:
+                       # Process video information
+                       self._downloader.process_info({
+                               'id':           video_id.decode('utf-8'),
+                               'url':          video_url,
+                               'uploader':     video_uploader,
+                               'title':        video_title,
+                               'stitle':       simple_title,
+                               'ext':          video_extension.decode('utf-8'),
+                               'format':       u'NA',  # keeps %(format)s output templates valid, as in the other extractors
                        })
                except UnavailableFormatError:
                        self._downloader.trouble(u'ERROR: format not available for video')
@@ -1232,6 +1397,7 @@ class GenericIE(InfoExtractor):
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
+                               'format':       u'NA',
                        })
                except UnavailableFormatError:
                        self._downloader.trouble(u'ERROR: format not available for video')
@@ -1331,10 +1497,10 @@ class YoutubeSearchIE(InfoExtractor):
 class YoutubePlaylistIE(InfoExtractor):
        """Information Extractor for YouTube playlists."""
 
-       _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
+       _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
        _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
-       _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
+       _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
        _youtube_ie = None
 
        def __init__(self, youtube_ie, downloader=None):
@@ -1380,7 +1546,7 @@ class YoutubePlaylistIE(InfoExtractor):
                                        ids_in_page.append(mobj.group(1))
                        video_ids.extend(ids_in_page)
 
-                       if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
+                       if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                break
                        pagenum = pagenum + 1
 
@@ -1520,7 +1686,7 @@ if __name__ == '__main__':
                # Parse command line
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
-                       version='2010.03.07',
+                       version='2010.04.04',
                        conflict_handler='resolve',
                )
 
@@ -1553,6 +1719,8 @@ if __name__ == '__main__':
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
+               video_format.add_option('--all-formats',
+                               action='store_const', dest='format', help='download all available video formats', const='-1')
                parser.add_option_group(video_format)
 
                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
@@ -1621,6 +1789,7 @@ if __name__ == '__main__':
                youtube_search_ie = YoutubeSearchIE(youtube_ie)
                google_ie = GoogleIE()
                photobucket_ie = PhotobucketIE()
+               yahoo_ie = YahooIE()
                generic_ie = GenericIE()
 
                # File downloader
@@ -1634,6 +1803,9 @@ if __name__ == '__main__':
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': opts.format,
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
+                               or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
+                               or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
+                               or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
@@ -1650,6 +1822,7 @@ if __name__ == '__main__':
                fd.add_info_extractor(youtube_ie)
                fd.add_info_extractor(google_ie)
                fd.add_info_extractor(photobucket_ie)
+               fd.add_info_extractor(yahoo_ie)
 
                # This must come last since it's the
                # fallback if none of the others work