+ def process_info(self, info_dict):
+ """Process a single dictionary returned by an InfoExtractor."""
+
+ reason = self._match_entry(info_dict)
+ if reason is not None:
+ self.to_screen(u'[download] ' + reason)
+ return
+
+ max_downloads = self.params.get('max_downloads')
+ if max_downloads is not None:
+ if self._num_downloads > int(max_downloads):
+ raise MaxDownloadsReached()
+
+ filename = self.prepare_filename(info_dict)
+
+ # Forced printings
+ if self.params.get('forcetitle', False):
+ print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
+ if self.params.get('forceurl', False):
+ print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
+ if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
+ print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
+ if self.params.get('forcedescription', False) and 'description' in info_dict:
+ print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
+ if self.params.get('forcefilename', False) and filename is not None:
+ print filename.encode(preferredencoding(), 'xmlcharrefreplace')
+ if self.params.get('forceformat', False):
+ print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
+
+ # Do nothing else if in simulate mode
+ if self.params.get('simulate', False):
+ return
+
+ if filename is None:
+ return
+
+ try:
+ dn = os.path.dirname(filename)
+ if dn != '' and not os.path.exists(dn):
+ os.makedirs(dn)
+ except (OSError, IOError), err:
+ self.trouble(u'ERROR: unable to create directory ' + unicode(err))
+ return
+
+ if self.params.get('writedescription', False):
+ try:
+ descfn = filename + '.description'
+ self.report_writedescription(descfn)
+ descfile = open(descfn, 'wb')
+ try:
+ descfile.write(info_dict['description'].encode('utf-8'))
+ finally:
+ descfile.close()
+ except (OSError, IOError):
+ self.trouble(u'ERROR: Cannot write description file ' + descfn)
+ return
+
+ if self.params.get('writeinfojson', False):
+ infofn = filename + '.info.json'
+ self.report_writeinfojson(infofn)
+ try:
+ json.dump
+ except (NameError,AttributeError):
+ self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
+ return
+ try:
+ infof = open(infofn, 'wb')
+ try:
+ json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
+ json.dump(json_info_dict, infof)
+ finally:
+ infof.close()
+ except (OSError, IOError):
+ self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
+ return
+
+ if not self.params.get('skip_download', False):
+ if self.params.get('nooverwrites', False) and os.path.exists(filename):
+ success = True
+ else:
+ try:
+ success = self._do_download(filename, info_dict)
+ except (OSError, IOError), err:
+ raise UnavailableVideoError
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self.trouble(u'ERROR: unable to download video data: %s' % str(err))
+ return
+ except (ContentTooShortError, ), err:
+ self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
+ return
+
+ if success:
+ try:
+ self.post_process(filename, info_dict)
+ except (PostProcessingError), err:
+ self.trouble(u'ERROR: postprocessing: %s' % str(err))
+ return
+
def download(self, url_list):
    """Download every URL in url_list and return the accumulated return code.

    Each URL is handed to the first registered InfoExtractor that declares
    itself suitable for it; a URL nobody can handle is reported through
    trouble(). Raises SameFileError when several URLs would be written to
    one fixed output template.
    """
    if len(url_list) > 1 and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    for url in url_list:
        for extractor in self._ies:
            if extractor.suitable(url):
                # First suitable extractor wins: extract, process, next URL
                extractor.extract(url)
                break
        else:
            # The inner loop ended without finding any suitable extractor
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

    return self._download_retcode
+
def post_process(self, filename, ie_info):
    """Feed a copy of ie_info, extended with 'filepath', through self._pps.

    Every postprocessor receives the dictionary returned by the previous
    one; the chain stops as soon as one of them returns None.
    """
    current = dict(ie_info, filepath=filename)
    for processor in self._pps:
        current = processor.run(current)
        if current is None:
            return
+
def _download_with_rtmpdump(self, filename, url, player_url):
    """Download an RTMP stream by running the external rtmpdump program.

    Arguments:
    filename   -- final destination file name.
    url        -- stream URL, passed to rtmpdump with -r.
    player_url -- SWF player URL passed with -W, or None to leave it out.

    Returns True when rtmpdump ends with exit code 0 (the temporary file
    is then renamed to filename) and False otherwise.
    """
    self.report_destination(filename)
    tmpfilename = self.temp_name(filename)

    # Check for rtmpdump first
    try:
        devnull = open(os.path.devnull, 'w')
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
        finally:
            # Do not leak the handle to the null device
            devnull.close()
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
        return False

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrupted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    basic_args = ['rtmpdump', '-q']
    if player_url is not None:
        basic_args += ['-W', player_url]
    basic_args += ['-r', url, '-o', tmpfilename]
    if self.params.get('continuedl', False):
        first_run_args = ['-e', '-k', '1']
    else:
        first_run_args = []
    retval = subprocess.call(basic_args + first_run_args)
    while retval == 2 or retval == 1:
        prevsize = os.path.getsize(tmpfilename)
        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
        time.sleep(5.0) # This seems to be needed
        # Resume (-e); after an exit code of 1 also pass '-k 1'
        retry_args = ['-e'] + (['-k', '1'] if retval == 1 else [])
        retval = subprocess.call(basic_args + retry_args)
        cursize = os.path.getsize(tmpfilename)
        if prevsize == cursize and retval == 1:
            # No progress and still failing: give up
            break
        # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
        if prevsize == cursize and retval == 2 and cursize > 1024:
            self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
            retval = 0
            break
    if retval == 0:
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        return True
    else:
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
        return False
+
+ def _do_download(self, filename, info_dict):
+ url = info_dict['url']
+ player_url = info_dict.get('player_url', None)
+
+ # Check file already present
+ if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
+ self.report_file_already_downloaded(filename)
+ return True
+
+ # Attempt to download using rtmpdump
+ if url.startswith('rtmp'):
+ return self._download_with_rtmpdump(filename, url, player_url)
+
+ tmpfilename = self.temp_name(filename)
+ stream = None
+
+ # Do not include the Accept-Encoding header
+ headers = {'Youtubedl-no-compression': 'True'}
+ basic_request = urllib2.Request(url, None, headers)
+ request = urllib2.Request(url, None, headers)
+
+ # Establish possible resume length
+ if os.path.isfile(tmpfilename):
+ resume_len = os.path.getsize(tmpfilename)
+ else:
+ resume_len = 0
+
+ open_mode = 'wb'
+ if resume_len != 0:
+ if self.params.get('continuedl', False):
+ self.report_resuming_byte(resume_len)
+ request.add_header('Range','bytes=%d-' % resume_len)
+ open_mode = 'ab'
+ else:
+ resume_len = 0
+
+ count = 0
+ retries = self.params.get('retries', 0)
+ while count <= retries:
+ # Establish connection
+ try:
+ if count == 0 and 'urlhandle' in info_dict:
+ data = info_dict['urlhandle']
+ data = urllib2.urlopen(request)
+ break
+ except (urllib2.HTTPError, ), err:
+ if (err.code < 500 or err.code >= 600) and err.code != 416:
+ # Unexpected HTTP error
+ raise
+ elif err.code == 416:
+ # Unable to resume (requested range not satisfiable)
+ try:
+ # Open the connection again without the range header
+ data = urllib2.urlopen(basic_request)
+ content_length = data.info()['Content-Length']
+ except (urllib2.HTTPError, ), err:
+ if err.code < 500 or err.code >= 600:
+ raise
+ else:
+ # Examine the reported length
+ if (content_length is not None and
+ (resume_len - 100 < long(content_length) < resume_len + 100)):
+ # The file had already been fully downloaded.
+ # Explanation to the above condition: in issue #175 it was revealed that
+ # YouTube sometimes adds or removes a few bytes from the end of the file,
+ # changing the file size slightly and causing problems for some users. So
+ # I decided to implement a suggested change and consider the file
+ # completely downloaded if the file size differs less than 100 bytes from
+ # the one in the hard drive.
+ self.report_file_already_downloaded(filename)
+ self.try_rename(tmpfilename, filename)
+ return True
+ else:
+ # The length does not match, we start the download over
+ self.report_unable_to_resume()
+ open_mode = 'wb'
+ break
+ # Retry
+ count += 1
+ if count <= retries:
+ self.report_retry(count, retries)
+
+ if count > retries:
+ self.trouble(u'ERROR: giving up after %s retries' % retries)
+ return False
+
+ data_len = data.info().get('Content-length', None)
+ if data_len is not None:
+ data_len = long(data_len) + resume_len
+ data_len_str = self.format_bytes(data_len)
+ byte_counter = 0 + resume_len
+ block_size = 1024
+ start = time.time()
+ while True:
+ # Download and write
+ before = time.time()
+ data_block = data.read(block_size)
+ after = time.time()
+ if len(data_block) == 0:
+ break
+ byte_counter += len(data_block)
+
+ # Open file just in time
+ if stream is None:
+ try:
+ (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
+ assert stream is not None
+ filename = self.undo_temp_name(tmpfilename)
+ self.report_destination(filename)
+ except (OSError, IOError), err:
+ self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
+ return False
+ try:
+ stream.write(data_block)
+ except (IOError, OSError), err:
+ self.trouble(u'\nERROR: unable to write data: %s' % str(err))
+ return False
+ block_size = self.best_block_size(after - before, len(data_block))
+
+ # Progress message
+ speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
+ if data_len is None:
+ self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
+ else:
+ percent_str = self.calc_percent(byte_counter, data_len)
+ eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
+ self.report_progress(percent_str, data_len_str, speed_str, eta_str)
+
+ # Apply rate limit
+ self.slow_down(start, byte_counter - resume_len)
+
+ if stream is None:
+ self.trouble(u'\nERROR: Did not get any data blocks')
+ return False
+ stream.close()
+ self.report_finish()
+ if data_len is not None and byte_counter != data_len:
+ raise ContentTooShortError(byte_counter, long(data_len))
+ self.try_rename(tmpfilename, filename)
+
+ # Update file modification time
+ if self.params.get('updatetime', True):
+ info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
+
+ return True
+
+
class InfoExtractor(object):
    """Base class of all information extractors.

    An information extractor takes a URL and extracts information about
    the video (or videos) it refers to: the real video URL, the title and
    simplified title, the author and so on. That information is collected
    in a dictionary and passed to the FileDownloader, which processes it,
    possibly downloading the video to the file system, among other
    possible outcomes. Every dictionary must contain these fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    These fields are optional. They mainly exist so that youtube-dl can
    act as the backend of a video search function, such as the one in
    youtube2mp3, and are only used when their respective forced printing
    functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses are expected to override _real_initialize() and
    _real_extract() and to define a _VALID_URL regexp. They should
    probably be added to the list of extractors as well.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Tell whether url matches this extractor's _VALID_URL."""
        matched = re.match(self._VALID_URL, url)
        return matched is not None

    def initialize(self):
        """Run the real initialization (authentication, etc) only once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed and return what _real_extract(url) returns."""
        self.initialize()
        extracted = self._real_extract(url)
        return extracted

    def set_downloader(self, downloader):
        """Attach the downloader this extractor reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Initialization hook; does nothing here. Redefine in subclasses."""

    def _real_extract(self, url):
        """Extraction hook; does nothing here. Redefine in subclasses."""
+
+
+class YoutubeIE(InfoExtractor):
+ """Information extractor for youtube.com."""
+
+ _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
+ _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
+ _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
+ _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
+ _NETRC_MACHINE = 'youtube'
+ # Listed in order of quality
+ _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
+ _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
+ _video_extensions = {
+ '13': '3gp',
+ '17': 'mp4',
+ '18': 'mp4',
+ '22': 'mp4',
+ '37': 'mp4',
+ '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
+ '43': 'webm',
+ '44': 'webm',
+ '45': 'webm',
+ }
+ _video_dimensions = {
+ '5': '240x400',
+ '6': '???',
+ '13': '???',
+ '17': '144x176',
+ '18': '360x640',
+ '22': '720x1280',
+ '34': '360x640',
+ '35': '480x854',
+ '37': '1080x1920',
+ '38': '3072x4096',
+ '43': '360x640',
+ '44': '480x854',
+ '45': '720x1280',
+ }
+ IE_NAME = u'youtube'
+
+ def report_lang(self):
+ """Report attempt to set language."""
+ self._downloader.to_screen(u'[youtube] Setting language')
+
+ def report_login(self):
+ """Report attempt to log in."""
+ self._downloader.to_screen(u'[youtube] Logging in')
+
+ def report_age_confirmation(self):
+ """Report attempt to confirm age."""
+ self._downloader.to_screen(u'[youtube] Confirming age')
+
+ def report_video_webpage_download(self, video_id):
+ """Report attempt to download video webpage."""
+ self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
+
+ def report_video_info_webpage_download(self, video_id):
+ """Report attempt to download video info webpage."""
+ self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
+
+ def report_information_extraction(self, video_id):
+ """Report attempt to extract video information."""
+ self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
+
+ def report_unavailable_format(self, video_id, format):
+ """Report extracted video URL."""
+ self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
+
+ def report_rtmp_download(self):
+ """Indicate the download will use the RTMP protocol."""
+ self._downloader.to_screen(u'[youtube] RTMP download detected')
+
+ def _print_formats(self, formats):
+ print 'Available formats:'
+ for x in formats:
+ print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
+
+ def _real_initialize(self):
+ if self._downloader is None:
+ return
+
+ username = None
+ password = None
+ downloader_params = self._downloader.params
+
+ # Attempt to use provided username and password or .netrc data
+ if downloader_params.get('username', None) is not None:
+ username = downloader_params['username']
+ password = downloader_params['password']
+ elif downloader_params.get('usenetrc', False):
+ try:
+ info = netrc.netrc().authenticators(self._NETRC_MACHINE)
+ if info is not None:
+ username = info[0]
+ password = info[2]
+ else:
+ raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
+ except (IOError, netrc.NetrcParseError), err:
+ self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
+ return
+
+ # Set language
+ request = urllib2.Request(self._LANG_URL)
+ try:
+ self.report_lang()
+ urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
+ return
+
+ # No authentication to be performed
+ if username is None:
+ return
+
+ # Log in
+ login_form = {
+ 'current_form': 'loginForm',
+ 'next': '/',
+ 'action_login': 'Log In',
+ 'username': username,
+ 'password': password,
+ }
+ request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
+ try:
+ self.report_login()
+ login_results = urllib2.urlopen(request).read()
+ if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
+ self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
+ return
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
+ return
+
+ # Confirm age
+ age_form = {
+ 'next_url': '/',
+ 'action_confirm': 'Confirm',
+ }
+ request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
+ try:
+ self.report_age_confirmation()
+ age_results = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
+ return
+
+ def _real_extract(self, url):
+ # Extract video id from URL
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+ return
+ video_id = mobj.group(2)
+
+ # Get video webpage
+ self.report_video_webpage_download(video_id)
+ request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
+ try:
+ video_webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
+ return
+
+ # Attempt to extract SWF player URL
+ mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
+ if mobj is not None:
+ player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
+ else:
+ player_url = None
+
+ # Get video info
+ self.report_video_info_webpage_download(video_id)
+ for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+ video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+ % (video_id, el_type))
+ request = urllib2.Request(video_info_url)
+ try:
+ video_info_webpage = urllib2.urlopen(request).read()
+ video_info = parse_qs(video_info_webpage)
+ if 'token' in video_info:
+ break
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
+ return
+ if 'token' not in video_info:
+ if 'reason' in video_info:
+ self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
+ else:
+ self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
+ return
+
+ # Start extracting information
+ self.report_information_extraction(video_id)
+
+ # uploader
+ if 'author' not in video_info:
+ self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
+ return
+ video_uploader = urllib.unquote_plus(video_info['author'][0])
+
+ # title
+ if 'title' not in video_info:
+ self._downloader.trouble(u'ERROR: unable to extract video title')
+ return
+ video_title = urllib.unquote_plus(video_info['title'][0])
+ video_title = video_title.decode('utf-8')
+ video_title = sanitize_title(video_title)
+
+ # simplified title
+ simple_title = _simplify_title(video_title)
+
+ # thumbnail image
+ if 'thumbnail_url' not in video_info:
+ self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
+ video_thumbnail = ''
+ else: # don't panic if we can't find it
+ video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
+
+ # upload date
+ upload_date = u'NA'
+ mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
+ if mobj is not None:
+ upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
+ format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
+ for expression in format_expressions:
+ try:
+ upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
+ except:
+ pass
+
+ # description
+ try:
+ lxml.etree
+ except NameError:
+ video_description = u'No description available.'
+ if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
+ mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
+ if mobj is not None:
+ video_description = mobj.group(1).decode('utf-8')
+ else:
+ html_parser = lxml.etree.HTMLParser(encoding='utf-8')
+ vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
+ video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
+ # TODO use another parser
+
+ # token
+ video_token = urllib.unquote_plus(video_info['token'][0])
+
+ # Decide which formats to download
+ req_format = self._downloader.params.get('format', None)
+
+ if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
+ self.report_rtmp_download()
+ video_url_list = [(None, video_info['conn'][0])]
+ elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
+ url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
+ url_data = [parse_qs(uds) for uds in url_data_strs]
+ url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
+ url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
+
+ format_limit = self._downloader.params.get('format_limit', None)
+ available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
+ if format_limit is not None and format_limit in available_formats:
+ format_list = available_formats[available_formats.index(format_limit):]
+ else:
+ format_list = available_formats
+ existing_formats = [x for x in format_list if x in url_map]
+ if len(existing_formats) == 0:
+ self._downloader.trouble(u'ERROR: no known formats available for video')
+ return
+ if self._downloader.params.get('listformats', None):
+ self._print_formats(existing_formats)
+ return
+ if req_format is None or req_format == 'best':
+ video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
+ elif req_format == 'worst':
+ video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
+ elif req_format in ('-1', 'all'):
+ video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
+ else:
+ # Specific formats. We pick the first in a slash-delimeted sequence.
+ # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
+ req_formats = req_format.split('/')
+ video_url_list = None
+ for rf in req_formats:
+ if rf in url_map:
+ video_url_list = [(rf, url_map[rf])]
+ break
+ if video_url_list is None:
+ self._downloader.trouble(u'ERROR: requested format not available')
+ return
+ else:
+ self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
+ return
+
+ for format_param, video_real_url in video_url_list:
+ # At this point we have a new video
+ self._downloader.increment_downloads()
+
+ # Extension
+ video_extension = self._video_extensions.get(format_param, 'flv')
+
+ try:
+ # Process video information
+ self._downloader.process_info({
+ 'id': video_id.decode('utf-8'),
+ 'url': video_real_url.decode('utf-8'),
+ 'uploader': video_uploader.decode('utf-8'),
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'stitle': simple_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
+ 'thumbnail': video_thumbnail.decode('utf-8'),
+ 'description': video_description,
+ 'player_url': player_url,
+ })
+ except UnavailableVideoError, err:
+ self._downloader.trouble(u'\nERROR: unable to download video')
+
+
+class MetacafeIE(InfoExtractor):
+ """Information Extractor for metacafe.com."""
+
+ _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
+ _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
+ _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
+ _youtube_ie = None
+ IE_NAME = u'metacafe'
+
+ def __init__(self, youtube_ie, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+ self._youtube_ie = youtube_ie
+
+ def report_disclaimer(self):
+ """Report disclaimer retrieval."""
+ self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
+
+ def report_age_confirmation(self):
+ """Report attempt to confirm age."""
+ self._downloader.to_screen(u'[metacafe] Confirming age')
+
+ def report_download_webpage(self, video_id):
+ """Report webpage download."""
+ self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
+
+ def report_extraction(self, video_id):
+ """Report information extraction."""
+ self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
+
+ def _real_initialize(self):
+ # Retrieve disclaimer
+ request = urllib2.Request(self._DISCLAIMER)
+ try:
+ self.report_disclaimer()
+ disclaimer = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
+ return
+
+ # Confirm age
+ disclaimer_form = {
+ 'filters': '0',
+ 'submit': "Continue - I'm over 18",
+ }
+ request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
+ try:
+ self.report_age_confirmation()
+ disclaimer = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
+ return
+
+ def _real_extract(self, url):
+ # Extract id and simplified title from URL
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+ return
+
+ video_id = mobj.group(1)
+
+ # Check if video comes from YouTube
+ mobj2 = re.match(r'^yt-(.*)$', video_id)
+ if mobj2 is not None:
+ self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
+ return
+
+ # At this point we have a new video
+ self._downloader.increment_downloads()
+
+ simple_title = mobj.group(2).decode('utf-8')
+
+ # Retrieve video webpage to extract further information
+ request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
+ try:
+ self.report_download_webpage(video_id)
+ webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
+ return
+
+ # Extract URL, uploader and title from webpage
+ self.report_extraction(video_id)
+ mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
+ if mobj is not None:
+ mediaURL = urllib.unquote(mobj.group(1))
+ video_extension = mediaURL[-3:]
+
+ # Extract gdaKey if available
+ mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
+ if mobj is None:
+ video_url = mediaURL
+ else:
+ gdaKey = mobj.group(1)
+ video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
+ else:
+ mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract media URL')
+ return
+ vardict = parse_qs(mobj.group(1))
+ if 'mediaData' not in vardict:
+ self._downloader.trouble(u'ERROR: unable to extract media URL')
+ return
+ mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract media URL')
+ return
+ mediaURL = mobj.group(1).replace('\\/', '/')
+ video_extension = mediaURL[-3:]
+ video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
+
+ mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract title')
+ return
+ video_title = mobj.group(1).decode('utf-8')
+ video_title = sanitize_title(video_title)
+
+ mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
+ return
+ video_uploader = mobj.group(1)
+
+ try:
+ # Process video information
+ self._downloader.process_info({
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url.decode('utf-8'),
+ 'uploader': video_uploader.decode('utf-8'),
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'stitle': simple_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ })
+ except UnavailableVideoError:
+ self._downloader.trouble(u'\nERROR: unable to download video')
+
+
+class DailymotionIE(InfoExtractor):
+ """Information Extractor for Dailymotion"""
+
+ _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
+ IE_NAME = u'dailymotion'
+
+ def __init__(self, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+
+ def report_download_webpage(self, video_id):
+ """Report webpage download."""
+ self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
+
+ def report_extraction(self, video_id):
+ """Report information extraction."""
+ self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
+
+ def _real_extract(self, url):
+ # Extract id and simplified title from URL
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+ return
+
+ # At this point we have a new video
+ self._downloader.increment_downloads()
+ video_id = mobj.group(1)
+
+ video_extension = 'flv'
+
+ # Retrieve video webpage to extract further information
+ request = urllib2.Request(url)
+ request.add_header('Cookie', 'family_filter=off')
+ try:
+ self.report_download_webpage(video_id)
+ webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
+ return
+
+ # Extract URL, uploader and title from webpage
+ self.report_extraction(video_id)
+ mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract media URL')
+ return
+ sequence = urllib.unquote(mobj.group(1))
+ mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract media URL')
+ return
+ mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
+
+ # if needed add http://www.dailymotion.com/ if relative URL
+
+ video_url = mediaURL
+
+ mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract title')
+ return
+ video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
+ video_title = sanitize_title(video_title)
+ simple_title = _simplify_title(video_title)
+
+ mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
+ return
+ video_uploader = mobj.group(1)
+
+ try:
+ # Process video information
+ self._downloader.process_info({
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url.decode('utf-8'),
+ 'uploader': video_uploader.decode('utf-8'),
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'stitle': simple_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ })
+ except UnavailableVideoError:
+ self._downloader.trouble(u'\nERROR: unable to download video')
+
+
+class GoogleIE(InfoExtractor):
+ """Information extractor for video.google.com."""
+
+ _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
+ IE_NAME = u'video.google'
+
+ def __init__(self, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+
+ def report_download_webpage(self, video_id):
+ """Report webpage download."""
+ self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
+
+ def report_extraction(self, video_id):
+ """Report information extraction."""
+ self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
+
+ def _real_extract(self, url):
+ # Extract id from URL
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+ return
+
+ # At this point we have a new video
+ self._downloader.increment_downloads()
+ video_id = mobj.group(1)
+
+ video_extension = 'mp4'
+
+ # Retrieve video webpage to extract further information
+ request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
+ try:
+ self.report_download_webpage(video_id)
+ webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+ return
+
+ # Extract URL, uploader, and title from webpage
+ self.report_extraction(video_id)
+ mobj = re.search(r"download_url:'([^']+)'", webpage)
+ if mobj is None:
+ video_extension = 'flv'
+ mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract media URL')
+ return
+ mediaURL = urllib.unquote(mobj.group(1))
+ mediaURL = mediaURL.replace('\\x3d', '\x3d')
+ mediaURL = mediaURL.replace('\\x26', '\x26')
+
+ video_url = mediaURL
+
+ mobj = re.search(r'<title>(.*)</title>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract title')
+ return
+ video_title = mobj.group(1).decode('utf-8')
+ video_title = sanitize_title(video_title)
+ simple_title = _simplify_title(video_title)
+
+ # Extract video description
+ mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video description')
+ return
+ video_description = mobj.group(1).decode('utf-8')
+ if not video_description:
+ video_description = 'No description available.'
+
+ # Extract video thumbnail
+ if self._downloader.params.get('forcethumbnail', False):
+ request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
+ try:
+ webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+ return
+ mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
+ return
+ video_thumbnail = mobj.group(1)
+ else: # we need something to pass to process_info
+ video_thumbnail = ''
+
+ try:
+ # Process video information
+ self._downloader.process_info({
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url.decode('utf-8'),
+ 'uploader': u'NA',
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'stitle': simple_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ })
+ except UnavailableVideoError:
+ self._downloader.trouble(u'\nERROR: unable to download video')
+
+
+class PhotobucketIE(InfoExtractor):
+ """Information extractor for photobucket.com."""
+
+ _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
+ IE_NAME = u'photobucket'
+
+ def __init__(self, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+
+ def report_download_webpage(self, video_id):
+ """Report webpage download."""
+ self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
+
+ def report_extraction(self, video_id):
+ """Report information extraction."""
+ self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
+
+ def _real_extract(self, url):
+ # Extract id from URL
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+ return
+
+ # At this point we have a new video
+ self._downloader.increment_downloads()
+ video_id = mobj.group(1)
+
+ video_extension = 'flv'
+
+ # Retrieve video webpage to extract further information
+ request = urllib2.Request(url)
+ try:
+ self.report_download_webpage(video_id)
+ webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+ return
+
+ # Extract URL, uploader, and title from webpage
+ self.report_extraction(video_id)
+ mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract media URL')
+ return
+ mediaURL = urllib.unquote(mobj.group(1))
+
+ video_url = mediaURL
+
+ mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract title')
+ return
+ video_title = mobj.group(1).decode('utf-8')
+ video_title = sanitize_title(video_title)
+ simple_title = _simplify_title(vide_title)
+
+ video_uploader = mobj.group(2).decode('utf-8')
+
+ try:
+ # Process video information
+ self._downloader.process_info({
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url.decode('utf-8'),
+ 'uploader': video_uploader,
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'stitle': simple_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ })
+ except UnavailableVideoError:
+ self._downloader.trouble(u'\nERROR: unable to download video')
+
+
+class YahooIE(InfoExtractor):
+ """Information extractor for video.yahoo.com."""
+
+ # _VALID_URL matches all Yahoo! Video URLs
+ # _VPAGE_URL matches only the extractable '/watch/' URLs
+ _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
+ _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
+ IE_NAME = u'video.yahoo'
+
+ def __init__(self, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+
+ def report_download_webpage(self, video_id):
+ """Report webpage download."""
+ self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
+
+ def report_extraction(self, video_id):
+ """Report information extraction."""
+ self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
+
+ def _real_extract(self, url, new_video=True):
+ # Extract ID from URL
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+ return
+
+ # At this point we have a new video
+ self._downloader.increment_downloads()
+ video_id = mobj.group(2)
+ video_extension = 'flv'
+
+ # Rewrite valid but non-extractable URLs as
+ # extractable English language /watch/ URLs
+ if re.match(self._VPAGE_URL, url) is None:
+ request = urllib2.Request(url)
+ try:
+ webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+ return
+
+ mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: Unable to extract id field')
+ return
+ yahoo_id = mobj.group(1)
+
+ mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: Unable to extract vid field')
+ return
+ yahoo_vid = mobj.group(1)
+
+ url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
+ return self._real_extract(url, new_video=False)
+
+ # Retrieve video webpage to extract further information
+ request = urllib2.Request(url)
+ try:
+ self.report_download_webpage(video_id)
+ webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+ return
+
+ # Extract uploader and title from webpage
+ self.report_extraction(video_id)
+ mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video title')
+ return
+ video_title = mobj.group(1).decode('utf-8')
+ simple_title = _simplify_title(video_title)
+
+ mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video uploader')
+ return
+ video_uploader = mobj.group(1).decode('utf-8')
+
+ # Extract video thumbnail
+ mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
+ return
+ video_thumbnail = mobj.group(1).decode('utf-8')
+
+ # Extract video description
+ mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video description')
+ return
+ video_description = mobj.group(1).decode('utf-8')
+ if not video_description:
+ video_description = 'No description available.'
+
+ # Extract video height and width
+ mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video height')
+ return
+ yv_video_height = mobj.group(1)
+
+ mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video width')
+ return
+ yv_video_width = mobj.group(1)
+
+ # Retrieve video playlist to extract media URL
+ # I'm not completely sure what all these options are, but we
+ # seem to need most of them, otherwise the server sends a 401.
+ yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
+ yv_bitrate = '700' # according to Wikipedia this is hard-coded
+ request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
+ '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
+ '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
+ try:
+ self.report_download_webpage(video_id)
+ webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+ return
+
+ # Extract media URL from playlist XML
+ mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: Unable to extract media URL')
+ return
+ video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
+ video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
+
+ try:
+ # Process video information
+ self._downloader.process_info({
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url,
+ 'uploader': video_uploader,
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'stitle': simple_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'thumbnail': video_thumbnail.decode('utf-8'),
+ 'description': video_description,
+ 'thumbnail': video_thumbnail,
+ 'player_url': None,
+ })
+ except UnavailableVideoError:
+ self._downloader.trouble(u'\nERROR: unable to download video')
+
+
+class VimeoIE(InfoExtractor):
+ """Information extractor for vimeo.com."""
+
+ # _VALID_URL matches Vimeo URLs
+ _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
+ IE_NAME = u'vimeo'
+
+ def __init__(self, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+
+ def report_download_webpage(self, video_id):
+ """Report webpage download."""
+ self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
+
+ def report_extraction(self, video_id):
+ """Report information extraction."""
+ self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
+
+ def _real_extract(self, url, new_video=True):
+ # Extract ID from URL
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+ return
+
+ # At this point we have a new video
+ self._downloader.increment_downloads()
+ video_id = mobj.group(1)
+
+ # Retrieve video webpage to extract further information
+ request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
+ try:
+ self.report_download_webpage(video_id)
+ webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+ return
+
+ # Now we begin extracting as much information as we can from what we
+ # retrieved. First we extract the information common to all extractors,
+ # and latter we extract those that are Vimeo specific.
+ self.report_extraction(video_id)
+
+ # Extract title
+ mobj = re.search(r'<caption>(.*?)</caption>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video title')
+ return
+ video_title = mobj.group(1).decode('utf-8')
+ simple_title = _simplify_title(video_title)
+
+ # Extract uploader
+ mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video uploader')
+ return
+ video_uploader = mobj.group(1).decode('utf-8')
+
+ # Extract video thumbnail
+ mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
+ return
+ video_thumbnail = mobj.group(1).decode('utf-8')
+
+ # # Extract video description
+ # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
+ # if mobj is None:
+ # self._downloader.trouble(u'ERROR: unable to extract video description')
+ # return
+ # video_description = mobj.group(1).decode('utf-8')
+ # if not video_description: video_description = 'No description available.'
+ video_description = 'Foo.'
+
+ # Vimeo specific: extract request signature
+ mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract request signature')
+ return
+ sig = mobj.group(1).decode('utf-8')
+
+ # Vimeo specific: extract video quality information
+ mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract video quality information')
+ return
+ quality = mobj.group(1).decode('utf-8')
+
+ if int(quality) == 1:
+ quality = 'hd'
+ else:
+ quality = 'sd'
+
+ # Vimeo specific: Extract request signature expiration
+ mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
+ return
+ sig_exp = mobj.group(1).decode('utf-8')
+
+ video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
+
+ try:
+ # Process video information
+ self._downloader.process_info({
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url,
+ 'uploader': video_uploader,
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'stitle': simple_title,
+ 'ext': u'mp4',
+ 'thumbnail': video_thumbnail.decode('utf-8'),
+ 'description': video_description,
+ 'thumbnail': video_thumbnail,
+ 'description': video_description,
+ 'player_url': None,
+ })
+ except UnavailableVideoError:
+ self._downloader.trouble(u'ERROR: unable to download video')
+
+
+class GenericIE(InfoExtractor):
+ """Generic last-resort information extractor."""
+
+ _VALID_URL = r'.*'
+ IE_NAME = u'generic'
+
+ def __init__(self, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+
+ def report_download_webpage(self, video_id):
+ """Report webpage download."""
+ self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
+ self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
+
+ def report_extraction(self, video_id):
+ """Report information extraction."""
+ self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
+
+ def _real_extract(self, url):
+ # At this point we have a new video
+ self._downloader.increment_downloads()
+
+ video_id = url.split('/')[-1]
+ request = urllib2.Request(url)
+ try:
+ self.report_download_webpage(video_id)
+ webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+ return
+ except ValueError, err:
+ # since this is the last-resort InfoExtractor, if
+ # this error is thrown, it'll be thrown here
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+ return
+
+ self.report_extraction(video_id)
+ # Start with something easy: JW Player in SWFObject
+ mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
+ if mobj is None:
+ # Broaden the search a little bit
+ mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+ return
+
+ # It's possible that one of the regexes
+ # matched, but returned an empty group:
+ if mobj.group(1) is None:
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+ return
+
+ video_url = urllib.unquote(mobj.group(1))
+ video_id = os.path.basename(video_url)
+
+ # here's a fun little line of code for you:
+ video_extension = os.path.splitext(video_id)[1][1:]
+ video_id = os.path.splitext(video_id)[0]
+
+ # it's tempting to parse this further, but you would
+ # have to take into account all the variations like
+ # Video Title - Site Name
+ # Site Name | Video Title
+ # Video Title - Tagline | Site Name
+ # and so on and so forth; it's just not practical
+ mobj = re.search(r'<title>(.*)</title>', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract title')
+ return
+ video_title = mobj.group(1).decode('utf-8')
+ video_title = sanitize_title(video_title)
+ simple_title = _simplify_title(video_title)
+
+ # video uploader is domain name
+ mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract title')
+ return
+ video_uploader = mobj.group(1).decode('utf-8')
+
+ try:
+ # Process video information
+ self._downloader.process_info({
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url.decode('utf-8'),
+ 'uploader': video_uploader,
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'stitle': simple_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ })
+ except UnavailableVideoError, err:
+ self._downloader.trouble(u'\nERROR: unable to download video')
+
+
+class YoutubeSearchIE(InfoExtractor):
+ """Information Extractor for YouTube search queries."""
+ _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
+ _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
+ _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
+ _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
+ _youtube_ie = None
+ _max_youtube_results = 1000
+ IE_NAME = u'youtube:search'
+
+ def __init__(self, youtube_ie, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+ self._youtube_ie = youtube_ie
+
+ def report_download_page(self, query, pagenum):
+ """Report attempt to download playlist page with given number."""
+ query = query.decode(preferredencoding())
+ self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
+
+ def _real_initialize(self):
+ self._youtube_ie.initialize()
+
+ def _real_extract(self, query):
+ mobj = re.match(self._VALID_URL, query)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
+ return
+
+ prefix, query = query.split(':')
+ prefix = prefix[8:]
+ query = query.encode('utf-8')
+ if prefix == '':
+ self._download_n_results(query, 1)
+ return
+ elif prefix == 'all':
+ self._download_n_results(query, self._max_youtube_results)
+ return
+ else:
+ try:
+ n = long(prefix)
+ if n <= 0:
+ self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
+ return
+ elif n > self._max_youtube_results:
+ self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
+ n = self._max_youtube_results
+ self._download_n_results(query, n)
+ return
+ except ValueError: # parsing prefix as integer fails
+ self._download_n_results(query, 1)
+ return
+
+ def _download_n_results(self, query, n):
+ """Downloads a specified number of results for a query"""
+
+ video_ids = []
+ already_seen = set()
+ pagenum = 1
+
+ while True:
+ self.report_download_page(query, pagenum)
+ result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
+ request = urllib2.Request(result_url)
+ try:
+ page = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
+ return
+
+ # Extract video identifiers
+ for mobj in re.finditer(self._VIDEO_INDICATOR, page):
+ video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
+ if video_id not in already_seen:
+ video_ids.append(video_id)
+ already_seen.add(video_id)
+ if len(video_ids) == n:
+ # Specified n videos reached
+ for id in video_ids:
+ self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
+ return
+
+ if re.search(self._MORE_PAGES_INDICATOR, page) is None:
+ for id in video_ids:
+ self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
+ return
+
+ pagenum = pagenum + 1
+
+
+class GoogleSearchIE(InfoExtractor):
+ """Information Extractor for Google Video search queries."""
+ _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
+ _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
+ _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
+ _MORE_PAGES_INDICATOR = r'<span>Next</span>'
+ _google_ie = None
+ _max_google_results = 1000
+ IE_NAME = u'video.google:search'
+
+ def __init__(self, google_ie, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+ self._google_ie = google_ie
+
+ def report_download_page(self, query, pagenum):
+ """Report attempt to download playlist page with given number."""
+ query = query.decode(preferredencoding())
+ self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
+
+ def _real_initialize(self):
+ self._google_ie.initialize()
+
+ def _real_extract(self, query):
+ mobj = re.match(self._VALID_URL, query)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
+ return
+
+ prefix, query = query.split(':')
+ prefix = prefix[8:]
+ query = query.encode('utf-8')
+ if prefix == '':
+ self._download_n_results(query, 1)
+ return
+ elif prefix == 'all':
+ self._download_n_results(query, self._max_google_results)
+ return
+ else:
+ try:
+ n = long(prefix)
+ if n <= 0:
+ self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
+ return
+ elif n > self._max_google_results:
+ self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
+ n = self._max_google_results
+ self._download_n_results(query, n)
+ return
+ except ValueError: # parsing prefix as integer fails
+ self._download_n_results(query, 1)
+ return
+
+ def _download_n_results(self, query, n):
+ """Downloads a specified number of results for a query"""
+
+ video_ids = []
+ already_seen = set()
+ pagenum = 1
+
+ while True:
+ self.report_download_page(query, pagenum)
+ result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
+ request = urllib2.Request(result_url)
+ try:
+ page = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
+ return
+
+ # Extract video identifiers
+ for mobj in re.finditer(self._VIDEO_INDICATOR, page):
+ video_id = mobj.group(1)
+ if video_id not in already_seen:
+ video_ids.append(video_id)
+ already_seen.add(video_id)
+ if len(video_ids) == n:
+ # Specified n videos reached
+ for id in video_ids:
+ self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
+ return
+
+ if re.search(self._MORE_PAGES_INDICATOR, page) is None:
+ for id in video_ids:
+ self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
+ return
+
+ pagenum = pagenum + 1
+
+
+class YahooSearchIE(InfoExtractor):
+ """Information Extractor for Yahoo! Video search queries."""
+ _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
+ _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
+ _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
+ _MORE_PAGES_INDICATOR = r'\s*Next'
+ _yahoo_ie = None
+ _max_yahoo_results = 1000
+ IE_NAME = u'video.yahoo:search'
+
+ def __init__(self, yahoo_ie, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+ self._yahoo_ie = yahoo_ie
+
+ def report_download_page(self, query, pagenum):
+ """Report attempt to download playlist page with given number."""
+ query = query.decode(preferredencoding())
+ self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
+
+ def _real_initialize(self):
+ self._yahoo_ie.initialize()
+
+ def _real_extract(self, query):
+ mobj = re.match(self._VALID_URL, query)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
+ return
+
+ prefix, query = query.split(':')
+ prefix = prefix[8:]
+ query = query.encode('utf-8')
+ if prefix == '':
+ self._download_n_results(query, 1)
+ return
+ elif prefix == 'all':
+ self._download_n_results(query, self._max_yahoo_results)
+ return
+ else:
+ try:
+ n = long(prefix)
+ if n <= 0:
+ self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
+ return
+ elif n > self._max_yahoo_results:
+ self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
+ n = self._max_yahoo_results
+ self._download_n_results(query, n)
+ return
+ except ValueError: # parsing prefix as integer fails
+ self._download_n_results(query, 1)
+ return
+
+ def _download_n_results(self, query, n):
+ """Downloads a specified number of results for a query"""
+
+ video_ids = []
+ already_seen = set()
+ pagenum = 1
+
+ while True:
+ self.report_download_page(query, pagenum)
+ result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
+ request = urllib2.Request(result_url)
+ try:
+ page = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
+ return
+
+ # Extract video identifiers
+ for mobj in re.finditer(self._VIDEO_INDICATOR, page):
+ video_id = mobj.group(1)
+ if video_id not in already_seen:
+ video_ids.append(video_id)
+ already_seen.add(video_id)
+ if len(video_ids) == n:
+ # Specified n videos reached
+ for id in video_ids:
+ self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
+ return
+
+ if re.search(self._MORE_PAGES_INDICATOR, page) is None:
+ for id in video_ids:
+ self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
+ return
+
+ pagenum = pagenum + 1
+
+
+class YoutubePlaylistIE(InfoExtractor):
+ """Information Extractor for YouTube playlists."""
+
+ _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
+ _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
+ _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
+ _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
+ _youtube_ie = None
+ IE_NAME = u'youtube:playlist'
+
+ def __init__(self, youtube_ie, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+ self._youtube_ie = youtube_ie
+
+ def report_download_page(self, playlist_id, pagenum):
+ """Report attempt to download playlist page with given number."""
+ self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
+
+ def _real_initialize(self):
+ self._youtube_ie.initialize()
+
+ def _real_extract(self, url):
+ # Extract playlist id
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid url: %s' % url)
+ return
+
+ # Single video case
+ if mobj.group(3) is not None:
+ self._youtube_ie.extract(mobj.group(3))
+ return
+
+ # Download playlist pages
+ # prefix is 'p' as default for playlists but there are other types that need extra care
+ playlist_prefix = mobj.group(1)
+ if playlist_prefix == 'a':
+ playlist_access = 'artist'
+ else:
+ playlist_prefix = 'p'
+ playlist_access = 'view_play_list'
+ playlist_id = mobj.group(2)
+ video_ids = []
+ pagenum = 1
+
+ while True:
+ self.report_download_page(playlist_id, pagenum)
+ url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
+ request = urllib2.Request(url)
+ try:
+ page = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
+ return
+
+ # Extract video identifiers
+ ids_in_page = []
+ for mobj in re.finditer(self._VIDEO_INDICATOR, page):
+ if mobj.group(1) not in ids_in_page:
+ ids_in_page.append(mobj.group(1))
+ video_ids.extend(ids_in_page)
+
+ if re.search(self._MORE_PAGES_INDICATOR, page) is None:
+ break
+ pagenum = pagenum + 1
+
+ playliststart = self._downloader.params.get('playliststart', 1) - 1
+ playlistend = self._downloader.params.get('playlistend', -1)
+ video_ids = video_ids[playliststart:playlistend]
+
+ for id in video_ids:
+ self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
+ return
+
+
+class YoutubeUserIE(InfoExtractor):
+ """Information Extractor for YouTube users."""
+
+ _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
+ _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
+ _GDATA_PAGE_SIZE = 50
+ _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
+ _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
+ _youtube_ie = None
+ IE_NAME = u'youtube:user'
+
+ def __init__(self, youtube_ie, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+ self._youtube_ie = youtube_ie
+
+ def report_download_page(self, username, start_index):
+ """Report attempt to download user page."""
+ self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
+ (username, start_index, start_index + self._GDATA_PAGE_SIZE))
+
+ def _real_initialize(self):
+ self._youtube_ie.initialize()
+
+ def _real_extract(self, url):
+ # Extract username
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid url: %s' % url)
+ return
+
+ username = mobj.group(1)
+
+ # Download video ids using YouTube Data API. Result size per
+ # query is limited (currently to 50 videos) so we need to query
+ # page by page until there are no video ids - it means we got
+ # all of them.
+
+ video_ids = []
+ pagenum = 0
+
+ while True:
+ start_index = pagenum * self._GDATA_PAGE_SIZE + 1
+ self.report_download_page(username, start_index)
+
+ request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
+
+ try:
+ page = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
+ return
+
+ # Extract video identifiers
+ ids_in_page = []
+
+ for mobj in re.finditer(self._VIDEO_INDICATOR, page):
+ if mobj.group(1) not in ids_in_page:
+ ids_in_page.append(mobj.group(1))
+
+ video_ids.extend(ids_in_page)
+
+ # A little optimization - if current page is not
+ # "full", ie. does not contain PAGE_SIZE video ids then
+ # we can assume that this page is the last one - there
+ # are no more ids on further pages - no need to query
+ # again.
+
+ if len(ids_in_page) < self._GDATA_PAGE_SIZE:
+ break
+
+ pagenum += 1
+
+ all_ids_count = len(video_ids)
+ playliststart = self._downloader.params.get('playliststart', 1) - 1
+ playlistend = self._downloader.params.get('playlistend', -1)
+
+ if playlistend == -1:
+ video_ids = video_ids[playliststart:]
+ else:
+ video_ids = video_ids[playliststart:playlistend]
+
+ self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
+ (username, all_ids_count, len(video_ids)))
+
+ for video_id in video_ids:
+ self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
+
+
+class DepositFilesIE(InfoExtractor):
+ """Information extractor for depositfiles.com"""
+
+ _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
+ IE_NAME = u'DepositFiles'
+
+ def __init__(self, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+
+ def report_download_webpage(self, file_id):
+ """Report webpage download."""
+ self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
+
+ def report_extraction(self, file_id):
+ """Report information extraction."""
+ self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
+
+ def _real_extract(self, url):
+ # At this point we have a new file
+ self._downloader.increment_downloads()
+
+ file_id = url.split('/')[-1]
+ # Rebuild url in english locale
+ url = 'http://depositfiles.com/en/files/' + file_id
+
+ # Retrieve file webpage with 'Free download' button pressed
+ free_download_indication = { 'gateway_result' : '1' }
+ request = urllib2.Request(url, urllib.urlencode(free_download_indication))
+ try:
+ self.report_download_webpage(file_id)
+ webpage = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
+ return
+
+ # Search for the real file URL
+ mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
+ if (mobj is None) or (mobj.group(1) is None):
+ # Try to figure out reason of the error.
+ mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
+ if (mobj is not None) and (mobj.group(1) is not None):
+ restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
+ self._downloader.trouble(u'ERROR: %s' % restriction_message)
+ else:
+ self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
+ return
+
+ file_url = mobj.group(1)
+ file_extension = os.path.splitext(file_url)[1][1:]
+
+ # Search for file title
+ mobj = re.search(r'<b title="(.*?)">', webpage)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: unable to extract title')
+ return
+ file_title = mobj.group(1).decode('utf-8')
+
+ try:
+ # Process file information
+ self._downloader.process_info({
+ 'id': file_id.decode('utf-8'),
+ 'url': file_url.decode('utf-8'),
+ 'uploader': u'NA',
+ 'upload_date': u'NA',
+ 'title': file_title,
+ 'stitle': file_title,
+ 'ext': file_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ })
+ except UnavailableVideoError, err:
+ self._downloader.trouble(u'ERROR: unable to download file')
+
+
+class FacebookIE(InfoExtractor):
+ """Information Extractor for Facebook"""
+
+ _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
+ _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
+ _NETRC_MACHINE = 'facebook'
+ _available_formats = ['video', 'highqual', 'lowqual']
+ _video_extensions = {
+ 'video': 'mp4',
+ 'highqual': 'mp4',
+ 'lowqual': 'mp4',
+ }
+ IE_NAME = u'facebook'
+
+ def __init__(self, downloader=None):
+ """Create the extractor, passing the optional downloader on to InfoExtractor."""
+ InfoExtractor.__init__(self, downloader)
+
+ def _reporter(self, message):
+ """Add header and report message."""
+ self._downloader.to_screen(u'[facebook] %s' % message)
+
+ def report_login(self):
+ """Report attempt to log in."""
+ self._reporter(u'Logging in')
+
+ def report_video_webpage_download(self, video_id):
+ """Report attempt to download video webpage."""
+ self._reporter(u'%s: Downloading video webpage' % video_id)
+
+ def report_information_extraction(self, video_id):
+ """Report attempt to extract video information."""
+ self._reporter(u'%s: Extracting video information' % video_id)
+
+ def _parse_page(self, video_webpage):
+ """Extract video information from page"""
+ # General data
+ data = {'title': r'\("video_title", "(.*?)"\)',
+ 'description': r'<div class="datawrap">(.*?)</div>',
+ 'owner': r'\("video_owner_name", "(.*?)"\)',
+ 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
+ }
+ video_info = {}
+ for piece in data.keys():
+ mobj = re.search(data[piece], video_webpage)
+ if mobj is not None:
+ video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
+
+ # Video urls
+ video_urls = {}
+ for fmt in self._available_formats:
+ mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
+ if mobj is not None:
+ # URL is in a Javascript segment inside an escaped Unicode format within
+ # the generally utf-8 page
+ video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
+ video_info['video_urls'] = video_urls
+
+ return video_info
+
+ def _real_initialize(self):
+ """Log in to Facebook when credentials are available.
+
+ Credentials are taken from the downloader's 'username'/'password'
+ parameters or, when 'usenetrc' is set, from the .netrc entry for
+ _NETRC_MACHINE. Nothing is done without a downloader or without
+ credentials. Login problems are reported as warnings on stderr.
+ """
+ # Without a downloader there are no parameters to read credentials from
+ if self._downloader is None:
+ return
+
+ useremail = None
+ password = None
+ downloader_params = self._downloader.params
+
+ # Attempt to use provided username and password or .netrc data
+ if downloader_params.get('username', None) is not None:
+ useremail = downloader_params['username']
+ password = downloader_params['password']
+ elif downloader_params.get('usenetrc', False):
+ try:
+ info = netrc.netrc().authenticators(self._NETRC_MACHINE)
+ if info is not None:
+ # authenticators() yields (login, account, password)
+ useremail = info[0]
+ password = info[2]
+ else:
+ # Raised so that a missing entry is reported by the handler below
+ raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
+ except (IOError, netrc.NetrcParseError), err:
+ self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
+ return
+
+ # No credentials were supplied: skip the login
+ if useremail is None:
+ return
+
+ # Log in
+ login_form = {
+ 'email': useremail,
+ 'pass': password,
+ 'login': 'Log+In'
+ }
+ # Passing form data as the second argument makes this a POST request
+ request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
+ try:
+ self.report_login()
+ login_results = urllib2.urlopen(request).read()
+ # A response that still contains the login form is treated as a failed login
+ if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
+ self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
+ return
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
+ return