import re
import socket
import sys
+import netrc
from ..utils import (
compat_http_client,
clean_html,
compiled_regex_type,
ExtractorError,
+ unescapeHTML,
)
class InfoExtractor(object):
The following fields are optional:
format: The video format, defaults to ext (used for --get-format)
+ thumbnails: A list of dictionaries (with the entries "resolution" and
+ "url") for the varying thumbnails
thumbnail: Full URL to a video thumbnail image.
description: One-line video description.
uploader: Full name of the video uploader.
uploader_id: Nickname or id of the video uploader.
location: Physical location of the video.
player_url: SWF Player URL (used for rtmpdump).
- subtitles: The subtitle file contents.
+ subtitles: The subtitle file contents as a dictionary in the format
+ {language: subtitles}.
view_count: How many users have watched the video on the platform.
urlhandle: [internal] The urlHandle to be used to download the file,
as returned by urllib.request.urlopen
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
- return re.match(cls._VALID_URL, url) is not None
+
+ # This does not use hasattr/getattr intentionally - we want to know whether
+ # we have cached the regexp for *this* class, whereas hasattr/getattr would
+ # also match a regexp cached on a superclass
+ if '_VALID_URL_RE' not in cls.__dict__:
+ cls._VALID_URL_RE = re.compile(cls._VALID_URL)
+ return cls._VALID_URL_RE.match(url) is not None
@classmethod
def working(cls):
"""Real extraction process. Redefine in subclasses."""
pass
+ @classmethod
+ def ie_key(cls):
+ """A string for getting the InfoExtractor with get_info_extractor"""
+ return cls.__name__[:-2]
+
@property
def IE_NAME(self):
return type(self).__name__[:-2]
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if errnote is None:
errnote = u'Unable to download webpage'
- raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
+ raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
""" Returns a tuple (page content as string, URL handle) """
+
+ # Strip hashes from the URL (#1038)
+ if isinstance(url_or_request, (compat_str, str)):
+ url_or_request = url_or_request.partition('#')[0]
+
urlh = self._request_webpage(url_or_request, video_id, note, errnote)
content_type = urlh.headers.get('Content-Type', '')
+ webpage_bytes = urlh.read()
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
if m:
encoding = m.group(1)
else:
- encoding = 'utf-8'
- webpage_bytes = urlh.read()
+ m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
+ webpage_bytes[:1024])
+ if m:
+ encoding = m.group(1).decode('ascii')
+ else:
+ encoding = 'utf-8'
if self._downloader.params.get('dump_intermediate_pages', False):
try:
url = url_or_request.get_full_url()
"""Report attempt to confirm age."""
self.to_screen(u'Confirming age')
+ def report_login(self):
+ """Report attempt to log in."""
+ self.to_screen(u'Logging in')
+
#Methods for following #608
- #They set the correct value of the '_type' key
- def video_result(self, video_info):
- """Returns a video"""
- video_info['_type'] = 'video'
- return video_info
def url_result(self, url, ie=None):
"""Returns a url that points to a page that should be processed"""
#TODO: ie should be the class used for getting the info
else:
return res
+ def _get_login_info(self):
+ """
+ Get the login info as (username, password)
+ It will look in the netrc file using the _NETRC_MACHINE value
+ If there's no info available, return (None, None)
+ """
+ if self._downloader is None:
+ return (None, None)
+
+ username = None
+ password = None
+ downloader_params = self._downloader.params
+
+ # Attempt to use provided username and password or .netrc data
+ if downloader_params.get('username', None) is not None:
+ username = downloader_params['username']
+ password = downloader_params['password']
+ elif downloader_params.get('usenetrc', False):
+ try:
+ info = netrc.netrc().authenticators(self._NETRC_MACHINE)
+ if info is not None:
+ username = info[0]
+ password = info[2]
+ else:
+ raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
+ except (IOError, netrc.NetrcParseError) as err:
+ self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
+
+ return (username, password)
+
+ # Helper functions for extracting OpenGraph info
+ @staticmethod
+ def _og_regex(prop):
+ return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
+
+ def _og_search_property(self, prop, html, name=None, **kargs):
+ if name is None:
+ name = 'OpenGraph %s' % prop
+ escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+ return unescapeHTML(escaped)
+
+ def _og_search_thumbnail(self, html, **kargs):
+ return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
+
+ def _og_search_description(self, html, **kargs):
+ return self._og_search_property('description', html, fatal=False, **kargs)
+
+ def _og_search_title(self, html, **kargs):
+ return self._og_search_property('title', html, **kargs)
+
+ def _og_search_video_url(self, html, name='video url', **kargs):
+ return self._html_search_regex([self._og_regex('video:secure_url'),
+ self._og_regex('video')],
+ html, name, **kargs)
+
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.