_GEO_BYPASS attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor.
Though it won't disable explicit geo restriction bypass based on
- country code provided with geo_bypass_country. (experimental)
+ country code provided with geo_bypass_country.
_GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
countries for this extractor. One of these countries will be used by
geo restriction bypass mechanism right away in order to bypass
- geo restriction, of course, if the mechanism is not disabled. (experimental)
+ geo restriction, of course, if the mechanism is not disabled.
- NB: both these geo attributes are experimental and may change in future
- or be completely removed.
+ _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
+ IP blocks in CIDR notation for this extractor. One of these IP blocks
+ will be used by geo restriction bypass mechanism similarly
+ to _GEO_COUNTRIES.
Finally, the _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests.
_x_forwarded_for_ip = None
_GEO_BYPASS = True
_GEO_COUNTRIES = None
+ _GEO_IP_BLOCKS = None
_WORKING = True
def __init__(self, downloader=None):
def initialize(self):
"""Initializes an instance (authentication, etc)."""
- self._initialize_geo_bypass(self._GEO_COUNTRIES)
+ self._initialize_geo_bypass({
+ 'countries': self._GEO_COUNTRIES,
+ 'ip_blocks': self._GEO_IP_BLOCKS,
+ })
if not self._ready:
self._real_initialize()
self._ready = True
- def _initialize_geo_bypass(self, countries):
+ def _initialize_geo_bypass(self, geo_bypass_context):
"""
Initialize geo restriction bypass mechanism.
HTTP requests.
This method will be used for initial geo bypass mechanism initialization
- during the instance initialization with _GEO_COUNTRIES.
+ during the instance initialization with _GEO_COUNTRIES and
+ _GEO_IP_BLOCKS.
- You may also manually call it from extractor's code if geo countries
+ You may also manually call it from extractor's code if geo bypass
information is not available beforehand (e.g. obtained during
- extraction) or due to some another reason.
+ extraction) or due to some other reason. In this case you should pass
+ this information in geo bypass context passed as first argument. It may
+ contain following fields:
+
+ countries: List of geo unrestricted countries (similar
+ to _GEO_COUNTRIES)
+ ip_blocks: List of geo unrestricted IP blocks in CIDR notation
+ (similar to _GEO_IP_BLOCKS)
+
"""
if not self._x_forwarded_for_ip:
- country_code = self._downloader.params.get('geo_bypass_country', None)
- # If there is no explicit country for geo bypass specified and
- # the extractor is known to be geo restricted let's fake IP
- # as X-Forwarded-For right away.
- if (not country_code and
- self._GEO_BYPASS and
- self._downloader.params.get('geo_bypass', True) and
- countries):
- country_code = random.choice(countries)
- if country_code:
- self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
+
+ # Geo bypass mechanism is explicitly disabled by user
+ if not self._downloader.params.get('geo_bypass', True):
+ return
+
+ if not geo_bypass_context:
+ geo_bypass_context = {}
+
+ # Backward compatibility: previously _initialize_geo_bypass
+ # expected a list of countries, some 3rd party code may still use
+ # it this way
+ if isinstance(geo_bypass_context, (list, tuple)):
+ geo_bypass_context = {
+ 'countries': geo_bypass_context,
+ }
+
+ # The whole point of geo bypass mechanism is to fake IP
+ # as X-Forwarded-For HTTP header based on some IP block or
+ # country code.
+
+ # Path 1: bypassing based on IP block in CIDR notation
+
+ # Explicit IP block specified by user, use it right away
+ # regardless of whether extractor is geo bypassable or not
+ ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
+
+ # Otherwise use random IP block from geo bypass context but only
+ # if extractor is known as geo bypassable
+ if not ip_block:
+ ip_blocks = geo_bypass_context.get('ip_blocks')
+ if self._GEO_BYPASS and ip_blocks:
+ ip_block = random.choice(ip_blocks)
+
+ if ip_block:
+ self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
+ if self._downloader.params.get('verbose', False):
+ self._downloader.to_screen(
+ '[debug] Using fake IP %s as X-Forwarded-For.'
+ % self._x_forwarded_for_ip)
+ return
+
+ # Path 2: bypassing based on country code
+
+ # Explicit country code specified by user, use it right away
+ # regardless of whether extractor is geo bypassable or not
+ country = self._downloader.params.get('geo_bypass_country', None)
+
+ # Otherwise use random country code from geo bypass context but
+ # only if extractor is known as geo bypassable
+ if not country:
+ countries = geo_bypass_context.get('countries')
+ if self._GEO_BYPASS and countries:
+ country = random.choice(countries)
+
+ if country:
+ self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
if self._downloader.params.get('verbose', False):
self._downloader.to_screen(
'[debug] Using fake IP %s (%s) as X-Forwarded-For.'
- % (self._x_forwarded_for_ip, country_code.upper()))
+ % (self._x_forwarded_for_ip, country.upper()))
def extract(self, url):
"""Extracts URL information and returns it in list of dicts."""
content, _ = res
return content
+ def _download_xml_handle(
+ self, url_or_request, video_id, note='Downloading XML',
+ errnote='Unable to download XML', transform_source=None,
+ fatal=True, encoding=None, data=None, headers={}, query={}):
+ """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
+ res = self._download_webpage_handle(
+ url_or_request, video_id, note, errnote, fatal=fatal,
+ encoding=encoding, data=data, headers=headers, query=query)
+ if res is False:
+ return res
+ xml_string, urlh = res
+ return self._parse_xml(
+ xml_string, video_id, transform_source=transform_source,
+ fatal=fatal), urlh
+
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
transform_source=None, fatal=True, encoding=None,
data=None, headers={}, query={}):
"""Return the xml as an xml.etree.ElementTree.Element"""
- xml_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding, data=data, headers=headers, query=query)
- if xml_string is False:
- return xml_string
- return self._parse_xml(
- xml_string, video_id, transform_source=transform_source,
- fatal=fatal)
+ res = self._download_xml_handle(
+ url_or_request, video_id, note=note, errnote=errnote,
+ transform_source=transform_source, fatal=fatal, encoding=encoding,
+ data=data, headers=headers, query=query)
+ return res if res is False else res[0]
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
if transform_source:
else:
self.report_warning(errmsg + str(ve))
- def _download_json(self, url_or_request, video_id,
- note='Downloading JSON metadata',
- errnote='Unable to download JSON metadata',
- transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}):
- json_string = self._download_webpage(
+ def _download_json_handle(
+ self, url_or_request, video_id, note='Downloading JSON metadata',
+ errnote='Unable to download JSON metadata', transform_source=None,
+ fatal=True, encoding=None, data=None, headers={}, query={}):
+ """Return a tuple (JSON object, URL handle)"""
+ res = self._download_webpage_handle(
url_or_request, video_id, note, errnote, fatal=fatal,
encoding=encoding, data=data, headers=headers, query=query)
- if (not fatal) and json_string is False:
- return None
+ if res is False:
+ return res
+ json_string, urlh = res
return self._parse_json(
- json_string, video_id, transform_source=transform_source, fatal=fatal)
+ json_string, video_id, transform_source=transform_source,
+ fatal=fatal), urlh
+
+ def _download_json(
+ self, url_or_request, video_id, note='Downloading JSON metadata',
+ errnote='Unable to download JSON metadata', transform_source=None,
+ fatal=True, encoding=None, data=None, headers={}, query={}):
+ res = self._download_json_handle(
+ url_or_request, video_id, note=note, errnote=errnote,
+ transform_source=transform_source, fatal=fatal, encoding=encoding,
+ data=data, headers=headers, query=query)
+ return res if res is False else res[0]
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
if transform_source:
if isinstance(json_ld, dict):
json_ld = [json_ld]
+ INTERACTION_TYPE_MAP = {
+ 'CommentAction': 'comment',
+ 'AgreeAction': 'like',
+ 'DisagreeAction': 'dislike',
+ 'LikeAction': 'like',
+ 'DislikeAction': 'dislike',
+ 'ListenAction': 'view',
+ 'WatchAction': 'view',
+ 'ViewAction': 'view',
+ }
+
+ def extract_interaction_statistic(e):
+ interaction_statistic = e.get('interactionStatistic')
+ if not isinstance(interaction_statistic, list):
+ return
+ for is_e in interaction_statistic:
+ if not isinstance(is_e, dict):
+ continue
+ if is_e.get('@type') != 'InteractionCounter':
+ continue
+ interaction_type = is_e.get('interactionType')
+ if not isinstance(interaction_type, compat_str):
+ continue
+ interaction_count = int_or_none(is_e.get('userInteractionCount'))
+ if interaction_count is None:
+ continue
+ count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
+ if not count_kind:
+ continue
+ count_key = '%s_count' % count_kind
+ if info.get(count_key) is not None:
+ continue
+ info[count_key] = interaction_count
+
def extract_video_object(e):
assert e['@type'] == 'VideoObject'
info.update({
'height': int_or_none(e.get('height')),
'view_count': int_or_none(e.get('interactionCount')),
})
+ extract_interaction_statistic(e)
for e in json_ld:
- if e.get('@context') == 'http://schema.org':
+ if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
item_type = e.get('@type')
if expected_type is not None and expected_type != item_type:
return info
})
return subtitles
- def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+ def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
xspf = self._download_xml(
- playlist_url, playlist_id, 'Downloading xpsf playlist',
+ xspf_url, playlist_id, 'Downloading xpsf playlist',
'Unable to download xspf manifest', fatal=fatal)
if xspf is False:
return []
- return self._parse_xspf(xspf, playlist_id)
+ return self._parse_xspf(
+ xspf, playlist_id, xspf_url=xspf_url,
+ xspf_base_url=base_url(xspf_url))
- def _parse_xspf(self, playlist, playlist_id):
+ def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
NS_MAP = {
'xspf': 'http://xspf.org/ns/0/',
's1': 'http://static.streamone.nl/player/ns/0',
}
entries = []
- for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+ for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
title = xpath_text(
track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
description = xpath_text(
duration = float_or_none(
xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
- formats = [{
- 'url': location.text,
- 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
- 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
- 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
- } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+ formats = []
+ for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
+ format_url = urljoin(xspf_base_url, location.text)
+ if not format_url:
+ continue
+ formats.append({
+ 'url': format_url,
+ 'manifest_url': xspf_url,
+ 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+ 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+ 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+ })
self._sort_formats(formats)
entries.append({
return entries
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
- res = self._download_webpage_handle(
+ res = self._download_xml_handle(
mpd_url, video_id,
note=note or 'Downloading MPD manifest',
errnote=errnote or 'Failed to download MPD manifest',
fatal=fatal)
if res is False:
return []
- mpd, urlh = res
+ mpd_doc, urlh = res
mpd_base_url = base_url(urlh.geturl())
return self._parse_mpd_formats(
- compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
+ mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
formats_dict=formats_dict, mpd_url=mpd_url)
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
return formats
def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
- res = self._download_webpage_handle(
+ res = self._download_xml_handle(
ism_url, video_id,
note=note or 'Downloading ISM manifest',
errnote=errnote or 'Failed to download ISM manifest',
fatal=fatal)
if res is False:
return []
- ism, urlh = res
+ ism_doc, urlh = res
- return self._parse_ism_formats(
- compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
+ return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
"""
return formats
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
- def absolute_url(video_url):
- return compat_urlparse.urljoin(base_url, video_url)
+ def absolute_url(item_url):
+ return urljoin(base_url, item_url)
def parse_content_type(content_type):
if not content_type:
if src:
_, formats = _media_formats(src, media_type)
media_info['formats'].extend(formats)
- media_info['thumbnail'] = media_attributes.get('poster')
+ media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
if media_content:
for source_tag in re.findall(r'<source[^>]+>', media_content):
source_attributes = extract_attributes(source_tag)