compat_cookies,
compat_etree_fromstring,
compat_getpass,
+ compat_integer_types,
compat_http_client,
compat_os_name,
compat_str,
GeoUtils,
int_or_none,
js_to_json,
+ JSON_LD_RE,
mimetype2ext,
orderedSet,
parse_codecs,
update_url_query,
urljoin,
url_basename,
+ url_or_none,
xpath_element,
xpath_text,
xpath_with_ns,
If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
uploader_url: Full URL to a personal webpage of the video uploader.
+ channel: Full name of the channel the video is uploaded on.
+ Note that channel fields may or may not repeat uploader
+ fields. This depends on a particular extractor.
+ channel_id: Id of the channel.
+ channel_url: Full URL to a channel webpage.
location: Physical location where the video was filmed.
subtitles: The available subtitles as a dictionary in the format
{tag: subformats}. "tag" is usually a language code, and
def IE_NAME(self):
return compat_str(type(self).__name__[:-2])
- def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
- """ Returns the response handle """
+ @staticmethod
+ def __can_accept_status_code(err, expected_status):
+ assert isinstance(err, compat_urllib_error.HTTPError)
+ if expected_status is None:
+ return False
+ if isinstance(expected_status, compat_integer_types):
+ return err.code == expected_status
+ elif isinstance(expected_status, (list, tuple)):
+ return err.code in expected_status
+ elif callable(expected_status):
+ return expected_status(err.code) is True
+ else:
+ assert False
+
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
+ """
+ Return the response handle.
+
+ See _download_webpage docstring for arguments specification.
+ """
if note is None:
self.report_download_webpage(video_id)
elif note is not False:
try:
return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ if isinstance(err, compat_urllib_error.HTTPError):
+ if self.__can_accept_status_code(err, expected_status):
+ # Retain reference to error to prevent file object from
+ # being closed before it can be read. Works around the
+ # effects of <https://bugs.python.org/issue15002>
+ # introduced in Python 3.4.1.
+ err.fp._error = err
+ return err.fp
+
if errnote is False:
return False
if errnote is None:
self._downloader.report_warning(errmsg)
return False
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
- """ Returns a tuple (page content as string, URL handle) """
+ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
+ """
+ Return a tuple (page content as string, URL handle).
+
+ See _download_webpage docstring for arguments specification.
+ """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
- urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
if urlh is False:
assert not fatal
return False
return content
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
- """ Returns the data of the page as a string """
+ def _download_webpage(
+ self, url_or_request, video_id, note=None, errnote=None,
+ fatal=True, tries=1, timeout=5, encoding=None, data=None,
+ headers={}, query={}, expected_status=None):
+ """
+ Return the data of the page as a string.
+
+ Arguments:
+ url_or_request -- plain text URL as a string or
+ a compat_urllib_request.Requestobject
+ video_id -- Video/playlist/item identifier (string)
+
+ Keyword arguments:
+ note -- note printed before downloading (string)
+ errnote -- note printed in case of an error (string)
+ fatal -- flag denoting whether error should be considered fatal,
+ i.e. whether it should cause ExtractionError to be raised,
+ otherwise a warning will be reported and extraction continued
+ tries -- number of tries
+ timeout -- sleep interval between tries
+ encoding -- encoding for a page content decoding, guessed automatically
+ when not explicitly specified
+ data -- POST data (bytes)
+ headers -- HTTP headers (dict)
+ query -- URL query (dict)
+ expected_status -- allows to accept failed HTTP requests (non 2xx
+ status code) by explicitly specifying a set of accepted status
+ codes. Can be any of the following entities:
+ - an integer type specifying an exact failed status code to
+ accept
+ - a list or a tuple of integer types specifying a list of
+ failed status codes to accept
+ - a callable accepting an actual failed status code and
+ returning True if it should be accepted
+ Note that this argument does not affect success status codes (2xx)
+ which are always accepted.
+ """
+
success = False
try_count = 0
while success is False:
try:
- res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
+ res = self._download_webpage_handle(
+ url_or_request, video_id, note, errnote, fatal,
+ encoding=encoding, data=data, headers=headers, query=query,
+ expected_status=expected_status)
success = True
except compat_http_client.IncompleteRead as e:
try_count += 1
def _download_xml_handle(
self, url_or_request, video_id, note='Downloading XML',
errnote='Unable to download XML', transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}):
- """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
+ fatal=True, encoding=None, data=None, headers={}, query={},
+ expected_status=None):
+ """
+ Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
+
+ See _download_webpage docstring for arguments specification.
+ """
res = self._download_webpage_handle(
url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding, data=data, headers=headers, query=query)
+ encoding=encoding, data=data, headers=headers, query=query,
+ expected_status=expected_status)
if res is False:
return res
xml_string, urlh = res
xml_string, video_id, transform_source=transform_source,
fatal=fatal), urlh
- def _download_xml(self, url_or_request, video_id,
- note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True, encoding=None,
- data=None, headers={}, query={}):
- """Return the xml as an xml.etree.ElementTree.Element"""
+ def _download_xml(
+ self, url_or_request, video_id,
+ note='Downloading XML', errnote='Unable to download XML',
+ transform_source=None, fatal=True, encoding=None,
+ data=None, headers={}, query={}, expected_status=None):
+ """
+ Return the xml as an xml.etree.ElementTree.Element.
+
+ See _download_webpage docstring for arguments specification.
+ """
res = self._download_xml_handle(
url_or_request, video_id, note=note, errnote=errnote,
transform_source=transform_source, fatal=fatal, encoding=encoding,
- data=data, headers=headers, query=query)
+ data=data, headers=headers, query=query,
+ expected_status=expected_status)
return res if res is False else res[0]
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
def _download_json_handle(
self, url_or_request, video_id, note='Downloading JSON metadata',
errnote='Unable to download JSON metadata', transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}):
- """Return a tuple (JSON object, URL handle)"""
+ fatal=True, encoding=None, data=None, headers={}, query={},
+ expected_status=None):
+ """
+ Return a tuple (JSON object, URL handle).
+
+ See _download_webpage docstring for arguments specification.
+ """
res = self._download_webpage_handle(
url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding, data=data, headers=headers, query=query)
+ encoding=encoding, data=data, headers=headers, query=query,
+ expected_status=expected_status)
if res is False:
return res
json_string, urlh = res
def _download_json(
self, url_or_request, video_id, note='Downloading JSON metadata',
errnote='Unable to download JSON metadata', transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}):
+ fatal=True, encoding=None, data=None, headers={}, query={},
+ expected_status=None):
+ """
+ Return the JSON object as a dict.
+
+ See _download_webpage docstring for arguments specification.
+ """
res = self._download_json_handle(
url_or_request, video_id, note=note, errnote=errnote,
transform_source=transform_source, fatal=fatal, encoding=encoding,
- data=data, headers=headers, query=query)
+ data=data, headers=headers, query=query,
+ expected_status=expected_status)
return res if res is False else res[0]
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
json_ld = self._search_regex(
- r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
- html, 'JSON-LD', group='json_ld', **kwargs)
+ JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
default = kwargs.get('default', NO_DEFAULT)
if not json_ld:
return default if default is not NO_DEFAULT else {}
def extract_video_object(e):
assert e['@type'] == 'VideoObject'
info.update({
- 'url': e.get('contentUrl'),
+ 'url': url_or_none(e.get('contentUrl')),
'title': unescapeHTML(e.get('name')),
'description': unescapeHTML(e.get('description')),
- 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
+ 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
'duration': parse_duration(e.get('duration')),
'timestamp': unified_timestamp(e.get('uploadDate')),
'filesize': float_or_none(e.get('contentSize')),
if expected_type is not None and expected_type != item_type:
return info
if item_type in ('TVEpisode', 'Episode'):
+ episode_name = unescapeHTML(e.get('name'))
info.update({
- 'episode': unescapeHTML(e.get('name')),
+ 'episode': episode_name,
'episode_number': int_or_none(e.get('episodeNumber')),
'description': unescapeHTML(e.get('description')),
})
+ if not info.get('title') and episode_name:
+ info['title'] = episode_name
part_of_season = e.get('partOfSeason')
if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
info['series'] = unescapeHTML(part_of_series.get('name'))
+ elif item_type == 'Movie':
+ info.update({
+ 'title': unescapeHTML(e.get('name')),
+ 'description': unescapeHTML(e.get('description')),
+ 'duration': parse_duration(e.get('duration')),
+ 'timestamp': unified_timestamp(e.get('dateCreated')),
+ })
elif item_type in ('Article', 'NewsArticle'):
info.update({
'timestamp': parse_iso8601(e.get('datePublished')),
# However, this is not always respected, for example, [2]
# contains EXT-X-STREAM-INF tag which references AUDIO
# rendition group but does not have CODECS and despite
- # referencing audio group an audio group, it represents
- # a complete (with audio and video) format. So, for such cases
- # we will ignore references to rendition groups and treat them
+ # referencing an audio group it represents a complete
+ # (with audio and video) format. So, for such cases we will
+ # ignore references to rendition groups and treat them
# as complete formats.
if audio_group_id and codecs and f.get('vcodec') != 'none':
audio_group = groups.get(audio_group_id)
'height': height,
})
formats.extend(m3u8_formats)
- continue
-
- if src_ext == 'f4m':
+ elif src_ext == 'f4m':
f4m_url = src_url
if not f4m_params:
f4m_params = {
f4m_url += '&' if '?' in f4m_url else '?'
f4m_url += compat_urllib_parse_urlencode(f4m_params)
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
- continue
-
- if src_url.startswith('http') and self._is_valid_url(src, video_id):
+ elif src_ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src_url, video_id, mpd_id='dash', fatal=False))
+ elif re.search(r'\.ism/[Mm]anifest', src_url):
+ formats.extend(self._extract_ism_formats(
+ src_url, video_id, ism_id='mss', fatal=False))
+ elif src_url.startswith('http') and self._is_valid_url(src, video_id):
http_count += 1
formats.append({
'url': src_url,
'width': width,
'height': height,
})
- continue
return formats
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers):
- t = representation_ms_info[template_name]
+ tmpl = representation_ms_info[template_name]
+ # First of, % characters outside $...$ templates
+ # must be escaped by doubling for proper processing
+ # by % operator string formatting used further (see
+ # https://github.com/rg3/youtube-dl/issues/16867).
+ t = ''
+ in_template = False
+ for c in tmpl:
+ t += c
+ if c == '$':
+ in_template = not in_template
+ elif c == '%' and not in_template:
+ t += c
+ # Next, $...$ templates are translated to their
+ # %(...) counterparts to be used with % operator
t = t.replace('$RepresentationID$', representation_id)
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
media_info['subtitles'].setdefault(lang, []).append({
'url': absolute_url(src),
})
+ for f in media_info['formats']:
+ f.setdefault('http_headers', {})['Referer'] = base_url
if media_info['formats'] or media_info['subtitles']:
entries.append(media_info)
return entries