sanitized_Request,
unescapeHTML,
unified_strdate,
+ unified_timestamp,
url_basename,
xpath_element,
xpath_text,
update_Request,
update_url_query,
parse_m3u8_attributes,
+ extract_attributes,
+ parse_codecs,
)
* "height" (optional, int)
* "resolution" (optional, string "{width}x{height}",
deprecated)
+ * "filesize" (optional, int)
thumbnail: Full URL to a video thumbnail image.
description: Full video description.
uploader: Full name of the video uploader.
else:
return res
def _get_netrc_login_info(self, netrc_machine=None):
    """
    Look up credentials in the user's .netrc file.

    netrc_machine selects the .netrc entry; when omitted,
    self._NETRC_MACHINE is used. Returns a (username, password) tuple.
    Both items are None when the 'usenetrc' downloader param is not set
    or when the lookup fails (a warning is reported in the latter case).
    """
    machine = netrc_machine or self._NETRC_MACHINE

    # Nothing to look up unless .netrc usage was requested
    if not self._downloader.params.get('usenetrc', False):
        return (None, None)

    username = password = None
    try:
        info = netrc.netrc().authenticators(machine)
        if info is None:
            raise netrc.NetrcParseError('No authenticators for %s' % machine)
        username, password = info[0], info[2]
    except (IOError, netrc.NetrcParseError) as err:
        self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))

    return (username, password)
+
def _get_login_info(self):
"""
Get the login info as (username, password)
if downloader_params.get('username') is not None:
username = downloader_params['username']
password = downloader_params['password']
- elif downloader_params.get('usenetrc', False):
- try:
- info = netrc.netrc().authenticators(self._NETRC_MACHINE)
- if info is not None:
- username = info[0]
- password = info[2]
- else:
- raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
- except (IOError, netrc.NetrcParseError) as err:
- self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
+ else:
+ username, password = self._get_netrc_login_info()
return (username, password)
[^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
def _og_search_property(self, prop, html, name=None, **kargs):
    """
    Search html for an OpenGraph property and return its unescaped value.

    prop may be a single property name or a list/tuple of names; the
    regexes for every name are tried in the given order. name is the
    display name passed to self._search_regex and defaults to
    'OpenGraph <first property>'. Returns None when the search yields None.
    """
    props = prop if isinstance(prop, (list, tuple)) else [prop]
    if name is None:
        name = 'OpenGraph %s' % props[0]
    patterns = [regex for p in props for regex in self._og_regexes(p)]
    escaped = self._search_regex(patterns, html, name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)
return self._og_search_property('url', html, **kargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """
    Search html for a <meta> tag and return its 'content' group.

    name may be a single meta name or a list/tuple of alternatives, each
    turned into a regex by self._meta_regex. display_name defaults to the
    first name. The lookup itself is delegated to self._html_search_regex.
    """
    names = name if isinstance(name, (list, tuple)) else [name]
    if display_name is None:
        display_name = names[0]
    meta_regexes = list(map(self._meta_regex, names))
    return self._html_search_regex(
        meta_regexes, html, display_name,
        fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
return self._html_search_meta('twitter:player', html,
'twitter card player')
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
    """
    Find a JSON-LD <script> block in html and extract info from it.

    Extra keyword arguments are forwarded to self._search_regex. When no
    JSON-LD is found, the supplied 'default' is returned (or an empty dict
    if none was given); otherwise the result of self._json_ld is returned,
    restricted to expected_type when one is given.
    """
    json_ld = self._search_regex(
        r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
        html, 'JSON-LD', group='json_ld', **kwargs)
    default = kwargs.get('default', NO_DEFAULT)
    if not json_ld:
        return default if default is not NO_DEFAULT else {}
    # JSON-LD may be malformed and thus `fatal` should be respected.
    # At the same time `default` may be passed that assumes `fatal=False`
    # for _search_regex. Let's simulate the same behavior here as well.
    # NO_DEFAULT is checked by identity (as above) so that a caller-supplied
    # default with a custom __eq__ can never be mistaken for the sentinel.
    fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
    return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
+
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """
    Build a partial info dict from schema.org JSON-LD data.

    json_ld may be a string (parsed with self._parse_json, passing fatal
    through), a dict, or a list/tuple of entries. The first dict entry
    whose '@context' is 'http://schema.org' and, when expected_type is
    given, whose '@type' equals it, is used; the TVEpisode, Article and
    VideoObject types are understood. Keys whose value is None are dropped.
    Returns an empty dict when nothing usable is found.
    """
    if isinstance(json_ld, compat_str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    info = {}
    if not isinstance(json_ld, (list, tuple, dict)):
        return info
    if isinstance(json_ld, dict):
        json_ld = [json_ld]
    for e in json_ld:
        # A JSON-LD list may carry non-object members; they have no .get()
        if not isinstance(e, dict):
            continue
        if e.get('@context') != 'http://schema.org':
            continue
        item_type = e.get('@type')
        if expected_type is not None and expected_type != item_type:
            # Keep scanning: a later entry may be of the expected type
            continue
        if item_type == 'TVEpisode':
            info.update({
                'episode': unescapeHTML(e.get('name')),
                'episode_number': int_or_none(e.get('episodeNumber')),
                'description': unescapeHTML(e.get('description')),
            })
            part_of_season = e.get('partOfSeason')
            if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
            part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
            if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                info['series'] = unescapeHTML(part_of_series.get('name'))
        elif item_type == 'Article':
            info.update({
                'timestamp': parse_iso8601(e.get('datePublished')),
                'title': unescapeHTML(e.get('headline')),
                'description': unescapeHTML(e.get('articleBody')),
            })
        elif item_type == 'VideoObject':
            info.update({
                'url': e.get('contentUrl'),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': e.get('thumbnailUrl'),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
            })
        # Only the first accepted schema.org entry is considered
        break
    return dict((k, v) for k, v in info.items() if v is not None)
@staticmethod
f['ext'] = determine_ext(f['url'])
if isinstance(field_preference, (list, tuple)):
- return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+ return tuple(
+ f.get(field)
+ if f.get(field) is not None
+ else ('' if field == 'format_id' else -1)
+ for field in field_preference)
preference = f.get('preference')
if preference is None:
if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
preference -= 0.5
- proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
+ protocol = f.get('protocol') or determine_protocol(f)
+ proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
if f.get('vcodec') == 'none': # audio only
preference -= 50
'url': m3u8_url,
'ext': ext,
'protocol': 'm3u8',
- 'preference': preference - 1 if preference else -1,
+ 'preference': preference - 100 if preference else -100,
'resolution': 'multiple',
'format_note': 'Quality selection URL',
}
'url': format_url(line.strip()),
'tbr': tbr,
'ext': ext,
+ 'fps': float_or_none(last_info.get('FRAME-RATE')),
'protocol': entry_protocol,
'preference': preference,
}
width_str, height_str = resolution.split('x')
f['width'] = int(width_str)
f['height'] = int(height_str)
- codecs = last_info.get('CODECS')
- if codecs:
- vcodec, acodec = [None] * 2
- va_codecs = codecs.split(',')
- if len(va_codecs) == 1:
- # Audio only entries usually come with single codec and
- # no resolution. For more robustness we also check it to
- # be mp4 audio.
- if not resolution and va_codecs[0].startswith('mp4a'):
- vcodec, acodec = 'none', va_codecs[0]
- else:
- vcodec = va_codecs[0]
- else:
- vcodec, acodec = va_codecs[:2]
+ # Unified Streaming Platform
+ mobj = re.search(
+ r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
+ if mobj:
+ abr, vbr = mobj.groups()
+ abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
f.update({
- 'acodec': acodec,
- 'vcodec': vcodec,
+ 'vbr': vbr,
+ 'abr': abr,
})
+ f.update(parse_codecs(last_info.get('CODECS')))
if last_media is not None:
f['m3u8_media'] = last_media
last_media = None
compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
+ """
+ Parse formats from MPD manifest.
+ References:
+ 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
+ http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
+ 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
+ """
if mpd_doc.get('type') == 'dynamic':
return []
s_e = segment_timeline.findall(_add_ns('S'))
if s_e:
ms_info['total_number'] = 0
+ ms_info['s'] = []
for s in s_e:
- ms_info['total_number'] += 1 + int(s.get('r', '0'))
+ r = int(s.get('r', 0))
+ ms_info['total_number'] += 1 + r
+ ms_info['s'].append({
+ 't': int(s.get('t', 0)),
+ # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
+ 'd': int(s.attrib['d']),
+ 'r': r,
+ })
else:
timescale = segment_template.get('timescale')
if timescale:
continue
representation_attrib = adaptation_set.attrib.copy()
representation_attrib.update(representation.attrib)
- # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
+ # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
mime_type = representation_attrib['mimeType']
content_type = mime_type.split('/')[0]
if content_type == 'text':
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
media_template = representation_ms_info['media_template']
media_template = media_template.replace('$RepresentationID$', representation_id)
- media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
- media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
+ media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
+ media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
media_template.replace('$$', '$')
- representation_ms_info['segment_urls'] = [
- media_template % {
- 'Number': segment_number,
- 'Bandwidth': representation_attrib.get('bandwidth')}
- for segment_number in range(
- representation_ms_info['start_number'],
- representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+
+ # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
+ # can't be used at the same time
+ if '%(Number' in media_template:
+ representation_ms_info['segment_urls'] = [
+ media_template % {
+ 'Number': segment_number,
+ 'Bandwidth': representation_attrib.get('bandwidth'),
+ }
+ for segment_number in range(
+ representation_ms_info['start_number'],
+ representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+ else:
+ representation_ms_info['segment_urls'] = []
+ segment_time = 0
+
+ def add_segment_url():
+ representation_ms_info['segment_urls'].append(
+ media_template % {
+ 'Time': segment_time,
+ 'Bandwidth': representation_attrib.get('bandwidth'),
+ }
+ )
+
+ for num, s in enumerate(representation_ms_info['s']):
+ segment_time = s.get('t') or segment_time
+ add_segment_url()
+ for r in range(s.get('r', 0)):
+ segment_time += s['d']
+ add_segment_url()
+ segment_time += s['d']
if 'segment_urls' in representation_ms_info:
f.update({
'segment_urls': representation_ms_info['segment_urls'],
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
def _parse_html5_media_entries(self, base_url, webpage):
    """
    Collect media described by HTML5 <video>/<audio> elements in webpage.

    base_url is used to resolve relative URLs found in the markup.
    Returns a list of dicts, one per media element that yielded at least
    one format, each with 'formats', 'subtitles' and 'thumbnail' keys.
    """
    def absolute_url(video_url):
        return compat_urlparse.urljoin(base_url, video_url)

    def parse_content_type(content_type):
        # Turn a <source type="..."> value into ext/codec format fields
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if ctr:
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)
            return f
        return {}

    entries = []
    for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)
        src = media_attributes.get('src')
        if src:
            media_info['formats'].append({
                'url': absolute_url(src),
                'vcodec': 'none' if media_type == 'audio' else None,
            })
        # The thumbnail is expected to be a full URL, so a relative poster
        # is resolved against base_url just like the media URLs are
        poster = media_attributes.get('poster')
        if poster:
            poster = absolute_url(poster)
        media_info['thumbnail'] = poster
        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                source_attributes = extract_attributes(source_tag)
                src = source_attributes.get('src')
                if not src:
                    continue
                f = parse_content_type(source_attributes.get('type'))
                f.update({
                    'url': absolute_url(src),
                    'vcodec': 'none' if media_type == 'audio' else None,
                })
                media_info['formats'].append(f)
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                # Tracks without a kind are treated as subtitles as well
                if not kind or kind == 'subtitles':
                    src = track_attributes.get('src')
                    if not src:
                        continue
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
                    })
        # Elements without any playable source are not reported
        if media_info['formats']:
            entries.append(media_info)
    return entries
+
def _live_title(self, name):
""" Generate the title for a live video """
now = datetime.datetime.now()
any_restricted = False
for tc in self.get_testcases(include_onlymatching=False):
- if 'playlist' in tc:
+ if tc.get('playlist', []):
tc = tc['playlist'][0]
is_restricted = age_restricted(
tc.get('info_dict', {}).get('age_limit'), age_limit)
def _mark_watched(self, *args, **kwargs):
    """Placeholder hook: always raises NotImplementedError; subclasses must override it."""
    message = 'This method must be implemented by subclasses'
    raise NotImplementedError(message)
def geo_verification_headers(self):
    """
    Return extra HTTP headers for requests that should use the geo
    verification proxy.

    The 'Ytdl-request-proxy' header is set to the 'geo_verification_proxy'
    downloader param when that param has a truthy value; otherwise an
    empty dict is returned.
    """
    proxy = self._downloader.params.get('geo_verification_proxy')
    return {'Ytdl-request-proxy': proxy} if proxy else {}
+
class SearchInfoExtractor(InfoExtractor):
"""