parse_m3u8_attributes,
extract_attributes,
parse_codecs,
+ urljoin,
)
download, lower-case.
"http", "https", "rtsp", "rtmp", "rtmpe",
"m3u8", "m3u8_native" or "http_dash_segments".
- * fragments A list of fragments of the fragmented media,
- with the following entries:
- * "url" (mandatory) - fragment's URL
+ * fragment_base_url
+ Base URL for fragments. Each fragment's path
+ value (if present) will be relative to
+ this URL.
+ * fragments A list of fragments of a fragmented media.
+ Each fragment entry must contain either an url
+ or a path. If an url is present it should be
+ considered by a client. Otherwise both path and
+ fragment_base_url must be present. Here is
+ the list of all potential fields:
+ * "url" - fragment's URL
+ * "path" - fragment's path relative to
+ fragment_base_url
* "duration" (optional, int or float)
* "filesize" (optional, int)
* preference Order number of this format. If this field is
uploader_url: Full URL to a personal webpage of the video uploader.
location: Physical location where the video was filmed.
subtitles: The available subtitles as a dictionary in the format
- {language: subformats}. "subformats" is a list sorted from
- lower to higher preference, each element is a dictionary
- with the "ext" entry and one of:
+ {tag: subformats}. "tag" is usually a language code, and
+ "subformats" is a list sorted from lower to higher
+ preference, each element is a dictionary with the "ext"
+ entry and one of:
* "data": The subtitles file contents
* "url": A URL pointing to the subtitles file
"ext" will be calculated from URL if missing
unique_formats.append(f)
formats[:] = unique_formats
- def _is_valid_url(self, url, video_id, item='video'):
+ def _is_valid_url(self, url, video_id, item='video', headers={}):
url = self._proto_relative_url(url, scheme='http:')
# For now assume non HTTP(S) URLs always valid
if not (url.startswith('http://') or url.startswith('https://')):
return True
try:
- self._request_webpage(url, video_id, 'Checking %s URL' % item)
+ self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
return True
except ExtractorError as e:
if isinstance(e.cause, compat_urllib_error.URLError):
'protocol': entry_protocol,
'preference': preference,
}]
+ audio_in_video_stream = {}
last_info = {}
last_media = {}
for line in m3u8_doc.splitlines():
media = parse_m3u8_attributes(line)
media_type = media.get('TYPE')
if media_type in ('VIDEO', 'AUDIO'):
+ group_id = media.get('GROUP-ID')
media_url = media.get('URI')
if media_url:
format_id = []
- for v in (media.get('GROUP-ID'), media.get('NAME')):
+ for v in (group_id, media.get('NAME')):
if v:
format_id.append(v)
- formats.append({
+ f = {
'format_id': '-'.join(format_id),
'url': format_url(media_url),
'language': media.get('LANGUAGE'),
- 'vcodec': 'none' if media_type == 'AUDIO' else None,
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
- })
+ }
+ if media_type == 'AUDIO':
+ f['vcodec'] = 'none'
+ if group_id and not audio_in_video_stream.get(group_id):
+ audio_in_video_stream[group_id] = False
+ formats.append(f)
else:
# When there is no URI in EXT-X-MEDIA let this tag's
# data be used by regular URI lines below
last_media = media
+ if media_type == 'AUDIO' and group_id:
+ audio_in_video_stream[group_id] = True
elif line.startswith('#') or not line.strip():
continue
else:
'abr': abr,
})
f.update(parse_codecs(last_info.get('CODECS')))
+ if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
+ # TODO: update acodec for audio only formats with the same GROUP-ID
+ f['acodec'] = 'none'
formats.append(f)
last_info = {}
last_media = {}
segment_template = element.find(_add_ns('SegmentTemplate'))
if segment_template is not None:
extract_common(segment_template)
- media_template = segment_template.get('media')
- if media_template:
- ms_info['media_template'] = media_template
+ media = segment_template.get('media')
+ if media:
+ ms_info['media'] = media
initialization = segment_template.get('initialization')
if initialization:
- ms_info['initialization_url'] = initialization
+ ms_info['initialization'] = initialization
else:
extract_Initialization(segment_template)
return ms_info
- def combine_url(base_url, target_url):
- if re.match(r'^https?://', target_url):
- return target_url
- return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
-
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
formats = []
for period in mpd_doc.findall(_add_ns('Period')):
lang = representation_attrib.get('lang')
url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
+ bandwidth = int_or_none(representation_attrib.get('bandwidth'))
f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
'url': base_url,
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
- 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
+ 'tbr': int_or_none(bandwidth, 1000),
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
'fps': int_or_none(representation_attrib.get('frameRate')),
- 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
- 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
}
+ f.update(parse_codecs(representation_attrib.get('codecs')))
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
- if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
- media_template = representation_ms_info['media_template']
- media_template = media_template.replace('$RepresentationID$', representation_id)
- media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
- media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
- media_template.replace('$$', '$')
+ def prepare_template(template_name, identifiers):
+ t = representation_ms_info[template_name]
+ t = t.replace('$RepresentationID$', representation_id)
+ t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
+ t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
+ t.replace('$$', '$')
+ return t
+
+ # @initialization is a regular template like @media one
+ # so it should be handled just the same way (see
+ # https://github.com/rg3/youtube-dl/issues/11605)
+ if 'initialization' in representation_ms_info:
+ initialization_template = prepare_template(
+ 'initialization',
+ # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
+ # $Time$ shall not be included for @initialization thus
+ # only $Bandwidth$ remains
+ ('Bandwidth', ))
+ representation_ms_info['initialization_url'] = initialization_template % {
+ 'Bandwidth': bandwidth,
+ }
+
+ if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
+
+ media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time
representation_ms_info['fragments'] = [{
'url': media_template % {
'Number': segment_number,
- 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
+ 'Bandwidth': bandwidth,
},
'duration': segment_duration,
} for segment_number in range(
def add_segment_url():
segment_url = media_template % {
'Time': segment_time,
- 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
+ 'Bandwidth': bandwidth,
'Number': segment_number,
}
representation_ms_info['fragments'].append({
# Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
# or any YouTube dashsegments video
fragments = []
- s_num = 0
- for segment_url in representation_ms_info['segment_urls']:
- s = representation_ms_info['s'][s_num]
+ segment_index = 0
+ timescale = representation_ms_info['timescale']
+ for s in representation_ms_info['s']:
+ duration = float_or_none(s['d'], timescale)
for r in range(s.get('r', 0) + 1):
fragments.append({
- 'url': segment_url,
- 'duration': float_or_none(s['d'], representation_ms_info['timescale']),
+ 'url': representation_ms_info['segment_urls'][segment_index],
+ 'duration': duration,
})
+ segment_index += 1
representation_ms_info['fragments'] = fragments
# NB: MPD manifest may contain direct URLs to unfragmented media.
# No fragments key is present in this case.
'protocol': 'http_dash_segments',
})
if 'initialization_url' in representation_ms_info:
- initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
+ initialization_url = representation_ms_info['initialization_url']
if not f.get('url'):
f['url'] = initialization_url
f['fragments'].append({'url': initialization_url})
f['fragments'].extend(representation_ms_info['fragments'])
for fragment in f['fragments']:
- fragment['url'] = combine_url(base_url, fragment['url'])
+ fragment['url'] = urljoin(base_url, fragment['url'])
try:
existing_format = next(
fo for fo in formats
})
return formats
- def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'):
+ def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
def absolute_url(video_url):
return compat_urlparse.urljoin(base_url, video_url)
def _media_formats(src, cur_media_type):
full_url = absolute_url(src)
- if determine_ext(full_url) == 'm3u8':
+ ext = determine_ext(full_url)
+ if ext == 'm3u8':
is_plain_url = False
formats = self._extract_m3u8_formats(
full_url, video_id, ext='mp4',
entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
+ elif ext == 'mpd':
+ is_plain_url = False
+ formats = self._extract_mpd_formats(
+ full_url, video_id, mpd_id=mpd_id)
else:
is_plain_url = True
formats = [{
media_tags = [(media_tag, media_type, '')
for media_tag, media_type
in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
- media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
+ media_tags.extend(re.findall(
+ # We only allow video|audio followed by a whitespace or '>'.
+ # Allowing more characters may end up in significant slow down (see
+ # https://github.com/rg3/youtube-dl/issues/11979, example URL:
+ # http://www.porntrex.com/maps/videositemap.xml).
+ r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
for media_tag, media_type, media_content in media_tags:
media_info = {
'formats': [],
entries.append(media_info)
return entries
- def _extract_akamai_formats(self, manifest_url, video_id):
+ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
formats = []
hdcore_sign = 'hdcore=3.7.0'
- f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+ f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+ hds_host = hosts.get('hds')
+ if hds_host:
+ f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
if 'hdcore=' not in f4m_url:
f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
f4m_formats = self._extract_f4m_formats(
for entry in f4m_formats:
entry.update({'extra_param_to_segment_url': hdcore_sign})
formats.extend(f4m_formats)
- m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
+ m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
+ hls_host = hosts.get('hls')
+ if hls_host:
+ m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))