+ 'filesize': filesize,
+ 'width': width,
+ 'height': height,
+ })
+ if transform_rtmp_url:
+ streamer, src = transform_rtmp_url(streamer, src)
+ formats[-1].update({
+ 'url': streamer,
+ 'play_path': src,
+ })
+ continue
+
+ src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+ src_url = src_url.strip()
+
+ if proto == 'm3u8' or src_ext == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+ if len(m3u8_formats) == 1:
+ m3u8_count += 1
+ m3u8_formats[0].update({
+ 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
+ 'tbr': bitrate,
+ 'width': width,
+ 'height': height,
+ })
+ formats.extend(m3u8_formats)
+ elif src_ext == 'f4m':
+ f4m_url = src_url
+ if not f4m_params:
+ f4m_params = {
+ 'hdcore': '3.2.0',
+ 'plugin': 'flowplayer-3.2.0.1',
+ }
+ f4m_url += '&' if '?' in f4m_url else '?'
+ f4m_url += compat_urllib_parse_urlencode(f4m_params)
+ formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
+ elif src_ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src_url, video_id, mpd_id='dash', fatal=False))
+ elif re.search(r'\.ism/[Mm]anifest', src_url):
+ formats.extend(self._extract_ism_formats(
+ src_url, video_id, ism_id='mss', fatal=False))
+ elif src_url.startswith('http') and self._is_valid_url(src, video_id):
+ http_count += 1
+ formats.append({
+ 'url': src_url,
+ 'ext': ext or src_ext or 'flv',
+ 'format_id': 'http-%d' % (bitrate or http_count),
+ 'tbr': bitrate,
+ 'filesize': filesize,
+ 'width': width,
+ 'height': height,
+ })
+
+ return formats
+
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Collect subtitle tracks from the ``textstream`` elements of a SMIL doc.

    Every ``textstream`` found anywhere below ``smil`` contributes one entry,
    except elements without a ``src`` and elements whose ``src`` was already
    seen earlier in the document.

    The extension is taken from the ``ext`` attribute, then from the
    ``type`` attribute (via ``mimetype2ext``), then from the URL itself
    (via ``determine_ext``).  The language is the first non-empty value of
    ``systemLanguage``, ``systemLanguageName`` and ``lang``; when none is
    set, ``subtitles_lang`` is used.

    Returns a dict mapping language to a list of ``{'url', 'ext'}`` dicts.
    """
    seen_sources = set()
    tracks_by_lang = {}
    for stream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        source = stream.get('src')
        if not source or source in seen_sources:
            continue
        seen_sources.add(source)

        extension = stream.get('ext')
        if not extension:
            extension = mimetype2ext(stream.get('type')) or determine_ext(source)

        # First non-empty language attribute wins; otherwise use the default.
        language = subtitles_lang
        for attribute in ('systemLanguage', 'systemLanguageName', 'lang'):
            candidate = stream.get(attribute)
            if candidate:
                language = candidate
                break

        if language not in tracks_by_lang:
            tracks_by_lang[language] = []
        tracks_by_lang[language].append({
            'url': source,
            'ext': extension,
        })
    return tracks_by_lang
+
def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
    """Download an XSPF playlist and return its tracks as entry dicts.

    :param xspf_url: URL of the XSPF document; also used (through
        ``base_url``) as the base against which track locations are resolved.
    :param playlist_id: identifier passed to the downloader and to
        ``_parse_xspf``.
    :param fatal: forwarded to ``_download_xml``.
    :return: whatever ``_parse_xspf`` returns, or ``[]`` when the download
        yielded no document.
    """
    xspf = self._download_xml(
        xspf_url, playlist_id, 'Downloading xspf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    # ``False`` signals a failed download.  ``None`` is treated the same way
    # (as _extract_mpd_formats does for its manifest) so that a missing
    # document never reaches ``_parse_xspf``, which calls ``.findall`` on it.
    if xspf is False or xspf is None:
        return []
    return self._parse_xspf(
        xspf, playlist_id, xspf_url=xspf_url,
        xspf_base_url=base_url(xspf_url))
+
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """Convert a parsed XSPF document into a list of entry dicts.

    One entry is produced per ``trackList/track`` element.  Each
    ``location`` child becomes a format whose URL is resolved against
    ``xspf_base_url``; locations that do not resolve to a URL are dropped.
    Label, width and height come from ``s1``-namespaced attributes of the
    ``location`` element.  Formats are passed through ``_sort_formats``.

    Every entry uses ``playlist_id`` as its ``id``, and as its title when
    the track has no ``title`` element.  The track duration is divided by
    1000 before being stored.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def qualified(path):
        # Expand the ``xspf:`` / ``s1:`` prefixes used throughout this method.
        return xpath_with_ns(path, NS_MAP)

    results = []
    for track in xspf_doc.findall(qualified('./xspf:trackList/xspf:track')):
        track_title = xpath_text(
            track, qualified('./xspf:title'), 'title', default=playlist_id)
        track_description = xpath_text(
            track, qualified('./xspf:annotation'), 'description')
        track_thumbnail = xpath_text(
            track, qualified('./xspf:image'), 'thumbnail')
        raw_duration = xpath_text(
            track, qualified('./xspf:duration'), 'duration')
        track_duration = float_or_none(raw_duration, 1000)

        track_formats = []
        for location in track.findall(qualified('./xspf:location')):
            resolved_url = urljoin(xspf_base_url, location.text)
            if resolved_url:
                track_formats.append({
                    'url': resolved_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(qualified('s1:label')),
                    'width': int_or_none(location.get(qualified('s1:width'))),
                    'height': int_or_none(location.get(qualified('s1:height'))),
                })
        self._sort_formats(track_formats)

        results.append({
            'id': playlist_id,
            'title': track_title,
            'description': track_description,
            'thumbnail': track_thumbnail,
            'duration': track_duration,
            'formats': track_formats,
        })
    return results
+
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}):
    """Download an MPD manifest and return the formats parsed from it.

    ``note`` and ``errnote`` override the default progress and error
    messages; ``fatal``, ``data``, ``headers`` and ``query`` are forwarded
    to ``_download_xml_handle``.  ``mpd_id`` and ``formats_dict`` are
    forwarded to ``_parse_mpd_formats``.

    The base URL handed to the parser is derived from the URL reported by
    the response handle, not from ``mpd_url`` itself.

    Returns ``[]`` when the download result is ``False`` or when it carries
    no document.
    """
    progress_note = note or 'Downloading MPD manifest'
    failure_note = errnote or 'Failed to download MPD manifest'
    download = self._download_xml_handle(
        mpd_url, video_id, note=progress_note, errnote=failure_note,
        fatal=fatal, data=data, headers=headers, query=query)
    if download is False:
        return []

    manifest, handle = download
    if manifest is None:
        return []

    return self._parse_mpd_formats(
        manifest, mpd_id=mpd_id, mpd_base_url=base_url(handle.geturl()),
        formats_dict=formats_dict, mpd_url=mpd_url)
+
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
    """
    Parse formats from MPD manifest.

    Walks every Period / AdaptationSet / Representation of the manifest and
    builds one format dict per audio or video Representation.

    :param mpd_doc: root element of the parsed manifest.
    :param mpd_id: optional prefix; when set, format ids look like
        ``<mpd_id>-<representation id>``.
    :param mpd_base_url: prepended to the BaseURL chain of a Representation
        when that chain is not an absolute http(s) URL.
    :param formats_dict: mapping of representation id to a dict of extra
        fields; a copy of the matching dict is updated with the parsed
        fields, so the mapping itself is not modified.
    :param mpd_url: stored as ``manifest_url`` and used as ``url`` of
        fragmented formats.
    :return: list of format dicts.  Manifests with ``type="dynamic"`` yield
        ``[]``.  AdaptationSets and Representations that have a
        ``ContentProtection`` child are skipped, ``text`` representations
        are ignored, and any other content type triggers a warning.
    :raises KeyError: when neither the Representation nor its AdaptationSet
        carries a ``mimeType`` attribute.

    References:
     1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
     2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
    """
    if mpd_doc.get('type') == 'dynamic':
        return []

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Start from a copy of the parent's info so that values found on
        # this element override inherited ones without touching the parent.
        ms_info = ms_parent_info.copy()

        # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
        # common attributes and elements.  We will only extract relevant
        # for us.
        def extract_common(source):
            segment_timeline = source.find(_add_ns('SegmentTimeline'))
            if segment_timeline is not None:
                s_e = segment_timeline.findall(_add_ns('S'))
                if s_e:
                    ms_info['total_number'] = 0
                    ms_info['s'] = []
                    for s in s_e:
                        r = int(s.get('r', 0))
                        ms_info['total_number'] += 1 + r
                        ms_info['s'].append({
                            't': int(s.get('t', 0)),
                            # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                            'd': int(s.attrib['d']),
                            'r': r,
                        })
            start_number = source.get('startNumber')
            if start_number:
                ms_info['start_number'] = int(start_number)
            timescale = source.get('timescale')
            if timescale:
                ms_info['timescale'] = int(timescale)
            segment_duration = source.get('duration')
            if segment_duration:
                ms_info['segment_duration'] = float(segment_duration)

        def extract_Initialization(source):
            initialization = source.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']

        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            extract_common(segment_list)
            extract_Initialization(segment_list)
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                if media:
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization'] = initialization
                else:
                    extract_Initialization(segment_template)
        return ms_info

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats = []
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            if is_drm_protected(adaptation_set):
                continue
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                if is_drm_protected(representation):
                    continue
                # Representation attributes override those of the AdaptationSet.
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                mime_type = representation_attrib['mimeType']
                content_type = mime_type.split('/')[0]
                if content_type == 'text':
                    # TODO implement WebVTT downloading
                    pass
                elif content_type in ('video', 'audio'):
                    # Build the base URL from the innermost element outwards,
                    # stopping as soon as it becomes absolute.
                    base_url = ''
                    for element in (representation, adaptation_set, period, mpd_doc):
                        base_url_e = element.find(_add_ns('BaseURL'))
                        if base_url_e is not None:
                            base_url = base_url_e.text + base_url
                            if re.match(r'^https?://', base_url):
                                break
                    if mpd_base_url and not re.match(r'^https?://', base_url):
                        if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            mpd_base_url += '/'
                        base_url = mpd_base_url + base_url
                    representation_id = representation_attrib.get('id')
                    lang = representation_attrib.get('lang')
                    url_el = representation.find(_add_ns('BaseURL'))
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                    f = {
                        'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                        'manifest_url': mpd_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': float_or_none(bandwidth, 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                        'container': mimetype2ext(mime_type) + '_dash',
                    }
                    f.update(parse_codecs(representation_attrib.get('codecs')))
                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                    def prepare_template(template_name, identifiers):
                        tmpl = representation_ms_info[template_name]
                        # First of, % characters outside $...$ templates
                        # must be escaped by doubling for proper processing
                        # by % operator string formatting used further (see
                        # https://github.com/ytdl-org/youtube-dl/issues/16867).
                        t = ''
                        in_template = False
                        for c in tmpl:
                            t += c
                            if c == '$':
                                in_template = not in_template
                            elif c == '%' and not in_template:
                                t += c
                        # Next, $...$ templates are translated to their
                        # %(...) counterparts to be used with % operator
                        t = t.replace('$RepresentationID$', representation_id)
                        t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                        t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                        # Finally unescape '$$' to a literal '$'.  str.replace
                        # returns a new string, so the result has to be
                        # assigned back for the unescaping to take effect.
                        t = t.replace('$$', '$')
                        return t

                    # @initialization is a regular template like @media one
                    # so it should be handled just the same way (see
                    # https://github.com/ytdl-org/youtube-dl/issues/11605)
                    if 'initialization' in representation_ms_info:
                        initialization_template = prepare_template(
                            'initialization',
                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                            # $Time$ shall not be included for @initialization thus
                            # only $Bandwidth$ remains
                            ('Bandwidth', ))
                        representation_ms_info['initialization_url'] = initialization_template % {
                            'Bandwidth': bandwidth,
                        }

                    def location_key(location):
                        return 'url' if re.match(r'^https?://', location) else 'path'

                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                        media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                        media_location_key = location_key(media_template)

                        # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                        # can't be used at the same time
                        if '%(Number' in media_template and 's' not in representation_ms_info:
                            segment_duration = None
                            if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                # NOTE(review): float() raises TypeError here when
                                # period_duration is None - confirm callers never
                                # hit this with duration-less manifests.
                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                            representation_ms_info['fragments'] = [{
                                media_location_key: media_template % {
                                    'Number': segment_number,
                                    'Bandwidth': bandwidth,
                                },
                                'duration': segment_duration,
                            } for segment_number in range(
                                representation_ms_info['start_number'],
                                representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                        else:
                            # $Number*$ or $Time$ in media template with S list available
                            # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                            # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                            representation_ms_info['fragments'] = []
                            segment_time = 0
                            segment_d = None
                            segment_number = representation_ms_info['start_number']

                            def add_segment_url():
                                # Reads segment_time / segment_d / segment_number
                                # from the enclosing scope at call time.
                                segment_url = media_template % {
                                    'Time': segment_time,
                                    'Bandwidth': bandwidth,
                                    'Number': segment_number,
                                }
                                representation_ms_info['fragments'].append({
                                    media_location_key: segment_url,
                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                })

                            for s in representation_ms_info['s']:
                                segment_time = s.get('t') or segment_time
                                segment_d = s['d']
                                add_segment_url()
                                segment_number += 1
                                for _ in range(s.get('r', 0)):
                                    segment_time += segment_d
                                    add_segment_url()
                                    segment_number += 1
                                segment_time += segment_d
                    elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                        # No media template
                        # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                        # or any YouTube dashsegments video
                        fragments = []
                        segment_index = 0
                        timescale = representation_ms_info['timescale']
                        for s in representation_ms_info['s']:
                            duration = float_or_none(s['d'], timescale)
                            for _ in range(s.get('r', 0) + 1):
                                segment_uri = representation_ms_info['segment_urls'][segment_index]
                                fragments.append({
                                    location_key(segment_uri): segment_uri,
                                    'duration': duration,
                                })
                                segment_index += 1
                        representation_ms_info['fragments'] = fragments
                    elif 'segment_urls' in representation_ms_info:
                        # Segment URLs with no SegmentTimeline
                        # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                        # https://github.com/ytdl-org/youtube-dl/pull/14844
                        fragments = []
                        segment_duration = float_or_none(
                            representation_ms_info['segment_duration'],
                            representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                        for segment_url in representation_ms_info['segment_urls']:
                            fragment = {
                                location_key(segment_url): segment_url,
                            }
                            if segment_duration:
                                fragment['duration'] = segment_duration
                            fragments.append(fragment)
                        representation_ms_info['fragments'] = fragments
                    # If there is a fragments key available then we correctly recognized fragmented media.
                    # Otherwise we will assume unfragmented media with direct access. Technically, such
                    # assumption is not necessarily correct since we may simply have no support for
                    # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                    if 'fragments' in representation_ms_info:
                        f.update({
                            # NB: mpd_url may be empty when MPD manifest is parsed from a string
                            'url': mpd_url or base_url,
                            'fragment_base_url': base_url,
                            'fragments': [],
                            'protocol': 'http_dash_segments',
                        })
                        if 'initialization_url' in representation_ms_info:
                            initialization_url = representation_ms_info['initialization_url']
                            if not f.get('url'):
                                f['url'] = initialization_url
                            f['fragments'].append({location_key(initialization_url): initialization_url})
                        f['fragments'].extend(representation_ms_info['fragments'])
                    else:
                        # Assuming direct URL to unfragmented media.
                        f['url'] = base_url

                    # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
                    # is not necessarily unique within a Period thus formats with
                    # the same `format_id` are quite possible. There are numerous examples
                    # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
                    # https://github.com/ytdl-org/youtube-dl/issues/13919)
                    full_info = formats_dict.get(representation_id, {}).copy()
                    full_info.update(f)
                    formats.append(full_info)
                else:
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    return formats
+
def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """Download an ISM manifest and return the formats parsed from it.

    ``note`` and ``errnote`` override the default progress and error
    messages; ``fatal``, ``data``, ``headers`` and ``query`` are forwarded
    to ``_download_xml_handle``.  The parser receives the URL reported by
    the response handle rather than ``ism_url``.

    :return: the result of ``_parse_ism_formats``, or ``[]`` when the
        download result is ``False`` or carries no document.
    """
    res = self._download_xml_handle(
        ism_url, video_id,
        note=note or 'Downloading ISM manifest',
        errnote=errnote or 'Failed to download ISM manifest',
        fatal=fatal, data=data, headers=headers, query=query)
    if res is False:
        return []
    ism_doc, urlh = res
    # Same guard as _extract_mpd_formats: without it a missing document
    # would reach _parse_ism_formats, which immediately calls ``.get`` on it.
    if ism_doc is None:
        return []

    return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
+
+ def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+ """
+ Parse formats from ISM manifest.
+ References:
+ 1. [MS-SSTR]: Smooth Streaming Protocol,
+ https://msdn.microsoft.com/en-us/library/ff469518.aspx
+ """
+ if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
+ return []
+
+ duration = int(ism_doc.attrib['Duration'])
+ timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
+
+ formats = []
+ for stream in ism_doc.findall('StreamIndex'):
+ stream_type = stream.get('Type')
+ if stream_type not in ('video', 'audio'):
+ continue
+ url_pattern = stream.attrib['Url']
+ stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
+ stream_name = stream.get('Name')
+ for track in stream.findall('QualityLevel'):
+ fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
+ # TODO: add support for WVC1 and WMAP
+ if fourcc not in ('H264', 'AVC1', 'AACL'):
+ self.report_warning('%s is not a supported codec' % fourcc)
+ continue
+ tbr = int(track.attrib['Bitrate']) // 1000
+ # [1] does not mention Width and Height attributes. However,
+ # they're often present while MaxWidth and MaxHeight are
+ # missing, so should be used as fallbacks
+ width = int_or_none(track.get('MaxWidth') or track.get('Width'))
+ height = int_or_none(track.get('MaxHeight') or track.get('Height'))
+ sampling_rate = int_or_none(track.get('SamplingRate'))
+
+ track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
+ track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
+
+ fragments = []
+ fragment_ctx = {
+ 'time': 0,
+ }
+ stream_fragments = stream.findall('c')
+ for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
+ fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
+ fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
+ fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
+ if not fragment_ctx['duration']:
+ try:
+ next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
+ except IndexError:
+ next_fragment_time = duration
+ fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
+ for _ in range(fragment_repeat):
+ fragments.append({
+ 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
+ 'duration': fragment_ctx['duration'] / stream_timescale,
+ })
+ fragment_ctx['time'] += fragment_ctx['duration']
+
+ format_id = []
+ if ism_id:
+ format_id.append(ism_id)
+ if stream_name:
+ format_id.append(stream_name)
+ format_id.append(compat_str(tbr))
+
+ formats.append({
+ 'format_id': '-'.join(format_id),
+ 'url': ism_url,
+ 'manifest_url': ism_url,
+ 'ext': 'ismv' if stream_type == 'video' else 'isma',