+ for adaptation_set in period.findall(_add_ns('AdaptationSet')):
+ if is_drm_protected(adaptation_set):
+ continue
+ adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
+ for representation in adaptation_set.findall(_add_ns('Representation')):
+ if is_drm_protected(representation):
+ continue
+ representation_attrib = adaptation_set.attrib.copy()
+ representation_attrib.update(representation.attrib)
+ # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
+ mime_type = representation_attrib['mimeType']
+ content_type = mime_type.split('/')[0]
+ if content_type == 'text':
+ # TODO implement WebVTT downloading
+ pass
+ elif content_type in ('video', 'audio'):
+ base_url = ''
+ for element in (representation, adaptation_set, period, mpd_doc):
+ base_url_e = element.find(_add_ns('BaseURL'))
+ if base_url_e is not None:
+ base_url = base_url_e.text + base_url
+ if re.match(r'^https?://', base_url):
+ break
+ if mpd_base_url and not re.match(r'^https?://', base_url):
+ if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
+ mpd_base_url += '/'
+ base_url = mpd_base_url + base_url
+ representation_id = representation_attrib.get('id')
+ lang = representation_attrib.get('lang')
+ url_el = representation.find(_add_ns('BaseURL'))
+ filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
+ bandwidth = int_or_none(representation_attrib.get('bandwidth'))
+ f = {
+ 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
+ 'url': base_url,
+ 'manifest_url': mpd_url,
+ 'ext': mimetype2ext(mime_type),
+ 'width': int_or_none(representation_attrib.get('width')),
+ 'height': int_or_none(representation_attrib.get('height')),
+ 'tbr': float_or_none(bandwidth, 1000),
+ 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
+ 'fps': int_or_none(representation_attrib.get('frameRate')),
+ 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
+ 'format_note': 'DASH %s' % content_type,
+ 'filesize': filesize,
+ }
+ f.update(parse_codecs(representation_attrib.get('codecs')))
+ representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
+
+ def prepare_template(template_name, identifiers):
+ t = representation_ms_info[template_name]
+ t = t.replace('$RepresentationID$', representation_id)
+ t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
+ t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
+ t.replace('$$', '$')
+ return t
+
+ # @initialization is a regular template like @media one
+ # so it should be handled just the same way (see
+ # https://github.com/rg3/youtube-dl/issues/11605)
+ if 'initialization' in representation_ms_info:
+ initialization_template = prepare_template(
+ 'initialization',
+ # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
+ # $Time$ shall not be included for @initialization thus
+ # only $Bandwidth$ remains
+ ('Bandwidth', ))
+ representation_ms_info['initialization_url'] = initialization_template % {
+ 'Bandwidth': bandwidth,
+ }
+
+ def location_key(location):
+ return 'url' if re.match(r'^https?://', location) else 'path'
+
+ if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
+
+ media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
+ media_location_key = location_key(media_template)
+
+ # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
+ # can't be used at the same time
+ if '%(Number' in media_template and 's' not in representation_ms_info:
+ segment_duration = None
+ if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
+ segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
+ representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
+ representation_ms_info['fragments'] = [{
+ media_location_key: media_template % {
+ 'Number': segment_number,
+ 'Bandwidth': bandwidth,
+ },
+ 'duration': segment_duration,
+ } for segment_number in range(
+ representation_ms_info['start_number'],
+ representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+ else:
+ # $Number*$ or $Time$ in media template with S list available
+ # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
+ # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
+ representation_ms_info['fragments'] = []
+ segment_time = 0
+ segment_d = None
+ segment_number = representation_ms_info['start_number']
+
+ def add_segment_url():
+ segment_url = media_template % {
+ 'Time': segment_time,
+ 'Bandwidth': bandwidth,
+ 'Number': segment_number,
+ }
+ representation_ms_info['fragments'].append({
+ media_location_key: segment_url,
+ 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
+ })
+
+ for num, s in enumerate(representation_ms_info['s']):
+ segment_time = s.get('t') or segment_time
+ segment_d = s['d']
+ add_segment_url()
+ segment_number += 1
+ for r in range(s.get('r', 0)):
+ segment_time += segment_d
+ add_segment_url()
+ segment_number += 1
+ segment_time += segment_d
+ elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
+ # No media template
+ # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
+ # or any YouTube dashsegments video
+ fragments = []
+ segment_index = 0
+ timescale = representation_ms_info['timescale']
+ for s in representation_ms_info['s']:
+ duration = float_or_none(s['d'], timescale)
+ for r in range(s.get('r', 0) + 1):
+ segment_uri = representation_ms_info['segment_urls'][segment_index]
+ fragments.append({
+ location_key(segment_uri): segment_uri,
+ 'duration': duration,
+ })
+ segment_index += 1
+ representation_ms_info['fragments'] = fragments
+ # NB: MPD manifest may contain direct URLs to unfragmented media.
+ # No fragments key is present in this case.
+ if 'fragments' in representation_ms_info:
+ f.update({
+ 'fragment_base_url': base_url,
+ 'fragments': [],
+ 'protocol': 'http_dash_segments',
+ })
+ if 'initialization_url' in representation_ms_info:
+ initialization_url = representation_ms_info['initialization_url']
+ if not f.get('url'):
+ f['url'] = initialization_url
+ f['fragments'].append({location_key(initialization_url): initialization_url})
+ f['fragments'].extend(representation_ms_info['fragments'])
+ try:
+ existing_format = next(
+ fo for fo in formats
+ if fo['format_id'] == representation_id)
+ except StopIteration:
+ full_info = formats_dict.get(representation_id, {}).copy()
+ full_info.update(f)
+ formats.append(full_info)
+ else:
+ existing_format.update(f)
+ else:
+ self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
+ return formats
+
+ def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
+ res = self._download_webpage_handle(
+ ism_url, video_id,
+ note=note or 'Downloading ISM manifest',
+ errnote=errnote or 'Failed to download ISM manifest',
+ fatal=fatal)
+ if res is False:
+ return []
+ ism, urlh = res
+
+ return self._parse_ism_formats(
+ compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
+
+ def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+ """
+ Parse formats from ISM manifest.
+ References:
+ 1. [MS-SSTR]: Smooth Streaming Protocol,
+ https://msdn.microsoft.com/en-us/library/ff469518.aspx
+ """
+ if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
+ return []
+
+ duration = int(ism_doc.attrib['Duration'])
+ timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
+
+ formats = []
+ for stream in ism_doc.findall('StreamIndex'):
+ stream_type = stream.get('Type')
+ if stream_type not in ('video', 'audio'):
+ continue
+ url_pattern = stream.attrib['Url']
+ stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
+ stream_name = stream.get('Name')
+ for track in stream.findall('QualityLevel'):
+ fourcc = track.get('FourCC')
+ # TODO: add support for WVC1 and WMAP
+ if fourcc not in ('H264', 'AVC1', 'AACL'):
+ self.report_warning('%s is not a supported codec' % fourcc)
+ continue
+ tbr = int(track.attrib['Bitrate']) // 1000
+ # [1] does not mention Width and Height attributes. However,
+ # they're often present while MaxWidth and MaxHeight are
+ # missing, so should be used as fallbacks
+ width = int_or_none(track.get('MaxWidth') or track.get('Width'))
+ height = int_or_none(track.get('MaxHeight') or track.get('Height'))
+ sampling_rate = int_or_none(track.get('SamplingRate'))
+
+ track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
+ track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
+
+ fragments = []
+ fragment_ctx = {
+ 'time': 0,
+ }
+ stream_fragments = stream.findall('c')
+ for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
+ fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
+ fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
+ fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
+ if not fragment_ctx['duration']:
+ try:
+ next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
+ except IndexError:
+ next_fragment_time = duration
+ fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
+ for _ in range(fragment_repeat):
+ fragments.append({
+ 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
+ 'duration': fragment_ctx['duration'] / stream_timescale,
+ })
+ fragment_ctx['time'] += fragment_ctx['duration']
+
+ format_id = []
+ if ism_id:
+ format_id.append(ism_id)
+ if stream_name:
+ format_id.append(stream_name)
+ format_id.append(compat_str(tbr))
+
+ formats.append({
+ 'format_id': '-'.join(format_id),
+ 'url': ism_url,
+ 'manifest_url': ism_url,
+ 'ext': 'ismv' if stream_type == 'video' else 'isma',
+ 'width': width,
+ 'height': height,
+ 'tbr': tbr,
+ 'asr': sampling_rate,
+ 'vcodec': 'none' if stream_type == 'audio' else fourcc,
+ 'acodec': 'none' if stream_type == 'video' else fourcc,
+ 'protocol': 'ism',
+ 'fragments': fragments,
+ '_download_params': {
+ 'duration': duration,
+ 'timescale': stream_timescale,
+ 'width': width or 0,
+ 'height': height or 0,
+ 'fourcc': fourcc,
+ 'codec_private_data': track.get('CodecPrivateData'),
+ 'sampling_rate': sampling_rate,
+ 'channels': int_or_none(track.get('Channels', 2)),
+ 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
+ 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
+ },
+ })
+ return formats
+
+ def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
+ def absolute_url(video_url):
+ return compat_urlparse.urljoin(base_url, video_url)
+
+ def parse_content_type(content_type):
+ if not content_type:
+ return {}
+ ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
+ if ctr:
+ mimetype, codecs = ctr.groups()
+ f = parse_codecs(codecs)
+ f['ext'] = mimetype2ext(mimetype)
+ return f
+ return {}
+
+ def _media_formats(src, cur_media_type, type_info={}):
+ full_url = absolute_url(src)
+ ext = type_info.get('ext') or determine_ext(full_url)
+ if ext == 'm3u8':
+ is_plain_url = False
+ formats = self._extract_m3u8_formats(
+ full_url, video_id, ext='mp4',
+ entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
+ preference=preference, fatal=False)
+ elif ext == 'mpd':
+ is_plain_url = False
+ formats = self._extract_mpd_formats(
+ full_url, video_id, mpd_id=mpd_id, fatal=False)
+ else:
+ is_plain_url = True
+ formats = [{
+ 'url': full_url,
+ 'vcodec': 'none' if cur_media_type == 'audio' else None,
+ }]
+ return is_plain_url, formats
+
+ entries = []
+ # amp-video and amp-audio are very similar to their HTML5 counterparts
+ # so we wll include them right here (see
+ # https://www.ampproject.org/docs/reference/components/amp-video)
+ media_tags = [(media_tag, media_type, '')
+ for media_tag, media_type
+ in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
+ media_tags.extend(re.findall(
+ # We only allow video|audio followed by a whitespace or '>'.
+ # Allowing more characters may end up in significant slow down (see
+ # https://github.com/rg3/youtube-dl/issues/11979, example URL:
+ # http://www.porntrex.com/maps/videositemap.xml).
+ r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
+ for media_tag, media_type, media_content in media_tags:
+ media_info = {
+ 'formats': [],
+ 'subtitles': {},
+ }
+ media_attributes = extract_attributes(media_tag)
+ src = media_attributes.get('src')
+ if src:
+ _, formats = _media_formats(src, media_type)
+ media_info['formats'].extend(formats)
+ media_info['thumbnail'] = media_attributes.get('poster')
+ if media_content:
+ for source_tag in re.findall(r'<source[^>]+>', media_content):
+ source_attributes = extract_attributes(source_tag)
+ src = source_attributes.get('src')
+ if not src:
+ continue
+ f = parse_content_type(source_attributes.get('type'))
+ is_plain_url, formats = _media_formats(src, media_type, f)
+ if is_plain_url:
+ # res attribute is not standard but seen several times
+ # in the wild
+ f.update({
+ 'height': int_or_none(source_attributes.get('res')),
+ 'format_id': source_attributes.get('label'),
+ })
+ f.update(formats[0])
+ media_info['formats'].append(f)
+ else:
+ media_info['formats'].extend(formats)
+ for track_tag in re.findall(r'<track[^>]+>', media_content):
+ track_attributes = extract_attributes(track_tag)
+ kind = track_attributes.get('kind')
+ if not kind or kind in ('subtitles', 'captions'):
+ src = track_attributes.get('src')
+ if not src:
+ continue
+ lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
+ media_info['subtitles'].setdefault(lang, []).append({
+ 'url': absolute_url(src),
+ })
+ if media_info['formats'] or media_info['subtitles']:
+ entries.append(media_info)
+ return entries
+
+ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
+ formats = []
+ hdcore_sign = 'hdcore=3.7.0'
+ f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+ hds_host = hosts.get('hds')
+ if hds_host:
+ f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
+ if 'hdcore=' not in f4m_url:
+ f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
+ f4m_formats = self._extract_f4m_formats(
+ f4m_url, video_id, f4m_id='hds', fatal=False)
+ for entry in f4m_formats:
+ entry.update({'extra_param_to_segment_url': hdcore_sign})
+ formats.extend(f4m_formats)
+ m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
+ hls_host = hosts.get('hls')
+ if hls_host:
+ m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ return formats
+
+ def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
+ url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
+ url_base = self._search_regex(
+ r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
+ http_base_url = '%s:%s' % ('http', url_base)
+ formats = []
+ if 'm3u8' not in skip_protocols:
+ formats.extend(self._extract_m3u8_formats(
+ http_base_url + '/playlist.m3u8', video_id, 'mp4',
+ m3u8_entry_protocol, m3u8_id='hls', fatal=False))
+ if 'f4m' not in skip_protocols:
+ formats.extend(self._extract_f4m_formats(
+ http_base_url + '/manifest.f4m',
+ video_id, f4m_id='hds', fatal=False))
+ if 'dash' not in skip_protocols:
+ formats.extend(self._extract_mpd_formats(
+ http_base_url + '/manifest.mpd',
+ video_id, mpd_id='dash', fatal=False))
+ if re.search(r'(?:/smil:|\.smil)', url_base):
+ if 'smil' not in skip_protocols:
+ rtmp_formats = self._extract_smil_formats(
+ http_base_url + '/jwplayer.smil',
+ video_id, fatal=False)
+ for rtmp_format in rtmp_formats:
+ rtsp_format = rtmp_format.copy()
+ rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+ del rtsp_format['play_path']
+ del rtsp_format['ext']
+ rtsp_format.update({
+ 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
+ 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+ 'protocol': 'rtsp',
+ })
+ formats.extend([rtmp_format, rtsp_format])
+ else:
+ for protocol in ('rtmp', 'rtsp'):
+ if protocol not in skip_protocols:
+ formats.append({
+ 'url': '%s:%s' % (protocol, url_base),
+ 'format_id': protocol,
+ 'protocol': protocol,
+ })
+ return formats
+
+ def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
+ mobj = re.search(
+ r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
+ webpage)
+ if mobj:
+ try:
+ jwplayer_data = self._parse_json(mobj.group('options'),
+ video_id=video_id,
+ transform_source=transform_source)
+ except ExtractorError:
+ pass
+ else:
+ if isinstance(jwplayer_data, dict):
+ return jwplayer_data
+
+ def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
+ jwplayer_data = self._find_jwplayer_data(
+ webpage, video_id, transform_source=js_to_json)
+ return self._parse_jwplayer_data(
+ jwplayer_data, video_id, *args, **kwargs)
+
+ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
+ m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+ # JWPlayer backward compatibility: flattened playlists
+ # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
+ if 'playlist' not in jwplayer_data:
+ jwplayer_data = {'playlist': [jwplayer_data]}
+
+ entries = []
+
+ # JWPlayer backward compatibility: single playlist item
+ # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
+ if not isinstance(jwplayer_data['playlist'], list):
+ jwplayer_data['playlist'] = [jwplayer_data['playlist']]
+
+ for video_data in jwplayer_data['playlist']:
+ # JWPlayer backward compatibility: flattened sources
+ # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
+ if 'sources' not in video_data:
+ video_data['sources'] = [video_data]
+
+ this_video_id = video_id or video_data['mediaid']
+
+ formats = self._parse_jwplayer_formats(
+ video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
+ mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
+
+ subtitles = {}
+ tracks = video_data.get('tracks')
+ if tracks and isinstance(tracks, list):
+ for track in tracks:
+ if not isinstance(track, dict):
+ continue
+ if track.get('kind') != 'captions':
+ continue
+ track_url = urljoin(base_url, track.get('file'))
+ if not track_url:
+ continue
+ subtitles.setdefault(track.get('label') or 'en', []).append({
+ 'url': self._proto_relative_url(track_url)
+ })
+
+ entry = {
+ 'id': this_video_id,
+ 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
+ 'description': video_data.get('description'),
+ 'thumbnail': self._proto_relative_url(video_data.get('image')),
+ 'timestamp': int_or_none(video_data.get('pubdate')),
+ 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
+ 'subtitles': subtitles,
+ }
+ # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
+ if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
+ entry.update({
+ '_type': 'url_transparent',
+ 'url': formats[0]['url'],
+ })
+ else:
+ self._sort_formats(formats)
+ entry['formats'] = formats
+ entries.append(entry)
+ if len(entries) == 1:
+ return entries[0]
+ else:
+ return self.playlist_result(entries)