+ 'manifest_url': manifest_url,
+ 'ext': 'flv' if bootstrap_info is not None else None,
+ 'protocol': 'f4m',
+ 'tbr': tbr,
+ 'width': width,
+ 'height': height,
+ 'vcodec': vcodec,
+ 'preference': preference,
+ })
+ return formats
+
+ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
+ return {
+ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
+ 'url': m3u8_url,
+ 'ext': ext,
+ 'protocol': 'm3u8',
+ 'preference': preference - 100 if preference else -100,
+ 'resolution': 'multiple',
+ 'format_note': 'Quality selection URL',
+ }
+
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None, note=None, errnote=None,
                          fatal=True, live=False):
    """
    Download an m3u8 playlist and hand it to _parse_m3u8_formats.

    note and errnote replace the default progress and failure messages.
    An empty list is returned when the download helper reports failure
    by returning False. The playlist is parsed against the URL reported
    by the response handle rather than the one that was requested.
    """
    download_result = self._download_webpage_handle(
        m3u8_url, video_id,
        note=note if note else 'Downloading m3u8 information',
        errnote=errnote if errnote else 'Failed to download m3u8 information',
        fatal=fatal)

    if download_result is False:
        return []

    playlist_text, url_handle = download_result
    final_url = url_handle.geturl()

    return self._parse_m3u8_formats(
        playlist_text, final_url, ext=ext, entry_protocol=entry_protocol,
        preference=preference, m3u8_id=m3u8_id, live=live)
+
+ def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
+ entry_protocol='m3u8', preference=None,
+ m3u8_id=None, live=False):
+ if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
+ return []
+
+ if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
+ return []
+
+ formats = []
+
+ format_url = lambda u: (
+ u
+ if re.match(r'^https?://', u)
+ else compat_urlparse.urljoin(m3u8_url, u))
+
+ # References:
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
+ # 2. https://github.com/rg3/youtube-dl/issues/12211
+
+ # We should try extracting formats only from master playlists [1, 4.3.4],
+ # i.e. playlists that describe available qualities. On the other hand
+ # media playlists [1, 4.3.3] should be returned as is since they contain
+ # just the media without qualities renditions.
+ # Fortunately, master playlist can be easily distinguished from media
+ # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
+ # master playlist tags MUST NOT appear in a media playist and vice versa.
+ # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
+ # media playlist and MUST NOT appear in master playlist thus we can
+ # clearly detect media playlist with this criterion.
+
+ if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
+ return [{
+ 'url': m3u8_url,
+ 'format_id': m3u8_id,
+ 'ext': ext,
+ 'protocol': entry_protocol,
+ 'preference': preference,
+ }]
+
+ groups = {}
+ last_stream_inf = {}
+
+ def extract_media(x_media_line):
+ media = parse_m3u8_attributes(x_media_line)
+ # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
+ media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
+ if not (media_type and group_id and name):
+ return
+ groups.setdefault(group_id, []).append(media)
+ if media_type not in ('VIDEO', 'AUDIO'):
+ return
+ media_url = media.get('URI')
+ if media_url:
+ format_id = []
+ for v in (m3u8_id, group_id, name):
+ if v:
+ format_id.append(v)
+ f = {
+ 'format_id': '-'.join(format_id),
+ 'url': format_url(media_url),
+ 'manifest_url': m3u8_url,
+ 'language': media.get('LANGUAGE'),
+ 'ext': ext,
+ 'protocol': entry_protocol,
+ 'preference': preference,
+ }
+ if media_type == 'AUDIO':
+ f['vcodec'] = 'none'
+ formats.append(f)
+
+ def build_stream_name():
+ # Despite specification does not mention NAME attribute for
+ # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
+ # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
+ # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
+ stream_name = last_stream_inf.get('NAME')
+ if stream_name:
+ return stream_name
+ # If there is no NAME in EXT-X-STREAM-INF it will be obtained
+ # from corresponding rendition group
+ stream_group_id = last_stream_inf.get('VIDEO')
+ if not stream_group_id:
+ return
+ stream_group = groups.get(stream_group_id)
+ if not stream_group:
+ return stream_group_id
+ rendition = stream_group[0]
+ return rendition.get('NAME') or stream_group_id
+
+ for line in m3u8_doc.splitlines():
+ if line.startswith('#EXT-X-STREAM-INF:'):
+ last_stream_inf = parse_m3u8_attributes(line)
+ elif line.startswith('#EXT-X-MEDIA:'):
+ extract_media(line)
+ elif line.startswith('#') or not line.strip():
+ continue
+ else:
+ tbr = float_or_none(
+ last_stream_inf.get('AVERAGE-BANDWIDTH') or
+ last_stream_inf.get('BANDWIDTH'), scale=1000)
+ format_id = []
+ if m3u8_id:
+ format_id.append(m3u8_id)
+ stream_name = build_stream_name()
+ # Bandwidth of live streams may differ over time thus making
+ # format_id unpredictable. So it's better to keep provided
+ # format_id intact.
+ if not live:
+ format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
+ manifest_url = format_url(line.strip())
+ f = {
+ 'format_id': '-'.join(format_id),
+ 'url': manifest_url,
+ 'manifest_url': m3u8_url,
+ 'tbr': tbr,
+ 'ext': ext,
+ 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
+ 'protocol': entry_protocol,
+ 'preference': preference,
+ }
+ resolution = last_stream_inf.get('RESOLUTION')
+ if resolution:
+ mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
+ if mobj:
+ f['width'] = int(mobj.group('width'))
+ f['height'] = int(mobj.group('height'))
+ # Unified Streaming Platform
+ mobj = re.search(
+ r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
+ if mobj:
+ abr, vbr = mobj.groups()
+ abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
+ f.update({
+ 'vbr': vbr,
+ 'abr': abr,
+ })
+ codecs = parse_codecs(last_stream_inf.get('CODECS'))
+ f.update(codecs)
+ audio_group_id = last_stream_inf.get('AUDIO')
+ # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+ # references a rendition group MUST have a CODECS attribute.
+ # However, this is not always respected, for example, [2]
+ # contains EXT-X-STREAM-INF tag which references AUDIO
+ # rendition group but does not have CODECS and despite
+ # referencing an audio group it represents a complete
+ # (with audio and video) format. So, for such cases we will
+ # ignore references to rendition groups and treat them
+ # as complete formats.
+ if audio_group_id and codecs and f.get('vcodec') != 'none':
+ audio_group = groups.get(audio_group_id)
+ if audio_group and audio_group[0].get('URI'):
+ # TODO: update acodec for audio only formats with
+ # the same GROUP-ID
+ f['acodec'] = 'none'
+ formats.append(f)
+ last_stream_inf = {}
+ return formats
+
+ @staticmethod
+ def _xpath_ns(path, namespace=None):
+ if not namespace:
+ return path
+ out = []
+ for c in path.split('/'):
+ if not c or c == '.':
+ out.append(c)
+ else:
+ out.append('{%s}%s' % (namespace, c))
+ return '/'.join(out)
+
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """
    Download a SMIL document and return the formats parsed from it.

    fatal and transform_source are forwarded to _download_smil. An empty
    list is returned when the download yields False.
    """
    smil_doc = self._download_smil(
        smil_url, video_id, fatal=fatal, transform_source=transform_source)

    if smil_doc is False:
        # A False result is only expected for a non-fatal download
        assert not fatal
        return []

    return self._parse_smil_formats(
        smil_doc, smil_url, video_id,
        namespace=self._parse_smil_namespace(smil_doc),
        f4m_params=f4m_params)
+
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """
    Download a SMIL document and return the info dict built by _parse_smil.

    An empty dict is returned when the download yields False.
    """
    smil_doc = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil_doc is not False:
        return self._parse_smil(
            smil_doc, smil_url, video_id, f4m_params=f4m_params)
    return {}
+
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """
    Fetch smil_url through _download_xml with SMIL-specific messages.

    fatal and transform_source are forwarded unchanged; the result of
    _download_xml is returned as is.
    """
    progress_message = 'Downloading SMIL file'
    failure_message = 'Unable to download SMIL file'
    return self._download_xml(
        smil_url, video_id, progress_message, failure_message,
        fatal=fatal, transform_source=transform_source)
+
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """
    Build an info dict from a parsed SMIL document.

    Formats are parsed using the video_id passed in; the id of the
    returned dict is instead the basename of smil_url without its
    extension. Title, description and upload date come from the first
    matching head/meta elements that have both a name and a content
    attribute. The title falls back to the id.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name = meta.attrib.get('name')
        content = meta.attrib.get('content')
        if not (name and content):
            continue
        # Only the first non-empty value of each kind is kept
        if name == 'title':
            title = title or content
        elif name in ('description', 'abstract'):
            description = description or content
        elif name == 'date':
            if not upload_date:
                upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
+
def _parse_smil_namespace(self, smil):
    """
    Return the namespace found in the '{namespace}smil' tag of the root
    element, matched case-insensitively, with None as the default.
    """
    root_tag_pattern = r'(?i)^{([^}]+)?}smil$'
    return self._search_regex(
        root_tag_pattern, smil.tag, 'namespace', default=None)
+
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """
    Build a list of format dicts from the video and audio elements of a
    SMIL document.

    smil is the parsed document and smil_url its URL, which serves as the
    base for relative sources unless a head/meta element supplies a
    'base' or 'httpBase' attribute. namespace qualifies the element
    lookups. f4m_params are the query parameters appended to f4m manifest
    URLs (a built-in default is used when empty). transform_rtmp_url, when
    given, is called with (streamer, play path) and must return the pair
    to store in an RTMP format.

    Each distinct src is handled once and dispatched by protocol and
    extension: RTMP, m3u8, f4m, mpd, ism manifest or plain HTTP.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    # Counters used to number formats that carry no bitrate
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0

    seen_srcs = set()
    media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
    for medium in media:
        src = medium.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        src_ext = determine_ext(src)
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            # A single resulting format gets the metadata declared in the
            # SMIL element
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += compat_urllib_parse_urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                src_url, video_id, mpd_id='dash', fatal=False))
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            formats.extend(self._extract_ism_formats(
                src_url, video_id, ism_id='mss', fatal=False))
        # Validate the resolved URL that is stored in the format, not the
        # raw src attribute, which may be relative to base or padded with
        # whitespace
        elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    return formats
+
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """
    Collect subtitles from the textstream elements of a SMIL document.

    Returns a dict mapping a language to a list of {'url', 'ext'} dicts.
    Elements without a src, or repeating a src already seen, are skipped.
    subtitles_lang is used when an element declares no language.
    """
    seen_srcs = []
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if src and src not in seen_srcs:
            seen_srcs.append(src)
            # Extension: explicit attribute, then MIME type, then the URL
            ext = (
                textstream.get('ext') or
                mimetype2ext(textstream.get('type')) or
                determine_ext(src))
            lang = (
                textstream.get('systemLanguage') or
                textstream.get('systemLanguageName') or
                textstream.get('lang') or
                subtitles_lang)
            subtitles.setdefault(lang, []).append({
                'url': src,
                'ext': ext,
            })
    return subtitles
+
def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
    """
    Download an XSPF playlist and return the entries parsed from it.

    fatal is forwarded to _download_xml. An empty list is returned when
    the download yields False. Track locations are resolved against the
    base URL of xspf_url.
    """
    xspf = self._download_xml(
        # Progress note previously misspelled the format as 'xpsf'
        xspf_url, playlist_id, 'Downloading xspf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if xspf is False:
        return []
    return self._parse_xspf(
        xspf, playlist_id, xspf_url=xspf_url,
        xspf_base_url=base_url(xspf_url))
+
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """
    Turn the tracks of a parsed XSPF document into a list of entry dicts.

    Every entry uses playlist_id as its id, and as its title when the
    track has none. Each location of a track that resolves to a URL
    against xspf_base_url becomes one format, and xspf_url is recorded as
    its manifest URL. Formats are passed through _sort_formats.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        # Expand the namespace prefixes used in this method
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track')):
        title = xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, ns('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, ns('./xspf:image'), 'thumbnail')
        raw_duration = xpath_text(track, ns('./xspf:duration'), 'duration')
        duration = float_or_none(raw_duration, 1000)

        formats = []
        for location in track.findall(ns('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if format_url:
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(ns('s1:label')),
                    'width': int_or_none(location.get(ns('s1:width'))),
                    'height': int_or_none(location.get(ns('s1:height'))),
                })
        self._sort_formats(formats)

        entry = {
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
        entries.append(entry)
    return entries
+
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
    """
    Download an MPD manifest and hand it to _parse_mpd_formats.

    note and errnote replace the default progress and failure messages.
    An empty list is returned when the download yields False. The base
    URL is derived from the URL reported by the response handle, while
    mpd_url is passed on as given. formats_dict is only passed through
    to the parser here.
    """
    download_result = self._download_xml_handle(
        mpd_url, video_id,
        note=note if note else 'Downloading MPD manifest',
        errnote=errnote if errnote else 'Failed to download MPD manifest',
        fatal=fatal)
    if download_result is False:
        return []

    manifest, url_handle = download_result
    final_url = url_handle.geturl()

    return self._parse_mpd_formats(
        manifest, mpd_id=mpd_id, mpd_base_url=base_url(final_url),
        formats_dict=formats_dict, mpd_url=mpd_url)
+
+ def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
+ """
+ Parse formats from MPD manifest.
+ References:
+ 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
+ http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
+ 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
+ """
+ if mpd_doc.get('type') == 'dynamic':
+ return []
+
+ namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
+
+ def _add_ns(path):
+ return self._xpath_ns(path, namespace)
+
+ def is_drm_protected(element):
+ return element.find(_add_ns('ContentProtection')) is not None
+
+ def extract_multisegment_info(element, ms_parent_info):
+ ms_info = ms_parent_info.copy()
+
+ # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
+ # common attributes and elements. We will only extract relevant
+ # for us.
+ def extract_common(source):
+ segment_timeline = source.find(_add_ns('SegmentTimeline'))
+ if segment_timeline is not None:
+ s_e = segment_timeline.findall(_add_ns('S'))
+ if s_e:
+ ms_info['total_number'] = 0
+ ms_info['s'] = []
+ for s in s_e:
+ r = int(s.get('r', 0))
+ ms_info['total_number'] += 1 + r
+ ms_info['s'].append({
+ 't': int(s.get('t', 0)),
+ # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
+ 'd': int(s.attrib['d']),
+ 'r': r,
+ })
+ start_number = source.get('startNumber')
+ if start_number:
+ ms_info['start_number'] = int(start_number)
+ timescale = source.get('timescale')
+ if timescale:
+ ms_info['timescale'] = int(timescale)
+ segment_duration = source.get('duration')
+ if segment_duration:
+ ms_info['segment_duration'] = float(segment_duration)
+
+ def extract_Initialization(source):
+ initialization = source.find(_add_ns('Initialization'))
+ if initialization is not None:
+ ms_info['initialization_url'] = initialization.attrib['sourceURL']
+
+ segment_list = element.find(_add_ns('SegmentList'))
+ if segment_list is not None:
+ extract_common(segment_list)
+ extract_Initialization(segment_list)
+ segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
+ if segment_urls_e:
+ ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
+ else:
+ segment_template = element.find(_add_ns('SegmentTemplate'))
+ if segment_template is not None:
+ extract_common(segment_template)
+ media = segment_template.get('media')
+ if media:
+ ms_info['media'] = media
+ initialization = segment_template.get('initialization')
+ if initialization:
+ ms_info['initialization'] = initialization
+ else:
+ extract_Initialization(segment_template)
+ return ms_info
+
+ mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
+ formats = []
+ for period in mpd_doc.findall(_add_ns('Period')):
+ period_duration = parse_duration(period.get('duration')) or mpd_duration
+ period_ms_info = extract_multisegment_info(period, {
+ 'start_number': 1,
+ 'timescale': 1,