X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/47d80ec0b18245caeb97018d4c1af18d0b5b972b..d2632ebbe0759622d4ab7aff134421194974b394:/youtube_dl/extractor/common.py

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 05c51fa..0b4e2ac 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -59,6 +59,7 @@ from ..utils import (
     parse_m3u8_attributes,
     extract_attributes,
     parse_codecs,
+    urljoin,
 )
 
 
@@ -120,9 +121,19 @@ class InfoExtractor(object):
                                  download, lower-case.
                                  "http", "https", "rtsp", "rtmp", "rtmpe",
                                  "m3u8", "m3u8_native" or "http_dash_segments".
-                    * fragments  A list of fragments of the fragmented media,
-                                 with the following entries:
-                                 * "url" (mandatory) - fragment's URL
+                    * fragment_base_url
+                                 Base URL for fragments. Each fragment's path
+                                 value (if present) will be relative to
+                                 this URL.
+                    * fragments  A list of fragments of a fragmented media.
+                                 Each fragment entry must contain either a URL
+                                 or a path. If a URL is present it should be
+                                 used by a client. Otherwise both path and
+                                 fragment_base_url must be present. Here is
+                                 the list of all potential fields:
+                                 * "url" - fragment's URL
+                                 * "path" - fragment's path relative to
+                                            fragment_base_url
                                  * "duration" (optional, int or float)
                                  * "filesize" (optional, int)
                     * preference Order number of this format. If this field is
@@ -188,9 +199,10 @@ class InfoExtractor(object):
     uploader_url:   Full URL to a personal webpage of the video uploader.
     location:       Physical location where the video was filmed.
     subtitles:      The available subtitles as a dictionary in the format
-                    {language: subformats}. "subformats" is a list sorted from
-                    lower to higher preference, each element is a dictionary
-                    with the "ext" entry and one of:
+                    {tag: subformats}. "tag" is usually a language code, and
+                    "subformats" is a list sorted from lower to higher
+                    preference, each element is a dictionary with the "ext"
+                    entry and one of:
                         * "data": The subtitles file contents
                         * "url": A URL pointing to the subtitles file
                     "ext" will be calculated from URL if missing
@@ -1013,13 +1025,13 @@ class InfoExtractor(object):
                 unique_formats.append(f)
         formats[:] = unique_formats
 
-    def _is_valid_url(self, url, video_id, item='video'):
+    def _is_valid_url(self, url, video_id, item='video', headers={}):
         url = self._proto_relative_url(url, scheme='http:')
         # For now assume non HTTP(S) URLs always valid
         if not (url.startswith('http://') or url.startswith('https://')):
             return True
         try:
-            self._request_webpage(url, video_id, 'Checking %s URL' % item)
+            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
             return True
         except ExtractorError as e:
             if isinstance(e.cause, compat_urllib_error.URLError):
@@ -1224,6 +1236,7 @@ class InfoExtractor(object):
                 'protocol': entry_protocol,
                 'preference': preference,
             }]
+        audio_in_video_stream = {}
         last_info = {}
         last_media = {}
         for line in m3u8_doc.splitlines():
@@ -1233,25 +1246,32 @@ class InfoExtractor(object):
                 media = parse_m3u8_attributes(line)
                 media_type = media.get('TYPE')
                 if media_type in ('VIDEO', 'AUDIO'):
+                    group_id = media.get('GROUP-ID')
                     media_url = media.get('URI')
                     if media_url:
                         format_id = []
-                        for v in (media.get('GROUP-ID'), media.get('NAME')):
+                        for v in (group_id, media.get('NAME')):
                             if v:
                                 format_id.append(v)
-                        formats.append({
+                        f = {
                             'format_id': '-'.join(format_id),
                             'url': format_url(media_url),
                             'language': media.get('LANGUAGE'),
-                            'vcodec': 'none' if media_type == 'AUDIO' else None,
                             'ext': ext,
                             'protocol': entry_protocol,
                             'preference': preference,
-                        })
+                        }
+                        if media_type == 'AUDIO':
+                            f['vcodec'] = 'none'
+                            if group_id and not audio_in_video_stream.get(group_id):
+                                audio_in_video_stream[group_id] = False
+                        formats.append(f)
                     else:
                         # When there is no URI in EXT-X-MEDIA let this tag's
                         # data be used by regular URI lines below
                         last_media = media
+                        if media_type == 'AUDIO' and group_id:
+                            audio_in_video_stream[group_id] = True
             elif line.startswith('#') or not line.strip():
                 continue
             else:
@@ -1295,6 +1315,9 @@ class InfoExtractor(object):
                         'abr': abr,
                     })
                 f.update(parse_codecs(last_info.get('CODECS')))
+                if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
+                    # TODO: update acodec for audio only formats with the same GROUP-ID
+                    f['acodec'] = 'none'
                 formats.append(f)
                 last_info = {}
                 last_media = {}
@@ -1614,21 +1637,16 @@ class InfoExtractor(object):
                 segment_template = element.find(_add_ns('SegmentTemplate'))
                 if segment_template is not None:
                     extract_common(segment_template)
-                    media_template = segment_template.get('media')
-                    if media_template:
-                        ms_info['media_template'] = media_template
+                    media = segment_template.get('media')
+                    if media:
+                        ms_info['media'] = media
                     initialization = segment_template.get('initialization')
                     if initialization:
-                        ms_info['initialization_url'] = initialization
+                        ms_info['initialization'] = initialization
                     else:
                         extract_Initialization(segment_template)
             return ms_info
 
-        def combine_url(base_url, target_url):
-            if re.match(r'^https?://', target_url):
-                return target_url
-            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
-
         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
         formats = []
         for period in mpd_doc.findall(_add_ns('Period')):
@@ -1668,6 +1686,7 @@ class InfoExtractor(object):
                         lang = representation_attrib.get('lang')
                         url_el = representation.find(_add_ns('BaseURL'))
                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
+                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                         f = {
                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                             'url': base_url,
@@ -1675,23 +1694,41 @@ class InfoExtractor(object):
                             'ext': mimetype2ext(mime_type),
                             'width': int_or_none(representation_attrib.get('width')),
                             'height': int_or_none(representation_attrib.get('height')),
-                            'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
+                            'tbr': int_or_none(bandwidth, 1000),
                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                             'fps': int_or_none(representation_attrib.get('frameRate')),
-                            'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
-                            'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                             'format_note': 'DASH %s' % content_type,
                             'filesize': filesize,
                         }
+                        f.update(parse_codecs(representation_attrib.get('codecs')))
                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
-                        if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
 
-                            media_template = representation_ms_info['media_template']
-                            media_template = media_template.replace('$RepresentationID$', representation_id)
-                            media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
-                            media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
-                            media_template.replace('$$', '$')
+                        def prepare_template(template_name, identifiers):
+                            t = representation_ms_info[template_name]
+                            t = t.replace('$RepresentationID$', representation_id)
+                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
+                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
+                            t = t.replace('$$', '$')  # keep the result: str.replace is not in-place
+                            return t
+
+                        # @initialization is a regular template like @media one
+                        # so it should be handled just the same way (see
+                        # https://github.com/rg3/youtube-dl/issues/11605)
+                        if 'initialization' in representation_ms_info:
+                            initialization_template = prepare_template(
+                                'initialization',
+                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
+                                # $Time$ shall not be included for @initialization thus
+                                # only $Bandwidth$ remains
+                                ('Bandwidth', ))
+                            representation_ms_info['initialization_url'] = initialization_template % {
+                                'Bandwidth': bandwidth,
+                            }
+
+                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
+
+                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
 
                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                             # can't be used at the same time
@@ -1703,7 +1740,7 @@ class InfoExtractor(object):
                                 representation_ms_info['fragments'] = [{
                                     'url': media_template % {
                                         'Number': segment_number,
-                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
+                                        'Bandwidth': bandwidth,
                                     },
                                     'duration': segment_duration,
                                 } for segment_number in range(
@@ -1721,7 +1758,7 @@ class InfoExtractor(object):
                                 def add_segment_url():
                                     segment_url = media_template % {
                                         'Time': segment_time,
-                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
+                                        'Bandwidth': bandwidth,
                                         'Number': segment_number,
                                     }
                                     representation_ms_info['fragments'].append({
@@ -1744,14 +1781,16 @@ class InfoExtractor(object):
                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                             # or any YouTube dashsegments video
                             fragments = []
-                            s_num = 0
-                            for segment_url in representation_ms_info['segment_urls']:
-                                s = representation_ms_info['s'][s_num]
+                            segment_index = 0
+                            timescale = representation_ms_info['timescale']
+                            for s in representation_ms_info['s']:
+                                duration = float_or_none(s['d'], timescale)
                                 for r in range(s.get('r', 0) + 1):
                                     fragments.append({
-                                        'url': segment_url,
-                                        'duration': float_or_none(s['d'], representation_ms_info['timescale']),
+                                        'url': representation_ms_info['segment_urls'][segment_index],
+                                        'duration': duration,
                                     })
+                                    segment_index += 1
                             representation_ms_info['fragments'] = fragments
                         # NB: MPD manifest may contain direct URLs to unfragmented media.
                         # No fragments key is present in this case.
@@ -1761,13 +1800,13 @@ class InfoExtractor(object):
                                 'protocol': 'http_dash_segments',
                             })
                             if 'initialization_url' in representation_ms_info:
-                                initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
+                                initialization_url = representation_ms_info['initialization_url']
                                 if not f.get('url'):
                                     f['url'] = initialization_url
                                 f['fragments'].append({'url': initialization_url})
                             f['fragments'].extend(representation_ms_info['fragments'])
                             for fragment in f['fragments']:
-                                fragment['url'] = combine_url(base_url, fragment['url'])
+                                fragment['url'] = urljoin(base_url, fragment['url'])
                         try:
                             existing_format = next(
                                 fo for fo in formats
@@ -1881,7 +1920,7 @@ class InfoExtractor(object):
                 })
         return formats
 
-    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'):
+    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
         def absolute_url(video_url):
             return compat_urlparse.urljoin(base_url, video_url)
 
@@ -1898,11 +1937,16 @@ class InfoExtractor(object):
 
         def _media_formats(src, cur_media_type):
             full_url = absolute_url(src)
-            if determine_ext(full_url) == 'm3u8':
+            ext = determine_ext(full_url)
+            if ext == 'm3u8':
                 is_plain_url = False
                 formats = self._extract_m3u8_formats(
                     full_url, video_id, ext='mp4',
                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
+            elif ext == 'mpd':
+                is_plain_url = False
+                formats = self._extract_mpd_formats(
+                    full_url, video_id, mpd_id=mpd_id)
             else:
                 is_plain_url = True
                 formats = [{
@@ -1915,7 +1959,12 @@ class InfoExtractor(object):
         media_tags = [(media_tag, media_type, '')
                       for media_tag, media_type
                       in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
-        media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
+        media_tags.extend(re.findall(
+            # We only allow video|audio followed by whitespace or '>'.
+            # Allowing more characters may cause a significant slowdown (see
+            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
+            # http://www.porntrex.com/maps/videositemap.xml).
+            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
         for media_tag, media_type, media_content in media_tags:
             media_info = {
                 'formats': [],
@@ -1955,10 +2004,13 @@ class InfoExtractor(object):
                 entries.append(media_info)
         return entries
 
-    def _extract_akamai_formats(self, manifest_url, video_id):
+    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
         formats = []
         hdcore_sign = 'hdcore=3.7.0'
-        f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+        hds_host = hosts.get('hds')
+        if hds_host:
+            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
         if 'hdcore=' not in f4m_url:
             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
         f4m_formats = self._extract_f4m_formats(
@@ -1966,7 +2018,10 @@ class InfoExtractor(object):
         for entry in f4m_formats:
             entry.update({'extra_param_to_segment_url': hdcore_sign})
         formats.extend(f4m_formats)
-        m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
+        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
+        hls_host = hosts.get('hls')
+        if hls_host:
+            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
         formats.extend(self._extract_m3u8_formats(
             m3u8_url, video_id, 'mp4', 'm3u8_native',
             m3u8_id='hls', fatal=False))