X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/a6abd8dc822fb70852437ab5a77ced5f633739f8..948e7e72e8c48fac09cc92b37abe001b421fda48:/youtube_dl/extractor/ard.py

diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 783b53e..2f47e21 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import json
 import re
 
 from .common import InfoExtractor
@@ -8,63 +9,199 @@ from .generic import GenericIE
 from ..utils import (
     determine_ext,
     ExtractorError,
-    qualities,
     int_or_none,
     parse_duration,
+    qualities,
+    str_or_none,
+    try_get,
     unified_strdate,
+    unified_timestamp,
+    update_url_query,
+    url_or_none,
     xpath_text,
-    parse_xml,
 )
+from ..compat import compat_etree_fromstring
+
+
+class ARDMediathekBaseIE(InfoExtractor):
+    _GEO_COUNTRIES = ['DE']
+
+    def _extract_media_info(self, media_info_url, webpage, video_id):
+        media_info = self._download_json(
+            media_info_url, video_id, 'Downloading media JSON')
+        return self._parse_media_info(media_info, video_id, '"fsk"' in webpage)
+
+    def _parse_media_info(self, media_info, video_id, fsk):
+        formats = self._extract_formats(media_info, video_id)
 
+        if not formats:
+            if fsk:
+                raise ExtractorError(
+                    'This video is only available after 20:00', expected=True)
+            elif media_info.get('_geoblocked'):
+                self.raise_geo_restricted(
+                    'This video is not available due to geoblocking',
+                    countries=self._GEO_COUNTRIES)
 
-class ARDMediathekIE(InfoExtractor):
+        self._sort_formats(formats)
+
+        subtitles = {}
+        subtitle_url = media_info.get('_subtitleUrl')
+        if subtitle_url:
+            subtitles['de'] = [{
+                'ext': 'ttml',
+                'url': subtitle_url,
+            }]
+
+        return {
+            'id': video_id,
+            'duration': int_or_none(media_info.get('_duration')),
+            'thumbnail': media_info.get('_previewImage'),
+            'is_live': media_info.get('_isLive') is True,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _extract_formats(self, media_info, video_id):
+        type_ = media_info.get('_type')
+        media_array = media_info.get('_mediaArray', [])
+        formats = []
+        for num, media in enumerate(media_array):
+            for stream in media.get('_mediaStreamArray', []):
+                stream_urls = stream.get('_stream')
+                if not stream_urls:
+                    continue
+                if not isinstance(stream_urls, list):
+                    stream_urls = [stream_urls]
+                quality = stream.get('_quality')
+                server = stream.get('_server')
+                for stream_url in stream_urls:
+                    if not url_or_none(stream_url):
+                        continue
+                    ext = determine_ext(stream_url)
+                    if quality != 'auto' and ext in ('f4m', 'm3u8'):
+                        continue
+                    if ext == 'f4m':
+                        formats.extend(self._extract_f4m_formats(
+                            update_url_query(stream_url, {
+                                'hdcore': '3.1.1',
+                                'plugin': 'aasp-3.1.1.69.124'
+                            }), video_id, f4m_id='hds', fatal=False))
+                    elif ext == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            stream_url, video_id, 'mp4', 'm3u8_native',
+                            m3u8_id='hls', fatal=False))
+                    else:
+                        if server and server.startswith('rtmp'):
+                            f = {
+                                'url': server,
+                                'play_path': stream_url,
+                                'format_id': 'a%s-rtmp-%s' % (num, quality),
+                            }
+                        else:
+                            f = {
+                                'url': stream_url,
+                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
+                            }
+                        m = re.search(
+                            r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$',
+                            stream_url)
+                        if m:
+                            f.update({
+                                'width': int(m.group('width')),
+                                'height': int(m.group('height')),
+                            })
+                        if type_ == 'audio':
+                            f['vcodec'] = 'none'
+                        formats.append(f)
+        return formats
+
+
+class ARDMediathekIE(ARDMediathekBaseIE):
     IE_NAME = 'ARD:mediathek'
-    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
+    _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
 
     _TESTS = [{
-        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
+        # available till 26.07.2022
+        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
         'info_dict': {
-            'id': '22490580',
+            'id': '44726822',
             'ext': 'mp4',
-            'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
-            'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklÃ¤rt Prof. Boerne am Tatort.',
+            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
+            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
+            'duration': 1740,
         },
-        'skip': 'Blocked outside of Germany',
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
+        'only_matching': True,
+    }, {
+        # audio
+        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
+        'only_matching': True,
+    }, {
+        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
+        'only_matching': True,
+    }, {
+        # audio
+        'url': 'http://mediathek.rbb-online.de/radio/HÃ¶rspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
+        'only_matching': True,
+    }, {
+        'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
+        'only_matching': True,
     }]
 
+    @classmethod
+    def suitable(cls, url):
+        return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)
+
     def _real_extract(self, url):
         # determine video id from url
         m = re.match(self._VALID_URL, url)
 
+        document_id = None
+
         numid = re.search(r'documentId=([0-9]+)', url)
         if numid:
-            video_id = numid.group(1)
+            document_id = video_id = numid.group(1)
         else:
             video_id = m.group('video_id')
 
         webpage = self._download_webpage(url, video_id)
 
-        if '>Der gewÃ¼nschte Beitrag ist nicht mehr verfÃ¼gbar.<' in webpage:
-            raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
+        ERRORS = (
+            ('>Leider liegt eine StÃ¶rung vor.', 'Video %s is unavailable'),
+            ('>Der gewÃ¼nschte Beitrag ist nicht mehr verfÃ¼gbar.<',
+             'Video %s is no longer available'),
+        )
+
+        for pattern, message in ERRORS:
+            if pattern in webpage:
+                raise ExtractorError(message % video_id, expected=True)
 
         if re.search(r'[\?&]rss($|[=&])', url):
-            doc = parse_xml(webpage)
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
             if doc.tag == 'rss':
                 return GenericIE()._extract_rss(url, video_id, doc)
 
         title = self._html_search_regex(
             [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
-             r'<meta name="dcterms.title" content="(.*?)"/>',
-             r'<h4 class="headline">(.*?)</h4>'],
+             r'<meta name="dcterms\.title" content="(.*?)"/>',
+             r'<h4 class="headline">(.*?)</h4>',
+             r'<title[^>]*>(.*?)</title>'],
             webpage, 'title')
         description = self._html_search_meta(
             'dcterms.abstract', webpage, 'description', default=None)
         if description is None:
             description = self._html_search_meta(
-                'description', webpage, 'meta description')
+                'description', webpage, 'meta description', default=None)
+        if description is None:
+            description = self._html_search_regex(
+                r'<p\s+class="teasertext">(.+?)</p>',
+                webpage, 'teaser text', default=None)
 
         # Thumbnail is sometimes not present.
         # It is in the mobile version, but that seems to use a different URL
@@ -89,63 +226,47 @@ class ARDMediathekIE(InfoExtractor):
                     'format_id': fid,
                     'url': furl,
                 })
+            self._sort_formats(formats)
+            info = {
+                'formats': formats,
+            }
         else:  # request JSON file
-            media_info = self._download_json(
-                'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
-            # The second element of the _mediaArray contains the standard http urls
-            streams = media_info['_mediaArray'][1]['_mediaStreamArray']
-            if not streams:
-                if '"fsk"' in webpage:
-                    raise ExtractorError('This video is only available after 20:00')
-
-            formats = []
-            for s in streams:
-                if type(s['_stream']) == list:
-                    for index, url in enumerate(s['_stream'][::-1]):
-                        quality = s['_quality'] + index
-                        formats.append({
-                            'quality': quality,
-                            'url': url,
-                            'format_id': '%s-%s' % (determine_ext(url), quality)
-                        })
-                    continue
+            if not document_id:
+                video_id = self._search_regex(
+                    r'/play/(?:config|media)/(\d+)', webpage, 'media id')
+            info = self._extract_media_info(
+                'http://www.ardmediathek.de/play/media/%s' % video_id,
+                webpage, video_id)
 
-                format = {
-                    'quality': s['_quality'],
-                    'url': s['_stream'],
-                }
-
-                format['format_id'] = '%s-%s' % (
-                    determine_ext(format['url']), format['quality'])
-
-                formats.append(format)
-
-        self._sort_formats(formats)
-
-        return {
+        info.update({
             'id': video_id,
-            'title': title,
+            'title': self._live_title(title) if info.get('is_live') else title,
             'description': description,
-            'formats': formats,
             'thumbnail': thumbnail,
-        }
+        })
+
+        return info
 
 
 class ARDIE(InfoExtractor):
-    _VALID_URL = '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
-    _TEST = {
-        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
-        'md5': 'd216c3a86493f9322545e045ddc3eb35',
+    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
+    _TESTS = [{
+        # available till 14.02.2019
+        'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
+        'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49',
         'info_dict': {
-            'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
-            'id': '100',
+            'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video',
+            'id': '102',
             'ext': 'mp4',
-            'duration': 2600,
-            'title': 'Die Story im Ersten: Mission unter falscher Flagge',
-            'upload_date': '20140804',
-            'thumbnail': 're:^https?://.*\.jpg$',
-        }
-    }
+            'duration': 4435.0,
+            'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?',
+            'upload_date': '20180214',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -186,3 +307,100 @@ class ARDIE(InfoExtractor):
             'upload_date': upload_date,
             'thumbnail': thumbnail,
         }
+
+
+class ARDBetaMediathekIE(ARDMediathekBaseIE):
+    _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/(?P<client>[^/]+)/(?:player|live)/(?P<video_id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^/?#]+))?'
+    _TESTS = [{
+        'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita',
+        'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',
+        'info_dict': {
+            'display_id': 'die-robuste-roswita',
+            'id': '70153354',
+            'title': 'Die robuste Roswita',
+            'description': r're:^Der Mord.*trÃ¼ber ist als die Ilm.',
+            'duration': 5316,
+            'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard',
+            'timestamp': 1577047500,
+            'upload_date': '20191222',
+            'ext': 'mp4',
+        },
+    }, {
+        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('video_id')
+        display_id = mobj.group('display_id') or video_id
+
+        player_page = self._download_json(
+            'https://api.ardmediathek.de/public-gateway',
+            display_id, data=json.dumps({
+                'query': '''{
+  playerPage(client:"%s", clipId: "%s") {
+    blockedByFsk
+    broadcastedOn
+    maturityContentRating
+    mediaCollection {
+      _duration
+      _geoblocked
+      _isLive
+      _mediaArray {
+        _mediaStreamArray {
+          _quality
+          _server
+          _stream
+        }
+      }
+      _previewImage
+      _subtitleUrl
+      _type
+    }
+    show {
+      title
+    }
+    synopsis
+    title
+    tracking {
+      atiCustomVars {
+        contentId
+      }
+    }
+  }
+}''' % (mobj.group('client'), video_id),
+            }).encode(), headers={
+                'Content-Type': 'application/json'
+            })['data']['playerPage']
+        title = player_page['title']
+        content_id = str_or_none(try_get(
+            player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
+        media_collection = player_page.get('mediaCollection') or {}
+        if not media_collection and content_id:
+            media_collection = self._download_json(
+                'https://www.ardmediathek.de/play/media/' + content_id,
+                content_id, fatal=False) or {}
+        info = self._parse_media_info(
+            media_collection, content_id or video_id,
+            player_page.get('blockedByFsk'))
+        age_limit = None
+        description = player_page.get('synopsis')
+        maturity_content_rating = player_page.get('maturityContentRating')
+        if maturity_content_rating:
+            age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
+        if not age_limit and description:
+            age_limit = int_or_none(self._search_regex(
+                r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
+        info.update({
+            'age_limit': age_limit,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
+            'series': try_get(player_page, lambda x: x['show']['title']),
+        })
+        return info