New upstream version 2020.09.14

[youtubedl] / youtube_dl / extractor / ard.py
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py

index 957bdefcbec7666473530815df9e3eb5dc88e096..5b7b2dd6d2ee2427d0df3920f982b3eece6d67fc 100644 (file)
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -1,66 +1,207 @@
  # coding: utf-8
  from __future__ import unicode_literals
  
+import json
  import re
  
  from .common import InfoExtractor
+from .generic import GenericIE
  from ..utils import (
      determine_ext,
      ExtractorError,
+    int_or_none,
+    parse_duration,
      qualities,
-    compat_urllib_parse_urlparse,
-    compat_urllib_parse,
+    str_or_none,
+    try_get,
+    unified_strdate,
+    unified_timestamp,
+    update_url_query,
+    url_or_none,
+    xpath_text,
  )
+from ..compat import compat_etree_fromstring
  
  
-class ARDIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
+class ARDMediathekBaseIE(InfoExtractor):
+    _GEO_COUNTRIES = ['DE']
+
+    def _extract_media_info(self, media_info_url, webpage, video_id):
+        media_info = self._download_json(
+            media_info_url, video_id, 'Downloading media JSON')
+        return self._parse_media_info(media_info, video_id, '"fsk"' in webpage)
+
+    def _parse_media_info(self, media_info, video_id, fsk):
+        formats = self._extract_formats(media_info, video_id)
+
+        if not formats:
+            if fsk:
+                raise ExtractorError(
+                    'This video is only available after 20:00', expected=True)
+            elif media_info.get('_geoblocked'):
+                self.raise_geo_restricted(
+                    'This video is not available due to geoblocking',
+                    countries=self._GEO_COUNTRIES)
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        subtitle_url = media_info.get('_subtitleUrl')
+        if subtitle_url:
+            subtitles['de'] = [{
+                'ext': 'ttml',
+                'url': subtitle_url,
+            }]
+
+        return {
+            'id': video_id,
+            'duration': int_or_none(media_info.get('_duration')),
+            'thumbnail': media_info.get('_previewImage'),
+            'is_live': media_info.get('_isLive') is True,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _extract_formats(self, media_info, video_id):
+        type_ = media_info.get('_type')
+        media_array = media_info.get('_mediaArray', [])
+        formats = []
+        for num, media in enumerate(media_array):
+            for stream in media.get('_mediaStreamArray', []):
+                stream_urls = stream.get('_stream')
+                if not stream_urls:
+                    continue
+                if not isinstance(stream_urls, list):
+                    stream_urls = [stream_urls]
+                quality = stream.get('_quality')
+                server = stream.get('_server')
+                for stream_url in stream_urls:
+                    if not url_or_none(stream_url):
+                        continue
+                    ext = determine_ext(stream_url)
+                    if quality != 'auto' and ext in ('f4m', 'm3u8'):
+                        continue
+                    if ext == 'f4m':
+                        formats.extend(self._extract_f4m_formats(
+                            update_url_query(stream_url, {
+                                'hdcore': '3.1.1',
+                                'plugin': 'aasp-3.1.1.69.124'
+                            }), video_id, f4m_id='hds', fatal=False))
+                    elif ext == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            stream_url, video_id, 'mp4', 'm3u8_native',
+                            m3u8_id='hls', fatal=False))
+                    else:
+                        if server and server.startswith('rtmp'):
+                            f = {
+                                'url': server,
+                                'play_path': stream_url,
+                                'format_id': 'a%s-rtmp-%s' % (num, quality),
+                            }
+                        else:
+                            f = {
+                                'url': stream_url,
+                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
+                            }
+                        m = re.search(
+                            r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$',
+                            stream_url)
+                        if m:
+                            f.update({
+                                'width': int(m.group('width')),
+                                'height': int(m.group('height')),
+                            })
+                        if type_ == 'audio':
+                            f['vcodec'] = 'none'
+                        formats.append(f)
+        return formats
+
+
+class ARDMediathekIE(ARDMediathekBaseIE):
+    IE_NAME = 'ARD:mediathek'
+    _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
  
      _TESTS = [{
-        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
-        'file': '22429276.mp4',
-        'md5': '469751912f1de0816a9fc9df8336476c',
-        'info_dict': {
-            'title': 'Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?',
-            'description': 'Das Erste Mediathek [ARD]: Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?, Anne Will, Über die Spionage-Affäre diskutieren Clemens Binninger, Katrin Göring-Eckardt, Georg Mascolo, Andrew B. Denison und Constanze Kurz.. Das Video zur Sendung Anne Will am Mittwoch, 16.07.2014',
-        },
-        'skip': 'Blocked outside of Germany',
-    }, {
-        'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
+        # available till 26.07.2022
+        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
          'info_dict': {
-            'id': '22490580',
+            'id': '44726822',
              'ext': 'mp4',
-            'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
-            'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
+            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
+            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
+            'duration': 1740,
          },
-        'skip': 'Blocked outside of Germany',
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
+        'only_matching': True,
+    }, {
+        # audio
+        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
+        'only_matching': True,
+    }, {
+        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
+        'only_matching': True,
+    }, {
+        # audio
+        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
+        'only_matching': True,
+    }, {
+        'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
+        'only_matching': True,
      }]
  
+    @classmethod
+    def suitable(cls, url):
+        return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)
+
      def _real_extract(self, url):
          # determine video id from url
          m = re.match(self._VALID_URL, url)
  
+        document_id = None
+
          numid = re.search(r'documentId=([0-9]+)', url)
          if numid:
-            video_id = numid.group(1)
+            document_id = video_id = numid.group(1)
          else:
              video_id = m.group('video_id')
  
-        urlp = compat_urllib_parse_urlparse(url)
-        url = urlp._replace(path=compat_urllib_parse.quote(urlp.path.encode('utf-8'))).geturl()
-
          webpage = self._download_webpage(url, video_id)
  
+        ERRORS = (
+            ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
+            ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
+             'Video %s is no longer available'),
+        )
+
+        for pattern, message in ERRORS:
+            if pattern in webpage:
+                raise ExtractorError(message % video_id, expected=True)
+
+        if re.search(r'[\?&]rss($|[=&])', url):
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
+            if doc.tag == 'rss':
+                return GenericIE()._extract_rss(url, video_id, doc)
+
          title = self._html_search_regex(
              [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
-             r'<meta name="dcterms.title" content="(.*?)"/>',
-             r'<h4 class="headline">(.*?)</h4>'],
+             r'<meta name="dcterms\.title" content="(.*?)"/>',
+             r'<h4 class="headline">(.*?)</h4>',
+             r'<title[^>]*>(.*?)</title>'],
              webpage, 'title')
          description = self._html_search_meta(
              'dcterms.abstract', webpage, 'description', default=None)
          if description is None:
              description = self._html_search_meta(
-                'description', webpage, 'meta description')
+                'description', webpage, 'meta description', default=None)
+        if description is None:
+            description = self._html_search_regex(
+                r'<p\s+class="teasertext">(.+?)</p>',
+                webpage, 'teaser text', default=None)
  
          # Thumbnail is sometimes not present.
          # It is in the mobile version, but that seems to use a different URL
@@ -85,43 +226,197 @@ class ARDIE(InfoExtractor):
                      'format_id': fid,
                      'url': furl,
                  })
+            self._sort_formats(formats)
+            info = {
+                'formats': formats,
+            }
          else:  # request JSON file
-            media_info = self._download_json(
-                'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
-            # The second element of the _mediaArray contains the standard http urls
-            streams = media_info['_mediaArray'][1]['_mediaStreamArray']
-            if not streams:
-                if '"fsk"' in webpage:
-                    raise ExtractorError('This video is only available after 20:00')
+            if not document_id:
+                video_id = self._search_regex(
+                    r'/play/(?:config|media)/(\d+)', webpage, 'media id')
+            info = self._extract_media_info(
+                'http://www.ardmediathek.de/play/media/%s' % video_id,
+                webpage, video_id)
+
+        info.update({
+            'id': video_id,
+            'title': self._live_title(title) if info.get('is_live') else title,
+            'description': description,
+            'thumbnail': thumbnail,
+        })
+
+        return info
  
-            formats = []
-            for s in streams:
-                if type(s['_stream']) == list:
-                    for index, url in enumerate(s['_stream'][::-1]):
-                        quality = s['_quality'] + index
-                        formats.append({
-                            'quality': quality,
-                            'url': url,
-                            'format_id': '%s-%s' % (determine_ext(url), quality)
-                        })
-                    continue
  
-                format = {
-                    'quality': s['_quality'],
-                    'url': s['_stream'],
-                }
+class ARDIE(InfoExtractor):
+    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
+    _TESTS = [{
+        # available till 14.02.2019
+        'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
+        'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49',
+        'info_dict': {
+            'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video',
+            'id': '102',
+            'ext': 'mp4',
+            'duration': 4435.0,
+            'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?',
+            'upload_date': '20180214',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
+        'only_matching': True,
+    }]
  
-                format['format_id'] = '%s-%s' % (
-                    determine_ext(format['url']), format['quality'])
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
  
-                formats.append(format)
+        player_url = mobj.group('mainurl') + '~playerXml.xml'
+        doc = self._download_xml(player_url, display_id)
+        video_node = doc.find('./video')
+        upload_date = unified_strdate(xpath_text(
+            video_node, './broadcastDate'))
+        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
  
+        formats = []
+        for a in video_node.findall('.//asset'):
+            f = {
+                'format_id': a.attrib['type'],
+                'width': int_or_none(a.find('./frameWidth').text),
+                'height': int_or_none(a.find('./frameHeight').text),
+                'vbr': int_or_none(a.find('./bitrateVideo').text),
+                'abr': int_or_none(a.find('./bitrateAudio').text),
+                'vcodec': a.find('./codecVideo').text,
+                'tbr': int_or_none(a.find('./totalBitrate').text),
+            }
+            if a.find('./serverPrefix').text:
+                f['url'] = a.find('./serverPrefix').text
+                f['playpath'] = a.find('./fileName').text
+            else:
+                f['url'] = a.find('./fileName').text
+            formats.append(f)
          self._sort_formats(formats)
  
          return {
-            'id': video_id,
-            'title': title,
-            'description': description,
+            'id': mobj.group('id'),
              'formats': formats,
+            'display_id': display_id,
+            'title': video_node.find('./title').text,
+            'duration': parse_duration(video_node.find('./duration').text),
+            'upload_date': upload_date,
              'thumbnail': thumbnail,
          }
+
+
+class ARDBetaMediathekIE(ARDMediathekBaseIE):
+    _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?:player|live|video)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
+    _TESTS = [{
+        'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
+        'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',
+        'info_dict': {
+            'display_id': 'die-robuste-roswita',
+            'id': '70153354',
+            'title': 'Die robuste Roswita',
+            'description': r're:^Der Mord.*trüber ist als die Ilm.',
+            'duration': 5316,
+            'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard',
+            'timestamp': 1577047500,
+            'upload_date': '20191222',
+            'ext': 'mp4',
+        },
+    }, {
+        'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
+        'only_matching': True,
+    }, {
+        'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('video_id')
+        display_id = mobj.group('display_id')
+        if display_id:
+            display_id = display_id.rstrip('/')
+        if not display_id:
+            display_id = video_id
+
+        player_page = self._download_json(
+            'https://api.ardmediathek.de/public-gateway',
+            display_id, data=json.dumps({
+                'query': '''{
+  playerPage(client:"%s", clipId: "%s") {
+    blockedByFsk
+    broadcastedOn
+    maturityContentRating
+    mediaCollection {
+      _duration
+      _geoblocked
+      _isLive
+      _mediaArray {
+        _mediaStreamArray {
+          _quality
+          _server
+          _stream
+        }
+      }
+      _previewImage
+      _subtitleUrl
+      _type
+    }
+    show {
+      title
+    }
+    synopsis
+    title
+    tracking {
+      atiCustomVars {
+        contentId
+      }
+    }
+  }
+}''' % (mobj.group('client'), video_id),
+            }).encode(), headers={
+                'Content-Type': 'application/json'
+            })['data']['playerPage']
+        title = player_page['title']
+        content_id = str_or_none(try_get(
+            player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
+        media_collection = player_page.get('mediaCollection') or {}
+        if not media_collection and content_id:
+            media_collection = self._download_json(
+                'https://www.ardmediathek.de/play/media/' + content_id,
+                content_id, fatal=False) or {}
+        info = self._parse_media_info(
+            media_collection, content_id or video_id,
+            player_page.get('blockedByFsk'))
+        age_limit = None
+        description = player_page.get('synopsis')
+        maturity_content_rating = player_page.get('maturityContentRating')
+        if maturity_content_rating:
+            age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
+        if not age_limit and description:
+            age_limit = int_or_none(self._search_regex(
+                r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
+        info.update({
+            'age_limit': age_limit,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
+            'series': try_get(player_page, lambda x: x['show']['title']),
+        })
+        return info