X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/22bc55bffeb45b7d2f3056ae863eb3228e6507e8..4fbf6829491780534e93bd27e5a749e608c97b46:/youtube_dl/extractor/ard.py diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6bf8f61..2f47e21 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -8,70 +9,42 @@ from .generic import GenericIE from ..utils import ( determine_ext, ExtractorError, - qualities, int_or_none, parse_duration, + qualities, + str_or_none, + try_get, unified_strdate, - xpath_text, + unified_timestamp, update_url_query, url_or_none, + xpath_text, ) from ..compat import compat_etree_fromstring -class ARDMediathekIE(InfoExtractor): - IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' - - _TESTS = [{ - # available till 26.07.2022 - 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', - 'info_dict': { - 'id': '44726822', - 'ext': 'mp4', - 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', - 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', - 'duration': 1740, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', - 'only_matching': True, - }, { - # audio - 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', - 'only_matching': True, - }, { - 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', - 'only_matching': True, - }, { - # audio - 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', - 'only_matching': True, - }] +class ARDMediathekBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['DE'] def _extract_media_info(self, media_info_url, webpage, video_id): media_info = self._download_json( media_info_url, video_id, 'Downloading media JSON') + return self._parse_media_info(media_info, video_id, '"fsk"' in webpage) + def _parse_media_info(self, media_info, video_id, fsk): formats = self._extract_formats(media_info, video_id) if not formats: - if '"fsk"' in webpage: + if fsk: raise ExtractorError( 'This video is only available after 20:00', expected=True) elif media_info.get('_geoblocked'): - raise ExtractorError('This video is not available due to geo restriction', expected=True) + self.raise_geo_restricted( + 'This video is not available due to geoblocking', + countries=self._GEO_COUNTRIES) self._sort_formats(formats) - duration = int_or_none(media_info.get('_duration')) - thumbnail = media_info.get('_previewImage') - is_live = media_info.get('_isLive') is True - subtitles = {} subtitle_url = media_info.get('_subtitleUrl') if subtitle_url: @@ -82,9 +55,9 @@ class ARDMediathekIE(InfoExtractor): return { 'id': video_id, - 'duration': duration, - 'thumbnail': thumbnail, - 'is_live': is_live, + 'duration': int_or_none(media_info.get('_duration')), + 'thumbnail': media_info.get('_previewImage'), + 'is_live': media_info.get('_isLive') is True, 'formats': formats, 'subtitles': subtitles, } @@ -113,11 +86,11 @@ class ARDMediathekIE(InfoExtractor): update_url_query(stream_url, { 'hdcore': '3.1.1', 'plugin': 'aasp-3.1.1.69.124' - }), - video_id, f4m_id='hds', fatal=False)) + }), video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + stream_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: if server and server.startswith('rtmp'): f = { @@ -130,7 +103,9 @@ class ARDMediathekIE(InfoExtractor): 'url': stream_url, 'format_id': 'a%s-%s-%s' % (num, ext, quality) } - m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) + m = re.search( + r'_(?P\d+)x(?P\d+)\.mp4$', + stream_url) if m: f.update({ 'width': int(m.group('width')), @@ -141,6 +116,48 @@ class ARDMediathekIE(InfoExtractor): formats.append(f) return formats + +class ARDMediathekIE(ARDMediathekBaseIE): + IE_NAME = 'ARD:mediathek' + _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' + + _TESTS = [{ + # available till 26.07.2022 + 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', + 'info_dict': { + 'id': '44726822', + 'ext': 'mp4', + 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', + 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', + 'duration': 1740, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', + 'only_matching': True, + }, { + # audio + 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', + 'only_matching': True, + }, { + 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', + 'only_matching': True, + }, { + # audio + 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', + 'only_matching': True, + }, { + 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) + def _real_extract(self, url): # determine video id from url m = re.match(self._VALID_URL, url) @@ -173,13 +190,18 @@ class ARDMediathekIE(InfoExtractor): title = self._html_search_regex( [r'(.*?)', r'', - r'

(.*?)

'], + r'

(.*?)

', + r']*>(.*?)'], webpage, 'title') description = self._html_search_meta( 'dcterms.abstract', webpage, 'description', default=None) if description is None: description = self._html_search_meta( - 'description', webpage, 'meta description') + 'description', webpage, 'meta description', default=None) + if description is None: + description = self._html_search_regex( + r'(.+?)

', + webpage, 'teaser text', default=None) # Thumbnail is sometimes not present. # It is in the mobile version, but that seems to use a different URL @@ -287,74 +309,98 @@ class ARDIE(InfoExtractor): } -class ARDBetaMediathekIE(InfoExtractor): - _VALID_URL = r'https://beta\.ardmediathek\.de/[a-z]+/player/(?P[a-zA-Z0-9]+)/(?P[^/?#]+)' +class ARDBetaMediathekIE(ARDMediathekBaseIE): + _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/(?P[^/]+)/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' _TESTS = [{ 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', - 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', + 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', 'info_dict': { 'display_id': 'die-robuste-roswita', - 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'title': 'Tatort: Die robuste Roswita', + 'id': '70153354', + 'title': 'Die robuste Roswita', 'description': r're:^Der Mord.*trüber ist als die Ilm.', 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard', - 'upload_date': '20180826', + 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', + 'timestamp': 1577047500, + 'upload_date': '20191222', 'ext': 'mp4', }, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') - data = self._parse_json(data_json, display_id) - - res = { - 'id': video_id, - 'display_id': display_id, + display_id = mobj.group('display_id') or video_id + + player_page = self._download_json( + 'https://api.ardmediathek.de/public-gateway', + display_id, data=json.dumps({ + 'query': '''{ + playerPage(client:"%s", clipId: "%s") { + blockedByFsk + broadcastedOn + maturityContentRating + mediaCollection { + _duration + _geoblocked + _isLive + _mediaArray { + _mediaStreamArray { + _quality + _server + _stream } - formats = [] - for widget in data.values(): - if widget.get('_geoblocked'): - raise ExtractorError('This video is not available due to geoblocking', expected=True) - - if '_duration' in widget: - res['duration'] = widget['_duration'] - if 'clipTitle' in widget: - res['title'] = widget['clipTitle'] - if '_previewImage' in widget: - res['thumbnail'] = widget['_previewImage'] - if 'broadcastedOn' in widget: - res['upload_date'] = unified_strdate(widget['broadcastedOn']) - if 'synopsis' in widget: - res['description'] = widget['synopsis'] - if '_subtitleUrl' in widget: - res['subtitles'] = {'de': [{ - 'ext': 'ttml', - 'url': widget['_subtitleUrl'], - }]} - if '_quality' in widget: - format_url = widget['_stream']['json'][0] - - if format_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats( - format_url + '?hdcore=3.11.0', - video_id, f4m_id='hds', fatal=False)) - elif format_url.endswith('m3u8'): - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'format_id': 'http-' + widget['_quality'], - 'url': format_url, - 'preference': 10, # Plain HTTP, that's nice - }) - - self._sort_formats(formats) - res['formats'] = formats - - return res + } + _previewImage + _subtitleUrl + _type + } + show { + title + } + synopsis + title + tracking { + atiCustomVars { + contentId + } + } + } +}''' % (mobj.group('client'), video_id), + }).encode(), headers={ + 'Content-Type': 'application/json' + })['data']['playerPage'] + title = player_page['title'] + content_id = str_or_none(try_get( + player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) + media_collection = player_page.get('mediaCollection') or {} + if not media_collection and content_id: + media_collection = self._download_json( + 'https://www.ardmediathek.de/play/media/' + content_id, + content_id, fatal=False) or {} + info = self._parse_media_info( + media_collection, content_id or video_id, + player_page.get('blockedByFsk')) + age_limit = None + description = player_page.get('synopsis') + maturity_content_rating = player_page.get('maturityContentRating') + if maturity_content_rating: + age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) + if not age_limit and description: + age_limit = int_or_none(self._search_regex( + r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) + info.update({ + 'age_limit': age_limit, + 'display_id': display_id, + 'title': title, + 'description': description, + 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), + 'series': try_get(player_page, lambda x: x['show']['title']), + }) + return info