]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ard.py
2f47e21c324102d73cf277accc0b30ac41ddfab6
   2 from __future__ 
import unicode_literals
 
   7 from .common 
import InfoExtractor
 
   8 from .generic 
import GenericIE
 
  23 from ..compat 
import compat_etree_fromstring
 
  26 class ARDMediathekBaseIE(InfoExtractor
): 
  27     _GEO_COUNTRIES 
= ['DE'] 
  29     def _extract_media_info(self
, media_info_url
, webpage
, video_id
): 
  30         media_info 
= self
._download
_json
( 
  31             media_info_url
, video_id
, 'Downloading media JSON') 
  32         return self
._parse
_media
_info
(media_info
, video_id
, '"fsk"' in webpage
) 
  34     def _parse_media_info(self
, media_info
, video_id
, fsk
): 
  35         formats 
= self
._extract
_formats
(media_info
, video_id
) 
  40                     'This video is only available after 20:00', expected
=True) 
  41             elif media_info
.get('_geoblocked'): 
  42                 self
.raise_geo_restricted( 
  43                     'This video is not available due to geoblocking', 
  44                     countries
=self
._GEO
_COUNTRIES
) 
  46         self
._sort
_formats
(formats
) 
  49         subtitle_url 
= media_info
.get('_subtitleUrl') 
  58             'duration': int_or_none(media_info
.get('_duration')), 
  59             'thumbnail': media_info
.get('_previewImage'), 
  60             'is_live': media_info
.get('_isLive') is True, 
  62             'subtitles': subtitles
, 
  65     def _extract_formats(self
, media_info
, video_id
): 
  66         type_ 
= media_info
.get('_type') 
  67         media_array 
= media_info
.get('_mediaArray', []) 
  69         for num
, media 
in enumerate(media_array
): 
  70             for stream 
in media
.get('_mediaStreamArray', []): 
  71                 stream_urls 
= stream
.get('_stream') 
  74                 if not isinstance(stream_urls
, list): 
  75                     stream_urls 
= [stream_urls
] 
  76                 quality 
= stream
.get('_quality') 
  77                 server 
= stream
.get('_server') 
  78                 for stream_url 
in stream_urls
: 
  79                     if not url_or_none(stream_url
): 
  81                     ext 
= determine_ext(stream_url
) 
  82                     if quality 
!= 'auto' and ext 
in ('f4m', 'm3u8'): 
  85                         formats
.extend(self
._extract
_f
4m
_formats
( 
  86                             update_url_query(stream_url
, { 
  88                                 'plugin': 'aasp-3.1.1.69.124' 
  89                             }), video_id
, f4m_id
='hds', fatal
=False)) 
  91                         formats
.extend(self
._extract
_m
3u8_formats
( 
  92                             stream_url
, video_id
, 'mp4', 'm3u8_native', 
  93                             m3u8_id
='hls', fatal
=False)) 
  95                         if server 
and server
.startswith('rtmp'): 
  98                                 'play_path': stream_url
, 
  99                                 'format_id': 'a%s-rtmp-%s' % (num
, quality
), 
 104                                 'format_id': 'a%s-%s-%s' % (num
, ext
, quality
) 
 107                             r
'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', 
 111                                 'width': int(m
.group('width')), 
 112                                 'height': int(m
.group('height')), 
 120 class ARDMediathekIE(ARDMediathekBaseIE
): 
 121     IE_NAME 
= 'ARD:mediathek' 
 122     _VALID_URL 
= r
'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' 
 125         # available till 26.07.2022 
 126         'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', 
 130             'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', 
 131             'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', 
 136             'skip_download': True, 
 139         'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', 
 140         'only_matching': True, 
 143         'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', 
 144         'only_matching': True, 
 146         'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', 
 147         'only_matching': True, 
 150         'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', 
 151         'only_matching': True, 
 153         'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', 
 154         'only_matching': True, 
 158     def suitable(cls
, url
): 
 159         return False if ARDBetaMediathekIE
.suitable(url
) else super(ARDMediathekIE
, cls
).suitable(url
) 
 161     def _real_extract(self
, url
): 
 162         # determine video id from url 
 163         m 
= re
.match(self
._VALID
_URL
, url
) 
 167         numid 
= re
.search(r
'documentId=([0-9]+)', url
) 
 169             document_id 
= video_id 
= numid
.group(1) 
 171             video_id 
= m
.group('video_id') 
 173         webpage 
= self
._download
_webpage
(url
, video_id
) 
 176             ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'), 
 177             ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<', 
 178              'Video %s is no longer available'), 
 181         for pattern
, message 
in ERRORS
: 
 182             if pattern 
in webpage
: 
 183                 raise ExtractorError(message 
% video_id
, expected
=True) 
 185         if re
.search(r
'[\?&]rss($|[=&])', url
): 
 186             doc 
= compat_etree_fromstring(webpage
.encode('utf-8')) 
 188                 return GenericIE()._extract
_rss
(url
, video_id
, doc
) 
 190         title 
= self
._html
_search
_regex
( 
 191             [r
'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', 
 192              r
'<meta name="dcterms\.title" content="(.*?)"/>', 
 193              r
'<h4 class="headline">(.*?)</h4>', 
 194              r
'<title[^>]*>(.*?)</title>'], 
 196         description 
= self
._html
_search
_meta
( 
 197             'dcterms.abstract', webpage
, 'description', default
=None) 
 198         if description 
is None: 
 199             description 
= self
._html
_search
_meta
( 
 200                 'description', webpage
, 'meta description', default
=None) 
 201         if description 
is None: 
 202             description 
= self
._html
_search
_regex
( 
 203                 r
'<p\s+class="teasertext">(.+?)</p>', 
 204                 webpage
, 'teaser text', default
=None) 
 206         # Thumbnail is sometimes not present. 
 207         # It is in the mobile version, but that seems to use a different URL 
 208         # structure altogether. 
 209         thumbnail 
= self
._og
_search
_thumbnail
(webpage
, default
=None) 
 211         media_streams 
= re
.findall(r
'''(?x) 
 212             mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s* 
 213             "([^"]+)"''', webpage
) 
 216             QUALITIES 
= qualities(['lo', 'hi', 'hq']) 
 218             for furl 
in set(media_streams
): 
 219                 if furl
.endswith('.f4m'): 
 222                     fid_m 
= re
.match(r
'.*\.([^.]+)\.[^.]+$', furl
) 
 223                     fid 
= fid_m
.group(1) if fid_m 
else None 
 225                     'quality': QUALITIES(fid
), 
 229             self
._sort
_formats
(formats
) 
 233         else:  # request JSON file 
 235                 video_id 
= self
._search
_regex
( 
 236                     r
'/play/(?:config|media)/(\d+)', webpage
, 'media id') 
 237             info 
= self
._extract
_media
_info
( 
 238                 'http://www.ardmediathek.de/play/media/%s' % video_id
, 
 243             'title': self
._live
_title
(title
) if info
.get('is_live') else title
, 
 244             'description': description
, 
 245             'thumbnail': thumbnail
, 
 251 class ARDIE(InfoExtractor
): 
 252     _VALID_URL 
= r
'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html' 
 254         # available till 14.02.2019 
 255         'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', 
 256         'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49', 
 258             'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video', 
 262             'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?', 
 263             'upload_date': '20180214', 
 264             'thumbnail': r
're:^https?://.*\.jpg$', 
 267         'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', 
 268         'only_matching': True, 
 271     def _real_extract(self
, url
): 
 272         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 273         display_id 
= mobj
.group('display_id') 
 275         player_url 
= mobj
.group('mainurl') + '~playerXml.xml' 
 276         doc 
= self
._download
_xml
(player_url
, display_id
) 
 277         video_node 
= doc
.find('./video') 
 278         upload_date 
= unified_strdate(xpath_text( 
 279             video_node
, './broadcastDate')) 
 280         thumbnail 
= xpath_text(video_node
, './/teaserImage//variant/url') 
 283         for a 
in video_node
.findall('.//asset'): 
 285                 'format_id': a
.attrib
['type'], 
 286                 'width': int_or_none(a
.find('./frameWidth').text
), 
 287                 'height': int_or_none(a
.find('./frameHeight').text
), 
 288                 'vbr': int_or_none(a
.find('./bitrateVideo').text
), 
 289                 'abr': int_or_none(a
.find('./bitrateAudio').text
), 
 290                 'vcodec': a
.find('./codecVideo').text
, 
 291                 'tbr': int_or_none(a
.find('./totalBitrate').text
), 
 293             if a
.find('./serverPrefix').text
: 
 294                 f
['url'] = a
.find('./serverPrefix').text
 
 295                 f
['playpath'] = a
.find('./fileName').text
 
 297                 f
['url'] = a
.find('./fileName').text
 
 299         self
._sort
_formats
(formats
) 
 302             'id': mobj
.group('id'), 
 304             'display_id': display_id
, 
 305             'title': video_node
.find('./title').text
, 
 306             'duration': parse_duration(video_node
.find('./duration').text
), 
 307             'upload_date': upload_date
, 
 308             'thumbnail': thumbnail
, 
 312 class ARDBetaMediathekIE(ARDMediathekBaseIE
): 
 313     _VALID_URL 
= r
'https://(?:beta|www)\.ardmediathek\.de/(?P<client>[^/]+)/(?:player|live)/(?P<video_id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^/?#]+))?' 
 315         'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', 
 316         'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', 
 318             'display_id': 'die-robuste-roswita', 
 320             'title': 'Die robuste Roswita', 
 321             'description': r
're:^Der Mord.*trüber ist als die Ilm.', 
 323             'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', 
 324             'timestamp': 1577047500, 
 325             'upload_date': '20191222', 
 329         'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', 
 330         'only_matching': True, 
 332         'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', 
 333         'only_matching': True, 
 336     def _real_extract(self
, url
): 
 337         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 338         video_id 
= mobj
.group('video_id') 
 339         display_id 
= mobj
.group('display_id') or video_id
 
 341         player_page 
= self
._download
_json
( 
 342             'https://api.ardmediathek.de/public-gateway', 
 343             display_id
, data
=json
.dumps({ 
 345   playerPage(client:"%s", clipId: "%s") { 
 348     maturityContentRating 
 375 }''' % (mobj
.group('client'), video_id
), 
 376             }).encode(), headers
={ 
 377                 'Content-Type': 'application/json' 
 378             })['data']['playerPage'] 
 379         title 
= player_page
['title'] 
 380         content_id 
= str_or_none(try_get( 
 381             player_page
, lambda x
: x
['tracking']['atiCustomVars']['contentId'])) 
 382         media_collection 
= player_page
.get('mediaCollection') or {} 
 383         if not media_collection 
and content_id
: 
 384             media_collection 
= self
._download
_json
( 
 385                 'https://www.ardmediathek.de/play/media/' + content_id
, 
 386                 content_id
, fatal
=False) or {} 
 387         info 
= self
._parse
_media
_info
( 
 388             media_collection
, content_id 
or video_id
, 
 389             player_page
.get('blockedByFsk')) 
 391         description 
= player_page
.get('synopsis') 
 392         maturity_content_rating 
= player_page
.get('maturityContentRating') 
 393         if maturity_content_rating
: 
 394             age_limit 
= int_or_none(maturity_content_rating
.lstrip('FSK')) 
 395         if not age_limit 
and description
: 
 396             age_limit 
= int_or_none(self
._search
_regex
( 
 397                 r
'\(FSK\s*(\d+)\)\s*$', description
, 'age limit', default
=None)) 
 399             'age_limit': age_limit
, 
 400             'display_id': display_id
, 
 402             'description': description
, 
 403             'timestamp': unified_timestamp(player_page
.get('broadcastedOn')), 
 404             'series': try_get(player_page
, lambda x
: x
['show']['title']),