]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ard.py
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
7 from . generic
import GenericIE
18 from .. compat
import compat_etree_fromstring
21 class ARDMediathekIE ( InfoExtractor
):
22 IE_NAME
= 'ARD:mediathek'
23 _VALID_URL
= r
'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
26 'url' : 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114' ,
30 'title' : 'Ich liebe das Leben trotzdem' ,
31 'description' : 'md5:45e4c225c72b27993314b31a84a5261c' ,
36 'skip_download' : True ,
38 'skip' : 'HTTP Error 404: Not Found' ,
40 'url' : 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916' ,
41 'md5' : 'f4d98b10759ac06c0072bbcd1f0b9e3e' ,
45 'title' : 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)' ,
46 'description' : 'md5:196392e79876d0ac94c94e8cdb2875f1' ,
49 'skip' : 'HTTP Error 404: Not Found' ,
52 'url' : 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086' ,
53 'md5' : '219d94d8980b4f538c7fcb0865eb7f2c' ,
57 'title' : 'Tod eines Fußballers' ,
58 'description' : 'md5:f6e39f3461f0e1f54bfa48c8875c86ef' ,
61 'skip' : 'HTTP Error 404: Not Found' ,
63 'url' : 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht' ,
64 'only_matching' : True ,
67 'url' : 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158' ,
68 'md5' : '4e8f00631aac0395fee17368ac0e9867' ,
72 'title' : 'Vor dem Fest' ,
73 'description' : 'md5:c0c1c8048514deaed2a73b3a60eecacb' ,
76 'skip' : 'Video is no longer available' ,
79 def _extract_media_info ( self
, media_info_url
, webpage
, video_id
):
80 media_info
= self
._ download
_ json
(
81 media_info_url
, video_id
, 'Downloading media JSON' )
83 formats
= self
._ extract
_ formats
( media_info
, video_id
)
86 if '"fsk"' in webpage
:
88 'This video is only available after 20:00' , expected
= True )
89 elif media_info
. get ( '_geoblocked' ):
90 raise ExtractorError ( 'This video is not available due to geo restriction' , expected
= True )
92 self
._ sort
_ formats
( formats
)
94 duration
= int_or_none ( media_info
. get ( '_duration' ))
95 thumbnail
= media_info
. get ( '_previewImage' )
98 subtitle_url
= media_info
. get ( '_subtitleUrl' )
107 'duration' : duration
,
108 'thumbnail' : thumbnail
,
110 'subtitles' : subtitles
,
113 def _extract_formats ( self
, media_info
, video_id
):
114 type_
= media_info
. get ( '_type' )
115 media_array
= media_info
. get ( '_mediaArray' , [])
117 for num
, media
in enumerate ( media_array
):
118 for stream
in media
. get ( '_mediaStreamArray' , []):
119 stream_urls
= stream
. get ( '_stream' )
122 if not isinstance ( stream_urls
, list ):
123 stream_urls
= [ stream_urls
]
124 quality
= stream
. get ( '_quality' )
125 server
= stream
. get ( '_server' )
126 for stream_url
in stream_urls
:
127 ext
= determine_ext ( stream_url
)
128 if quality
!= 'auto' and ext
in ( 'f4m' , 'm3u8' ):
131 formats
. extend ( self
._ extract
_ f
4 m
_ formats
(
132 update_url_query ( stream_url
, {
134 'plugin' : 'aasp-3.1.1.69.124'
136 video_id
, f4m_id
= 'hds' , fatal
= False ))
138 formats
. extend ( self
._ extract
_ m
3u8_ formats
(
139 stream_url
, video_id
, 'mp4' , m3u8_id
= 'hls' , fatal
= False ))
141 if server
and server
. startswith ( 'rtmp' ):
144 'play_path' : stream_url
,
145 'format_id' : 'a %s-r tmp- %s ' % ( num
, quality
),
147 elif stream_url
. startswith ( 'http' ):
150 'format_id' : 'a %s-%s-%s ' % ( num
, ext
, quality
)
154 m
= re
. search ( r
'_(?P<width>\d+)x(?P<height>\d+)\.mp4$' , stream_url
)
157 'width' : int ( m
. group ( 'width' )),
158 'height' : int ( m
. group ( 'height' )),
165 def _real_extract ( self
, url
):
166 # determine video id from url
167 m
= re
. match ( self
._ VALID
_U RL
, url
)
169 numid
= re
. search ( r
'documentId=([0-9]+)' , url
)
171 video_id
= numid
. group ( 1 )
173 video_id
= m
. group ( 'video_id' )
175 webpage
= self
._ download
_ webpage
( url
, video_id
)
177 if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage
:
178 raise ExtractorError ( 'Video %s is no longer available' % video_id
, expected
= True )
180 if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage
:
181 raise ExtractorError ( 'This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id
, expected
= True )
183 if re
. search ( r
'[\?&]rss($|[=&])' , url
):
184 doc
= compat_etree_fromstring ( webpage
. encode ( 'utf-8' ))
186 return GenericIE () ._ extract
_ rss
( url
, video_id
, doc
)
188 title
= self
._ html
_ search
_ regex
(
189 [ r
'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>' ,
190 r
'<meta name="dcterms.title" content="(.*?)"/>' ,
191 r
'<h4 class="headline">(.*?)</h4>' ],
193 description
= self
._ html
_ search
_ meta
(
194 'dcterms.abstract' , webpage
, 'description' , default
= None )
195 if description
is None :
196 description
= self
._ html
_ search
_ meta
(
197 'description' , webpage
, 'meta description' )
199 # Thumbnail is sometimes not present.
200 # It is in the mobile version, but that seems to use a different URL
201 # structure altogether.
202 thumbnail
= self
._ og
_ search
_ thumbnail
( webpage
, default
= None )
204 media_streams
= re
. findall ( r
'''(?x)
205 mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
206 "([^"]+)"''' , webpage
)
209 QUALITIES
= qualities ([ 'lo' , 'hi' , 'hq' ])
211 for furl
in set ( media_streams
):
212 if furl
. endswith ( '.f4m' ):
215 fid_m
= re
. match ( r
'.*\.([^.]+)\.[^.]+$' , furl
)
216 fid
= fid_m
. group ( 1 ) if fid_m
else None
218 'quality' : QUALITIES ( fid
),
222 self
._ sort
_ formats
( formats
)
226 else : # request JSON file
227 info
= self
._ extract
_ media
_ info
(
228 'http://www.ardmediathek.de/play/media/ %s ' % video_id
, webpage
, video_id
)
233 'description' : description
,
234 'thumbnail' : thumbnail
,
240 class ARDIE ( InfoExtractor
):
241 _VALID_URL
= '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
243 'url' : 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html' ,
244 'md5' : 'd216c3a86493f9322545e045ddc3eb35' ,
246 'display_id' : 'die-story-im-ersten-mission-unter-falscher-flagge' ,
250 'title' : 'Die Story im Ersten: Mission unter falscher Flagge' ,
251 'upload_date' : '20140804' ,
252 'thumbnail' : 're:^https?://.*\.jpg$' ,
254 'skip' : 'HTTP Error 404: Not Found' ,
257 def _real_extract ( self
, url
):
258 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
259 display_id
= mobj
. group ( 'display_id' )
261 player_url
= mobj
. group ( 'mainurl' ) + '~playerXml.xml'
262 doc
= self
._ download
_ xml
( player_url
, display_id
)
263 video_node
= doc
. find ( './video' )
264 upload_date
= unified_strdate ( xpath_text (
265 video_node
, './broadcastDate' ))
266 thumbnail
= xpath_text ( video_node
, './/teaserImage//variant/url' )
269 for a
in video_node
. findall ( './/asset' ):
271 'format_id' : a
. attrib
[ 'type' ],
272 'width' : int_or_none ( a
. find ( './frameWidth' ). text
),
273 'height' : int_or_none ( a
. find ( './frameHeight' ). text
),
274 'vbr' : int_or_none ( a
. find ( './bitrateVideo' ). text
),
275 'abr' : int_or_none ( a
. find ( './bitrateAudio' ). text
),
276 'vcodec' : a
. find ( './codecVideo' ). text
,
277 'tbr' : int_or_none ( a
. find ( './totalBitrate' ). text
),
279 if a
. find ( './serverPrefix' ). text
:
280 f
[ 'url' ] = a
. find ( './serverPrefix' ). text
281 f
[ 'playpath' ] = a
. find ( './fileName' ). text
283 f
[ 'url' ] = a
. find ( './fileName' ). text
285 self
._ sort
_ formats
( formats
)
288 'id' : mobj
. group ( 'id' ),
290 'display_id' : display_id
,
291 'title' : video_node
. find ( './title' ). text
,
292 'duration' : parse_duration ( video_node
. find ( './duration' ). text
),
293 'upload_date' : upload_date
,
294 'thumbnail' : thumbnail
,