]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ard.py
9fb84911a0b81fd42de2c9bd410cdaf2dd4813a6
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
7 from . generic
import GenericIE
11 get_element_by_attribute
,
18 from .. compat
import compat_etree_fromstring
21 class ARDMediathekIE ( InfoExtractor
):
22 IE_NAME
= 'ARD:mediathek'
23 _VALID_URL
= r
'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
26 'url' : 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114' ,
30 'title' : 'Ich liebe das Leben trotzdem' ,
31 'description' : 'md5:45e4c225c72b27993314b31a84a5261c' ,
36 'skip_download' : True ,
39 'url' : 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916' ,
40 'md5' : 'f4d98b10759ac06c0072bbcd1f0b9e3e' ,
44 'title' : 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)' ,
45 'description' : 'md5:196392e79876d0ac94c94e8cdb2875f1' ,
50 'url' : 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086' ,
51 'md5' : '219d94d8980b4f538c7fcb0865eb7f2c' ,
55 'title' : 'Tod eines Fußballers' ,
56 'description' : 'md5:f6e39f3461f0e1f54bfa48c8875c86ef' ,
60 'url' : 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht' ,
61 'only_matching' : True ,
64 def _extract_media_info ( self
, media_info_url
, webpage
, video_id
):
65 media_info
= self
._ download
_ json
(
66 media_info_url
, video_id
, 'Downloading media JSON' )
68 formats
= self
._ extract
_ formats
( media_info
, video_id
)
71 if '"fsk"' in webpage
:
73 'This video is only available after 20:00' , expected
= True )
74 elif media_info
. get ( '_geoblocked' ):
75 raise ExtractorError ( 'This video is not available due to geo restriction' , expected
= True )
77 self
._ sort
_ formats
( formats
)
79 duration
= int_or_none ( media_info
. get ( '_duration' ))
80 thumbnail
= media_info
. get ( '_previewImage' )
83 subtitle_url
= media_info
. get ( '_subtitleUrl' )
93 'thumbnail' : thumbnail
,
95 'subtitles' : subtitles
,
98 def _extract_formats ( self
, media_info
, video_id
):
99 type_
= media_info
. get ( '_type' )
100 media_array
= media_info
. get ( '_mediaArray' , [])
102 for num
, media
in enumerate ( media_array
):
103 for stream
in media
. get ( '_mediaStreamArray' , []):
104 stream_urls
= stream
. get ( '_stream' )
107 if not isinstance ( stream_urls
, list ):
108 stream_urls
= [ stream_urls
]
109 quality
= stream
. get ( '_quality' )
110 server
= stream
. get ( '_server' )
111 for stream_url
in stream_urls
:
112 ext
= determine_ext ( stream_url
)
113 if quality
!= 'auto' and ext
in ( 'f4m' , 'm3u8' ):
116 formats
. extend ( self
._ extract
_ f
4 m
_ formats
(
117 stream_url
+ '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124' ,
118 video_id
, preference
=- 1 , f4m_id
= 'hds' , fatal
= False ))
120 formats
. extend ( self
._ extract
_ m
3u8_ formats
(
121 stream_url
, video_id
, 'mp4' , preference
= 1 , m3u8_id
= 'hls' , fatal
= False ))
123 if server
and server
. startswith ( 'rtmp' ):
126 'play_path' : stream_url
,
127 'format_id' : 'a %s-r tmp- %s ' % ( num
, quality
),
129 elif stream_url
. startswith ( 'http' ):
132 'format_id' : 'a %s-%s-%s ' % ( num
, ext
, quality
)
136 m
= re
. search ( r
'_(?P<width>\d+)x(?P<height>\d+)\.mp4$' , stream_url
)
139 'width' : int ( m
. group ( 'width' )),
140 'height' : int ( m
. group ( 'height' )),
147 def _real_extract ( self
, url
):
148 # determine video id from url
149 m
= re
. match ( self
._ VALID
_U RL
, url
)
151 numid
= re
. search ( r
'documentId=([0-9]+)' , url
)
153 video_id
= numid
. group ( 1 )
155 video_id
= m
. group ( 'video_id' )
157 webpage
= self
._ download
_ webpage
( url
, video_id
)
159 if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage
:
160 raise ExtractorError ( 'Video %s is no longer available' % video_id
, expected
= True )
162 if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage
:
163 raise ExtractorError ( 'This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id
, expected
= True )
165 if re
. search ( r
'[\?&]rss($|[=&])' , url
):
166 doc
= compat_etree_fromstring ( webpage
. encode ( 'utf-8' ))
168 return GenericIE () ._ extract
_ rss
( url
, video_id
, doc
)
170 title
= self
._ html
_ search
_ regex
(
171 [ r
'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>' ,
172 r
'<meta name="dcterms.title" content="(.*?)"/>' ,
173 r
'<h4 class="headline">(.*?)</h4>' ],
175 description
= self
._ html
_ search
_ meta
(
176 'dcterms.abstract' , webpage
, 'description' , default
= None )
177 if description
is None :
178 description
= self
._ html
_ search
_ meta
(
179 'description' , webpage
, 'meta description' )
181 # Thumbnail is sometimes not present.
182 # It is in the mobile version, but that seems to use a different URL
183 # structure altogether.
184 thumbnail
= self
._ og
_ search
_ thumbnail
( webpage
, default
= None )
186 media_streams
= re
. findall ( r
'''(?x)
187 mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
188 "([^"]+)"''' , webpage
)
191 QUALITIES
= qualities ([ 'lo' , 'hi' , 'hq' ])
193 for furl
in set ( media_streams
):
194 if furl
. endswith ( '.f4m' ):
197 fid_m
= re
. match ( r
'.*\.([^.]+)\.[^.]+$' , furl
)
198 fid
= fid_m
. group ( 1 ) if fid_m
else None
200 'quality' : QUALITIES ( fid
),
204 self
._ sort
_ formats
( formats
)
208 else : # request JSON file
209 info
= self
._ extract
_ media
_ info
(
210 'http://www.ardmediathek.de/play/media/ %s ' % video_id
, webpage
, video_id
)
215 'description' : description
,
216 'thumbnail' : thumbnail
,
222 class ARDIE ( InfoExtractor
):
223 _VALID_URL
= '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
225 'url' : 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html' ,
226 'md5' : 'd216c3a86493f9322545e045ddc3eb35' ,
228 'display_id' : 'die-story-im-ersten-mission-unter-falscher-flagge' ,
232 'title' : 'Die Story im Ersten: Mission unter falscher Flagge' ,
233 'upload_date' : '20140804' ,
234 'thumbnail' : 're:^https?://.*\.jpg$' ,
238 def _real_extract ( self
, url
):
239 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
240 display_id
= mobj
. group ( 'display_id' )
242 player_url
= mobj
. group ( 'mainurl' ) + '~playerXml.xml'
243 doc
= self
._ download
_ xml
( player_url
, display_id
)
244 video_node
= doc
. find ( './video' )
245 upload_date
= unified_strdate ( xpath_text (
246 video_node
, './broadcastDate' ))
247 thumbnail
= xpath_text ( video_node
, './/teaserImage//variant/url' )
250 for a
in video_node
. findall ( './/asset' ):
252 'format_id' : a
. attrib
[ 'type' ],
253 'width' : int_or_none ( a
. find ( './frameWidth' ). text
),
254 'height' : int_or_none ( a
. find ( './frameHeight' ). text
),
255 'vbr' : int_or_none ( a
. find ( './bitrateVideo' ). text
),
256 'abr' : int_or_none ( a
. find ( './bitrateAudio' ). text
),
257 'vcodec' : a
. find ( './codecVideo' ). text
,
258 'tbr' : int_or_none ( a
. find ( './totalBitrate' ). text
),
260 if a
. find ( './serverPrefix' ). text
:
261 f
[ 'url' ] = a
. find ( './serverPrefix' ). text
262 f
[ 'playpath' ] = a
. find ( './fileName' ). text
264 f
[ 'url' ] = a
. find ( './fileName' ). text
266 self
._ sort
_ formats
( formats
)
269 'id' : mobj
. group ( 'id' ),
271 'display_id' : display_id
,
272 'title' : video_node
. find ( './title' ). text
,
273 'duration' : parse_duration ( video_node
. find ( './duration' ). text
),
274 'upload_date' : upload_date
,
275 'thumbnail' : thumbnail
,
279 class SportschauIE ( ARDMediathekIE
):
280 IE_NAME
= 'Sportschau'
281 _VALID_URL
= r
'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
283 'url' : 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html' ,
285 'id' : 'seppeltkokainhatnichtsmitklassischemdopingzutun100' ,
287 'title' : 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"' ,
288 'thumbnail' : 're:^https?://.*\.jpg$' ,
289 'description' : 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.' ,
293 'skip_download' : True ,
297 def _real_extract ( self
, url
):
298 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
299 video_id
= mobj
. group ( 'id' )
300 base_url
= mobj
. group ( 'baseurl' )
302 webpage
= self
._ download
_ webpage
( url
, video_id
)
303 title
= get_element_by_attribute ( 'class' , 'headline' , webpage
)
304 description
= self
._ html
_ search
_ meta
( 'description' , webpage
, 'description' )
306 info
= self
._ extract
_ media
_ info
(
307 base_url
+ '-mc_defaultQuality-h.json' , webpage
, video_id
)
311 'description' : description
,