]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ard.py
6f465789b497a6625776c383ff699a64b0b5c346
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
7 from . generic
import GenericIE
11 get_element_by_attribute
,
21 class ARDMediathekIE ( InfoExtractor
):
22 IE_NAME
= 'ARD:mediathek'
23 _VALID_URL
= r
'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
26 'url' : 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114' ,
30 'title' : 'Ich liebe das Leben trotzdem' ,
31 'description' : 'md5:45e4c225c72b27993314b31a84a5261c' ,
36 'skip_download' : True ,
39 'url' : 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916' ,
40 'md5' : 'f4d98b10759ac06c0072bbcd1f0b9e3e' ,
44 'title' : 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)' ,
45 'description' : 'md5:196392e79876d0ac94c94e8cdb2875f1' ,
50 'url' : 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086' ,
51 'md5' : '219d94d8980b4f538c7fcb0865eb7f2c' ,
55 'title' : 'Tod eines Fußballers' ,
56 'description' : 'md5:f6e39f3461f0e1f54bfa48c8875c86ef' ,
60 'url' : 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht' ,
61 'only_matching' : True ,
64 def _extract_media_info ( self
, media_info_url
, webpage
, video_id
):
65 media_info
= self
._ download
_ json
(
66 media_info_url
, video_id
, 'Downloading media JSON' )
68 formats
= self
._ extract
_ formats
( media_info
, video_id
)
71 if '"fsk"' in webpage
:
73 'This video is only available after 20:00' , expected
= True )
74 elif media_info
. get ( '_geoblocked' ):
75 raise ExtractorError ( 'This video is not available due to geo restriction' , expected
= True )
77 self
._ sort
_ formats
( formats
)
79 duration
= int_or_none ( media_info
. get ( '_duration' ))
80 thumbnail
= media_info
. get ( '_previewImage' )
83 subtitle_url
= media_info
. get ( '_subtitleUrl' )
93 'thumbnail' : thumbnail
,
95 'subtitles' : subtitles
,
98 def _extract_formats ( self
, media_info
, video_id
):
99 type_
= media_info
. get ( '_type' )
100 media_array
= media_info
. get ( '_mediaArray' , [])
102 for num
, media
in enumerate ( media_array
):
103 for stream
in media
. get ( '_mediaStreamArray' , []):
104 stream_urls
= stream
. get ( '_stream' )
107 if not isinstance ( stream_urls
, list ):
108 stream_urls
= [ stream_urls
]
109 quality
= stream
. get ( '_quality' )
110 server
= stream
. get ( '_server' )
111 for stream_url
in stream_urls
:
112 ext
= determine_ext ( stream_url
)
114 formats
. extend ( self
._ extract
_ f
4 m
_ formats
(
115 stream_url
+ '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124' ,
116 video_id
, preference
=- 1 , f4m_id
= 'hds' ))
118 formats
. extend ( self
._ extract
_ m
3u8_ formats
(
119 stream_url
, video_id
, 'mp4' , preference
= 1 , m3u8_id
= 'hls' ))
121 if server
and server
. startswith ( 'rtmp' ):
124 'play_path' : stream_url
,
125 'format_id' : 'a %s-r tmp- %s ' % ( num
, quality
),
127 elif stream_url
. startswith ( 'http' ):
130 'format_id' : 'a %s-%s-%s ' % ( num
, ext
, quality
)
134 m
= re
. search ( r
'_(?P<width>\d+)x(?P<height>\d+)\.mp4$' , stream_url
)
137 'width' : int ( m
. group ( 'width' )),
138 'height' : int ( m
. group ( 'height' )),
145 def _real_extract ( self
, url
):
146 # determine video id from url
147 m
= re
. match ( self
._ VALID
_U RL
, url
)
149 numid
= re
. search ( r
'documentId=([0-9]+)' , url
)
151 video_id
= numid
. group ( 1 )
153 video_id
= m
. group ( 'video_id' )
155 webpage
= self
._ download
_ webpage
( url
, video_id
)
157 if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage
:
158 raise ExtractorError ( 'Video %s is no longer available' % video_id
, expected
= True )
160 if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage
:
161 raise ExtractorError ( 'This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id
, expected
= True )
163 if re
. search ( r
'[\?&]rss($|[=&])' , url
):
164 doc
= parse_xml ( webpage
)
166 return GenericIE () ._ extract
_ rss
( url
, video_id
, doc
)
168 title
= self
._ html
_ search
_ regex
(
169 [ r
'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>' ,
170 r
'<meta name="dcterms.title" content="(.*?)"/>' ,
171 r
'<h4 class="headline">(.*?)</h4>' ],
173 description
= self
._ html
_ search
_ meta
(
174 'dcterms.abstract' , webpage
, 'description' , default
= None )
175 if description
is None :
176 description
= self
._ html
_ search
_ meta
(
177 'description' , webpage
, 'meta description' )
179 # Thumbnail is sometimes not present.
180 # It is in the mobile version, but that seems to use a different URL
181 # structure altogether.
182 thumbnail
= self
._ og
_ search
_ thumbnail
( webpage
, default
= None )
184 media_streams
= re
. findall ( r
'''(?x)
185 mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
186 "([^"]+)"''' , webpage
)
189 QUALITIES
= qualities ([ 'lo' , 'hi' , 'hq' ])
191 for furl
in set ( media_streams
):
192 if furl
. endswith ( '.f4m' ):
195 fid_m
= re
. match ( r
'.*\.([^.]+)\.[^.]+$' , furl
)
196 fid
= fid_m
. group ( 1 ) if fid_m
else None
198 'quality' : QUALITIES ( fid
),
202 self
._ sort
_ formats
( formats
)
206 else : # request JSON file
207 info
= self
._ extract
_ media
_ info
(
208 'http://www.ardmediathek.de/play/media/ %s ' % video_id
, webpage
, video_id
)
213 'description' : description
,
214 'thumbnail' : thumbnail
,
220 class ARDIE ( InfoExtractor
):
221 _VALID_URL
= '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
223 'url' : 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html' ,
224 'md5' : 'd216c3a86493f9322545e045ddc3eb35' ,
226 'display_id' : 'die-story-im-ersten-mission-unter-falscher-flagge' ,
230 'title' : 'Die Story im Ersten: Mission unter falscher Flagge' ,
231 'upload_date' : '20140804' ,
232 'thumbnail' : 're:^https?://.*\.jpg$' ,
236 def _real_extract ( self
, url
):
237 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
238 display_id
= mobj
. group ( 'display_id' )
240 player_url
= mobj
. group ( 'mainurl' ) + '~playerXml.xml'
241 doc
= self
._ download
_ xml
( player_url
, display_id
)
242 video_node
= doc
. find ( './video' )
243 upload_date
= unified_strdate ( xpath_text (
244 video_node
, './broadcastDate' ))
245 thumbnail
= xpath_text ( video_node
, './/teaserImage//variant/url' )
248 for a
in video_node
. findall ( './/asset' ):
250 'format_id' : a
. attrib
[ 'type' ],
251 'width' : int_or_none ( a
. find ( './frameWidth' ). text
),
252 'height' : int_or_none ( a
. find ( './frameHeight' ). text
),
253 'vbr' : int_or_none ( a
. find ( './bitrateVideo' ). text
),
254 'abr' : int_or_none ( a
. find ( './bitrateAudio' ). text
),
255 'vcodec' : a
. find ( './codecVideo' ). text
,
256 'tbr' : int_or_none ( a
. find ( './totalBitrate' ). text
),
258 if a
. find ( './serverPrefix' ). text
:
259 f
[ 'url' ] = a
. find ( './serverPrefix' ). text
260 f
[ 'playpath' ] = a
. find ( './fileName' ). text
262 f
[ 'url' ] = a
. find ( './fileName' ). text
264 self
._ sort
_ formats
( formats
)
267 'id' : mobj
. group ( 'id' ),
269 'display_id' : display_id
,
270 'title' : video_node
. find ( './title' ). text
,
271 'duration' : parse_duration ( video_node
. find ( './duration' ). text
),
272 'upload_date' : upload_date
,
273 'thumbnail' : thumbnail
,
277 class SportschauIE ( ARDMediathekIE
):
278 IE_NAME
= 'Sportschau'
279 _VALID_URL
= r
'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
281 'url' : 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html' ,
283 'id' : 'seppeltkokainhatnichtsmitklassischemdopingzutun100' ,
285 'title' : 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"' ,
286 'thumbnail' : 're:^https?://.*\.jpg$' ,
287 'description' : 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.' ,
291 'skip_download' : True ,
295 def _real_extract ( self
, url
):
296 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
297 video_id
= mobj
. group ( 'id' )
298 base_url
= mobj
. group ( 'baseurl' )
300 webpage
= self
._ download
_ webpage
( url
, video_id
)
301 title
= get_element_by_attribute ( 'class' , 'headline' , webpage
)
302 description
= self
._ html
_ search
_ meta
( 'description' , webpage
, 'description' )
304 info
= self
._ extract
_ media
_ info
(
305 base_url
+ '-mc_defaultQuality-h.json' , webpage
, video_id
)
309 'description' : description
,