]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ard.py
fd45b3e42b374ec6a7077454d238932c66d16d46
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
7 from . generic
import GenericIE
17 from .. compat
import compat_etree_fromstring
20 class ARDMediathekIE ( InfoExtractor
):
21 IE_NAME
= 'ARD:mediathek'
22 _VALID_URL
= r
'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
25 'url' : 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114' ,
29 'title' : 'Ich liebe das Leben trotzdem' ,
30 'description' : 'md5:45e4c225c72b27993314b31a84a5261c' ,
35 'skip_download' : True ,
38 'url' : 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916' ,
39 'md5' : 'f4d98b10759ac06c0072bbcd1f0b9e3e' ,
43 'title' : 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)' ,
44 'description' : 'md5:196392e79876d0ac94c94e8cdb2875f1' ,
49 'url' : 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086' ,
50 'md5' : '219d94d8980b4f538c7fcb0865eb7f2c' ,
54 'title' : 'Tod eines Fußballers' ,
55 'description' : 'md5:f6e39f3461f0e1f54bfa48c8875c86ef' ,
59 'url' : 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht' ,
60 'only_matching' : True ,
63 def _extract_media_info ( self
, media_info_url
, webpage
, video_id
):
64 media_info
= self
._ download
_ json
(
65 media_info_url
, video_id
, 'Downloading media JSON' )
67 formats
= self
._ extract
_ formats
( media_info
, video_id
)
70 if '"fsk"' in webpage
:
72 'This video is only available after 20:00' , expected
= True )
73 elif media_info
. get ( '_geoblocked' ):
74 raise ExtractorError ( 'This video is not available due to geo restriction' , expected
= True )
76 self
._ sort
_ formats
( formats
)
78 duration
= int_or_none ( media_info
. get ( '_duration' ))
79 thumbnail
= media_info
. get ( '_previewImage' )
82 subtitle_url
= media_info
. get ( '_subtitleUrl' )
92 'thumbnail' : thumbnail
,
94 'subtitles' : subtitles
,
97 def _extract_formats ( self
, media_info
, video_id
):
98 type_
= media_info
. get ( '_type' )
99 media_array
= media_info
. get ( '_mediaArray' , [])
101 for num
, media
in enumerate ( media_array
):
102 for stream
in media
. get ( '_mediaStreamArray' , []):
103 stream_urls
= stream
. get ( '_stream' )
106 if not isinstance ( stream_urls
, list ):
107 stream_urls
= [ stream_urls
]
108 quality
= stream
. get ( '_quality' )
109 server
= stream
. get ( '_server' )
110 for stream_url
in stream_urls
:
111 ext
= determine_ext ( stream_url
)
112 if quality
!= 'auto' and ext
in ( 'f4m' , 'm3u8' ):
115 formats
. extend ( self
._ extract
_ f
4 m
_ formats
(
116 stream_url
+ '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124' ,
117 video_id
, preference
=- 1 , f4m_id
= 'hds' , fatal
= False ))
119 formats
. extend ( self
._ extract
_ m
3u8_ formats
(
120 stream_url
, video_id
, 'mp4' , preference
= 1 , m3u8_id
= 'hls' , fatal
= False ))
122 if server
and server
. startswith ( 'rtmp' ):
125 'play_path' : stream_url
,
126 'format_id' : 'a %s-r tmp- %s ' % ( num
, quality
),
128 elif stream_url
. startswith ( 'http' ):
131 'format_id' : 'a %s-%s-%s ' % ( num
, ext
, quality
)
135 m
= re
. search ( r
'_(?P<width>\d+)x(?P<height>\d+)\.mp4$' , stream_url
)
138 'width' : int ( m
. group ( 'width' )),
139 'height' : int ( m
. group ( 'height' )),
146 def _real_extract ( self
, url
):
147 # determine video id from url
148 m
= re
. match ( self
._ VALID
_U RL
, url
)
150 numid
= re
. search ( r
'documentId=([0-9]+)' , url
)
152 video_id
= numid
. group ( 1 )
154 video_id
= m
. group ( 'video_id' )
156 webpage
= self
._ download
_ webpage
( url
, video_id
)
158 if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage
:
159 raise ExtractorError ( 'Video %s is no longer available' % video_id
, expected
= True )
161 if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage
:
162 raise ExtractorError ( 'This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id
, expected
= True )
164 if re
. search ( r
'[\?&]rss($|[=&])' , url
):
165 doc
= compat_etree_fromstring ( webpage
. encode ( 'utf-8' ))
167 return GenericIE () ._ extract
_ rss
( url
, video_id
, doc
)
169 title
= self
._ html
_ search
_ regex
(
170 [ r
'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>' ,
171 r
'<meta name="dcterms.title" content="(.*?)"/>' ,
172 r
'<h4 class="headline">(.*?)</h4>' ],
174 description
= self
._ html
_ search
_ meta
(
175 'dcterms.abstract' , webpage
, 'description' , default
= None )
176 if description
is None :
177 description
= self
._ html
_ search
_ meta
(
178 'description' , webpage
, 'meta description' )
180 # Thumbnail is sometimes not present.
181 # It is in the mobile version, but that seems to use a different URL
182 # structure altogether.
183 thumbnail
= self
._ og
_ search
_ thumbnail
( webpage
, default
= None )
185 media_streams
= re
. findall ( r
'''(?x)
186 mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
187 "([^"]+)"''' , webpage
)
190 QUALITIES
= qualities ([ 'lo' , 'hi' , 'hq' ])
192 for furl
in set ( media_streams
):
193 if furl
. endswith ( '.f4m' ):
196 fid_m
= re
. match ( r
'.*\.([^.]+)\.[^.]+$' , furl
)
197 fid
= fid_m
. group ( 1 ) if fid_m
else None
199 'quality' : QUALITIES ( fid
),
203 self
._ sort
_ formats
( formats
)
207 else : # request JSON file
208 info
= self
._ extract
_ media
_ info
(
209 'http://www.ardmediathek.de/play/media/ %s ' % video_id
, webpage
, video_id
)
214 'description' : description
,
215 'thumbnail' : thumbnail
,
221 class ARDIE ( InfoExtractor
):
222 _VALID_URL
= '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
224 'url' : 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html' ,
225 'md5' : 'd216c3a86493f9322545e045ddc3eb35' ,
227 'display_id' : 'die-story-im-ersten-mission-unter-falscher-flagge' ,
231 'title' : 'Die Story im Ersten: Mission unter falscher Flagge' ,
232 'upload_date' : '20140804' ,
233 'thumbnail' : 're:^https?://.*\.jpg$' ,
237 def _real_extract ( self
, url
):
238 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
239 display_id
= mobj
. group ( 'display_id' )
241 player_url
= mobj
. group ( 'mainurl' ) + '~playerXml.xml'
242 doc
= self
._ download
_ xml
( player_url
, display_id
)
243 video_node
= doc
. find ( './video' )
244 upload_date
= unified_strdate ( xpath_text (
245 video_node
, './broadcastDate' ))
246 thumbnail
= xpath_text ( video_node
, './/teaserImage//variant/url' )
249 for a
in video_node
. findall ( './/asset' ):
251 'format_id' : a
. attrib
[ 'type' ],
252 'width' : int_or_none ( a
. find ( './frameWidth' ). text
),
253 'height' : int_or_none ( a
. find ( './frameHeight' ). text
),
254 'vbr' : int_or_none ( a
. find ( './bitrateVideo' ). text
),
255 'abr' : int_or_none ( a
. find ( './bitrateAudio' ). text
),
256 'vcodec' : a
. find ( './codecVideo' ). text
,
257 'tbr' : int_or_none ( a
. find ( './totalBitrate' ). text
),
259 if a
. find ( './serverPrefix' ). text
:
260 f
[ 'url' ] = a
. find ( './serverPrefix' ). text
261 f
[ 'playpath' ] = a
. find ( './fileName' ). text
263 f
[ 'url' ] = a
. find ( './fileName' ). text
265 self
._ sort
_ formats
( formats
)
268 'id' : mobj
. group ( 'id' ),
270 'display_id' : display_id
,
271 'title' : video_node
. find ( './title' ). text
,
272 'duration' : parse_duration ( video_node
. find ( './duration' ). text
),
273 'upload_date' : upload_date
,
274 'thumbnail' : thumbnail
,