]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/tagesschau.py
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
15 class TagesschauPlayerIE ( InfoExtractor
):
16 IE_NAME
= 'tagesschau:player'
17 _VALID_URL
= r
'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html'
20 'url' : 'http://www.tagesschau.de/multimedia/video/video-179517~player.html' ,
21 'md5' : '8d09548d5c15debad38bee3a4d15ca21' ,
25 'title' : 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD' ,
26 'thumbnail' : 're:^https?:.*\.jpg$' ,
27 'formats' : 'mincount:6' ,
30 'url' : 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html' ,
31 'md5' : '76e6eec6ebd40740671cf0a2c88617e5' ,
35 'title' : 'Trabi - Bye, bye Rennpappe' ,
36 'thumbnail' : 're:^https?:.*\.jpg$' ,
37 'formats' : 'mincount:2' ,
40 'url' : 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html' ,
41 'only_matching' : True ,
46 's' : { 'width' : 320 , 'height' : 180 , 'quality' : 1 },
47 'm' : { 'width' : 512 , 'height' : 288 , 'quality' : 2 },
48 'l' : { 'width' : 960 , 'height' : 540 , 'quality' : 3 },
49 'xl' : { 'width' : 1280 , 'height' : 720 , 'quality' : 4 },
50 'xxl' : { 'quality' : 5 },
53 def _extract_via_api ( self
, kind
, video_id
):
54 info
= self
._ download
_ json
(
55 'https://www.tagesschau.de/api/multimedia/ {0} / {0} - {1} .json' . format ( kind
, video_id
),
57 title
= info
[ 'headline' ]
59 for media
in info
[ 'mediadata' ]:
60 for format_id
, format_url
in media
. items ():
61 if determine_ext ( format_url
) == 'm3u8' :
62 formats
. extend ( self
._ extract
_ m
3u8_ formats
(
63 format_url
, video_id
, 'mp4' ,
64 entry_protocol
= 'm3u8_native' , m3u8_id
= 'hls' ))
68 'format_id' : format_id
,
69 'vcodec' : 'none' if kind
== 'audio' else None ,
71 self
._ sort
_ formats
( formats
)
72 timestamp
= parse_iso8601 ( info
. get ( 'date' ))
76 'timestamp' : timestamp
,
80 def _real_extract ( self
, url
):
81 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
82 video_id
= mobj
. group ( 'id' )
84 # kind = mobj.group('kind').lower()
86 # return self._extract_via_api(kind, video_id)
88 # JSON API does not provide some audio formats (e.g. ogg), thus
89 # extracting audio via the webpage
91 webpage
= self
._ download
_ webpage
( url
, video_id
)
93 title
= self
._ og
_ search
_ title
( webpage
). strip ()
96 for media_json
in re
. findall ( r
'({src\s*:\s*["\' ] http
[ ^
}]+ type \s
*:[ ^
}]+}) ', webpage):
97 media = self._parse_json(js_to_json(media_json), video_id, fatal=False)
100 src = media.get(' src
')
103 quality = media.get(' quality
')
104 kind = media.get(' type ', ' ').split(' / ')[0]
105 ext = determine_ext(src)
108 ' format_id
': ' %s _ %s ' % (quality, ext) if quality else ext,
110 ' vcodec
': ' none
' if kind == ' audio
' else None,
112 f.update(self._FORMATS.get(quality, {}))
115 self._sort_formats(formats)
117 thumbnail = self._og_search_thumbnail(webpage)
122 ' thumbnail
': thumbnail,
127 class TagesschauIE(InfoExtractor):
128 _VALID_URL = r' https?
://( ?
: www\
.) ?tagesschau\
. de
/( ?P
< path
>[ ^
/]+/( ?
:[ ^
/]+/)* ?
( ?P
< id >[ ^
/ #?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
131 'url' : 'http://www.tagesschau.de/multimedia/video/video-102143.html' ,
132 'md5' : 'f7c27a0eff3bfe8c7727e65f8fe1b1e6' ,
134 'id' : 'video-102143' ,
136 'title' : 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt' ,
137 'description' : '18.07.2015 20:10 Uhr' ,
138 'thumbnail' : 're:^https?:.*\.jpg$' ,
141 'url' : 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html' ,
142 'md5' : '3c54c1f6243d279b706bde660ceec633' ,
146 'title' : 'Sendung: tagesschau \t 04.12.2014 20:00 Uhr' ,
147 'description' : 'md5:695c01bfd98b7e313c501386327aea59' ,
148 'thumbnail' : 're:^https?:.*\.jpg$' ,
152 'url' : 'http://www.tagesschau.de/multimedia/audio/audio-29417.html' ,
153 'md5' : '76e6eec6ebd40740671cf0a2c88617e5' ,
157 'title' : 'Trabi - Bye, bye Rennpappe' ,
158 'description' : 'md5:8687dda862cbbe2cfb2df09b56341317' ,
159 'thumbnail' : 're:^https?:.*\.jpg$' ,
163 'url' : 'http://www.tagesschau.de/inland/bnd-303.html' ,
164 'md5' : 'e0916c623e85fc1d2b26b78f299d3958' ,
168 'title' : 'Viele Baustellen für neuen BND-Chef' ,
169 'description' : 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4' ,
170 'thumbnail' : 're:^https?:.*\.jpg$' ,
173 'url' : 'http://www.tagesschau.de/inland/afd-parteitag-135.html' ,
175 'id' : 'afd-parteitag-135' ,
176 'title' : 'Möchtegern-Underdog mit Machtanspruch' ,
180 'url' : 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html' ,
181 'only_matching' : True ,
183 'url' : 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html' ,
184 'only_matching' : True ,
186 'url' : 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html' ,
187 'only_matching' : True ,
189 'url' : 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html' ,
190 'only_matching' : True ,
192 'url' : 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html' ,
193 'only_matching' : True ,
195 'url' : 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html' ,
196 'only_matching' : True ,
198 'url' : 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html' ,
199 'only_matching' : True ,
201 'url' : 'http://www.tagesschau.de/100sekunden/index.html' ,
202 'only_matching' : True ,
204 # playlist article with collapsing sections
205 'url' : 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html' ,
206 'only_matching' : True ,
def suitable(cls, url):
    """Report whether this extractor should handle ``url``.

    URLs accepted by ``TagesschauPlayerIE.suitable`` are always rejected
    here; every other URL is passed on to the parent implementation of
    ``suitable`` and its result is returned unchanged.

    NOTE(review): the ``cls`` parameter and ``super(TagesschauIE, cls)``
    suggest this is meant to be bound as a classmethod; no decorator line
    is visible in this view -- confirm it is present in the real file.
    """
    # Player URLs are tested first so they never reach the parent check.
    if TagesschauPlayerIE.suitable(url):
        return False
    return super(TagesschauIE, cls).suitable(url)
213 def _extract_formats ( self
, download_text
, media_kind
):
215 r
'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>' ,
219 link_url
= l
. group ( 'url' )
222 format_id
= self
._ search
_ regex
(
223 r
'.*/[^/.]+\.([^/]+)\.[^/.]+$' , link_url
, 'format ID' ,
224 default
= determine_ext ( link_url
))
226 'format_id' : format_id
,
227 'url' : l
. group ( 'url' ),
228 'format_name' : l
. group ( 'name' ),
230 title
= l
. group ( 'title' )
232 if media_kind
. lower () == 'video' :
235 Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
236 (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
237 (?P<vbr>[0-9]+)kbps&\#10;
238 Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
239 Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''' ,
243 'format_note' : m
. group ( 'audio_desc' ),
244 'vcodec' : m
. group ( 'vcodec' ),
245 'width' : int ( m
. group ( 'width' )),
246 'height' : int ( m
. group ( 'height' )),
247 'abr' : int ( m
. group ( 'abr' )),
248 'vbr' : int ( m
. group ( 'vbr' )),
249 'filesize_approx' : parse_filesize ( m
. group ( 'filesize_approx' )),
253 r
'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)' ,
257 'format_note' : ' %s , %s ' % ( m
. group ( 'format' ), m
. group ( 'note' )),
259 'abr' : int ( m
. group ( 'abr' )),
261 formats
. append ( format
)
262 self
._ sort
_ formats
( formats
)
265 def _real_extract ( self
, url
):
266 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
267 video_id
= mobj
. group ( 'id' ) or mobj
. group ( 'path' )
268 display_id
= video_id
. lstrip ( '-' )
270 webpage
= self
._ download
_ webpage
( url
, display_id
)
272 title
= self
._ html
_ search
_ regex
(
273 r
'<span[^>]*class="headline"[^>]*>(.+?)</span>' ,
274 webpage
, 'title' , default
= None ) or self
._ og
_ search
_ title
( webpage
)
276 DOWNLOAD_REGEX
= r
'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'
278 webpage_type
= self
._ og
_ search
_ property
( 'type' , webpage
, default
= None )
279 if webpage_type
== 'website' : # Article
281 for num
, ( entry_title
, media_kind
, download_text
) in enumerate ( re
. findall (
282 r
'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*? %s ' % DOWNLOAD_REGEX
,
285 'id' : ' %s-%d ' % ( display_id
, num
),
286 'title' : ' %s ' % entry_title
,
287 'formats' : self
._ extract
_ formats
( download_text
, media_kind
),
290 return self
. playlist_result ( entries
, display_id
, title
)
291 formats
= entries
[ 0 ][ 'formats' ]
292 else : # Assume single video
293 download_text
= self
._ search
_ regex
(
294 DOWNLOAD_REGEX
, webpage
, 'download links' , group
= 'links' )
295 media_kind
= self
._ search
_ regex
(
296 DOWNLOAD_REGEX
, webpage
, 'media kind' , default
= 'Video' , group
= 'kind' )
297 formats
= self
._ extract
_ formats
( download_text
, media_kind
)
298 thumbnail
= self
._ og
_ search
_ thumbnail
( webpage
)
299 description
= self
._ html
_ search
_ regex
(
300 r
'(?s)<p class="teasertext">(.*?)</p>' ,
301 webpage
, 'description' , default
= None )
303 self
._ sort
_ formats
( formats
)
308 'thumbnail' : thumbnail
,
310 'description' : description
,