]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/bbccouk.py
1 from __future__
import unicode_literals
3 import xml
. etree
. ElementTree
5 from . common
import InfoExtractor
10 from .. compat
import compat_HTTPError
13 class BBCCoUkIE ( InfoExtractor
):
15 IE_DESC
= 'BBC iPlayer'
16 _VALID_URL
= r
'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z] {8} )'
20 'url' : 'http://www.bbc.co.uk/programmes/b039g8p7' ,
24 'title' : 'Kaleidoscope, Leonard Cohen' ,
25 'description' : 'The Canadian poet and songwriter reflects on his musical career.' ,
30 'skip_download' : True ,
34 'url' : 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/' ,
38 'title' : 'The Man in Black: Series 3: The Printed Name' ,
39 'description' : "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey." ,
44 'skip_download' : True ,
46 'skip' : 'Episode is no longer available on BBC iPlayer Radio' ,
49 'url' : 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/' ,
53 'title' : 'The Voice UK: Series 3: Blind Auditions 5' ,
54 'description' : "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone." ,
59 'skip_download' : True ,
61 'skip' : 'Currently BBC iPlayer TV programmes are available to play in the UK only' ,
64 'url' : 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion' ,
68 'title' : "Tomorrow's Worlds: The Unearthly History of Science Fiction" ,
69 'description' : '2. Invasion' ,
74 'skip_download' : True ,
76 'skip' : 'Currently BBC iPlayer TV programmes are available to play in the UK only' ,
78 'url' : 'http://www.bbc.co.uk/programmes/b04v20dw' ,
82 'title' : 'Pete Tong, The Essential New Tune Special' ,
83 'description' : "Pete has a very special mix - all of 2014's Essential New Tunes!" ,
88 'skip_download' : True ,
91 'url' : 'http://www.bbc.co.uk/music/clips/p02frcc3' ,
96 'title' : 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix' ,
97 'description' : 'French house superstar Madeon takes us out of the club and onto the after party.' ,
102 'skip_download' : True ,
105 'url' : 'http://www.bbc.co.uk/music/clips/p025c0zz' ,
110 'title' : 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)' ,
111 'description' : 'Rae Morris performs Closer for BBC Three at Reading 2014' ,
116 'skip_download' : True ,
119 'url' : 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls' ,
123 'title' : 'Natural World, 2015-2016: 2. Super Powered Owls' ,
124 'description' : 'md5:e4db5c937d0e95a7c6b5e654d429183d' ,
129 'skip_download' : True ,
131 'skip' : 'geolocation' ,
133 'url' : 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4' ,
134 'only_matching' : True ,
136 'url' : 'http://www.bbc.co.uk/music/clips#p02frcc3' ,
137 'only_matching' : True ,
139 'url' : 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo' ,
140 'only_matching' : True ,
144 def _extract_asx_playlist ( self
, connection
, programme_id
):
145 asx
= self
._ download
_ xml
( connection
. get ( 'href' ), programme_id
, 'Downloading ASX playlist' )
146 return [ ref
. get ( 'href' ) for ref
in asx
. findall ( './Entry/ref' )]
148 def _extract_connection ( self
, connection
, programme_id
):
150 protocol
= connection
. get ( 'protocol' )
151 supplier
= connection
. get ( 'supplier' )
152 if protocol
== 'http' :
153 href
= connection
. get ( 'href' )
155 if supplier
== 'asx' :
156 for i
, ref
in enumerate ( self
._ extract
_ asx
_ playlist
( connection
, programme_id
)):
159 'format_id' : 'ref %s _ %s ' % ( i
, supplier
),
165 'format_id' : supplier
,
167 elif protocol
== 'rtmp' :
168 application
= connection
. get ( 'application' , 'ondemand' )
169 auth_string
= connection
. get ( 'authString' )
170 identifier
= connection
. get ( 'identifier' )
171 server
= connection
. get ( 'server' )
173 'url' : ' %s :// %s / %s ? %s ' % ( protocol
, server
, application
, auth_string
),
174 'play_path' : identifier
,
175 'app' : ' %s ? %s ' % ( application
, auth_string
),
176 'page_url' : 'http://www.bbc.co.uk' ,
177 'player_url' : 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf' ,
180 'format_id' : supplier
,
184 def _extract_items ( self
, playlist
):
185 return playlist
. findall ( './{http://bbc.co.uk/2008/emp/playlist}item' )
187 def _extract_medias ( self
, media_selection
):
188 error
= media_selection
. find ( './{http://bbc.co.uk/2008/mp/mediaselection}error' )
189 if error
is not None :
190 raise ExtractorError (
191 ' %s returned error: %s ' % ( self
. IE_NAME
, error
. get ( 'id' )), expected
= True )
192 return media_selection
. findall ( './{http://bbc.co.uk/2008/mp/mediaselection}media' )
194 def _extract_connections ( self
, media
):
195 return media
. findall ( './{http://bbc.co.uk/2008/mp/mediaselection}connection' )
197 def _extract_video ( self
, media
, programme_id
):
199 vbr
= int ( media
. get ( 'bitrate' ))
200 vcodec
= media
. get ( 'encoding' )
201 service
= media
. get ( 'service' )
202 width
= int ( media
. get ( 'width' ))
203 height
= int ( media
. get ( 'height' ))
204 file_size
= int ( media
. get ( 'media_file_size' ))
205 for connection
in self
._ extract
_ connections
( media
):
206 conn_formats
= self
._ extract
_ connection
( connection
, programme_id
)
207 for format
in conn_formats
:
209 'format_id' : ' %s _ %s ' % ( service
, format
[ 'format_id' ]),
214 'filesize' : file_size
,
216 formats
. extend ( conn_formats
)
219 def _extract_audio ( self
, media
, programme_id
):
221 abr
= int ( media
. get ( 'bitrate' ))
222 acodec
= media
. get ( 'encoding' )
223 service
= media
. get ( 'service' )
224 for connection
in self
._ extract
_ connections
( media
):
225 conn_formats
= self
._ extract
_ connection
( connection
, programme_id
)
226 for format
in conn_formats
:
228 'format_id' : ' %s _ %s ' % ( service
, format
[ 'format_id' ]),
232 formats
. extend ( conn_formats
)
235 def _get_subtitles ( self
, media
, programme_id
):
237 for connection
in self
._ extract
_ connections
( media
):
238 captions
= self
._ download
_ xml
( connection
. get ( 'href' ), programme_id
, 'Downloading captions' )
239 lang
= captions
. get ( '{http://www.w3.org/XML/1998/namespace}lang' , 'en' )
240 ps
= captions
. findall ( './ {0} body/ {0} div/ {0} p' . format ( '{http://www.w3.org/2006/10/ttaf1}' ))
243 def _extract_text ( p
):
244 if p
. text
is not None :
245 stripped_text
= p
. text
. strip ()
248 return ' ' . join ( span
. text
. strip () for span
in p
. findall ( '{http://www.w3.org/2006/10/ttaf1}span' ))
249 for pos
, p
in enumerate ( ps
):
250 srt
+= ' %s \r\n %s --> %s \r\n %s \r\n\r\n ' % ( str ( pos
), p
. get ( 'begin' ), p
. get ( 'end' ), _extract_text ( p
))
253 'url' : connection
. get ( 'href' ),
263 def _download_media_selector ( self
, programme_id
):
265 media_selection
= self
._ download
_ xml
(
266 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ %s ' % programme_id
,
267 programme_id
, 'Downloading media selection XML' )
268 except ExtractorError
as ee
:
269 if isinstance ( ee
. cause
, compat_HTTPError
) and ee
. cause
. code
== 403 :
270 media_selection
= xml
. etree
. ElementTree
. fromstring ( ee
. cause
. read (). encode ( 'utf-8' ))
277 for media
in self
._ extract
_ medias
( media_selection
):
278 kind
= media
. get ( 'kind' )
280 formats
. extend ( self
._ extract
_ audio
( media
, programme_id
))
281 elif kind
== 'video' :
282 formats
. extend ( self
._ extract
_ video
( media
, programme_id
))
283 elif kind
== 'captions' :
284 subtitles
= self
. extract_subtitles ( media
, programme_id
)
286 return formats
, subtitles
288 def _download_playlist ( self
, playlist_id
):
290 playlist
= self
._ download
_ json
(
291 'http://www.bbc.co.uk/programmes/ %s /playlist.json' % playlist_id
,
292 playlist_id
, 'Downloading playlist JSON' )
294 version
= playlist
. get ( 'defaultAvailableVersion' )
296 smp_config
= version
[ 'smpConfig' ]
297 title
= smp_config
[ 'title' ]
298 description
= smp_config
[ 'summary' ]
299 for item
in smp_config
[ 'items' ]:
301 if kind
!= 'programme' and kind
!= 'radioProgramme' :
303 programme_id
= item
. get ( 'vpid' )
304 duration
= int ( item
. get ( 'duration' ))
305 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
306 return programme_id
, title
, description
, duration
, formats
, subtitles
307 except ExtractorError
as ee
:
308 if not ( isinstance ( ee
. cause
, compat_HTTPError
) and ee
. cause
. code
== 404 ):
311 # fallback to legacy playlist
312 playlist
= self
._ download
_ xml
(
313 'http://www.bbc.co.uk/iplayer/playlist/ %s ' % playlist_id
,
314 playlist_id
, 'Downloading legacy playlist XML' )
316 no_items
= playlist
. find ( './{http://bbc.co.uk/2008/emp/playlist}noItems' )
317 if no_items
is not None :
318 reason
= no_items
. get ( 'reason' )
319 if reason
== 'preAvailability' :
320 msg
= 'Episode %s is not yet available' % playlist_id
321 elif reason
== 'postAvailability' :
322 msg
= 'Episode %s is no longer available' % playlist_id
323 elif reason
== 'noMedia' :
324 msg
= 'Episode %s is not currently available' % playlist_id
326 msg
= 'Episode %s is not available: %s ' % ( playlist_id
, reason
)
327 raise ExtractorError ( msg
, expected
= True )
329 for item
in self
._ extract
_ items
( playlist
):
330 kind
= item
. get ( 'kind' )
331 if kind
!= 'programme' and kind
!= 'radioProgramme' :
333 title
= playlist
. find ( './{http://bbc.co.uk/2008/emp/playlist}title' ). text
334 description
= playlist
. find ( './{http://bbc.co.uk/2008/emp/playlist}summary' ). text
335 programme_id
= item
. get ( 'identifier' )
336 duration
= int ( item
. get ( 'duration' ))
337 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
339 return programme_id
, title
, description
, duration
, formats
, subtitles
341 def _real_extract ( self
, url
):
342 group_id
= self
._ match
_ id
( url
)
344 webpage
= self
._ download
_ webpage
( url
, group_id
, 'Downloading video page' )
348 tviplayer
= self
._ search
_ regex
(
349 r
'mediator\.bind\(({.+?})\s*,\s*document\.getElementById' ,
350 webpage
, 'player' , default
= None )
353 player
= self
._ parse
_ json
( tviplayer
, group_id
). get ( 'player' , {})
354 duration
= int_or_none ( player
. get ( 'duration' ))
355 programme_id
= player
. get ( 'vpid' )
358 programme_id
= self
._ search
_ regex
(
359 r
'"vpid"\s*:\s*"([\da-z] {8} )"' , webpage
, 'vpid' , fatal
= False , default
= None )
362 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
363 title
= self
._ og
_ search
_ title
( webpage
)
364 description
= self
._ search
_ regex
(
365 r
'<p class="medium-description">([^<]+)</p>' ,
366 webpage
, 'description' , fatal
= False )
368 programme_id
, title
, description
, duration
, formats
, subtitles
= self
._ download
_ playlist
( group_id
)
370 self
._ sort
_ formats
( formats
)
375 'description' : description
,
376 'thumbnail' : self
._ og
_ search
_ thumbnail
( webpage
, default
= None ),
377 'duration' : duration
,
379 'subtitles' : subtitles
,