]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/bbc.py
9d0dfb9611687b15075d0e6fc7d57dfa0244c60a
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
16 from .. compat
import (
17 compat_etree_fromstring
,
22 class BBCCoUkIE ( InfoExtractor
):
24 IE_DESC
= 'BBC iPlayer'
25 _ID_REGEX
= r
'[pb][\da-z] {7} '
28 (?:www\.)?bbc\.co\.uk/
30 programmes/(?!articles/)|
31 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
38 _MEDIASELECTOR_URLS
= [
39 # Provides HQ HLS streams with even better quality than pc mediaset but fails
40 # with geolocation in some cases when it's even not geo restricted at all (e.g.
41 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
42 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/ %s ' ,
43 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ %s ' ,
46 _MEDIASELECTION_NS
= 'http://bbc.co.uk/2008/mp/mediaselection'
47 _EMP_PLAYLIST_NS
= 'http://bbc.co.uk/2008/emp/playlist'
56 'url' : 'http://www.bbc.co.uk/programmes/b039g8p7' ,
60 'title' : 'Leonard Cohen, Kaleidoscope - BBC Radio 4' ,
61 'description' : 'The Canadian poet and songwriter reflects on his musical career.' ,
65 'skip_download' : True ,
69 'url' : 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/' ,
73 'title' : 'The Man in Black: Series 3: The Printed Name' ,
74 'description' : "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey." ,
79 'skip_download' : True ,
81 'skip' : 'Episode is no longer available on BBC iPlayer Radio' ,
84 'url' : 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/' ,
88 'title' : 'The Voice UK: Series 3: Blind Auditions 5' ,
89 'description' : 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.' ,
94 'skip_download' : True ,
96 'skip' : 'Currently BBC iPlayer TV programmes are available to play in the UK only' ,
99 'url' : 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion' ,
103 'title' : "Tomorrow's Worlds: The Unearthly History of Science Fiction" ,
104 'description' : '2. Invasion' ,
109 'skip_download' : True ,
111 'skip' : 'Currently BBC iPlayer TV programmes are available to play in the UK only' ,
113 'url' : 'http://www.bbc.co.uk/programmes/b04v20dw' ,
117 'title' : 'Pete Tong, The Essential New Tune Special' ,
118 'description' : "Pete has a very special mix - all of 2014's Essential New Tunes!" ,
123 'skip_download' : True ,
125 'skip' : 'Episode is no longer available on BBC iPlayer Radio' ,
127 'url' : 'http://www.bbc.co.uk/music/clips/p022h44b' ,
132 'title' : 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances' ,
133 'description' : "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances." ,
138 'skip_download' : True ,
141 'url' : 'http://www.bbc.co.uk/music/clips/p025c0zz' ,
146 'title' : 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)' ,
147 'description' : 'Rae Morris performs Closer for BBC Three at Reading 2014' ,
152 'skip_download' : True ,
155 'url' : 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls' ,
159 'title' : 'Natural World, 2015-2016: 2. Super Powered Owls' ,
160 'description' : 'md5:e4db5c937d0e95a7c6b5e654d429183d' ,
165 'skip_download' : True ,
167 'skip' : 'geolocation' ,
169 'url' : 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition' ,
173 'description' : 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.' ,
174 'title' : 'Royal Academy Summer Exhibition' ,
179 'skip_download' : True ,
181 'skip' : 'geolocation' ,
183 # iptv-all mediaset fails with geolocation however there is no geo restriction
184 # for this programme at all
185 'url' : 'http://www.bbc.co.uk/programmes/b06rkn85' ,
189 'title' : "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1" ,
190 'description' : "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!" ,
194 'skip_download' : True ,
197 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
198 'url' : 'http://www.bbc.co.uk/programmes/p028bfkf/player' ,
202 'title' : 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews' ,
203 'description' : 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews' ,
207 'skip_download' : True ,
210 'url' : 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4' ,
211 'only_matching' : True ,
213 'url' : 'http://www.bbc.co.uk/music/clips#p02frcc3' ,
214 'only_matching' : True ,
216 'url' : 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo' ,
217 'only_matching' : True ,
219 'url' : 'http://www.bbc.co.uk/radio/player/p03cchwf' ,
220 'only_matching' : True ,
224 class MediaSelectionError ( Exception ):
225 def __init__ ( self
, id ):
228 def _extract_asx_playlist ( self
, connection
, programme_id
):
229 asx
= self
._ download
_ xml
( connection
. get ( 'href' ), programme_id
, 'Downloading ASX playlist' )
230 return [ ref
. get ( 'href' ) for ref
in asx
. findall ( './Entry/ref' )]
232 def _extract_connection ( self
, connection
, programme_id
):
234 kind
= connection
. get ( 'kind' )
235 protocol
= connection
. get ( 'protocol' )
236 supplier
= connection
. get ( 'supplier' )
237 if protocol
== 'http' :
238 href
= connection
. get ( 'href' )
239 transfer_format
= connection
. get ( 'transferFormat' )
241 if supplier
== 'asx' :
242 for i
, ref
in enumerate ( self
._ extract
_ asx
_ playlist
( connection
, programme_id
)):
245 'format_id' : 'ref %s _ %s ' % ( i
, supplier
),
247 # Skip DASH until supported
248 elif transfer_format
== 'dash' :
250 elif transfer_format
== 'hls' :
251 formats
. extend ( self
._ extract
_ m
3u8_ formats
(
252 href
, programme_id
, ext
= 'mp4' , entry_protocol
= 'm3u8_native' ,
253 m3u8_id
= supplier
, fatal
= False ))
258 'format_id' : supplier
or kind
or protocol
,
260 elif protocol
== 'rtmp' :
261 application
= connection
. get ( 'application' , 'ondemand' )
262 auth_string
= connection
. get ( 'authString' )
263 identifier
= connection
. get ( 'identifier' )
264 server
= connection
. get ( 'server' )
266 'url' : ' %s :// %s / %s ? %s ' % ( protocol
, server
, application
, auth_string
),
267 'play_path' : identifier
,
268 'app' : ' %s ? %s ' % ( application
, auth_string
),
269 'page_url' : 'http://www.bbc.co.uk' ,
270 'player_url' : 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf' ,
273 'format_id' : supplier
,
277 def _extract_items ( self
, playlist
):
278 return playlist
. findall ( './{ %s }item' % self
._ EMP
_ PLAYLIST
_ NS
)
280 def _findall_ns ( self
, element
, xpath
):
282 for ns
in self
._ NAMESPACES
:
283 elements
. extend ( element
. findall ( xpath
% ns
))
286 def _extract_medias ( self
, media_selection
):
287 error
= media_selection
. find ( './{ %s }error' % self
._ MEDIASELECTION
_ NS
)
289 media_selection
. find ( './{ %s }error' % self
._ EMP
_ PLAYLIST
_ NS
)
290 if error
is not None :
291 raise BBCCoUkIE
. MediaSelectionError ( error
. get ( 'id' ))
292 return self
._ findall
_ ns
( media_selection
, './{ %s }media' )
294 def _extract_connections ( self
, media
):
295 return self
._ findall
_ ns
( media
, './{ %s }connection' )
297 def _extract_video ( self
, media
, programme_id
):
299 vbr
= int_or_none ( media
. get ( 'bitrate' ))
300 vcodec
= media
. get ( 'encoding' )
301 service
= media
. get ( 'service' )
302 width
= int_or_none ( media
. get ( 'width' ))
303 height
= int_or_none ( media
. get ( 'height' ))
304 file_size
= int_or_none ( media
. get ( 'media_file_size' ))
305 for connection
in self
._ extract
_ connections
( media
):
306 conn_formats
= self
._ extract
_ connection
( connection
, programme_id
)
307 for format
in conn_formats
:
313 'filesize' : file_size
,
316 format
[ 'format_id' ] = ' %s _ %s ' % ( service
, format
[ 'format_id' ])
317 formats
. extend ( conn_formats
)
320 def _extract_audio ( self
, media
, programme_id
):
322 abr
= int_or_none ( media
. get ( 'bitrate' ))
323 acodec
= media
. get ( 'encoding' )
324 service
= media
. get ( 'service' )
325 for connection
in self
._ extract
_ connections
( media
):
326 conn_formats
= self
._ extract
_ connection
( connection
, programme_id
)
327 for format
in conn_formats
:
329 'format_id' : ' %s _ %s ' % ( service
, format
[ 'format_id' ]),
333 formats
. extend ( conn_formats
)
336 def _get_subtitles ( self
, media
, programme_id
):
338 for connection
in self
._ extract
_ connections
( media
):
339 captions
= self
._ download
_ xml
( connection
. get ( 'href' ), programme_id
, 'Downloading captions' )
340 lang
= captions
. get ( '{http://www.w3.org/XML/1998/namespace}lang' , 'en' )
343 'url' : connection
. get ( 'href' ),
349 def _raise_extractor_error ( self
, media_selection_error
):
350 raise ExtractorError (
351 ' %s returned error: %s ' % ( self
. IE_NAME
, media_selection_error
. id ),
354 def _download_media_selector ( self
, programme_id
):
355 last_exception
= None
356 for mediaselector_url
in self
._ MEDIASELECTOR
_U RLS
:
358 return self
._ download
_ media
_ selector
_u rl
(
359 mediaselector_url
% programme_id
, programme_id
)
360 except BBCCoUkIE
. MediaSelectionError
as e
:
361 if e
. id in ( 'notukerror' , 'geolocation' , 'selectionunavailable' ):
364 self
._ raise
_ extractor
_ error
( e
)
365 self
._ raise
_ extractor
_ error
( last_exception
)
367 def _download_media_selector_url ( self
, url
, programme_id
= None ):
369 media_selection
= self
._ download
_ xml
(
370 url
, programme_id
, 'Downloading media selection XML' )
371 except ExtractorError
as ee
:
372 if isinstance ( ee
. cause
, compat_HTTPError
) and ee
. cause
. code
in ( 403 , 404 ):
373 media_selection
= compat_etree_fromstring ( ee
. cause
. read (). decode ( 'utf-8' ))
376 return self
._ process
_ media
_ selector
( media_selection
, programme_id
)
378 def _process_media_selector ( self
, media_selection
, programme_id
):
382 for media
in self
._ extract
_ medias
( media_selection
):
383 kind
= media
. get ( 'kind' )
385 formats
. extend ( self
._ extract
_ audio
( media
, programme_id
))
386 elif kind
== 'video' :
387 formats
. extend ( self
._ extract
_ video
( media
, programme_id
))
388 elif kind
== 'captions' :
389 subtitles
= self
. extract_subtitles ( media
, programme_id
)
390 return formats
, subtitles
392 def _download_playlist ( self
, playlist_id
):
394 playlist
= self
._ download
_ json
(
395 'http://www.bbc.co.uk/programmes/ %s /playlist.json' % playlist_id
,
396 playlist_id
, 'Downloading playlist JSON' )
398 version
= playlist
. get ( 'defaultAvailableVersion' )
400 smp_config
= version
[ 'smpConfig' ]
401 title
= smp_config
[ 'title' ]
402 description
= smp_config
[ 'summary' ]
403 for item
in smp_config
[ 'items' ]:
405 if kind
!= 'programme' and kind
!= 'radioProgramme' :
407 programme_id
= item
. get ( 'vpid' )
408 duration
= int_or_none ( item
. get ( 'duration' ))
409 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
410 return programme_id
, title
, description
, duration
, formats
, subtitles
411 except ExtractorError
as ee
:
412 if not ( isinstance ( ee
. cause
, compat_HTTPError
) and ee
. cause
. code
== 404 ):
415 # fallback to legacy playlist
416 return self
._ process
_l egacy
_ playlist
( playlist_id
)
418 def _process_legacy_playlist_url ( self
, url
, display_id
):
419 playlist
= self
._ download
_l egacy
_ playlist
_u rl
( url
, display_id
)
420 return self
._ extract
_ from
_l egacy
_ playlist
( playlist
, display_id
)
422 def _process_legacy_playlist ( self
, playlist_id
):
423 return self
._ process
_l egacy
_ playlist
_u rl
(
424 'http://www.bbc.co.uk/iplayer/playlist/ %s ' % playlist_id
, playlist_id
)
426 def _download_legacy_playlist_url ( self
, url
, playlist_id
= None ):
427 return self
._ download
_ xml
(
428 url
, playlist_id
, 'Downloading legacy playlist XML' )
430 def _extract_from_legacy_playlist ( self
, playlist
, playlist_id
):
431 no_items
= playlist
. find ( './{ %s }noItems' % self
._ EMP
_ PLAYLIST
_ NS
)
432 if no_items
is not None :
433 reason
= no_items
. get ( 'reason' )
434 if reason
== 'preAvailability' :
435 msg
= 'Episode %s is not yet available' % playlist_id
436 elif reason
== 'postAvailability' :
437 msg
= 'Episode %s is no longer available' % playlist_id
438 elif reason
== 'noMedia' :
439 msg
= 'Episode %s is not currently available' % playlist_id
441 msg
= 'Episode %s is not available: %s ' % ( playlist_id
, reason
)
442 raise ExtractorError ( msg
, expected
= True )
444 for item
in self
._ extract
_ items
( playlist
):
445 kind
= item
. get ( 'kind' )
446 if kind
!= 'programme' and kind
!= 'radioProgramme' :
448 title
= playlist
. find ( './{ %s }title' % self
._ EMP
_ PLAYLIST
_ NS
). text
449 description_el
= playlist
. find ( './{ %s }summary' % self
._ EMP
_ PLAYLIST
_ NS
)
450 description
= description_el
. text
if description_el
is not None else None
452 def get_programme_id ( item
):
453 def get_from_attributes ( item
):
454 for p
in ( 'identifier' , 'group' ):
456 if value
and re
. match ( r
'^[pb][\da-z] {7} $' , value
):
458 get_from_attributes ( item
)
459 mediator
= item
. find ( './{ %s }mediator' % self
._ EMP
_ PLAYLIST
_ NS
)
460 if mediator
is not None :
461 return get_from_attributes ( mediator
)
463 programme_id
= get_programme_id ( item
)
464 duration
= int_or_none ( item
. get ( 'duration' ))
467 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
469 formats
, subtitles
= self
._ process
_ media
_ selector
( item
, playlist_id
)
470 programme_id
= playlist_id
472 return programme_id
, title
, description
, duration
, formats
, subtitles
474 def _real_extract ( self
, url
):
475 group_id
= self
._ match
_ id
( url
)
477 webpage
= self
._ download
_ webpage
( url
, group_id
, 'Downloading video page' )
482 tviplayer
= self
._ search
_ regex
(
483 r
'mediator\.bind\(({.+?})\s*,\s*document\.getElementById' ,
484 webpage
, 'player' , default
= None )
487 player
= self
._ parse
_ json
( tviplayer
, group_id
). get ( 'player' , {})
488 duration
= int_or_none ( player
. get ( 'duration' ))
489 programme_id
= player
. get ( 'vpid' )
492 programme_id
= self
._ search
_ regex
(
493 r
'"vpid"\s*:\s*"( %s )"' % self
._ ID
_ REGEX
, webpage
, 'vpid' , fatal
= False , default
= None )
496 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
497 title
= self
._ og
_ search
_ title
( webpage
, default
= None ) or self
._ html
_ search
_ regex
(
498 ( r
'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>' ,
499 r
'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>' ), webpage
, 'title' )
500 description
= self
._ search
_ regex
(
501 ( r
'<p class="[^"]*medium-description[^"]*">([^<]+)</p>' ,
502 r
'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>' ),
503 webpage
, 'description' , default
= None )
505 description
= self
._ html
_ search
_ meta
( 'description' , webpage
)
507 programme_id
, title
, description
, duration
, formats
, subtitles
= self
._ download
_ playlist
( group_id
)
509 self
._ sort
_ formats
( formats
)
514 'description' : description
,
515 'thumbnail' : self
._ og
_ search
_ thumbnail
( webpage
, default
= None ),
516 'duration' : duration
,
518 'subtitles' : subtitles
,
522 class BBCIE ( BBCCoUkIE
):
525 _VALID_URL
= r
'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
527 _MEDIASELECTOR_URLS
= [
528 # Provides HQ HLS streams but fails with geolocation in some cases when it's
529 # even not geo restricted at all
530 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/ %s ' ,
531 # Provides more formats, namely direct mp4 links, but fails on some videos with
532 # notukerror for non UK (?) users (e.g.
533 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
534 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ %s ' ,
535 # Provides fewer formats, but works everywhere for everybody (hopefully)
536 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/ %s ' ,
540 # article with multiple videos embedded with data-playable containing vpids
541 'url' : 'http://www.bbc.com/news/world-europe-32668511' ,
543 'id' : 'world-europe-32668511' ,
544 'title' : 'Russia stages massive WW2 parade despite Western boycott' ,
545 'description' : 'md5:00ff61976f6081841f759a08bf78cc9c' ,
549 # article with multiple videos embedded with data-playable (more videos)
550 'url' : 'http://www.bbc.com/news/business-28299555' ,
552 'id' : 'business-28299555' ,
553 'title' : 'Farnborough Airshow: Video highlights' ,
554 'description' : 'BBC reports and video highlights at the Farnborough Airshow.' ,
559 # article with multiple videos embedded with `new SMP()`
561 'url' : 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460' ,
563 'id' : '3662a707-0af9-3149-963f-47bea720b460' ,
564 'title' : 'BBC Blogs - Adam Curtis - BUGGER' ,
566 'playlist_count' : 18 ,
568 # single video embedded with data-playable containing vpid
569 'url' : 'http://www.bbc.com/news/world-europe-32041533' ,
573 'title' : 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV' ,
574 'description' : 'md5:2868290467291b37feda7863f7a83f54' ,
576 'timestamp' : 1427219242 ,
577 'upload_date' : '20150324' ,
581 'skip_download' : True ,
584 # article with single video embedded with data-playable containing XML playlist
585 # with direct video links as progressiveDownloadUrl (for now these are extracted)
586 # and playlist with f4m and m3u8 as streamingUrl
587 'url' : 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu' ,
589 'id' : '150615_telabyad_kentin_cogu' ,
591 'title' : "YPG: Tel Abyad'ın tamamı kontrolĆ¼mĆ¼zde" ,
592 'timestamp' : 1434397334 ,
593 'upload_date' : '20150615' ,
596 'skip_download' : True ,
599 # single video embedded with data-playable containing XML playlists (regional section)
600 'url' : 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw' ,
602 'id' : '150619_video_honduras_militares_hospitales_corrupcion_aw' ,
604 'title' : 'Honduras militariza sus hospitales por nuevo escĆ”ndalo de corrupciĆ³n' ,
605 'timestamp' : 1434713142 ,
606 'upload_date' : '20150619' ,
609 'skip_download' : True ,
612 # single video from video playlist embedded with vxp-playlist-data JSON
613 'url' : 'http://www.bbc.com/news/video_and_audio/must_see/33376376' ,
617 'title' : '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''' ,
619 'description' : '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''' ,
622 'skip_download' : True ,
625 # single video story with digitalData
626 'url' : 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret' ,
630 'title' : 'Sri Lankaās spicy secret' ,
631 'description' : 'As a new train line to Jaffna opens up the countryās north, travellers can experience a truly distinct slice of Tamil culture.' ,
632 'timestamp' : 1437674293 ,
633 'upload_date' : '20150723' ,
637 'skip_download' : True ,
640 # single video story without digitalData
641 'url' : 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star' ,
645 'title' : 'Hyundai Santa Fe Sport: Rock star' ,
646 'description' : 'md5:b042a26142c4154a6e472933cf20793d' ,
647 'timestamp' : 1415867444 ,
648 'upload_date' : '20141113' ,
652 'skip_download' : True ,
655 # single video with playlist.sxml URL in playlist param
656 'url' : 'http://www.bbc.com/sport/0/football/33653409' ,
660 'title' : 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?' ,
661 'description' : 'BBC Sport \' s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.' ,
666 'skip_download' : True ,
669 # article with multiple videos embedded with playlist.sxml in playlist param
670 'url' : 'http://www.bbc.com/sport/0/football/34475836' ,
673 'title' : 'What Liverpool can expect from Klopp' ,
677 # single video with playlist URL from weather section
678 'url' : 'http://www.bbc.com/weather/features/33601775' ,
679 'only_matching' : True ,
681 # custom redirection to www.bbc.com
682 'url' : 'http://www.bbc.co.uk/news/science-environment-33661876' ,
683 'only_matching' : True ,
def suitable(cls, url):
    """Tell whether this extractor should handle ``url``.

    URLs accepted by BBCCoUkIE or BBCCoUkArticleIE are always declined;
    anything else is decided by the parent class's ``suitable``.
    """
    # NOTE(review): takes ``cls`` - presumably bound as a classmethod by a
    # decorator that is not part of the visible text; confirm.
    if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url):
        return False
    return super(BBCIE, cls).suitable(url)
690 def _extract_from_media_meta ( self
, media_meta
, video_id
):
691 # Direct links to media in media metadata (e.g.
692 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
693 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
694 source_files
= media_meta
. get ( 'sourceFiles' )
698 'format_id' : format_id
,
699 'ext' : f
. get ( 'encoding' ),
700 'tbr' : float_or_none ( f
. get ( 'bitrate' ), 1000 ),
701 'filesize' : int_or_none ( f
. get ( 'filesize' )),
702 } for format_id
, f
in source_files
. items () if f
. get ( 'url' )], []
704 programme_id
= media_meta
. get ( 'externalId' )
706 return self
._ download
_ media
_ selector
( programme_id
)
708 # Process playlist.sxml as legacy playlist
709 href
= media_meta
. get ( 'href' )
711 playlist
= self
._ download
_l egacy
_ playlist
_u rl
( href
)
712 _
, _
, _
, _
, formats
, subtitles
= self
._ extract
_ from
_l egacy
_ playlist
( playlist
, video_id
)
713 return formats
, subtitles
717 def _extract_from_playlist_sxml ( self
, url
, playlist_id
, timestamp
):
718 programme_id
, title
, description
, duration
, formats
, subtitles
= \
719 self
._ process
_l egacy
_ playlist
_u rl
( url
, playlist_id
)
720 self
._ sort
_ formats
( formats
)
724 'description' : description
,
725 'duration' : duration
,
726 'timestamp' : timestamp
,
728 'subtitles' : subtitles
,
731 def _real_extract ( self
, url
):
732 playlist_id
= self
._ match
_ id
( url
)
734 webpage
= self
._ download
_ webpage
( url
, playlist_id
)
736 json_ld_info
= self
._ search
_ json
_l d
( webpage
, playlist_id
, default
= None )
737 timestamp
= json_ld_info
. get ( 'timestamp' )
738 playlist_title
= json_ld_info
. get ( 'title' )
739 playlist_description
= json_ld_info
. get ( 'description' )
742 timestamp
= parse_iso8601 ( self
._ search
_ regex
(
743 [ r
'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"' ,
744 r
'itemprop="datePublished"[^>]+datetime="([^"]+)"' ,
745 r
'"datePublished":\s*"([^"]+)' ],
746 webpage
, 'date' , default
= None ))
750 # article with multiple videos embedded with playlist.sxml (e.g.
751 # http://www.bbc.com/sport/0/football/34475836)
752 playlists
= re
. findall ( r
'<param[^>]+name="playlist"[^>]+value="([^"]+)"' , webpage
)
753 playlists
. extend ( re
. findall ( r
'data-media-id="([^"]+/playlist\.sxml)"' , webpage
))
756 self
._ extract
_ from
_ playlist
_ sxml
( playlist_url
, playlist_id
, timestamp
)
757 for playlist_url
in playlists
]
759 # news article with multiple videos embedded with data-playable
760 data_playables
= re
. findall ( r
'data-playable=(["\' ])({.+ ?
}) \
1 ', webpage)
762 for _, data_playable_json in data_playables:
763 data_playable = self._parse_json(
764 unescapeHTML(data_playable_json), playlist_id, fatal=False)
765 if not data_playable:
767 settings = data_playable.get(' settings
', {})
769 # data-playable with video vpid in settings.playlistObject.items (e.g.
770 # http://www.bbc.com/news/world-us-canada-34473351)
771 playlist_object = settings.get(' playlistObject
', {})
773 items = playlist_object.get(' items
')
774 if items and isinstance(items, list):
775 title = playlist_object[' title
']
776 description = playlist_object.get(' summary
')
777 duration = int_or_none(items[0].get(' duration
'))
778 programme_id = items[0].get(' vpid
')
779 formats, subtitles = self._download_media_selector(programme_id)
780 self._sort_formats(formats)
784 ' description
': description,
785 ' timestamp
': timestamp,
786 ' duration
': duration,
788 ' subtitles
': subtitles,
791 # data-playable without vpid but with a playlist.sxml URLs
792 # in otherSettings.playlist (e.g.
793 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
794 playlist = data_playable.get(' otherSettings
', {}).get(' playlist
', {})
796 entries.append(self._extract_from_playlist_sxml(
797 playlist.get(' progressiveDownloadUrl
'), playlist_id, timestamp))
800 playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News
')
801 playlist_description = playlist_description or self._og_search_description(webpage, default=None)
802 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
804 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
805 programme_id = self._search_regex(
806 [r' data
- video
- player
- vpid
= "( %s )" ' % self._ID_REGEX,
807 r' < param
[ ^
>]+ name
= "externalIdentifier" [ ^
>]+ value
= "( %s )" ' % self._ID_REGEX,
808 r' videoId\s
*: \s
*[ " \' ]( %s )[" \' ] ' % self._ID_REGEX],
809 webpage, ' vpid
', default=None)
812 formats, subtitles = self._download_media_selector(programme_id)
813 self._sort_formats(formats)
814 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
815 digital_data = self._parse_json(
817 r' var\s
+ digitalData\s
*= \s
*({.+ ?
}); ?
\n ', webpage, ' digital data
', default=' {} '),
818 programme_id, fatal=False)
819 page_info = digital_data.get(' page
', {}).get(' pageInfo
', {})
820 title = page_info.get(' pageName
') or self._og_search_title(webpage)
821 description = page_info.get(' description
') or self._og_search_description(webpage)
822 timestamp = parse_iso8601(page_info.get(' publicationDate
')) or timestamp
826 ' description
': description,
827 ' timestamp
': timestamp,
829 ' subtitles
': subtitles,
832 playlist_title = self._html_search_regex(
833 r' < title
>(.* ?
)( ?
: \s
*- \s
* BBC
[ ^
]+) ?
</ title
> ', webpage, ' playlist title
')
834 playlist_description = self._og_search_description(webpage, default=None)
836 def extract_all(pattern):
837 return list(filter(None, map(
838 lambda s: self._parse_json(s, playlist_id, fatal=False),
839 re.findall(pattern, webpage))))
841 # Multiple video article (e.g.
842 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
843 EMBED_URL = r' https?
://( ?
: www\
.) ?bbc\
. co\
. uk
/( ?
:[ ^
/]+/)+ %s( ?
: \b [ ^
"]+)?' % self._ID_REGEX
845 for match in extract_all(r'new\s+SMP\(({.+?})\)'):
846 embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
847 if embed_url and re.match(EMBED_URL, embed_url):
848 entries.append(embed_url)
849 entries.extend(re.findall(
850 r'setPlaylist\(" ( %s) "\)' % EMBED_URL, webpage))
852 return self.playlist_result(
853 [self.url_result(entry, 'BBCCoUk') for entry in entries],
854 playlist_id, playlist_title, playlist_description)
856 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
857 medias = extract_all(r" data
- media
- meta
= '({[^' ]+}) '")
860 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
861 media_asset = self._search_regex(
862 r' mediaAssetPage\
. init\
( \s
*({.+ ?
}), "/',
863 webpage, 'media asset', default=None)
865 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
867 for video in media_asset_page.get('videos', {}).values():
868 medias.extend(video.values())
871 # Multiple video playlist with single `now playing` entry (e.g.
872 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
873 vxp_playlist = self._parse_json(
875 r'<script[^>]+class=" vxp
- playlist
- data
"[^>]+type=" application
/ json
"[^>]*>([^<]+)</script>',
876 webpage, 'playlist data'),
879 for item in vxp_playlist:
880 media = item.get('media')
883 playlist_medias.append(media)
884 # Download single video if found media with asset id matching the video id from URL
885 if item.get('advert', {}).get('assetId') == playlist_id:
888 # Fallback to the whole playlist
890 medias = playlist_medias
893 for num, media_meta in enumerate(medias, start=1):
894 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
897 self._sort_formats(formats)
899 video_id = media_meta.get('externalId')
901 video_id = playlist_id if len(medias) == 1 else ' %s-%s ' % (playlist_id, num)
903 title = media_meta.get('caption')
905 title = playlist_title if len(medias) == 1 else ' %s - Video %s ' % (playlist_title, num)
907 duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
910 for image in media_meta.get('images', {}).values():
911 images.extend(image.values())
912 if 'image' in media_meta:
913 images.append(media_meta['image'])
916 'url': image.get('href'),
917 'width': int_or_none(image.get('width')),
918 'height': int_or_none(image.get('height')),
919 } for image in images]
924 'thumbnails': thumbnails,
925 'duration': duration,
926 'timestamp': timestamp,
928 'subtitles': subtitles,
931 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
934 class BBCCoUkArticleIE(InfoExtractor):
935 _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
936 IE_NAME = 'bbc.co.uk:article'
937 IE_DESC = 'BBC articles'
940 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
942 'id': '3jNQLTMrPlYGTBn0WV6M2MS',
943 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
944 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
947 'add_ie': ['BBCCoUk'],
950 def _real_extract(self, url):
951 playlist_id = self._match_id(url)
953 webpage = self._download_webpage(url, playlist_id)
955 title = self._og_search_title(webpage)
956 description = self._og_search_description(webpage).strip()
958 entries = [self.url_result(programme_url) for programme_url in re.findall(
959 r'<div[^>]+typeof=" Clip
"[^>]+resource=" ([ ^
"]+)" ', webpage)]
961 return self.playlist_result(entries, playlist_id, title, description)