]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/bbc.py
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
16 from .. compat
import (
17 compat_etree_fromstring
,
22 class BBCCoUkIE ( InfoExtractor
):
24 IE_DESC
= 'BBC iPlayer'
25 _VALID_URL
= r
'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P<id>[\da-z] {8} )'
27 _MEDIASELECTOR_URLS
= [
# Provides HQ HLS streams with even better quality than the pc mediaset, but fails
# with geolocation in some cases even when the content is not geo restricted at all (e.g.
# http://www.bbc.co.uk/programmes/b06bp7lf). It may also fail with selectionunavailable.
31 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/ %s ' ,
32 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ %s ' ,
35 _MEDIASELECTION_NS
= 'http://bbc.co.uk/2008/mp/mediaselection'
36 _EMP_PLAYLIST_NS
= 'http://bbc.co.uk/2008/emp/playlist'
45 'url' : 'http://www.bbc.co.uk/programmes/b039g8p7' ,
49 'title' : 'Kaleidoscope, Leonard Cohen' ,
50 'description' : 'The Canadian poet and songwriter reflects on his musical career.' ,
55 'skip_download' : True ,
59 'url' : 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/' ,
63 'title' : 'The Man in Black: Series 3: The Printed Name' ,
64 'description' : "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey." ,
69 'skip_download' : True ,
71 'skip' : 'Episode is no longer available on BBC iPlayer Radio' ,
74 'url' : 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/' ,
78 'title' : 'The Voice UK: Series 3: Blind Auditions 5' ,
79 'description' : "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone." ,
84 'skip_download' : True ,
86 'skip' : 'Currently BBC iPlayer TV programmes are available to play in the UK only' ,
89 'url' : 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion' ,
93 'title' : "Tomorrow's Worlds: The Unearthly History of Science Fiction" ,
94 'description' : '2. Invasion' ,
99 'skip_download' : True ,
101 'skip' : 'Currently BBC iPlayer TV programmes are available to play in the UK only' ,
103 'url' : 'http://www.bbc.co.uk/programmes/b04v20dw' ,
107 'title' : 'Pete Tong, The Essential New Tune Special' ,
108 'description' : "Pete has a very special mix - all of 2014's Essential New Tunes!" ,
113 'skip_download' : True ,
116 'url' : 'http://www.bbc.co.uk/music/clips/p02frcc3' ,
121 'title' : 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix' ,
122 'description' : 'French house superstar Madeon takes us out of the club and onto the after party.' ,
127 'skip_download' : True ,
130 'url' : 'http://www.bbc.co.uk/music/clips/p025c0zz' ,
135 'title' : 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)' ,
136 'description' : 'Rae Morris performs Closer for BBC Three at Reading 2014' ,
141 'skip_download' : True ,
144 'url' : 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls' ,
148 'title' : 'Natural World, 2015-2016: 2. Super Powered Owls' ,
149 'description' : 'md5:e4db5c937d0e95a7c6b5e654d429183d' ,
154 'skip_download' : True ,
156 'skip' : 'geolocation' ,
158 'url' : 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition' ,
162 'description' : 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.' ,
163 'title' : 'Royal Academy Summer Exhibition' ,
168 'skip_download' : True ,
170 'skip' : 'geolocation' ,
172 # iptv-all mediaset fails with geolocation however there is no geo restriction
173 # for this programme at all
174 'url' : 'http://www.bbc.co.uk/programmes/b06bp7lf' ,
178 'title' : "Annie Mac's Friday Night, B.Traits sits in for Annie" ,
179 'description' : 'B.Traits sits in for Annie Mac with a Mini-Mix from Disclosure.' ,
184 'skip_download' : True ,
187 'url' : 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4' ,
188 'only_matching' : True ,
190 'url' : 'http://www.bbc.co.uk/music/clips#p02frcc3' ,
191 'only_matching' : True ,
193 'url' : 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo' ,
194 'only_matching' : True ,
class MediaSelectionError(Exception):
    """Error reported by a media selector response.

    The error identifier is kept on ``id`` because the handlers in this
    file branch on ``e.id`` and include it in the message they raise.
    """

    def __init__(self, id):
        # ``id`` shadows the builtin, but it is the established parameter
        # and attribute name that callers rely on.
        self.id = id
def _extract_asx_playlist(self, connection, programme_id):
    """Return the stream URLs listed in the ASX playlist of a connection.

    The document at the connection's ``href`` attribute is fetched with
    ``self._download_xml`` (``programme_id`` is passed through as the
    video id), and the ``href`` attribute of every ``./Entry/ref``
    element found in it is returned, in document order.
    """
    asx = self._download_xml(
        connection.get('href'), programme_id, 'Downloading ASX playlist')
    return [ref.get('href') for ref in asx.findall('./Entry/ref')]
206 def _extract_connection ( self
, connection
, programme_id
):
208 kind
= connection
. get ( 'kind' )
209 protocol
= connection
. get ( 'protocol' )
210 supplier
= connection
. get ( 'supplier' )
211 if protocol
== 'http' :
212 href
= connection
. get ( 'href' )
213 transfer_format
= connection
. get ( 'transferFormat' )
215 if supplier
== 'asx' :
216 for i
, ref
in enumerate ( self
._ extract
_ asx
_ playlist
( connection
, programme_id
)):
219 'format_id' : 'ref %s _ %s ' % ( i
, supplier
),
221 # Skip DASH until supported
222 elif transfer_format
== 'dash' :
224 elif transfer_format
== 'hls' :
225 m3u8_formats
= self
._ extract
_ m
3u8_ formats
(
226 href
, programme_id
, ext
= 'mp4' , entry_protocol
= 'm3u8_native' ,
227 m3u8_id
= supplier
, fatal
= False )
229 formats
. extend ( m3u8_formats
)
234 'format_id' : supplier
or kind
or protocol
,
236 elif protocol
== 'rtmp' :
237 application
= connection
. get ( 'application' , 'ondemand' )
238 auth_string
= connection
. get ( 'authString' )
239 identifier
= connection
. get ( 'identifier' )
240 server
= connection
. get ( 'server' )
242 'url' : ' %s :// %s / %s ? %s ' % ( protocol
, server
, application
, auth_string
),
243 'play_path' : identifier
,
244 'app' : ' %s ? %s ' % ( application
, auth_string
),
245 'page_url' : 'http://www.bbc.co.uk' ,
246 'player_url' : 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf' ,
249 'format_id' : supplier
,
def _extract_items(self, playlist):
    """Return the direct ``item`` children of a playlist element.

    Only elements in the ``self._EMP_PLAYLIST_NS`` namespace are matched.
    """
    # The namespace must be wrapped in braces with no padding, otherwise
    # ElementTree never matches the qualified tag.
    return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
def _findall_ns(self, element, xpath):
    """Run ``xpath`` against ``element`` once per known namespace.

    ``xpath`` is a ``%``-template with a single ``%s`` placeholder that is
    filled with each entry of ``self._NAMESPACES`` in turn. The matches
    are concatenated in namespace order and returned as one list; the
    list is empty when nothing matches.
    """
    elements = []
    for ns in self._NAMESPACES:
        elements.extend(element.findall(xpath % ns))
    return elements
def _extract_medias(self, media_selection):
    """Return the ``media`` elements of a media selection document.

    An ``error`` child is looked up first in the media selection
    namespace and, when absent there, in the EMP playlist namespace.

    Raises:
        BBCCoUkIE.MediaSelectionError: if an ``error`` element is present
            in either namespace; its ``id`` attribute is carried on the
            exception.
    """
    error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
    if error is None:
        # Keep the result of the fallback lookup; discarding it would let
        # errors reported in the EMP playlist namespace go unnoticed.
        error = media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
    # Compare against None explicitly: an Element without children is falsy.
    if error is not None:
        raise BBCCoUkIE.MediaSelectionError(error.get('id'))
    return self._findall_ns(media_selection, './{%s}media')
def _extract_connections(self, media):
    """Return the ``connection`` children of ``media``.

    The lookup is delegated to ``self._findall_ns`` with the template
    ``'./{%s}connection'``.
    """
    return self._findall_ns(media, './{%s}connection')
273 def _extract_video ( self
, media
, programme_id
):
275 vbr
= int_or_none ( media
. get ( 'bitrate' ))
276 vcodec
= media
. get ( 'encoding' )
277 service
= media
. get ( 'service' )
278 width
= int_or_none ( media
. get ( 'width' ))
279 height
= int_or_none ( media
. get ( 'height' ))
280 file_size
= int_or_none ( media
. get ( 'media_file_size' ))
281 for connection
in self
._ extract
_ connections
( media
):
282 conn_formats
= self
._ extract
_ connection
( connection
, programme_id
)
283 for format
in conn_formats
:
289 'filesize' : file_size
,
292 format
[ 'format_id' ] = ' %s _ %s ' % ( service
, format
[ 'format_id' ])
293 formats
. extend ( conn_formats
)
296 def _extract_audio ( self
, media
, programme_id
):
298 abr
= int_or_none ( media
. get ( 'bitrate' ))
299 acodec
= media
. get ( 'encoding' )
300 service
= media
. get ( 'service' )
301 for connection
in self
._ extract
_ connections
( media
):
302 conn_formats
= self
._ extract
_ connection
( connection
, programme_id
)
303 for format
in conn_formats
:
305 'format_id' : ' %s _ %s ' % ( service
, format
[ 'format_id' ]),
309 formats
. extend ( conn_formats
)
312 def _get_subtitles ( self
, media
, programme_id
):
314 for connection
in self
._ extract
_ connections
( media
):
315 captions
= self
._ download
_ xml
( connection
. get ( 'href' ), programme_id
, 'Downloading captions' )
316 lang
= captions
. get ( '{http://www.w3.org/XML/1998/namespace}lang' , 'en' )
319 'url' : connection
. get ( 'href' ),
325 def _raise_extractor_error ( self
, media_selection_error
):
326 raise ExtractorError (
327 ' %s returned error: %s ' % ( self
. IE_NAME
, media_selection_error
. id ),
def _download_media_selector(self, programme_id):
    """Query the media selector endpoints in order until one succeeds.

    Each template in ``self._MEDIASELECTOR_URLS`` is filled with
    ``programme_id`` and handed to ``self._download_media_selector_url``;
    the first result obtained is returned unchanged.

    A ``MediaSelectionError`` whose ``id`` is ``'notukerror'``,
    ``'geolocation'`` or ``'selectionunavailable'`` is remembered and the
    next endpoint is tried. Any other ``MediaSelectionError`` is passed to
    ``self._raise_extractor_error`` immediately. When every endpoint has
    failed, the last remembered error is passed to
    ``self._raise_extractor_error``.
    """
    last_exception = None
    for mediaselector_url in self._MEDIASELECTOR_URLS:
        try:
            return self._download_media_selector_url(
                mediaselector_url % programme_id, programme_id)
        except BBCCoUkIE.MediaSelectionError as e:
            if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
                # Another endpoint may still serve this programme.
                last_exception = e
                continue
            self._raise_extractor_error(e)
    self._raise_extractor_error(last_exception)
343 def _download_media_selector_url ( self
, url
, programme_id
= None ):
345 media_selection
= self
._ download
_ xml
(
346 url
, programme_id
, 'Downloading media selection XML' )
347 except ExtractorError
as ee
:
348 if isinstance ( ee
. cause
, compat_HTTPError
) and ee
. cause
. code
in ( 403 , 404 ):
349 media_selection
= compat_etree_fromstring ( ee
. cause
. read (). decode ( 'utf-8' ))
352 return self
._ process
_ media
_ selector
( media_selection
, programme_id
)
354 def _process_media_selector ( self
, media_selection
, programme_id
):
358 for media
in self
._ extract
_ medias
( media_selection
):
359 kind
= media
. get ( 'kind' )
361 formats
. extend ( self
._ extract
_ audio
( media
, programme_id
))
362 elif kind
== 'video' :
363 formats
. extend ( self
._ extract
_ video
( media
, programme_id
))
364 elif kind
== 'captions' :
365 subtitles
= self
. extract_subtitles ( media
, programme_id
)
366 return formats
, subtitles
368 def _download_playlist ( self
, playlist_id
):
370 playlist
= self
._ download
_ json
(
371 'http://www.bbc.co.uk/programmes/ %s /playlist.json' % playlist_id
,
372 playlist_id
, 'Downloading playlist JSON' )
374 version
= playlist
. get ( 'defaultAvailableVersion' )
376 smp_config
= version
[ 'smpConfig' ]
377 title
= smp_config
[ 'title' ]
378 description
= smp_config
[ 'summary' ]
379 for item
in smp_config
[ 'items' ]:
381 if kind
!= 'programme' and kind
!= 'radioProgramme' :
383 programme_id
= item
. get ( 'vpid' )
384 duration
= int_or_none ( item
. get ( 'duration' ))
385 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
386 return programme_id
, title
, description
, duration
, formats
, subtitles
387 except ExtractorError
as ee
:
388 if not ( isinstance ( ee
. cause
, compat_HTTPError
) and ee
. cause
. code
== 404 ):
391 # fallback to legacy playlist
392 return self
._ process
_l egacy
_ playlist
( playlist_id
)
def _process_legacy_playlist_url(self, url, display_id):
    """Download the legacy playlist at ``url`` and extract from it.

    The document returned by ``self._download_legacy_playlist_url`` is
    passed, together with ``display_id``, to
    ``self._extract_from_legacy_playlist``, whose result is returned.
    """
    playlist = self._download_legacy_playlist_url(url, display_id)
    return self._extract_from_legacy_playlist(playlist, display_id)
def _process_legacy_playlist(self, playlist_id):
    """Process the legacy iPlayer playlist for ``playlist_id``.

    Builds ``http://www.bbc.co.uk/iplayer/playlist/<playlist_id>`` and
    delegates to ``self._process_legacy_playlist_url``.
    """
    return self._process_legacy_playlist_url(
        'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
def _download_legacy_playlist_url(self, url, playlist_id=None):
    """Fetch the legacy playlist XML at ``url``.

    Thin wrapper around ``self._download_xml``; ``playlist_id`` is passed
    through as the video id and may be left as ``None``.
    """
    return self._download_xml(
        url, playlist_id, 'Downloading legacy playlist XML')
406 def _extract_from_legacy_playlist ( self
, playlist
, playlist_id
):
407 no_items
= playlist
. find ( './{ %s }noItems' % self
._ EMP
_ PLAYLIST
_ NS
)
408 if no_items
is not None :
409 reason
= no_items
. get ( 'reason' )
410 if reason
== 'preAvailability' :
411 msg
= 'Episode %s is not yet available' % playlist_id
412 elif reason
== 'postAvailability' :
413 msg
= 'Episode %s is no longer available' % playlist_id
414 elif reason
== 'noMedia' :
415 msg
= 'Episode %s is not currently available' % playlist_id
417 msg
= 'Episode %s is not available: %s ' % ( playlist_id
, reason
)
418 raise ExtractorError ( msg
, expected
= True )
420 for item
in self
._ extract
_ items
( playlist
):
421 kind
= item
. get ( 'kind' )
422 if kind
!= 'programme' and kind
!= 'radioProgramme' :
424 title
= playlist
. find ( './{ %s }title' % self
._ EMP
_ PLAYLIST
_ NS
). text
425 description_el
= playlist
. find ( './{ %s }summary' % self
._ EMP
_ PLAYLIST
_ NS
)
426 description
= description_el
. text
if description_el
is not None else None
428 def get_programme_id ( item
):
429 def get_from_attributes ( item
):
430 for p
in ( 'identifier' , 'group' ):
432 if value
and re
. match ( r
'^[pb][\da-z] {7} $' , value
):
434 get_from_attributes ( item
)
435 mediator
= item
. find ( './{ %s }mediator' % self
._ EMP
_ PLAYLIST
_ NS
)
436 if mediator
is not None :
437 return get_from_attributes ( mediator
)
439 programme_id
= get_programme_id ( item
)
440 duration
= int_or_none ( item
. get ( 'duration' ))
443 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
445 formats
, subtitles
= self
._ process
_ media
_ selector
( item
, playlist_id
)
446 programme_id
= playlist_id
448 return programme_id
, title
, description
, duration
, formats
, subtitles
450 def _real_extract ( self
, url
):
451 group_id
= self
._ match
_ id
( url
)
453 webpage
= self
._ download
_ webpage
( url
, group_id
, 'Downloading video page' )
457 tviplayer
= self
._ search
_ regex
(
458 r
'mediator\.bind\(({.+?})\s*,\s*document\.getElementById' ,
459 webpage
, 'player' , default
= None )
462 player
= self
._ parse
_ json
( tviplayer
, group_id
). get ( 'player' , {})
463 duration
= int_or_none ( player
. get ( 'duration' ))
464 programme_id
= player
. get ( 'vpid' )
467 programme_id
= self
._ search
_ regex
(
468 r
'"vpid"\s*:\s*"([\da-z] {8} )"' , webpage
, 'vpid' , fatal
= False , default
= None )
471 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
472 title
= self
._ og
_ search
_ title
( webpage
)
473 description
= self
._ search
_ regex
(
474 r
'<p class="[^"]*medium-description[^"]*">([^<]+)</p>' ,
475 webpage
, 'description' , fatal
= False )
477 programme_id
, title
, description
, duration
, formats
, subtitles
= self
._ download
_ playlist
( group_id
)
479 self
._ sort
_ formats
( formats
)
484 'description' : description
,
485 'thumbnail' : self
._ og
_ search
_ thumbnail
( webpage
, default
= None ),
486 'duration' : duration
,
488 'subtitles' : subtitles
,
492 class BBCIE ( BBCCoUkIE
):
495 _VALID_URL
= r
'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
497 _MEDIASELECTOR_URLS
= [
# Provides HQ HLS streams but fails with geolocation in some cases even when
# the content is not geo restricted at all
500 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/ %s ' ,
501 # Provides more formats, namely direct mp4 links, but fails on some videos with
502 # notukerror for non UK (?) users (e.g.
503 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
504 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ %s ' ,
505 # Provides fewer formats, but works everywhere for everybody (hopefully)
506 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/ %s ' ,
510 # article with multiple videos embedded with data-playable containing vpids
511 'url' : 'http://www.bbc.com/news/world-europe-32668511' ,
513 'id' : 'world-europe-32668511' ,
514 'title' : 'Russia stages massive WW2 parade despite Western boycott' ,
515 'description' : 'md5:00ff61976f6081841f759a08bf78cc9c' ,
519 # article with multiple videos embedded with data-playable (more videos)
520 'url' : 'http://www.bbc.com/news/business-28299555' ,
522 'id' : 'business-28299555' ,
523 'title' : 'Farnborough Airshow: Video highlights' ,
524 'description' : 'BBC reports and video highlights at the Farnborough Airshow.' ,
529 # article with multiple videos embedded with `new SMP()`
531 'url' : 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460' ,
533 'id' : '3662a707-0af9-3149-963f-47bea720b460' ,
534 'title' : 'BBC Blogs - Adam Curtis - BUGGER' ,
536 'playlist_count' : 18 ,
538 # single video embedded with data-playable containing vpid
539 'url' : 'http://www.bbc.com/news/world-europe-32041533' ,
543 'title' : 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV' ,
544 'description' : 'md5:2868290467291b37feda7863f7a83f54' ,
546 'timestamp' : 1427219242 ,
547 'upload_date' : '20150324' ,
551 'skip_download' : True ,
554 # article with single video embedded with data-playable containing XML playlist
555 # with direct video links as progressiveDownloadUrl (for now these are extracted)
556 # and playlist with f4m and m3u8 as streamingUrl
557 'url' : 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu' ,
559 'id' : '150615_telabyad_kentin_cogu' ,
561 'title' : "YPG: Tel Abyad'ın tamamı kontrolĆ¼mĆ¼zde" ,
562 'timestamp' : 1434397334 ,
563 'upload_date' : '20150615' ,
566 'skip_download' : True ,
569 # single video embedded with data-playable containing XML playlists (regional section)
570 'url' : 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw' ,
572 'id' : '150619_video_honduras_militares_hospitales_corrupcion_aw' ,
574 'title' : 'Honduras militariza sus hospitales por nuevo escĆ”ndalo de corrupciĆ³n' ,
575 'timestamp' : 1434713142 ,
576 'upload_date' : '20150619' ,
579 'skip_download' : True ,
582 # single video from video playlist embedded with vxp-playlist-data JSON
583 'url' : 'http://www.bbc.com/news/video_and_audio/must_see/33376376' ,
587 'title' : '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''' ,
591 'skip_download' : True ,
594 # single video story with digitalData
595 'url' : 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret' ,
599 'title' : 'Sri Lankaās spicy secret' ,
600 'description' : 'As a new train line to Jaffna opens up the countryās north, travellers can experience a truly distinct slice of Tamil culture.' ,
601 'timestamp' : 1437674293 ,
602 'upload_date' : '20150723' ,
606 'skip_download' : True ,
609 # single video story without digitalData
610 'url' : 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star' ,
614 'title' : 'Hyundai Santa Fe Sport: Rock star' ,
615 'description' : 'md5:b042a26142c4154a6e472933cf20793d' ,
616 'timestamp' : 1415867444 ,
617 'upload_date' : '20141113' ,
621 'skip_download' : True ,
624 # single video with playlist.sxml URL in playlist param
625 'url' : 'http://www.bbc.com/sport/0/football/33653409' ,
629 'title' : 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?' ,
630 'description' : 'BBC Sport \' s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.' ,
635 'skip_download' : True ,
638 # article with multiple videos embedded with playlist.sxml in playlist param
639 'url' : 'http://www.bbc.com/sport/0/football/34475836' ,
642 'title' : 'What Liverpool can expect from Klopp' ,
646 # single video with playlist URL from weather section
647 'url' : 'http://www.bbc.com/weather/features/33601775' ,
648 'only_matching' : True ,
650 # custom redirection to www.bbc.com
651 'url' : 'http://www.bbc.co.uk/news/science-environment-33661876' ,
652 'only_matching' : True ,
def suitable(cls, url):
    """Tell whether this extractor should handle ``url``.

    URLs accepted by ``BBCCoUkIE`` or ``BBCCoUkArticleIE`` are rejected
    here so that those extractors take them; everything else is decided
    by the parent implementation.
    """
    # NOTE(review): this takes ``cls`` and calls ``super(BBCIE, cls)``, so it
    # is presumably decorated with ``@classmethod`` on the preceding line -
    # confirm.
    if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url):
        return False
    return super(BBCIE, cls).suitable(url)
659 def _extract_from_media_meta ( self
, media_meta
, video_id
):
660 # Direct links to media in media metadata (e.g.
661 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
662 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
663 source_files
= media_meta
. get ( 'sourceFiles' )
667 'format_id' : format_id
,
668 'ext' : f
. get ( 'encoding' ),
669 'tbr' : float_or_none ( f
. get ( 'bitrate' ), 1000 ),
670 'filesize' : int_or_none ( f
. get ( 'filesize' )),
671 } for format_id
, f
in source_files
. items () if f
. get ( 'url' )], []
673 programme_id
= media_meta
. get ( 'externalId' )
675 return self
._ download
_ media
_ selector
( programme_id
)
677 # Process playlist.sxml as legacy playlist
678 href
= media_meta
. get ( 'href' )
680 playlist
= self
._ download
_l egacy
_ playlist
_u rl
( href
)
681 _
, _
, _
, _
, formats
, subtitles
= self
._ extract
_ from
_l egacy
_ playlist
( playlist
, video_id
)
682 return formats
, subtitles
686 def _extract_from_playlist_sxml ( self
, url
, playlist_id
, timestamp
):
687 programme_id
, title
, description
, duration
, formats
, subtitles
= \
688 self
._ process
_l egacy
_ playlist
_u rl
( url
, playlist_id
)
689 self
._ sort
_ formats
( formats
)
693 'description' : description
,
694 'duration' : duration
,
695 'timestamp' : timestamp
,
697 'subtitles' : subtitles
,
700 def _real_extract ( self
, url
):
701 playlist_id
= self
._ match
_ id
( url
)
703 webpage
= self
._ download
_ webpage
( url
, playlist_id
)
706 playlist_title
= None
707 playlist_description
= None
709 ld
= self
._ parse
_ json
(
711 r
'(?s)<script type="application/ld\+json">(.+?)</script>' ,
712 webpage
, 'ld json' , default
= '{}' ),
713 playlist_id
, fatal
= False )
715 timestamp
= parse_iso8601 ( ld
. get ( 'datePublished' ))
716 playlist_title
= ld
. get ( 'headline' )
717 playlist_description
= ld
. get ( 'articleBody' )
720 timestamp
= parse_iso8601 ( self
._ search
_ regex
(
721 [ r
'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"' ,
722 r
'itemprop="datePublished"[^>]+datetime="([^"]+)"' ,
723 r
'"datePublished":\s*"([^"]+)' ],
724 webpage
, 'date' , default
= None ))
728 # article with multiple videos embedded with playlist.sxml (e.g.
729 # http://www.bbc.com/sport/0/football/34475836)
730 playlists
= re
. findall ( r
'<param[^>]+name="playlist"[^>]+value="([^"]+)"' , webpage
)
733 self
._ extract
_ from
_ playlist
_ sxml
( playlist_url
, playlist_id
, timestamp
)
734 for playlist_url
in playlists
]
736 # news article with multiple videos embedded with data-playable
737 data_playables
= re
. findall ( r
'data-playable=(["\' ])({.+ ?
}) \
1 ', webpage)
739 for _, data_playable_json in data_playables:
740 data_playable = self._parse_json(
741 unescapeHTML(data_playable_json), playlist_id, fatal=False)
742 if not data_playable:
744 settings = data_playable.get(' settings
', {})
746 # data-playable with video vpid in settings.playlistObject.items (e.g.
747 # http://www.bbc.com/news/world-us-canada-34473351)
748 playlist_object = settings.get(' playlistObject
', {})
750 items = playlist_object.get(' items
')
751 if items and isinstance(items, list):
752 title = playlist_object[' title
']
753 description = playlist_object.get(' summary
')
754 duration = int_or_none(items[0].get(' duration
'))
755 programme_id = items[0].get(' vpid
')
756 formats, subtitles = self._download_media_selector(programme_id)
757 self._sort_formats(formats)
761 ' description
': description,
762 ' timestamp
': timestamp,
763 ' duration
': duration,
765 ' subtitles
': subtitles,
768 # data-playable without vpid but with a playlist.sxml URLs
769 # in otherSettings.playlist (e.g.
770 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
771 playlist = data_playable.get(' otherSettings
', {}).get(' playlist
', {})
773 entries.append(self._extract_from_playlist_sxml(
774 playlist.get(' progressiveDownloadUrl
'), playlist_id, timestamp))
777 playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News
')
778 playlist_description = playlist_description or self._og_search_description(webpage, default=None)
779 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
781 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
782 programme_id = self._search_regex(
783 [r' data
- video
- player
- vpid
= "([\da-z] {8} )" ',
784 r' < param
[ ^
>]+ name
= "externalIdentifier" [ ^
>]+ value
= "([\da-z] {8} )" '],
785 webpage, ' vpid
', default=None)
788 formats, subtitles = self._download_media_selector(programme_id)
789 self._sort_formats(formats)
790 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
791 digital_data = self._parse_json(
793 r' var\s
+ digitalData\s
*= \s
*({.+ ?
}); ?
\n ', webpage, ' digital data
', default=' {} '),
794 programme_id, fatal=False)
795 page_info = digital_data.get(' page
', {}).get(' pageInfo
', {})
796 title = page_info.get(' pageName
') or self._og_search_title(webpage)
797 description = page_info.get(' description
') or self._og_search_description(webpage)
798 timestamp = parse_iso8601(page_info.get(' publicationDate
')) or timestamp
802 ' description
': description,
803 ' timestamp
': timestamp,
805 ' subtitles
': subtitles,
808 playlist_title = self._html_search_regex(
809 r' < title
>(.* ?
)( ?
: \s
*- \s
* BBC
[ ^
]+) ?
</ title
> ', webpage, ' playlist title
')
810 playlist_description = self._og_search_description(webpage, default=None)
812 def extract_all(pattern):
813 return list(filter(None, map(
814 lambda s: self._parse_json(s, playlist_id, fatal=False),
815 re.findall(pattern, webpage))))
817 # Multiple video article (e.g.
818 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
819 EMBED_URL = r' https?
://( ?
: www\
.) ?bbc\
. co\
. uk
/( ?
:[ ^
/]+/)+[ \da
- z
] {8}
( ?
: \b [ ^
"]+)?'
821 for match in extract_all(r'new\s+SMP\(({.+?})\)'):
822 embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
823 if embed_url and re.match(EMBED_URL, embed_url):
824 entries.append(embed_url)
825 entries.extend(re.findall(
826 r'setPlaylist\(" ( %s) "\)' % EMBED_URL, webpage))
828 return self.playlist_result(
829 [self.url_result(entry, 'BBCCoUk') for entry in entries],
830 playlist_id, playlist_title, playlist_description)
832 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
833 medias = extract_all(r" data
- media
- meta
= '({[^' ]+}) '")
836 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
837 media_asset = self._search_regex(
838 r' mediaAssetPage\
. init\
( \s
*({.+ ?
}), "/',
839 webpage, 'media asset', default=None)
841 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
843 for video in media_asset_page.get('videos', {}).values():
844 medias.extend(video.values())
847 # Multiple video playlist with single `now playing` entry (e.g.
848 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
849 vxp_playlist = self._parse_json(
851 r'<script[^>]+class=" vxp
- playlist
- data
"[^>]+type=" application
/ json
"[^>]*>([^<]+)</script>',
852 webpage, 'playlist data'),
855 for item in vxp_playlist:
856 media = item.get('media')
859 playlist_medias.append(media)
860 # Download single video if found media with asset id matching the video id from URL
861 if item.get('advert', {}).get('assetId') == playlist_id:
864 # Fallback to the whole playlist
866 medias = playlist_medias
869 for num, media_meta in enumerate(medias, start=1):
870 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
873 self._sort_formats(formats)
875 video_id = media_meta.get('externalId')
877 video_id = playlist_id if len(medias) == 1 else ' %s-%s ' % (playlist_id, num)
879 title = media_meta.get('caption')
881 title = playlist_title if len(medias) == 1 else ' %s - Video %s ' % (playlist_title, num)
883 duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
886 for image in media_meta.get('images', {}).values():
887 images.extend(image.values())
888 if 'image' in media_meta:
889 images.append(media_meta['image'])
892 'url': image.get('href'),
893 'width': int_or_none(image.get('width')),
894 'height': int_or_none(image.get('height')),
895 } for image in images]
900 'thumbnails': thumbnails,
901 'duration': duration,
902 'timestamp': timestamp,
904 'subtitles': subtitles,
907 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
910 class BBCCoUkArticleIE(InfoExtractor):
911 _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
912 IE_NAME = 'bbc.co.uk:article'
913 IE_DESC = 'BBC articles'
916 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
918 'id': '3jNQLTMrPlYGTBn0WV6M2MS',
919 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
920 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
923 'add_ie': ['BBCCoUk'],
926 def _real_extract(self, url):
927 playlist_id = self._match_id(url)
929 webpage = self._download_webpage(url, playlist_id)
931 title = self._og_search_title(webpage)
932 description = self._og_search_description(webpage).strip()
934 entries = [self.url_result(programme_url) for programme_url in re.findall(
935 r'<div[^>]+typeof=" Clip
"[^>]+resource=" ([ ^
"]+)" ', webpage)]
937 return self.playlist_result(entries, playlist_id, title, description)