]>
 
 
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/bbc.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
  16  from  .. compat 
import  (  
  17      compat_etree_fromstring
,  
  22  class  BBCCoUkIE ( InfoExtractor
):  
  24      IE_DESC 
=  'BBC iPlayer'  
  25      _VALID_URL 
=  r
'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P<id>[\da-z] {8} )'  
  27      _MEDIASELECTOR_URLS 
= [  
  28          # Provides HQ HLS streams with even better quality that pc mediaset but fails  
  29          # with geolocation in some cases when it's even not geo restricted at all (e.g.  
  30          # http://www.bbc.co.uk/programmes/b06bp7lf)  
  31          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/ %s ' ,  
  32          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ %s ' ,  
  35      _MEDIASELECTION_NS 
=  'http://bbc.co.uk/2008/mp/mediaselection'  
  36      _EMP_PLAYLIST_NS 
=  'http://bbc.co.uk/2008/emp/playlist'  
  45              'url' :  'http://www.bbc.co.uk/programmes/b039g8p7' ,  
  49                  'title' :  'Kaleidoscope, Leonard Cohen' ,  
  50                  'description' :  'The Canadian poet and songwriter reflects on his musical career.' ,  
  55                  'skip_download' :  True ,  
  59              'url' :  'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/' ,  
  63                  'title' :  'The Man in Black: Series 3: The Printed Name' ,  
  64                  'description' :  "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey." ,  
  69                  'skip_download' :  True ,  
  71              'skip' :  'Episode is no longer available on BBC iPlayer Radio' ,  
  74              'url' :  'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/' ,  
  78                  'title' :  'The Voice UK: Series 3: Blind Auditions 5' ,  
  79                  'description' :  "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone." ,  
  84                  'skip_download' :  True ,  
  86              'skip' :  'Currently BBC iPlayer TV programmes are available to play in the UK only' ,  
  89              'url' :  'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion' ,  
  93                  'title' :  "Tomorrow's Worlds: The Unearthly History of Science Fiction" ,  
  94                  'description' :  '2. Invasion' ,  
  99                  'skip_download' :  True ,  
 101              'skip' :  'Currently BBC iPlayer TV programmes are available to play in the UK only' ,  
 103              'url' :  'http://www.bbc.co.uk/programmes/b04v20dw' ,  
 107                  'title' :  'Pete Tong, The Essential New Tune Special' ,  
 108                  'description' :  "Pete has a very special mix - all of 2014's Essential New Tunes!" ,  
 113                  'skip_download' :  True ,  
 116              'url' :  'http://www.bbc.co.uk/music/clips/p02frcc3' ,  
 121                  'title' :  'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix' ,  
 122                  'description' :  'French house superstar Madeon takes us out of the club and onto the after party.' ,  
 127                  'skip_download' :  True ,  
 130              'url' :  'http://www.bbc.co.uk/music/clips/p025c0zz' ,  
 135                  'title' :  'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)' ,  
 136                  'description' :  'Rae Morris performs Closer for BBC Three at Reading 2014' ,  
 141                  'skip_download' :  True ,  
 144              'url' :  'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls' ,  
 148                  'title' :  'Natural World, 2015-2016: 2. Super Powered Owls' ,  
 149                  'description' :  'md5:e4db5c937d0e95a7c6b5e654d429183d' ,  
 154                  'skip_download' :  True ,  
 156              'skip' :  'geolocation' ,  
 158              'url' :  'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition' ,  
 162                  'description' :  'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.' ,  
 163                  'title' :  'Royal Academy Summer Exhibition' ,  
 168                  'skip_download' :  True ,  
 170              'skip' :  'geolocation' ,  
 172              # iptv-all mediaset fails with geolocation however there is no geo restriction  
 173              # for this programme at all  
 174              'url' :  'http://www.bbc.co.uk/programmes/b06bp7lf' ,  
 178                  'title' :  "Annie Mac's Friday Night, B.Traits sits in for Annie" ,  
 179                  'description' :  'B.Traits sits in for Annie Mac with a Mini-Mix from Disclosure.' ,  
 184                  'skip_download' :  True ,  
 187              'url' :  'http://www.bbc.co.uk/iplayer/playlist/p01dvks4' ,  
 188              'only_matching' :  True ,  
 190              'url' :  'http://www.bbc.co.uk/music/clips#p02frcc3' ,  
 191              'only_matching' :  True ,  
 193              'url' :  'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo' ,  
 194              'only_matching' :  True ,  
 198      class  MediaSelectionError ( Exception ):  
 199          def  __init__ ( self
,  id ):  
 202      def  _extract_asx_playlist ( self
,  connection
,  programme_id
):  
 203          asx 
=  self
._ download
_ xml
( connection
. get ( 'href' ),  programme_id
,  'Downloading ASX playlist' )  
 204          return  [ ref
. get ( 'href' )  for  ref 
in  asx
. findall ( './Entry/ref' )]  
 206      def  _extract_connection ( self
,  connection
,  programme_id
):  
 208          kind 
=  connection
. get ( 'kind' )  
 209          protocol 
=  connection
. get ( 'protocol' )  
 210          supplier 
=  connection
. get ( 'supplier' )  
 211          if  protocol 
==  'http' :  
 212              href 
=  connection
. get ( 'href' )  
 213              transfer_format 
=  connection
. get ( 'transferFormat' )  
 215              if  supplier 
==  'asx' :  
 216                  for  i
,  ref 
in  enumerate ( self
._ extract
_ asx
_ playlist
( connection
,  programme_id
)):  
 219                          'format_id' :  'ref %s _ %s '  % ( i
,  supplier
),  
 221              # Skip DASH until supported  
 222              elif  transfer_format 
==  'dash' :  
 224              elif  transfer_format 
==  'hls' :  
 225                  m3u8_formats 
=  self
._ extract
_ m
3u8_ formats
(  
 226                      href
,  programme_id
,  ext
= 'mp4' ,  entry_protocol
= 'm3u8_native' ,  
 227                      m3u8_id
= supplier
,  fatal
= False )  
 229                      formats
. extend ( m3u8_formats
)  
 234                      'format_id' :  supplier 
or  kind 
or  protocol
,  
 236          elif  protocol 
==  'rtmp' :  
 237              application 
=  connection
. get ( 'application' ,  'ondemand' )  
 238              auth_string 
=  connection
. get ( 'authString' )  
 239              identifier 
=  connection
. get ( 'identifier' )  
 240              server 
=  connection
. get ( 'server' )  
 242                  'url' :  ' %s :// %s / %s ? %s '  % ( protocol
,  server
,  application
,  auth_string
),  
 243                  'play_path' :  identifier
,  
 244                  'app' :  ' %s ? %s '  % ( application
,  auth_string
),  
 245                  'page_url' :  'http://www.bbc.co.uk' ,  
 246                  'player_url' :  'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf' ,  
 249                  'format_id' :  supplier
,  
 253      def  _extract_items ( self
,  playlist
):  
 254          return  playlist
. findall ( './{ %s }item'  %  self
._ EMP
_ PLAYLIST
_ NS
)  
 256      def  _findall_ns ( self
,  element
,  xpath
):  
 258          for  ns 
in  self
._ NAMESPACES
:  
 259              elements
. extend ( element
. findall ( xpath 
%  ns
))  
 262      def  _extract_medias ( self
,  media_selection
):  
 263          error 
=  media_selection
. find ( './{ %s }error'  %  self
._ MEDIASELECTION
_ NS
)  
 265              media_selection
. find ( './{ %s }error'  %  self
._ EMP
_ PLAYLIST
_ NS
)  
 266          if  error 
is not None :  
 267              raise  BBCCoUkIE
. MediaSelectionError ( error
. get ( 'id' ))  
 268          return  self
._ findall
_ ns
( media_selection
,  './{ %s }media' )  
 270      def  _extract_connections ( self
,  media
):  
 271          return  self
._ findall
_ ns
( media
,  './{ %s }connection' )  
 273      def  _extract_video ( self
,  media
,  programme_id
):  
 275          vbr 
=  int_or_none ( media
. get ( 'bitrate' ))  
 276          vcodec 
=  media
. get ( 'encoding' )  
 277          service 
=  media
. get ( 'service' )  
 278          width 
=  int_or_none ( media
. get ( 'width' ))  
 279          height 
=  int_or_none ( media
. get ( 'height' ))  
 280          file_size 
=  int_or_none ( media
. get ( 'media_file_size' ))  
 281          for  connection 
in  self
._ extract
_ connections
( media
):  
 282              conn_formats 
=  self
._ extract
_ connection
( connection
,  programme_id
)  
 283              for  format 
in  conn_formats
:  
 289                      'filesize' :  file_size
,  
 292                      format
[ 'format_id' ] =  ' %s _ %s '  % ( service
,  format
[ 'format_id' ])  
 293              formats
. extend ( conn_formats
)  
 296      def  _extract_audio ( self
,  media
,  programme_id
):  
 298          abr 
=  int_or_none ( media
. get ( 'bitrate' ))  
 299          acodec 
=  media
. get ( 'encoding' )  
 300          service 
=  media
. get ( 'service' )  
 301          for  connection 
in  self
._ extract
_ connections
( media
):  
 302              conn_formats 
=  self
._ extract
_ connection
( connection
,  programme_id
)  
 303              for  format 
in  conn_formats
:  
 305                      'format_id' :  ' %s _ %s '  % ( service
,  format
[ 'format_id' ]),  
 309              formats
. extend ( conn_formats
)  
 312      def  _get_subtitles ( self
,  media
,  programme_id
):  
 314          for  connection 
in  self
._ extract
_ connections
( media
):  
 315              captions 
=  self
._ download
_ xml
( connection
. get ( 'href' ),  programme_id
,  'Downloading captions' )  
 316              lang 
=  captions
. get ( '{http://www.w3.org/XML/1998/namespace}lang' ,  'en' )  
 319                      'url' :  connection
. get ( 'href' ),  
 325      def  _raise_extractor_error ( self
,  media_selection_error
):  
 326          raise  ExtractorError (  
 327              ' %s  returned error:  %s '  % ( self
. IE_NAME
,  media_selection_error
. id ),  
 330      def  _download_media_selector ( self
,  programme_id
):  
 331          last_exception 
=  None  
 332          for  mediaselector_url 
in  self
._ MEDIASELECTOR
_U RLS
:  
 334                  return  self
._ download
_ media
_ selector
_u rl
(  
 335                      mediaselector_url 
%  programme_id
,  programme_id
)  
 336              except  BBCCoUkIE
. MediaSelectionError 
as  e
:  
 337                  if  e
. id  in  ( 'notukerror' ,  'geolocation' ):  
 340                  self
._ raise
_ extractor
_ error
( e
)  
 341          self
._ raise
_ extractor
_ error
( last_exception
)  
 343      def  _download_media_selector_url ( self
,  url
,  programme_id
= None ):  
 345              media_selection 
=  self
._ download
_ xml
(  
 346                  url
,  programme_id
,  'Downloading media selection XML' )  
 347          except  ExtractorError 
as  ee
:  
 348              if  isinstance ( ee
. cause
,  compat_HTTPError
)  and  ee
. cause
. code 
==  403 :  
 349                  media_selection 
=  compat_etree_fromstring ( ee
. cause
. read (). decode ( 'utf-8' ))  
 352          return  self
._ process
_ media
_ selector
( media_selection
,  programme_id
)  
 354      def  _process_media_selector ( self
,  media_selection
,  programme_id
):  
 358          for  media 
in  self
._ extract
_ medias
( media_selection
):  
 359              kind 
=  media
. get ( 'kind' )  
 361                  formats
. extend ( self
._ extract
_ audio
( media
,  programme_id
))  
 362              elif  kind 
==  'video' :  
 363                  formats
. extend ( self
._ extract
_ video
( media
,  programme_id
))  
 364              elif  kind 
==  'captions' :  
 365                  subtitles 
=  self
. extract_subtitles ( media
,  programme_id
)  
 366          return  formats
,  subtitles
 
 368      def  _download_playlist ( self
,  playlist_id
):  
 370              playlist 
=  self
._ download
_ json
(  
 371                  'http://www.bbc.co.uk/programmes/ %s /playlist.json'  %  playlist_id
,  
 372                  playlist_id
,  'Downloading playlist JSON' )  
 374              version 
=  playlist
. get ( 'defaultAvailableVersion' )  
 376                  smp_config 
=  version
[ 'smpConfig' ]  
 377                  title 
=  smp_config
[ 'title' ]  
 378                  description 
=  smp_config
[ 'summary' ]  
 379                  for  item 
in  smp_config
[ 'items' ]:  
 381                      if  kind 
!=  'programme'  and  kind 
!=  'radioProgramme' :  
 383                      programme_id 
=  item
. get ( 'vpid' )  
 384                      duration 
=  int_or_none ( item
. get ( 'duration' ))  
 385                      formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  
 386                  return  programme_id
,  title
,  description
,  duration
,  formats
,  subtitles
 
 387          except  ExtractorError 
as  ee
:  
 388              if not  ( isinstance ( ee
. cause
,  compat_HTTPError
)  and  ee
. cause
. code 
==  404 ):  
 391          # fallback to legacy playlist  
 392          return  self
._ process
_l egacy
_ playlist
( playlist_id
)  
 394      def  _process_legacy_playlist_url ( self
,  url
,  display_id
):  
 395          playlist 
=  self
._ download
_l egacy
_ playlist
_u rl
( url
,  display_id
)  
 396          return  self
._ extract
_ from
_l egacy
_ playlist
( playlist
,  display_id
)  
 398      def  _process_legacy_playlist ( self
,  playlist_id
):  
 399          return  self
._ process
_l egacy
_ playlist
_u rl
(  
 400              'http://www.bbc.co.uk/iplayer/playlist/ %s '  %  playlist_id
,  playlist_id
)  
 402      def  _download_legacy_playlist_url ( self
,  url
,  playlist_id
= None ):  
 403          return  self
._ download
_ xml
(  
 404              url
,  playlist_id
,  'Downloading legacy playlist XML' )  
 406      def  _extract_from_legacy_playlist ( self
,  playlist
,  playlist_id
):  
 407          no_items 
=  playlist
. find ( './{ %s }noItems'  %  self
._ EMP
_ PLAYLIST
_ NS
)  
 408          if  no_items 
is not None :  
 409              reason 
=  no_items
. get ( 'reason' )  
 410              if  reason 
==  'preAvailability' :  
 411                  msg 
=  'Episode  %s  is not yet available'  %  playlist_id
 
 412              elif  reason 
==  'postAvailability' :  
 413                  msg 
=  'Episode  %s  is no longer available'  %  playlist_id
 
 414              elif  reason 
==  'noMedia' :  
 415                  msg 
=  'Episode  %s  is not currently available'  %  playlist_id
 
 417                  msg 
=  'Episode  %s  is not available:  %s '  % ( playlist_id
,  reason
)  
 418              raise  ExtractorError ( msg
,  expected
= True )  
 420          for  item 
in  self
._ extract
_ items
( playlist
):  
 421              kind 
=  item
. get ( 'kind' )  
 422              if  kind 
!=  'programme'  and  kind 
!=  'radioProgramme' :  
 424              title 
=  playlist
. find ( './{ %s }title'  %  self
._ EMP
_ PLAYLIST
_ NS
). text
 
 425              description_el 
=  playlist
. find ( './{ %s }summary'  %  self
._ EMP
_ PLAYLIST
_ NS
)  
 426              description 
=  description_el
. text 
if  description_el 
is not None else None  
 428              def  get_programme_id ( item
):  
 429                  def  get_from_attributes ( item
):  
 430                      for  p 
in ( 'identifier' ,  'group' ):  
 432                          if  value 
and  re
. match ( r
'^[pb][\da-z] {7} $' ,  value
):  
 434                  get_from_attributes ( item
)  
 435                  mediator 
=  item
. find ( './{ %s }mediator'  %  self
._ EMP
_ PLAYLIST
_ NS
)  
 436                  if  mediator 
is not None :  
 437                      return  get_from_attributes ( mediator
)  
 439              programme_id 
=  get_programme_id ( item
)  
 440              duration 
=  int_or_none ( item
. get ( 'duration' ))  
 443                  formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  
 445                  formats
,  subtitles 
=  self
._ process
_ media
_ selector
( item
,  playlist_id
)  
 446                  programme_id 
=  playlist_id
 
 448          return  programme_id
,  title
,  description
,  duration
,  formats
,  subtitles
 
 450      def  _real_extract ( self
,  url
):  
 451          group_id 
=  self
._ match
_ id
( url
)  
 453          webpage 
=  self
._ download
_ webpage
( url
,  group_id
,  'Downloading video page' )  
 457          tviplayer 
=  self
._ search
_ regex
(  
 458              r
'mediator\.bind\(({.+?})\s*,\s*document\.getElementById' ,  
 459              webpage
,  'player' ,  default
= None )  
 462              player 
=  self
._ parse
_ json
( tviplayer
,  group_id
). get ( 'player' , {})  
 463              duration 
=  int_or_none ( player
. get ( 'duration' ))  
 464              programme_id 
=  player
. get ( 'vpid' )  
 467              programme_id 
=  self
._ search
_ regex
(  
 468                  r
'"vpid"\s*:\s*"([\da-z] {8} )"' ,  webpage
,  'vpid' ,  fatal
= False ,  default
= None )  
 471              formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  
 472              title 
=  self
._ og
_ search
_ title
( webpage
)  
 473              description 
=  self
._ search
_ regex
(  
 474                  r
'<p class="[^"]*medium-description[^"]*">([^<]+)</p>' ,  
 475                  webpage
,  'description' ,  fatal
= False )  
 477              programme_id
,  title
,  description
,  duration
,  formats
,  subtitles 
=  self
._ download
_ playlist
( group_id
)  
 479          self
._ sort
_ formats
( formats
)  
 484              'description' :  description
,  
 485              'thumbnail' :  self
._ og
_ search
_ thumbnail
( webpage
,  default
= None ),  
 486              'duration' :  duration
,  
 488              'subtitles' :  subtitles
,  
 492  class  BBCIE ( BBCCoUkIE
):  
 495      _VALID_URL 
=  r
'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'  
 497      _MEDIASELECTOR_URLS 
= [  
 498          # Provides HQ HLS streams but fails with geolocation in some cases when it's  
 499          # even not geo restricted at all  
 500          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/ %s ' ,  
 501          # Provides more formats, namely direct mp4 links, but fails on some videos with  
 502          # notukerror for non UK (?) users (e.g.  
 503          # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)  
 504          'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ %s ' ,  
 505          # Provides fewer formats, but works everywhere for everybody (hopefully)  
 506          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/ %s ' ,  
 510          # article with multiple videos embedded with data-playable containing vpids  
 511          'url' :  'http://www.bbc.com/news/world-europe-32668511' ,  
 513              'id' :  'world-europe-32668511' ,  
 514              'title' :  'Russia stages massive WW2 parade despite Western boycott' ,  
 515              'description' :  'md5:00ff61976f6081841f759a08bf78cc9c' ,  
 519          # article with multiple videos embedded with data-playable (more videos)  
 520          'url' :  'http://www.bbc.com/news/business-28299555' ,  
 522              'id' :  'business-28299555' ,  
 523              'title' :  'Farnborough Airshow: Video highlights' ,  
 524              'description' :  'BBC reports and video highlights at the Farnborough Airshow.' ,  
 529          # article with multiple videos embedded with `new SMP()`  
 531          'url' :  'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460' ,  
 533              'id' :  '3662a707-0af9-3149-963f-47bea720b460' ,  
 534              'title' :  'BBC Blogs - Adam Curtis - BUGGER' ,  
 536          'playlist_count' :  18 ,  
 538          # single video embedded with data-playable containing vpid  
 539          'url' :  'http://www.bbc.com/news/world-europe-32041533' ,  
 543              'title' :  'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV' ,  
 544              'description' :  'md5:2868290467291b37feda7863f7a83f54' ,  
 546              'timestamp' :  1427219242 ,  
 547              'upload_date' :  '20150324' ,  
 551              'skip_download' :  True ,  
 554          # article with single video embedded with data-playable containing XML playlist  
 555          # with direct video links as progressiveDownloadUrl (for now these are extracted)  
 556          # and playlist with f4m and m3u8 as streamingUrl  
 557          'url' :  'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu' ,  
 559              'id' :  '150615_telabyad_kentin_cogu' ,  
 561              'title' :  "YPG: Tel Abyad'ın tamamı kontrolümüzde" ,  
 562              'timestamp' :  1434397334 ,  
 563              'upload_date' :  '20150615' ,  
 566              'skip_download' :  True ,  
 569          # single video embedded with data-playable containing XML playlists (regional section)  
 570          'url' :  'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw' ,  
 572              'id' :  '150619_video_honduras_militares_hospitales_corrupcion_aw' ,  
 574              'title' :  'Honduras militariza sus hospitales por nuevo escÔndalo de corrupción' ,  
 575              'timestamp' :  1434713142 ,  
 576              'upload_date' :  '20150619' ,  
 579              'skip_download' :  True ,  
 582          # single video from video playlist embedded with vxp-playlist-data JSON  
 583          'url' :  'http://www.bbc.com/news/video_and_audio/must_see/33376376' ,  
 587              'title' :  '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''' ,  
 591              'skip_download' :  True ,  
 594          # single video story with digitalData  
 595          'url' :  'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret' ,  
 599              'title' :  'Sri Lankaās spicy secret' ,  
 600              'description' :  'As a new train line to Jaffna opens up the countryās north, travellers can experience a truly distinct slice of Tamil culture.' ,  
 601              'timestamp' :  1437674293 ,  
 602              'upload_date' :  '20150723' ,  
 606              'skip_download' :  True ,  
 609          # single video story without digitalData  
 610          'url' :  'http://www.bbc.com/autos/story/20130513-hyundais-rock-star' ,  
 614              'title' :  'Hyundai Santa Fe Sport: Rock star' ,  
 615              'description' :  'md5:b042a26142c4154a6e472933cf20793d' ,  
 616              'timestamp' :  1415867444 ,  
 617              'upload_date' :  '20141113' ,  
 621              'skip_download' :  True ,  
 624          # single video with playlist.sxml URL in playlist param  
 625          'url' :  'http://www.bbc.com/sport/0/football/33653409' ,  
 629              'title' :  'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?' ,  
 630              'description' :  'BBC Sport \' s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.' ,  
 635              'skip_download' :  True ,  
 638          # article with multiple videos embedded with playlist.sxml in playlist param  
 639          'url' :  'http://www.bbc.com/sport/0/football/34475836' ,  
 642              'title' :  'What Liverpool can expect from Klopp' ,  
 646          # single video with playlist URL from weather section  
 647          'url' :  'http://www.bbc.com/weather/features/33601775' ,  
 648          'only_matching' :  True ,  
 650          # custom redirection to www.bbc.com  
 651          'url' :  'http://www.bbc.co.uk/news/science-environment-33661876' ,  
 652          'only_matching' :  True ,  
 656      def  suitable ( cls
,  url
):  
 657          return False if  BBCCoUkIE
. suitable ( url
)  or  BBCCoUkArticleIE
. suitable ( url
)  else  super ( BBCIE
,  cls
). suitable ( url
)  
 659      def  _extract_from_media_meta ( self
,  media_meta
,  video_id
):  
 660          # Direct links to media in media metadata (e.g.  
 661          # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)  
 662          # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml  
 663          source_files 
=  media_meta
. get ( 'sourceFiles' )  
 667                  'format_id' :  format_id
,  
 668                  'ext' :  f
. get ( 'encoding' ),  
 669                  'tbr' :  float_or_none ( f
. get ( 'bitrate' ),  1000 ),  
 670                  'filesize' :  int_or_none ( f
. get ( 'filesize' )),  
 671              }  for  format_id
,  f 
in  source_files
. items ()  if  f
. get ( 'url' )], []  
 673          programme_id 
=  media_meta
. get ( 'externalId' )  
 675              return  self
._ download
_ media
_ selector
( programme_id
)  
 677          # Process playlist.sxml as legacy playlist  
 678          href 
=  media_meta
. get ( 'href' )  
 680              playlist 
=  self
._ download
_l egacy
_ playlist
_u rl
( href
)  
 681              _
,  _
,  _
,  _
,  formats
,  subtitles 
=  self
._ extract
_ from
_l egacy
_ playlist
( playlist
,  video_id
)  
 682              return  formats
,  subtitles
 
 686      def  _extract_from_playlist_sxml ( self
,  url
,  playlist_id
,  timestamp
):  
 687          programme_id
,  title
,  description
,  duration
,  formats
,  subtitles 
=  \
 
 688              self
._ process
_l egacy
_ playlist
_u rl
( url
,  playlist_id
)  
 689          self
._ sort
_ formats
( formats
)  
 693              'description' :  description
,  
 694              'duration' :  duration
,  
 695              'timestamp' :  timestamp
,  
 697              'subtitles' :  subtitles
,  
 700      def  _real_extract ( self
,  url
):  
 701          playlist_id 
=  self
._ match
_ id
( url
)  
 703          webpage 
=  self
._ download
_ webpage
( url
,  playlist_id
)  
 706          playlist_title 
=  None  
 707          playlist_description 
=  None  
 709          ld 
=  self
._ parse
_ json
(  
 711                  r
'(?s)<script type="application/ld\+json">(.+?)</script>' ,  
 712                  webpage
,  'ld json' ,  default
= '{}' ),  
 713              playlist_id
,  fatal
= False )  
 715              timestamp 
=  parse_iso8601 ( ld
. get ( 'datePublished' ))  
 716              playlist_title 
=  ld
. get ( 'headline' )  
 717              playlist_description 
=  ld
. get ( 'articleBody' )  
 720              timestamp 
=  parse_iso8601 ( self
._ search
_ regex
(  
 721                  [ r
'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"' ,  
 722                   r
'itemprop="datePublished"[^>]+datetime="([^"]+)"' ,  
 723                   r
'"datePublished":\s*"([^"]+)' ],  
 724                  webpage
,  'date' ,  default
= None ))  
 728          # article with multiple videos embedded with playlist.sxml (e.g.  
 729          # http://www.bbc.com/sport/0/football/34475836)  
 730          playlists 
=  re
. findall ( r
'<param[^>]+name="playlist"[^>]+value="([^"]+)"' ,  webpage
)  
 733                  self
._ extract
_ from
_ playlist
_ sxml
( playlist_url
,  playlist_id
,  timestamp
)  
 734                  for  playlist_url 
in  playlists
]  
 736          # news article with multiple videos embedded with data-playable  
 737          data_playables 
=  re
. findall ( r
'data-playable=(["\' ])({.+ ?
}) \
1 ', webpage)  
 739              for _, data_playable_json in data_playables:  
 740                  data_playable = self._parse_json(  
 741                      unescapeHTML(data_playable_json), playlist_id, fatal=False)  
 742                  if not data_playable:  
 744                  settings = data_playable.get(' settings
', {})  
 746                      # data-playable with video vpid in settings.playlistObject.items (e.g.  
 747                      # http://www.bbc.com/news/world-us-canada-34473351)  
 748                      playlist_object = settings.get(' playlistObject
', {})  
 750                          items = playlist_object.get(' items
')  
 751                          if items and isinstance(items, list):  
 752                              title = playlist_object[' title
']  
 753                              description = playlist_object.get(' summary
')  
 754                              duration = int_or_none(items[0].get(' duration
'))  
 755                              programme_id = items[0].get(' vpid
')  
 756                              formats, subtitles = self._download_media_selector(programme_id)  
 757                              self._sort_formats(formats)  
 761                                  ' description
': description,  
 762                                  ' timestamp
': timestamp,  
 763                                  ' duration
': duration,  
 765                                  ' subtitles
': subtitles,  
 768                          # data-playable without vpid but with a playlist.sxml URLs  
 769                          # in otherSettings.playlist (e.g.  
 770                          # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)  
 771                          playlist = data_playable.get(' otherSettings
', {}).get(' playlist
', {})  
 773                              entries.append(self._extract_from_playlist_sxml(  
 774                                  playlist.get(' progressiveDownloadUrl
'), playlist_id, timestamp))  
 777              playlist_title = playlist_title or remove_end(self._og_search_title(webpage), '  -  BBC News
')  
 778              playlist_description = playlist_description or self._og_search_description(webpage, default=None)  
 779              return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)  
 781          # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)  
 782          programme_id = self._search_regex(  
 783              [r' data
- video
- player
- vpid
= "([\da-z] {8} )" ',  
 784               r' < param
[ ^
>]+ name
= "externalIdentifier" [ ^
>]+ value
= "([\da-z] {8} )" '],  
 785              webpage, ' vpid
', default=None)  
 788              formats, subtitles = self._download_media_selector(programme_id)  
 789              self._sort_formats(formats)  
 790              # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)  
 791              digital_data = self._parse_json(  
 793                      r' var\s
+ digitalData\s
*= \s
*({.+ ?
}); ?
\n ', webpage, ' digital data
', default=' {} '),  
 794                  programme_id, fatal=False)  
 795              page_info = digital_data.get(' page
', {}).get(' pageInfo
', {})  
 796              title = page_info.get(' pageName
') or self._og_search_title(webpage)  
 797              description = page_info.get(' description
') or self._og_search_description(webpage)  
 798              timestamp = parse_iso8601(page_info.get(' publicationDate
')) or timestamp  
 802                  ' description
': description,  
 803                  ' timestamp
': timestamp,  
 805                  ' subtitles
': subtitles,  
 808          playlist_title = self._html_search_regex(  
 809              r' < title
>(.* ?
)( ?
: \s
*- \s
* BBC 
[ ^ 
]+) ?
</ title
> ', webpage, ' playlist title
')  
 810          playlist_description = self._og_search_description(webpage, default=None)  
 812          def extract_all(pattern):  
 813              return list(filter(None, map(  
 814                  lambda s: self._parse_json(s, playlist_id, fatal=False),  
 815                  re.findall(pattern, webpage))))  
 817          # Multiple video article (e.g.  
 818          # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)  
 819          EMBED_URL = r' https?
://( ?
: www\
.) ?bbc\
. co\
. uk
/( ?
:[ ^
/]+/)+[ \da
- z
] {8}
( ?
: \b [ ^
"]+)?'  
 821          for match in extract_all(r'new\s+SMP\(({.+?})\)'):  
 822              embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')  
 823              if embed_url and re.match(EMBED_URL, embed_url):  
 824                  entries.append(embed_url)  
 825          entries.extend(re.findall(  
 826              r'setPlaylist\(" ( %s) "\)' % EMBED_URL, webpage))  
 828              return self.playlist_result(  
 829                  [self.url_result(entry, 'BBCCoUk') for entry in entries],  
 830                  playlist_id, playlist_title, playlist_description)  
 832          # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)  
 833          medias = extract_all(r" data
- media
- meta
= '({[^' ]+}) '")  
 836              # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)  
 837              media_asset = self._search_regex(  
 838                  r' mediaAssetPage\
. init\
( \s
*({.+ ?
}),  "/',  
 839                  webpage, 'media asset', default=None)  
 841                  media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)  
 843                  for video in media_asset_page.get('videos', {}).values():  
 844                      medias.extend(video.values())  
 847              # Multiple video playlist with single `now playing` entry (e.g.  
 848              # http://www.bbc.com/news/video_and_audio/must_see/33767813)  
 849              vxp_playlist = self._parse_json(  
 851                      r'<script[^>]+class=" vxp
- playlist
- data
"[^>]+type=" application
/ json
"[^>]*>([^<]+)</script>',  
 852                      webpage, 'playlist data'),  
 855              for item in vxp_playlist:  
 856                  media = item.get('media')  
 859                  playlist_medias.append(media)  
 860                  # Download single video if found media with asset id matching the video id from URL  
 861                  if item.get('advert', {}).get('assetId') == playlist_id:  
 864              # Fallback to the whole playlist  
 866                  medias = playlist_medias  
 869          for num, media_meta in enumerate(medias, start=1):  
 870              formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)  
 873              self._sort_formats(formats)  
 875              video_id = media_meta.get('externalId')  
 877                  video_id = playlist_id if len(medias) == 1 else ' %s-%s ' % (playlist_id, num)  
 879              title = media_meta.get('caption')  
 881                  title = playlist_title if len(medias) == 1 else ' %s  - Video  %s ' % (playlist_title, num)  
 883              duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))  
 886              for image in media_meta.get('images', {}).values():  
 887                  images.extend(image.values())  
 888              if 'image' in media_meta:  
 889                  images.append(media_meta['image'])  
 892                  'url': image.get('href'),  
 893                  'width': int_or_none(image.get('width')),  
 894                  'height': int_or_none(image.get('height')),  
 895              } for image in images]  
 900                  'thumbnails': thumbnails,  
 901                  'duration': duration,  
 902                  'timestamp': timestamp,  
 904                  'subtitles': subtitles,  
 907          return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)  
 910  class BBCCoUkArticleIE(InfoExtractor):  
 911      _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'  
 912      IE_NAME = 'bbc.co.uk:article'  
 913      IE_DESC = 'BBC articles'  
 916          'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',  
 918              'id': '3jNQLTMrPlYGTBn0WV6M2MS',  
 919              'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',  
 920              'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',  
 923          'add_ie': ['BBCCoUk'],  
 926      def _real_extract(self, url):  
 927          playlist_id = self._match_id(url)  
 929          webpage = self._download_webpage(url, playlist_id)  
 931          title = self._og_search_title(webpage)  
 932          description = self._og_search_description(webpage).strip()  
 934          entries = [self.url_result(programme_url) for programme_url in re.findall(  
 935              r'<div[^>]+typeof=" Clip
"[^>]+resource=" ([ ^
"]+)" ', webpage)]  
 937          return self.playlist_result(entries, playlist_id, title, description)