]>
 
 
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/bbc.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
  16  from  .. compat 
import  (  
  17      compat_etree_fromstring
,  
  22  class  BBCCoUkIE ( InfoExtractor
):  
  24      IE_DESC 
=  'BBC iPlayer'  
  25      _ID_REGEX 
=  r
'[pb][\da-z] {7} '  
  28                          (?:www\.)?bbc\.co\.uk/  
  30                              programmes/(?!articles/)|  
  31                              iplayer(?:/[^/]+)?/(?:episode/|playlist/)|  
  38      _MEDIASELECTOR_URLS 
= [  
  39          # Provides HQ HLS streams with even better quality that pc mediaset but fails  
  40          # with geolocation in some cases when it's even not geo restricted at all (e.g.  
  41          # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.  
  42          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/ %s ' ,  
  43          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ %s ' ,  
  46      _MEDIASELECTION_NS 
=  'http://bbc.co.uk/2008/mp/mediaselection'  
  47      _EMP_PLAYLIST_NS 
=  'http://bbc.co.uk/2008/emp/playlist'  
  56              'url' :  'http://www.bbc.co.uk/programmes/b039g8p7' ,  
  60                  'title' :  'Leonard Cohen, Kaleidoscope - BBC Radio 4' ,  
  61                  'description' :  'The Canadian poet and songwriter reflects on his musical career.' ,  
  65                  'skip_download' :  True ,  
  69              'url' :  'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/' ,  
  73                  'title' :  'The Man in Black: Series 3: The Printed Name' ,  
  74                  'description' :  "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey." ,  
  79                  'skip_download' :  True ,  
  81              'skip' :  'Episode is no longer available on BBC iPlayer Radio' ,  
  84              'url' :  'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/' ,  
  88                  'title' :  'The Voice UK: Series 3: Blind Auditions 5' ,  
  89                  'description' :  'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.' ,  
  94                  'skip_download' :  True ,  
  96              'skip' :  'Currently BBC iPlayer TV programmes are available to play in the UK only' ,  
  99              'url' :  'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion' ,  
 103                  'title' :  "Tomorrow's Worlds: The Unearthly History of Science Fiction" ,  
 104                  'description' :  '2. Invasion' ,  
 109                  'skip_download' :  True ,  
 111              'skip' :  'Currently BBC iPlayer TV programmes are available to play in the UK only' ,  
 113              'url' :  'http://www.bbc.co.uk/programmes/b04v20dw' ,  
 117                  'title' :  'Pete Tong, The Essential New Tune Special' ,  
 118                  'description' :  "Pete has a very special mix - all of 2014's Essential New Tunes!" ,  
 123                  'skip_download' :  True ,  
 125              'skip' :  'Episode is no longer available on BBC iPlayer Radio' ,  
 127              'url' :  'http://www.bbc.co.uk/music/clips/p022h44b' ,  
 132                  'title' :  'BBC Proms Music Guides, Rachmaninov: Symphonic Dances' ,  
 133                  'description' :  "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances." ,  
 138                  'skip_download' :  True ,  
 141              'url' :  'http://www.bbc.co.uk/music/clips/p025c0zz' ,  
 146                  'title' :  'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)' ,  
 147                  'description' :  'Rae Morris performs Closer for BBC Three at Reading 2014' ,  
 152                  'skip_download' :  True ,  
 155              'url' :  'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls' ,  
 159                  'title' :  'Natural World, 2015-2016: 2. Super Powered Owls' ,  
 160                  'description' :  'md5:e4db5c937d0e95a7c6b5e654d429183d' ,  
 165                  'skip_download' :  True ,  
 167              'skip' :  'geolocation' ,  
 169              'url' :  'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition' ,  
 173                  'description' :  'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.' ,  
 174                  'title' :  'Royal Academy Summer Exhibition' ,  
 179                  'skip_download' :  True ,  
 181              'skip' :  'geolocation' ,  
 183              # iptv-all mediaset fails with geolocation however there is no geo restriction  
 184              # for this programme at all  
 185              'url' :  'http://www.bbc.co.uk/programmes/b06rkn85' ,  
 189                  'title' :  "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1" ,  
 190                  'description' :  "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!" ,  
 194                  'skip_download' :  True ,  
 197              # compact player (https://github.com/rg3/youtube-dl/issues/8147)  
 198              'url' :  'http://www.bbc.co.uk/programmes/p028bfkf/player' ,  
 202                  'title' :  'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews' ,  
 203                  'description' :  'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews' ,  
 207                  'skip_download' :  True ,  
 210              'url' :  'http://www.bbc.co.uk/iplayer/playlist/p01dvks4' ,  
 211              'only_matching' :  True ,  
 213              'url' :  'http://www.bbc.co.uk/music/clips#p02frcc3' ,  
 214              'only_matching' :  True ,  
 216              'url' :  'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo' ,  
 217              'only_matching' :  True ,  
 219              'url' :  'http://www.bbc.co.uk/radio/player/p03cchwf' ,  
 220              'only_matching' :  True ,  
 224      class  MediaSelectionError ( Exception ):  
 225          def  __init__ ( self
,  id ):  
 228      def  _extract_asx_playlist ( self
,  connection
,  programme_id
):  
 229          asx 
=  self
._ download
_ xml
( connection
. get ( 'href' ),  programme_id
,  'Downloading ASX playlist' )  
 230          return  [ ref
. get ( 'href' )  for  ref 
in  asx
. findall ( './Entry/ref' )]  
 232      def  _extract_connection ( self
,  connection
,  programme_id
):  
 234          kind 
=  connection
. get ( 'kind' )  
 235          protocol 
=  connection
. get ( 'protocol' )  
 236          supplier 
=  connection
. get ( 'supplier' )  
 237          if  protocol 
==  'http' :  
 238              href 
=  connection
. get ( 'href' )  
 239              transfer_format 
=  connection
. get ( 'transferFormat' )  
 241              if  supplier 
==  'asx' :  
 242                  for  i
,  ref 
in  enumerate ( self
._ extract
_ asx
_ playlist
( connection
,  programme_id
)):  
 245                          'format_id' :  'ref %s _ %s '  % ( i
,  supplier
),  
 247              # Skip DASH until supported  
 248              elif  transfer_format 
==  'dash' :  
 250              elif  transfer_format 
==  'hls' :  
 251                  formats
. extend ( self
._ extract
_ m
3u8_ formats
(  
 252                      href
,  programme_id
,  ext
= 'mp4' ,  entry_protocol
= 'm3u8_native' ,  
 253                      m3u8_id
= supplier
,  fatal
= False ))  
 258                      'format_id' :  supplier 
or  kind 
or  protocol
,  
 260          elif  protocol 
==  'rtmp' :  
 261              application 
=  connection
. get ( 'application' ,  'ondemand' )  
 262              auth_string 
=  connection
. get ( 'authString' )  
 263              identifier 
=  connection
. get ( 'identifier' )  
 264              server 
=  connection
. get ( 'server' )  
 266                  'url' :  ' %s :// %s / %s ? %s '  % ( protocol
,  server
,  application
,  auth_string
),  
 267                  'play_path' :  identifier
,  
 268                  'app' :  ' %s ? %s '  % ( application
,  auth_string
),  
 269                  'page_url' :  'http://www.bbc.co.uk' ,  
 270                  'player_url' :  'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf' ,  
 273                  'format_id' :  supplier
,  
 277      def  _extract_items ( self
,  playlist
):  
 278          return  playlist
. findall ( './{ %s }item'  %  self
._ EMP
_ PLAYLIST
_ NS
)  
 280      def  _findall_ns ( self
,  element
,  xpath
):  
 282          for  ns 
in  self
._ NAMESPACES
:  
 283              elements
. extend ( element
. findall ( xpath 
%  ns
))  
 286      def  _extract_medias ( self
,  media_selection
):  
 287          error 
=  media_selection
. find ( './{ %s }error'  %  self
._ MEDIASELECTION
_ NS
)  
 289              media_selection
. find ( './{ %s }error'  %  self
._ EMP
_ PLAYLIST
_ NS
)  
 290          if  error 
is not None :  
 291              raise  BBCCoUkIE
. MediaSelectionError ( error
. get ( 'id' ))  
 292          return  self
._ findall
_ ns
( media_selection
,  './{ %s }media' )  
 294      def  _extract_connections ( self
,  media
):  
 295          return  self
._ findall
_ ns
( media
,  './{ %s }connection' )  
 297      def  _extract_video ( self
,  media
,  programme_id
):  
 299          vbr 
=  int_or_none ( media
. get ( 'bitrate' ))  
 300          vcodec 
=  media
. get ( 'encoding' )  
 301          service 
=  media
. get ( 'service' )  
 302          width 
=  int_or_none ( media
. get ( 'width' ))  
 303          height 
=  int_or_none ( media
. get ( 'height' ))  
 304          file_size 
=  int_or_none ( media
. get ( 'media_file_size' ))  
 305          for  connection 
in  self
._ extract
_ connections
( media
):  
 306              conn_formats 
=  self
._ extract
_ connection
( connection
,  programme_id
)  
 307              for  format 
in  conn_formats
:  
 313                      'filesize' :  file_size
,  
 316                      format
[ 'format_id' ] =  ' %s _ %s '  % ( service
,  format
[ 'format_id' ])  
 317              formats
. extend ( conn_formats
)  
 320      def  _extract_audio ( self
,  media
,  programme_id
):  
 322          abr 
=  int_or_none ( media
. get ( 'bitrate' ))  
 323          acodec 
=  media
. get ( 'encoding' )  
 324          service 
=  media
. get ( 'service' )  
 325          for  connection 
in  self
._ extract
_ connections
( media
):  
 326              conn_formats 
=  self
._ extract
_ connection
( connection
,  programme_id
)  
 327              for  format 
in  conn_formats
:  
 329                      'format_id' :  ' %s _ %s '  % ( service
,  format
[ 'format_id' ]),  
 333              formats
. extend ( conn_formats
)  
 336      def  _get_subtitles ( self
,  media
,  programme_id
):  
 338          for  connection 
in  self
._ extract
_ connections
( media
):  
 339              captions 
=  self
._ download
_ xml
( connection
. get ( 'href' ),  programme_id
,  'Downloading captions' )  
 340              lang 
=  captions
. get ( '{http://www.w3.org/XML/1998/namespace}lang' ,  'en' )  
 343                      'url' :  connection
. get ( 'href' ),  
 349      def  _raise_extractor_error ( self
,  media_selection_error
):  
 350          raise  ExtractorError (  
 351              ' %s  returned error:  %s '  % ( self
. IE_NAME
,  media_selection_error
. id ),  
 354      def  _download_media_selector ( self
,  programme_id
):  
 355          last_exception 
=  None  
 356          for  mediaselector_url 
in  self
._ MEDIASELECTOR
_U RLS
:  
 358                  return  self
._ download
_ media
_ selector
_u rl
(  
 359                      mediaselector_url 
%  programme_id
,  programme_id
)  
 360              except  BBCCoUkIE
. MediaSelectionError 
as  e
:  
 361                  if  e
. id  in  ( 'notukerror' ,  'geolocation' ,  'selectionunavailable' ):  
 364                  self
._ raise
_ extractor
_ error
( e
)  
 365          self
._ raise
_ extractor
_ error
( last_exception
)  
 367      def  _download_media_selector_url ( self
,  url
,  programme_id
= None ):  
 369              media_selection 
=  self
._ download
_ xml
(  
 370                  url
,  programme_id
,  'Downloading media selection XML' )  
 371          except  ExtractorError 
as  ee
:  
 372              if  isinstance ( ee
. cause
,  compat_HTTPError
)  and  ee
. cause
. code 
in  ( 403 ,  404 ):  
 373                  media_selection 
=  compat_etree_fromstring ( ee
. cause
. read (). decode ( 'utf-8' ))  
 376          return  self
._ process
_ media
_ selector
( media_selection
,  programme_id
)  
 378      def  _process_media_selector ( self
,  media_selection
,  programme_id
):  
 382          for  media 
in  self
._ extract
_ medias
( media_selection
):  
 383              kind 
=  media
. get ( 'kind' )  
 385                  formats
. extend ( self
._ extract
_ audio
( media
,  programme_id
))  
 386              elif  kind 
==  'video' :  
 387                  formats
. extend ( self
._ extract
_ video
( media
,  programme_id
))  
 388              elif  kind 
==  'captions' :  
 389                  subtitles 
=  self
. extract_subtitles ( media
,  programme_id
)  
 390          return  formats
,  subtitles
 
 392      def  _download_playlist ( self
,  playlist_id
):  
 394              playlist 
=  self
._ download
_ json
(  
 395                  'http://www.bbc.co.uk/programmes/ %s /playlist.json'  %  playlist_id
,  
 396                  playlist_id
,  'Downloading playlist JSON' )  
 398              version 
=  playlist
. get ( 'defaultAvailableVersion' )  
 400                  smp_config 
=  version
[ 'smpConfig' ]  
 401                  title 
=  smp_config
[ 'title' ]  
 402                  description 
=  smp_config
[ 'summary' ]  
 403                  for  item 
in  smp_config
[ 'items' ]:  
 405                      if  kind 
!=  'programme'  and  kind 
!=  'radioProgramme' :  
 407                      programme_id 
=  item
. get ( 'vpid' )  
 408                      duration 
=  int_or_none ( item
. get ( 'duration' ))  
 409                      formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  
 410                  return  programme_id
,  title
,  description
,  duration
,  formats
,  subtitles
 
 411          except  ExtractorError 
as  ee
:  
 412              if not  ( isinstance ( ee
. cause
,  compat_HTTPError
)  and  ee
. cause
. code 
==  404 ):  
 415          # fallback to legacy playlist  
 416          return  self
._ process
_l egacy
_ playlist
( playlist_id
)  
 418      def  _process_legacy_playlist_url ( self
,  url
,  display_id
):  
 419          playlist 
=  self
._ download
_l egacy
_ playlist
_u rl
( url
,  display_id
)  
 420          return  self
._ extract
_ from
_l egacy
_ playlist
( playlist
,  display_id
)  
 422      def  _process_legacy_playlist ( self
,  playlist_id
):  
 423          return  self
._ process
_l egacy
_ playlist
_u rl
(  
 424              'http://www.bbc.co.uk/iplayer/playlist/ %s '  %  playlist_id
,  playlist_id
)  
 426      def  _download_legacy_playlist_url ( self
,  url
,  playlist_id
= None ):  
 427          return  self
._ download
_ xml
(  
 428              url
,  playlist_id
,  'Downloading legacy playlist XML' )  
 430      def  _extract_from_legacy_playlist ( self
,  playlist
,  playlist_id
):  
 431          no_items 
=  playlist
. find ( './{ %s }noItems'  %  self
._ EMP
_ PLAYLIST
_ NS
)  
 432          if  no_items 
is not None :  
 433              reason 
=  no_items
. get ( 'reason' )  
 434              if  reason 
==  'preAvailability' :  
 435                  msg 
=  'Episode  %s  is not yet available'  %  playlist_id
 
 436              elif  reason 
==  'postAvailability' :  
 437                  msg 
=  'Episode  %s  is no longer available'  %  playlist_id
 
 438              elif  reason 
==  'noMedia' :  
 439                  msg 
=  'Episode  %s  is not currently available'  %  playlist_id
 
 441                  msg 
=  'Episode  %s  is not available:  %s '  % ( playlist_id
,  reason
)  
 442              raise  ExtractorError ( msg
,  expected
= True )  
 444          for  item 
in  self
._ extract
_ items
( playlist
):  
 445              kind 
=  item
. get ( 'kind' )  
 446              if  kind 
!=  'programme'  and  kind 
!=  'radioProgramme' :  
 448              title 
=  playlist
. find ( './{ %s }title'  %  self
._ EMP
_ PLAYLIST
_ NS
). text
 
 449              description_el 
=  playlist
. find ( './{ %s }summary'  %  self
._ EMP
_ PLAYLIST
_ NS
)  
 450              description 
=  description_el
. text 
if  description_el 
is not None else None  
 452              def  get_programme_id ( item
):  
 453                  def  get_from_attributes ( item
):  
 454                      for  p 
in ( 'identifier' ,  'group' ):  
 456                          if  value 
and  re
. match ( r
'^[pb][\da-z] {7} $' ,  value
):  
 458                  get_from_attributes ( item
)  
 459                  mediator 
=  item
. find ( './{ %s }mediator'  %  self
._ EMP
_ PLAYLIST
_ NS
)  
 460                  if  mediator 
is not None :  
 461                      return  get_from_attributes ( mediator
)  
 463              programme_id 
=  get_programme_id ( item
)  
 464              duration 
=  int_or_none ( item
. get ( 'duration' ))  
 467                  formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  
 469                  formats
,  subtitles 
=  self
._ process
_ media
_ selector
( item
,  playlist_id
)  
 470                  programme_id 
=  playlist_id
 
 472          return  programme_id
,  title
,  description
,  duration
,  formats
,  subtitles
 
 474      def  _real_extract ( self
,  url
):  
 475          group_id 
=  self
._ match
_ id
( url
)  
 477          webpage 
=  self
._ download
_ webpage
( url
,  group_id
,  'Downloading video page' )  
 482          tviplayer 
=  self
._ search
_ regex
(  
 483              r
'mediator\.bind\(({.+?})\s*,\s*document\.getElementById' ,  
 484              webpage
,  'player' ,  default
= None )  
 487              player 
=  self
._ parse
_ json
( tviplayer
,  group_id
). get ( 'player' , {})  
 488              duration 
=  int_or_none ( player
. get ( 'duration' ))  
 489              programme_id 
=  player
. get ( 'vpid' )  
 492              programme_id 
=  self
._ search
_ regex
(  
 493                  r
'"vpid"\s*:\s*"( %s )"'  %  self
._ ID
_ REGEX
,  webpage
,  'vpid' ,  fatal
= False ,  default
= None )  
 496              formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  
 497              title 
=  self
._ og
_ search
_ title
( webpage
,  default
= None )  or  self
._ html
_ search
_ regex
(  
 498                  ( r
'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>' ,  
 499                   r
'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>' ),  webpage
,  'title' )  
 500              description 
=  self
._ search
_ regex
(  
 501                  ( r
'<p class="[^"]*medium-description[^"]*">([^<]+)</p>' ,  
 502                   r
'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>' ),  
 503                  webpage
,  'description' ,  default
= None )  
 505                  description 
=  self
._ html
_ search
_ meta
( 'description' ,  webpage
)  
 507              programme_id
,  title
,  description
,  duration
,  formats
,  subtitles 
=  self
._ download
_ playlist
( group_id
)  
 509          self
._ sort
_ formats
( formats
)  
 514              'description' :  description
,  
 515              'thumbnail' :  self
._ og
_ search
_ thumbnail
( webpage
,  default
= None ),  
 516              'duration' :  duration
,  
 518              'subtitles' :  subtitles
,  
 522  class  BBCIE ( BBCCoUkIE
):  
 525      _VALID_URL 
=  r
'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'  
 527      _MEDIASELECTOR_URLS 
= [  
 528          # Provides HQ HLS streams but fails with geolocation in some cases when it's  
 529          # even not geo restricted at all  
 530          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/ %s ' ,  
 531          # Provides more formats, namely direct mp4 links, but fails on some videos with  
 532          # notukerror for non UK (?) users (e.g.  
 533          # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)  
 534          'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ %s ' ,  
 535          # Provides fewer formats, but works everywhere for everybody (hopefully)  
 536          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/ %s ' ,  
 540          # article with multiple videos embedded with data-playable containing vpids  
 541          'url' :  'http://www.bbc.com/news/world-europe-32668511' ,  
 543              'id' :  'world-europe-32668511' ,  
 544              'title' :  'Russia stages massive WW2 parade despite Western boycott' ,  
 545              'description' :  'md5:00ff61976f6081841f759a08bf78cc9c' ,  
 549          # article with multiple videos embedded with data-playable (more videos)  
 550          'url' :  'http://www.bbc.com/news/business-28299555' ,  
 552              'id' :  'business-28299555' ,  
 553              'title' :  'Farnborough Airshow: Video highlights' ,  
 554              'description' :  'BBC reports and video highlights at the Farnborough Airshow.' ,  
 559          # article with multiple videos embedded with `new SMP()`  
 561          'url' :  'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460' ,  
 563              'id' :  '3662a707-0af9-3149-963f-47bea720b460' ,  
 564              'title' :  'BBC Blogs - Adam Curtis - BUGGER' ,  
 566          'playlist_count' :  18 ,  
 568          # single video embedded with data-playable containing vpid  
 569          'url' :  'http://www.bbc.com/news/world-europe-32041533' ,  
 573              'title' :  'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV' ,  
 574              'description' :  'md5:2868290467291b37feda7863f7a83f54' ,  
 576              'timestamp' :  1427219242 ,  
 577              'upload_date' :  '20150324' ,  
 581              'skip_download' :  True ,  
 584          # article with single video embedded with data-playable containing XML playlist  
 585          # with direct video links as progressiveDownloadUrl (for now these are extracted)  
 586          # and playlist with f4m and m3u8 as streamingUrl  
 587          'url' :  'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu' ,  
 589              'id' :  '150615_telabyad_kentin_cogu' ,  
 591              'title' :  "YPG: Tel Abyad'ın tamamı kontrolümüzde" ,  
 592              'timestamp' :  1434397334 ,  
 593              'upload_date' :  '20150615' ,  
 596              'skip_download' :  True ,  
 599          # single video embedded with data-playable containing XML playlists (regional section)  
 600          'url' :  'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw' ,  
 602              'id' :  '150619_video_honduras_militares_hospitales_corrupcion_aw' ,  
 604              'title' :  'Honduras militariza sus hospitales por nuevo escÔndalo de corrupción' ,  
 605              'timestamp' :  1434713142 ,  
 606              'upload_date' :  '20150619' ,  
 609              'skip_download' :  True ,  
 612          # single video from video playlist embedded with vxp-playlist-data JSON  
 613          'url' :  'http://www.bbc.com/news/video_and_audio/must_see/33376376' ,  
 617              'title' :  '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''' ,  
 619              'description' :  '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''' ,  
 622              'skip_download' :  True ,  
 625          # single video story with digitalData  
 626          'url' :  'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret' ,  
 630              'title' :  'Sri Lankaās spicy secret' ,  
 631              'description' :  'As a new train line to Jaffna opens up the countryās north, travellers can experience a truly distinct slice of Tamil culture.' ,  
 632              'timestamp' :  1437674293 ,  
 633              'upload_date' :  '20150723' ,  
 637              'skip_download' :  True ,  
 640          # single video story without digitalData  
 641          'url' :  'http://www.bbc.com/autos/story/20130513-hyundais-rock-star' ,  
 645              'title' :  'Hyundai Santa Fe Sport: Rock star' ,  
 646              'description' :  'md5:b042a26142c4154a6e472933cf20793d' ,  
 647              'timestamp' :  1415867444 ,  
 648              'upload_date' :  '20141113' ,  
 652              'skip_download' :  True ,  
 655          # single video with playlist.sxml URL in playlist param  
 656          'url' :  'http://www.bbc.com/sport/0/football/33653409' ,  
 660              'title' :  'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?' ,  
 661              'description' :  'BBC Sport \' s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.' ,  
 666              'skip_download' :  True ,  
 669          # article with multiple videos embedded with playlist.sxml in playlist param  
 670          'url' :  'http://www.bbc.com/sport/0/football/34475836' ,  
 673              'title' :  'What Liverpool can expect from Klopp' ,  
 677          # single video with playlist URL from weather section  
 678          'url' :  'http://www.bbc.com/weather/features/33601775' ,  
 679          'only_matching' :  True ,  
 681          # custom redirection to www.bbc.com  
 682          'url' :  'http://www.bbc.co.uk/news/science-environment-33661876' ,  
 683          'only_matching' :  True ,  
 687      def  suitable ( cls
,  url
):  
 688          return False if  BBCCoUkIE
. suitable ( url
)  or  BBCCoUkArticleIE
. suitable ( url
)  else  super ( BBCIE
,  cls
). suitable ( url
)  
 690      def  _extract_from_media_meta ( self
,  media_meta
,  video_id
):  
 691          # Direct links to media in media metadata (e.g.  
 692          # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)  
 693          # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml  
 694          source_files 
=  media_meta
. get ( 'sourceFiles' )  
 698                  'format_id' :  format_id
,  
 699                  'ext' :  f
. get ( 'encoding' ),  
 700                  'tbr' :  float_or_none ( f
. get ( 'bitrate' ),  1000 ),  
 701                  'filesize' :  int_or_none ( f
. get ( 'filesize' )),  
 702              }  for  format_id
,  f 
in  source_files
. items ()  if  f
. get ( 'url' )], []  
 704          programme_id 
=  media_meta
. get ( 'externalId' )  
 706              return  self
._ download
_ media
_ selector
( programme_id
)  
 708          # Process playlist.sxml as legacy playlist  
 709          href 
=  media_meta
. get ( 'href' )  
 711              playlist 
=  self
._ download
_l egacy
_ playlist
_u rl
( href
)  
 712              _
,  _
,  _
,  _
,  formats
,  subtitles 
=  self
._ extract
_ from
_l egacy
_ playlist
( playlist
,  video_id
)  
 713              return  formats
,  subtitles
 
 717      def  _extract_from_playlist_sxml ( self
,  url
,  playlist_id
,  timestamp
):  
 718          programme_id
,  title
,  description
,  duration
,  formats
,  subtitles 
=  \
 
 719              self
._ process
_l egacy
_ playlist
_u rl
( url
,  playlist_id
)  
 720          self
._ sort
_ formats
( formats
)  
 724              'description' :  description
,  
 725              'duration' :  duration
,  
 726              'timestamp' :  timestamp
,  
 728              'subtitles' :  subtitles
,  
 731      def  _real_extract ( self
,  url
):  
 732          playlist_id 
=  self
._ match
_ id
( url
)  
 734          webpage 
=  self
._ download
_ webpage
( url
,  playlist_id
)  
 736          json_ld_info 
=  self
._ search
_ json
_l d
( webpage
,  playlist_id
,  default
= None )  
 737          timestamp 
=  json_ld_info
. get ( 'timestamp' )  
 738          playlist_title 
=  json_ld_info
. get ( 'title' )  
 739          playlist_description 
=  json_ld_info
. get ( 'description' )  
 742              timestamp 
=  parse_iso8601 ( self
._ search
_ regex
(  
 743                  [ r
'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"' ,  
 744                   r
'itemprop="datePublished"[^>]+datetime="([^"]+)"' ,  
 745                   r
'"datePublished":\s*"([^"]+)' ],  
 746                  webpage
,  'date' ,  default
= None ))  
 750          # article with multiple videos embedded with playlist.sxml (e.g.  
 751          # http://www.bbc.com/sport/0/football/34475836)  
 752          playlists 
=  re
. findall ( r
'<param[^>]+name="playlist"[^>]+value="([^"]+)"' ,  webpage
)  
 753          playlists
. extend ( re
. findall ( r
'data-media-id="([^"]+/playlist\.sxml)"' ,  webpage
))  
 756                  self
._ extract
_ from
_ playlist
_ sxml
( playlist_url
,  playlist_id
,  timestamp
)  
 757                  for  playlist_url 
in  playlists
]  
 759          # news article with multiple videos embedded with data-playable  
 760          data_playables 
=  re
. findall ( r
'data-playable=(["\' ])({.+ ?
}) \
1 ', webpage)  
 762              for _, data_playable_json in data_playables:  
 763                  data_playable = self._parse_json(  
 764                      unescapeHTML(data_playable_json), playlist_id, fatal=False)  
 765                  if not data_playable:  
 767                  settings = data_playable.get(' settings
', {})  
 769                      # data-playable with video vpid in settings.playlistObject.items (e.g.  
 770                      # http://www.bbc.com/news/world-us-canada-34473351)  
 771                      playlist_object = settings.get(' playlistObject
', {})  
 773                          items = playlist_object.get(' items
')  
 774                          if items and isinstance(items, list):  
 775                              title = playlist_object[' title
']  
 776                              description = playlist_object.get(' summary
')  
 777                              duration = int_or_none(items[0].get(' duration
'))  
 778                              programme_id = items[0].get(' vpid
')  
 779                              formats, subtitles = self._download_media_selector(programme_id)  
 780                              self._sort_formats(formats)  
 784                                  ' description
': description,  
 785                                  ' timestamp
': timestamp,  
 786                                  ' duration
': duration,  
 788                                  ' subtitles
': subtitles,  
 791                          # data-playable without vpid but with a playlist.sxml URLs  
 792                          # in otherSettings.playlist (e.g.  
 793                          # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)  
 794                          playlist = data_playable.get(' otherSettings
', {}).get(' playlist
', {})  
 796                              entries.append(self._extract_from_playlist_sxml(  
 797                                  playlist.get(' progressiveDownloadUrl
'), playlist_id, timestamp))  
 800              playlist_title = playlist_title or remove_end(self._og_search_title(webpage), '  -  BBC News
')  
 801              playlist_description = playlist_description or self._og_search_description(webpage, default=None)  
 802              return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)  
 804          # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)  
 805          programme_id = self._search_regex(  
 806              [r' data
- video
- player
- vpid
= "( %s )" ' % self._ID_REGEX,  
 807               r' < param
[ ^
>]+ name
= "externalIdentifier" [ ^
>]+ value
= "( %s )" ' % self._ID_REGEX,  
 808               r' videoId\s
*: \s
*[ " \' ]( %s )[" \' ] ' % self._ID_REGEX],  
 809              webpage, ' vpid
', default=None)  
 812              formats, subtitles = self._download_media_selector(programme_id)  
 813              self._sort_formats(formats)  
 814              # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)  
 815              digital_data = self._parse_json(  
 817                      r' var\s
+ digitalData\s
*= \s
*({.+ ?
}); ?
\n ', webpage, ' digital data
', default=' {} '),  
 818                  programme_id, fatal=False)  
 819              page_info = digital_data.get(' page
', {}).get(' pageInfo
', {})  
 820              title = page_info.get(' pageName
') or self._og_search_title(webpage)  
 821              description = page_info.get(' description
') or self._og_search_description(webpage)  
 822              timestamp = parse_iso8601(page_info.get(' publicationDate
')) or timestamp  
 826                  ' description
': description,  
 827                  ' timestamp
': timestamp,  
 829                  ' subtitles
': subtitles,  
 832          playlist_title = self._html_search_regex(  
 833              r' < title
>(.* ?
)( ?
: \s
*- \s
* BBC 
[ ^ 
]+) ?
</ title
> ', webpage, ' playlist title
')  
 834          playlist_description = self._og_search_description(webpage, default=None)  
 836          def extract_all(pattern):  
 837              return list(filter(None, map(  
 838                  lambda s: self._parse_json(s, playlist_id, fatal=False),  
 839                  re.findall(pattern, webpage))))  
 841          # Multiple video article (e.g.  
 842          # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)  
 843          EMBED_URL = r' https?
://( ?
: www\
.) ?bbc\
. co\
. uk
/( ?
:[ ^
/]+/)+ %s( ?
: \b [ ^
"]+)?' % self._ID_REGEX  
 845          for match in extract_all(r'new\s+SMP\(({.+?})\)'):  
 846              embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')  
 847              if embed_url and re.match(EMBED_URL, embed_url):  
 848                  entries.append(embed_url)  
 849          entries.extend(re.findall(  
 850              r'setPlaylist\(" ( %s) "\)' % EMBED_URL, webpage))  
 852              return self.playlist_result(  
 853                  [self.url_result(entry, 'BBCCoUk') for entry in entries],  
 854                  playlist_id, playlist_title, playlist_description)  
 856          # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)  
 857          medias = extract_all(r" data
- media
- meta
= '({[^' ]+}) '")  
 860              # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)  
 861              media_asset = self._search_regex(  
 862                  r' mediaAssetPage\
. init\
( \s
*({.+ ?
}),  "/',  
 863                  webpage, 'media asset', default=None)  
 865                  media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)  
 867                  for video in media_asset_page.get('videos', {}).values():  
 868                      medias.extend(video.values())  
 871              # Multiple video playlist with single `now playing` entry (e.g.  
 872              # http://www.bbc.com/news/video_and_audio/must_see/33767813)  
 873              vxp_playlist = self._parse_json(  
 875                      r'<script[^>]+class=" vxp
- playlist
- data
"[^>]+type=" application
/ json
"[^>]*>([^<]+)</script>',  
 876                      webpage, 'playlist data'),  
 879              for item in vxp_playlist:  
 880                  media = item.get('media')  
 883                  playlist_medias.append(media)  
 884                  # Download single video if found media with asset id matching the video id from URL  
 885                  if item.get('advert', {}).get('assetId') == playlist_id:  
 888              # Fallback to the whole playlist  
 890                  medias = playlist_medias  
 893          for num, media_meta in enumerate(medias, start=1):  
 894              formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)  
 897              self._sort_formats(formats)  
 899              video_id = media_meta.get('externalId')  
 901                  video_id = playlist_id if len(medias) == 1 else ' %s-%s ' % (playlist_id, num)  
 903              title = media_meta.get('caption')  
 905                  title = playlist_title if len(medias) == 1 else ' %s  - Video  %s ' % (playlist_title, num)  
 907              duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))  
 910              for image in media_meta.get('images', {}).values():  
 911                  images.extend(image.values())  
 912              if 'image' in media_meta:  
 913                  images.append(media_meta['image'])  
 916                  'url': image.get('href'),  
 917                  'width': int_or_none(image.get('width')),  
 918                  'height': int_or_none(image.get('height')),  
 919              } for image in images]  
 924                  'thumbnails': thumbnails,  
 925                  'duration': duration,  
 926                  'timestamp': timestamp,  
 928                  'subtitles': subtitles,  
 931          return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)  
 934  class BBCCoUkArticleIE(InfoExtractor):  
 935      _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'  
 936      IE_NAME = 'bbc.co.uk:article'  
 937      IE_DESC = 'BBC articles'  
 940          'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',  
 942              'id': '3jNQLTMrPlYGTBn0WV6M2MS',  
 943              'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',  
 944              'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',  
 947          'add_ie': ['BBCCoUk'],  
 950      def _real_extract(self, url):  
 951          playlist_id = self._match_id(url)  
 953          webpage = self._download_webpage(url, playlist_id)  
 955          title = self._og_search_title(webpage)  
 956          description = self._og_search_description(webpage).strip()  
 958          entries = [self.url_result(programme_url) for programme_url in re.findall(  
 959              r'<div[^>]+typeof=" Clip
"[^>]+resource=" ([ ^
"]+)" ', webpage)]  
 961          return self.playlist_result(entries, playlist_id, title, description)