]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/bbc.py 
   2  from  __future__ 
import  unicode_literals
   6  from  . common 
import  InfoExtractor
  16  from  .. compat 
import  (   17      compat_etree_fromstring
,   22  class  BBCCoUkIE ( InfoExtractor
):   24      IE_DESC 
=  'BBC iPlayer'   25      _ID_REGEX 
=  r
'[pb][\da-z] {7} '   28                          (?:www\.)?bbc\.co\.uk/   30                              programmes/(?!articles/)|   31                              iplayer(?:/[^/]+)?/(?:episode/|playlist/)|   38      _MEDIASELECTOR_URLS 
= [   39          # Provides HQ HLS streams with even better quality that pc mediaset but fails   40          # with geolocation in some cases when it's even not geo restricted at all (e.g.   41          # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.   42          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/ %s ' ,   43          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ %s ' ,   46      _MEDIASELECTION_NS 
=  'http://bbc.co.uk/2008/mp/mediaselection'   47      _EMP_PLAYLIST_NS 
=  'http://bbc.co.uk/2008/emp/playlist'   56              'url' :  'http://www.bbc.co.uk/programmes/b039g8p7' ,   60                  'title' :  'Leonard Cohen, Kaleidoscope - BBC Radio 4' ,   61                  'description' :  'The Canadian poet and songwriter reflects on his musical career.' ,   65                  'skip_download' :  True ,   69              'url' :  'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/' ,   73                  'title' :  'The Man in Black: Series 3: The Printed Name' ,   74                  'description' :  "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey." ,   79                  'skip_download' :  True ,   81              'skip' :  'Episode is no longer available on BBC iPlayer Radio' ,   84              'url' :  'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/' ,   88                  'title' :  'The Voice UK: Series 3: Blind Auditions 5' ,   89                  'description' :  'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.' ,   94                  'skip_download' :  True ,   96              'skip' :  'Currently BBC iPlayer TV programmes are available to play in the UK only' ,   99              'url' :  'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion' ,  103                  'title' :  "Tomorrow's Worlds: The Unearthly History of Science Fiction" ,  104                  'description' :  '2. Invasion' ,  109                  'skip_download' :  True ,  111              'skip' :  'Currently BBC iPlayer TV programmes are available to play in the UK only' ,  113              'url' :  'http://www.bbc.co.uk/programmes/b04v20dw' ,  117                  'title' :  'Pete Tong, The Essential New Tune Special' ,  118                  'description' :  "Pete has a very special mix - all of 2014's Essential New Tunes!" ,  123                  'skip_download' :  True ,  125              'skip' :  'Episode is no longer available on BBC iPlayer Radio' ,  127              'url' :  'http://www.bbc.co.uk/music/clips/p022h44b' ,  132                  'title' :  'BBC Proms Music Guides, Rachmaninov: Symphonic Dances' ,  133                  'description' :  "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances." ,  138                  'skip_download' :  True ,  141              'url' :  'http://www.bbc.co.uk/music/clips/p025c0zz' ,  146                  'title' :  'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)' ,  147                  'description' :  'Rae Morris performs Closer for BBC Three at Reading 2014' ,  152                  'skip_download' :  True ,  155              'url' :  'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls' ,  159                  'title' :  'Natural World, 2015-2016: 2. Super Powered Owls' ,  160                  'description' :  'md5:e4db5c937d0e95a7c6b5e654d429183d' ,  165                  'skip_download' :  True ,  167              'skip' :  'geolocation' ,  169              'url' :  'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition' ,  173                  'description' :  'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.' ,  174                  'title' :  'Royal Academy Summer Exhibition' ,  179                  'skip_download' :  True ,  181              'skip' :  'geolocation' ,  183              # iptv-all mediaset fails with geolocation however there is no geo restriction  184              # for this programme at all  185              'url' :  'http://www.bbc.co.uk/programmes/b06rkn85' ,  189                  'title' :  "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1" ,  190                  'description' :  "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!" ,  194                  'skip_download' :  True ,  197              # compact player (https://github.com/rg3/youtube-dl/issues/8147)  198              'url' :  'http://www.bbc.co.uk/programmes/p028bfkf/player' ,  202                  'title' :  'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews' ,  203                  'description' :  'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews' ,  207                  'skip_download' :  True ,  210              'url' :  'http://www.bbc.co.uk/iplayer/playlist/p01dvks4' ,  211              'only_matching' :  True ,  213              'url' :  'http://www.bbc.co.uk/music/clips#p02frcc3' ,  214              'only_matching' :  True ,  216              'url' :  'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo' ,  217              'only_matching' :  True ,  219              'url' :  'http://www.bbc.co.uk/radio/player/p03cchwf' ,  220              'only_matching' :  True ,  224      class  MediaSelectionError ( Exception ):  225          def  __init__ ( self
,  id ):  228      def  _extract_asx_playlist ( self
,  connection
,  programme_id
):  229          asx 
=  self
._ download
_ xml
( connection
. get ( 'href' ),  programme_id
,  'Downloading ASX playlist' )  230          return  [ ref
. get ( 'href' )  for  ref 
in  asx
. findall ( './Entry/ref' )]  232      def  _extract_connection ( self
,  connection
,  programme_id
):  234          kind 
=  connection
. get ( 'kind' )  235          protocol 
=  connection
. get ( 'protocol' )  236          supplier 
=  connection
. get ( 'supplier' )  237          if  protocol 
==  'http' :  238              href 
=  connection
. get ( 'href' )  239              transfer_format 
=  connection
. get ( 'transferFormat' )  241              if  supplier 
==  'asx' :  242                  for  i
,  ref 
in  enumerate ( self
._ extract
_ asx
_ playlist
( connection
,  programme_id
)):  245                          'format_id' :  'ref %s _ %s '  % ( i
,  supplier
),  247              # Skip DASH until supported  248              elif  transfer_format 
==  'dash' :  250              elif  transfer_format 
==  'hls' :  251                  formats
. extend ( self
._ extract
_ m
3u8_ formats
(  252                      href
,  programme_id
,  ext
= 'mp4' ,  entry_protocol
= 'm3u8_native' ,  253                      m3u8_id
= supplier
,  fatal
= False ))  258                      'format_id' :  supplier 
or  kind 
or  protocol
,  260          elif  protocol 
==  'rtmp' :  261              application 
=  connection
. get ( 'application' ,  'ondemand' )  262              auth_string 
=  connection
. get ( 'authString' )  263              identifier 
=  connection
. get ( 'identifier' )  264              server 
=  connection
. get ( 'server' )  266                  'url' :  ' %s :// %s / %s ? %s '  % ( protocol
,  server
,  application
,  auth_string
),  267                  'play_path' :  identifier
,  268                  'app' :  ' %s ? %s '  % ( application
,  auth_string
),  269                  'page_url' :  'http://www.bbc.co.uk' ,  270                  'player_url' :  'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf' ,  273                  'format_id' :  supplier
,  277      def  _extract_items ( self
,  playlist
):  278          return  playlist
. findall ( './{ %s }item'  %  self
._ EMP
_ PLAYLIST
_ NS
)  280      def  _findall_ns ( self
,  element
,  xpath
):  282          for  ns 
in  self
._ NAMESPACES
:  283              elements
. extend ( element
. findall ( xpath 
%  ns
))  286      def  _extract_medias ( self
,  media_selection
):  287          error 
=  media_selection
. find ( './{ %s }error'  %  self
._ MEDIASELECTION
_ NS
)  289              media_selection
. find ( './{ %s }error'  %  self
._ EMP
_ PLAYLIST
_ NS
)  290          if  error 
is not None :  291              raise  BBCCoUkIE
. MediaSelectionError ( error
. get ( 'id' ))  292          return  self
._ findall
_ ns
( media_selection
,  './{ %s }media' )  294      def  _extract_connections ( self
,  media
):  295          return  self
._ findall
_ ns
( media
,  './{ %s }connection' )  297      def  _extract_video ( self
,  media
,  programme_id
):  299          vbr 
=  int_or_none ( media
. get ( 'bitrate' ))  300          vcodec 
=  media
. get ( 'encoding' )  301          service 
=  media
. get ( 'service' )  302          width 
=  int_or_none ( media
. get ( 'width' ))  303          height 
=  int_or_none ( media
. get ( 'height' ))  304          file_size 
=  int_or_none ( media
. get ( 'media_file_size' ))  305          for  connection 
in  self
._ extract
_ connections
( media
):  306              conn_formats 
=  self
._ extract
_ connection
( connection
,  programme_id
)  307              for  format 
in  conn_formats
:  313                      'filesize' :  file_size
,  316                      format
[ 'format_id' ] =  ' %s _ %s '  % ( service
,  format
[ 'format_id' ])  317              formats
. extend ( conn_formats
)  320      def  _extract_audio ( self
,  media
,  programme_id
):  322          abr 
=  int_or_none ( media
. get ( 'bitrate' ))  323          acodec 
=  media
. get ( 'encoding' )  324          service 
=  media
. get ( 'service' )  325          for  connection 
in  self
._ extract
_ connections
( media
):  326              conn_formats 
=  self
._ extract
_ connection
( connection
,  programme_id
)  327              for  format 
in  conn_formats
:  329                      'format_id' :  ' %s _ %s '  % ( service
,  format
[ 'format_id' ]),  333              formats
. extend ( conn_formats
)  336      def  _get_subtitles ( self
,  media
,  programme_id
):  338          for  connection 
in  self
._ extract
_ connections
( media
):  339              captions 
=  self
._ download
_ xml
( connection
. get ( 'href' ),  programme_id
,  'Downloading captions' )  340              lang 
=  captions
. get ( '{http://www.w3.org/XML/1998/namespace}lang' ,  'en' )  343                      'url' :  connection
. get ( 'href' ),  349      def  _raise_extractor_error ( self
,  media_selection_error
):  350          raise  ExtractorError (  351              ' %s  returned error:  %s '  % ( self
. IE_NAME
,  media_selection_error
. id ),  354      def  _download_media_selector ( self
,  programme_id
):  355          last_exception 
=  None  356          for  mediaselector_url 
in  self
._ MEDIASELECTOR
_U RLS
:  358                  return  self
._ download
_ media
_ selector
_u rl
(  359                      mediaselector_url 
%  programme_id
,  programme_id
)  360              except  BBCCoUkIE
. MediaSelectionError 
as  e
:  361                  if  e
. id  in  ( 'notukerror' ,  'geolocation' ,  'selectionunavailable' ):  364                  self
._ raise
_ extractor
_ error
( e
)  365          self
._ raise
_ extractor
_ error
( last_exception
)  367      def  _download_media_selector_url ( self
,  url
,  programme_id
= None ):  369              media_selection 
=  self
._ download
_ xml
(  370                  url
,  programme_id
,  'Downloading media selection XML' )  371          except  ExtractorError 
as  ee
:  372              if  isinstance ( ee
. cause
,  compat_HTTPError
)  and  ee
. cause
. code 
in  ( 403 ,  404 ):  373                  media_selection 
=  compat_etree_fromstring ( ee
. cause
. read (). decode ( 'utf-8' ))  376          return  self
._ process
_ media
_ selector
( media_selection
,  programme_id
)  378      def  _process_media_selector ( self
,  media_selection
,  programme_id
):  382          for  media 
in  self
._ extract
_ medias
( media_selection
):  383              kind 
=  media
. get ( 'kind' )  385                  formats
. extend ( self
._ extract
_ audio
( media
,  programme_id
))  386              elif  kind 
==  'video' :  387                  formats
. extend ( self
._ extract
_ video
( media
,  programme_id
))  388              elif  kind 
==  'captions' :  389                  subtitles 
=  self
. extract_subtitles ( media
,  programme_id
)  390          return  formats
,  subtitles
 392      def  _download_playlist ( self
,  playlist_id
):  394              playlist 
=  self
._ download
_ json
(  395                  'http://www.bbc.co.uk/programmes/ %s /playlist.json'  %  playlist_id
,  396                  playlist_id
,  'Downloading playlist JSON' )  398              version 
=  playlist
. get ( 'defaultAvailableVersion' )  400                  smp_config 
=  version
[ 'smpConfig' ]  401                  title 
=  smp_config
[ 'title' ]  402                  description 
=  smp_config
[ 'summary' ]  403                  for  item 
in  smp_config
[ 'items' ]:  405                      if  kind 
!=  'programme'  and  kind 
!=  'radioProgramme' :  407                      programme_id 
=  item
. get ( 'vpid' )  408                      duration 
=  int_or_none ( item
. get ( 'duration' ))  409                      formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  410                  return  programme_id
,  title
,  description
,  duration
,  formats
,  subtitles
 411          except  ExtractorError 
as  ee
:  412              if not  ( isinstance ( ee
. cause
,  compat_HTTPError
)  and  ee
. cause
. code 
==  404 ):  415          # fallback to legacy playlist  416          return  self
._ process
_l egacy
_ playlist
( playlist_id
)  418      def  _process_legacy_playlist_url ( self
,  url
,  display_id
):  419          playlist 
=  self
._ download
_l egacy
_ playlist
_u rl
( url
,  display_id
)  420          return  self
._ extract
_ from
_l egacy
_ playlist
( playlist
,  display_id
)  422      def  _process_legacy_playlist ( self
,  playlist_id
):  423          return  self
._ process
_l egacy
_ playlist
_u rl
(  424              'http://www.bbc.co.uk/iplayer/playlist/ %s '  %  playlist_id
,  playlist_id
)  426      def  _download_legacy_playlist_url ( self
,  url
,  playlist_id
= None ):  427          return  self
._ download
_ xml
(  428              url
,  playlist_id
,  'Downloading legacy playlist XML' )  430      def  _extract_from_legacy_playlist ( self
,  playlist
,  playlist_id
):  431          no_items 
=  playlist
. find ( './{ %s }noItems'  %  self
._ EMP
_ PLAYLIST
_ NS
)  432          if  no_items 
is not None :  433              reason 
=  no_items
. get ( 'reason' )  434              if  reason 
==  'preAvailability' :  435                  msg 
=  'Episode  %s  is not yet available'  %  playlist_id
 436              elif  reason 
==  'postAvailability' :  437                  msg 
=  'Episode  %s  is no longer available'  %  playlist_id
 438              elif  reason 
==  'noMedia' :  439                  msg 
=  'Episode  %s  is not currently available'  %  playlist_id
 441                  msg 
=  'Episode  %s  is not available:  %s '  % ( playlist_id
,  reason
)  442              raise  ExtractorError ( msg
,  expected
= True )  444          for  item 
in  self
._ extract
_ items
( playlist
):  445              kind 
=  item
. get ( 'kind' )  446              if  kind 
!=  'programme'  and  kind 
!=  'radioProgramme' :  448              title 
=  playlist
. find ( './{ %s }title'  %  self
._ EMP
_ PLAYLIST
_ NS
). text
 449              description_el 
=  playlist
. find ( './{ %s }summary'  %  self
._ EMP
_ PLAYLIST
_ NS
)  450              description 
=  description_el
. text 
if  description_el 
is not None else None  452              def  get_programme_id ( item
):  453                  def  get_from_attributes ( item
):  454                      for  p 
in ( 'identifier' ,  'group' ):  456                          if  value 
and  re
. match ( r
'^[pb][\da-z] {7} $' ,  value
):  458                  get_from_attributes ( item
)  459                  mediator 
=  item
. find ( './{ %s }mediator'  %  self
._ EMP
_ PLAYLIST
_ NS
)  460                  if  mediator 
is not None :  461                      return  get_from_attributes ( mediator
)  463              programme_id 
=  get_programme_id ( item
)  464              duration 
=  int_or_none ( item
. get ( 'duration' ))  467                  formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  469                  formats
,  subtitles 
=  self
._ process
_ media
_ selector
( item
,  playlist_id
)  470                  programme_id 
=  playlist_id
 472          return  programme_id
,  title
,  description
,  duration
,  formats
,  subtitles
 474      def  _real_extract ( self
,  url
):  475          group_id 
=  self
._ match
_ id
( url
)  477          webpage 
=  self
._ download
_ webpage
( url
,  group_id
,  'Downloading video page' )  482          tviplayer 
=  self
._ search
_ regex
(  483              r
'mediator\.bind\(({.+?})\s*,\s*document\.getElementById' ,  484              webpage
,  'player' ,  default
= None )  487              player 
=  self
._ parse
_ json
( tviplayer
,  group_id
). get ( 'player' , {})  488              duration 
=  int_or_none ( player
. get ( 'duration' ))  489              programme_id 
=  player
. get ( 'vpid' )  492              programme_id 
=  self
._ search
_ regex
(  493                  r
'"vpid"\s*:\s*"( %s )"'  %  self
._ ID
_ REGEX
,  webpage
,  'vpid' ,  fatal
= False ,  default
= None )  496              formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  497              title 
=  self
._ og
_ search
_ title
( webpage
,  default
= None )  or  self
._ html
_ search
_ regex
(  498                  ( r
'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>' ,  499                   r
'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>' ),  webpage
,  'title' )  500              description 
=  self
._ search
_ regex
(  501                  ( r
'<p class="[^"]*medium-description[^"]*">([^<]+)</p>' ,  502                   r
'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>' ),  503                  webpage
,  'description' ,  default
= None )  505                  description 
=  self
._ html
_ search
_ meta
( 'description' ,  webpage
)  507              programme_id
,  title
,  description
,  duration
,  formats
,  subtitles 
=  self
._ download
_ playlist
( group_id
)  509          self
._ sort
_ formats
( formats
)  514              'description' :  description
,  515              'thumbnail' :  self
._ og
_ search
_ thumbnail
( webpage
,  default
= None ),  516              'duration' :  duration
,  518              'subtitles' :  subtitles
,  522  class  BBCIE ( BBCCoUkIE
):  525      _VALID_URL 
=  r
'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'  527      _MEDIASELECTOR_URLS 
= [  528          # Provides HQ HLS streams but fails with geolocation in some cases when it's  529          # even not geo restricted at all  530          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/ %s ' ,  531          # Provides more formats, namely direct mp4 links, but fails on some videos with  532          # notukerror for non UK (?) users (e.g.  533          # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)  534          'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ %s ' ,  535          # Provides fewer formats, but works everywhere for everybody (hopefully)  536          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/ %s ' ,  540          # article with multiple videos embedded with data-playable containing vpids  541          'url' :  'http://www.bbc.com/news/world-europe-32668511' ,  543              'id' :  'world-europe-32668511' ,  544              'title' :  'Russia stages massive WW2 parade despite Western boycott' ,  545              'description' :  'md5:00ff61976f6081841f759a08bf78cc9c' ,  549          # article with multiple videos embedded with data-playable (more videos)  550          'url' :  'http://www.bbc.com/news/business-28299555' ,  552              'id' :  'business-28299555' ,  553              'title' :  'Farnborough Airshow: Video highlights' ,  554              'description' :  'BBC reports and video highlights at the Farnborough Airshow.' ,  559          # article with multiple videos embedded with `new SMP()`  561          'url' :  'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460' ,  563              'id' :  '3662a707-0af9-3149-963f-47bea720b460' ,  564              'title' :  'BBC Blogs - Adam Curtis - BUGGER' ,  566          'playlist_count' :  18 ,  568          # single video embedded with data-playable containing vpid  569          'url' :  'http://www.bbc.com/news/world-europe-32041533' ,  573              'title' :  'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV' ,  574              'description' :  'md5:2868290467291b37feda7863f7a83f54' ,  576              'timestamp' :  1427219242 ,  577              'upload_date' :  '20150324' ,  581              'skip_download' :  True ,  584          # article with single video embedded with data-playable containing XML playlist  585          # with direct video links as progressiveDownloadUrl (for now these are extracted)  586          # and playlist with f4m and m3u8 as streamingUrl  587          'url' :  'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu' ,  589              'id' :  '150615_telabyad_kentin_cogu' ,  591              'title' :  "YPG: Tel Abyad'ın tamamı kontrolümüzde" ,  592              'timestamp' :  1434397334 ,  593              'upload_date' :  '20150615' ,  596              'skip_download' :  True ,  599          # single video embedded with data-playable containing XML playlists (regional section)  600          'url' :  'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw' ,  602              'id' :  '150619_video_honduras_militares_hospitales_corrupcion_aw' ,  604              'title' :  'Honduras militariza sus hospitales por nuevo escĆ”ndalo de corrupción' ,  605              'timestamp' :  1434713142 ,  606              'upload_date' :  '20150619' ,  609              'skip_download' :  True ,  612          # single video from video playlist embedded with vxp-playlist-data JSON  613          'url' :  'http://www.bbc.com/news/video_and_audio/must_see/33376376' ,  617              'title' :  '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''' ,  619              'description' :  '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''' ,  622              'skip_download' :  True ,  625          # single video story with digitalData  626          'url' :  'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret' ,  630              'title' :  'Sri Lankaās spicy secret' ,  631              'description' :  'As a new train line to Jaffna opens up the countryās north, travellers can experience a truly distinct slice of Tamil culture.' ,  632              'timestamp' :  1437674293 ,  633              'upload_date' :  '20150723' ,  637              'skip_download' :  True ,  640          # single video story without digitalData  641          'url' :  'http://www.bbc.com/autos/story/20130513-hyundais-rock-star' ,  645              'title' :  'Hyundai Santa Fe Sport: Rock star' ,  646              'description' :  'md5:b042a26142c4154a6e472933cf20793d' ,  647              'timestamp' :  1415867444 ,  648              'upload_date' :  '20141113' ,  652              'skip_download' :  True ,  655          # single video with playlist.sxml URL in playlist param  656          'url' :  'http://www.bbc.com/sport/0/football/33653409' ,  660              'title' :  'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?' ,  661              'description' :  'BBC Sport \' s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.' ,  666              'skip_download' :  True ,  669          # article with multiple videos embedded with playlist.sxml in playlist param  670          'url' :  'http://www.bbc.com/sport/0/football/34475836' ,  673              'title' :  'What Liverpool can expect from Klopp' ,  677          # single video with playlist URL from weather section  678          'url' :  'http://www.bbc.com/weather/features/33601775' ,  679          'only_matching' :  True ,  681          # custom redirection to www.bbc.com  682          'url' :  'http://www.bbc.co.uk/news/science-environment-33661876' ,  683          'only_matching' :  True ,  687      def  suitable ( cls
,  url
):  688          return False if  BBCCoUkIE
. suitable ( url
)  or  BBCCoUkArticleIE
. suitable ( url
)  else  super ( BBCIE
,  cls
). suitable ( url
)  690      def  _extract_from_media_meta ( self
,  media_meta
,  video_id
):  691          # Direct links to media in media metadata (e.g.  692          # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)  693          # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml  694          source_files 
=  media_meta
. get ( 'sourceFiles' )  698                  'format_id' :  format_id
,  699                  'ext' :  f
. get ( 'encoding' ),  700                  'tbr' :  float_or_none ( f
. get ( 'bitrate' ),  1000 ),  701                  'filesize' :  int_or_none ( f
. get ( 'filesize' )),  702              }  for  format_id
,  f 
in  source_files
. items ()  if  f
. get ( 'url' )], []  704          programme_id 
=  media_meta
. get ( 'externalId' )  706              return  self
._ download
_ media
_ selector
( programme_id
)  708          # Process playlist.sxml as legacy playlist  709          href 
=  media_meta
. get ( 'href' )  711              playlist 
=  self
._ download
_l egacy
_ playlist
_u rl
( href
)  712              _
,  _
,  _
,  _
,  formats
,  subtitles 
=  self
._ extract
_ from
_l egacy
_ playlist
( playlist
,  video_id
)  713              return  formats
,  subtitles
 717      def  _extract_from_playlist_sxml ( self
,  url
,  playlist_id
,  timestamp
):  718          programme_id
,  title
,  description
,  duration
,  formats
,  subtitles 
=  \
 719              self
._ process
_l egacy
_ playlist
_u rl
( url
,  playlist_id
)  720          self
._ sort
_ formats
( formats
)  724              'description' :  description
,  725              'duration' :  duration
,  726              'timestamp' :  timestamp
,  728              'subtitles' :  subtitles
,  731      def  _real_extract ( self
,  url
):  732          playlist_id 
=  self
._ match
_ id
( url
)  734          webpage 
=  self
._ download
_ webpage
( url
,  playlist_id
)  736          json_ld_info 
=  self
._ search
_ json
_l d
( webpage
,  playlist_id
,  default
= None )  737          timestamp 
=  json_ld_info
. get ( 'timestamp' )  738          playlist_title 
=  json_ld_info
. get ( 'title' )  739          playlist_description 
=  json_ld_info
. get ( 'description' )  742              timestamp 
=  parse_iso8601 ( self
._ search
_ regex
(  743                  [ r
'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"' ,  744                   r
'itemprop="datePublished"[^>]+datetime="([^"]+)"' ,  745                   r
'"datePublished":\s*"([^"]+)' ],  746                  webpage
,  'date' ,  default
= None ))  750          # article with multiple videos embedded with playlist.sxml (e.g.  751          # http://www.bbc.com/sport/0/football/34475836)  752          playlists 
=  re
. findall ( r
'<param[^>]+name="playlist"[^>]+value="([^"]+)"' ,  webpage
)  753          playlists
. extend ( re
. findall ( r
'data-media-id="([^"]+/playlist\.sxml)"' ,  webpage
))  756                  self
._ extract
_ from
_ playlist
_ sxml
( playlist_url
,  playlist_id
,  timestamp
)  757                  for  playlist_url 
in  playlists
]  759          # news article with multiple videos embedded with data-playable  760          data_playables 
=  re
. findall ( r
'data-playable=(["\' ])({.+ ?
}) \
1 ', webpage)  762              for _, data_playable_json in data_playables:  763                  data_playable = self._parse_json(  764                      unescapeHTML(data_playable_json), playlist_id, fatal=False)  765                  if not data_playable:  767                  settings = data_playable.get(' settings
', {})  769                      # data-playable with video vpid in settings.playlistObject.items (e.g.  770                      # http://www.bbc.com/news/world-us-canada-34473351)  771                      playlist_object = settings.get(' playlistObject
', {})  773                          items = playlist_object.get(' items
')  774                          if items and isinstance(items, list):  775                              title = playlist_object[' title
']  776                              description = playlist_object.get(' summary
')  777                              duration = int_or_none(items[0].get(' duration
'))  778                              programme_id = items[0].get(' vpid
')  779                              formats, subtitles = self._download_media_selector(programme_id)  780                              self._sort_formats(formats)  784                                  ' description
': description,  785                                  ' timestamp
': timestamp,  786                                  ' duration
': duration,  788                                  ' subtitles
': subtitles,  791                          # data-playable without vpid but with a playlist.sxml URLs  792                          # in otherSettings.playlist (e.g.  793                          # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)  794                          playlist = data_playable.get(' otherSettings
', {}).get(' playlist
', {})  796                              entries.append(self._extract_from_playlist_sxml(  797                                  playlist.get(' progressiveDownloadUrl
'), playlist_id, timestamp))  800              playlist_title = playlist_title or remove_end(self._og_search_title(webpage), '  -  BBC News
')  801              playlist_description = playlist_description or self._og_search_description(webpage, default=None)  802              return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)  804          # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)  805          programme_id = self._search_regex(  806              [r' data
- video
- player
- vpid
= "( %s )" ' % self._ID_REGEX,  807               r' < param
[ ^
>]+ name
= "externalIdentifier" [ ^
>]+ value
= "( %s )" ' % self._ID_REGEX,  808               r' videoId\s
*: \s
*[ " \' ]( %s )[" \' ] ' % self._ID_REGEX],  809              webpage, ' vpid
', default=None)  812              formats, subtitles = self._download_media_selector(programme_id)  813              self._sort_formats(formats)  814              # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)  815              digital_data = self._parse_json(  817                      r' var\s
+ digitalData\s
*= \s
*({.+ ?
}); ?
\n ', webpage, ' digital data
', default=' {} '),  818                  programme_id, fatal=False)  819              page_info = digital_data.get(' page
', {}).get(' pageInfo
', {})  820              title = page_info.get(' pageName
') or self._og_search_title(webpage)  821              description = page_info.get(' description
') or self._og_search_description(webpage)  822              timestamp = parse_iso8601(page_info.get(' publicationDate
')) or timestamp  826                  ' description
': description,  827                  ' timestamp
': timestamp,  829                  ' subtitles
': subtitles,  832          playlist_title = self._html_search_regex(  833              r' < title
>(.* ?
)( ?
: \s
*- \s
* BBC 
[ ^ 
]+) ?
</ title
> ', webpage, ' playlist title
')  834          playlist_description = self._og_search_description(webpage, default=None)  836          def extract_all(pattern):  837              return list(filter(None, map(  838                  lambda s: self._parse_json(s, playlist_id, fatal=False),  839                  re.findall(pattern, webpage))))  841          # Multiple video article (e.g.  842          # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)  843          EMBED_URL = r' https?
://( ?
: www\
.) ?bbc\
. co\
. uk
/( ?
:[ ^
/]+/)+ %s( ?
: \b [ ^
"]+)?' % self._ID_REGEX  845          for match in extract_all(r'new\s+SMP\(({.+?})\)'):  846              embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')  847              if embed_url and re.match(EMBED_URL, embed_url):  848                  entries.append(embed_url)  849          entries.extend(re.findall(  850              r'setPlaylist\(" ( %s) "\)' % EMBED_URL, webpage))  852              return self.playlist_result(  853                  [self.url_result(entry, 'BBCCoUk') for entry in entries],  854                  playlist_id, playlist_title, playlist_description)  856          # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)  857          medias = extract_all(r" data
- media
- meta
= '({[^' ]+}) '")  860              # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)  861              media_asset = self._search_regex(  862                  r' mediaAssetPage\
. init\
( \s
*({.+ ?
}),  "/',  863                  webpage, 'media asset', default=None)  865                  media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)  867                  for video in media_asset_page.get('videos', {}).values():  868                      medias.extend(video.values())  871              # Multiple video playlist with single `now playing` entry (e.g.  872              # http://www.bbc.com/news/video_and_audio/must_see/33767813)  873              vxp_playlist = self._parse_json(  875                      r'<script[^>]+class=" vxp
- playlist
- data
"[^>]+type=" application
/ json
"[^>]*>([^<]+)</script>',  876                      webpage, 'playlist data'),  879              for item in vxp_playlist:  880                  media = item.get('media')  883                  playlist_medias.append(media)  884                  # Download single video if found media with asset id matching the video id from URL  885                  if item.get('advert', {}).get('assetId') == playlist_id:  888              # Fallback to the whole playlist  890                  medias = playlist_medias  893          for num, media_meta in enumerate(medias, start=1):  894              formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)  897              self._sort_formats(formats)  899              video_id = media_meta.get('externalId')  901                  video_id = playlist_id if len(medias) == 1 else ' %s-%s ' % (playlist_id, num)  903              title = media_meta.get('caption')  905                  title = playlist_title if len(medias) == 1 else ' %s  - Video  %s ' % (playlist_title, num)  907              duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))  910              for image in media_meta.get('images', {}).values():  911                  images.extend(image.values())  912              if 'image' in media_meta:  913                  images.append(media_meta['image'])  916                  'url': image.get('href'),  917                  'width': int_or_none(image.get('width')),  918                  'height': int_or_none(image.get('height')),  919              } for image in images]  924                  'thumbnails': thumbnails,  925                  'duration': duration,  926                  'timestamp': timestamp,  928                  'subtitles': subtitles,  931          return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)  934  class BBCCoUkArticleIE(InfoExtractor):  935      _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'  936      IE_NAME = 'bbc.co.uk:article'  937      IE_DESC = 'BBC articles'  940          'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',  942              'id': '3jNQLTMrPlYGTBn0WV6M2MS',  943              'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',  944              'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',  947          'add_ie': ['BBCCoUk'],  950      def _real_extract(self, url):  951          playlist_id = self._match_id(url)  953          webpage = self._download_webpage(url, playlist_id)  955          title = self._og_search_title(webpage)  956          description = self._og_search_description(webpage).strip()  958          entries = [self.url_result(programme_url) for programme_url in re.findall(  959              r'<div[^>]+typeof=" Clip
"[^>]+resource=" ([ ^
"]+)" ', webpage)]  961          return self.playlist_result(entries, playlist_id, title, description)