]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/bbccouk.py
 
 
 
 
 
 
 
 
   1  from  __future__ 
import  unicode_literals
 
   3  import  xml
. etree
. ElementTree
 
   5  from  . common 
import  InfoExtractor
 
  10  from  .. compat 
import  compat_HTTPError
 
  13  class  BBCCoUkIE ( InfoExtractor
):  
  15      IE_DESC 
=  'BBC iPlayer'  
  16      _VALID_URL 
=  r
'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z] {8} )'  
  20              'url' :  'http://www.bbc.co.uk/programmes/b039g8p7' ,  
  24                  'title' :  'Kaleidoscope, Leonard Cohen' ,  
  25                  'description' :  'The Canadian poet and songwriter reflects on his musical career.' ,  
  30                  'skip_download' :  True ,  
  34              'url' :  'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/' ,  
  38                  'title' :  'The Man in Black: Series 3: The Printed Name' ,  
  39                  'description' :  "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey." ,  
  44                  'skip_download' :  True ,  
  46              'skip' :  'Episode is no longer available on BBC iPlayer Radio' ,  
  49              'url' :  'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/' ,  
  53                  'title' :  'The Voice UK: Series 3: Blind Auditions 5' ,  
  54                  'description' :  "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone." ,  
  59                  'skip_download' :  True ,  
  61              'skip' :  'Currently BBC iPlayer TV programmes are available to play in the UK only' ,  
  64              'url' :  'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion' ,  
  68                  'title' :  "Tomorrow's Worlds: The Unearthly History of Science Fiction" ,  
  69                  'description' :  '2. Invasion' ,  
  74                  'skip_download' :  True ,  
  76              'skip' :  'Currently BBC iPlayer TV programmes are available to play in the UK only' ,  
  78              'url' :  'http://www.bbc.co.uk/programmes/b04v20dw' ,  
  82                  'title' :  'Pete Tong, The Essential New Tune Special' ,  
  83                  'description' :  "Pete has a very special mix - all of 2014's Essential New Tunes!" ,  
  88                  'skip_download' :  True ,  
  91              'url' :  'http://www.bbc.co.uk/music/clips/p02frcc3' ,  
  96                  'title' :  'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix' ,  
  97                  'description' :  'French house superstar Madeon takes us out of the club and onto the after party.' ,  
 102                  'skip_download' :  True ,  
 105              'url' :  'http://www.bbc.co.uk/music/clips/p025c0zz' ,  
 110                  'title' :  'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)' ,  
 111                  'description' :  'Rae Morris performs Closer for BBC Three at Reading 2014' ,  
 116                  'skip_download' :  True ,  
 119              'url' :  'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls' ,  
 123                  'title' :  'Natural World, 2015-2016: 2. Super Powered Owls' ,  
 124                  'description' :  'md5:e4db5c937d0e95a7c6b5e654d429183d' ,  
 129                  'skip_download' :  True ,  
 131              'skip' :  'geolocation' ,  
 133              'url' :  'http://www.bbc.co.uk/iplayer/playlist/p01dvks4' ,  
 134              'only_matching' :  True ,  
 136              'url' :  'http://www.bbc.co.uk/music/clips#p02frcc3' ,  
 137              'only_matching' :  True ,  
 139              'url' :  'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo' ,  
 140              'only_matching' :  True ,  
 144      def  _extract_asx_playlist ( self
,  connection
,  programme_id
):  
 145          asx 
=  self
._ download
_ xml
( connection
. get ( 'href' ),  programme_id
,  'Downloading ASX playlist' )  
 146          return  [ ref
. get ( 'href' )  for  ref 
in  asx
. findall ( './Entry/ref' )]  
 148      def  _extract_connection ( self
,  connection
,  programme_id
):  
 150          protocol 
=  connection
. get ( 'protocol' )  
 151          supplier 
=  connection
. get ( 'supplier' )  
 152          if  protocol 
==  'http' :  
 153              href 
=  connection
. get ( 'href' )  
 155              if  supplier 
==  'asx' :  
 156                  for  i
,  ref 
in  enumerate ( self
._ extract
_ asx
_ playlist
( connection
,  programme_id
)):  
 159                          'format_id' :  'ref %s _ %s '  % ( i
,  supplier
),  
 165                      'format_id' :  supplier
,  
 167          elif  protocol 
==  'rtmp' :  
 168              application 
=  connection
. get ( 'application' ,  'ondemand' )  
 169              auth_string 
=  connection
. get ( 'authString' )  
 170              identifier 
=  connection
. get ( 'identifier' )  
 171              server 
=  connection
. get ( 'server' )  
 173                  'url' :  ' %s :// %s / %s ? %s '  % ( protocol
,  server
,  application
,  auth_string
),  
 174                  'play_path' :  identifier
,  
 175                  'app' :  ' %s ? %s '  % ( application
,  auth_string
),  
 176                  'page_url' :  'http://www.bbc.co.uk' ,  
 177                  'player_url' :  'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf' ,  
 180                  'format_id' :  supplier
,  
 184      def  _extract_items ( self
,  playlist
):  
 185          return  playlist
. findall ( './{http://bbc.co.uk/2008/emp/playlist}item' )  
 187      def  _extract_medias ( self
,  media_selection
):  
 188          error 
=  media_selection
. find ( './{http://bbc.co.uk/2008/mp/mediaselection}error' )  
 189          if  error 
is not None :  
 190              raise  ExtractorError (  
 191                  ' %s  returned error:  %s '  % ( self
. IE_NAME
,  error
. get ( 'id' )),  expected
= True )  
 192          return  media_selection
. findall ( './{http://bbc.co.uk/2008/mp/mediaselection}media' )  
 194      def  _extract_connections ( self
,  media
):  
 195          return  media
. findall ( './{http://bbc.co.uk/2008/mp/mediaselection}connection' )  
 197      def  _extract_video ( self
,  media
,  programme_id
):  
 199          vbr 
=  int ( media
. get ( 'bitrate' ))  
 200          vcodec 
=  media
. get ( 'encoding' )  
 201          service 
=  media
. get ( 'service' )  
 202          width 
=  int ( media
. get ( 'width' ))  
 203          height 
=  int ( media
. get ( 'height' ))  
 204          file_size 
=  int ( media
. get ( 'media_file_size' ))  
 205          for  connection 
in  self
._ extract
_ connections
( media
):  
 206              conn_formats 
=  self
._ extract
_ connection
( connection
,  programme_id
)  
 207              for  format 
in  conn_formats
:  
 209                      'format_id' :  ' %s _ %s '  % ( service
,  format
[ 'format_id' ]),  
 214                      'filesize' :  file_size
,  
 216              formats
. extend ( conn_formats
)  
 219      def  _extract_audio ( self
,  media
,  programme_id
):  
 221          abr 
=  int ( media
. get ( 'bitrate' ))  
 222          acodec 
=  media
. get ( 'encoding' )  
 223          service 
=  media
. get ( 'service' )  
 224          for  connection 
in  self
._ extract
_ connections
( media
):  
 225              conn_formats 
=  self
._ extract
_ connection
( connection
,  programme_id
)  
 226              for  format 
in  conn_formats
:  
 228                      'format_id' :  ' %s _ %s '  % ( service
,  format
[ 'format_id' ]),  
 232              formats
. extend ( conn_formats
)  
 235      def  _get_subtitles ( self
,  media
,  programme_id
):  
 237          for  connection 
in  self
._ extract
_ connections
( media
):  
 238              captions 
=  self
._ download
_ xml
( connection
. get ( 'href' ),  programme_id
,  'Downloading captions' )  
 239              lang 
=  captions
. get ( '{http://www.w3.org/XML/1998/namespace}lang' ,  'en' )  
 240              ps 
=  captions
. findall ( './ {0} body/ {0} div/ {0} p' . format ( '{http://www.w3.org/2006/10/ttaf1}' ))  
 243              def  _extract_text ( p
):  
 244                  if  p
. text 
is not None :  
 245                      stripped_text 
=  p
. text
. strip ()  
 248                  return  ' ' . join ( span
. text
. strip ()  for  span 
in  p
. findall ( '{http://www.w3.org/2006/10/ttaf1}span' ))  
 249              for  pos
,  p 
in  enumerate ( ps
):  
 250                  srt 
+=  ' %s \r\n %s  -->  %s \r\n %s \r\n\r\n '  % ( str ( pos
),  p
. get ( 'begin' ),  p
. get ( 'end' ),  _extract_text ( p
))  
 253                      'url' :  connection
. get ( 'href' ),  
 263      def  _download_media_selector ( self
,  programme_id
):  
 265              media_selection 
=  self
._ download
_ xml
(  
 266                  'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ %s '  %  programme_id
,  
 267                  programme_id
,  'Downloading media selection XML' )  
 268          except  ExtractorError 
as  ee
:  
 269              if  isinstance ( ee
. cause
,  compat_HTTPError
)  and  ee
. cause
. code 
==  403 :  
 270                  media_selection 
=  xml
. etree
. ElementTree
. fromstring ( ee
. cause
. read (). encode ( 'utf-8' ))  
 277          for  media 
in  self
._ extract
_ medias
( media_selection
):  
 278              kind 
=  media
. get ( 'kind' )  
 280                  formats
. extend ( self
._ extract
_ audio
( media
,  programme_id
))  
 281              elif  kind 
==  'video' :  
 282                  formats
. extend ( self
._ extract
_ video
( media
,  programme_id
))  
 283              elif  kind 
==  'captions' :  
 284                  subtitles 
=  self
. extract_subtitles ( media
,  programme_id
)  
 286          return  formats
,  subtitles
 
 288      def  _download_playlist ( self
,  playlist_id
):  
 290              playlist 
=  self
._ download
_ json
(  
 291                  'http://www.bbc.co.uk/programmes/ %s /playlist.json'  %  playlist_id
,  
 292                  playlist_id
,  'Downloading playlist JSON' )  
 294              version 
=  playlist
. get ( 'defaultAvailableVersion' )  
 296                  smp_config 
=  version
[ 'smpConfig' ]  
 297                  title 
=  smp_config
[ 'title' ]  
 298                  description 
=  smp_config
[ 'summary' ]  
 299                  for  item 
in  smp_config
[ 'items' ]:  
 301                      if  kind 
!=  'programme'  and  kind 
!=  'radioProgramme' :  
 303                      programme_id 
=  item
. get ( 'vpid' )  
 304                      duration 
=  int ( item
. get ( 'duration' ))  
 305                      formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  
 306                  return  programme_id
,  title
,  description
,  duration
,  formats
,  subtitles
 
 307          except  ExtractorError 
as  ee
:  
 308              if not  ( isinstance ( ee
. cause
,  compat_HTTPError
)  and  ee
. cause
. code 
==  404 ):  
 311          # fallback to legacy playlist  
 312          playlist 
=  self
._ download
_ xml
(  
 313              'http://www.bbc.co.uk/iplayer/playlist/ %s '  %  playlist_id
,  
 314              playlist_id
,  'Downloading legacy playlist XML' )  
 316          no_items 
=  playlist
. find ( './{http://bbc.co.uk/2008/emp/playlist}noItems' )  
 317          if  no_items 
is not None :  
 318              reason 
=  no_items
. get ( 'reason' )  
 319              if  reason 
==  'preAvailability' :  
 320                  msg 
=  'Episode  %s  is not yet available'  %  playlist_id
 
 321              elif  reason 
==  'postAvailability' :  
 322                  msg 
=  'Episode  %s  is no longer available'  %  playlist_id
 
 323              elif  reason 
==  'noMedia' :  
 324                  msg 
=  'Episode  %s  is not currently available'  %  playlist_id
 
 326                  msg 
=  'Episode  %s  is not available:  %s '  % ( playlist_id
,  reason
)  
 327              raise  ExtractorError ( msg
,  expected
= True )  
 329          for  item 
in  self
._ extract
_ items
( playlist
):  
 330              kind 
=  item
. get ( 'kind' )  
 331              if  kind 
!=  'programme'  and  kind 
!=  'radioProgramme' :  
 333              title 
=  playlist
. find ( './{http://bbc.co.uk/2008/emp/playlist}title' ). text
 
 334              description 
=  playlist
. find ( './{http://bbc.co.uk/2008/emp/playlist}summary' ). text
 
 335              programme_id 
=  item
. get ( 'identifier' )  
 336              duration 
=  int ( item
. get ( 'duration' ))  
 337              formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  
 339          return  programme_id
,  title
,  description
,  duration
,  formats
,  subtitles
 
 341      def  _real_extract ( self
,  url
):  
 342          group_id 
=  self
._ match
_ id
( url
)  
 344          webpage 
=  self
._ download
_ webpage
( url
,  group_id
,  'Downloading video page' )  
 348          tviplayer 
=  self
._ search
_ regex
(  
 349              r
'mediator\.bind\(({.+?})\s*,\s*document\.getElementById' ,  
 350              webpage
,  'player' ,  default
= None )  
 353              player 
=  self
._ parse
_ json
( tviplayer
,  group_id
). get ( 'player' , {})  
 354              duration 
=  int_or_none ( player
. get ( 'duration' ))  
 355              programme_id 
=  player
. get ( 'vpid' )  
 358              programme_id 
=  self
._ search
_ regex
(  
 359                  r
'"vpid"\s*:\s*"([\da-z] {8} )"' ,  webpage
,  'vpid' ,  fatal
= False ,  default
= None )  
 362              formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  
 363              title 
=  self
._ og
_ search
_ title
( webpage
)  
 364              description 
=  self
._ search
_ regex
(  
 365                  r
'<p class="medium-description">([^<]+)</p>' ,  
 366                  webpage
,  'description' ,  fatal
= False )  
 368              programme_id
,  title
,  description
,  duration
,  formats
,  subtitles 
=  self
._ download
_ playlist
( group_id
)  
 370          self
._ sort
_ formats
( formats
)  
 375              'description' :  description
,  
 376              'thumbnail' :  self
._ og
_ search
_ thumbnail
( webpage
,  default
= None ),  
 377              'duration' :  duration
,  
 379              'subtitles' :  subtitles
,