]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/bbccouk.py 
   1  from  __future__ 
import  unicode_literals
   3  import  xml
. etree
. ElementTree
   5  from  . common 
import  InfoExtractor
  10  from  .. compat 
import  compat_HTTPError
  13  class  BBCCoUkIE ( InfoExtractor
):   15      IE_DESC 
=  'BBC iPlayer'   16      _VALID_URL 
=  r
'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z] {8} )'   20              'url' :  'http://www.bbc.co.uk/programmes/b039g8p7' ,   24                  'title' :  'Kaleidoscope, Leonard Cohen' ,   25                  'description' :  'The Canadian poet and songwriter reflects on his musical career.' ,   30                  'skip_download' :  True ,   34              'url' :  'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/' ,   38                  'title' :  'The Man in Black: Series 3: The Printed Name' ,   39                  'description' :  "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey." ,   44                  'skip_download' :  True ,   46              'skip' :  'Episode is no longer available on BBC iPlayer Radio' ,   49              'url' :  'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/' ,   53                  'title' :  'The Voice UK: Series 3: Blind Auditions 5' ,   54                  'description' :  "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone." ,   59                  'skip_download' :  True ,   61              'skip' :  'Currently BBC iPlayer TV programmes are available to play in the UK only' ,   64              'url' :  'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion' ,   68                  'title' :  "Tomorrow's Worlds: The Unearthly History of Science Fiction" ,   69                  'description' :  '2. Invasion' ,   74                  'skip_download' :  True ,   76              'skip' :  'Currently BBC iPlayer TV programmes are available to play in the UK only' ,   78              'url' :  'http://www.bbc.co.uk/programmes/b04v20dw' ,   82                  'title' :  'Pete Tong, The Essential New Tune Special' ,   83                  'description' :  "Pete has a very special mix - all of 2014's Essential New Tunes!" ,   88                  'skip_download' :  True ,   91              'url' :  'http://www.bbc.co.uk/music/clips/p02frcc3' ,   96                  'title' :  'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix' ,   97                  'description' :  'French house superstar Madeon takes us out of the club and onto the after party.' ,  102                  'skip_download' :  True ,  105              'url' :  'http://www.bbc.co.uk/music/clips/p025c0zz' ,  110                  'title' :  'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)' ,  111                  'description' :  'Rae Morris performs Closer for BBC Three at Reading 2014' ,  116                  'skip_download' :  True ,  119              'url' :  'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls' ,  123                  'title' :  'Natural World, 2015-2016: 2. Super Powered Owls' ,  124                  'description' :  'md5:e4db5c937d0e95a7c6b5e654d429183d' ,  129                  'skip_download' :  True ,  131              'skip' :  'geolocation' ,  133              'url' :  'http://www.bbc.co.uk/iplayer/playlist/p01dvks4' ,  134              'only_matching' :  True ,  136              'url' :  'http://www.bbc.co.uk/music/clips#p02frcc3' ,  137              'only_matching' :  True ,  139              'url' :  'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo' ,  140              'only_matching' :  True ,  144      def  _extract_asx_playlist ( self
,  connection
,  programme_id
):  145          asx 
=  self
._ download
_ xml
( connection
. get ( 'href' ),  programme_id
,  'Downloading ASX playlist' )  146          return  [ ref
. get ( 'href' )  for  ref 
in  asx
. findall ( './Entry/ref' )]  148      def  _extract_connection ( self
,  connection
,  programme_id
):  150          protocol 
=  connection
. get ( 'protocol' )  151          supplier 
=  connection
. get ( 'supplier' )  152          if  protocol 
==  'http' :  153              href 
=  connection
. get ( 'href' )  155              if  supplier 
==  'asx' :  156                  for  i
,  ref 
in  enumerate ( self
._ extract
_ asx
_ playlist
( connection
,  programme_id
)):  159                          'format_id' :  'ref %s _ %s '  % ( i
,  supplier
),  165                      'format_id' :  supplier
,  167          elif  protocol 
==  'rtmp' :  168              application 
=  connection
. get ( 'application' ,  'ondemand' )  169              auth_string 
=  connection
. get ( 'authString' )  170              identifier 
=  connection
. get ( 'identifier' )  171              server 
=  connection
. get ( 'server' )  173                  'url' :  ' %s :// %s / %s ? %s '  % ( protocol
,  server
,  application
,  auth_string
),  174                  'play_path' :  identifier
,  175                  'app' :  ' %s ? %s '  % ( application
,  auth_string
),  176                  'page_url' :  'http://www.bbc.co.uk' ,  177                  'player_url' :  'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf' ,  180                  'format_id' :  supplier
,  184      def  _extract_items ( self
,  playlist
):  185          return  playlist
. findall ( './{http://bbc.co.uk/2008/emp/playlist}item' )  187      def  _extract_medias ( self
,  media_selection
):  188          error 
=  media_selection
. find ( './{http://bbc.co.uk/2008/mp/mediaselection}error' )  189          if  error 
is not None :  190              raise  ExtractorError (  191                  ' %s  returned error:  %s '  % ( self
. IE_NAME
,  error
. get ( 'id' )),  expected
= True )  192          return  media_selection
. findall ( './{http://bbc.co.uk/2008/mp/mediaselection}media' )  194      def  _extract_connections ( self
,  media
):  195          return  media
. findall ( './{http://bbc.co.uk/2008/mp/mediaselection}connection' )  197      def  _extract_video ( self
,  media
,  programme_id
):  199          vbr 
=  int ( media
. get ( 'bitrate' ))  200          vcodec 
=  media
. get ( 'encoding' )  201          service 
=  media
. get ( 'service' )  202          width 
=  int ( media
. get ( 'width' ))  203          height 
=  int ( media
. get ( 'height' ))  204          file_size 
=  int ( media
. get ( 'media_file_size' ))  205          for  connection 
in  self
._ extract
_ connections
( media
):  206              conn_formats 
=  self
._ extract
_ connection
( connection
,  programme_id
)  207              for  format 
in  conn_formats
:  209                      'format_id' :  ' %s _ %s '  % ( service
,  format
[ 'format_id' ]),  214                      'filesize' :  file_size
,  216              formats
. extend ( conn_formats
)  219      def  _extract_audio ( self
,  media
,  programme_id
):  221          abr 
=  int ( media
. get ( 'bitrate' ))  222          acodec 
=  media
. get ( 'encoding' )  223          service 
=  media
. get ( 'service' )  224          for  connection 
in  self
._ extract
_ connections
( media
):  225              conn_formats 
=  self
._ extract
_ connection
( connection
,  programme_id
)  226              for  format 
in  conn_formats
:  228                      'format_id' :  ' %s _ %s '  % ( service
,  format
[ 'format_id' ]),  232              formats
. extend ( conn_formats
)  235      def  _get_subtitles ( self
,  media
,  programme_id
):  237          for  connection 
in  self
._ extract
_ connections
( media
):  238              captions 
=  self
._ download
_ xml
( connection
. get ( 'href' ),  programme_id
,  'Downloading captions' )  239              lang 
=  captions
. get ( '{http://www.w3.org/XML/1998/namespace}lang' ,  'en' )  240              ps 
=  captions
. findall ( './ {0} body/ {0} div/ {0} p' . format ( '{http://www.w3.org/2006/10/ttaf1}' ))  243              def  _extract_text ( p
):  244                  if  p
. text 
is not None :  245                      stripped_text 
=  p
. text
. strip ()  248                  return  ' ' . join ( span
. text
. strip ()  for  span 
in  p
. findall ( '{http://www.w3.org/2006/10/ttaf1}span' ))  249              for  pos
,  p 
in  enumerate ( ps
):  250                  srt 
+=  ' %s \r\n %s  -->  %s \r\n %s \r\n\r\n '  % ( str ( pos
),  p
. get ( 'begin' ),  p
. get ( 'end' ),  _extract_text ( p
))  253                      'url' :  connection
. get ( 'href' ),  263      def  _download_media_selector ( self
,  programme_id
):  265              media_selection 
=  self
._ download
_ xml
(  266                  'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ %s '  %  programme_id
,  267                  programme_id
,  'Downloading media selection XML' )  268          except  ExtractorError 
as  ee
:  269              if  isinstance ( ee
. cause
,  compat_HTTPError
)  and  ee
. cause
. code 
==  403 :  270                  media_selection 
=  xml
. etree
. ElementTree
. fromstring ( ee
. cause
. read (). encode ( 'utf-8' ))  277          for  media 
in  self
._ extract
_ medias
( media_selection
):  278              kind 
=  media
. get ( 'kind' )  280                  formats
. extend ( self
._ extract
_ audio
( media
,  programme_id
))  281              elif  kind 
==  'video' :  282                  formats
. extend ( self
._ extract
_ video
( media
,  programme_id
))  283              elif  kind 
==  'captions' :  284                  subtitles 
=  self
. extract_subtitles ( media
,  programme_id
)  286          return  formats
,  subtitles
 288      def  _download_playlist ( self
,  playlist_id
):  290              playlist 
=  self
._ download
_ json
(  291                  'http://www.bbc.co.uk/programmes/ %s /playlist.json'  %  playlist_id
,  292                  playlist_id
,  'Downloading playlist JSON' )  294              version 
=  playlist
. get ( 'defaultAvailableVersion' )  296                  smp_config 
=  version
[ 'smpConfig' ]  297                  title 
=  smp_config
[ 'title' ]  298                  description 
=  smp_config
[ 'summary' ]  299                  for  item 
in  smp_config
[ 'items' ]:  301                      if  kind 
!=  'programme'  and  kind 
!=  'radioProgramme' :  303                      programme_id 
=  item
. get ( 'vpid' )  304                      duration 
=  int ( item
. get ( 'duration' ))  305                      formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  306                  return  programme_id
,  title
,  description
,  duration
,  formats
,  subtitles
 307          except  ExtractorError 
as  ee
:  308              if not  ( isinstance ( ee
. cause
,  compat_HTTPError
)  and  ee
. cause
. code 
==  404 ):  311          # fallback to legacy playlist  312          playlist 
=  self
._ download
_ xml
(  313              'http://www.bbc.co.uk/iplayer/playlist/ %s '  %  playlist_id
,  314              playlist_id
,  'Downloading legacy playlist XML' )  316          no_items 
=  playlist
. find ( './{http://bbc.co.uk/2008/emp/playlist}noItems' )  317          if  no_items 
is not None :  318              reason 
=  no_items
. get ( 'reason' )  319              if  reason 
==  'preAvailability' :  320                  msg 
=  'Episode  %s  is not yet available'  %  playlist_id
 321              elif  reason 
==  'postAvailability' :  322                  msg 
=  'Episode  %s  is no longer available'  %  playlist_id
 323              elif  reason 
==  'noMedia' :  324                  msg 
=  'Episode  %s  is not currently available'  %  playlist_id
 326                  msg 
=  'Episode  %s  is not available:  %s '  % ( playlist_id
,  reason
)  327              raise  ExtractorError ( msg
,  expected
= True )  329          for  item 
in  self
._ extract
_ items
( playlist
):  330              kind 
=  item
. get ( 'kind' )  331              if  kind 
!=  'programme'  and  kind 
!=  'radioProgramme' :  333              title 
=  playlist
. find ( './{http://bbc.co.uk/2008/emp/playlist}title' ). text
 334              description 
=  playlist
. find ( './{http://bbc.co.uk/2008/emp/playlist}summary' ). text
 335              programme_id 
=  item
. get ( 'identifier' )  336              duration 
=  int ( item
. get ( 'duration' ))  337              formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  339          return  programme_id
,  title
,  description
,  duration
,  formats
,  subtitles
 341      def  _real_extract ( self
,  url
):  342          group_id 
=  self
._ match
_ id
( url
)  344          webpage 
=  self
._ download
_ webpage
( url
,  group_id
,  'Downloading video page' )  348          tviplayer 
=  self
._ search
_ regex
(  349              r
'mediator\.bind\(({.+?})\s*,\s*document\.getElementById' ,  350              webpage
,  'player' ,  default
= None )  353              player 
=  self
._ parse
_ json
( tviplayer
,  group_id
). get ( 'player' , {})  354              duration 
=  int_or_none ( player
. get ( 'duration' ))  355              programme_id 
=  player
. get ( 'vpid' )  358              programme_id 
=  self
._ search
_ regex
(  359                  r
'"vpid"\s*:\s*"([\da-z] {8} )"' ,  webpage
,  'vpid' ,  fatal
= False ,  default
= None )  362              formats
,  subtitles 
=  self
._ download
_ media
_ selector
( programme_id
)  363              title 
=  self
._ og
_ search
_ title
( webpage
)  364              description 
=  self
._ search
_ regex
(  365                  r
'<p class="medium-description">([^<]+)</p>' ,  366                  webpage
,  'description' ,  fatal
= False )  368              programme_id
,  title
,  description
,  duration
,  formats
,  subtitles 
=  self
._ download
_ playlist
( group_id
)  370          self
._ sort
_ formats
( formats
)  375              'description' :  description
,  376              'thumbnail' :  self
._ og
_ search
_ thumbnail
( webpage
,  default
= None ),  377              'duration' :  duration
,  379              'subtitles' :  subtitles
,