]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/cbc.py 
54b4b9be958ae49f0ea4f7d37cadcdf4e2c8b1c7
   2  from  __future__ 
import  unicode_literals
   7  from  . common 
import  InfoExtractor
  28  class  CBCIE ( InfoExtractor
):   30      _VALID_URL 
=  r
'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'   33          'url' :  'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs' ,   34          'md5' :  '97e24d09672fc4cf56256d6faa6c25bc' ,   38              'title' :  'Don Cherry – All-Stars' ,   39              'description' :  'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.' ,   40              'timestamp' :  1454463000 ,   41              'upload_date' :  '20160203' ,   42              'uploader' :  'CBCC-NEW' ,   44          'skip' :  'Geo-restricted to Canada' ,   46          # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com   47          'url' :  'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4' ,   48          'md5' :  '162adfa070274b144f4fdc3c3b8207db' ,   52              'title' :  '22 Minutes Update: What Not To Wear Quebec' ,   53              'description' :  "This week's latest Canadian top political story is What Not To Wear Quebec." ,   54              'upload_date' :  '20131025' ,   55              'uploader' :  'CBCC-NEW' ,   56              'timestamp' :  1382717907 ,   59          # with clipId, feed only available via tpfeed.cbc.ca   60          'url' :  'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live' ,   61          'md5' :  '0274a90b51a9b4971fe005c63f592f12' ,   65              'title' :  'Robin Williams freestyles on 90 Minutes Live' ,   66              'description' :  'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC \' s 90 Minutes Live.' ,   67              'upload_date' :  '19780210' ,   68              'uploader' :  'CBCC-NEW' ,   69              'timestamp' :  255977160 ,   73          'url' :  'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot' ,   75              'md5' :  '377572d0b49c4ce0c9ad77470e0b96b4' ,   79                  'title' :  'An Eagle \' s-Eye View Off Burrard Bridge' ,   80                  'description' :  'Hercules the eagle flies from Vancouver \' s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.' ,   81                  'upload_date' :  '20160201' ,   82                  'timestamp' :  1454342820 ,   83                  'uploader' :  'CBCC-NEW' ,   86              'md5' :  '415a0e3f586113894174dfb31aa5bb1a' ,   90                  'title' :  'Fly like an eagle!' ,   91                  'description' :  'Eagle equipped with a mini camera flies from the world \' s tallest tower' ,   92                  'upload_date' :  '20150315' ,   93                  'timestamp' :  1426443984 ,   94                  'uploader' :  'CBCC-NEW' ,   97          'skip' :  'Geo-restricted to Canada' ,   99          # multiple CBC.APP.Caffeine.initInstance(...)  100          'url' :  'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238' ,  102              'title' :  'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks' ,  103              'id' :  'dog-indoor-exercise-winter-1.3928238' ,  104              'description' :  'md5:c18552e41726ee95bd75210d1ca9194c' ,  106          'playlist_mincount' :  6 ,  110      def  suitable ( cls
,  url
):  111          return False if  CBCPlayerIE
. suitable ( url
)  else  super ( CBCIE
,  cls
). suitable ( url
)  113      def  _extract_player_init ( self
,  player_init
,  display_id
):  114          player_info 
=  self
._ parse
_ json
( player_init
,  display_id
,  js_to_json
)  115          media_id 
=  player_info
. get ( 'mediaId' )  117              clip_id 
=  player_info
[ 'clipId' ]  118              feed 
=  self
._ download
_ json
(  119                  'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue= {:mpsReleases} { %s }'  %  clip_id
,  120                  clip_id
,  fatal
= False )  122                  media_id 
=  try_get ( feed
,  lambda  x
:  x
[ 'entries' ][ 0 ][ 'guid' ],  compat_str
)  124                  media_id 
=  self
._ download
_ json
(  125                      'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D'  +  clip_id
,  126                      clip_id
)[ 'entries' ][ 0 ][ 'id' ]. split ( '/' )[- 1 ]  127          return  self
. url_result ( 'cbcplayer: %s '  %  media_id
,  'CBCPlayer' ,  media_id
)  129      def  _real_extract ( self
,  url
):  130          display_id 
=  self
._ match
_ id
( url
)  131          webpage 
=  self
._ download
_ webpage
( url
,  display_id
)  133              self
._ extract
_ player
_ init
( player_init
,  display_id
)  134              for  player_init 
in  re
. findall ( r
'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);' ,  webpage
)]  136              self
. url_result ( 'cbcplayer: %s '  %  media_id
,  'CBCPlayer' ,  media_id
)  137              for  media_id 
in  re
. findall ( r
'<iframe[^>]+src="[^"]+?mediaId=(\d+)"' ,  webpage
)])  138          return  self
. playlist_result (  140              self
._ og
_ search
_ title
( webpage
,  fatal
= False ),  141              self
._ og
_ search
_ description
( webpage
))  144  class  CBCPlayerIE ( InfoExtractor
):  145      IE_NAME 
=  'cbc.ca:player'  146      _VALID_URL 
=  r
'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'  148          'url' :  'http://www.cbc.ca/player/play/2683190193' ,  149          'md5' :  '64d25f841ddf4ddb28a235338af32e2c' ,  153              'title' :  'Gerry Runs a Sweat Shop' ,  154              'description' :  'md5:b457e1c01e8ff408d9d801c1c2cd29b0' ,  155              'timestamp' :  1455071400 ,  156              'upload_date' :  '20160210' ,  157              'uploader' :  'CBCC-NEW' ,  159          'skip' :  'Geo-restricted to Canada' ,  161          # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/  162          'url' :  'http://www.cbc.ca/player/play/2657631896' ,  163          'md5' :  'e5e708c34ae6fca156aafe17c43e8b75' ,  167              'title' :  'CBC Montreal is organizing its first ever community hackathon!' ,  168              'description' :  'The modern technology we tend to depend on so heavily, is never without it \' s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.' ,  169              'timestamp' :  1425704400 ,  170              'upload_date' :  '20150307' ,  171              'uploader' :  'CBCC-NEW' ,  174          'url' :  'http://www.cbc.ca/player/play/2164402062' ,  175          'md5' :  '33fcd8f6719b9dd60a5e73adcb83b9f6' ,  179              'title' :  'Cancer survivor four times over' ,  180              'description' :  'Tim Mayer has beaten three different forms of cancer four times in five years.' ,  181              'timestamp' :  1320410746 ,  182              'upload_date' :  '20111104' ,  183              'uploader' :  'CBCC-NEW' ,  187      def  _real_extract ( self
,  url
):  188          video_id 
=  self
._ match
_ id
( url
)  190              '_type' :  'url_transparent' ,  191              'ie_key' :  'ThePlatform' ,  193                  'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/ %s ?mbr=true&formats=MPEG4,FLV,MP3'  %  video_id
, {  194                      'force_smil_url' :  True  200  class  CBCWatchBaseIE ( InfoExtractor
):  203      _API_BASE_URL 
=  'https://api-cbc.cloud.clearleap.com/cloffice/client/'  205          'media' :  'http://search.yahoo.com/mrss/' ,  206          'clearleap' :  'http://www.clearleap.com/namespace/clearleap/1.0/' ,  208      _GEO_COUNTRIES 
= [ 'CA' ]  210      def  _call_api ( self
,  path
,  video_id
):  211          url 
=  path 
if  path
. startswith ( 'http' )  else  self
._ API
_ BASE
_U RL 
+  path
 214                  result 
=  self
._ download
_ xml
( url
,  video_id
,  headers
={  215                      'X-Clearleap-DeviceId' :  self
._ device
_ id
,  216                      'X-Clearleap-DeviceToken' :  self
._ device
_ token
,  218              except  ExtractorError 
as  e
:  219                  if  isinstance ( e
. cause
,  compat_HTTPError
)  and  e
. cause
. code 
==  401 :  220                      # Device token has expired, re-acquiring device token  221                      self
._ register
_ device
()  224          error_message 
=  xpath_text ( result
,  'userMessage' )  or  xpath_text ( result
,  'systemMessage' )  226              raise  ExtractorError ( ' %s  said:  %s '  % ( self
. IE_NAME
,  error_message
))  229      def  _real_initialize ( self
):  230          if  self
._ valid
_ device
_ token
():  232          device 
=  self
._ downloader
. cache
. load ( 'cbcwatch' ,  'device' )  or  {}  233          self
._ device
_ id
,  self
._ device
_ token 
=  device
. get ( 'id' ),  device
. get ( 'token' )  234          if  self
._ valid
_ device
_ token
():  236          self
._ register
_ device
()  238      def  _valid_device_token ( self
):  239          return  self
._ device
_ id 
and  self
._ device
_ token
 241      def  _register_device ( self
):  242          self
._ device
_ id 
=  self
._ device
_ token 
=  None  243          result 
=  self
._ download
_ xml
(  244              self
._ API
_ BASE
_U RL 
+  'device/register' ,  245              None ,  'Acquiring device token' ,  246              data
= b
'<device><type>web</type></device>' )  247          self
._ device
_ id 
=  xpath_text ( result
,  'deviceId' ,  fatal
= True )  248          self
._ device
_ token 
=  xpath_text ( result
,  'deviceToken' ,  fatal
= True )  249          self
._ downloader
. cache
. store (  250              'cbcwatch' ,  'device' , {  251                  'id' :  self
._ device
_ id
,  252                  'token' :  self
._ device
_ token
,  255      def  _parse_rss_feed ( self
,  rss
):  256          channel 
=  xpath_element ( rss
,  'channel' ,  fatal
= True )  259              return  xpath_with_ns ( path
,  self
._ NS
_ MAP
)  262          for  item 
in  channel
. findall ( 'item' ):  263              guid 
=  xpath_text ( item
,  'guid' ,  fatal
= True )  264              title 
=  xpath_text ( item
,  'title' ,  fatal
= True )  266              media_group 
=  xpath_element ( item
,  _add_ns ( 'media:group' ),  fatal
= True )  267              content 
=  xpath_element ( media_group
,  _add_ns ( 'media:content' ),  fatal
= True )  268              content_url 
=  content
. attrib
[ 'url' ]  271              for  thumbnail 
in  media_group
. findall ( _add_ns ( 'media:thumbnail' )):  272                  thumbnail_url 
=  thumbnail
. get ( 'url' )  273                  if not  thumbnail_url
:  276                      'id' :  thumbnail
. get ( 'profile' ),  277                      'url' :  thumbnail_url
,  278                      'width' :  int_or_none ( thumbnail
. get ( 'width' )),  279                      'height' :  int_or_none ( thumbnail
. get ( 'height' )),  283              release_date 
=  find_xpath_attr (  284                  item
,  _add_ns ( 'media:credit' ),  'role' ,  'releaseDate' )  285              if  release_date 
is not None :  286                  timestamp 
=  parse_iso8601 ( release_date
. text
)  289                  '_type' :  'url_transparent' ,  293                  'description' :  xpath_text ( item
,  'description' ),  294                  'timestamp' :  timestamp
,  295                  'duration' :  int_or_none ( content
. get ( 'duration' )),  296                  'age_limit' :  parse_age_limit ( xpath_text ( item
,  _add_ns ( 'media:rating' ))),  297                  'episode' :  xpath_text ( item
,  _add_ns ( 'clearleap:episode' )),  298                  'episode_number' :  int_or_none ( xpath_text ( item
,  _add_ns ( 'clearleap:episodeInSeason' ))),  299                  'series' :  xpath_text ( item
,  _add_ns ( 'clearleap:series' )),  300                  'season_number' :  int_or_none ( xpath_text ( item
,  _add_ns ( 'clearleap:season' ))),  301                  'thumbnails' :  thumbnails
,  302                  'ie_key' :  'CBCWatchVideo' ,  305          return  self
. playlist_result (  306              entries
,  xpath_text ( channel
,  'guid' ),  307              xpath_text ( channel
,  'title' ),  308              xpath_text ( channel
,  'description' ))  311  class  CBCWatchVideoIE ( CBCWatchBaseIE
):  312      IE_NAME 
=  'cbc.ca:watch:video'  313      _VALID_URL 
=  r
'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} )'  315          # geo-restricted to Canada, bypassable  316          'url' :  'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235' ,  317          'only_matching' :  True ,  320      def  _real_extract ( self
,  url
):  321          video_id 
=  self
._ match
_ id
( url
)  322          result 
=  self
._ call
_ api
( url
,  video_id
)  324          m3u8_url 
=  xpath_text ( result
,  'url' ,  fatal
= True )  325          formats 
=  self
._ extract
_ m
3u8_ formats
( re
. sub ( r
'/([^/]+)/[^/?]+\.m3u8' ,  r
'/\1/\1.m3u8' ,  m3u8_url
),  video_id
,  'mp4' ,  fatal
= False )  327              formats 
=  self
._ extract
_ m
3u8_ formats
( m3u8_url
,  video_id
,  'mp4' )  329              format_id 
=  f
. get ( 'format_id' )  330              if  format_id
. startswith ( 'AAC' ):  332              elif  format_id
. startswith ( 'AC3' ):  334          self
._ sort
_ formats
( formats
)  342          rss 
=  xpath_element ( result
,  'rss' )  344              info
. update ( self
._ parse
_ rss
_ feed
( rss
)[ 'entries' ][ 0 ])  351  class  CBCWatchIE ( CBCWatchBaseIE
):  352      IE_NAME 
=  'cbc.ca:watch'  353      _VALID_URL 
=  r
'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'  355          # geo-restricted to Canada, bypassable  356          'url' :  'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4' ,  358              'id' :  '9673749a-5e77-484c-8b62-a1092a6b5168' ,  360              'title' :  'Customer (Dis)Service' ,  361              'description' :  'md5:8bdd6913a0fe03d4b2a17ebe169c7c87' ,  362              'upload_date' :  '20160219' ,  363              'timestamp' :  1455840000 ,  367              'skip_download' :  True ,  368              'format' :  'bestvideo' ,  371          # geo-restricted to Canada, bypassable  372          'url' :  'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057' ,  374              'id' :  '1ed4b385-cd84-49cf-95f0-80f004680057' ,  376              'description' :  'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.' ,  378          'playlist_mincount' :  30 ,  381      def  _real_extract ( self
,  url
):  382          video_id 
=  self
._ match
_ id
( url
)  383          rss 
=  self
._ call
_ api
( 'web/browse/'  +  video_id
,  video_id
)  384          return  self
._ parse
_ rss
_ feed
( rss
)  387  class  CBCOlympicsIE ( InfoExtractor
):  388      IE_NAME 
=  'cbc.ca:olympics'  389      _VALID_URL 
=  r
'https?://olympics\.cbc\.ca/video/[^/]+/(?P<id>[^/?#]+)'  391          'url' :  'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/' ,  392          'only_matching' :  True ,  395      def  _real_extract ( self
,  url
):  396          display_id 
=  self
._ match
_ id
( url
)  397          webpage 
=  self
._ download
_ webpage
( url
,  display_id
)  398          video_id 
=  self
._ hidden
_ inputs
( webpage
)[ 'videoId' ]  399          video_doc 
=  self
._ download
_ xml
(  400              'https://olympics.cbc.ca/videodata/ %s .xml'  %  video_id
,  video_id
)  401          title 
=  xpath_text ( video_doc
,  'title' ,  fatal
= True )  402          is_live 
=  xpath_text ( video_doc
,  'kind' ) ==  'Live'  404              title 
=  self
._l ive
_ title
( title
)  407          for  video_source 
in  video_doc
. findall ( 'videoSources/videoSource' ):  408              uri 
=  xpath_text ( video_source
,  'uri' )  411              tokenize 
=  self
._ download
_ json
(  412                  'https://olympics.cbc.ca/api/api-akamai/tokenize' ,  413                  video_id
,  data
= json
. dumps ({  415                  }). encode (),  headers
={  416                      'Content-Type' :  'application/json' ,  418                      # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js  419                      'Cookie' :  '_dvp=TK:C0ObxjerU' ,   # AKAMAI CDN cookie  423              content_url 
=  tokenize
[ 'ContentUrl' ]  424              video_source_format 
=  video_source
. get ( 'format' )  425              if  video_source_format 
==  'IIS' :  426                  formats
. extend ( self
._ extract
_ ism
_ formats
(  427                      content_url
,  video_id
,  ism_id
= video_source_format
,  fatal
= False ))  429                  formats
. extend ( self
._ extract
_ m
3u8_ formats
(  430                      content_url
,  video_id
,  'mp4' ,  431                      'm3u8'  if  is_live 
else  'm3u8_native' ,  432                      m3u8_id
= video_source_format
,  fatal
= False ))  433          self
._ sort
_ formats
( formats
)  437              'display_id' :  display_id
,  439              'description' :  xpath_text ( video_doc
,  'description' ),  440              'thumbnail' :  xpath_text ( video_doc
,  'thumbnailUrl' ),  441              'duration' :  parse_duration ( xpath_text ( video_doc
,  'duration' )),