]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/cbc.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
   7  from  .. compat 
import  compat_str
 
  23  class  CBCIE ( InfoExtractor
):  
  25      _VALID_URL 
=  r
'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'  
  28          'url' :  'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs' ,  
  29          'md5' :  '97e24d09672fc4cf56256d6faa6c25bc' ,  
  33              'title' :  'Don Cherry – All-Stars' ,  
  34              'description' :  'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.' ,  
  35              'timestamp' :  1454463000 ,  
  36              'upload_date' :  '20160203' ,  
  37              'uploader' :  'CBCC-NEW' ,  
  39          'skip' :  'Geo-restricted to Canada' ,  
  41          # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com  
  42          'url' :  'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4' ,  
  43          'md5' :  '162adfa070274b144f4fdc3c3b8207db' ,  
  47              'title' :  '22 Minutes Update: What Not To Wear Quebec' ,  
  48              'description' :  "This week's latest Canadian top political story is What Not To Wear Quebec." ,  
  49              'upload_date' :  '20131025' ,  
  50              'uploader' :  'CBCC-NEW' ,  
  51              'timestamp' :  1382717907 ,  
  54          # with clipId, feed only available via tpfeed.cbc.ca  
  55          'url' :  'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live' ,  
  56          'md5' :  '0274a90b51a9b4971fe005c63f592f12' ,  
  60              'title' :  'Robin Williams freestyles on 90 Minutes Live' ,  
  61              'description' :  'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC \' s 90 Minutes Live.' ,  
  62              'upload_date' :  '19780210' ,  
  63              'uploader' :  'CBCC-NEW' ,  
  64              'timestamp' :  255977160 ,  
  68          'url' :  'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot' ,  
  70              'md5' :  '377572d0b49c4ce0c9ad77470e0b96b4' ,  
  74                  'title' :  'An Eagle \' s-Eye View Off Burrard Bridge' ,  
  75                  'description' :  'Hercules the eagle flies from Vancouver \' s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.' ,  
  76                  'upload_date' :  '20160201' ,  
  77                  'timestamp' :  1454342820 ,  
  78                  'uploader' :  'CBCC-NEW' ,  
  81              'md5' :  '415a0e3f586113894174dfb31aa5bb1a' ,  
  85                  'title' :  'Fly like an eagle!' ,  
  86                  'description' :  'Eagle equipped with a mini camera flies from the world \' s tallest tower' ,  
  87                  'upload_date' :  '20150315' ,  
  88                  'timestamp' :  1426443984 ,  
  89                  'uploader' :  'CBCC-NEW' ,  
  92          'skip' :  'Geo-restricted to Canada' ,  
  94          # multiple CBC.APP.Caffeine.initInstance(...)  
  95          'url' :  'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238' ,  
  97              'title' :  'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks' ,  
  98              'id' :  'dog-indoor-exercise-winter-1.3928238' ,  
  99              'description' :  'md5:c18552e41726ee95bd75210d1ca9194c' ,  
 101          'playlist_mincount' :  6 ,  
 105      def  suitable ( cls
,  url
):  
 106          return False if  CBCPlayerIE
. suitable ( url
)  else  super ( CBCIE
,  cls
). suitable ( url
)  
 108      def  _extract_player_init ( self
,  player_init
,  display_id
):  
 109          player_info 
=  self
._ parse
_ json
( player_init
,  display_id
,  js_to_json
)  
 110          media_id 
=  player_info
. get ( 'mediaId' )  
 112              clip_id 
=  player_info
[ 'clipId' ]  
 113              feed 
=  self
._ download
_ json
(  
 114                  'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue= {:mpsReleases} { %s }'  %  clip_id
,  
 115                  clip_id
,  fatal
= False )  
 117                  media_id 
=  try_get ( feed
,  lambda  x
:  x
[ 'entries' ][ 0 ][ 'guid' ],  compat_str
)  
 119                  media_id 
=  self
._ download
_ json
(  
 120                      'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D'  +  clip_id
,  
 121                      clip_id
)[ 'entries' ][ 0 ][ 'id' ]. split ( '/' )[- 1 ]  
 122          return  self
. url_result ( 'cbcplayer: %s '  %  media_id
,  'CBCPlayer' ,  media_id
)  
 124      def  _real_extract ( self
,  url
):  
 125          display_id 
=  self
._ match
_ id
( url
)  
 126          webpage 
=  self
._ download
_ webpage
( url
,  display_id
)  
 128              self
._ extract
_ player
_ init
( player_init
,  display_id
)  
 129              for  player_init 
in  re
. findall ( r
'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);' ,  webpage
)]  
 131              self
. url_result ( 'cbcplayer: %s '  %  media_id
,  'CBCPlayer' ,  media_id
)  
 132              for  media_id 
in  re
. findall ( r
'<iframe[^>]+src="[^"]+?mediaId=(\d+)"' ,  webpage
)])  
 133          return  self
. playlist_result (  
 135              self
._ og
_ search
_ title
( webpage
,  fatal
= False ),  
 136              self
._ og
_ search
_ description
( webpage
))  
 139  class  CBCPlayerIE ( InfoExtractor
):  
 140      IE_NAME 
=  'cbc.ca:player'  
 141      _VALID_URL 
=  r
'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'  
 143          'url' :  'http://www.cbc.ca/player/play/2683190193' ,  
 144          'md5' :  '64d25f841ddf4ddb28a235338af32e2c' ,  
 148              'title' :  'Gerry Runs a Sweat Shop' ,  
 149              'description' :  'md5:b457e1c01e8ff408d9d801c1c2cd29b0' ,  
 150              'timestamp' :  1455071400 ,  
 151              'upload_date' :  '20160210' ,  
 152              'uploader' :  'CBCC-NEW' ,  
 154          'skip' :  'Geo-restricted to Canada' ,  
 156          # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/  
 157          'url' :  'http://www.cbc.ca/player/play/2657631896' ,  
 158          'md5' :  'e5e708c34ae6fca156aafe17c43e8b75' ,  
 162              'title' :  'CBC Montreal is organizing its first ever community hackathon!' ,  
 163              'description' :  'The modern technology we tend to depend on so heavily, is never without it \' s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.' ,  
 164              'timestamp' :  1425704400 ,  
 165              'upload_date' :  '20150307' ,  
 166              'uploader' :  'CBCC-NEW' ,  
 169          'url' :  'http://www.cbc.ca/player/play/2164402062' ,  
 170          'md5' :  '33fcd8f6719b9dd60a5e73adcb83b9f6' ,  
 174              'title' :  'Cancer survivor four times over' ,  
 175              'description' :  'Tim Mayer has beaten three different forms of cancer four times in five years.' ,  
 176              'timestamp' :  1320410746 ,  
 177              'upload_date' :  '20111104' ,  
 178              'uploader' :  'CBCC-NEW' ,  
 182      def  _real_extract ( self
,  url
):  
 183          video_id 
=  self
._ match
_ id
( url
)  
 185              '_type' :  'url_transparent' ,  
 186              'ie_key' :  'ThePlatform' ,  
 188                  'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/ %s ?mbr=true&formats=MPEG4,FLV,MP3'  %  video_id
, {  
 189                      'force_smil_url' :  True  
 195  class  CBCWatchBaseIE ( InfoExtractor
):  
 198      _API_BASE_URL 
=  'https://api-cbc.cloud.clearleap.com/cloffice/client/'  
 200          'media' :  'http://search.yahoo.com/mrss/' ,  
 201          'clearleap' :  'http://www.clearleap.com/namespace/clearleap/1.0/' ,  
 203      _GEO_COUNTRIES 
= [ 'CA' ]  
 205      def  _call_api ( self
,  path
,  video_id
):  
 206          url 
=  path 
if  path
. startswith ( 'http' )  else  self
._ API
_ BASE
_U RL 
+  path
 
 207          result 
=  self
._ download
_ xml
( url
,  video_id
,  headers
={  
 208              'X-Clearleap-DeviceId' :  self
._ device
_ id
,  
 209              'X-Clearleap-DeviceToken' :  self
._ device
_ token
,  
 211          error_message 
=  xpath_text ( result
,  'userMessage' )  or  xpath_text ( result
,  'systemMessage' )  
 213              raise  ExtractorError ( ' %s  said:  %s '  % ( self
. IE_NAME
,  error_message
))  
 216      def  _real_initialize ( self
):  
 217          if not  self
._ device
_ id 
or not  self
._ device
_ token
:  
 218              device 
=  self
._ downloader
. cache
. load ( 'cbcwatch' ,  'device' )  or  {}  
 219              self
._ device
_ id
,  self
._ device
_ token 
=  device
. get ( 'id' ),  device
. get ( 'token' )  
 220              if not  self
._ device
_ id 
or not  self
._ device
_ token
:  
 221                  result 
=  self
._ download
_ xml
(  
 222                      self
._ API
_ BASE
_U RL 
+  'device/register' ,  
 223                      None ,  data
= b
'<device><type>web</type></device>' )  
 224                  self
._ device
_ id 
=  xpath_text ( result
,  'deviceId' ,  fatal
= True )  
 225                  self
._ device
_ token 
=  xpath_text ( result
,  'deviceToken' ,  fatal
= True )  
 226                  self
._ downloader
. cache
. store (  
 227                      'cbcwatch' ,  'device' , {  
 228                          'id' :  self
._ device
_ id
,  
 229                          'token' :  self
._ device
_ token
,  
 232      def  _parse_rss_feed ( self
,  rss
):  
 233          channel 
=  xpath_element ( rss
,  'channel' ,  fatal
= True )  
 236              return  xpath_with_ns ( path
,  self
._ NS
_ MAP
)  
 239          for  item 
in  channel
. findall ( 'item' ):  
 240              guid 
=  xpath_text ( item
,  'guid' ,  fatal
= True )  
 241              title 
=  xpath_text ( item
,  'title' ,  fatal
= True )  
 243              media_group 
=  xpath_element ( item
,  _add_ns ( 'media:group' ),  fatal
= True )  
 244              content 
=  xpath_element ( media_group
,  _add_ns ( 'media:content' ),  fatal
= True )  
 245              content_url 
=  content
. attrib
[ 'url' ]  
 248              for  thumbnail 
in  media_group
. findall ( _add_ns ( 'media:thumbnail' )):  
 249                  thumbnail_url 
=  thumbnail
. get ( 'url' )  
 250                  if not  thumbnail_url
:  
 253                      'id' :  thumbnail
. get ( 'profile' ),  
 254                      'url' :  thumbnail_url
,  
 255                      'width' :  int_or_none ( thumbnail
. get ( 'width' )),  
 256                      'height' :  int_or_none ( thumbnail
. get ( 'height' )),  
 260              release_date 
=  find_xpath_attr (  
 261                  item
,  _add_ns ( 'media:credit' ),  'role' ,  'releaseDate' )  
 262              if  release_date 
is not None :  
 263                  timestamp 
=  parse_iso8601 ( release_date
. text
)  
 266                  '_type' :  'url_transparent' ,  
 270                  'description' :  xpath_text ( item
,  'description' ),  
 271                  'timestamp' :  timestamp
,  
 272                  'duration' :  int_or_none ( content
. get ( 'duration' )),  
 273                  'age_limit' :  parse_age_limit ( xpath_text ( item
,  _add_ns ( 'media:rating' ))),  
 274                  'episode' :  xpath_text ( item
,  _add_ns ( 'clearleap:episode' )),  
 275                  'episode_number' :  int_or_none ( xpath_text ( item
,  _add_ns ( 'clearleap:episodeInSeason' ))),  
 276                  'series' :  xpath_text ( item
,  _add_ns ( 'clearleap:series' )),  
 277                  'season_number' :  int_or_none ( xpath_text ( item
,  _add_ns ( 'clearleap:season' ))),  
 278                  'thumbnails' :  thumbnails
,  
 279                  'ie_key' :  'CBCWatchVideo' ,  
 282          return  self
. playlist_result (  
 283              entries
,  xpath_text ( channel
,  'guid' ),  
 284              xpath_text ( channel
,  'title' ),  
 285              xpath_text ( channel
,  'description' ))  
 288  class  CBCWatchVideoIE ( CBCWatchBaseIE
):  
 289      IE_NAME 
=  'cbc.ca:watch:video'  
 290      _VALID_URL 
=  r
'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} )'  
 292          # geo-restricted to Canada, bypassable  
 293          'url' :  'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235' ,  
 294          'only_matching' :  True ,  
 297      def  _real_extract ( self
,  url
):  
 298          video_id 
=  self
._ match
_ id
( url
)  
 299          result 
=  self
._ call
_ api
( url
,  video_id
)  
 301          m3u8_url 
=  xpath_text ( result
,  'url' ,  fatal
= True )  
 302          formats 
=  self
._ extract
_ m
3u8_ formats
( re
. sub ( r
'/([^/]+)/[^/?]+\.m3u8' ,  r
'/\1/\1.m3u8' ,  m3u8_url
),  video_id
,  'mp4' ,  fatal
= False )  
 304              formats 
=  self
._ extract
_ m
3u8_ formats
( m3u8_url
,  video_id
,  'mp4' )  
 306              format_id 
=  f
. get ( 'format_id' )  
 307              if  format_id
. startswith ( 'AAC' ):  
 309              elif  format_id
. startswith ( 'AC3' ):  
 311          self
._ sort
_ formats
( formats
)  
 319          rss 
=  xpath_element ( result
,  'rss' )  
 321              info
. update ( self
._ parse
_ rss
_ feed
( rss
)[ 'entries' ][ 0 ])  
 328  class  CBCWatchIE ( CBCWatchBaseIE
):  
 329      IE_NAME 
=  'cbc.ca:watch'  
 330      _VALID_URL 
=  r
'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'  
 332          # geo-restricted to Canada, bypassable  
 333          'url' :  'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4' ,  
 335              'id' :  '9673749a-5e77-484c-8b62-a1092a6b5168' ,  
 337              'title' :  'Customer (Dis)Service' ,  
 338              'description' :  'md5:8bdd6913a0fe03d4b2a17ebe169c7c87' ,  
 339              'upload_date' :  '20160219' ,  
 340              'timestamp' :  1455840000 ,  
 344              'skip_download' :  True ,  
 345              'format' :  'bestvideo' ,  
 348          # geo-restricted to Canada, bypassable  
 349          'url' :  'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057' ,  
 351              'id' :  '1ed4b385-cd84-49cf-95f0-80f004680057' ,  
 353              'description' :  'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.' ,  
 355          'playlist_mincount' :  30 ,  
 358      def  _real_extract ( self
,  url
):  
 359          video_id 
=  self
._ match
_ id
( url
)  
 360          rss 
=  self
._ call
_ api
( 'web/browse/'  +  video_id
,  video_id
)  
 361          return  self
._ parse
_ rss
_ feed
( rss
)