]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/cbc.py 
   2  from  __future__ 
import  unicode_literals
   7  from  . common 
import  InfoExtractor
   8  from  .. compat 
import  compat_str
  25  class  CBCIE ( InfoExtractor
):   27      _VALID_URL 
=  r
'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'   30          'url' :  'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs' ,   31          'md5' :  '97e24d09672fc4cf56256d6faa6c25bc' ,   35              'title' :  'Don Cherry – All-Stars' ,   36              'description' :  'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.' ,   37              'timestamp' :  1454463000 ,   38              'upload_date' :  '20160203' ,   39              'uploader' :  'CBCC-NEW' ,   41          'skip' :  'Geo-restricted to Canada' ,   43          # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com   44          'url' :  'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4' ,   45          'md5' :  '162adfa070274b144f4fdc3c3b8207db' ,   49              'title' :  '22 Minutes Update: What Not To Wear Quebec' ,   50              'description' :  "This week's latest Canadian top political story is What Not To Wear Quebec." ,   51              'upload_date' :  '20131025' ,   52              'uploader' :  'CBCC-NEW' ,   53              'timestamp' :  1382717907 ,   56          # with clipId, feed only available via tpfeed.cbc.ca   57          'url' :  'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live' ,   58          'md5' :  '0274a90b51a9b4971fe005c63f592f12' ,   62              'title' :  'Robin Williams freestyles on 90 Minutes Live' ,   63              'description' :  'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC \' s 90 Minutes Live.' ,   64              'upload_date' :  '19780210' ,   65              'uploader' :  'CBCC-NEW' ,   66              'timestamp' :  255977160 ,   70          'url' :  'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot' ,   72              'md5' :  '377572d0b49c4ce0c9ad77470e0b96b4' ,   76                  'title' :  'An Eagle \' s-Eye View Off Burrard Bridge' ,   77                  'description' :  'Hercules the eagle flies from Vancouver \' s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.' ,   78                  'upload_date' :  '20160201' ,   79                  'timestamp' :  1454342820 ,   80                  'uploader' :  'CBCC-NEW' ,   83              'md5' :  '415a0e3f586113894174dfb31aa5bb1a' ,   87                  'title' :  'Fly like an eagle!' ,   88                  'description' :  'Eagle equipped with a mini camera flies from the world \' s tallest tower' ,   89                  'upload_date' :  '20150315' ,   90                  'timestamp' :  1426443984 ,   91                  'uploader' :  'CBCC-NEW' ,   94          'skip' :  'Geo-restricted to Canada' ,   96          # multiple CBC.APP.Caffeine.initInstance(...)   97          'url' :  'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238' ,   99              'title' :  'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks' ,  100              'id' :  'dog-indoor-exercise-winter-1.3928238' ,  101              'description' :  'md5:c18552e41726ee95bd75210d1ca9194c' ,  103          'playlist_mincount' :  6 ,  107      def  suitable ( cls
,  url
):  108          return False if  CBCPlayerIE
. suitable ( url
)  else  super ( CBCIE
,  cls
). suitable ( url
)  110      def  _extract_player_init ( self
,  player_init
,  display_id
):  111          player_info 
=  self
._ parse
_ json
( player_init
,  display_id
,  js_to_json
)  112          media_id 
=  player_info
. get ( 'mediaId' )  114              clip_id 
=  player_info
[ 'clipId' ]  115              feed 
=  self
._ download
_ json
(  116                  'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue= {:mpsReleases} { %s }'  %  clip_id
,  117                  clip_id
,  fatal
= False )  119                  media_id 
=  try_get ( feed
,  lambda  x
:  x
[ 'entries' ][ 0 ][ 'guid' ],  compat_str
)  121                  media_id 
=  self
._ download
_ json
(  122                      'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D'  +  clip_id
,  123                      clip_id
)[ 'entries' ][ 0 ][ 'id' ]. split ( '/' )[- 1 ]  124          return  self
. url_result ( 'cbcplayer: %s '  %  media_id
,  'CBCPlayer' ,  media_id
)  126      def  _real_extract ( self
,  url
):  127          display_id 
=  self
._ match
_ id
( url
)  128          webpage 
=  self
._ download
_ webpage
( url
,  display_id
)  130              self
._ extract
_ player
_ init
( player_init
,  display_id
)  131              for  player_init 
in  re
. findall ( r
'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);' ,  webpage
)]  133              self
. url_result ( 'cbcplayer: %s '  %  media_id
,  'CBCPlayer' ,  media_id
)  134              for  media_id 
in  re
. findall ( r
'<iframe[^>]+src="[^"]+?mediaId=(\d+)"' ,  webpage
)])  135          return  self
. playlist_result (  137              self
._ og
_ search
_ title
( webpage
,  fatal
= False ),  138              self
._ og
_ search
_ description
( webpage
))  141  class  CBCPlayerIE ( InfoExtractor
):  142      IE_NAME 
=  'cbc.ca:player'  143      _VALID_URL 
=  r
'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'  145          'url' :  'http://www.cbc.ca/player/play/2683190193' ,  146          'md5' :  '64d25f841ddf4ddb28a235338af32e2c' ,  150              'title' :  'Gerry Runs a Sweat Shop' ,  151              'description' :  'md5:b457e1c01e8ff408d9d801c1c2cd29b0' ,  152              'timestamp' :  1455071400 ,  153              'upload_date' :  '20160210' ,  154              'uploader' :  'CBCC-NEW' ,  156          'skip' :  'Geo-restricted to Canada' ,  158          # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/  159          'url' :  'http://www.cbc.ca/player/play/2657631896' ,  160          'md5' :  'e5e708c34ae6fca156aafe17c43e8b75' ,  164              'title' :  'CBC Montreal is organizing its first ever community hackathon!' ,  165              'description' :  'The modern technology we tend to depend on so heavily, is never without it \' s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.' ,  166              'timestamp' :  1425704400 ,  167              'upload_date' :  '20150307' ,  168              'uploader' :  'CBCC-NEW' ,  171          'url' :  'http://www.cbc.ca/player/play/2164402062' ,  172          'md5' :  '33fcd8f6719b9dd60a5e73adcb83b9f6' ,  176              'title' :  'Cancer survivor four times over' ,  177              'description' :  'Tim Mayer has beaten three different forms of cancer four times in five years.' ,  178              'timestamp' :  1320410746 ,  179              'upload_date' :  '20111104' ,  180              'uploader' :  'CBCC-NEW' ,  184      def  _real_extract ( self
,  url
):  185          video_id 
=  self
._ match
_ id
( url
)  187              '_type' :  'url_transparent' ,  188              'ie_key' :  'ThePlatform' ,  190                  'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/ %s ?mbr=true&formats=MPEG4,FLV,MP3'  %  video_id
, {  191                      'force_smil_url' :  True  197  class  CBCWatchBaseIE ( InfoExtractor
):  200      _API_BASE_URL 
=  'https://api-cbc.cloud.clearleap.com/cloffice/client/'  202          'media' :  'http://search.yahoo.com/mrss/' ,  203          'clearleap' :  'http://www.clearleap.com/namespace/clearleap/1.0/' ,  205      _GEO_COUNTRIES 
= [ 'CA' ]  207      def  _call_api ( self
,  path
,  video_id
):  208          url 
=  path 
if  path
. startswith ( 'http' )  else  self
._ API
_ BASE
_U RL 
+  path
 209          result 
=  self
._ download
_ xml
( url
,  video_id
,  headers
={  210              'X-Clearleap-DeviceId' :  self
._ device
_ id
,  211              'X-Clearleap-DeviceToken' :  self
._ device
_ token
,  213          error_message 
=  xpath_text ( result
,  'userMessage' )  or  xpath_text ( result
,  'systemMessage' )  215              raise  ExtractorError ( ' %s  said:  %s '  % ( self
. IE_NAME
,  error_message
))  218      def  _real_initialize ( self
):  219          if not  self
._ device
_ id 
or not  self
._ device
_ token
:  220              device 
=  self
._ downloader
. cache
. load ( 'cbcwatch' ,  'device' )  or  {}  221              self
._ device
_ id
,  self
._ device
_ token 
=  device
. get ( 'id' ),  device
. get ( 'token' )  222              if not  self
._ device
_ id 
or not  self
._ device
_ token
:  223                  result 
=  self
._ download
_ xml
(  224                      self
._ API
_ BASE
_U RL 
+  'device/register' ,  225                      None ,  data
= b
'<device><type>web</type></device>' )  226                  self
._ device
_ id 
=  xpath_text ( result
,  'deviceId' ,  fatal
= True )  227                  self
._ device
_ token 
=  xpath_text ( result
,  'deviceToken' ,  fatal
= True )  228                  self
._ downloader
. cache
. store (  229                      'cbcwatch' ,  'device' , {  230                          'id' :  self
._ device
_ id
,  231                          'token' :  self
._ device
_ token
,  234      def  _parse_rss_feed ( self
,  rss
):  235          channel 
=  xpath_element ( rss
,  'channel' ,  fatal
= True )  238              return  xpath_with_ns ( path
,  self
._ NS
_ MAP
)  241          for  item 
in  channel
. findall ( 'item' ):  242              guid 
=  xpath_text ( item
,  'guid' ,  fatal
= True )  243              title 
=  xpath_text ( item
,  'title' ,  fatal
= True )  245              media_group 
=  xpath_element ( item
,  _add_ns ( 'media:group' ),  fatal
= True )  246              content 
=  xpath_element ( media_group
,  _add_ns ( 'media:content' ),  fatal
= True )  247              content_url 
=  content
. attrib
[ 'url' ]  250              for  thumbnail 
in  media_group
. findall ( _add_ns ( 'media:thumbnail' )):  251                  thumbnail_url 
=  thumbnail
. get ( 'url' )  252                  if not  thumbnail_url
:  255                      'id' :  thumbnail
. get ( 'profile' ),  256                      'url' :  thumbnail_url
,  257                      'width' :  int_or_none ( thumbnail
. get ( 'width' )),  258                      'height' :  int_or_none ( thumbnail
. get ( 'height' )),  262              release_date 
=  find_xpath_attr (  263                  item
,  _add_ns ( 'media:credit' ),  'role' ,  'releaseDate' )  264              if  release_date 
is not None :  265                  timestamp 
=  parse_iso8601 ( release_date
. text
)  268                  '_type' :  'url_transparent' ,  272                  'description' :  xpath_text ( item
,  'description' ),  273                  'timestamp' :  timestamp
,  274                  'duration' :  int_or_none ( content
. get ( 'duration' )),  275                  'age_limit' :  parse_age_limit ( xpath_text ( item
,  _add_ns ( 'media:rating' ))),  276                  'episode' :  xpath_text ( item
,  _add_ns ( 'clearleap:episode' )),  277                  'episode_number' :  int_or_none ( xpath_text ( item
,  _add_ns ( 'clearleap:episodeInSeason' ))),  278                  'series' :  xpath_text ( item
,  _add_ns ( 'clearleap:series' )),  279                  'season_number' :  int_or_none ( xpath_text ( item
,  _add_ns ( 'clearleap:season' ))),  280                  'thumbnails' :  thumbnails
,  281                  'ie_key' :  'CBCWatchVideo' ,  284          return  self
. playlist_result (  285              entries
,  xpath_text ( channel
,  'guid' ),  286              xpath_text ( channel
,  'title' ),  287              xpath_text ( channel
,  'description' ))  290  class  CBCWatchVideoIE ( CBCWatchBaseIE
):  291      IE_NAME 
=  'cbc.ca:watch:video'  292      _VALID_URL 
=  r
'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} )'  294          # geo-restricted to Canada, bypassable  295          'url' :  'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235' ,  296          'only_matching' :  True ,  299      def  _real_extract ( self
,  url
):  300          video_id 
=  self
._ match
_ id
( url
)  301          result 
=  self
._ call
_ api
( url
,  video_id
)  303          m3u8_url 
=  xpath_text ( result
,  'url' ,  fatal
= True )  304          formats 
=  self
._ extract
_ m
3u8_ formats
( re
. sub ( r
'/([^/]+)/[^/?]+\.m3u8' ,  r
'/\1/\1.m3u8' ,  m3u8_url
),  video_id
,  'mp4' ,  fatal
= False )  306              formats 
=  self
._ extract
_ m
3u8_ formats
( m3u8_url
,  video_id
,  'mp4' )  308              format_id 
=  f
. get ( 'format_id' )  309              if  format_id
. startswith ( 'AAC' ):  311              elif  format_id
. startswith ( 'AC3' ):  313          self
._ sort
_ formats
( formats
)  321          rss 
=  xpath_element ( result
,  'rss' )  323              info
. update ( self
._ parse
_ rss
_ feed
( rss
)[ 'entries' ][ 0 ])  330  class  CBCWatchIE ( CBCWatchBaseIE
):  331      IE_NAME 
=  'cbc.ca:watch'  332      _VALID_URL 
=  r
'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'  334          # geo-restricted to Canada, bypassable  335          'url' :  'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4' ,  337              'id' :  '9673749a-5e77-484c-8b62-a1092a6b5168' ,  339              'title' :  'Customer (Dis)Service' ,  340              'description' :  'md5:8bdd6913a0fe03d4b2a17ebe169c7c87' ,  341              'upload_date' :  '20160219' ,  342              'timestamp' :  1455840000 ,  346              'skip_download' :  True ,  347              'format' :  'bestvideo' ,  350          # geo-restricted to Canada, bypassable  351          'url' :  'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057' ,  353              'id' :  '1ed4b385-cd84-49cf-95f0-80f004680057' ,  355              'description' :  'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.' ,  357          'playlist_mincount' :  30 ,  360      def  _real_extract ( self
,  url
):  361          video_id 
=  self
._ match
_ id
( url
)  362          rss 
=  self
._ call
_ api
( 'web/browse/'  +  video_id
,  video_id
)  363          return  self
._ parse
_ rss
_ feed
( rss
)  366  class  CBCOlympicsIE ( InfoExtractor
):  367      IE_NAME 
=  'cbc.ca:olympics'  368      _VALID_URL 
=  r
'https?://olympics\.cbc\.ca/video/[^/]+/(?P<id>[^/?#]+)'  370          'url' :  'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/' ,  371          'only_matching' :  True ,  374      def  _real_extract ( self
,  url
):  375          display_id 
=  self
._ match
_ id
( url
)  376          webpage 
=  self
._ download
_ webpage
( url
,  display_id
)  377          video_id 
=  self
._ hidden
_ inputs
( webpage
)[ 'videoId' ]  378          video_doc 
=  self
._ download
_ xml
(  379              'https://olympics.cbc.ca/videodata/ %s .xml'  %  video_id
,  video_id
)  380          title 
=  xpath_text ( video_doc
,  'title' ,  fatal
= True )  381          is_live 
=  xpath_text ( video_doc
,  'kind' ) ==  'Live'  383              title 
=  self
._l ive
_ title
( title
)  386          for  video_source 
in  video_doc
. findall ( 'videoSources/videoSource' ):  387              uri 
=  xpath_text ( video_source
,  'uri' )  390              tokenize 
=  self
._ download
_ json
(  391                  'https://olympics.cbc.ca/api/api-akamai/tokenize' ,  392                  video_id
,  data
= json
. dumps ({  394                  }). encode (),  headers
={  395                      'Content-Type' :  'application/json' ,  397                      # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js  398                      'Cookie' :  '_dvp=TK:C0ObxjerU' ,   # AKAMAI CDN cookie  402              content_url 
=  tokenize
[ 'ContentUrl' ]  403              video_source_format 
=  video_source
. get ( 'format' )  404              if  video_source_format 
==  'IIS' :  405                  formats
. extend ( self
._ extract
_ ism
_ formats
(  406                      content_url
,  video_id
,  ism_id
= video_source_format
,  fatal
= False ))  408                  formats
. extend ( self
._ extract
_ m
3u8_ formats
(  409                      content_url
,  video_id
,  'mp4' ,  410                      'm3u8'  if  is_live 
else  'm3u8_native' ,  411                      m3u8_id
= video_source_format
,  fatal
= False ))  412          self
._ sort
_ formats
( formats
)  416              'display_id' :  display_id
,  418              'description' :  xpath_text ( video_doc
,  'description' ),  419              'thumbnail' :  xpath_text ( video_doc
,  'thumbnailUrl' ),  420              'duration' :  parse_duration ( xpath_text ( video_doc
,  'duration' )),