2 from __future__
import unicode_literals
7 from .common
import InfoExtractor
30 class CBCIE(InfoExtractor
):
32 _VALID_URL
= r
'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'
35 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs',
36 'md5': '97e24d09672fc4cf56256d6faa6c25bc',
40 'title': 'Don Cherry – All-Stars',
41 'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.',
42 'timestamp': 1454463000,
43 'upload_date': '20160203',
44 'uploader': 'CBCC-NEW',
46 'skip': 'Geo-restricted to Canada',
48 # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com
49 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4',
50 'md5': '162adfa070274b144f4fdc3c3b8207db',
54 'title': '22 Minutes Update: What Not To Wear Quebec',
55 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.",
56 'upload_date': '20131025',
57 'uploader': 'CBCC-NEW',
58 'timestamp': 1382717907,
61 # with clipId, feed only available via tpfeed.cbc.ca
62 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
63 'md5': '0274a90b51a9b4971fe005c63f592f12',
67 'title': 'Robin Williams freestyles on 90 Minutes Live',
68 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.',
69 'upload_date': '19780210',
70 'uploader': 'CBCC-NEW',
71 'timestamp': 255977160,
75 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',
77 'md5': '377572d0b49c4ce0c9ad77470e0b96b4',
81 'title': 'An Eagle\'s-Eye View Off Burrard Bridge',
82 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.',
83 'upload_date': '20160201',
84 'timestamp': 1454342820,
85 'uploader': 'CBCC-NEW',
88 'md5': '415a0e3f586113894174dfb31aa5bb1a',
92 'title': 'Fly like an eagle!',
93 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower',
94 'upload_date': '20150315',
95 'timestamp': 1426443984,
96 'uploader': 'CBCC-NEW',
99 'skip': 'Geo-restricted to Canada',
101 # multiple CBC.APP.Caffeine.initInstance(...)
102 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238',
104 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',
105 'id': 'dog-indoor-exercise-winter-1.3928238',
106 'description': 'md5:c18552e41726ee95bd75210d1ca9194c',
108 'playlist_mincount': 6,
def suitable(cls, url):
    """Decide whether this extractor should handle *url*.

    Any URL that ``CBCPlayerIE`` accepts is rejected here so that the
    player extractor takes it; every other URL is judged by the parent
    class's ``suitable``.
    """
    if CBCPlayerIE.suitable(url):
        return False
    return super(CBCIE, cls).suitable(url)
115 def _extract_player_init(self
, player_init
, display_id
):
116 player_info
= self
._parse
_json
(player_init
, display_id
, js_to_json
)
117 media_id
= player_info
.get('mediaId')
119 clip_id
= player_info
['clipId']
120 feed
= self
._download
_json
(
121 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id
,
122 clip_id
, fatal
=False)
124 media_id
= try_get(feed
, lambda x
: x
['entries'][0]['guid'], compat_str
)
126 media_id
= self
._download
_json
(
127 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id
,
128 clip_id
)['entries'][0]['id'].split('/')[-1]
129 return self
.url_result('cbcplayer:%s' % media_id
, 'CBCPlayer', media_id
)
131 def _real_extract(self
, url
):
132 display_id
= self
._match
_id
(url
)
133 webpage
= self
._download
_webpage
(url
, display_id
)
134 title
= self
._og
_search
_title
(webpage
, default
=None) or self
._html
_search
_meta
(
135 'twitter:title', webpage
, 'title', default
=None) or self
._html
_search
_regex
(
136 r
'<title>([^<]+)</title>', webpage
, 'title', fatal
=False)
138 self
._extract
_player
_init
(player_init
, display_id
)
139 for player_init
in re
.findall(r
'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage
)]
142 r
'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
143 r
'<div[^>]+\bid=["\']player
-(\d
+)',
144 r'guid
["\']\s*:\s*["\'](\d
+)'):
145 media_ids.extend(re.findall(media_id_re, webpage))
147 self.url_result('cbcplayer
:%s' % media_id, 'CBCPlayer
', media_id)
148 for media_id in orderedSet(media_ids)])
149 return self.playlist_result(
150 entries, display_id, strip_or_none(title),
151 self._og_search_description(webpage))
154 class CBCPlayerIE(InfoExtractor):
155 IE_NAME = 'cbc
.ca
:player
'
156 _VALID_URL = r'(?
:cbcplayer
:|https?
://(?
:www\
.)?cbc\
.ca
/(?
:player
/play
/|i
/caffeine
/syndicate
/\?mediaId
=))(?P
<id>\d
+)'
158 'url
': 'http
://www
.cbc
.ca
/player
/play
/2683190193',
159 'md5
': '64d25f841ddf4ddb28a235338af32e2c
',
163 'title
': 'Gerry Runs a Sweat Shop
',
164 'description
': 'md5
:b457e1c01e8ff408d9d801c1c2cd29b0
',
165 'timestamp
': 1455071400,
166 'upload_date
': '20160210',
167 'uploader
': 'CBCC
-NEW
',
169 'skip
': 'Geo
-restricted to Canada
',
171 # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
172 'url
': 'http
://www
.cbc
.ca
/player
/play
/2657631896',
173 'md5
': 'e5e708c34ae6fca156aafe17c43e8b75
',
177 'title
': 'CBC Montreal
is organizing its first ever community hackathon
!',
178 'description
': 'The modern technology we tend to depend on so heavily
, is never without it
\'s share of hiccups
and headaches
. Next weekend
- CBC Montreal will be getting members of the public
for its first Hackathon
.',
179 'timestamp
': 1425704400,
180 'upload_date
': '20150307',
181 'uploader
': 'CBCC
-NEW
',
184 'url
': 'http
://www
.cbc
.ca
/player
/play
/2164402062',
185 'md5
': '33fcd8f6719b9dd60a5e73adcb83b9f6
',
189 'title
': 'Cancer survivor four times over
',
190 'description
': 'Tim Mayer has beaten three different forms of cancer four times
in five years
.',
191 'timestamp
': 1320410746,
192 'upload_date
': '20111104',
193 'uploader
': 'CBCC
-NEW
',
197 def _real_extract(self, url):
198 video_id = self._match_id(url)
200 '_type
': 'url_transparent
',
201 'ie_key
': 'ThePlatform
',
203 'http
://link
.theplatform
.com
/s
/ExhSPC
/media
/guid
/2655402169/%s?mbr
=true
&formats
=MPEG4
,FLV
,MP3
' % video_id, {
204 'force_smil_url
': True
210 class CBCWatchBaseIE(InfoExtractor):
213 _API_BASE_URL = 'https
://api
-cbc
.cloud
.clearleap
.com
/cloffice
/client
/'
215 'media
': 'http
://search
.yahoo
.com
/mrss
/',
216 'clearleap
': 'http
://www
.clearleap
.com
/namespace
/clearleap
/1.0/',
218 _GEO_COUNTRIES = ['CA
']
220 def _call_api(self, path, video_id):
221 url = path if path.startswith('http
') else self._API_BASE_URL + path
224 result = self._download_xml(url, video_id, headers={
225 'X
-Clearleap
-DeviceId
': self._device_id,
226 'X
-Clearleap
-DeviceToken
': self._device_token,
228 except ExtractorError as e:
229 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
230 # Device token has expired, re-acquiring device token
231 self._register_device()
234 error_message = xpath_text(result, 'userMessage
') or xpath_text(result, 'systemMessage
')
236 raise ExtractorError('%s said
: %s' % (self.IE_NAME, error_message))
239 def _real_initialize(self):
240 if self._valid_device_token():
242 device = self._downloader.cache.load('cbcwatch
', 'device
') or {}
243 self._device_id, self._device_token = device.get('id'), device.get('token
')
244 if self._valid_device_token():
246 self._register_device()
248 def _valid_device_token(self):
249 return self._device_id and self._device_token
def _register_device(self):
    """Register a new device with the API and remember its credentials.

    Clears the current id/token, posts a device description to
    ``device/register`` under ``_API_BASE_URL``, reads ``deviceId`` and
    ``deviceToken`` from the XML reply (both looked up with
    ``fatal=True``), and stores the pair in the downloader cache under
    ('cbcwatch', 'device').
    """
    # Clear the old pair first so a failed registration leaves no stale
    # credentials on the instance.
    self._device_id = self._device_token = None
    result = self._download_xml(
        self._API_BASE_URL + 'device/register',
        None, 'Acquiring device token',
        data=b'<device><type>web</type></device>')
    self._device_id = xpath_text(result, 'deviceId', fatal=True)
    self._device_token = xpath_text(result, 'deviceToken', fatal=True)
    # Persist the pair for later runs; _real_initialize reads these keys.
    self._downloader.cache.store(
        'cbcwatch', 'device', {
            'id': self._device_id,
            'token': self._device_token,
        })
265 def _parse_rss_feed(self, rss):
266 channel = xpath_element(rss, 'channel
', fatal=True)
269 return xpath_with_ns(path, self._NS_MAP)
272 for item in channel.findall('item
'):
273 guid = xpath_text(item, 'guid
', fatal=True)
274 title = xpath_text(item, 'title
', fatal=True)
276 media_group = xpath_element(item, _add_ns('media
:group
'), fatal=True)
277 content = xpath_element(media_group, _add_ns('media
:content
'), fatal=True)
278 content_url = content.attrib['url
']
281 for thumbnail in media_group.findall(_add_ns('media
:thumbnail
')):
282 thumbnail_url = thumbnail.get('url
')
283 if not thumbnail_url:
286 'id': thumbnail.get('profile
'),
287 'url
': thumbnail_url,
288 'width
': int_or_none(thumbnail.get('width
')),
289 'height
': int_or_none(thumbnail.get('height
')),
293 release_date = find_xpath_attr(
294 item, _add_ns('media
:credit
'), 'role
', 'releaseDate
')
295 if release_date is not None:
296 timestamp = parse_iso8601(release_date.text)
299 '_type
': 'url_transparent
',
303 'description
': xpath_text(item, 'description
'),
304 'timestamp
': timestamp,
305 'duration
': int_or_none(content.get('duration
')),
306 'age_limit
': parse_age_limit(xpath_text(item, _add_ns('media
:rating
'))),
307 'episode
': xpath_text(item, _add_ns('clearleap
:episode
')),
308 'episode_number
': int_or_none(xpath_text(item, _add_ns('clearleap
:episodeInSeason
'))),
309 'series
': xpath_text(item, _add_ns('clearleap
:series
')),
310 'season_number
': int_or_none(xpath_text(item, _add_ns('clearleap
:season
'))),
311 'thumbnails
': thumbnails,
312 'ie_key
': 'CBCWatchVideo
',
315 return self.playlist_result(
316 entries, xpath_text(channel, 'guid
'),
317 xpath_text(channel, 'title
'),
318 xpath_text(channel, 'description
'))
321 class CBCWatchVideoIE(CBCWatchBaseIE):
322 IE_NAME = 'cbc
.ca
:watch
:video
'
323 _VALID_URL = r'https?
://api
-cbc\
.cloud\
.clearleap\
.com
/cloffice
/client
/web
/play
/?
\?.*?
\bcontentId
=(?P
<id>[\da
-f
]{8}
-[\da
-f
]{4}
-[\da
-f
]{4}
-[\da
-f
]{4}
-[\da
-f
]{12}
)'
325 # geo-restricted to Canada, bypassable
326 'url
': 'https
://api
-cbc
.cloud
.clearleap
.com
/cloffice
/client
/web
/play
/?contentId
=3c84472a
-1eea
-4dee
-9267-2655d5055dcf
&categoryId
=ebc258f5
-ee40
-4cca
-b66b
-ba6bd55b7235
',
327 'only_matching
': True,
330 def _real_extract(self, url):
331 video_id = self._match_id(url)
332 result = self._call_api(url, video_id)
334 m3u8_url = xpath_text(result, 'url
', fatal=True)
335 formats = self._extract_m3u8_formats(re.sub(r'/([^
/]+)/[^
/?
]+\
.m3u8
', r'/\
1/\
1.m3u8
', m3u8_url), video_id, 'mp4
', fatal=False)
337 formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4
')
339 format_id = f.get('format_id
')
340 if format_id.startswith('AAC
'):
342 elif format_id.startswith('AC3
'):
344 self._sort_formats(formats)
352 rss = xpath_element(result, 'rss
')
354 info.update(self._parse_rss_feed(rss)['entries
'][0])
361 class CBCWatchIE(CBCWatchBaseIE):
362 IE_NAME = 'cbc
.ca
:watch
'
363 _VALID_URL = r'https?
://watch\
.cbc\
.ca
/(?
:[^
/]+/)+(?P
<id>[0-9a
-f
-]+)'
365 # geo-restricted to Canada, bypassable
366 'url
': 'http
://watch
.cbc
.ca
/doc
-zone
/season
-6/customer
-disservice
/38e815a
-009e3ab12e4
',
368 'id': '9673749a
-5e77
-484c
-8b62
-a1092a6b5168
',
370 'title
': 'Customer (Dis
)Service
',
371 'description
': 'md5
:8bdd6913a0fe03d4b2a17ebe169c7c87
',
372 'upload_date
': '20160219',
373 'timestamp
': 1455840000,
377 'skip_download
': True,
378 'format
': 'bestvideo
',
381 # geo-restricted to Canada, bypassable
382 'url
': 'http
://watch
.cbc
.ca
/arthur
/all
/1ed4b385
-cd84
-49cf
-95f0
-80f004680057
',
384 'id': '1ed4b385
-cd84
-49cf
-95f0
-80f004680057
',
386 'description
': 'Arthur
, the sweetest
8-year
-old aardvark
, and his pals solve all kinds of problems
with humour
, kindness
and teamwork
.',
388 'playlist_mincount
': 30,
391 def _real_extract(self, url):
392 video_id = self._match_id(url)
393 rss = self._call_api('web
/browse
/' + video_id, video_id)
394 return self._parse_rss_feed(rss)
397 class CBCOlympicsIE(InfoExtractor):
398 IE_NAME = 'cbc
.ca
:olympics
'
399 _VALID_URL = r'https?
://olympics\
.cbc\
.ca
/video
/[^
/]+/(?P
<id>[^
/?
#]+)'
401 'url': 'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/',
402 'only_matching': True,
405 def _real_extract(self
, url
):
406 display_id
= self
._match
_id
(url
)
407 webpage
= self
._download
_webpage
(url
, display_id
)
408 video_id
= self
._hidden
_inputs
(webpage
)['videoId']
409 video_doc
= self
._download
_xml
(
410 'https://olympics.cbc.ca/videodata/%s.xml' % video_id
, video_id
)
411 title
= xpath_text(video_doc
, 'title', fatal
=True)
412 is_live
= xpath_text(video_doc
, 'kind') == 'Live'
414 title
= self
._live
_title
(title
)
417 for video_source
in video_doc
.findall('videoSources/videoSource'):
418 uri
= xpath_text(video_source
, 'uri')
421 tokenize
= self
._download
_json
(
422 'https://olympics.cbc.ca/api/api-akamai/tokenize',
423 video_id
, data
=json
.dumps({
425 }).encode(), headers
={
426 'Content-Type': 'application/json',
428 # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js
429 'Cookie': '_dvp=TK:C0ObxjerU', # AKAMAI CDN cookie
433 content_url
= tokenize
['ContentUrl']
434 video_source_format
= video_source
.get('format')
435 if video_source_format
== 'IIS':
436 formats
.extend(self
._extract
_ism
_formats
(
437 content_url
, video_id
, ism_id
=video_source_format
, fatal
=False))
439 formats
.extend(self
._extract
_m
3u8_formats
(
440 content_url
, video_id
, 'mp4',
441 'm3u8' if is_live
else 'm3u8_native',
442 m3u8_id
=video_source_format
, fatal
=False))
443 self
._sort
_formats
(formats
)
447 'display_id': display_id
,
449 'description': xpath_text(video_doc
, 'description'),
450 'thumbnail': xpath_text(video_doc
, 'thumbnailUrl'),
451 'duration': parse_duration(xpath_text(video_doc
, 'duration')),