Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/canvas.py

from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor
from .gigya import GigyaBaseIE
from ..compat import compat_HTTPError
from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
    merge_dicts,
    parse_iso8601,
    strip_or_none,
    unified_strdate,
)
  17
  18
  19 class CanvasIE(InfoExtractor):
  20     _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
  21     _TESTS = [{
  22         'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  23         'md5': '90139b746a0a9bd7bb631283f6e2a64e',
  24         'info_dict': {
  25             'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  26             'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  27             'ext': 'flv',
  28             'title': 'Nachtwacht: De Greystook',
  29             'description': 'md5:1db3f5dc4c7109c821261e7512975be7',
  30             'thumbnail': r're:^https?://.*\.jpg$',
  31             'duration': 1468.03,
  32         },
  33         'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
  34     }, {
  35         'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
  36         'only_matching': True,
  37     }]
  38     _HLS_ENTRY_PROTOCOLS_MAP = {
  39         'HLS': 'm3u8_native',
  40         'HLS_AES': 'm3u8',
  41     }
  42
  43     def _real_extract(self, url):
  44         mobj = re.match(self._VALID_URL, url)
  45         site_id, video_id = mobj.group('site_id'), mobj.group('id')
  46
  47         data = self._download_json(
  48             'https://mediazone.vrt.be/api/v1/%s/assets/%s'
  49             % (site_id, video_id), video_id)
  50
  51         title = data['title']
  52         description = data.get('description')
  53
  54         formats = []
  55         for target in data['targetUrls']:
  56             format_url, format_type = target.get('url'), target.get('type')
  57             if not format_url or not format_type:
  58                 continue
  59             if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
  60                 formats.extend(self._extract_m3u8_formats(
  61                     format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
  62                     m3u8_id=format_type, fatal=False))
  63             elif format_type == 'HDS':
  64                 formats.extend(self._extract_f4m_formats(
  65                     format_url, video_id, f4m_id=format_type, fatal=False))
  66             elif format_type == 'MPEG_DASH':
  67                 formats.extend(self._extract_mpd_formats(
  68                     format_url, video_id, mpd_id=format_type, fatal=False))
  69             elif format_type == 'HSS':
  70                 formats.extend(self._extract_ism_formats(
  71                     format_url, video_id, ism_id='mss', fatal=False))
  72             else:
  73                 formats.append({
  74                     'format_id': format_type,
  75                     'url': format_url,
  76                 })
  77         self._sort_formats(formats)
  78
  79         subtitles = {}
  80         subtitle_urls = data.get('subtitleUrls')
  81         if isinstance(subtitle_urls, list):
  82             for subtitle in subtitle_urls:
  83                 subtitle_url = subtitle.get('url')
  84                 if subtitle_url and subtitle.get('type') == 'CLOSED':
  85                     subtitles.setdefault('nl', []).append({'url': subtitle_url})
  86
  87         return {
  88             'id': video_id,
  89             'display_id': video_id,
  90             'title': title,
  91             'description': description,
  92             'formats': formats,
  93             'duration': float_or_none(data.get('duration'), 1000),
  94             'thumbnail': data.get('posterImageUrl'),
  95             'subtitles': subtitles,
  96         }
  97
  98
  99 class CanvasEenIE(InfoExtractor):
 100     IE_DESC = 'canvas.be and een.be'
 101     _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 102     _TESTS = [{
 103         'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
 104         'md5': 'ed66976748d12350b118455979cca293',
 105         'info_dict': {
 106             'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
 107             'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
 108             'ext': 'flv',
 109             'title': 'De afspraak veilt voor de Warmste Week',
 110             'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
 111             'thumbnail': r're:^https?://.*\.jpg$',
 112             'duration': 49.02,
 113         },
 114         'expected_warnings': ['is not a supported codec'],
 115     }, {
 116         # with subtitles
 117         'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
 118         'info_dict': {
 119             'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
 120             'display_id': 'pieter-0167',
 121             'ext': 'mp4',
 122             'title': 'Pieter 0167',
 123             'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
 124             'thumbnail': r're:^https?://.*\.jpg$',
 125             'duration': 2553.08,
 126             'subtitles': {
 127                 'nl': [{
 128                     'ext': 'vtt',
 129                 }],
 130             },
 131         },
 132         'params': {
 133             'skip_download': True,
 134         },
 135         'skip': 'Pagina niet gevonden',
 136     }, {
 137         'url': 'https://www.een.be/sorry-voor-alles/herbekijk-sorry-voor-alles',
 138         'info_dict': {
 139             'id': 'mz-ast-11a587f8-b921-4266-82e2-0bce3e80d07f',
 140             'display_id': 'herbekijk-sorry-voor-alles',
 141             'ext': 'mp4',
 142             'title': 'Herbekijk Sorry voor alles',
 143             'description': 'md5:8bb2805df8164e5eb95d6a7a29dc0dd3',
 144             'thumbnail': r're:^https?://.*\.jpg$',
 145             'duration': 3788.06,
 146         },
 147         'params': {
 148             'skip_download': True,
 149         },
 150         'skip': 'Episode no longer available',
 151     }, {
 152         'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
 153         'only_matching': True,
 154     }]
 155
 156     def _real_extract(self, url):
 157         mobj = re.match(self._VALID_URL, url)
 158         site_id, display_id = mobj.group('site_id'), mobj.group('id')
 159
 160         webpage = self._download_webpage(url, display_id)
 161
 162         title = strip_or_none(self._search_regex(
 163             r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
 164             webpage, 'title', default=None) or self._og_search_title(
 165             webpage, default=None))
 166
 167         video_id = self._html_search_regex(
 168             r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
 169             group='id')
 170
 171         return {
 172             '_type': 'url_transparent',
 173             'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id),
 174             'ie_key': CanvasIE.ie_key(),
 175             'id': video_id,
 176             'display_id': display_id,
 177             'title': title,
 178             'description': self._og_search_description(webpage),
 179         }
 180
 181
 182 class VrtNUIE(GigyaBaseIE):
 183     IE_DESC = 'VrtNU.be'
 184     _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 185     _TESTS = [{
 186         'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/',
 187         'info_dict': {
 188             'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
 189             'ext': 'flv',
 190             'title': 'De zwarte weduwe',
 191             'description': 'md5:d90c21dced7db869a85db89a623998d4',
 192             'duration': 1457.04,
 193             'thumbnail': r're:^https?://.*\.jpg$',
 194             'season': '1',
 195             'season_number': 1,
 196             'episode_number': 1,
 197         },
 198         'skip': 'This video is only available for registered users'
 199     }]
 200     _NETRC_MACHINE = 'vrtnu'
 201     _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
 202     _CONTEXT_ID = 'R3595707040'
 203
 204     def _real_initialize(self):
 205         self._login()
 206
 207     def _login(self):
 208         username, password = self._get_login_info()
 209         if username is None:
 210             return
 211
 212         auth_data = {
 213             'APIKey': self._APIKEY,
 214             'targetEnv': 'jssdk',
 215             'loginID': username,
 216             'password': password,
 217             'authMode': 'cookie',
 218         }
 219
 220         auth_info = self._gigya_login(auth_data)
 221
 222         # Sometimes authentication fails for no good reason, retry
 223         login_attempt = 1
 224         while login_attempt <= 3:
 225             try:
 226                 # When requesting a token, no actual token is returned, but the
 227                 # necessary cookies are set.
 228                 self._request_webpage(
 229                     'https://token.vrt.be',
 230                     None, note='Requesting a token', errnote='Could not get a token',
 231                     headers={
 232                         'Content-Type': 'application/json',
 233                         'Referer': 'https://www.vrt.be/vrtnu/',
 234                     },
 235                     data=json.dumps({
 236                         'uid': auth_info['UID'],
 237                         'uidsig': auth_info['UIDSignature'],
 238                         'ts': auth_info['signatureTimestamp'],
 239                         'email': auth_info['profile']['email'],
 240                     }).encode('utf-8'))
 241             except ExtractorError as e:
 242                 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
 243                     login_attempt += 1
 244                     self.report_warning('Authentication failed')
 245                     self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
 246                 else:
 247                     raise e
 248             else:
 249                 break
 250
 251     def _real_extract(self, url):
 252         display_id = self._match_id(url)
 253
 254         webpage, urlh = self._download_webpage_handle(url, display_id)
 255
 256         info = self._search_json_ld(webpage, display_id, default={})
 257
 258         # title is optional here since it may be extracted by extractor
 259         # that is delegated from here
 260         title = strip_or_none(self._html_search_regex(
 261             r'(?ms)<h1 class="content__heading">(.+?)</h1>',
 262             webpage, 'title', default=None))
 263
 264         description = self._html_search_regex(
 265             r'(?ms)<div class="content__description">(.+?)</div>',
 266             webpage, 'description', default=None)
 267
 268         season = self._html_search_regex(
 269             [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s*
 270                     <span>seizoen\ (.+?)</span>\s*
 271                 </div>''',
 272              r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'],
 273             webpage, 'season', default=None)
 274
 275         season_number = int_or_none(season)
 276
 277         episode_number = int_or_none(self._html_search_regex(
 278             r'''(?xms)<div\ class="content__episode">\s*
 279                     <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span>
 280                 </div>''',
 281             webpage, 'episode_number', default=None))
 282
 283         release_date = parse_iso8601(self._html_search_regex(
 284             r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"',
 285             webpage, 'release_date', default=None))
 286
 287         # If there's a ? or a # in the URL, remove them and everything after
 288         clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/')
 289         securevideo_url = clean_url + '.mssecurevideo.json'
 290
 291         try:
 292             video = self._download_json(securevideo_url, display_id)
 293         except ExtractorError as e:
 294             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
 295                 self.raise_login_required()
 296             raise
 297
 298         # We are dealing with a '../<show>.relevant' URL
 299         redirect_url = video.get('url')
 300         if redirect_url:
 301             return self.url_result(self._proto_relative_url(redirect_url, 'https:'))
 302
 303         # There is only one entry, but with an unknown key, so just get
 304         # the first one
 305         video_id = list(video.values())[0].get('videoid')
 306
 307         return merge_dicts(info, {
 308             '_type': 'url_transparent',
 309             'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
 310             'ie_key': CanvasIE.ie_key(),
 311             'id': video_id,
 312             'display_id': display_id,
 313             'title': title,
 314             'description': description,
 315             'season': season,
 316             'season_number': season_number,
 317             'episode_number': episode_number,
 318             'release_date': release_date,
 319         })