Raphaël G. Git Repositories - youtubedl/blobdiff - youtube_dl/extractor/radiocanada.py
Prepare to upload.
[youtubedl] / youtube_dl / extractor / radiocanada.py
index 8ec402646767a22f8a1e7cedbc89eb8576b59804..a28b1a24c4df1ea7809778b36319f06a580a68e2 100644 (file)
@@ -4,117 +4,133 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_HTTPError
 from ..utils import (
 from ..utils import (
-    xpath_text,
-    find_xpath_attr,
     determine_ext,
     determine_ext,
+    ExtractorError,
     int_or_none,
     unified_strdate,
     int_or_none,
     unified_strdate,
-    xpath_element,
-    ExtractorError,
-    determine_protocol,
 )
 
 
 class RadioCanadaIE(InfoExtractor):
     IE_NAME = 'radiocanada'
     _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)'
 )
 
 
 class RadioCanadaIE(InfoExtractor):
     IE_NAME = 'radiocanada'
     _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
-        'info_dict': {
-            'id': '7184272',
-            'ext': 'mp4',
-            'title': 'Le parcours du tireur capté sur vidéo',
-            'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
-            'upload_date': '20141023',
+    _TESTS = [
+        {
+            'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
+            'info_dict': {
+                'id': '7184272',
+                'ext': 'mp4',
+                'title': 'Le parcours du tireur capté sur vidéo',
+                'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
+                'upload_date': '20141023',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            }
         },
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
+        {
+            # empty Title
+            'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/',
+            'info_dict': {
+                'id': '7754998',
+                'ext': 'mp4',
+                'title': 'letelejournal22h',
+                'description': 'INTEGRALE WEB 22H-TJ',
+                'upload_date': '20170720',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
         },
         },
-    }
-
-    def _real_extract(self, url):
-        app_code, video_id = re.match(self._VALID_URL, url).groups()
-
-        device_types = ['ipad', 'android']
-        if app_code != 'toutv':
-            device_types.append('flash')
-
-        formats = []
-        # TODO: extract f4m formats
-        # f4m formats can be extracted using flashhd device_type but they produce unplayable file
-        for device_type in device_types:
-            v_data = self._download_xml(
-                'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx',
-                video_id, note='Downloading %s XML' % device_type, query={
-                    'appCode': app_code,
-                    'idMedia': video_id,
-                    'connectionType': 'broadband',
-                    'multibitrate': 'true',
-                    'deviceType': device_type,
-                    # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction
-                    'paysJ391wsHjbOJwvCs26toz': 'CA',
-                    'bypasslock': 'NZt5K62gRqfc',
-                }, fatal=False)
-            v_url = xpath_text(v_data, 'url')
-            if not v_url:
-                continue
-            if v_url == 'null':
-                raise ExtractorError('%s said: %s' % (
-                    self.IE_NAME, xpath_text(v_data, 'message')), expected=True)
-            ext = determine_ext(v_url)
-            if ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    v_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
-            elif ext == 'f4m':
-                formats.extend(self._extract_f4m_formats(
-                    v_url, video_id, f4m_id='hds', fatal=False))
-            else:
-                ext = determine_ext(v_url)
-                bitrates = xpath_element(v_data, 'bitrates')
-                for url_e in bitrates.findall('url'):
-                    tbr = int_or_none(url_e.get('bitrate'))
-                    if not tbr:
-                        continue
-                    f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url)
-                    protocol = determine_protocol({'url': f_url})
-                    formats.append({
-                        'format_id': '%s-%d' % (protocol, tbr),
-                        'url': f_url,
-                        'ext': 'flv' if protocol == 'rtmp' else ext,
-                        'protocol': protocol,
-                        'width': int_or_none(url_e.get('width')),
-                        'height': int_or_none(url_e.get('height')),
-                        'tbr': tbr,
-                    })
-                    if protocol == 'rtsp':
-                        base_url = self._search_regex(
-                            r'rtsp://([^?]+)', f_url, 'base url', default=None)
-                        if base_url:
-                            base_url = 'http://' + base_url
-                            formats.extend(self._extract_m3u8_formats(
-                                base_url + '/playlist.m3u8', video_id, 'mp4',
-                                'm3u8_native', m3u8_id='hls', fatal=False))
-                            formats.extend(self._extract_f4m_formats(
-                                base_url + '/manifest.f4m', video_id,
-                                f4m_id='hds', fatal=False))
-        self._sort_formats(formats)
+        {
+            # with protectionType but not actually DRM protected
+            'url': 'radiocanada:toutv:140872',
+            'info_dict': {
+                'id': '140872',
+                'title': 'Épisode 1',
+                'series': 'District 31',
+            },
+            'only_matching': True,
+        }
+    ]
+    _GEO_COUNTRIES = ['CA']
+    _access_token = None
+    _claims = None
 
 
-        metadata = self._download_xml(
-            'http://api.radio-canada.ca/metaMedia/v1/index.ashx',
-            video_id, note='Downloading metadata XML', query={
+    def _call_api(self, path, video_id=None, app_code=None, query=None):
+        if not query:
+            query = {}
+        query.update({
+            'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb',
+            'output': 'json',
+        })
+        if video_id:
+            query.update({
                 'appCode': app_code,
                 'idMedia': video_id,
             })
                 'appCode': app_code,
                 'idMedia': video_id,
             })
+        if self._access_token:
+            query['access_token'] = self._access_token
+        try:
+            return self._download_json(
+                'https://services.radio-canada.ca/media/' + path, video_id, query=query)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422):
+                data = self._parse_json(e.cause.read().decode(), None)
+                error = data.get('error_description') or data['errorMessage']['text']
+                raise ExtractorError(error, expected=True)
+            raise
+
+    def _extract_info(self, app_code, video_id):
+        metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas']
 
         def get_meta(name):
 
         def get_meta(name):
-            el = find_xpath_attr(metadata, './/Meta', 'name', name)
-            return el.text if el is not None else None
+            for meta in metas:
+                if meta.get('name') == name:
+                    text = meta.get('text')
+                    if text:
+                        return text
+
+        # protectionType does not necessarily mean the video is DRM protected (see
+        # https://github.com/ytdl-org/youtube-dl/pull/18609).
+        if get_meta('protectionType'):
+            self.report_warning('This video is probably DRM protected.')
+
+        query = {
+            'connectionType': 'hd',
+            'deviceType': 'ipad',
+            'multibitrate': 'true',
+        }
+        if self._claims:
+            query['claims'] = self._claims
+        v_data = self._call_api('validation/v2/', video_id, app_code, query)
+        v_url = v_data.get('url')
+        if not v_url:
+            error = v_data['message']
+            if error == "Le contenu sélectionné n'est pas disponible dans votre pays":
+                raise self.raise_geo_restricted(error, self._GEO_COUNTRIES)
+            if error == 'Le contenu sélectionné est disponible seulement en premium':
+                self.raise_login_required(error)
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error), expected=True)
+        formats = self._extract_m3u8_formats(v_url, video_id, 'mp4')
+        self._sort_formats(formats)
+
+        subtitles = {}
+        closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5')
+        if closed_caption_url:
+            subtitles['fr'] = [{
+                'url': closed_caption_url,
+                'ext': determine_ext(closed_caption_url, 'vtt'),
+            }]
 
         return {
             'id': video_id,
 
         return {
             'id': video_id,
-            'title': get_meta('Title'),
+            'title': get_meta('Title') or get_meta('AV-nomEmission'),
             'description': get_meta('Description') or get_meta('ShortDescription'),
             'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
             'duration': int_or_none(get_meta('length')),
             'description': get_meta('Description') or get_meta('ShortDescription'),
             'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
             'duration': int_or_none(get_meta('length')),
@@ -122,14 +138,18 @@ class RadioCanadaIE(InfoExtractor):
             'season_number': int_or_none('SrcSaison'),
             'episode_number': int_or_none('SrcEpisode'),
             'upload_date': unified_strdate(get_meta('Date')),
             'season_number': int_or_none('SrcSaison'),
             'episode_number': int_or_none('SrcEpisode'),
             'upload_date': unified_strdate(get_meta('Date')),
+            'subtitles': subtitles,
             'formats': formats,
         }
 
             'formats': formats,
         }
 
+    def _real_extract(self, url):
+        return self._extract_info(*re.match(self._VALID_URL, url).groups())
+
 
 class RadioCanadaAudioVideoIE(InfoExtractor):
 
 class RadioCanadaAudioVideoIE(InfoExtractor):
-    'radiocanada:audiovideo'
-    _VALID_URL = r'https?://ici\.radio-canada\.ca/audio-video/media-(?P<id>[0-9]+)'
-    _TEST = {
+    IE_NAME = 'radiocanada:audiovideo'
+    _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)'
+    _TESTS = [{
         'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
         'info_dict': {
             'id': '7527184',
         'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
         'info_dict': {
             'id': '7527184',
@@ -142,7 +162,10 @@ class RadioCanadaAudioVideoIE(InfoExtractor):
             # m3u8 download
             'skip_download': True,
         },
             # m3u8 download
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         return self.url_result('radiocanada:medianet:%s' % self._match_id(url))
 
     def _real_extract(self, url):
         return self.url_result('radiocanada:medianet:%s' % self._match_id(url))