+# coding: utf-8
+from __future__ import unicode_literals
+
import re
-import json
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
- # This is used by the not implemented extractLiveStream method
- compat_urllib_parse,
-
ExtractorError,
+ int_or_none,
+ qualities,
+ try_get,
unified_strdate,
)
-class ArteTvIE(InfoExtractor):
- """
- There are two sources of video in arte.tv: videos.arte.tv and
- www.arte.tv/guide, the extraction process is different for each one.
- The videos expire in 7 days, so we can't add tests.
- """
- _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
- _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html'
- _LIVE_URL = r'index-[0-9]+\.html$'
-
- IE_NAME = u'arte.tv'
-
- @classmethod
- def suitable(cls, url):
- return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL))
-
- # TODO implement Live Stream
- # def extractLiveStream(self, url):
- # video_lang = url.split('/')[-4]
- # info = self.grep_webpage(
- # url,
- # r'src="(.*?/videothek_js.*?\.js)',
- # 0,
- # [
- # (1, 'url', u'Invalid URL: %s' % url)
- # ]
- # )
- # http_host = url.split('/')[2]
- # next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
- # info = self.grep_webpage(
- # next_url,
- # r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
- # '(http://.*?\.swf).*?' +
- # '(rtmp://.*?)\'',
- # re.DOTALL,
- # [
- # (1, 'path', u'could not extract video path: %s' % url),
- # (2, 'player', u'could not extract video player: %s' % url),
- # (3, 'url', u'could not extract video url: %s' % url)
- # ]
- # )
- # video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# There are different sources of video on arte.tv, and the extraction process
# is different for each one. The videos usually expire in 7 days, so we can't
# add long-lived tests.
- def _real_extract(self, url):
- mobj = re.match(self._EMISSION_URL, url)
- if mobj is not None:
- name = mobj.group('name')
- # This is not a real id, it can be for example AJT for the news
- # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
- video_id = mobj.group('id')
- return self._extract_emission(url, video_id)
-
- mobj = re.match(self._VIDEOS_URL, url)
- if mobj is not None:
- id = mobj.group('id')
- return self._extract_video(url, id)
-
- if re.search(self._LIVE_URL, video_id) is not None:
- raise ExtractorError(u'Arte live streams are not yet supported, sorry')
- # self.extractLiveStream(url)
- # return
-
- def _extract_emission(self, url, video_id):
- """Extract from www.arte.tv/guide"""
- webpage = self._download_webpage(url, video_id)
- json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
-
- json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
- self.report_extraction(video_id)
- info = json.loads(json_info)
+
class ArteTVBaseIE(InfoExtractor):
    """Base class with the player-config extraction shared by the arte.tv extractors."""

    @staticmethod
    def _language_preference_patterns(lang):
        """Return compiled ``versionCode`` patterns, most preferred first.

        ``lang`` is the language requested by the caller. Known URL language
        segments are translated to the token used inside ``versionCode``;
        any other value is used verbatim.
        """
        LANGS = {
            'fr': 'F',
            'de': 'A',
            'en': 'E[ANG]',
            'es': 'E[ESP]',
            'it': 'E[ITA]',
            'pl': 'E[POL]',
        }
        # Escaped because codes such as 'E[ANG]' contain regex metacharacters.
        code = re.escape(LANGS.get(lang, lang))

        # Language preference from most to least priority
        # Reference: section 6.8 of
        # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf
        templates = (
            # original version in requested language, without subtitles
            r'VO{0}$',
            # original version in requested language, with partial subtitles in requested language
            r'VO{0}-ST{0}$',
            # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
            r'VO{0}-STM{0}$',
            # non-original (dubbed) version in requested language, without subtitles
            r'V{0}$',
            # non-original (dubbed) version in requested language, with partial subtitles in requested language
            r'V{0}-ST{0}$',
            # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
            r'V{0}-STM{0}$',
            # original version in requested language, with partial subtitles in different language
            r'VO{0}-ST(?!{0}).+?$',
            # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
            r'VO{0}-STM(?!{0}).+?$',
            # original version in different language, with partial subtitles in requested language
            r'VO(?:(?!{0}).+?)?-ST{0}$',
            # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
            r'VO(?:(?!{0}).+?)?-STM{0}$',
            # original version in different language, without subtitles
            r'VO(?:(?!{0}))?$',
            # original version in different language, with partial subtitles in different language
            r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$',
            # original version in different language, with subtitles for the deaf and hard-of-hearing in different language
            r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$',
        )
        return tuple(re.compile(t.format(code)) for t in templates)

    @staticmethod
    def _language_preference(patterns, version_code):
        """Rank ``version_code`` against ``patterns``; higher is better, -1 means no match.

        A missing or empty ``version_code`` is ranked -1 instead of being
        handed to the regex engine, which would raise ``TypeError`` on ``None``.
        """
        if not version_code:
            return -1
        for index, pattern in enumerate(patterns):
            if pattern.match(version_code):
                return len(patterns) - index
        return -1

    def _extract_from_json_url(self, json_url, video_id, lang, title=None):
        """Download the player JSON at ``json_url`` and build an info dict from it.

        ``video_id`` is passed to the download and format-check helpers and is
        the fallback id in the "not available" message. ``lang`` selects
        which language versions get the highest ``language_preference``.
        ``title`` is only used when the JSON carries no ``VTI`` value.

        Raises ExtractorError (expected) when the JSON has no usable ``VSR``
        mapping, using the server-provided error message when there is one.
        """
        info = self._download_json(json_url, video_id)
        player_info = info['videoJsonPlayer']

        vsr = try_get(player_info, lambda x: x['VSR'], dict)
        if not vsr:
            error = None
            if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error':
                error = try_get(
                    player_info, lambda x: x['custom_msg']['msg'], compat_str)
            if not error:
                # Parenthesised on purpose: '%' binds tighter than 'or', so
                # without the parentheses the video_id fallback is never used.
                error = 'Video %s is not available' % (
                    player_info.get('VID') or video_id)
            raise ExtractorError(error, expected=True)

        upload_date_str = player_info.get('shootingDate')
        if not upload_date_str:
            upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]

        title = (player_info.get('VTI') or title or player_info['VID']).strip()
        # 'or' rather than a .get() default: the key may be present but null.
        subtitle = (player_info.get('VSU') or '').strip()
        if subtitle:
            title += ' - %s' % subtitle

        info_dict = {
            'id': player_info['VID'],
            'title': title,
            'description': player_info.get('VDE'),
            'upload_date': unified_strdate(upload_date_str),
            # try_get tolerates a missing or null 'VTU' entry.
            'thumbnail': player_info.get('programImage') or try_get(
                player_info, lambda x: x['VTU']['IUR']),
        }
        qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])

        # The patterns depend only on the requested language, so they are
        # built once here rather than once per format.
        lang_patterns = self._language_preference_patterns(lang)

        formats = []
        for format_id, f in vsr.items():
            version_code = f.get('versionCode')

            fmt = {
                'format_id': format_id,
                'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
                'language_preference': self._language_preference(
                    lang_patterns, version_code),
                'format_note': '%s, %s' % (version_code, f.get('versionLibelle')),
                'width': int_or_none(f.get('width')),
                'height': int_or_none(f.get('height')),
                'tbr': int_or_none(f.get('bitrate')),
                'quality': qfunc(f.get('quality')),
            }

            if f.get('mediaType') == 'rtmp':
                # For rtmp the connection URL is 'streamer'; 'url' is the
                # path to play on that connection.
                fmt['url'] = f['streamer']
                fmt['play_path'] = 'mp4:' + f['url']
                fmt['ext'] = 'flv'
            else:
                fmt['url'] = f['url']

            formats.append(fmt)

        self._check_formats(formats, video_id)
        self._sort_formats(formats)

        info_dict['formats'] = formats
        return info_dict
- def _extract_video(self, url, video_id):
- """Extract from videos.arte.tv"""
- config_xml_url = url.replace('/videos/', '/do_delegate/videos/')
- config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml')
- config_xml = self._download_webpage(config_xml_url, video_id)
- config_xml_url = self._html_search_regex(r'<video lang=".*?" ref="(.*?)"', config_xml, 'config xml url')
- config_xml = self._download_webpage(config_xml_url, video_id)
-
- video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
- def _key(m):
- quality = m.group('quality')
- if quality == 'hd':
- return 2
- else:
- return 1
- # We pick the best quality
- video_urls = sorted(video_urls, key=_key)
- video_url = list(video_urls)[-1].group('url')
-
- title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
- thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
- config_xml, 'thumbnail')
- return {'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'url': video_url,
- 'ext': 'flv',
- }
+
class ArteTVPlus7IE(ArteTVBaseIE):
    """Extractor for single-video pages matched by ``_VALID_URL``."""

    IE_NAME = 'arte.tv:+7'
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'

    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'info_dict': {
            'id': '088501-000-A',
            'ext': 'mp4',
            'title': 'Mexico: Stealing Petrol to Survive',
            'upload_date': '20190628',
        },
    }]

    def _real_extract(self, url):
        """Build the player-config API URL from the page URL and extract from it."""
        mobj = re.match(self._VALID_URL, url)
        lang = mobj.group('lang')
        video_id = mobj.group('id')
        config_url = 'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id)
        return self._extract_from_json_url(config_url, video_id, lang)
+
+
class ArteTVEmbedIE(ArteTVPlus7IE):
    """Extractor for embedded-player URLs that carry the config URL in ``json_url``."""

    IE_NAME = 'arte.tv:embed'
    _VALID_URL = r'''(?x)
        https://www\.arte\.tv
        /player/v3/index\.php\?json_url=
        (?P<json_url>
            https?://api\.arte\.tv/api/player/v1/config/
            (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
        )
    '''

    _TESTS = []

    def _real_extract(self, url):
        """Extract using the config URL, language and id captured from ``url``."""
        mobj = re.match(self._VALID_URL, url)
        return self._extract_from_json_url(
            mobj.group('json_url'), mobj.group('id'), mobj.group('lang'))
+
+
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for collection pages (``RC-`` ids), returned as a playlist."""

    IE_NAME = 'arte.tv:playlist'
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'

    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'info_dict': {
            'id': 'RC-016954',
            'title': 'Earn a Living',
            'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
        },
        'playlist_mincount': 6,
    }]

    def _real_extract(self, url):
        """Download the collection data and extract every video that has a ``jsonUrl``."""
        mobj = re.match(self._VALID_URL, url)
        lang = mobj.group('lang')
        playlist_id = mobj.group('id')

        collection_url = (
            'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
            % (lang, playlist_id))
        collection = self._download_json(collection_url, playlist_id)

        title = collection.get('title')
        description = collection.get('shortDescription') or collection.get('teaserText')

        entries = []
        for video in collection['videos']:
            json_url = video.get('jsonUrl')
            if not json_url:
                # Entries without a player config URL are skipped.
                continue
            entry_id = video.get('programId') or playlist_id
            entries.append(self._extract_from_json_url(json_url, entry_id, lang))

        return self.playlist_result(entries, playlist_id, title, description)