X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/feb5020b37d7d3ba4005a8bac6f4efece4ce4b8c..179629569ec7a2eda04957287c17fab7e55b1a7f:/youtube_dl/extractor/arte.py diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 183274e..e7a91a1 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,11 +1,9 @@ import re import json +import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - # This is used by the not implemented extractLiveStream method - compat_urllib_parse, - ExtractorError, unified_strdate, ) @@ -16,8 +14,8 @@ class ArteTvIE(InfoExtractor): www.arte.tv/guide, the extraction process is different for each one. The videos expire in 7 days, so we can't add tests. """ - _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' - _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P.*?).html' + _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' + _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?Pfr|de)/.*-(?P.*?).html' _LIVE_URL = r'index-[0-9]+\.html$' IE_NAME = u'arte.tv' @@ -27,6 +25,7 @@ class ArteTvIE(InfoExtractor): return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL)) # TODO implement Live Stream + # from ..utils import compat_urllib_parse # def extractLiveStream(self, url): # video_lang = url.split('/')[-4] # info = self.grep_webpage( @@ -56,23 +55,24 @@ class ArteTvIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._EMISSION_URL, url) if mobj is not None: - name = mobj.group('name') + lang = mobj.group('lang') # This is not a real id, it can be for example AJT for the news # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal video_id = mobj.group('id') - return self._extract_emission(url, video_id) + return self._extract_emission(url, video_id, lang) mobj = re.match(self._VIDEOS_URL, url) if mobj is not None: id = mobj.group('id') - return self._extract_video(url, id) + lang = mobj.group('lang') + return self._extract_video(url, id, lang) if re.search(self._LIVE_URL, video_id) is not None: raise ExtractorError(u'Arte live streams are not yet supported, sorry') # self.extractLiveStream(url) # return - def _extract_emission(self, url, video_id): + def _extract_emission(self, url, video_id, lang): """Extract from www.arte.tv/guide""" webpage = self._download_webpage(url, video_id) json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') @@ -91,6 +91,16 @@ class ArteTvIE(InfoExtractor): } formats = player_info['VSR'].values() + def _match_lang(f): + # Return true if that format is in the language of the url + if lang == 'fr': + l = 'F' + elif lang == 'de': + l = 'A' + regexes = [r'VO?%s' % l, r'V%s-ST.' % l] + return any(re.match(r, f['versionCode']) for r in regexes) + # Some formats may not be in the same language as the url + formats = filter(_match_lang, formats) # We order the formats by quality formats = sorted(formats, key=lambda f: int(f['height'])) # Pick the best quality @@ -103,13 +113,15 @@ class ArteTvIE(InfoExtractor): return info_dict - def _extract_video(self, url, video_id): + def _extract_video(self, url, video_id, lang): """Extract from videos.arte.tv""" - config_xml_url = url.replace('/videos/', '/do_delegate/videos/') - config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml') - config_xml = self._download_webpage(config_xml_url, video_id) - config_xml_url = self._html_search_regex(r'