import re
import socket

from .common import InfoExtractor
from ..utils import (
    compat_http_client,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,

    ExtractorError,
    unified_strdate,
)


class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw response body.

        Raises ExtractorError when the download fails (URLError,
        HTTPException, socket.error) or when *url* is of an unknown
        type (urlopen raises ValueError for those).
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            response = compat_urllib_request.urlopen(request)
            try:
                webpage = response.read()
            finally:
                # Always release the connection, even if read() fails;
                # the original leaked the response object.
                response.close()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError:
            # urlopen signals a malformed/unknown-scheme URL via ValueError.
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and search it with *regex* (compiled-on-the-fly
        with *regexFlags*).

        matchTuples is an iterable of (group_index, key, error_message)
        triples: each match group *group_index* is stored in the returned
        dict under *key*; if the group did not participate in the match,
        ExtractorError(error_message) is raised. If the regex does not
        match at all, ExtractorError is raised for the URL itself.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        for (i, key, err) in matchTuples:
            # mobj.group(i) is None when the group did not take part in
            # the match — treat that as the caller-supplied error.
            if mobj.group(i) is None:
                raise ExtractorError(err)
            info[key] = mobj.group(i)
        return info

    # TODO implement Live Stream
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', u'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?'
+ # '(rtmp://.*?)\'', # re.DOTALL, # [ # (1, 'path', u'could not extract video path: %s' % url), # (2, 'player', u'could not extract video player: %s' % url), # (3, 'url', u'could not extract video url: %s' % url) # ] # ) # video_url = u'%s/%s' % (info.get('url'), info.get('path')) def extractPlus7Stream(self, url): video_lang = url.split('/')[-3] info = self.grep_webpage( url, r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', 0, [ (1, 'url', u'Invalid URL: %s' % url) ] ) next_url = compat_urllib_parse.unquote(info.get('url')) info = self.grep_webpage( next_url, r'