# coding: utf-8
# Reconstructed from the new side of the diff published at
# https://git.rapsys.eu/youtubedl/blobdiff_plain/bddc9fc577d16b1428924bf8a5c37ef1d9295f14..c91254967656b8722087896c531ed6d815cbe670:/youtube_dl/extractor/zdf.py

import operator
import re

from .common import InfoExtractor
from ..utils import (
    unified_strdate,
)


# Both rankings are listed best-first; see _rank() for how they become sort keys.
_PROTO_ORDER = ['http', 'rtmp', 'rtsp']
_QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']

# Rank given to values missing from the tables above.  It has to be lower
# than every known rank (0, -1, -2, ...) so that an unrecognised protocol or
# quality never outranks a recognised one.
_UNKNOWN_PREF = -999

# "HH:MM:SS" with an optional ".fraction" suffix.  The whole fractional part
# is optional, so a length without a fraction is still parsed.
_DURATION_RE = re.compile(r'''(?x)^
    (?P<hours>[0-9]{2})
    :(?P<minutes>[0-9]{2})
    :(?P<seconds>[0-9]{2})
    (?:\.(?P<ms>[0-9]+))?
    ''')

# Layout of the "basetype" attribute of a <formitaet> node:
# six underscore-separated fields.
_FORMAT_ID_RE = re.compile(r'''(?x)
    (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
    (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
    ''')


def _text_or_none(node, path):
    """Return the text of the first element matching *path*, or None if absent."""
    found = node.find(path)
    return None if found is None else found.text


def _int_or_none(text, scale=1):
    """Return int(text) // scale, or None if *text* is missing or not an integer."""
    if text is None:
        return None
    try:
        return int(text) // scale
    except ValueError:
        return None


def _rank(order, value):
    """Return a sort rank for *value*: 0 for order[0], -1 for order[1], ...

    Values that do not appear in *order* get _UNKNOWN_PREF.
    """
    try:
        return -order.index(value)
    except ValueError:
        return _UNKNOWN_PREF


def _parse_duration(duration_str):
    """Convert "HH:MM:SS[.fraction]" to whole seconds; None if unparseable."""
    if duration_str is None:
        return None
    duration_m = _DURATION_RE.match(duration_str)
    if duration_m is None:
        return None
    return (
        int(duration_m.group('hours')) * 60 * 60 +
        int(duration_m.group('minutes')) * 60 +
        int(duration_m.group('seconds')))


def _xml_to_format(fnode):
    """Build a format dict from one <formitaet> element.

    Returns None when the node has no URL or its "basetype" attribute does
    not follow the expected six-field layout, so one malformed entry does
    not abort the whole extraction.
    """
    video_url = _text_or_none(fnode, 'url')
    format_id = fnode.attrib.get('basetype')
    if video_url is None or format_id is None:
        return None
    format_m = _FORMAT_ID_RE.match(format_id)
    if format_m is None:
        return None

    # URLs pointing at the metafile generator are treated as unavailable.
    is_available = u'http://www.metafilegenerator' not in video_url

    ext = format_m.group('container')
    is_supported = ext != 'f4f'

    quality = _text_or_none(fnode, './quality')
    abr = _int_or_none(_text_or_none(fnode, './audioBitrate'), 1000)
    vbr = _int_or_none(_text_or_none(fnode, './videoBitrate'), 1000)

    # Sort key, compared element by element.  Larger means more preferred,
    # so after an ascending sort the most preferred format comes last.
    # Missing bitrates count as 0 to keep the tuple comparable.
    pref = (
        is_available, is_supported,
        _rank(_PROTO_ORDER, format_m.group('proto')),
        _rank(_QUALITY_ORDER, quality),
        vbr or 0, abr or 0)

    return {
        'format_id': (
            format_id if quality is None
            else format_id + u'-' + quality),
        'url': video_url,
        'ext': ext,
        'acodec': format_m.group('acodec'),
        'vcodec': format_m.group('vcodec'),
        'abr': abr,
        'vbr': vbr,
        'width': _int_or_none(_text_or_none(fnode, './width')),
        'height': _int_or_none(_text_or_none(fnode, './height')),
        'filesize': _int_or_none(_text_or_none(fnode, './filesize')),
        'format_note': None if is_supported else u'(unsupported)',
        '_pref': pref,
        '_available': is_available,
    }


class ZDFIE(InfoExtractor):
    """Extractor for ZDFmediathek video pages, based on the XML details service."""

    _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'

    _TEST = {
        u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt",
        u"file": u"2037704.webm",
        u"info_dict": {
            u"upload_date": u"20131127",
            u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".",
            u"uploader": u"spezial",
            u"title": u"ZDFspezial - Ende des Machtpokers"
        },
        u"skip": u"Videos on ZDF.de are depublicised in short order",
    }

    def _real_extract(self, url):
        """Download the XML description of the video and build its info dict.

        The returned dict has the keys id, title, formats, description,
        uploader, duration and upload_date.  ``formats`` holds only the
        available formats, sorted ascending by their ``_pref`` tuple.
        Optional metadata that is missing from the XML is reported as None.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('video_id')

        xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
        doc = self._download_xml(
            xml_url, video_id,
            note=u'Downloading video info',
            errnote=u'Failed to download video info')

        # The title is the only field looked up without a None guard.
        title = doc.find('.//information/title').text
        description = _text_or_none(doc, './/information/detail')
        uploader = _text_or_none(doc, './/details/originChannelTitle')
        duration = _parse_duration(_text_or_none(doc, './/details/length'))
        airtime = _text_or_none(doc, './/details/airtime')
        upload_date = None if airtime is None else unified_strdate(airtime)

        candidates = (
            _xml_to_format(fnode)
            for fnode in doc.findall('.//formitaeten/formitaet'))
        formats = [
            fmt for fmt in candidates
            if fmt is not None and fmt['_available']]
        formats.sort(key=operator.itemgetter('_pref'))

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
            'duration': duration,
            'upload_date': upload_date,
        }