X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/9815bb0a551468e4939cacfffbc2d5cb8dd12431..2ed70d87eb852121963682cc3227f3bab9079714:/youtube_dl/extractor/zdf.py diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 418509c..3b1ac4e 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,65 +1,109 @@ +# coding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor from ..utils import ( - ExtractorError, - unescapeHTML, + int_or_none, + unified_strdate, ) + class ZDFIE(InfoExtractor): - _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P[^/\?]+)(?:\?.*)?' - _TITLE = r'(?P.*)</h1>' - _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' - _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' - _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' + _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' + + _TEST = { + 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt', + 'info_dict': { + 'id': '2037704', + 'ext': 'webm', + 'title': 'ZDFspezial - Ende des Machtpokers', + 'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".', + 'duration': 1022, + 'uploader': 'spezial', + 'uploader_id': '225948', + 'upload_date': '20131127', + }, + 'skip': 'Videos on ZDF.de are depublicised in short order', + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('video_id') - html = self._download_webpage(url, video_id) - streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] - if streams is None: - raise ExtractorError(u'No media url found.') - - # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url - # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url - # choose first/default media type and highest quality for now - for s in streams: #find 300 - dsl1000mbit - if s['quality'] == '300' and s['media_type'] == 'wstreaming': - stream_=s - break - for s in streams: #find veryhigh - dsl2000mbit - if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working - stream_=s - break - if stream_ is None: - raise ExtractorError(u'No stream found.') - - media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL') - - self.report_extraction(video_id) - mobj = re.search(self._TITLE, html) - if mobj is None: - raise ExtractorError(u'Cannot extract title') - title = unescapeHTML(mobj.group('title')) - - mobj = re.search(self._MMS_STREAM, media_link) - if mobj is None: - mobj = re.search(self._RTSP_STREAM, media_link) - if mobj is None: - raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') - mms_url = mobj.group('video_url') - - mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url) - if mobj is None: - raise ExtractorError(u'Cannot extract extention') - ext = mobj.group('ext') - - return [{'id': video_id, - 'url': mms_url, - 'title': title, - 'ext': ext - }] + xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + doc = self._download_xml( + xml_url, video_id, + note='Downloading video info', + errnote='Failed to download video info') + + title = doc.find('.//information/title').text + description = doc.find('.//information/detail').text + duration = int(doc.find('.//details/lengthSec').text) + uploader_node = doc.find('.//details/originChannelTitle') + uploader = None if uploader_node is None else uploader_node.text + uploader_id_node = doc.find('.//details/originChannelId') + uploader_id = None if uploader_id_node is None else uploader_id_node.text + upload_date = unified_strdate(doc.find('.//details/airtime').text) + + def xml_to_format(fnode): + video_url = fnode.find('url').text + is_available = 'http://www.metafilegenerator' not in video_url + + format_id = fnode.attrib['basetype'] + format_m = re.match(r'''(?x) + (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ + (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) + ''', format_id) + + ext = format_m.group('container') + proto = format_m.group('proto').lower() + + quality = fnode.find('./quality').text + abr = int(fnode.find('./audioBitrate').text) // 1000 + vbr_node = fnode.find('./videoBitrate') + vbr = None if vbr_node is None else int(vbr_node.text) // 1000 + + width_node = fnode.find('./width') + width = None if width_node is None else int_or_none(width_node.text) + height_node = fnode.find('./height') + height = None if height_node is None else int_or_none(height_node.text) + + format_note = '' + if not format_note: + format_note = None + + return { + 'format_id': format_id + '-' + quality, + 'url': video_url, + 'ext': ext, + 'acodec': format_m.group('acodec'), + 'vcodec': format_m.group('vcodec'), + 'abr': abr, + 'vbr': vbr, + 'width': width, + 'height': height, + 'filesize': int_or_none(fnode.find('./filesize').text), + 'format_note': format_note, + 'protocol': proto, + '_available': is_available, + } + + format_nodes = doc.findall('.//formitaeten/formitaet') + formats = list(filter( + lambda f: f['_available'], + map(xml_to_format, format_nodes))) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'formats': formats, + } \ No newline at end of file