X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/3477c644417600d9ec8f8d2a44f82da0a4b15eb5..3f49721bd802c357ee9e5c1b6f07e0b68ac47fc2:/youtube_dl/extractor/npo.py diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index eb12fb8..c91f584 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -3,12 +3,15 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( fix_xml_ampersands, + orderedSet, parse_duration, qualities, strip_jsonp, unified_strdate, + ExtractorError, ) @@ -180,16 +183,23 @@ class NPOIE(NPOBaseIE): continue streams = format_info.get('streams') if streams: - video_info = self._download_json( - streams[0] + '&type=json', - video_id, 'Downloading %s stream JSON' % format_id) + try: + video_info = self._download_json( + streams[0] + '&type=json', + video_id, 'Downloading %s stream JSON' % format_id) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error = (self._parse_json(ee.cause.read().decode(), video_id, fatal=False) or {}).get('errorstring') + if error: + raise ExtractorError(error, expected=True) + raise else: video_info = format_info video_url = video_info.get('url') if not video_url: continue if format_id == 'adaptive': - formats.extend(self._extract_m3u8_formats(video_url, video_id)) + formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4')) else: formats.append({ 'url': video_url, @@ -406,9 +416,62 @@ class NPORadioFragmentIE(InfoExtractor): } -class VPROIE(NPOIE): +class SchoolTVIE(InfoExtractor): + IE_NAME = 'schooltv' + _VALID_URL = r'https?://(?:www\.)?schooltv\.nl/video/(?P[^/?#&]+)' + + _TEST = { + 'url': 'http://www.schooltv.nl/video/ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam/', + 'info_dict': { + 'id': 'WO_NTR_429477', + 'display_id': 'ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam', + 'title': 'Ademhaling: De hele dag haal je adem. Maar wat gebeurt er dan eigenlijk in je lichaam?', + 'ext': 'mp4', + 'description': 'md5:abfa0ff690adb73fd0297fd033aaa631' + }, + 'params': { + # Skip because of m3u8 download + 'skip_download': True + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'data-mid=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video_id', group='id') + return { + '_type': 'url_transparent', + 'ie_key': 'NPO', + 'url': 'npo:%s' % video_id, + 'display_id': display_id + } + + +class NPOPlaylistBaseIE(NPOIE): + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id) + for video_id in orderedSet(re.findall(self._PLAYLIST_ENTRY_RE, webpage)) + ] + + playlist_title = self._html_search_regex( + self._PLAYLIST_TITLE_RE, webpage, 'playlist title', + default=None) or self._og_search_title(webpage) + + return self.playlist_result(entries, playlist_id, playlist_title) + + +class VPROIE(NPOPlaylistBaseIE): IE_NAME = 'vpro' - _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?(?:(?:tegenlicht\.)?vpro|2doc)\.nl/(?:[^/]+/)*(?P[^/]+)\.html' + _PLAYLIST_TITLE_RE = (r']+class=["\'].*?\bmedia-platform-title\b.*?["\'][^>]*>([^<]+)', + r']+class=["\'].*?\bmedia-platform-subtitle\b.*?["\'][^>]*>([^<]+)') + _PLAYLIST_ENTRY_RE = r'data-media-id="([^"]+)"' _TESTS = [ { @@ -421,12 +484,13 @@ class VPROIE(NPOIE): 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', }, + 'skip': 'Video gone', }, { 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html', 'info_dict': { 'id': 'sergio-herman', - 'title': 'Sergio Herman: Fucking perfect', + 'title': 'sergio herman: fucking perfect', }, 'playlist_count': 2, }, @@ -435,54 +499,61 @@ class VPROIE(NPOIE): 'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html', 'info_dict': { 'id': 'education-education', - 'title': '2Doc', + 'title': 'education education', }, 'playlist_count': 2, + }, + { + 'url': 'http://www.2doc.nl/documentaires/series/2doc/2015/oktober/de-tegenprestatie.html', + 'info_dict': { + 'id': 'de-tegenprestatie', + 'title': 'De Tegenprestatie', + }, + 'playlist_count': 2, + }, { + 'url': 'http://www.2doc.nl/speel~VARA_101375237~mh17-het-verdriet-van-nederland~.html', + 'info_dict': { + 'id': 'VARA_101375237', + 'ext': 'm4v', + 'title': 'MH17: Het verdriet van Nederland', + 'description': 'md5:09e1a37c1fdb144621e22479691a9f18', + 'upload_date': '20150716', + }, + 'params': { + # Skip because of m3u8 download + 'skip_download': True + }, } ] - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - entries = [ - self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id) - for video_id in re.findall(r'data-media-id="([^"]+)"', webpage) - ] - - playlist_title = self._search_regex( - r'\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*', - webpage, 'playlist title', default=None) or self._og_search_title(webpage) - return self.playlist_result(entries, playlist_id, playlist_title) - - -class WNLIE(InfoExtractor): +class WNLIE(NPOPlaylistBaseIE): + IE_NAME = 'wnl' _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P[^/]+)__\d+' + _PLAYLIST_TITLE_RE = r'(?s)]+class="subject"[^>]*>(.+?)' + _PLAYLIST_ENTRY_RE = r']+href="([^"]+)"[^>]+class="js-mid"[^>]*>Deel \d+' - _TEST = { + _TESTS = [{ 'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515', 'info_dict': { 'id': 'vandaag-de-dag-6-mei', 'title': 'Vandaag de Dag 6 mei', }, 'playlist_count': 4, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) + }] - webpage = self._download_webpage(url, playlist_id) - - entries = [ - self.url_result('npo:%s' % video_id, 'NPO') - for video_id, part in re.findall( - r']+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', webpage) - ] - playlist_title = self._html_search_regex( - r'(?s)]+class="subject"[^>]*>(.+?)', - webpage, 'playlist title') +class AndereTijdenIE(NPOPlaylistBaseIE): + IE_NAME = 'anderetijden' + _VALID_URL = r'https?://(?:www\.)?anderetijden\.nl/programma/(?:[^/]+/)+(?P[^/?#&]+)' + _PLAYLIST_TITLE_RE = r'(?s)]+class=["\'].*?\bpage-title\b.*?["\'][^>]*>(.+?)' + _PLAYLIST_ENTRY_RE = r']+class=["\']episode-container episode-page["\'][^>]+data-prid=["\'](.+?)["\']' - return self.playlist_result(entries, playlist_id, playlist_title) + _TESTS = [{ + 'url': 'http://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem', + 'info_dict': { + 'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', + 'title': 'Duitse soldaten over de Slag bij Arnhem', + }, + 'playlist_count': 3, + }]