X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/0865c28fb29a6481cd837cf8c1ef0cd134c6ef8e..d018d3313032e12968a6add6800e51d412e2f602:/youtube_dl/extractor/rai.py diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 144e339..7ff1d06 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_urllib_parse, + compat_urlparse, ) from ..utils import ( parse_duration, @@ -13,7 +14,7 @@ from ..utils import ( class RaiIE(InfoExtractor): - _VALID_URL = r'(?Phttp://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' + _VALID_URL = r'(?P(?Phttp://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it))/dl/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', @@ -62,34 +63,93 @@ class RaiIE(InfoExtractor): 'description': 'Edizione delle ore 20:30 ', } }, + { + 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', + 'md5': '02b64456f7cc09f96ff14e7dd489017e', + 'info_dict': { + 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', + 'ext': 'flv', + 'title': 'Il Candidato - Primo episodio: "Le Primarie"', + 'description': 'Primo appuntamento con "Il candidato" con Filippo Timi, alias Piero Zucca presidente!', + 'uploader': 'RaiTre', + } + }, + { + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': '037104d2c14132887e5e4cf114569214', + 'info_dict': { + 'id': '0c7a664b-d0f4-4b2c-8835-3f82e46f433e', + 'ext': 'flv', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'uploader': 'RaiTre', + 'upload_date': '20141221', + }, + } ] + def _extract_relinker_url(self, webpage): + return self._proto_relative_url(self._search_regex( + [r'name="videourl" content="([^"]+)"', r'var\s+videoURL(?:_MP4)?\s*=\s*"([^"]+)"'], + webpage, 'relinker url', default=None)) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + host = mobj.group('host') + + webpage = self._download_webpage(url, video_id) - media = self._download_json('%s?json' % mobj.group('url'), video_id, 'Downloading video JSON') + relinker_url = self._extract_relinker_url(webpage) - title = media.get('name') - description = media.get('desc') - thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image') - duration = parse_duration(media.get('length')) - uploader = media.get('author') - upload_date = unified_strdate(media.get('date')) + if not relinker_url: + iframe_url = self._search_regex( + [r']+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', + r'drawMediaRaiTV\(["\'](.+?)["\']'], + webpage, 'iframe') + if not iframe_url.startswith('http'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) + webpage = self._download_webpage( + iframe_url, video_id) + relinker_url = self._extract_relinker_url(webpage) - formats = [] + relinker = self._download_json( + '%s&output=47' % relinker_url, video_id) - for format_id in ['wmv', 'm3u8', 'mediaUri', 'h264']: - media_url = media.get(format_id) - if not media_url: - continue - formats.append({ + media_url = relinker['video'][0] + ct = relinker.get('ct') + if ct == 'f4m': + formats = self._extract_f4m_formats( + media_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id) + else: + formats = [{ 'url': media_url, - 'format_id': format_id, - 'ext': 'mp4', - }) + 'format_id': ct, + }] + + json_link = self._html_search_meta( + 'jsonlink', webpage, 'JSON link', default=None) + if json_link: + media = self._download_json( + host + json_link, video_id, 'Downloading video JSON') + title = media.get('name') + description = media.get('desc') + thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image') + duration = parse_duration(media.get('length')) + uploader = media.get('author') + upload_date = unified_strdate(media.get('date')) + else: + title = (self._search_regex( + r'var\s+videoTitolo\s*=\s*"(.+?)";', + webpage, 'title', default=None) or self._og_search_title(webpage)).replace('\\"', '"') + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = None + uploader = self._html_search_meta('Editore', webpage, 'uploader') + upload_date = unified_strdate(self._html_search_meta( + 'item-date', webpage, 'upload date', default=None)) - subtitles = self.extract_subtitles(video_id, url) + subtitles = self.extract_subtitles(video_id, webpage) return { 'id': video_id, @@ -103,8 +163,7 @@ class RaiIE(InfoExtractor): 'subtitles': subtitles, } - def _get_subtitles(self, video_id, url): - webpage = self._download_webpage(url, video_id) + def _get_subtitles(self, video_id, webpage): subtitles = {} m = re.search(r'