X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/3477c644417600d9ec8f8d2a44f82da0a4b15eb5..0433503f78928c1f0c76a8d66ef236dc35ff23df:/youtube_dl/extractor/ceskatelevize.py diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index e857e66..e250de1 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -1,31 +1,33 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( - compat_urllib_request, - compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_parse_urlparse, ) from ..utils import ( ExtractorError, float_or_none, + sanitized_Request, + unescapeHTML, + urlencode_postdata, + USER_AGENTS, ) class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P[^/#?]+)/*(?:[#?].*)?$' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P[^/#?]+)' _TESTS = [{ 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', 'info_dict': { - 'id': '61924494876951776', + 'id': '61924494877246241', 'ext': 'mp4', - 'title': 'Hyde Park Civilizace', - 'description': 'md5:fe93f6eda372d150759d11644ebbfb4a', - 'thumbnail': 're:^https?://.*\.jpg', + 'title': 'Hyde Park Civilizace: Život v Grónsku', + 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 3350, }, 'params': { @@ -33,54 +35,40 @@ class CeskaTelevizeIE(InfoExtractor): 'skip_download': True, }, }, { - 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { - 'id': '61924494876844374', + 'id': '61924494877028507', 'ext': 'mp4', - 'title': 'První republika: Zpěvačka z Dupárny Bobina', - 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 88.4, + 'title': 'Hyde Park Civilizace: Bonus 01 - En', + 'description': 'English Subtittles', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 81.3, }, 'params': { # m3u8 download 'skip_download': True, }, }, { - # video with 18+ caution trailer - 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + # live stream + 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', 'info_dict': { - 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Alternativní průvodce současným queer světem', + 'id': 402, + 'ext': 'mp4', + 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, }, - 'playlist': [{ - 'info_dict': { - 'id': '61924494876844842', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', - 'duration': 10.2, - }, - }, { - 'info_dict': { - 'id': '61924494877068022', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Queer)', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 1558.3, - }, - }], 'params': { # m3u8 download 'skip_download': True, }, + 'skip': 'Georestricted to Czech Republic', + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', + 'only_matching': True, }] def _real_extract(self, url): - url = url.replace('/porady/', '/ivysilani/').replace('/video/', '') - - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') + playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) @@ -88,71 +76,126 @@ class CeskaTelevizeIE(InfoExtractor): if '%s

' % NOT_AVAILABLE_STRING in webpage: raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - typ = self._html_search_regex( - r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') - episode_id = self._html_search_regex( - r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') + type_ = None + episode_id = None + + playlist = self._parse_json( + self._search_regex( + r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist', + default='{}'), playlist_id) + if playlist: + type_ = playlist.get('type') + episode_id = playlist.get('id') + + if not type_: + type_ = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', + webpage, 'type') + if not episode_id: + episode_id = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', + webpage, 'episode_id') data = { - 'playlist[0][type]': typ, + 'playlist[0][type]': type_, 'playlist[0][id]': episode_id, 'requestUrl': compat_urllib_parse_urlparse(url).path, 'requestSource': 'iVysilani', } - req = compat_urllib_request.Request( - 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', - data=compat_urllib_parse.urlencode(data)) + entries = [] - req.add_header('Content-type', 'application/x-www-form-urlencoded') - req.add_header('x-addr', '127.0.0.1') - req.add_header('X-Requested-With', 'XMLHttpRequest') - req.add_header('Referer', url) + for user_agent in (None, USER_AGENTS['Safari']): + req = sanitized_Request( + 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + data=urlencode_postdata(data)) - playlistpage = self._download_json(req, playlist_id) + req.add_header('Content-type', 'application/x-www-form-urlencoded') + req.add_header('x-addr', '127.0.0.1') + req.add_header('X-Requested-With', 'XMLHttpRequest') + if user_agent: + req.add_header('User-Agent', user_agent) + req.add_header('Referer', url) - playlist_url = playlistpage['url'] - if playlist_url == 'error_region': - raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + playlistpage = self._download_json(req, playlist_id, fatal=False) - req = compat_urllib_request.Request(compat_urllib_parse_unquote(playlist_url)) - req.add_header('Referer', url) + if not playlistpage: + continue - playlist_title = self._og_search_title(webpage) - playlist_description = self._og_search_description(webpage) + playlist_url = playlistpage['url'] + if playlist_url == 'error_region': + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - playlist = self._download_json(req, playlist_id)['playlist'] - playlist_len = len(playlist) + req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) + req.add_header('Referer', url) - entries = [] - for item in playlist: - formats = [] - for format_id, stream_url in item['streamUrls'].items(): - formats.extend(self._extract_m3u8_formats( - stream_url, playlist_id, 'mp4', entry_protocol='m3u8_native')) - self._sort_formats(formats) - - item_id = item.get('id') or item['assetId'] - title = item['title'] - - duration = float_or_none(item.get('duration')) - thumbnail = item.get('previewImageUrl') - - subtitles = {} - if item.get('type') == 'VOD': - subs = item.get('subtitles') - if subs: - subtitles = self.extract_subtitles(episode_id, subs) - - entries.append({ - 'id': item_id, - 'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title), - 'description': playlist_description if playlist_len == 1 else None, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - }) + playlist_title = self._og_search_title(webpage, default=None) + playlist_description = self._og_search_description(webpage, default=None) + + playlist = self._download_json(req, playlist_id, fatal=False) + if not playlist: + continue + + playlist = playlist.get('playlist') + if not isinstance(playlist, list): + continue + + playlist_len = len(playlist) + + for num, item in enumerate(playlist): + is_live = item.get('type') == 'LIVE' + formats = [] + for format_id, stream_url in item.get('streamUrls', {}).items(): + if 'playerType=flash' in stream_url: + stream_formats = self._extract_m3u8_formats( + stream_url, playlist_id, 'mp4', 'm3u8_native', + m3u8_id='hls-%s' % format_id, fatal=False) + else: + stream_formats = self._extract_mpd_formats( + stream_url, playlist_id, + mpd_id='dash-%s' % format_id, fatal=False) + # See https://github.com/rg3/youtube-dl/issues/12119#issuecomment-280037031 + if format_id == 'audioDescription': + for f in stream_formats: + f['source_preference'] = -10 + formats.extend(stream_formats) + + if user_agent and len(entries) == playlist_len: + entries[num]['formats'].extend(formats) + continue + + item_id = item.get('id') or item['assetId'] + title = item['title'] + + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + subtitles = {} + if item.get('type') == 'VOD': + subs = item.get('subtitles') + if subs: + subtitles = self.extract_subtitles(episode_id, subs) + + if playlist_len == 1: + final_title = playlist_title or title + if is_live: + final_title = self._live_title(final_title) + else: + final_title = '%s (%s)' % (playlist_title, title) + + entries.append({ + 'id': item_id, + 'title': final_title, + 'description': playlist_description if playlist_len == 1 else None, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + }) + + for e in entries: + self._sort_formats(e['formats']) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) @@ -177,16 +220,60 @@ class CeskaTelevizeIE(InfoExtractor): for divider in [1000, 60, 60, 100]: components.append(msec % divider) msec //= divider - return "{3:02}:{2:02}:{1:02},{0:03}".format(*components) + return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components) def _fix_subtitle(subtitle): for line in subtitle.splitlines(): - m = re.match(r"^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$", line) + m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line) if m: yield m.group(1) start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) - yield "{0} --> {1}".format(start, stop) + yield '{0} --> {1}'.format(start, stop) else: yield line - return "\r\n".join(_fix_subtitle(subtitles)) + return '\r\n'.join(_fix_subtitle(subtitles)) + + +class CeskaTelevizePoradyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P[^/#?]+)' + _TESTS = [{ + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Alternativní průvodce současným queer světem', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494876844842', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 10.2, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data_url = unescapeHTML(self._search_regex( + r']*\bdata-url=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'iframe player url', group='url')) + + return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key())