X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/22bc55bffeb45b7d2f3056ae863eb3228e6507e8..9bb07a5ee663304e100edc65967d5fc4a521bcd0:/youtube_dl/extractor/ted.py diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 212ac80..db5a4f4 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -5,8 +5,12 @@ import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse +) from ..utils import ( + extract_attributes, float_or_none, int_or_none, try_get, @@ -20,7 +24,7 @@ class TEDIE(InfoExtractor): (?Phttps?://) (?Pwww|embed(?:-ssl)?)(?P\.ted\.com/ ( - (?Pplaylists(?:/\d+)?) # We have a playlist + (?Pplaylists(?:/(?P\d+))?) # We have a playlist | ((?Ptalks)) # We have a simple talk | @@ -84,6 +88,7 @@ class TEDIE(InfoExtractor): 'info_dict': { 'id': '10', 'title': 'Who are the hackers?', + 'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a' }, 'playlist_mincount': 6, }, { @@ -128,7 +133,7 @@ class TEDIE(InfoExtractor): def _extract_info(self, webpage): info_json = self._search_regex( - r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*', + r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*', webpage, 'info json') return json.loads(info_json) @@ -150,22 +155,22 @@ class TEDIE(InfoExtractor): webpage = self._download_webpage(url, name, 'Downloading playlist webpage') - info = self._extract_info(webpage) - playlist_info = try_get( - info, lambda x: x['__INITIAL_DATA__']['playlist'], - dict) or info['playlist'] + playlist_entries = [] + for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage): + attrs = extract_attributes(entry) + entry_url = compat_urlparse.urljoin(url, attrs['href']) + playlist_entries.append(self.url_result(entry_url, self.ie_key())) + + final_url = self._og_search_url(webpage, fatal=False) + playlist_id = ( + re.match(self._VALID_URL, final_url).group('playlist_id') + if final_url else None) - playlist_entries = [ - self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key()) - for talk in try_get( - info, lambda x: x['__INITIAL_DATA__']['talks'], - dict) or info['talks'] - ] return self.playlist_result( - playlist_entries, - playlist_id=compat_str(playlist_info['id']), - playlist_title=playlist_info['title']) + playlist_entries, playlist_id=playlist_id, + playlist_title=self._og_search_title(webpage, fatal=False), + playlist_description=self._og_search_description(webpage)) def _talk_info(self, url, video_name): webpage = self._download_webpage(url, video_name) @@ -203,17 +208,13 @@ class TEDIE(InfoExtractor): ext_url = None if service.lower() == 'youtube': ext_url = external.get('code') - return { - '_type': 'url', - 'url': ext_url or external['uri'], - } + + return self.url_result(ext_url or external['uri']) resources_ = player_talk.get('resources') or talk_info.get('resources') http_url = None for format_id, resources in resources_.items(): - if not isinstance(resources, dict): - continue if format_id == 'h264': for resource in resources: h264_url = resource.get('file') @@ -242,6 +243,8 @@ class TEDIE(InfoExtractor): 'tbr': int_or_none(resource.get('bitrate')), }) elif format_id == 'hls': + if not isinstance(resources, dict): + continue stream_url = url_or_none(resources.get('stream')) if not stream_url: continue @@ -267,6 +270,8 @@ class TEDIE(InfoExtractor): 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) + if f.get('acodec') == 'none': + del f['acodec'] formats.append(f) audio_download = talk_info.get('audioDownload')