X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/453698570f26bebd37b39df8537d993b57d77b8b..f8eb4d6a400267895af2f217c36101294f8a94d2:/youtube_dl/extractor/webofstories.py

NOTE(review): this patch was rebuilt from a copy whose newlines and
indentation had been collapsed.  Indentation is inferred from Python syntax;
line and blank-line placement was checked against the hunk header counts.
The group name in WebOfStoriesPlaylistIE._VALID_URL was restored to
(?P<id>...) because "(?P[" is not valid regex syntax and _real_extract()
calls self._match_id(url) (presumably reading the "id" group -- confirm).
Four other regex literals look as if HTML-tag text was stripped from them in
the same way: the '(?s)Title:' fallback title pattern and the 'speaker',
'field' and playlist 'title' patterns.  They are left exactly as found;
verify them against the target revision before applying.

diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py
index 396cf4e..f2b8d19 100644
--- a/youtube_dl/extractor/webofstories.py
+++ b/youtube_dl/extractor/webofstories.py
@@ -1,8 +1,13 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    orderedSet,
+)
 
 
 class WebOfStoriesIE(InfoExtractor):
@@ -10,54 +15,66 @@ class WebOfStoriesIE(InfoExtractor):
     _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
     _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
     _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
-    _TESTS = [
-        {
-            'url': 'http://www.webofstories.com/play/hans.bethe/71',
-            'md5': '373e4dd915f60cfe3116322642ddf364',
-            'info_dict': {
-                'id': '4536',
-                'ext': 'mp4',
-                'title': 'The temperature of the sun',
-                'thumbnail': 're:^https?://.*\.jpg$',
-                'description': 'Hans Bethe talks about calculating the temperature of the sun',
-                'duration': 238,
-            }
+    _TESTS = [{
+        'url': 'http://www.webofstories.com/play/hans.bethe/71',
+        'md5': '373e4dd915f60cfe3116322642ddf364',
+        'info_dict': {
+            'id': '4536',
+            'ext': 'mp4',
+            'title': 'The temperature of the sun',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'Hans Bethe talks about calculating the temperature of the sun',
+            'duration': 238,
+        }
+    }, {
+        'url': 'http://www.webofstories.com/play/55908',
+        'md5': '2985a698e1fe3211022422c4b5ed962c',
+        'info_dict': {
+            'id': '55908',
+            'ext': 'mp4',
+            'title': 'The story of Gemmata obscuriglobus',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
+            'duration': 169,
         },
-        {
-            'url': 'http://www.webofstories.com/play/55908',
-            'md5': '2985a698e1fe3211022422c4b5ed962c',
-            'info_dict': {
-                'id': '55908',
-                'ext': 'mp4',
-                'title': 'The story of Gemmata obscuriglobus',
-                'thumbnail': 're:^https?://.*\.jpg$',
-                'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
-                'duration': 169,
-            }
+        'skip': 'notfound',
+    }, {
+        # malformed og:title meta
+        'url': 'http://www.webofstories.com/play/54215?o=MS',
+        'info_dict': {
+            'id': '54215',
+            'ext': 'mp4',
+            'title': '"A Leg to Stand On"',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'Oliver Sacks talks about the death and resurrection of a limb',
+            'duration': 97,
         },
-    ]
+        'params': {
+            'skip_download': True,
+        },
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
-        title = self._og_search_title(webpage)
+        # Sometimes og:title meta is malformed
+        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
+            r'(?s)Title:\s*(.+?)<', webpage, 'title')
         description = self._html_search_meta('description', webpage)
         thumbnail = self._og_search_thumbnail(webpage)
 
-        story_filename = self._search_regex(
-            r'\.storyFileName\("([^"]+)"\)', webpage, 'story filename')
-        speaker_id = self._search_regex(
-            r'\.speakerId\("([^"]+)"\)', webpage, 'speaker ID')
-        story_id = self._search_regex(
-            r'\.storyId\((\d+)\)', webpage, 'story ID')
-        speaker_type = self._search_regex(
-            r'\.speakerType\("([^"]+)"\)', webpage, 'speaker type')
-        great_life = self._search_regex(
-            r'isGreatLifeStory\s*=\s*(true|false)', webpage, 'great life story')
+        embed_params = [s.strip(" \r\n\t'") for s in self._search_regex(
+            r'(?s)\$\("#embedCode"\).html\(getEmbedCode\((.*?)\)',
+            webpage, 'embed params').split(',')]
+
+        (
+            _, speaker_id, story_id, story_duration,
+            speaker_type, great_life, _thumbnail, _has_subtitles,
+            story_filename, _story_order) = embed_params
+
         is_great_life_series = great_life == 'true'
-        duration = int_or_none(self._search_regex(
-            r'\.duration\((\d+)\)', webpage, 'duration', fatal=False))
+        duration = int_or_none(story_duration)
 
         # URL building, see: http://www.webofstories.com/scripts/player.js
         ms_prefix = ''
@@ -100,3 +117,44 @@ class WebOfStoriesIE(InfoExtractor):
             'description': description,
             'duration': duration,
         }
+
+
+class WebOfStoriesPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.webofstories.com/playAll/donald.knuth',
+        'info_dict': {
+            'id': 'donald.knuth',
+            'title': 'Donald Knuth (Scientist)',
+        },
+        'playlist_mincount': 97,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result(
+                'http://www.webofstories.com/play/%s' % video_id,
+                'WebOfStories', video_id=video_id)
+            for video_id in orderedSet(re.findall(r'\bid=["\']td_(\d+)', webpage))
+        ]
+
+        title = self._search_regex(
+            r'\s*([^<]+)',
+            webpage, 'speaker', default=None)
+        if title:
+            field = self._search_regex(
+                r'([^<]+)',
+                webpage, 'field', default=None)
+            if field:
+                title += ' (%s)' % field
+
+        if not title:
+            title = self._search_regex(
+                r'Play\s+all\s+stories\s*-\s*([^<]+)\s*-\s*Web\s+of\s+Stories',
+                webpage, 'title')
+
+        return self.playlist_result(entries, playlist_id, title)