X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/03342304420e5daeb428ffdcc7bbd2bbfecfa61a..c64c42cb1f9487398f8b91a51266ff6e79115b90:/youtube_dl/extractor/pbs.py diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f43e3a1..b490ef7 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,13 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError from ..utils import ( ExtractorError, determine_ext, int_or_none, js_to_json, strip_jsonp, + strip_or_none, unified_strdate, US_RATINGS, ) @@ -196,31 +196,25 @@ class PBSIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', - 'md5': 'ce1888486f0908d555a8093cac9a7362', + 'md5': '173dc391afd361fa72eab5d3d918968d', 'info_dict': { 'id': '2365006249', 'ext': 'mp4', 'title': 'Constitution USA with Peter Sagal - A More Perfect Union', - 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071', + 'description': 'md5:31b664af3c65fd07fa460d306b837d00', 'duration': 3190, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', - 'md5': '143c98aa54a346738a3d78f54c925321', + 'md5': '6f722cb3c3982186d34b0f13374499c7', 'info_dict': { 'id': '2365297690', 'ext': 'mp4', 'title': 'FRONTLINE - Losing Iraq', - 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9', + 'description': 'md5:5979a4d069b157f622d02bff62fbe654', 'duration': 5050, }, - 'params': { - 'skip_download': True, # requires ffmpeg - } }, { 'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', @@ -229,7 +223,7 @@ class PBSIE(InfoExtractor): 'id': '2201174722', 'ext': 'mp4', 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist', - 'description': 'md5:95a19f568689d09a166dff9edada3301', + 'description': 'md5:86ab9a3d04458b876147b355788b8781', 'duration': 801, }, }, @@ -244,9 +238,6 @@ class PBSIE(InfoExtractor): 'duration': 6559, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', @@ -262,9 +253,6 @@ class PBSIE(InfoExtractor): 'upload_date': '20140122', 'age_limit': 10, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', @@ -280,7 +268,7 @@ class PBSIE(InfoExtractor): 'display_id': 'player', 'ext': 'mp4', 'title': 'American Experience - Death and the Civil War, Chapter 1', - 'description': 'md5:1b80a74e0380ed2a4fb335026de1600d', + 'description': 'md5:67fa89a9402e2ee7d08f53b920674c18', 'duration': 682, 'thumbnail': 're:^https?://.*\.jpg$', }, @@ -290,6 +278,7 @@ class PBSIE(InfoExtractor): }, { 'url': 'http://www.pbs.org/video/2365245528/', + 'md5': '115223d41bd55cda8ae5cd5ed4e11497', 'info_dict': { 'id': '2365245528', 'display_id': '2365245528', @@ -299,27 +288,22 @@ class PBSIE(InfoExtractor): 'duration': 6851, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { # Video embedded in iframe containing angle brackets as attribute's value (e.g. # "', webpage): url = self._search_regex( @@ -441,10 +428,10 @@ class PBSIE(InfoExtractor): video_id = mobj.group('id') display_id = video_id - return video_id, display_id, None + return video_id, display_id, None, description def _real_extract(self, url): - video_id, display_id, upload_date = self._extract_webpage(url) + video_id, display_id, upload_date, description = self._extract_webpage(url) if isinstance(video_id, list): entries = [self.url_result( @@ -466,17 +453,6 @@ class PBSIE(InfoExtractor): redirects.append(redirect) redirect_urls.add(redirect_url) - try: - video_info = self._download_json( - 'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id, - display_id, 'Downloading video info JSON') - extract_redirect_urls(video_info) - info = video_info - except ExtractorError as e: - # videoInfo API may not work for some videos - if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404: - raise - # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( @@ -494,6 +470,7 @@ class PBSIE(InfoExtractor): info = video_info formats = [] + http_url = None for num, redirect in enumerate(redirects): redirect_id = redirect.get('eeid') @@ -514,13 +491,41 @@ class PBSIE(InfoExtractor): if determine_ext(format_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', preference=1, m3u8_id='hls')) + format_url, display_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.append({ 'url': format_url, 'format_id': redirect_id, }) + if re.search(r'^https?://.*(?:\d+k|baseline)', format_url): + http_url = format_url self._remove_duplicate_formats(formats) + m3u8_formats = list(filter( + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + formats)) + if http_url: + for m3u8_format in m3u8_formats: + bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None) + # Lower qualities (150k and 192k) are not available as HTTP formats (see [1]), + # we won't try extracting them. + # Since summer 2016 higher quality formats (4500k and 6500k) are also available + # albeit they are not documented in [2]. + # 1. https://github.com/rg3/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656 + # 2. https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications + if not bitrate or int(bitrate) < 400: + continue + f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url) + # This may produce invalid links sometimes (e.g. + # http://www.pbs.org/wgbh/frontline/film/suicide-plan) + if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate): + continue + f = m3u8_format.copy() + f.update({ + 'url': f_url, + 'format_id': m3u8_format['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) self._sort_formats(formats) rating_str = info.get('rating') @@ -535,6 +540,19 @@ class PBSIE(InfoExtractor): 'ext': 'ttml', 'url': closed_captions_url, }] + mobj = re.search(r'/(\d+)_Encoded\.dfxp', closed_captions_url) + if mobj: + ttml_caption_suffix, ttml_caption_id = mobj.group(0, 1) + ttml_caption_id = int(ttml_caption_id) + subtitles['en'].extend([{ + 'url': closed_captions_url.replace( + ttml_caption_suffix, '/%d_Encoded.srt' % (ttml_caption_id + 1)), + 'ext': 'srt', + }, { + 'url': closed_captions_url.replace( + ttml_caption_suffix, '/%d_Encoded.vtt' % (ttml_caption_id + 2)), + 'ext': 'vtt', + }]) # info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc) # Try turning it to 'program - title' naming scheme if possible @@ -542,11 +560,14 @@ class PBSIE(InfoExtractor): if alt_title: info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title']) + description = info.get('description') or info.get( + 'program', {}).get('description') or description + return { 'id': video_id, 'display_id': display_id, 'title': info['title'], - 'description': info.get('description') or info.get('program', {}).get('description'), + 'description': description, 'thumbnail': info.get('image_url'), 'duration': int_or_none(info.get('duration')), 'age_limit': age_limit,