import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
ExtractorError,
determine_ext,
int_or_none,
+ float_or_none,
js_to_json,
+ orderedSet,
strip_jsonp,
strip_or_none,
unified_strdate,
+ url_or_none,
US_RATINGS,
)
_VALID_URL = r'''(?x)https?://
(?:
# Direct video URL
- (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
+ (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
# Article with embedded player (or direct video)
- (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
+ (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
# Player
(?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
)
''' % '|'.join(list(zip(*_STATIONS))[0])
+ _GEO_COUNTRIES = ['US']
+
_TESTS = [
{
'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
'description': 'md5:657897370e09e2bc6bf0f8d2cd313c6b',
'duration': 6559,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
},
},
{
'description': 'md5:c741d14e979fc53228c575894094f157',
'title': 'NOVA - Killer Typhoon',
'duration': 3172,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20140122',
'age_limit': 10,
},
},
'playlist_count': 2,
},
+ {
+ 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/',
+ 'info_dict': {
+ 'id': 'great-war',
+ },
+ 'playlist_count': 3,
+ },
{
'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
'info_dict': {
'title': 'American Experience - Death and the Civil War, Chapter 1',
'description': 'md5:67fa89a9402e2ee7d08f53b920674c18',
'duration': 682,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
'skip_download': True, # requires ffmpeg
'title': 'FRONTLINE - United States of Secrets (Part One)',
'description': 'md5:55756bd5c551519cc4b7703e373e217e',
'duration': 6851,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
},
},
{
# Video embedded in iframe containing angle brackets as attribute's value (e.g.
# "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
- # https://github.com/rg3/youtube-dl/issues/7059)
+ # https://github.com/ytdl-org/youtube-dl/issues/7059)
'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
'md5': '59b0ef5009f9ac8a319cc5efebcd865e',
'info_dict': {
'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5',
'duration': 1480,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
},
},
{
'title': 'FRONTLINE - The Atomic Artists',
'description': 'md5:f677e4520cfacb4a5ce1471e31b57800',
'duration': 723,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
'skip_download': True, # requires ffmpeg
'ext': 'mp4',
'title': 'FRONTLINE - Netanyahu at War',
'duration': 6852,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'formats': 'mincount:8',
},
},
+ {
+ # https://github.com/ytdl-org/youtube-dl/issues/13801
+ 'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/',
+ 'info_dict': {
+ 'id': '3003333873',
+ 'ext': 'mp4',
+ 'title': 'PBS NewsHour - full episode July 31, 2017',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'duration': 3265,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/',
+ 'info_dict': {
+ 'id': '2365936247',
+ 'ext': 'mp4',
+ 'title': 'Antiques Roadshow - Indianapolis, Hour 2',
+ 'description': 'md5:524b32249db55663e7231b6b8d1671a2',
+ 'duration': 3180,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['HTTP Error 403: Forbidden'],
+ },
+ {
+ 'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/',
+ 'info_dict': {
+ 'id': '3007193718',
+ 'ext': 'mp4',
+ 'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster",
+ 'description': 'md5:37efbac85e0c09b009586523ec143652',
+ 'duration': 6292,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['HTTP Error 403: Forbidden'],
+ },
+ {
+ 'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/',
+ 'info_dict': {
+ 'id': '3011407934',
+ 'ext': 'mp4',
+ 'title': 'Stories from the Stage - Road Trip',
+ 'duration': 1619,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['HTTP Error 403: Forbidden'],
+ },
{
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True,
{
'url': 'http://watch.knpb.org/video/2365616055/',
'only_matching': True,
+ },
+ {
+ 'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
+ 'only_matching': True,
}
]
# Replacement messages for error responses, keyed by the response's
# 'http_code' value; codes not listed here fall back to the 'message'
# field carried by the response itself.
_ERRORS = {
410: 'This video has expired and is no longer available for online streaming.',
}
def _real_initialize(self):
    """Look up the viewer's PBS station and remember it in a cookie.

    Queries the PBS localization service (sending the geo-verification
    headers) and, when the JSON response carries a ``cookie`` value that
    names a station, stores that station as the ``pbsol.station`` cookie
    for ``.pbs.org``.

    The lookup is best-effort: a failed download, a response without a
    ``cookie`` entry, or a cookie value without a station all leave the
    cookie unset instead of raising.
    """
    localization = self._download_json(
        'http://localization.services.pbs.org/localize/auto/cookie/',
        None, headers=self.geo_verification_headers(), fatal=False) or {}
    cookie = localization.get('cookie')
    if not cookie:
        return
    # default=None makes a non-matching cookie yield None; without it the
    # regex search is fatal and the guard below could never see a miss.
    station = self._search_regex(
        r'#?s=\["([^"]+)"', cookie, 'station', default=None)
    if station:
        self._set_cookie('.pbs.org', 'pbsol.station', station)
+
def _extract_webpage(self, url):
mobj = re.match(self._VALID_URL, url)
# tabbed frontline videos
MULTI_PART_REGEXES = (
r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"',
- r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)',
+ r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)',
)
for p in MULTI_PART_REGEXES:
- tabbed_videos = re.findall(p, webpage)
+ tabbed_videos = orderedSet(re.findall(p, webpage))
if tabbed_videos:
return tabbed_videos, presumptive_id, upload_date, description
r'class="coveplayerid">([^<]+)<', # coveplayer
r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/
r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer
+ r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",
+ r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/
+ r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/
]
media_id = self._search_regex(
if url:
break
- mobj = re.match(self._VALID_URL, url)
+ if not url:
+ url = self._og_search_url(webpage)
+
+ mobj = re.match(
+ self._VALID_URL, self._proto_relative_url(url.strip()))
player_id = mobj.group('player_id')
if not display_id:
url, display_id, note='Downloading player page',
errnote='Could not download player page')
video_id = self._search_regex(
- r'<div\s+id="video_([0-9]+)"', player_page, 'video ID')
+ r'<div\s+id=["\']video_(\d+)', player_page, 'video ID',
+ default=None)
+ if not video_id:
+ video_info = self._extract_video_data(
+ player_page, 'video data', display_id)
+ video_id = compat_str(
+ video_info.get('id') or video_info['contentID'])
else:
video_id = mobj.group('id')
display_id = video_id
return video_id, display_id, None, description
def _extract_video_data(self, string, name, video_id, fatal=True):
    """Find the embedded video-data object in *string* and parse it.

    Two assignment forms are searched for, in order: ``PBS.videoData = {...};``
    followed by a newline, then ``window.videoBridge = {...};``. The captured
    object literal is converted with ``js_to_json`` and parsed as JSON.
    When neither form is present the literal ``'{}'`` is parsed instead.

    string   -- text to search
    name     -- label handed to the regex search
    video_id -- identifier handed to the JSON parser
    fatal    -- forwarded to the JSON parser
    """
    video_data_patterns = [
        r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
        r'window\.videoBridge\s*=\s*({.+?});',
    ]
    raw_video_data = self._search_regex(
        video_data_patterns, string, name, default='{}')
    return self._parse_json(
        raw_video_data, video_id,
        transform_source=js_to_json, fatal=fatal)
+
def _real_extract(self, url):
video_id, display_id, upload_date, description = self._extract_webpage(url)
if redirect_url and redirect_url not in redirect_urls:
redirects.append(redirect)
redirect_urls.add(redirect_url)
-
+ encodings = info.get('encodings')
+ if isinstance(encodings, list):
+ for encoding in encodings:
+ encoding_url = url_or_none(encoding)
+ if encoding_url and encoding_url not in redirect_urls:
+ redirects.append({'url': encoding_url})
+ redirect_urls.add(encoding_url)
+
+ chapters = []
# Player pages may also serve different qualities
for page in ('widget/partnerplayer', 'portalplayer'):
player = self._download_webpage(
'http://player.pbs.org/%s/%s' % (page, video_id),
display_id, 'Downloading %s page' % page, fatal=False)
if player:
- video_info = self._parse_json(
- self._search_regex(
- r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
- player, '%s video data' % page, default='{}'),
- display_id, transform_source=js_to_json, fatal=False)
+ video_info = self._extract_video_data(
+ player, '%s video data' % page, display_id, fatal=False)
if video_info:
extract_redirect_urls(video_info)
if not info:
info = video_info
+ if not chapters:
+ raw_chapters = video_info.get('chapters') or []
+ if not raw_chapters:
+ for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
+ chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
+ if not chapter:
+ continue
+ raw_chapters.append(chapter)
+ for chapter in raw_chapters:
+ start_time = float_or_none(chapter.get('start_time'), 1000)
+ duration = float_or_none(chapter.get('duration'), 1000)
+ if start_time is None or duration is None:
+ continue
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': start_time + duration,
+ 'title': chapter.get('title'),
+ })
formats = []
http_url = None
redirect_info = self._download_json(
'%s?format=json' % redirect['url'], display_id,
- 'Downloading %s video url info' % (redirect_id or num))
+ 'Downloading %s video url info' % (redirect_id or num),
+ headers=self.geo_verification_headers())
if redirect_info['status'] == 'error':
+ message = self._ERRORS.get(
+ redirect_info['http_code'], redirect_info['message'])
+ if redirect_info['http_code'] == 403:
+ self.raise_geo_restricted(
+ msg=message, countries=self._GEO_COUNTRIES)
raise ExtractorError(
- '%s said: %s' % (
- self.IE_NAME,
- self._ERRORS.get(redirect_info['http_code'], redirect_info['message'])),
- expected=True)
+ '%s said: %s' % (self.IE_NAME, message), expected=True)
format_url = redirect_info.get('url')
if not format_url:
http_url = format_url
self._remove_duplicate_formats(formats)
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
for m3u8_format in m3u8_formats:
# we won't try extracting them.
# Since summer 2016 higher quality formats (4500k and 6500k) are also available
# albeit they are not documented in [2].
- # 1. https://github.com/rg3/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656
+ # 1. https://github.com/ytdl-org/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656
# 2. https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
if not bitrate or int(bitrate) < 400:
continue
# Try turning it to 'program - title' naming scheme if possible
alt_title = info.get('program', {}).get('title')
if alt_title:
- info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title'])
+ info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + r'[\s\-:]+', '', info['title'])
description = info.get('description') or info.get(
'program', {}).get('description') or description
'upload_date': upload_date,
'formats': formats,
'subtitles': subtitles,
+ 'chapters': chapters,
}