import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
ExtractorError,
determine_ext,
_VALID_URL = r'''(?x)https?://
(?:
# Direct video URL
- (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
+ (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
# Article with embedded player (or direct video)
(?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
# Player
'skip_download': True,
},
},
+ {
+ 'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/',
+ 'info_dict': {
+ 'id': '2365936247',
+ 'ext': 'mp4',
+ 'title': 'Antiques Roadshow - Indianapolis, Hour 2',
+ 'description': 'md5:524b32249db55663e7231b6b8d1671a2',
+ 'duration': 3180,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['HTTP Error 403: Forbidden'],
+ },
+ {
+ 'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/',
+ 'info_dict': {
+ 'id': '3007193718',
+ 'ext': 'mp4',
+ 'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster",
+ 'description': 'md5:37efbac85e0c09b009586523ec143652',
+ 'duration': 6292,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['HTTP Error 403: Forbidden'],
+ },
+ {
+ 'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/',
+ 'info_dict': {
+ 'id': '3011407934',
+ 'ext': 'mp4',
+ 'title': 'Stories from the Stage - Road Trip',
+ 'duration': 1619,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['HTTP Error 403: Forbidden'],
+ },
{
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True,
{
'url': 'http://watch.knpb.org/video/2365616055/',
'only_matching': True,
+ },
+ {
+ 'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
+ 'only_matching': True,
}
]
_ERRORS = {
r'class="coveplayerid">([^<]+)<', # coveplayer
r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/
r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer
+ r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",
+ r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/
+ r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/
]
media_id = self._search_regex(
if not url:
url = self._og_search_url(webpage)
- mobj = re.match(self._VALID_URL, url)
+ mobj = re.match(
+ self._VALID_URL, self._proto_relative_url(url.strip()))
player_id = mobj.group('player_id')
if not display_id:
url, display_id, note='Downloading player page',
errnote='Could not download player page')
video_id = self._search_regex(
- r'<div\s+id="video_([0-9]+)"', player_page, 'video ID')
+ r'<div\s+id=["\']video_(\d+)', player_page, 'video ID',
+ default=None)
+ if not video_id:
+ video_info = self._extract_video_data(
+ player_page, 'video data', display_id)
+ video_id = compat_str(
+ video_info.get('id') or video_info['contentID'])
else:
video_id = mobj.group('id')
display_id = video_id
return video_id, display_id, None, description
+    def _extract_video_data(self, string, name, video_id, fatal=True):
+        """Find the player's embedded data object in a page and parse it.
+
+        Two patterns are handed to the regex search, in this order: a
+        `PBS.videoData = {...};` assignment followed by a newline, and a
+        `window.videoBridge = {...};` assignment. The search is given the
+        literal '{}' as its default value. Whatever the search yields is
+        run through `js_to_json` and then parsed as JSON.
+
+        `string` is the text to search, `name` labels the value for the
+        regex search, and `video_id` is passed on to the JSON parser.
+        `fatal` is forwarded to the JSON parser only; the regex search
+        always receives the '{}' default.
+        """
+        video_data_patterns = [
+            r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
+            r'window\.videoBridge\s*=\s*({.+?});',
+        ]
+        raw_video_data = self._search_regex(
+            video_data_patterns, string, name, default='{}')
+        return self._parse_json(
+            raw_video_data, video_id,
+            transform_source=js_to_json, fatal=fatal)
+
def _real_extract(self, url):
video_id, display_id, upload_date, description = self._extract_webpage(url)
'http://player.pbs.org/%s/%s' % (page, video_id),
display_id, 'Downloading %s page' % page, fatal=False)
if player:
- video_info = self._parse_json(
- self._search_regex(
- r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
- player, '%s video data' % page, default='{}'),
- display_id, transform_source=js_to_json, fatal=False)
+ video_info = self._extract_video_data(
+ player, '%s video data' % page, display_id, fatal=False)
if video_info:
extract_redirect_urls(video_info)
if not info:
info = video_info
if not chapters:
- for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
- chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
- if not chapter:
- continue
+ raw_chapters = video_info.get('chapters') or []
+ if not raw_chapters:
+ for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
+ chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
+ if not chapter:
+ continue
+ raw_chapters.append(chapter)
+ for chapter in raw_chapters:
start_time = float_or_none(chapter.get('start_time'), 1000)
duration = float_or_none(chapter.get('duration'), 1000)
if start_time is None or duration is None: