ExtractorError,
determine_ext,
int_or_none,
+ float_or_none,
js_to_json,
+ orderedSet,
strip_jsonp,
strip_or_none,
unified_strdate,
_VALID_URL = r'''(?x)https?://
(?:
# Direct video URL
- (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
+ (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
# Article with embedded player (or direct video)
- (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
+ (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
# Player
(?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
)
''' % '|'.join(list(zip(*_STATIONS))[0])
+ _GEO_COUNTRIES = ['US']
+
_TESTS = [
{
'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
'description': 'md5:657897370e09e2bc6bf0f8d2cd313c6b',
'duration': 6559,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
},
},
{
'description': 'md5:c741d14e979fc53228c575894094f157',
'title': 'NOVA - Killer Typhoon',
'duration': 3172,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20140122',
'age_limit': 10,
},
},
'playlist_count': 2,
},
+ {
+ 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/',
+ 'info_dict': {
+ 'id': 'great-war',
+ },
+ 'playlist_count': 3,
+ },
{
'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
'info_dict': {
'title': 'American Experience - Death and the Civil War, Chapter 1',
'description': 'md5:67fa89a9402e2ee7d08f53b920674c18',
'duration': 682,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
'skip_download': True, # requires ffmpeg
'title': 'FRONTLINE - United States of Secrets (Part One)',
'description': 'md5:55756bd5c551519cc4b7703e373e217e',
'duration': 6851,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
},
},
{
'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5',
'duration': 1480,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
},
},
{
'title': 'FRONTLINE - The Atomic Artists',
'description': 'md5:f677e4520cfacb4a5ce1471e31b57800',
'duration': 723,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
'skip_download': True, # requires ffmpeg
'ext': 'mp4',
'title': 'FRONTLINE - Netanyahu at War',
'duration': 6852,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'formats': 'mincount:8',
},
},
+ {
+ # https://github.com/rg3/youtube-dl/issues/13801
+ 'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/',
+ 'info_dict': {
+ 'id': '3003333873',
+ 'ext': 'mp4',
+ 'title': 'PBS NewsHour - full episode July 31, 2017',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'duration': 3265,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
{
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True,
{
'url': 'http://watch.knpb.org/video/2365616055/',
'only_matching': True,
+ },
+ {
+ 'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
+ 'only_matching': True,
}
]
_ERRORS = {
410: 'This video has expired and is no longer available for online streaming.',
}
+ def _real_initialize(self):
+ cookie = (self._download_json(
+ 'http://localization.services.pbs.org/localize/auto/cookie/',
+ None, headers=self.geo_verification_headers(), fatal=False) or {}).get('cookie')
+ if cookie:
+ station = self._search_regex(r'#?s=\["([^"]+)"', cookie, 'station')
+ if station:
+ self._set_cookie('.pbs.org', 'pbsol.station', station)
+
def _extract_webpage(self, url):
mobj = re.match(self._VALID_URL, url)
# tabbed frontline videos
MULTI_PART_REGEXES = (
r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"',
- r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)',
+ r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)',
)
for p in MULTI_PART_REGEXES:
- tabbed_videos = re.findall(p, webpage)
+ tabbed_videos = orderedSet(re.findall(p, webpage))
if tabbed_videos:
return tabbed_videos, presumptive_id, upload_date, description
if url:
break
+ if not url:
+ url = self._og_search_url(webpage)
+
mobj = re.match(self._VALID_URL, url)
player_id = mobj.group('player_id')
redirects.append(redirect)
redirect_urls.add(redirect_url)
+ chapters = []
# Player pages may also serve different qualities
for page in ('widget/partnerplayer', 'portalplayer'):
player = self._download_webpage(
extract_redirect_urls(video_info)
if not info:
info = video_info
+ if not chapters:
+ for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
+ chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
+ if not chapter:
+ continue
+ start_time = float_or_none(chapter.get('start_time'), 1000)
+ duration = float_or_none(chapter.get('duration'), 1000)
+ if start_time is None or duration is None:
+ continue
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': start_time + duration,
+ 'title': chapter.get('title'),
+ })
formats = []
http_url = None
redirect_info = self._download_json(
'%s?format=json' % redirect['url'], display_id,
- 'Downloading %s video url info' % (redirect_id or num))
+ 'Downloading %s video url info' % (redirect_id or num),
+ headers=self.geo_verification_headers())
if redirect_info['status'] == 'error':
+ message = self._ERRORS.get(
+ redirect_info['http_code'], redirect_info['message'])
+ if redirect_info['http_code'] == 403:
+ self.raise_geo_restricted(
+ msg=message, countries=self._GEO_COUNTRIES)
raise ExtractorError(
- '%s said: %s' % (
- self.IE_NAME,
- self._ERRORS.get(redirect_info['http_code'], redirect_info['message'])),
- expected=True)
+ '%s said: %s' % (self.IE_NAME, message), expected=True)
format_url = redirect_info.get('url')
if not format_url:
http_url = format_url
self._remove_duplicate_formats(formats)
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
for m3u8_format in m3u8_formats:
# Try turning it to 'program - title' naming scheme if possible
alt_title = info.get('program', {}).get('title')
if alt_title:
- info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title'])
+ info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + r'[\s\-:]+', '', info['title'])
description = info.get('description') or info.get(
'program', {}).get('description') or description
'upload_date': upload_date,
'formats': formats,
'subtitles': subtitles,
+ 'chapters': chapters,
}