import xml.etree.ElementTree
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
from ..utils import ExtractorError
from ..compat import compat_HTTPError
-class BBCCoUkIE(SubtitlesInfoExtractor):
+class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
- _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
+ _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
_TESTS = [
{
}, {
'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
+ 'only_matching': True,
}
]
formats.extend(conn_formats)
return formats
-    def _extract_captions(self, media, programme_id):
+    def _get_subtitles(self, media, programme_id):
+        """Collect subtitles from every caption connection of ``media``.
+
+        For each connection the captions XML is downloaded and its ``<p>``
+        cues are rendered into SRT-style text.
+
+        Returns a dict mapping the document language (``xml:lang``
+        attribute, ``'en'`` when absent) to a list of two entries: the
+        original document referenced by URL (``ext`` ``'ttml'``) and the
+        converted text (``ext`` ``'srt'``).  A later connection with the
+        same language replaces an earlier one.
+        """
+
+        def _extract_text(p):
+            """Return the text of cue ``p``, falling back to its ``<span>`` children."""
+            if p.text is not None:
+                stripped_text = p.text.strip()
+                if stripped_text:
+                    return stripped_text
+            # An element without character data has ``text`` set to None;
+            # skip such spans instead of failing on ``None.strip()``.
+            return ' '.join(
+                span.text.strip()
+                for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span')
+                if span.text and span.text.strip())
+
         subtitles = {}
         for connection in self._extract_connections(media):
             captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
             lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
             ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
             srt = ''
             for pos, p in enumerate(ps):
-                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'),
-                                                          p.text.strip() if p.text is not None else '')
-            subtitles[lang] = srt
+                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
+            subtitles[lang] = [
+                {
+                    'url': connection.get('href'),
+                    'ext': 'ttml',
+                },
+                {
+                    'data': srt,
+                    'ext': 'srt',
+                },
+            ]
         return subtitles
def _download_media_selector(self, programme_id):
elif kind == 'video':
formats.extend(self._extract_video(media, programme_id))
elif kind == 'captions':
- subtitles = self._extract_captions(media, programme_id)
+ subtitles = self.extract_subtitles(media, programme_id)
return formats, subtitles
formats, subtitles = self._download_media_selector(programme_id)
return programme_id, title, description, duration, formats, subtitles
except ExtractorError as ee:
- if not isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
+ if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
raise
# fallback to legacy playlist
else:
programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(programme_id, subtitles)
- return
-
self._sort_formats(formats)
return {