]> Raphaƫl G. Git Repositories - youtubedl/blobdiff - youtube_dl/extractor/bbccouk.py
Imported Upstream version 2015.02.28
[youtubedl] / youtube_dl / extractor / bbccouk.py
index 126c8824cccedbca287ac3ebfc92d1a5e2d93b57..abc34a5761487b5a900294dac59db4a053b95cb0 100644 (file)
@@ -2,12 +2,12 @@ from __future__ import unicode_literals
 
 import xml.etree.ElementTree
 
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
 from ..utils import ExtractorError
 from ..compat import compat_HTTPError
 
 
-class BBCCoUkIE(SubtitlesInfoExtractor):
+class BBCCoUkIE(InfoExtractor):
     IE_NAME = 'bbc.co.uk'
     IE_DESC = 'BBC iPlayer'
     _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
@@ -215,17 +215,32 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
             formats.extend(conn_formats)
         return formats
 
-    def _extract_captions(self, media, programme_id):
+    def _get_subtitles(self, media, programme_id):
         subtitles = {}
         for connection in self._extract_connections(media):
             captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
             lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
             ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
             srt = ''
+
+            def _extract_text(p):
+                if p.text is not None:
+                    stripped_text = p.text.strip()
+                    if stripped_text:
+                        return stripped_text
+                return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
             for pos, p in enumerate(ps):
-                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'),
-                                                          p.text.strip() if p.text is not None else '')
-            subtitles[lang] = srt
+                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
+            subtitles[lang] = [
+                {
+                    'url': connection.get('href'),
+                    'ext': 'ttml',
+                },
+                {
+                    'data': srt,
+                    'ext': 'srt',
+                },
+            ]
         return subtitles
 
     def _download_media_selector(self, programme_id):
@@ -249,7 +264,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
             elif kind == 'video':
                 formats.extend(self._extract_video(media, programme_id))
             elif kind == 'captions':
-                subtitles = self._extract_captions(media, programme_id)
+                subtitles = self.extract_subtitles(media, programme_id)
 
         return formats, subtitles
 
@@ -273,7 +288,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
                     formats, subtitles = self._download_media_selector(programme_id)
                 return programme_id, title, description, duration, formats, subtitles
         except ExtractorError as ee:
-            if not isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
+            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
                 raise
 
         # fallback to legacy playlist
@@ -324,10 +339,6 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
         else:
             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 
-        if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(programme_id, subtitles)
-            return
-
         self._sort_formats(formats)
 
         return {