Fix extraction in the CBC extractor (youtube_dl/extractor/cbc.py).

[youtubedl] / youtube_dl / extractor / cbc.py
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py

index 54b4b9be958ae49f0ea4f7d37cadcdf4e2c8b1c7..43f95c739deed7e497b2d85b23393c24f0f5c864 100644 (file)
--- a/youtube_dl/extractor/cbc.py
+++ b/youtube_dl/extractor/cbc.py
@@ -17,9 +17,11 @@ from ..utils import (
      xpath_element,
      xpath_with_ns,
      find_xpath_attr,
      xpath_element,
      xpath_with_ns,
      find_xpath_attr,
+    orderedSet,
      parse_duration,
      parse_iso8601,
      parse_age_limit,
      parse_duration,
      parse_iso8601,
      parse_age_limit,
+    strip_or_none,
      int_or_none,
      ExtractorError,
  )
      int_or_none,
      ExtractorError,
  )
@@ -129,15 +131,23 @@ class CBCIE(InfoExtractor):
      def _real_extract(self, url):
          display_id = self._match_id(url)
          webpage = self._download_webpage(url, display_id)
      def _real_extract(self, url):
          display_id = self._match_id(url)
          webpage = self._download_webpage(url, display_id)
+        title = self._og_search_title(webpage, default=None) or self._html_search_meta(
+            'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
+                r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
          entries = [
              self._extract_player_init(player_init, display_id)
              for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
          entries = [
              self._extract_player_init(player_init, display_id)
              for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
+        media_ids = []
+        for media_id_re in (
+                r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
+                r'<div[^>]+\bid=["\']player-(\d+)',
+                r'guid["\']\s*:\s*["\'](\d+)'):
+            media_ids.extend(re.findall(media_id_re, webpage))
          entries.extend([
              self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
          entries.extend([
              self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
-            for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)])
+            for media_id in orderedSet(media_ids)])
          return self.playlist_result(
          return self.playlist_result(
-            entries, display_id,
-            self._og_search_title(webpage, fatal=False),
+            entries, display_id, strip_or_none(title),
              self._og_search_description(webpage))
  
  
              self._og_search_description(webpage))