debian/control: Remove trailing whitespace at EOF.

[youtubedl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 1d9da8115832126671233101dbc3b51759e63a33..dad951b751853f900a26e6cd2e7bce5ca964b749 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -23,6 +23,7 @@ from ..utils import (
      is_html,
      js_to_json,
      KNOWN_EXTENSIONS,
+    merge_dicts,
      mimetype2ext,
      orderedSet,
      sanitized_Request,
@@ -58,6 +59,7 @@ from .xhamster import XHamsterEmbedIE
  from .tnaflix import TNAFlixNetworkEmbedIE
  from .drtuber import DrTuberIE
  from .redtube import RedTubeIE
+from .tube8 import Tube8IE
  from .vimeo import VimeoIE
  from .dailymotion import DailymotionIE
  from .dailymail import DailyMailIE
@@ -102,6 +104,13 @@ from .channel9 import Channel9IE
  from .vshare import VShareIE
  from .mediasite import MediasiteIE
  from .springboardplatform import SpringboardPlatformIE
+from .yapfiles import YapFilesIE
+from .vice import ViceIE
+from .xfileshare import XFileShareIE
+from .cloudflarestream import CloudflareStreamIE
+from .peertube import PeerTubeIE
+from .indavideo import IndavideoEmbedIE
+from .apa import APAIE
  
  
  class GenericIE(InfoExtractor):
@@ -186,6 +195,16 @@ class GenericIE(InfoExtractor):
                  'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
              }
          },
+        # RSS feed with enclosures and unsupported link URLs
+        {
+            'url': 'http://www.hellointernet.fm/podcast?format=rss',
+            'info_dict': {
+                'id': 'http://www.hellointernet.fm/podcast?format=rss',
+                'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.',
+                'title': 'Hello Internet',
+            },
+            'playlist_mincount': 100,
+        },
          # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
          {
              'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
@@ -1216,7 +1235,7 @@ class GenericIE(InfoExtractor):
                  'title': '35871',
                  'timestamp': 1355743100,
                  'upload_date': '20121217',
-                'uploader_id': 'batchUser',
+                'uploader_id': 'cplapp@learn360.com',
              },
              'add_ie': ['Kaltura'],
          },
@@ -1267,23 +1286,38 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': ['Kaltura'],
          },
-        # EaglePlatform embed (generic URL)
          {
-            'url': 'http://lenta.ru/news/2015/03/06/navalny/',
-            # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
+            # Kaltura iframe embed, more sophisticated
+            'url': 'http://www.cns.nyu.edu/~eero/math-tools/Videos/lecture-05sep2017.html',
              'info_dict': {
-                'id': '227304',
+                'id': '1_9gzouybz',
                  'ext': 'mp4',
-                'title': 'Навальный вышел на свободу',
-                'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'duration': 87,
-                'view_count': int,
-                'age_limit': 0,
+                'title': 'lecture-05sep2017',
+                'description': 'md5:40f347d91fd4ba047e511c5321064b49',
+                'upload_date': '20170913',
+                'uploader_id': 'eps2',
+                'timestamp': 1505340777,
              },
              'params': {
                  'skip_download': True,
              },
+            'add_ie': ['Kaltura'],
+        },
+        {
+            # meta twitter:player
+            'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/',
+            'info_dict': {
+                'id': '0_01b42zps',
+                'ext': 'mp4',
+                'title': 'Main Twerk (Video)',
+                'upload_date': '20171208',
+                'uploader_id': 'sebastian.salinas@thechive.com',
+                'timestamp': 1512713057,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': ['Kaltura'],
          },
          # referrer protected EaglePlatform embed
          {
@@ -1441,21 +1475,6 @@ class GenericIE(InfoExtractor):
              },
              'expected_warnings': ['Failed to parse JSON Expecting value'],
          },
-        # Ooyala embed
-        {
-            'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
-            'info_dict': {
-                'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
-                'ext': 'mp4',
-                'description': 'Index/Match versus VLOOKUP.',
-                'title': 'This is what separates the Excel masters from the wannabes',
-                'duration': 191.933,
-            },
-            'params': {
-                # m3u8 downloads
-                'skip_download': True,
-            }
-        },
          # Brightcove URL in single quotes
          {
              'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
@@ -1954,7 +1973,102 @@ class GenericIE(InfoExtractor):
                  'skip_download': True,
              },
              'add_ie': [SpringboardPlatformIE.ie_key()],
-        }
+        },
+        {
+            'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU',
+            'info_dict': {
+                'id': 'uPDB5I9wfp8',
+                'ext': 'webm',
+                'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3',
+                'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d',
+                'upload_date': '20160219',
+                'uploader': 'Pocoyo - Português (BR)',
+                'uploader_id': 'PocoyoBrazil',
+            },
+            'add_ie': [YoutubeIE.ie_key()],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html',
+            'info_dict': {
+                'id': 'vMDE4NzI1Mjgt690b',
+                'ext': 'mp4',
+                'title': 'Котята',
+            },
+            'add_ie': [YapFilesIE.ie_key()],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # CloudflareStream embed
+            'url': 'https://www.cloudflare.com/products/cloudflare-stream/',
+            'info_dict': {
+                'id': '31c9291ab41fac05471db4e73aa11717',
+                'ext': 'mp4',
+                'title': '31c9291ab41fac05471db4e73aa11717',
+            },
+            'add_ie': [CloudflareStreamIE.ie_key()],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # PeerTube embed
+            'url': 'https://joinpeertube.org/fr/home/',
+            'info_dict': {
+                'id': 'home',
+                'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube',
+            },
+            'playlist_count': 2,
+        },
+        {
+            # Indavideo embed
+            'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/',
+            'info_dict': {
+                'id': '1693903',
+                'ext': 'mp4',
+                'title': 'Így kell otthon hamburgert sütni',
+                'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7',
+                'timestamp': 1426330212,
+                'upload_date': '20150314',
+                'uploader': 'StreetKitchen',
+                'uploader_id': '546363',
+            },
+            'add_ie': [IndavideoEmbedIE.ie_key()],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # APA embed via JWPlatform embed
+            'url': 'http://www.vol.at/blue-man-group/5593454',
+            'info_dict': {
+                'id': 'jjv85FdZ',
+                'ext': 'mp4',
+                'title': '"Blau ist mysteriös": Die Blue Man Group im Interview',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 254,
+                'timestamp': 1519211149,
+                'upload_date': '20180221',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://share-videos.se/auto/video/83645793?uid=13',
+            'md5': 'b68d276de422ab07ee1d49388103f457',
+            'info_dict': {
+                'id': '83645793',
+                'title': 'Lock up and get excited',
+                'ext': 'mp4'
+            },
+            'skip': 'TODO: fix nested playlists processing in tests',
+        },
          # {
          #     # TODO: find another test
          #     # http://schema.org/VideoObject
@@ -1985,13 +2099,15 @@ class GenericIE(InfoExtractor):
  
          entries = []
          for it in doc.findall('./channel/item'):
-            next_url = xpath_text(it, 'link', fatal=False)
+            next_url = None
+            enclosure_nodes = it.findall('./enclosure')
+            for e in enclosure_nodes:
+                next_url = e.attrib.get('url')
+                if next_url:
+                    break
+
              if not next_url:
-                enclosure_nodes = it.findall('./enclosure')
-                for e in enclosure_nodes:
-                    next_url = e.attrib.get('url')
-                    if next_url:
-                        break
+                next_url = xpath_text(it, 'link', fatal=False)
  
              if not next_url:
                  continue
@@ -2201,7 +2317,11 @@ class GenericIE(InfoExtractor):
                  self._sort_formats(smil['formats'])
                  return smil
              elif doc.tag == '{http://xspf.org/ns/0/}playlist':
-                return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
+                return self.playlist_result(
+                    self._parse_xspf(
+                        doc, video_id, xspf_url=url,
+                        xspf_base_url=compat_str(full_response.geturl())),
+                    video_id)
              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
                  info_dict['formats'] = self._parse_mpd_formats(
                      doc,
@@ -2280,7 +2400,10 @@ class GenericIE(InfoExtractor):
          # Look for Brightcove New Studio embeds
          bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
          if bc_urls:
-            return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
+            return self.playlist_from_matches(
+                bc_urls, video_id, video_title,
+                getter=lambda x: smuggle_url(x, {'referrer': url}),
+                ie='BrightcoveNew')
  
          # Look for Nexx embeds
          nexx_urls = NexxIE._extract_urls(webpage)
@@ -2526,6 +2649,11 @@ class GenericIE(InfoExtractor):
          if redtube_urls:
              return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())
  
+        # Look for embedded Tube8 player
+        tube8_urls = Tube8IE._extract_urls(webpage)
+        if tube8_urls:
+            return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key())
+
          # Look for embedded Tvigle player
          mobj = re.search(
              r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@@ -2928,20 +3056,47 @@ class GenericIE(InfoExtractor):
                  springboardplatform_urls, video_id, video_title,
                  ie=SpringboardPlatformIE.ie_key())
  
-        def merge_dicts(dict1, dict2):
-            merged = {}
-            for k, v in dict1.items():
-                if v is not None:
-                    merged[k] = v
-            for k, v in dict2.items():
-                if v is None:
-                    continue
-                if (k not in merged or
-                        (isinstance(v, compat_str) and v and
-                            isinstance(merged[k], compat_str) and
-                            not merged[k])):
-                    merged[k] = v
-            return merged
+        yapfiles_urls = YapFilesIE._extract_urls(webpage)
+        if yapfiles_urls:
+            return self.playlist_from_matches(
+                yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key())
+
+        vice_urls = ViceIE._extract_urls(webpage)
+        if vice_urls:
+            return self.playlist_from_matches(
+                vice_urls, video_id, video_title, ie=ViceIE.ie_key())
+
+        xfileshare_urls = XFileShareIE._extract_urls(webpage)
+        if xfileshare_urls:
+            return self.playlist_from_matches(
+                xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key())
+
+        cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage)
+        if cloudflarestream_urls:
+            return self.playlist_from_matches(
+                cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key())
+
+        peertube_urls = PeerTubeIE._extract_urls(webpage)
+        if peertube_urls:
+            return self.playlist_from_matches(
+                peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key())
+
+        indavideo_urls = IndavideoEmbedIE._extract_urls(webpage)
+        if indavideo_urls:
+            return self.playlist_from_matches(
+                indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key())
+
+        apa_urls = APAIE._extract_urls(webpage)
+        if apa_urls:
+            return self.playlist_from_matches(
+                apa_urls, video_id, video_title, ie=APAIE.ie_key())
+
+        sharevideos_urls = [mobj.group('url') for mobj in re.finditer(
+            r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1',
+            webpage)]
+        if sharevideos_urls:
+            return self.playlist_from_matches(
+                sharevideos_urls, video_id, video_title)
  
          # Look for HTML5 media
          entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')