Update upstream source from tag 'upstream/2019.09.28'

[youtubedl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 067de28cd16b83f20717aca02f77e25051ac222d..d1725d98b0c63f72031068a001ea150b54fd23ec 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -89,7 +89,10 @@ from .piksel import PikselIE
  from .videa import VideaIE
  from .twentymin import TwentyMinutenIE
  from .ustream import UstreamIE
-from .openload import OpenloadIE
+from .openload import (
+    OpenloadIE,
+    VerystreamIE,
+)
  from .videopress import VideoPressIE
  from .rutube import RutubeIE
  from .limelight import LimelightBaseIE
@@ -430,7 +433,7 @@ class GenericIE(InfoExtractor):
              },
          },
          {
-            # https://github.com/rg3/youtube-dl/issues/2253
+            # https://github.com/ytdl-org/youtube-dl/issues/2253
              'url': 'http://bcove.me/i6nfkrc3',
              'md5': '0ba9446db037002366bab3b3eb30c88c',
              'info_dict': {
@@ -455,7 +458,7 @@ class GenericIE(InfoExtractor):
              },
          },
          {
-            # https://github.com/rg3/youtube-dl/issues/3541
+            # https://github.com/ytdl-org/youtube-dl/issues/3541
              'add_ie': ['BrightcoveLegacy'],
              'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
              'info_dict': {
@@ -919,7 +922,7 @@ class GenericIE(InfoExtractor):
              }
          },
          # Multiple brightcove videos
-        # https://github.com/rg3/youtube-dl/issues/2283
+        # https://github.com/ytdl-org/youtube-dl/issues/2283
          {
              'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
              'info_dict': {
@@ -2072,6 +2075,22 @@ class GenericIE(InfoExtractor):
              },
              'playlist_count': 6,
          },
+        {
+            # Squarespace video embed, 2019-08-28
+            'url': 'http://ootboxford.com',
+            'info_dict': {
+                'id': 'Tc7b_JGdZfw',
+                'title': 'Out of the Blue, at Childish Things 10',
+                'ext': 'mp4',
+                'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f',
+                'uploader_id': 'helendouglashouse',
+                'uploader': 'Helen & Douglas House',
+                'upload_date': '20140328',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
          {
              # Zype embed
              'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
@@ -2101,6 +2120,23 @@ class GenericIE(InfoExtractor):
              },
              'expected_warnings': ['Failed to download MPD manifest'],
          },
+        {
+            # DailyMotion embed with DM.player
+            'url': 'https://www.beinsports.com/us/copa-del-rey/video/the-locker-room-valencia-beat-barca-in-copa/1203804',
+            'info_dict': {
+                'id': 'k6aKkGHd9FJs4mtJN39',
+                'ext': 'mp4',
+                'title': 'The Locker Room: Valencia Beat Barca In Copa del Rey Final',
+                'description': 'This video is private.',
+                'uploader_id': 'x1jf30l',
+                'uploader': 'beIN SPORTS USA',
+                'upload_date': '20190528',
+                'timestamp': 1559062971,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
          # {
          #     # TODO: find another test
          #     # http://schema.org/VideoObject
@@ -2206,7 +2242,7 @@ class GenericIE(InfoExtractor):
                  default_search = 'fixup_error'
  
              if default_search in ('auto', 'auto_warning', 'fixup_error'):
-                if '/' in url:
+                if re.match(r'^[^\s/]+\.[^\s/]+/', url):
                      self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
                      return self.url_result('http://' + url)
                  elif default_search != 'fixup_error':
@@ -2371,10 +2407,16 @@ class GenericIE(InfoExtractor):
              return camtasia_res
  
          # Sometimes embedded video player is hidden behind percent encoding
-        # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
+        # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
          # Unescaping the whole page allows to handle those cases in a generic way
          webpage = compat_urllib_parse_unquote(webpage)
  
+        # Unescape squarespace embeds to be detected by generic extractor,
+        # see https://github.com/ytdl-org/youtube-dl/issues/21294
+        webpage = re.sub(
+            r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
+            lambda x: unescapeHTML(x.group(0)), webpage)
+
          # it's tempting to parse this further, but you would
          # have to take into account all the variations like
          #   Video Title - Site Name
@@ -2546,11 +2588,11 @@ class GenericIE(InfoExtractor):
              return self.url_result(mobj.group('url'))
  
          # Look for Ooyala videos
-        mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
-                re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
-                re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
-                re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
-                re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
+        mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage)
+                or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage)
+                or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage)
+                or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage)
+                or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
          if mobj is not None:
              embed_token = self._search_regex(
                  r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)',
@@ -2580,19 +2622,6 @@ class GenericIE(InfoExtractor):
          if mobj is not None:
              return self.url_result(mobj.group(1), 'Mpora')
  
-        # Look for embedded NovaMov-based player
-        mobj = re.search(
-            r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
-                    (?P<url>http://(?:(?:embed|www)\.)?
-                        (?:novamov\.com|
-                           nowvideo\.(?:ch|sx|eu|at|ag|co)|
-                           videoweed\.(?:es|com)|
-                           movshare\.(?:net|sx|ag)|
-                           divxstage\.(?:eu|net|ch|co|at|ag))
-                        /embed\.php.+?)\1''', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'))
-
          # Look for embedded Facebook player
          facebook_urls = FacebookIE._extract_urls(webpage)
          if facebook_urls:
@@ -3017,6 +3046,12 @@ class GenericIE(InfoExtractor):
              return self.playlist_from_matches(
                  openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())
  
+        # Look for Verystream embeds
+        verystream_urls = VerystreamIE._extract_urls(webpage)
+        if verystream_urls:
+            return self.playlist_from_matches(
+                verystream_urls, video_id, video_title, ie=VerystreamIE.ie_key())
+
          # Look for VideoPress embeds
          videopress_urls = VideoPressIE._extract_urls(webpage)
          if videopress_urls:
@@ -3177,7 +3212,7 @@ class GenericIE(InfoExtractor):
                      jwplayer_data, video_id, require_title=False, base_url=url)
                  return merge_dicts(info, info_dict)
              except ExtractorError:
-                # See https://github.com/rg3/youtube-dl/pull/16735
+                # See https://github.com/ytdl-org/youtube-dl/pull/16735
                  pass
  
          # Video.js embed
@@ -3212,8 +3247,8 @@ class GenericIE(InfoExtractor):
                  else:
                      formats.append({
                          'url': src,
-                        'ext': (mimetype2ext(src_type) or
-                                ext if ext in KNOWN_EXTENSIONS else 'mp4'),
+                        'ext': (mimetype2ext(src_type)
+                                or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
                      })
              if formats:
                  self._sort_formats(formats)