New upstream version 2017.09.24

[youtubedl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index c108d4a8a4e9a31d6b931d39c8a6233e75be1b0d..7d0edf09c6a2e8428b84b3e7ebf787738db1bfaa 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -10,6 +10,7 @@ from .common import InfoExtractor
  from .youtube import YoutubeIE
  from ..compat import (
      compat_etree_fromstring,
+    compat_str,
      compat_urllib_parse_unquote,
      compat_urlparse,
      compat_xml_parse_error,
@@ -35,6 +36,10 @@ from .brightcove import (
      BrightcoveLegacyIE,
      BrightcoveNewIE,
  )
+from .nexx import (
+    NexxIE,
+    NexxEmbedIE,
+)
  from .nbc import NBCSportsVPlayerIE
  from .ooyala import OoyalaIE
  from .rutv import RUTVIE
@@ -56,6 +61,7 @@ from .dailymotion import (
      DailymotionIE,
      DailymotionCloudIE,
  )
+from .dailymail import DailyMailIE
  from .onionstudios import OnionStudiosIE
  from .viewlift import ViewLiftEmbedIE
  from .mtv import MTVServicesEmbeddedIE
@@ -90,6 +96,9 @@ from .anvato import AnvatoIE
  from .washingtonpost import WashingtonPostIE
  from .wistia import WistiaIE
  from .mediaset import MediasetIE
+from .joj import JojIE
+from .megaphone import MegaphoneIE
+from .vzaar import VzaarIE
  
  
  class GenericIE(InfoExtractor):
@@ -567,6 +576,19 @@ class GenericIE(InfoExtractor):
              },
              'skip': 'movie expired',
          },
+        # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js
+        {
+            'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/',
+            'info_dict': {
+                'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2',
+                'ext': 'mp4',
+                'title': 'Steampunk Fest Comes to Honesdale',
+                'duration': 43.276,
+            },
+            'params': {
+                'skip_download': True,
+            }
+        },
          # embed.ly video
          {
              'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -758,6 +780,20 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': ['Dailymotion'],
          },
+        # DailyMail embed
+        {
+            'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot',
+            'info_dict': {
+                'id': '1495629',
+                'ext': 'mp4',
+                'title': 'Care worker punches elderly dementia patient in head 11 times',
+                'description': 'md5:3a743dee84e57e48ec68bf67113199a5',
+            },
+            'add_ie': ['DailyMail'],
+            'params': {
+                'skip_download': True,
+            },
+        },
          # YouTube embed
          {
              'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
@@ -1184,7 +1220,7 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': ['Kaltura'],
          },
-        # Eagle.Platform embed (generic URL)
+        # EaglePlatform embed (generic URL)
          {
              'url': 'http://lenta.ru/news/2015/03/06/navalny/',
              # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@@ -1198,8 +1234,26 @@ class GenericIE(InfoExtractor):
                  'view_count': int,
                  'age_limit': 0,
              },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # referrer protected EaglePlatform embed
+        {
+            'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',
+            'info_dict': {
+                'id': '582306',
+                'ext': 'mp4',
+                'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 3382,
+                'view_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
          },
-        # ClipYou (Eagle.Platform) embed (custom URL)
+        # ClipYou (EaglePlatform) embed (custom URL)
          {
              'url': 'http://muz-tv.ru/play/7129/',
              # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@@ -1211,6 +1265,9 @@ class GenericIE(InfoExtractor):
                  'duration': 216,
                  'view_count': int,
              },
+            'params': {
+                'skip_download': True,
+            },
          },
          # Pladform embed
          {
@@ -1462,14 +1519,27 @@ class GenericIE(InfoExtractor):
          # LiveLeak embed
          {
              'url': 'http://www.wykop.pl/link/3088787/',
-            'md5': 'ace83b9ed19b21f68e1b50e844fdf95d',
+            'md5': '7619da8c820e835bef21a1efa2a0fc71',
              'info_dict': {
                  'id': '874_1459135191',
                  'ext': 'mp4',
                  'title': 'Man shows poor quality of new apartment building',
                  'description': 'The wall is like a sand pile.',
                  'uploader': 'Lake8737',
-            }
+            },
+            'add_ie': [LiveLeakIE.ie_key()],
+        },
+        # Another LiveLeak embed pattern (#13336)
+        {
+            'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/',
+            'info_dict': {
+                'id': '2eb_1496309988',
+                'ext': 'mp4',
+                'title': 'Thief robs place where everyone was armed',
+                'description': 'md5:694d73ee79e535953cf2488562288eee',
+                'uploader': 'brazilwtf',
+            },
+            'add_ie': [LiveLeakIE.ie_key()],
          },
          # Duplicated embedded video URLs
          {
@@ -1511,6 +1581,22 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': ['BrightcoveLegacy'],
          },
+        # Nexx embed
+        {
+            'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503',
+            'info_dict': {
+                'id': '247746',
+                'ext': 'mp4',
+                'title': "Yesterday's Jam (OV)",
+                'description': 'md5:09bc0984723fed34e2581624a84e05f0',
+                'timestamp': 1492594816,
+                'upload_date': '20170419',
+            },
+            'params': {
+                'format': 'bestvideo',
+                'skip_download': True,
+            },
+        },
          # Facebook <iframe> embed
          {
              'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
@@ -1521,6 +1607,21 @@ class GenericIE(InfoExtractor):
                  'title': 'Facebook video #599637780109885',
              },
          },
+        # Facebook <iframe> embed, plugin video
+        {
+            'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/',
+            'info_dict': {
+                'id': '1754168231264132',
+                'ext': 'mp4',
+                'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...',
+                'uploader': 'Tariq Ramadan (official)',
+                'timestamp': 1496758379,
+                'upload_date': '20170606',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
          # Facebook API embed
          {
              'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/',
@@ -1697,6 +1798,21 @@ class GenericIE(InfoExtractor):
              },
              'playlist_mincount': 5,
          },
+        {
+            # Limelight embed (LimelightPlayerUtil.embed)
+            'url': 'https://tv5.ca/videos?v=xuu8qowr291ri',
+            'info_dict': {
+                'id': '95d035dc5c8a401588e9c0e6bd1e9c92',
+                'ext': 'mp4',
+                'title': '07448641',
+                'timestamp': 1499890639,
+                'upload_date': '20170712',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': ['LimelightMedia'],
+        },
          {
              'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
              'info_dict': {
@@ -1733,6 +1849,45 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': [MediasetIE.ie_key()],
          },
+        {
+            # JOJ.sk embeds
+            'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+            'info_dict': {
+                'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+                'title': 'Slovenskom sa prehnala vlna silných búrok',
+            },
+            'playlist_mincount': 5,
+            'add_ie': [JojIE.ie_key()],
+        },
+        {
+            # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
+            'url': 'https://tvrain.ru/amp/418921/',
+            'md5': 'cc00413936695987e8de148b67d14f1d',
+            'info_dict': {
+                'id': '418921',
+                'ext': 'mp4',
+                'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+            },
+        },
+        {
+            # vzaar embed
+            'url': 'http://help.vzaar.com/article/165-embedding-video',
+            'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4',
+            'info_dict': {
+                'id': '8707641',
+                'ext': 'mp4',
+                'title': 'Building A Business Online: Principal Chairs Q & A',
+            },
+        },
+        {
+            # multiple HTML5 videos on one page
+            'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html',
+            'info_dict': {
+                'id': 'keyscenarios',
+                'title': 'Rescue Kit 14 Free Edition - Getting started',
+            },
+            'playlist_count': 4,
+        }
          # {
          #     # TODO: find another test
          #     # http://schema.org/VideoObject
@@ -1882,7 +2037,7 @@ class GenericIE(InfoExtractor):
  
          if head_response is not False:
              # Check for redirect
-            new_url = head_response.geturl()
+            new_url = compat_str(head_response.geturl())
              if url != new_url:
                  self.report_following_redirect(new_url)
                  if force_videoid:
@@ -1907,14 +2062,14 @@ class GenericIE(InfoExtractor):
          content_type = head_response.headers.get('Content-Type', '').lower()
          m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
          if m:
-            format_id = m.group('format_id')
+            format_id = compat_str(m.group('format_id'))
              if format_id.endswith('mpegurl'):
                  formats = self._extract_m3u8_formats(url, video_id, 'mp4')
              elif format_id == 'f4m':
                  formats = self._extract_f4m_formats(url, video_id)
              else:
                  formats = [{
-                    'format_id': m.group('format_id'),
+                    'format_id': format_id,
                      'url': url,
                      'vcodec': 'none' if m.group('type') == 'audio' else None
                  }]
@@ -1983,7 +2138,7 @@ class GenericIE(InfoExtractor):
              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
                  info_dict['formats'] = self._parse_mpd_formats(
                      doc, video_id,
-                    mpd_base_url=full_response.geturl().rpartition('/')[0],
+                    mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0],
                      mpd_url=url)
                  self._sort_formats(info_dict['formats'])
                  return info_dict
@@ -2032,6 +2187,13 @@ class GenericIE(InfoExtractor):
          video_description = self._og_search_description(webpage, default=None)
          video_thumbnail = self._og_search_thumbnail(webpage, default=None)
  
+        info_dict.update({
+            'title': video_title,
+            'description': video_description,
+            'thumbnail': video_thumbnail,
+            'age_limit': age_limit,
+        })
+
          # Look for Brightcove Legacy Studio embeds
          bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
          if bc_urls:
@@ -2053,6 +2215,16 @@ class GenericIE(InfoExtractor):
          if bc_urls:
              return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
  
+        # Look for Nexx embeds
+        nexx_urls = NexxIE._extract_urls(webpage)
+        if nexx_urls:
+            return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key())
+
+        # Look for Nexx iFrame embeds
+        nexx_embed_urls = NexxEmbedIE._extract_urls(webpage)
+        if nexx_embed_urls:
+            return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key())
+
          # Look for ThePlatform embeds
          tp_urls = ThePlatformIE._extract_urls(webpage)
          if tp_urls:
@@ -2080,36 +2252,11 @@ class GenericIE(InfoExtractor):
          if vid_me_embed_url is not None:
              return self.url_result(vid_me_embed_url, 'Vidme')
  
-        # Look for embedded YouTube player
-        matches = re.findall(r'''(?x)
-            (?:
-                <iframe[^>]+?src=|
-                data-video-url=|
-                <embed[^>]+?src=|
-                embedSWF\(?:\s*|
-                <object[^>]+data=|
-                new\s+SWFObject\(
-            )
-            (["\'])
-                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
-                (?:embed|v|p)/.+?)
-            \1''', webpage)
-        if matches:
+        # Look for YouTube embeds
+        youtube_urls = YoutubeIE._extract_urls(webpage)
+        if youtube_urls:
              return self.playlist_from_matches(
-                matches, video_id, video_title, lambda m: unescapeHTML(m[1]))
-
-        # Look for lazyYT YouTube embed
-        matches = re.findall(
-            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
-        if matches:
-            return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))
-
-        # Look for Wordpress "YouTube Video Importer" plugin
-        matches = re.findall(r'''(?x)<div[^>]+
-            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
-            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
-        if matches:
-            return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])
+                youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key())
  
          matches = DailymotionIE._extract_urls(webpage)
          if matches:
@@ -2125,6 +2272,12 @@ class GenericIE(InfoExtractor):
                  return self.playlist_from_matches(
                      playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
  
+        # Look for DailyMail embeds
+        dailymail_urls = DailyMailIE._extract_urls(webpage)
+        if dailymail_urls:
+            return self.playlist_from_matches(
+                dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key())
+
          # Look for embedded Wistia player
          wistia_url = WistiaIE._extract_url(webpage)
          if wistia_url:
@@ -2176,6 +2329,7 @@ class GenericIE(InfoExtractor):
          # Look for Ooyala videos
          mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
                  re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
+                re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
                  re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
                  re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
          if mobj is not None:
@@ -2221,9 +2375,9 @@ class GenericIE(InfoExtractor):
              return self.url_result(mobj.group('url'))
  
          # Look for embedded Facebook player
-        facebook_url = FacebookIE._extract_url(webpage)
-        if facebook_url is not None:
-            return self.url_result(facebook_url, 'Facebook')
+        facebook_urls = FacebookIE._extract_urls(webpage)
+        if facebook_urls:
+            return self.playlist_from_matches(facebook_urls, video_id, video_title)
  
          # Look for embedded VK player
          mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
@@ -2420,12 +2574,12 @@ class GenericIE(InfoExtractor):
          if kaltura_url:
              return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
  
-        # Look for Eagle.Platform embeds
+        # Look for EaglePlatform embeds
          eagleplatform_url = EaglePlatformIE._extract_url(webpage)
          if eagleplatform_url:
-            return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())
+            return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key())
  
-        # Look for ClipYou (uses Eagle.Platform) embeds
+        # Look for ClipYou (uses EaglePlatform) embeds
          mobj = re.search(
              r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
          if mobj is not None:
@@ -2600,9 +2754,9 @@ class GenericIE(InfoExtractor):
                  self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
  
          # Look for LiveLeak embeds
-        liveleak_url = LiveLeakIE._extract_url(webpage)
-        if liveleak_url:
-            return self.url_result(liveleak_url, 'LiveLeak')
+        liveleak_urls = LiveLeakIE._extract_urls(webpage)
+        if liveleak_urls:
+            return self.playlist_from_matches(liveleak_urls, video_id, video_title)
  
          # Look for 3Q SDN embeds
          threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
@@ -2654,7 +2808,7 @@ class GenericIE(InfoExtractor):
          rutube_urls = RutubeIE._extract_urls(webpage)
          if rutube_urls:
              return self.playlist_from_matches(
-                rutube_urls, ie=RutubeIE.ie_key())
+                rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
  
          # Look for WashingtonPost embeds
          wapo_urls = WashingtonPostIE._extract_urls(webpage)
@@ -2668,38 +2822,69 @@ class GenericIE(InfoExtractor):
              return self.playlist_from_matches(
                  mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
  
-        # Looking for http://schema.org/VideoObject
-        json_ld = self._search_json_ld(
-            webpage, video_id, default={}, expected_type='VideoObject')
-        if json_ld.get('url'):
-            info_dict.update({
-                'title': video_title or info_dict['title'],
-                'description': video_description,
-                'thumbnail': video_thumbnail,
-                'age_limit': age_limit
-            })
-            info_dict.update(json_ld)
-            return info_dict
+        # Look for JOJ.sk embeds
+        joj_urls = JojIE._extract_urls(webpage)
+        if joj_urls:
+            return self.playlist_from_matches(
+                joj_urls, video_id, video_title, ie=JojIE.ie_key())
+
+        # Look for megaphone.fm embeds
+        mpfn_urls = MegaphoneIE._extract_urls(webpage)
+        if mpfn_urls:
+            return self.playlist_from_matches(
+                mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key())
+
+        # Look for vzaar embeds
+        vzaar_urls = VzaarIE._extract_urls(webpage)
+        if vzaar_urls:
+            return self.playlist_from_matches(
+                vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key())
+
+        def merge_dicts(dict1, dict2):
+            merged = {}
+            for k, v in dict1.items():
+                if v is not None:
+                    merged[k] = v
+            for k, v in dict2.items():
+                if v is None:
+                    continue
+                if (k not in merged or
+                        (isinstance(v, compat_str) and v and
+                            isinstance(merged[k], compat_str) and
+                            not merged[k])):
+                    merged[k] = v
+            return merged
  
          # Look for HTML5 media
          entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
          if entries:
-            for entry in entries:
-                entry.update({
+            if len(entries) == 1:
+                entries[0].update({
                      'id': video_id,
                      'title': video_title,
                  })
+            else:
+                for num, entry in enumerate(entries, start=1):
+                    entry.update({
+                        'id': '%s-%s' % (video_id, num),
+                        'title': '%s (%d)' % (video_title, num),
+                    })
+            for entry in entries:
                  self._sort_formats(entry['formats'])
-            return self.playlist_result(entries)
+            return self.playlist_result(entries, video_id, video_title)
  
          jwplayer_data = self._find_jwplayer_data(
              webpage, video_id, transform_source=js_to_json)
          if jwplayer_data:
              info = self._parse_jwplayer_data(
                  jwplayer_data, video_id, require_title=False, base_url=url)
-            if not info.get('title'):
-                info['title'] = video_title
-            return info
+            return merge_dicts(info, info_dict)
+
+        # Looking for http://schema.org/VideoObject
+        json_ld = self._search_json_ld(
+            webpage, video_id, default={}, expected_type='VideoObject')
+        if json_ld.get('url'):
+            return merge_dicts(json_ld, info_dict)
  
          def check_video(vurl):
              if YoutubeIE.suitable(vurl):