New upstream version 2017.09.24

[youtubedl] / youtube_dl / extractor / anvato.py
diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py

index cb29cf11122f3f53ede969f829af5ea27666b0ff..8023da70236599e1777172ac416a8ad828a6ec0c 100644 (file)
--- a/youtube_dl/extractor/anvato.py
+++ b/youtube_dl/extractor/anvato.py
@@ -5,6 +5,7 @@ import base64
  import hashlib
  import json
  import random
+import re
  import time
  
  from .common import InfoExtractor
@@ -16,6 +17,7 @@ from ..utils import (
      intlist_to_bytes,
      int_or_none,
      strip_jsonp,
+    unescapeHTML,
  )
  
  
@@ -26,6 +28,8 @@ def md5_text(s):
  
  
  class AnvatoIE(InfoExtractor):
+    _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
+
      # Copied from anvplayer.min.js
      _ANVACK_TABLE = {
          'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
@@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor):
          'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
      }
  
+    _MCP_TO_ACCESS_KEY_TABLE = {
+        'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922',
+        'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
+        'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+        'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+        'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a',
+        'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+        'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+        'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3',
+        'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
+        'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
+        'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
+        'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
+    }
+
+    _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
      _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
  
      def __init__(self, *args, **kwargs):
@@ -157,22 +177,16 @@ class AnvatoIE(InfoExtractor):
              video_data_url, video_id, transform_source=strip_jsonp,
              data=json.dumps(payload).encode('utf-8'))
  
-    def _extract_anvato_videos(self, webpage, video_id):
-        anvplayer_data = self._parse_json(self._html_search_regex(
-            r'<script[^>]+data-anvp=\'([^\']+)\'', webpage,
-            'Anvato player data'), video_id)
-
-        video_id = anvplayer_data['video']
-        access_key = anvplayer_data['accessKey']
-
+    def _get_anvato_videos(self, access_key, video_id):
          video_data = self._get_video_json(access_key, video_id)
  
          formats = []
          for published_url in video_data['published_urls']:
              video_url = published_url['embed_url']
+            media_format = published_url.get('format')
              ext = determine_ext(video_url)
  
-            if ext == 'smil':
+            if ext == 'smil' or media_format == 'smil':
                  formats.extend(self._extract_smil_formats(video_url, video_id))
                  continue
  
@@ -183,18 +197,13 @@ class AnvatoIE(InfoExtractor):
                  'tbr': tbr if tbr != 0 else None,
              }
  
-            if ext == 'm3u8':
-                # Not using _extract_m3u8_formats here as individual media
-                # playlists are also included in published_urls.
-                if tbr is None:
-                    formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls'))
-                    continue
-                else:
+            if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'):
+                if tbr is not None:
                      a_format.update({
                          'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
                          'ext': 'mp4',
                      })
-            elif ext == 'mp3':
+            elif ext == 'mp3' or media_format == 'mp3':
                  a_format['vcodec'] = 'none'
              else:
                  a_format.update({
@@ -218,7 +227,52 @@ class AnvatoIE(InfoExtractor):
              'formats': formats,
              'title': video_data.get('def_title'),
              'description': video_data.get('def_description'),
+            'tags': video_data.get('def_tags', '').split(','),
              'categories': video_data.get('categories'),
              'thumbnail': video_data.get('thumbnail'),
+            'timestamp': int_or_none(video_data.get(
+                'ts_published') or video_data.get('ts_added')),
+            'uploader': video_data.get('mcp_id'),
+            'duration': int_or_none(video_data.get('duration')),
              'subtitles': subtitles,
          }
+
+    @staticmethod
+    def _extract_urls(ie, webpage, video_id):
+        entries = []
+        for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
+            anvplayer_data = ie._parse_json(
+                mobj.group('anvp'), video_id, transform_source=unescapeHTML,
+                fatal=False)
+            if not anvplayer_data:
+                continue
+            video = anvplayer_data.get('video')
+            if not isinstance(video, compat_str) or not video.isdigit():
+                continue
+            access_key = anvplayer_data.get('accessKey')
+            if not access_key:
+                mcp = anvplayer_data.get('mcp')
+                if mcp:
+                    access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
+                        mcp.lower())
+            if not access_key:
+                continue
+            entries.append(ie.url_result(
+                'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
+                video_id=video))
+        return entries
+
+    def _extract_anvato_videos(self, webpage, video_id):
+        anvplayer_data = self._parse_json(
+            self._html_search_regex(
+                self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
+            video_id)
+        return self._get_anvato_videos(
+            anvplayer_data['accessKey'], anvplayer_data['video'])
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        access_key, video_id = mobj.group('access_key_or_mcp', 'id')
+        if access_key not in self._ANVACK_TABLE:
+            access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key]
+        return self._get_anvato_videos(access_key, video_id)