Prepare to upload.

[youtubedl] / youtube_dl / extractor / theplatform.py
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py

index 25edc310008ef0da7b407c97db86ce5e49c78a50..93d8715716d00be7e15c53afbe804a3703b38e68 100644 (file)
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -16,11 +16,12 @@ from ..compat import (
  from ..utils import (
      determine_ext,
      ExtractorError,
-    xpath_with_ns,
-    unsmuggle_url,
-    int_or_none,
-    url_basename,
      float_or_none,
+    int_or_none,
+    sanitized_Request,
+    unsmuggle_url,
+    xpath_with_ns,
+    mimetype2ext,
  )
  
  default_ns = 'http://www.w3.org/2005/SMIL21/Language'
@@ -68,7 +69,7 @@ class ThePlatformBaseIE(InfoExtractor):
              for caption in captions:
                  lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
                  subtitles[lang] = [{
-                    'ext': 'srt' if mime == 'text/srt' else 'ttml',
+                    'ext': mimetype2ext(mime),
                      'url': src,
                  }]
  
@@ -84,7 +85,7 @@ class ThePlatformBaseIE(InfoExtractor):
  class ThePlatformIE(ThePlatformBaseIE):
      _VALID_URL = r'''(?x)
          (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
-           (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
+           (?:(?P<media>(?:(?:[^/]+/)+select/)?media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
           |theplatform:)(?P<id>[^/\?&]+)'''
  
      _TESTS = [{
@@ -139,6 +140,11 @@ class ThePlatformIE(ThePlatformBaseIE):
              'upload_date': '20150701',
              'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"],
          },
+    }, {
+        # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
+        # geo-restricted (US), HLS encrypted with AES-128
+        'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781',
+        'only_matching': True,
      }]
  
      @staticmethod
@@ -182,8 +188,12 @@ class ThePlatformIE(ThePlatformBaseIE):
              # Seems there's no pattern for the interested script filename, so
              # I try one by one
              for script in reversed(scripts):
-                feed_script = self._download_webpage(script, video_id, 'Downloading feed script')
-                feed_id = self._search_regex(r'defaultFeedId\s*:\s*"([^"]+)"', feed_script, 'default feed id', default=None)
+                feed_script = self._download_webpage(
+                    self._proto_relative_url(script, 'http:'),
+                    video_id, 'Downloading feed script')
+                feed_id = self._search_regex(
+                    r'defaultFeedId\s*:\s*"([^"]+)"', feed_script,
+                    'default feed id', default=None)
                  if feed_id is not None:
                      break
              if feed_id is None:
@@ -193,6 +203,20 @@ class ThePlatformIE(ThePlatformBaseIE):
  
          if smuggled_data.get('force_smil_url', False):
              smil_url = url
+        # Explicitly specified SMIL (see https://github.com/rg3/youtube-dl/issues/7385)
+        elif '/guid/' in url:
+            headers = {}
+            source_url = smuggled_data.get('source_url')
+            if source_url:
+                headers['Referer'] = source_url
+            request = sanitized_Request(url, headers=headers)
+            webpage = self._download_webpage(request, video_id)
+            smil_url = self._search_regex(
+                r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml',
+                webpage, 'smil url', group='url')
+            path = self._search_regex(
+                r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path')
+            smil_url += ('?' if '?' not in smil_url else '&') + 'formats=m3u,mpeg4&format=SMIL'
          elif mobj.group('config'):
              config_url = url + '&form=json'
              config_url = config_url.replace('swf/', 'config/')
@@ -259,8 +283,8 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
          first_video_id = None
          duration = None
          for item in entry['media$content']:
-            smil_url = item['plfile$url'] + '&format=SMIL&Tracking=true&Embedded=true&formats=MPEG4,F4M'
-            cur_video_id = url_basename(smil_url)
+            smil_url = item['plfile$url'] + '&format=SMIL&mbr=true'
+            cur_video_id = ThePlatformIE._match_id(smil_url)
              if first_video_id is None:
                  first_video_id = cur_video_id
                  duration = float_or_none(item.get('plfile$duration'))