New upstream version 2018.06.18

[youtubedl] / youtube_dl / extractor / ted.py
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py

index 4bca62ba003e325ebedd0fcc74c953bd64120cd5..06a27fd0428b469abb64e6428faf263fc080cc33 100644 (file)
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -1,117 +1,320 @@
+from __future__ import unicode_literals
+
  import json
  import re
  
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
  
+from ..compat import compat_str
  from ..utils import (
-    RegexNotFoundError,
+    int_or_none,
+    try_get,
  )
  
-class TEDIE(SubtitlesInfoExtractor):
-    _VALID_URL=r'''http://www\.ted\.com/
-                   (
-                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
-                        |
-                        ((?P<type_talk>talks)) # We have a simple talk
-                   )
-                   (/lang/(.*?))? # The url may contain the language
-                   /(?P<name>\w+) # Here goes the name and then ".html"
-                   '''
-    _TEST = {
-        u'url': u'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
-        u'file': u'102.mp4',
-        u'md5': u'2d76ee1576672e0bd8f187513267adf6',
-        u'info_dict': {
-            u"description": u"md5:c6fa72e6eedbd938c9caf6b2702f5922", 
-            u"title": u"Dan Dennett: The illusion of consciousness"
+
+class TEDIE(InfoExtractor):
+    IE_NAME = 'ted'
+    _VALID_URL = r'''(?x)
+        (?P<proto>https?://)
+        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
+        (
+            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
+            |
+            ((?P<type_talk>talks)) # We have a simple talk
+            |
+            (?P<type_watch>watch)/[^/]+/[^/]+
+        )
+        (/lang/(.*?))? # The url may contain the language
+        /(?P<name>[\w-]+) # Here goes the name and then ".html"
+        .*)$
+        '''
+    _TESTS = [{
+        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
+        'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
+        'info_dict': {
+            'id': '102',
+            'ext': 'mp4',
+            'title': 'The illusion of consciousness',
+            'description': ('Philosopher Dan Dennett makes a compelling '
+                            'argument that not only don\'t we understand our own '
+                            'consciousness, but that half the time our brains are '
+                            'actively fooling us.'),
+            'uploader': 'Dan Dennett',
+            'width': 853,
+            'duration': 1308,
          }
+    }, {
+        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
+        'md5': 'b899ac15e345fb39534d913f7606082b',
+        'info_dict': {
+            'id': 'tSVI8ta_P4w',
+            'ext': 'mp4',
+            'title': 'Vishal Sikka: The beauty and power of algorithms',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
+            'upload_date': '20140122',
+            'uploader_id': 'TEDInstitute',
+            'uploader': 'TED Institute',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
+        'md5': '71b3ab2f4233012dce09d515c9c39ce2',
+        'info_dict': {
+            'id': '1972',
+            'ext': 'mp4',
+            'title': 'Be passionate. Be courageous. Be your best.',
+            'uploader': 'Gabby Giffords and Mark Kelly',
+            'description': 'md5:5174aed4d0f16021b704120360f72b92',
+            'duration': 1128,
+        },
+    }, {
+        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
+        'info_dict': {
+            'id': '10',
+            'title': 'Who are the hackers?',
+        },
+        'playlist_mincount': 6,
+    }, {
+        # contains a youtube video
+        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
+        'add_ie': ['Youtube'],
+        'info_dict': {
+            'id': '_ZG8HBuDjgc',
+            'ext': 'webm',
+            'title': 'Douglas Adams: Parrots the Universe and Everything',
+            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
+            'uploader': 'University of California Television (UCTV)',
+            'uploader_id': 'UCtelevision',
+            'upload_date': '20080522',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # YouTube video
+        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
+        'add_ie': ['Youtube'],
+        'info_dict': {
+            'id': 'aFBIPO-P7LM',
+            'ext': 'mp4',
+            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
+            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
+            'uploader': 'TEDx Talks',
+            'uploader_id': 'TEDxTalks',
+            'upload_date': '20111216',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    _NATIVE_FORMATS = {
+        'low': {'width': 320, 'height': 180},
+        'medium': {'width': 512, 'height': 288},
+        'high': {'width': 854, 'height': 480},
      }
  
-    @classmethod
-    def suitable(cls, url):
-        """Receives a URL and returns True if suitable for this IE."""
-        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+    def _extract_info(self, webpage):
+        info_json = self._search_regex(
+            r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>',
+            webpage, 'info json')
+        return json.loads(info_json)
  
      def _real_extract(self, url):
-        m=re.match(self._VALID_URL, url, re.VERBOSE)
+        m = re.match(self._VALID_URL, url, re.VERBOSE)
+        if m.group('type').startswith('embed'):
+            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
+            return self.url_result(desktop_url, 'TED')
+        name = m.group('name')
          if m.group('type_talk'):
-            return self._talk_info(url)
-        else :
-            playlist_id=m.group('playlist_id')
-            name=m.group('name')
-            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
-            return [self._playlist_videos_info(url,name,playlist_id)]
-
+            return self._talk_info(url, name)
+        elif m.group('type_watch'):
+            return self._watch_info(url, name)
+        else:
+            return self._playlist_videos_info(url, name)
  
-    def _playlist_videos_info(self, url, name, playlist_id):
+    def _playlist_videos_info(self, url, name):
          '''Returns the videos of the playlist'''
  
-        webpage = self._download_webpage(
-            url, playlist_id, u'Downloading playlist webpage')
-        matches = re.finditer(
-            r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
-            webpage)
+        webpage = self._download_webpage(url, name,
+                                         'Downloading playlist webpage')
+        info = self._extract_info(webpage)
  
-        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
-                                                 webpage, 'playlist title')
+        playlist_info = try_get(
+            info, lambda x: x['__INITIAL_DATA__']['playlist'],
+            dict) or info['playlist']
  
          playlist_entries = [
-            self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
-            for m in matches
+            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
+            for talk in try_get(
+                info, lambda x: x['__INITIAL_DATA__']['talks'],
+                dict) or info['talks']
          ]
          return self.playlist_result(
-            playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
-
-    def _talk_info(self, url, video_id=0):
-        """Return the video for the talk in the url"""
-        m = re.match(self._VALID_URL, url,re.VERBOSE)
-        video_name = m.group('name')
-        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
-        self.report_extraction(video_name)
-        # If the url includes the language we get the title translated
-        title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
-                                        webpage, 'title')
-        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
-                                    webpage, 'json data')
-        info = json.loads(json_data)
-        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
-                                       webpage, 'description', flags = re.DOTALL)
-        
-        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
-                                       webpage, 'thumbnail')
+            playlist_entries,
+            playlist_id=compat_str(playlist_info['id']),
+            playlist_title=playlist_info['title'])
+
+    def _talk_info(self, url, video_name):
+        webpage = self._download_webpage(url, video_name)
+
+        info = self._extract_info(webpage)
+
+        talk_info = try_get(
+            info, lambda x: x['__INITIAL_DATA__']['talks'][0],
+            dict) or info['talks'][0]
+
+        title = talk_info['title'].strip()
+
+        external = talk_info.get('external')
+        if external:
+            service = external['service']
+            self.to_screen('Found video from %s' % service)
+            ext_url = None
+            if service.lower() == 'youtube':
+                ext_url = external.get('code')
+            return {
+                '_type': 'url',
+                'url': ext_url or external['uri'],
+            }
+
+        native_downloads = try_get(
+            talk_info, lambda x: x['downloads']['nativeDownloads'],
+            dict) or talk_info['nativeDownloads']
+
          formats = [{
-            'ext': 'mp4',
-            'url': stream['file'],
-            'format': stream['id']
-        } for stream in info['htmlStreams']]
+            'url': format_url,
+            'format_id': format_id,
+            'format': format_id,
+        } for (format_id, format_url) in native_downloads.items() if format_url is not None]
+        if formats:
+            for f in formats:
+                finfo = self._NATIVE_FORMATS.get(f['format_id'])
+                if finfo:
+                    f.update(finfo)
+
+        player_talk = talk_info['player_talks'][0]
+
+        resources_ = player_talk.get('resources') or talk_info.get('resources')
+
+        http_url = None
+        for format_id, resources in resources_.items():
+            if format_id == 'h264':
+                for resource in resources:
+                    h264_url = resource.get('file')
+                    if not h264_url:
+                        continue
+                    bitrate = int_or_none(resource.get('bitrate'))
+                    formats.append({
+                        'url': h264_url,
+                        'format_id': '%s-%sk' % (format_id, bitrate),
+                        'tbr': bitrate,
+                    })
+                    if re.search(r'\d+k', h264_url):
+                        http_url = h264_url
+            elif format_id == 'rtmp':
+                streamer = talk_info.get('streamer')
+                if not streamer:
+                    continue
+                for resource in resources:
+                    formats.append({
+                        'format_id': '%s-%s' % (format_id, resource.get('name')),
+                        'url': streamer,
+                        'play_path': resource['file'],
+                        'ext': 'flv',
+                        'width': int_or_none(resource.get('width')),
+                        'height': int_or_none(resource.get('height')),
+                        'tbr': int_or_none(resource.get('bitrate')),
+                    })
+            elif format_id == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
  
-        video_id = info['id']
+        m3u8_formats = list(filter(
+            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
+            formats))
+        if http_url:
+            for m3u8_format in m3u8_formats:
+                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
+                if not bitrate:
+                    continue
+                f = m3u8_format.copy()
+                f.update({
+                    'url': re.sub(r'\d+k', bitrate, http_url),
+                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
+                    'protocol': 'http',
+                })
+                formats.append(f)
  
-        # subtitles
-        video_subtitles = self.extract_subtitles(video_id, webpage)
-        if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(video_id, webpage)
-            return
+        audio_download = talk_info.get('audioDownload')
+        if audio_download:
+            formats.append({
+                'url': audio_download,
+                'format_id': 'audio',
+                'vcodec': 'none',
+            })
+
+        self._sort_formats(formats)
+
+        video_id = compat_str(talk_info['id'])
  
          return {
              'id': video_id,
              'title': title,
-            'thumbnail': thumbnail,
-            'description': desc,
-            'subtitles': video_subtitles,
+            'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
+            'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
+            'description': self._og_search_description(webpage),
+            'subtitles': self._get_subtitles(video_id, talk_info),
              'formats': formats,
+            'duration': talk_info.get('duration'),
          }
  
-    def _get_available_subtitles(self, video_id, webpage):
-        try:
-            options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
-            languages = re.findall(r'(?:<option value=")(\S+)"', options)
-            if languages:
-                sub_lang_list = {}
-                for l in languages:
-                    url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
-                    sub_lang_list[l] = url
-                return sub_lang_list
-        except RegexNotFoundError:
-            self._downloader.report_warning(u'video doesn\'t have subtitles')
-        return {}
+    def _get_subtitles(self, video_id, talk_info):
+        sub_lang_list = {}
+        for language in try_get(
+                talk_info,
+                (lambda x: x['downloads']['languages'],
+                 lambda x: x['languages']), list):
+            lang_code = language.get('languageCode') or language.get('ianaCode')
+            if not lang_code:
+                continue
+            sub_lang_list[lang_code] = [
+                {
+                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
+                    'ext': ext,
+                }
+                for ext in ['ted', 'srt']
+            ]
+        return sub_lang_list
+
+    def _watch_info(self, url, name):
+        webpage = self._download_webpage(url, name)
+
+        config_json = self._html_search_regex(
+            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
+            webpage, 'config', default=None)
+        if not config_json:
+            embed_url = self._search_regex(
+                r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
+            return self.url_result(self._proto_relative_url(embed_url))
+        config = json.loads(config_json)['config']
+        video_url = config['video']['url']
+        thumbnail = config.get('image', {}).get('url')
+
+        title = self._html_search_regex(
+            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
+        description = self._html_search_regex(
+            [
+                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
+                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
+            ],
+            webpage, 'description', fatal=False)
+
+        return {
+            'id': name,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'description': description,
+        }