Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/slideshare.py

   1 import re
   2 import json
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     compat_urlparse,
   7     ExtractorError,
   8 )
   9
  10
  11 class SlideshareIE(InfoExtractor):
  12     _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'
  13
  14     _TEST = {
  15         u'url': u'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
  16         u'file': u'25665706.mp4',
  17         u'info_dict': {
  18             u'title': u'Managing Scale and Complexity',
  19             u'description': u'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix',
  20         },
  21     }
  22
  23     def _real_extract(self, url):
  24         mobj = re.match(self._VALID_URL, url)
  25         page_title = mobj.group('title')
  26         webpage = self._download_webpage(url, page_title)
  27         slideshare_obj = self._search_regex(
  28             r'var slideshare_object =  ({.*?}); var user_info =',
  29             webpage, u'slideshare object')
  30         info = json.loads(slideshare_obj)
  31         if info['slideshow']['type'] != u'video':
  32             raise ExtractorError(u'Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
  33
  34         doc = info['doc']
  35         bucket = info['jsplayer']['video_bucket']
  36         ext = info['jsplayer']['video_extension']
  37         video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
  38
  39         return {
  40             '_type': 'video',
  41             'id': info['slideshow']['id'],
  42             'title': info['slideshow']['title'],
  43             'ext': ext,
  44             'url': video_url,
  45             'thumbnail': info['slideshow']['pin_image_url'],
  46             'description': self._og_search_description(webpage),
  47         }