Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/techtalks.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     get_element_by_attribute,
   8     clean_html,
   9 )
  10
  11
  12 class TechTalksIE(InfoExtractor):
  13     _VALID_URL = r'https?://techtalks\.tv/talks/(?:[^/]+/)?(?P<id>\d+)'
  14
  15     _TESTS = [{
  16         'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
  17         'info_dict': {
  18             'id': '57758',
  19             'title': 'Learning Topic Models --- Going beyond SVD',
  20         },
  21         'playlist': [
  22             {
  23                 'info_dict': {
  24                     'id': '57758',
  25                     'ext': 'flv',
  26                     'title': 'Learning Topic Models --- Going beyond SVD',
  27                 },
  28             },
  29             {
  30                 'info_dict': {
  31                     'id': '57758-slides',
  32                     'ext': 'flv',
  33                     'title': 'Learning Topic Models --- Going beyond SVD',
  34                 },
  35             },
  36         ],
  37         'params': {
  38             # rtmp download
  39             'skip_download': True,
  40         },
  41     }, {
  42         'url': 'http://techtalks.tv/talks/57758',
  43         'only_matching': True,
  44     }]
  45
  46     def _real_extract(self, url):
  47         mobj = re.match(self._VALID_URL, url)
  48         talk_id = mobj.group('id')
  49         webpage = self._download_webpage(url, talk_id)
  50         rtmp_url = self._search_regex(
  51             r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url')
  52         play_path = self._search_regex(
  53             r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
  54             webpage, 'presenter play path')
  55         title = clean_html(get_element_by_attribute('class', 'title', webpage))
  56         video_info = {
  57             'id': talk_id,
  58             'title': title,
  59             'url': rtmp_url,
  60             'play_path': play_path,
  61             'ext': 'flv',
  62         }
  63         m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage)
  64         if m_slides is None:
  65             return video_info
  66         else:
  67             return {
  68                 '_type': 'playlist',
  69                 'id': talk_id,
  70                 'title': title,
  71                 'entries': [
  72                     video_info,
  73                     # The slides video
  74                     {
  75                         'id': talk_id + '-slides',
  76                         'title': title,
  77                         'url': rtmp_url,
  78                         'play_path': m_slides.group(1),
  79                         'ext': 'flv',
  80                     },
  81                 ],
  82             }