Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/abc7news.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import parse_iso8601
   7
   8
   9 class Abc7NewsIE(InfoExtractor):
  10     _VALID_URL = r'https?://abc7news\.com(?:/[^/]+/(?P<display_id>[^/]+))?/(?P<id>\d+)'
  11     _TESTS = [
  12         {
  13             'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/',
  14             'info_dict': {
  15                 'id': '472581',
  16                 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers',
  17                 'ext': 'mp4',
  18                 'title': 'East Bay museum celebrates history of synthesized music',
  19                 'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10',
  20                 'thumbnail': 're:^https?://.*\.jpg$',
  21                 'timestamp': 1421123075,
  22                 'upload_date': '20150113',
  23                 'uploader': 'Jonathan Bloom',
  24             },
  25             'params': {
  26                 # m3u8 download
  27                 'skip_download': True,
  28             },
  29         },
  30         {
  31             'url': 'http://abc7news.com/472581',
  32             'only_matching': True,
  33         },
  34     ]
  35
  36     def _real_extract(self, url):
  37         mobj = re.match(self._VALID_URL, url)
  38         video_id = mobj.group('id')
  39         display_id = mobj.group('display_id') or video_id
  40
  41         webpage = self._download_webpage(url, display_id)
  42
  43         m3u8 = self._html_search_meta(
  44             'contentURL', webpage, 'm3u8 url', fatal=True)
  45
  46         formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4')
  47
  48         title = self._og_search_title(webpage).strip()
  49         description = self._og_search_description(webpage).strip()
  50         thumbnail = self._og_search_thumbnail(webpage)
  51         timestamp = parse_iso8601(self._search_regex(
  52             r'<div class="meta">\s*<time class="timeago" datetime="([^"]+)">',
  53             webpage, 'upload date', fatal=False))
  54         uploader = self._search_regex(
  55             r'rel="author">([^<]+)</a>',
  56             webpage, 'uploader', default=None)
  57
  58         return {
  59             'id': video_id,
  60             'display_id': display_id,
  61             'title': title,
  62             'description': description,
  63             'thumbnail': thumbnail,
  64             'timestamp': timestamp,
  65             'uploader': uploader,
  66             'formats': formats,
  67         }