Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/elpais.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..utils import unified_strdate
   8
   9
  10 class ElPaisIE(InfoExtractor):
  11     _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
  12     IE_DESC = 'El País'
  13
  14     _TEST = {
  15         'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',
  16         'md5': '98406f301f19562170ec071b83433d55',
  17         'info_dict': {
  18             'id': 'tiempo-nuevo-recetas-viejas',
  19             'ext': 'mp4',
  20             'title': 'Tiempo nuevo, recetas viejas',
  21             'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',
  22             'upload_date': '20140206',
  23         }
  24     }
  25
  26     def _real_extract(self, url):
  27         mobj = re.match(self._VALID_URL, url)
  28         video_id = mobj.group('id')
  29
  30         webpage = self._download_webpage(url, video_id)
  31
  32         prefix = self._html_search_regex(
  33             r'var url_cache = "([^"]+)";', webpage, 'URL prefix')
  34         video_suffix = self._search_regex(
  35             r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL')
  36         video_url = prefix + video_suffix
  37         thumbnail_suffix = self._search_regex(
  38             r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL',
  39             fatal=False)
  40         thumbnail = (
  41             None if thumbnail_suffix is None
  42             else prefix + thumbnail_suffix)
  43         title = self._html_search_regex(
  44             '<h2 class="entry-header entry-title.*?>(.*?)</h2>',
  45             webpage, 'title')
  46         date_str = self._search_regex(
  47             r'<p class="date-header date-int updated"\s+title="([^"]+)">',
  48             webpage, 'upload date', fatal=False)
  49         upload_date = (None if date_str is None else unified_strdate(date_str))
  50
  51         return {
  52             'id': video_id,
  53             'url': video_url,
  54             'title': title,
  55             'description': self._og_search_description(webpage),
  56             'thumbnail': thumbnail,
  57             'upload_date': upload_date,
  58         }