Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/minhateca.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5 from ..compat import compat_urllib_parse
   6 from ..utils import (
   7     int_or_none,
   8     parse_duration,
   9     parse_filesize,
  10     sanitized_Request,
  11 )
  12
  13
  14 class MinhatecaIE(InfoExtractor):
  15     _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P<id>[0-9]+)\.'
  16     _TEST = {
  17         'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)',
  18         'info_dict': {
  19             'id': '125848331',
  20             'ext': 'mp4',
  21             'title': 'youtube-dl test video',
  22             'thumbnail': 're:^https?://.*\.jpg$',
  23             'filesize_approx': 1530000,
  24             'duration': 9,
  25             'view_count': int,
  26         }
  27     }
  28
  29     def _real_extract(self, url):
  30         video_id = self._match_id(url)
  31         webpage = self._download_webpage(url, video_id)
  32
  33         token = self._html_search_regex(
  34             r'<input name="__RequestVerificationToken".*?value="([^"]+)"',
  35             webpage, 'request token')
  36         token_data = [
  37             ('fileId', video_id),
  38             ('__RequestVerificationToken', token),
  39         ]
  40         req = sanitized_Request(
  41             'http://minhateca.com.br/action/License/Download',
  42             data=compat_urllib_parse.urlencode(token_data))
  43         req.add_header('Content-Type', 'application/x-www-form-urlencoded')
  44         data = self._download_json(
  45             req, video_id, note='Downloading metadata')
  46
  47         video_url = data['redirectUrl']
  48         title_str = self._html_search_regex(
  49             r'<h1.*?>(.*?)</h1>', webpage, 'title')
  50         title, _, ext = title_str.rpartition('.')
  51         filesize_approx = parse_filesize(self._html_search_regex(
  52             r'<p class="fileSize">(.*?)</p>',
  53             webpage, 'file size approximation', fatal=False))
  54         duration = parse_duration(self._html_search_regex(
  55             r'(?s)<p class="fileLeng[ht][th]">.*?class="bold">(.*?)<',
  56             webpage, 'duration', fatal=False))
  57         view_count = int_or_none(self._html_search_regex(
  58             r'<p class="downloadsCounter">([0-9]+)</p>',
  59             webpage, 'view count', fatal=False))
  60
  61         return {
  62             'id': video_id,
  63             'url': video_url,
  64             'title': title,
  65             'ext': ext,
  66             'filesize_approx': filesize_approx,
  67             'duration': duration,
  68             'view_count': view_count,
  69             'thumbnail': self._og_search_thumbnail(webpage),
  70         }