Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/tnaflix.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     parse_duration,
   8     fix_xml_ampersands,
   9 )
  10
  11
  12 class TNAFlixIE(InfoExtractor):
  13     _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
  14
  15     _TITLE_REGEX = None
  16     _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
  17     _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
  18
  19     _TEST = {
  20         'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
  21         'md5': 'ecf3498417d09216374fc5907f9c6ec0',
  22         'info_dict': {
  23             'id': '553878',
  24             'display_id': 'Carmella-Decesare-striptease',
  25             'ext': 'mp4',
  26             'title': 'Carmella Decesare - striptease',
  27             'description': '',
  28             'thumbnail': 're:https?://.*\.jpg$',
  29             'duration': 91,
  30             'age_limit': 18,
  31         }
  32     }
  33
  34     def _real_extract(self, url):
  35         mobj = re.match(self._VALID_URL, url)
  36         video_id = mobj.group('id')
  37         display_id = mobj.group('display_id')
  38
  39         webpage = self._download_webpage(url, display_id)
  40
  41         title = self._html_search_regex(
  42             self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
  43         description = self._html_search_regex(
  44             self._DESCRIPTION_REGEX, webpage, 'description', fatal=False, default='')
  45
  46         age_limit = self._rta_search(webpage)
  47
  48         duration = self._html_search_meta('duration', webpage, 'duration', default=None)
  49         if duration:
  50             duration = parse_duration(duration[1:])
  51
  52         cfg_url = self._html_search_regex(
  53             self._CONFIG_REGEX, webpage, 'flashvars.config')
  54
  55         cfg_xml = self._download_xml(
  56             cfg_url, display_id, note='Downloading metadata',
  57             transform_source=fix_xml_ampersands)
  58
  59         thumbnail = cfg_xml.find('./startThumb').text
  60
  61         formats = []
  62         for item in cfg_xml.findall('./quality/item'):
  63             video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text)
  64             format_id = item.find('res').text
  65             fmt = {
  66                 'url': video_url,
  67                 'format_id': format_id,
  68             }
  69             m = re.search(r'^(\d+)', format_id)
  70             if m:
  71                 fmt['height'] = int(m.group(1))
  72             formats.append(fmt)
  73         self._sort_formats(formats)
  74
  75         return {
  76             'id': video_id,
  77             'display_id': display_id,
  78             'title': title,
  79             'description': description,
  80             'thumbnail': thumbnail,
  81             'duration': duration,
  82             'age_limit': age_limit,
  83             'formats': formats,
  84         }