]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/redtube.py
721fc3a9e2d2b3431051ea00982f72ae1d98ff65
   1 from __future__ 
import unicode_literals
 
   3 from .common 
import InfoExtractor
 
  12 class RedTubeIE(InfoExtractor
): 
  13     _VALID_URL 
= r
'https?://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' 
  15         'url': 'http://www.redtube.com/66418', 
  16         'md5': '7b8c22b5e7098a3e1c09709df1126d2d', 
  20             'title': 'Sucked on a toilet', 
  21             'upload_date': '20120831', 
  28     def _real_extract(self
, url
): 
  29         video_id 
= self
._match
_id
(url
) 
  30         webpage 
= self
._download
_webpage
(url
, video_id
) 
  32         if any(s 
in webpage 
for s 
in ['video-deleted-info', '>This video has been removed']): 
  33             raise ExtractorError('Video %s has been removed' % video_id
, expected
=True) 
  35         title 
= self
._html
_search
_regex
( 
  36             (r
'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>', 
  37              r
'videoTitle\s*:\s*(["\'])(?P
<title
>)\
1'), 
  38             webpage, 'title
', group='title
') 
  41         sources = self._parse_json( 
  43                 r'sources\s
*:\s
*({.+?
})', webpage, 'source
', default='{}'), 
  44             video_id, fatal=False) 
  45         if sources and isinstance(sources, dict): 
  46             for format_id, format_url in sources.items(): 
  50                         'format_id
': format_id, 
  51                         'height
': int_or_none(format_id), 
  54             video_url = self._html_search_regex( 
  55                 r'<source src
="(.+?)" type="video/mp4">', webpage, 'video URL
') 
  56             formats.append({'url
': video_url}) 
  57         self._sort_formats(formats) 
  59         thumbnail = self._og_search_thumbnail(webpage) 
  60         upload_date = unified_strdate(self._search_regex( 
  61             r'<span
[^
>]+class="added-time"[^
>]*>ADDED ([^
<]+)<', 
  62             webpage, 'upload date
', fatal=False)) 
  63         duration = int_or_none(self._search_regex( 
  64             r'videoDuration\s
*:\s
*(\d
+)', webpage, 'duration
', fatal=False)) 
  65         view_count = str_to_int(self._search_regex( 
  66             r'<span
[^
>]*>VIEWS
</span
></td
>\s
*<td
>([\d
,.]+)', 
  67             webpage, 'view count
', fatal=False)) 
  69         # No self-labeling, but they describe themselves as 
  70         # "Home of Videos Porno" 
  77             'thumbnail
': thumbnail, 
  78             'upload_date
': upload_date, 
  80             'view_count
': view_count, 
  81             'age_limit
': age_limit,