]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/porncom.py
   1 from __future__ 
import unicode_literals
 
   5 from .common 
import InfoExtractor
 
   6 from ..compat 
import compat_urlparse
 
  15 class PornComIE(InfoExtractor
): 
  16     _VALID_URL 
= r
'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' 
  18         'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339', 
  19         'md5': '3f30ce76267533cd12ba999263156de7', 
  22             'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec', 
  24             'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec', 
  25             'thumbnail': 're:^https?://.*\.jpg$', 
  33         'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', 
  34         'only_matching': True, 
  37     def _real_extract(self
, url
): 
  38         mobj 
= re
.match(self
._VALID
_URL
, url
) 
  39         video_id 
= mobj
.group('id') 
  40         display_id 
= mobj
.group('display_id') or video_id
 
  42         webpage 
= self
._download
_webpage
(url
, display_id
) 
  44         config 
= self
._parse
_json
( 
  46                 r
'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=', 
  47                 webpage
, 'config', default
='{}'), 
  48             display_id
, transform_source
=js_to_json
, fatal
=False) 
  51             title 
= config
['title'] 
  54                 'format_id': stream
.get('id'), 
  55                 'height': int_or_none(self
._search
_regex
( 
  56                     r
'^(\d+)[pP]', stream
.get('id') or '', 'height', default
=None)) 
  57             } for stream 
in config
['streams'] if stream
.get('url')] 
  58             thumbnail 
= (compat_urlparse
.urljoin( 
  59                 config
['thumbCDN'], config
['poster']) 
  60                 if config
.get('thumbCDN') and config
.get('poster') else None) 
  61             duration 
= int_or_none(config
.get('length')) 
  63             title 
= self
._search
_regex
( 
  64                 (r
'<title>([^<]+)</title>', r
'<h1[^>]*>([^<]+)</h1>'), 
  67                 'url': compat_urlparse
.urljoin(url
, format_url
), 
  68                 'format_id': '%sp' % height
, 
  69                 'height': int(height
), 
  70                 'filesize_approx': parse_filesize(filesize
), 
  71             } for format_url
, height
, filesize 
in re
.findall( 
  72                 r
'<a[^>]+href="(/download/[^"]+)">MPEG4 (\d+)p<span[^>]*>(\d+\s+[a-zA-Z]+)<', 
  77         self
._sort
_formats
(formats
) 
  79         view_count 
= str_to_int(self
._search
_regex
( 
  80             r
'class=["\']views
["\'][^>]*><p>([\d,.]+)', webpage, 
  81             'view count', fatal=False)) 
  def extract_list(kind):
      """Collect the link texts found under the "<Kind>:" paragraph.

      Searches ``webpage`` for a ``<p>`` element whose content begins
      with the capitalized *kind* followed by a colon, then returns the
      text of every ``<a ...>`` element inside that paragraph.  An empty
      list is returned when the search yields nothing.
      """
      heading = kind.capitalize()
      # Non-fatal lookup: a page without this paragraph is tolerated.
      section_html = self._search_regex(
          r'(?s)<p[^>]*>%s:(.+?)</p>' % heading,
          webpage, kind, fatal=False)
      if not section_html:
          return []
      return re.findall(r'<a[^>]+>([^<]+)</a>', section_html)
  91             'display_id': display_id, 
  93             'thumbnail': thumbnail, 
  95             'view_count': view_count, 
  98             'categories': extract_list('categories'), 
  99             'tags': extract_list('tags'),