]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/pornhub.py
1dcc8df00b4c2eb2da3b2719a35d874eb290c4ed
   2 from __future__ 
import unicode_literals
 
  10 from .common 
import InfoExtractor
 
  11 from ..compat 
import ( 
  13     # compat_urllib_parse_unquote, 
  14     # compat_urllib_parse_unquote_plus, 
  15     # compat_urllib_parse_urlparse, 
  31 class PornHubIE(InfoExtractor
): 
  32     IE_DESC 
= 'PornHub and Thumbzilla' 
  36                             (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| 
  37                             (?:www\.)?thumbzilla\.com/video/ 
  42         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 
  43         'md5': '1e19b41231a02eba417839222ac9d58e', 
  47             'title': 'Seductive Indian beauty strips down and fingers her pink pussy', 
  60         'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002', 
  65             'uploader': 'cj397186295', 
  76             'skip_download': True, 
  79         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 
  80         'only_matching': True, 
  82         # removed at the request of cam4.com 
  83         'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', 
  84         'only_matching': True, 
  86         # removed at the request of the copyright owner 
  87         'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', 
  88         'only_matching': True, 
  91         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', 
  92         'only_matching': True, 
  95         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7', 
  96         'only_matching': True, 
  98         'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', 
  99         'only_matching': True, 
 101         'url': 'http://www.pornhub.com/video/show?viewkey=648719015', 
 102         'only_matching': True, 
 106     def _extract_urls(webpage
): 
 108             r
'<iframe[^>]+?src=["\'](?P
<url
>(?
:https?
:)?
//(?
:www\
.)?pornhub\
.com
/embed
/[\da
-z
]+)', 
 111     def _extract_count(self, pattern, webpage, name): 
 112         return str_to_int(self._search_regex( 
 113             pattern, webpage, '%s count
' % name, fatal=False)) 
 115     def _real_extract(self, url): 
 116         video_id = self._match_id(url) 
 118         def dl_webpage(platform): 
 119             return self._download_webpage( 
 120                 'http
://www
.pornhub
.com
/view_video
.php?viewkey
=%s' % video_id, 
 122                     'Cookie
': 'age_verified
=1; platform
=%s' % platform, 
 125         webpage = dl_webpage('pc
') 
 127         error_msg = self._html_search_regex( 
 128             r'(?s
)<div
[^
>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>', 
 129             webpage, 'error message', default=None, group='error') 
 131             error_msg = re.sub(r'\s+', ' ', error_msg) 
 132             raise ExtractorError( 
 133                 'PornHub said: %s' % error_msg, 
 134                 expected=True, video_id=video_id) 
 136         tv_webpage = dl_webpage('tv') 
 138         assignments = self._search_regex( 
 139             r'(var.+?mediastring.+?)</script>', tv_webpage, 
 140             'encoded url').split(';') 
 144         def parse_js_value(inp): 
 145             inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) 
 147                 inps = inp.split('+') 
 148                 return functools.reduce( 
 149                     operator.concat, map(parse_js_value, inps)) 
 153             return remove_quotes(inp) 
 155         for assn in assignments: 
 159             assn = re.sub(r'var\s+', '', assn) 
 160             vname, value = assn.split('=', 1) 
 161             js_vars[vname] = parse_js_value(value) 
 163         video_url = js_vars['mediastring'] 
 165         title = self._search_regex( 
 166             r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None) 
 168         # video_title from flashvars contains whitespace instead of non-ASCII (see 
 169         # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying 
 171         title = title or self._html_search_meta( 
 172             'twitter:title', webpage, default=None) or self._search_regex( 
 173             (r'<h1[^>]+class=["\']title
["\'][^>]*>(?P<title>[^<]+)', 
 174              r'<div[^>]+data-video-title=(["\'])(?P
<title
>.+?
)\
1', 
 175              r'shareTitle\s
*=\s
*(["\'])(?P<title>.+?)\1'), 
 176             webpage, 'title', group='title') 
 178         flashvars = self._parse_json( 
 180                 r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), 
 183             thumbnail = flashvars.get('image_url') 
 184             duration = int_or_none(flashvars.get('video_duration')) 
 186             title, thumbnail, duration = [None] * 3 
 188         video_uploader = self._html_search_regex( 
 189             r'(?s)From: .+?<(?:a href="/users
/|a href
="/channels/|span class="username
)[^
>]+>(.+?
)<', 
 190             webpage, 'uploader
', fatal=False) 
 192         view_count = self._extract_count( 
 193             r'<span 
class="count">([\d
,\
.]+)</span
> views
', webpage, 'view
') 
 194         like_count = self._extract_count( 
 195             r'<span 
class="votesUp">([\d
,\
.]+)</span
>', webpage, 'like
') 
 196         dislike_count = self._extract_count( 
 197             r'<span 
class="votesDown">([\d
,\
.]+)</span
>', webpage, 'dislike
') 
 198         comment_count = self._extract_count( 
 199             r'All Comments\s
*<span
>\
(([\d
,.]+)\
)', webpage, 'comment
') 
 201         page_params = self._parse_json(self._search_regex( 
 202             r'page_params\
.zoneDetails\
[([\'"])[^\'"]+\
1\
]\s
*=\s
*(?P
<data
>{[^
}]+})', 
 203             webpage, 'page parameters
', group='data
', default='{}'), 
 204             video_id, transform_source=js_to_json, fatal=False) 
 205         tags = categories = None 
 207             tags = page_params.get('tags
', '').split(',') 
 208             categories = page_params.get('categories
', '').split(',') 
 213             'uploader
': video_uploader, 
 215             'thumbnail
': thumbnail, 
 216             'duration
': duration, 
 217             'view_count
': view_count, 
 218             'like_count
': like_count, 
 219             'dislike_count
': dislike_count, 
 220             'comment_count
': comment_count, 
 221             # 'formats
': formats, 
 224             'categories
': categories, 
 228 class PornHubPlaylistBaseIE(InfoExtractor): 
 229     def _extract_entries(self, webpage): 
 232                 'http
://www
.pornhub
.com
/%s' % video_url, 
 233                 PornHubIE.ie_key(), video_title=title) 
 234             for video_url, title in orderedSet(re.findall( 
 235                 r'href
="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^
"]+)"', 
 239     def _real_extract(self, url): 
 240         playlist_id = self._match_id(url) 
 242         webpage = self._download_webpage(url, playlist_id) 
 244         # Only process container div with main playlist content skipping 
 245         # drop-down menu that uses similar pattern for videos (see 
 246         # https://github.com/rg3/youtube-dl/issues/11594). 
 247         container = self._search_regex( 
 248             r'(?s
)(<div
[^
>]+class=["\']container.+)', webpage, 
 249             'container', default=webpage) 
 251         entries = self._extract_entries(container) 
 253         playlist = self._parse_json( 
 255                 r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'), 
 258         return self.playlist_result( 
 259             entries, playlist_id, playlist.get('title'), playlist.get('description')) 
 262 class PornHubPlaylistIE(PornHubPlaylistBaseIE): 
 263     _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)' 
 265         'url': 'http://www.pornhub.com/playlist/4667351', 
 268             'title': 'Nataly Hot', 
 270         'playlist_mincount': 2, 
 274 class PornHubUserVideosIE(PornHubPlaylistBaseIE): 
 275     _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos' 
 277         'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 
 281         'playlist_mincount': 171, 
 283         'url': 'http://www.pornhub.com/users/rushandlia/videos', 
 284         'only_matching': True, 
 287     def _real_extract(self, url): 
 288         user_id = self._match_id(url) 
 291         for page_num in itertools.count(1): 
 293                 webpage = self._download_webpage( 
 294                     url, user_id, 'Downloading page %d' % page_num, 
 295                     query={'page': page_num}) 
 296             except ExtractorError as e: 
 297                 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: 
 299             page_entries = self._extract_entries(webpage) 
 302             entries.extend(page_entries) 
 304         return self.playlist_result(entries, user_id)