2 from __future__ 
import unicode_literals
 
   8 from .common 
import InfoExtractor
 
  11     compat_urllib_parse_unquote
, 
  12     compat_urllib_parse_unquote_plus
, 
  13     compat_urllib_parse_urlparse
, 
class PornHubIE(InfoExtractor):
    """Extractor for single PornHub video pages (also matches Thumbzilla URLs)."""
    # NOTE(review): the class-level constants below are truncated in this view.
    # The surrounding r'''...''' literal of _VALID_URL and the list/dict
    # structure of _TESTS are missing; only fragments are visible. Do not
    # assume anything beyond what is shown.
    IE_DESC = 'PornHub and Thumbzilla'
                            (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
                            (?:www\.)?thumbzilla\.com/video/
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '1e19b41231a02eba417839222ac9d58e',
            'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
        'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
            'uploader': 'cj397186295',
            'skip_download': True,
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
        # removed at the request of cam4.com
        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
        'only_matching': True,
        # removed at the request of the copyright owner
        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
        'only_matching': True,
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
        'only_matching': True,
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
        'only_matching': True,
        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
        'only_matching': True,
    def _extract_urls(webpage):
        # Collect PornHub embed URLs from an arbitrary page: matches iframe
        # src values of the form (https?:)//www.pornhub.com/embed/<id>
        # (protocol-relative accepted), capturing the URL in group 'url'.
        # NOTE(review): truncated in this view -- any decorator and the
        # re.findall(...) call wrapping this pattern, plus the trailing
        # `webpage)` argument, are not visible; only the pattern survives.
            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)',
 105     def _extract_count(self, pattern, webpage, name): 
 106         return str_to_int(self._search_regex( 
 107             pattern, webpage, '%s count
' % name, fatal=False)) 
    def _real_extract(self, url):
        """Fetch a PornHub video page and extract metadata and media formats.

        NOTE(review): this method is truncated in this view -- several source
        lines (if-guards, the formats-list assembly, the opening of the
        returned info dict) are missing. Comments describe only what is
        visible; treat structural gaps as marked below.
        """
        video_id = self._match_id(url)

        # Request the canonical watch page; the age_verified cookie bypasses
        # the age-confirmation interstitial.
        req = sanitized_Request(
            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Surface site-side removal/moderation messages as an expected error
        # instead of failing later with an opaque parse error.
        error_msg = self._html_search_regex(
            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
            webpage, 'error message', default=None, group='error')
            # NOTE(review): the guarding "if error_msg:" line is missing here.
            error_msg = re.sub(r'\s+', ' ', error_msg)  # collapse whitespace runs
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        # video_title from flashvars contains whitespace instead of non-ASCII (see
        # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
        # on it; prefer the twitter:title meta tag, then page markup fallbacks.
        title = self._html_search_meta(
            'twitter:title', webpage, default=None) or self._search_regex(
            (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
             r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
             r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
            webpage, 'title', group='title')

        # Player configuration: thumbnail/duration come from the flashvars_<id>
        # JSON object when present (default '{}' makes the parse non-fatal).
        flashvars = self._parse_json(
            # NOTE(review): the self._search_regex( wrapper line is missing.
                r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
            thumbnail = flashvars.get('image_url')
            duration = int_or_none(flashvars.get('video_duration'))
            # Fallback branch when flashvars are absent (guard line missing).
            title, thumbnail, duration = [None] * 3

        # Uploader name from the "From:" line; matches user links, channel
        # links, or a bare username span. Non-fatal: None when absent.
        video_uploader = self._html_search_regex(
            r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)

        # Scrape the various counters from page markup; each is None if the
        # pattern does not match (see _extract_count).
        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        # Direct media URLs are assigned to player_quality_XXXp JS variables;
        # they are percent-encoded, hence the unquote.
        video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage)))
        if webpage.find('"encrypted":true') != -1:
            # Encrypted pages: media URLs are AES-decrypted with the
            # (plus-unquoted) video_title value as the 256-bit key.
            password = compat_urllib_parse_unquote_plus(
                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
            video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))

        for video_url in video_urls:
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Path segment 5 looks like "<height>P_<tbr>K_..."; keep the first
            # two underscore-separated pieces and join them, e.g. "720P-1000K".
            format = path.split('/')[5].split('_')[:2]
            format = '-'.join(format)

            m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format)
                # NOTE(review): the "if m:" guard line is missing here.
                height = int(m.group('height'))
                tbr = int(m.group('tbr'))
        # NOTE(review): the formats-list construction between the loop above
        # and this sort call is missing from this view.
        self._sort_formats(formats)

        # Tags/categories live in a page_params.zoneDetails[...] JS object;
        # js_to_json normalizes it before parsing, non-fatal with '{}' default.
        page_params = self._parse_json(self._search_regex(
            r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
            webpage, 'page parameters', group='data', default='{}'),
            video_id, transform_source=js_to_json, fatal=False)
        tags = categories = None
            # NOTE(review): the "if page_params:" guard line is missing here.
            tags = page_params.get('tags', '').split(',')
            categories = page_params.get('categories', '').split(',')

        # NOTE(review): the opening of the returned info dict ("return {",
        # 'id', 'title', 'formats', ... entries) is missing from this view.
            'uploader': video_uploader,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'categories': categories,
class PornHubPlaylistBaseIE(InfoExtractor):
    """Shared base for the playlist and user-videos extractors below."""

    def _extract_entries(self, webpage):
        # Build url_result entries for every view_video.php link carrying a
        # title attribute, de-duplicated (orderedSet) in page order.
        # NOTE(review): truncated in this view -- the "return [" opening with
        # the self.url_result( call, and the closing "webpage))]" of the
        # findall, are not visible.
                'http://www.pornhub.com/%s' % video_url,
                PornHubIE.ie_key(), video_title=title)
            for video_url, title in orderedSet(re.findall(
                r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        entries = self._extract_entries(webpage)

        # Playlist title/description come from the page's playlistObject JSON.
        # NOTE(review): the _search_regex wrapper and the trailing _parse_json
        # arguments are missing from this view.
        playlist = self._parse_json(
                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),

        return self.playlist_result(
            entries, playlist_id, playlist.get('title'), playlist.get('description'))
class PornHubPlaylistIE(PornHubPlaylistBaseIE):
    """Extractor for numeric pornhub.com/playlist/<id> pages."""
    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    # NOTE(review): the _TEST dict structure is truncated in this view; only
    # fragments of its entries are visible.
        'url': 'http://www.pornhub.com/playlist/6201671',
        'playlist_mincount': 35,
class PornHubUserVideosIE(PornHubPlaylistBaseIE):
    """Extractor for a user's uploaded-videos listing (paginated)."""
    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos'
    # NOTE(review): the _TESTS list structure is truncated in this view; only
    # fragments of its entries are visible.
        'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
        'playlist_mincount': 171,
        'url': 'http://www.pornhub.com/users/rushandlia/videos',
        'only_matching': True,

    def _real_extract(self, url):
        user_id = self._match_id(url)

        # Walk ?page=1,2,3,... collecting entries from each page.
        # NOTE(review): truncated in this view -- "entries = []", the "try:"
        # opening, the 404 break/re-raise handling, and the empty-page break
        # are not visible.
        for page_num in itertools.count(1):
                webpage = self._download_webpage(
                    url, user_id, 'Downloading page %d' % page_num,
                    query={'page': page_num})
            except ExtractorError as e:
                # A 404 on page N is how the site signals the end of the list.
                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
            page_entries = self._extract_entries(webpage)
            entries.extend(page_entries)

        return self.playlist_result(entries, user_id)