Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/pornhub.py
debian/changelog: Annotate the log with bugs to close.
[youtubedl] / youtube_dl / extractor / pornhub.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import functools
5 import itertools
6 import operator
7 # import os
8 import re
9
10 from .common import InfoExtractor
11 from ..compat import (
12 compat_HTTPError,
13 # compat_urllib_parse_unquote,
14 # compat_urllib_parse_unquote_plus,
15 # compat_urllib_parse_urlparse,
16 )
17 from ..utils import (
18 ExtractorError,
19 int_or_none,
20 js_to_json,
21 orderedSet,
22 # sanitized_Request,
23 remove_quotes,
24 str_to_int,
25 )
26 # from ..aes import (
27 # aes_decrypt_text
28 # )
29
30
class PornHubIE(InfoExtractor):
    """Extractor for a single PornHub (or Thumbzilla) video page.

    The regular ('pc') page is used for metadata and error detection; the
    'tv' platform page is used to recover the media URL, which is assembled
    from a run of JavaScript string assignments ending in ``mediastring``.
    """
    IE_DESC = 'PornHub and Thumbzilla'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                            (?:www\.)?thumbzilla\.com/video/
                        )
                        (?P<id>[\da-z]+)
                    '''
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '1e19b41231a02eba417839222ac9d58e',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
            'uploader': 'Babes',
            'duration': 361,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
    }, {
        # non-ASCII title
        'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
        'info_dict': {
            'id': '1331683002',
            'ext': 'mp4',
            'title': '重庆婷婷女王足交',
            'uploader': 'cj397186295',
            'duration': 1753,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }, {
        # removed at the request of cam4.com
        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
        'only_matching': True,
    }, {
        # removed at the request of the copyright owner
        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
        'only_matching': True,
    }, {
        # removed by uploader
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
        'only_matching': True,
    }, {
        # private video
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
        'only_matching': True,
    }, {
        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the PornHub embed URLs found in <iframe src=...> attributes."""
        return re.findall(
            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)',
            webpage)

    def _extract_count(self, pattern, webpage, name):
        """Search webpage for pattern and convert the match with str_to_int.

        The search is non-fatal, so a missing counter does not abort
        extraction.
        """
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    def _extract_media_url(self, tv_webpage, video_id):
        """Evaluate the JS assignments on the 'tv' page and return mediastring.

        The script is split on ';' and each ``name = expr`` statement is
        evaluated, where expr is a '+'-joined mix of quoted strings, block
        comments and previously assigned names.

        Raises ExtractorError if no ``mediastring`` assignment is found.
        """
        assignments = self._search_regex(
            r'(var.+?mediastring.+?)</script>', tv_webpage,
            'encoded url').split(';')

        js_vars = {}

        def parse_js_value(inp):
            # Drop /* ... */ comments interleaved with the string pieces.
            inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
            if '+' in inp:
                return functools.reduce(
                    operator.concat, map(parse_js_value, inp.split('+')))
            inp = inp.strip()
            if inp in js_vars:
                return js_vars[inp]
            return remove_quotes(inp)

        for assn in assignments:
            assn = assn.strip()
            # Skip empty fragments and statements that are not assignments
            # instead of failing on the tuple unpacking below.
            if not assn or '=' not in assn:
                continue
            assn = re.sub(r'var\s+', '', assn)
            vname, value = assn.split('=', 1)
            # Strip the name so that 'var x = ...' and 'var x=...' both
            # register under 'x' (parse_js_value looks names up stripped).
            js_vars[vname.strip()] = parse_js_value(value)

        if 'mediastring' not in js_vars:
            raise ExtractorError(
                'Unable to extract mediastring', video_id=video_id)
        return js_vars['mediastring']

    def _real_extract(self, url):
        """Build the info dict for the video identified by url.

        Raises ExtractorError (expected) when the page carries a
        removed/private notice.
        """
        video_id = self._match_id(url)

        self._set_cookie('pornhub.com', 'age_verified', '1')

        def dl_webpage(platform):
            # The 'platform' cookie selects which page flavour is served.
            self._set_cookie('pornhub.com', 'platform', platform)
            return self._download_webpage(
                'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
                video_id)

        webpage = dl_webpage('pc')

        error_msg = self._html_search_regex(
            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
            webpage, 'error message', default=None, group='error')
        if error_msg:
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        tv_webpage = dl_webpage('tv')

        video_url = self._extract_media_url(tv_webpage, video_id)

        title = self._search_regex(
            r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)

        # video_title from flashvars contains whitespace instead of non-ASCII (see
        # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
        # on that anymore.
        title = title or self._html_search_meta(
            'twitter:title', webpage, default=None) or self._search_regex(
            (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
             r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
             r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
            webpage, 'title', group='title')

        flashvars = self._parse_json(
            self._search_regex(
                r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
            video_id) or {}
        # Missing flashvars only costs us thumbnail and duration; the title
        # extracted above must be kept, since it does not come from flashvars.
        thumbnail = flashvars.get('image_url')
        duration = int_or_none(flashvars.get('video_duration'))

        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        page_params = self._parse_json(self._search_regex(
            r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
            webpage, 'page parameters', group='data', default='{}'),
            video_id, transform_source=js_to_json, fatal=False)
        tags = categories = None
        if page_params:
            tags = page_params.get('tags', '').split(',')
            categories = page_params.get('categories', '').split(',')

        return {
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'age_limit': 18,
            'tags': tags,
            'categories': categories,
        }
227
228
class PornHubPlaylistBaseIE(InfoExtractor):
    """Shared logic for PornHub pages that list several videos."""

    def _extract_entries(self, webpage):
        """Return url_result entries for the video links found in webpage.

        Links are de-duplicated with orderedSet; each entry is delegated to
        PornHubIE and carries the link's title attribute as video_title.
        """
        # Only process container div with main playlist content skipping
        # drop-down menu that uses similar pattern for videos (see
        # https://github.com/rg3/youtube-dl/issues/11594).
        container = self._search_regex(
            r'(?s)(<div[^>]+class=["\']container.+)', webpage,
            'container', default=webpage)

        return [
            self.url_result(
                'http://www.pornhub.com/%s' % video_url,
                PornHubIE.ie_key(), video_title=title)
            for video_url, title in orderedSet(re.findall(
                r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
                container))
        ]

    def _real_extract(self, url):
        """Download the listing page at url and return it as a playlist."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        entries = self._extract_entries(webpage)

        # The parse is requested as non-fatal, so a malformed playlist object
        # may come back as a falsy non-dict (None in youtube-dl's
        # InfoExtractor._parse_json). Fall back to an empty dict so the
        # .get() lookups below cannot raise AttributeError.
        playlist = self._parse_json(
            self._search_regex(
                r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
                'playlist', default='{}'),
            playlist_id, fatal=False) or {}
        title = playlist.get('title') or self._search_regex(
            r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)

        return self.playlist_result(
            entries, playlist_id, title, playlist.get('description'))
264
265
class PornHubPlaylistIE(PornHubPlaylistBaseIE):
    """Extractor for numeric /playlist/<id> pages on any pornhub.com host."""

    # Any subdomain (www, de, ...) is accepted; the id is the digit run.
    _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P<id>\d+)'

    _TESTS = [
        {
            'url': 'http://www.pornhub.com/playlist/4667351',
            'info_dict': {
                'id': '4667351',
                'title': 'Nataly Hot',
            },
            'playlist_mincount': 2,
        },
        {
            'url': 'https://de.pornhub.com/playlist/4667351',
            'only_matching': True,
        },
    ]
279
280
class PornHubUserVideosIE(PornHubPlaylistBaseIE):
    """Extractor for the paginated /users/<id>/videos and /channels/<id>/videos listings."""

    _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:user|channel)s/(?P<id>[^/]+)/videos'

    _TESTS = [
        {
            'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
            'info_dict': {
                'id': 'zoe_ph',
            },
            'playlist_mincount': 171,
        },
        {
            'url': 'http://www.pornhub.com/users/rushandlia/videos',
            'only_matching': True,
        },
        {
            # default sorting as Top Rated Videos
            'url': 'https://www.pornhub.com/channels/povd/videos',
            'info_dict': {
                'id': 'povd',
            },
            'playlist_mincount': 293,
        },
        {
            # Top Rated Videos
            'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
            'only_matching': True,
        },
        {
            # Most Recent Videos
            'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
            'only_matching': True,
        },
        {
            # Most Viewed Videos
            'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
            'only_matching': True,
        },
        {
            'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Walk the listing page by page and return all entries as a playlist.

        Paging starts at 1 and stops at the first page that answers with
        HTTP 404 or yields no entries; any other download error propagates.
        """
        user_id = self._match_id(url)

        collected = []
        page = 0
        while True:
            page += 1
            note = 'Downloading page %d' % page
            try:
                html = self._download_webpage(
                    url, user_id, note, query={'page': page})
            except ExtractorError as err:
                cause = err.cause
                # Only a 404 marks the end of the listing; re-raise the rest.
                if not isinstance(cause, compat_HTTPError) or cause.code != 404:
                    raise
                break

            found = self._extract_entries(html)
            if not found:
                break
            collected.extend(found)

        return self.playlist_result(collected, user_id)