Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/pornhub.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import functools
   5 import itertools
   6 import operator
   7 import re
   8
   9 from .common import InfoExtractor
  10 from ..compat import (
  11     compat_HTTPError,
  12     compat_str,
  13     compat_urllib_request,
  14 )
  15 from .openload import PhantomJSwrapper
  16 from ..utils import (
  17     determine_ext,
  18     ExtractorError,
  19     int_or_none,
  20     NO_DEFAULT,
  21     orderedSet,
  22     remove_quotes,
  23     str_to_int,
  24     url_or_none,
  25 )
  26
  27
  28 class PornHubBaseIE(InfoExtractor):
  29     def _download_webpage_handle(self, *args, **kwargs):
  30         def dl(*args, **kwargs):
  31             return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
  32
  33         webpage, urlh = dl(*args, **kwargs)
  34
  35         if any(re.search(p, webpage) for p in (
  36                 r'<body\b[^>]+\bonload=["\']go\(\)',
  37                 r'document\.cookie\s*=\s*["\']RNKEY=',
  38                 r'document\.location\.reload\(true\)')):
  39             url_or_request = args[0]
  40             url = (url_or_request.get_full_url()
  41                    if isinstance(url_or_request, compat_urllib_request.Request)
  42                    else url_or_request)
  43             phantom = PhantomJSwrapper(self, required_version='2.0')
  44             phantom.get(url, html=webpage)
  45             webpage, urlh = dl(*args, **kwargs)
  46
  47         return webpage, urlh
  48
  49
  50 class PornHubIE(PornHubBaseIE):
  51     IE_DESC = 'PornHub and Thumbzilla'
  52     _VALID_URL = r'''(?x)
  53                     https?://
  54                         (?:
  55                             (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
  56                             (?:www\.)?thumbzilla\.com/video/
  57                         )
  58                         (?P<id>[\da-z]+)
  59                     '''
  60     _TESTS = [{
  61         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
  62         'md5': '1e19b41231a02eba417839222ac9d58e',
  63         'info_dict': {
  64             'id': '648719015',
  65             'ext': 'mp4',
  66             'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
  67             'uploader': 'Babes',
  68             'upload_date': '20130628',
  69             'duration': 361,
  70             'view_count': int,
  71             'like_count': int,
  72             'dislike_count': int,
  73             'comment_count': int,
  74             'age_limit': 18,
  75             'tags': list,
  76             'categories': list,
  77         },
  78     }, {
  79         # non-ASCII title
  80         'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
  81         'info_dict': {
  82             'id': '1331683002',
  83             'ext': 'mp4',
  84             'title': '重庆婷婷女王足交',
  85             'uploader': 'Unknown',
  86             'upload_date': '20150213',
  87             'duration': 1753,
  88             'view_count': int,
  89             'like_count': int,
  90             'dislike_count': int,
  91             'comment_count': int,
  92             'age_limit': 18,
  93             'tags': list,
  94             'categories': list,
  95         },
  96         'params': {
  97             'skip_download': True,
  98         },
  99     }, {
 100         # subtitles
 101         'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
 102         'info_dict': {
 103             'id': 'ph5af5fef7c2aa7',
 104             'ext': 'mp4',
 105             'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
 106             'uploader': 'BFFs',
 107             'duration': 622,
 108             'view_count': int,
 109             'like_count': int,
 110             'dislike_count': int,
 111             'comment_count': int,
 112             'age_limit': 18,
 113             'tags': list,
 114             'categories': list,
 115             'subtitles': {
 116                 'en': [{
 117                     "ext": 'srt'
 118                 }]
 119             },
 120         },
 121         'params': {
 122             'skip_download': True,
 123         },
 124     }, {
 125         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
 126         'only_matching': True,
 127     }, {
 128         # removed at the request of cam4.com
 129         'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
 130         'only_matching': True,
 131     }, {
 132         # removed at the request of the copyright owner
 133         'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
 134         'only_matching': True,
 135     }, {
 136         # removed by uploader
 137         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
 138         'only_matching': True,
 139     }, {
 140         # private video
 141         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
 142         'only_matching': True,
 143     }, {
 144         'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
 145         'only_matching': True,
 146     }, {
 147         'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
 148         'only_matching': True,
 149     }, {
 150         'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
 151         'only_matching': True,
 152     }]
 153
 154     @staticmethod
 155     def _extract_urls(webpage):
 156         return re.findall(
 157             r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)',
 158             webpage)
 159
 160     def _extract_count(self, pattern, webpage, name):
 161         return str_to_int(self._search_regex(
 162             pattern, webpage, '%s count' % name, fatal=False))
 163
 164     def _real_extract(self, url):
 165         mobj = re.match(self._VALID_URL, url)
 166         host = mobj.group('host') or 'pornhub.com'
 167         video_id = mobj.group('id')
 168
 169         self._set_cookie(host, 'age_verified', '1')
 170
 171         def dl_webpage(platform):
 172             self._set_cookie(host, 'platform', platform)
 173             return self._download_webpage(
 174                 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id),
 175                 video_id, 'Downloading %s webpage' % platform)
 176
 177         webpage = dl_webpage('pc')
 178
 179         error_msg = self._html_search_regex(
 180             r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
 181             webpage, 'error message', default=None, group='error')
 182         if error_msg:
 183             error_msg = re.sub(r'\s+', ' ', error_msg)
 184             raise ExtractorError(
 185                 'PornHub said: %s' % error_msg,
 186                 expected=True, video_id=video_id)
 187
 188         # video_title from flashvars contains whitespace instead of non-ASCII (see
 189         # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
 190         # on that anymore.
 191         title = self._html_search_meta(
 192             'twitter:title', webpage, default=None) or self._search_regex(
 193             (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
 194              r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
 195              r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
 196             webpage, 'title', group='title')
 197
 198         video_urls = []
 199         video_urls_set = set()
 200         subtitles = {}
 201
 202         flashvars = self._parse_json(
 203             self._search_regex(
 204                 r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
 205             video_id)
 206         if flashvars:
 207             subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
 208             if subtitle_url:
 209                 subtitles.setdefault('en', []).append({
 210                     'url': subtitle_url,
 211                     'ext': 'srt',
 212                 })
 213             thumbnail = flashvars.get('image_url')
 214             duration = int_or_none(flashvars.get('video_duration'))
 215             media_definitions = flashvars.get('mediaDefinitions')
 216             if isinstance(media_definitions, list):
 217                 for definition in media_definitions:
 218                     if not isinstance(definition, dict):
 219                         continue
 220                     video_url = definition.get('videoUrl')
 221                     if not video_url or not isinstance(video_url, compat_str):
 222                         continue
 223                     if video_url in video_urls_set:
 224                         continue
 225                     video_urls_set.add(video_url)
 226                     video_urls.append(
 227                         (video_url, int_or_none(definition.get('quality'))))
 228         else:
 229             thumbnail, duration = [None] * 2
 230
 231         def extract_js_vars(webpage, pattern, default=NO_DEFAULT):
 232             assignments = self._search_regex(
 233                 pattern, webpage, 'encoded url', default=default)
 234             if not assignments:
 235                 return {}
 236
 237             assignments = assignments.split(';')
 238
 239             js_vars = {}
 240
 241             def parse_js_value(inp):
 242                 inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
 243                 if '+' in inp:
 244                     inps = inp.split('+')
 245                     return functools.reduce(
 246                         operator.concat, map(parse_js_value, inps))
 247                 inp = inp.strip()
 248                 if inp in js_vars:
 249                     return js_vars[inp]
 250                 return remove_quotes(inp)
 251
 252             for assn in assignments:
 253                 assn = assn.strip()
 254                 if not assn:
 255                     continue
 256                 assn = re.sub(r'var\s+', '', assn)
 257                 vname, value = assn.split('=', 1)
 258                 js_vars[vname] = parse_js_value(value)
 259             return js_vars
 260
 261         def add_video_url(video_url):
 262             v_url = url_or_none(video_url)
 263             if not v_url:
 264                 return
 265             if v_url in video_urls_set:
 266                 return
 267             video_urls.append((v_url, None))
 268             video_urls_set.add(v_url)
 269
 270         if not video_urls:
 271             FORMAT_PREFIXES = ('media', 'quality')
 272             js_vars = extract_js_vars(
 273                 webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
 274                 default=None)
 275             if js_vars:
 276                 for key, format_url in js_vars.items():
 277                     if any(key.startswith(p) for p in FORMAT_PREFIXES):
 278                         add_video_url(format_url)
 279             if not video_urls and re.search(
 280                     r'<[^>]+\bid=["\']lockedPlayer', webpage):
 281                 raise ExtractorError(
 282                     'Video %s is locked' % video_id, expected=True)
 283
 284         if not video_urls:
 285             js_vars = extract_js_vars(
 286                 dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
 287             add_video_url(js_vars['mediastring'])
 288
 289         for mobj in re.finditer(
 290                 r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
 291                 webpage):
 292             video_url = mobj.group('url')
 293             if video_url not in video_urls_set:
 294                 video_urls.append((video_url, None))
 295                 video_urls_set.add(video_url)
 296
 297         upload_date = None
 298         formats = []
 299         for video_url, height in video_urls:
 300             if not upload_date:
 301                 upload_date = self._search_regex(
 302                     r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
 303                 if upload_date:
 304                     upload_date = upload_date.replace('/', '')
 305             ext = determine_ext(video_url)
 306             if ext == 'mpd':
 307                 formats.extend(self._extract_mpd_formats(
 308                     video_url, video_id, mpd_id='dash', fatal=False))
 309                 continue
 310             elif ext == 'm3u8':
 311                 formats.extend(self._extract_m3u8_formats(
 312                     video_url, video_id, 'mp4', entry_protocol='m3u8_native',
 313                     m3u8_id='hls', fatal=False))
 314                 continue
 315             tbr = None
 316             mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
 317             if mobj:
 318                 if not height:
 319                     height = int(mobj.group('height'))
 320                 tbr = int(mobj.group('tbr'))
 321             formats.append({
 322                 'url': video_url,
 323                 'format_id': '%dp' % height if height else None,
 324                 'height': height,
 325                 'tbr': tbr,
 326             })
 327         self._sort_formats(formats)
 328
 329         video_uploader = self._html_search_regex(
 330             r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
 331             webpage, 'uploader', fatal=False)
 332
 333         view_count = self._extract_count(
 334             r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
 335         like_count = self._extract_count(
 336             r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
 337         dislike_count = self._extract_count(
 338             r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
 339         comment_count = self._extract_count(
 340             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 341
 342         def extract_list(meta_key):
 343             div = self._search_regex(
 344                 r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
 345                 % meta_key, webpage, meta_key, default=None)
 346             if div:
 347                 return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div)
 348
 349         return {
 350             'id': video_id,
 351             'uploader': video_uploader,
 352             'upload_date': upload_date,
 353             'title': title,
 354             'thumbnail': thumbnail,
 355             'duration': duration,
 356             'view_count': view_count,
 357             'like_count': like_count,
 358             'dislike_count': dislike_count,
 359             'comment_count': comment_count,
 360             'formats': formats,
 361             'age_limit': 18,
 362             'tags': extract_list('tags'),
 363             'categories': extract_list('categories'),
 364             'subtitles': subtitles,
 365         }
 366
 367
 368 class PornHubPlaylistBaseIE(PornHubBaseIE):
 369     def _extract_entries(self, webpage, host):
 370         # Only process container div with main playlist content skipping
 371         # drop-down menu that uses similar pattern for videos (see
 372         # https://github.com/ytdl-org/youtube-dl/issues/11594).
 373         container = self._search_regex(
 374             r'(?s)(<div[^>]+class=["\']container.+)', webpage,
 375             'container', default=webpage)
 376
 377         return [
 378             self.url_result(
 379                 'http://www.%s/%s' % (host, video_url),
 380                 PornHubIE.ie_key(), video_title=title)
 381             for video_url, title in orderedSet(re.findall(
 382                 r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
 383                 container))
 384         ]
 385
 386     def _real_extract(self, url):
 387         mobj = re.match(self._VALID_URL, url)
 388         host = mobj.group('host')
 389         playlist_id = mobj.group('id')
 390
 391         webpage = self._download_webpage(url, playlist_id)
 392
 393         entries = self._extract_entries(webpage, host)
 394
 395         playlist = self._parse_json(
 396             self._search_regex(
 397                 r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
 398                 'playlist', default='{}'),
 399             playlist_id, fatal=False)
 400         title = playlist.get('title') or self._search_regex(
 401             r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)
 402
 403         return self.playlist_result(
 404             entries, playlist_id, title, playlist.get('description'))
 405
 406
 407 class PornHubUserIE(PornHubPlaylistBaseIE):
 408     _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
 409     _TESTS = [{
 410         'url': 'https://www.pornhub.com/model/zoe_ph',
 411         'playlist_mincount': 118,
 412     }, {
 413         'url': 'https://www.pornhub.com/pornstar/liz-vicious',
 414         'info_dict': {
 415             'id': 'liz-vicious',
 416         },
 417         'playlist_mincount': 118,
 418     }, {
 419         'url': 'https://www.pornhub.com/users/russianveet69',
 420         'only_matching': True,
 421     }, {
 422         'url': 'https://www.pornhub.com/channels/povd',
 423         'only_matching': True,
 424     }, {
 425         'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
 426         'only_matching': True,
 427     }]
 428
 429     def _real_extract(self, url):
 430         mobj = re.match(self._VALID_URL, url)
 431         user_id = mobj.group('id')
 432         return self.url_result(
 433             '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(),
 434             video_id=user_id)
 435
 436
 437 class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
 438     @staticmethod
 439     def _has_more(webpage):
 440         return re.search(
 441             r'''(?x)
 442                 <li[^>]+\bclass=["\']page_next|
 443                 <link[^>]+\brel=["\']next|
 444                 <button[^>]+\bid=["\']moreDataBtn
 445             ''', webpage) is not None
 446
 447     def _real_extract(self, url):
 448         mobj = re.match(self._VALID_URL, url)
 449         host = mobj.group('host')
 450         item_id = mobj.group('id')
 451
 452         page = int_or_none(self._search_regex(
 453             r'\bpage=(\d+)', url, 'page', default=None))
 454
 455         entries = []
 456         for page_num in (page, ) if page is not None else itertools.count(1):
 457             try:
 458                 webpage = self._download_webpage(
 459                     url, item_id, 'Downloading page %d' % page_num,
 460                     query={'page': page_num})
 461             except ExtractorError as e:
 462                 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
 463                     break
 464                 raise
 465             page_entries = self._extract_entries(webpage, host)
 466             if not page_entries:
 467                 break
 468             entries.extend(page_entries)
 469             if not self._has_more(webpage):
 470                 break
 471
 472         return self.playlist_result(orderedSet(entries), item_id)
 473
 474
 475 class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
 476     _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
 477     _TESTS = [{
 478         'url': 'https://www.pornhub.com/model/zoe_ph/videos',
 479         'only_matching': True,
 480     }, {
 481         'url': 'http://www.pornhub.com/users/rushandlia/videos',
 482         'only_matching': True,
 483     }, {
 484         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos',
 485         'info_dict': {
 486             'id': 'pornstar/jenny-blighe/videos',
 487         },
 488         'playlist_mincount': 149,
 489     }, {
 490         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3',
 491         'info_dict': {
 492             'id': 'pornstar/jenny-blighe/videos',
 493         },
 494         'playlist_mincount': 40,
 495     }, {
 496         # default sorting as Top Rated Videos
 497         'url': 'https://www.pornhub.com/channels/povd/videos',
 498         'info_dict': {
 499             'id': 'channels/povd/videos',
 500         },
 501         'playlist_mincount': 293,
 502     }, {
 503         # Top Rated Videos
 504         'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
 505         'only_matching': True,
 506     }, {
 507         # Most Recent Videos
 508         'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
 509         'only_matching': True,
 510     }, {
 511         # Most Viewed Videos
 512         'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
 513         'only_matching': True,
 514     }, {
 515         'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
 516         'only_matching': True,
 517     }, {
 518         # Most Viewed Videos
 519         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv',
 520         'only_matching': True,
 521     }, {
 522         # Top Rated Videos
 523         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr',
 524         'only_matching': True,
 525     }, {
 526         # Longest Videos
 527         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg',
 528         'only_matching': True,
 529     }, {
 530         # Newest Videos
 531         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm',
 532         'only_matching': True,
 533     }, {
 534         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid',
 535         'only_matching': True,
 536     }, {
 537         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly',
 538         'only_matching': True,
 539     }, {
 540         'url': 'https://www.pornhub.com/video',
 541         'only_matching': True,
 542     }, {
 543         'url': 'https://www.pornhub.com/video?page=3',
 544         'only_matching': True,
 545     }, {
 546         'url': 'https://www.pornhub.com/video/search?search=123',
 547         'only_matching': True,
 548     }, {
 549         'url': 'https://www.pornhub.com/categories/teen',
 550         'only_matching': True,
 551     }, {
 552         'url': 'https://www.pornhub.com/categories/teen?page=3',
 553         'only_matching': True,
 554     }, {
 555         'url': 'https://www.pornhub.com/hd',
 556         'only_matching': True,
 557     }, {
 558         'url': 'https://www.pornhub.com/hd?page=3',
 559         'only_matching': True,
 560     }, {
 561         'url': 'https://www.pornhub.com/described-video',
 562         'only_matching': True,
 563     }, {
 564         'url': 'https://www.pornhub.com/described-video?page=2',
 565         'only_matching': True,
 566     }, {
 567         'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
 568         'only_matching': True,
 569     }, {
 570         'url': 'https://www.pornhub.com/playlist/44121572',
 571         'info_dict': {
 572             'id': 'playlist/44121572',
 573         },
 574         'playlist_mincount': 132,
 575     }, {
 576         'url': 'https://www.pornhub.com/playlist/4667351',
 577         'only_matching': True,
 578     }, {
 579         'url': 'https://de.pornhub.com/playlist/4667351',
 580         'only_matching': True,
 581     }]
 582
 583     @classmethod
 584     def suitable(cls, url):
 585         return (False
 586                 if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url)
 587                 else super(PornHubPagedVideoListIE, cls).suitable(url))
 588
 589
class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
    """Extractor for the ".../videos/upload" listing of a user, channel, model or pornstar.

    NOTE(review): no _real_extract is defined in the part of the class
    visible here, so extraction presumably falls through to the paged
    implementation of PornHubPagedPlaylistBaseIE - confirm nothing follows.
    """
    # Groups: `url` spans the whole matched URL, `host` is the
    # pornhub.com/.net domain and `id` is the path segment that names the
    # user/channel/model/pornstar.
    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
    _TESTS = [{
        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
        'info_dict': {
            'id': 'jenny-blighe',
        },
        'playlist_mincount': 129,
    }, {
        'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
        'only_matching': True,
    }]