Update upstream source from tag 'upstream/2018.06.18'

[youtubedl] / youtube_dl / extractor / motherless.py
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py

index 6fe3b6049b2917ed5d7b075d0ca2c7ae943c459f..e24396e791cfddf88e53a34639e563b54795694e 100644 (file)
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -4,8 +4,11 @@ import datetime
  import re
  
  from .common import InfoExtractor
+from ..compat import compat_urlparse
  from ..utils import (
      ExtractorError,
+    InAdvancePagedList,
+    orderedSet,
      str_to_int,
      unified_strdate,
  )
@@ -114,3 +117,86 @@ class MotherlessIE(InfoExtractor):
              'age_limit': age_limit,
              'url': video_url,
          }
+
+
+class MotherlessGroupIE(InfoExtractor):
+    _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
+    _TESTS = [{
+        'url': 'http://motherless.com/g/movie_scenes',
+        'info_dict': {
+            'id': 'movie_scenes',
+            'title': 'Movie Scenes',
+            'description': 'Hot and sexy scenes from "regular" movies... '
+                           'Beautiful actresses fully nude... A looot of '
+                           'skin! :)Enjoy!',
+        },
+        'playlist_mincount': 662,
+    }, {
+        'url': 'http://motherless.com/gv/sex_must_be_funny',
+        'info_dict': {
+            'id': 'sex_must_be_funny',
+            'title': 'Sex must be funny',
+            'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
+                           'any kind!'
+        },
+        'playlist_mincount': 9,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if MotherlessIE.suitable(url)
+                else super(MotherlessGroupIE, cls).suitable(url))
+
+    def _extract_entries(self, webpage, base):
+        entries = []
+        for mobj in re.finditer(
+                r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
+                webpage):
+            video_url = compat_urlparse.urljoin(base, mobj.group('href'))
+            if not MotherlessIE.suitable(video_url):
+                continue
+            video_id = MotherlessIE._match_id(video_url)
+            title = mobj.group('title')
+            entries.append(self.url_result(
+                video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
+                video_title=title))
+        # Alternative fallback
+        if not entries:
+            entries = [
+                self.url_result(
+                    compat_urlparse.urljoin(base, '/' + video_id),
+                    ie=MotherlessIE.ie_key(), video_id=video_id)
+                for video_id in orderedSet(re.findall(
+                    r'data-codename=["\']([A-Z0-9]+)', webpage))]
+        return entries
+
+    def _real_extract(self, url):
+        group_id = self._match_id(url)
+        page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id)
+        webpage = self._download_webpage(page_url, group_id)
+        title = self._search_regex(
+            r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
+        description = self._html_search_meta(
+            'description', webpage, fatal=False)
+        page_count = self._int(self._search_regex(
+            r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT',
+            webpage, 'page_count'), 'page_count')
+        PAGE_SIZE = 80
+
+        def _get_page(idx):
+            webpage = self._download_webpage(
+                page_url, group_id, query={'page': idx + 1},
+                note='Downloading page %d/%d' % (idx + 1, page_count)
+            )
+            for entry in self._extract_entries(webpage, url):
+                yield entry
+
+        playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
+
+        return {
+            '_type': 'playlist',
+            'id': group_id,
+            'title': title,
+            'description': description,
+            'entries': playlist
+        }