]> Raphaƫl G. Git Repositories - youtubedl/blobdiff - youtube_dl/extractor/motherless.py
debian/control: Mark my GitHub trees as the preferred repos.
[youtubedl] / youtube_dl / extractor / motherless.py
index 6fe3b6049b2917ed5d7b075d0ca2c7ae943c459f..43fd70f112005a893377c8e5cf489291fb8cc812 100644 (file)
@@ -4,8 +4,11 @@ import datetime
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
     ExtractorError,
+    InAdvancePagedList,
+    orderedSet,
     str_to_int,
     unified_strdate,
 )
@@ -74,8 +77,11 @@ class MotherlessIE(InfoExtractor):
 
         title = self._html_search_regex(
             r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
-        video_url = self._html_search_regex(
-            r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL')
+        video_url = (self._html_search_regex(
+            (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+             r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
+            webpage, 'video URL', default=None, group='url')
+            or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
         age_limit = self._rta_search(webpage)
         view_count = str_to_int(self._html_search_regex(
             r'<strong>Views</strong>\s+([^<]+)<',
@@ -114,3 +120,86 @@ class MotherlessIE(InfoExtractor):
             'age_limit': age_limit,
             'url': video_url,
         }
+
+
+class MotherlessGroupIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
+    _TESTS = [{
+        'url': 'http://motherless.com/g/movie_scenes',
+        'info_dict': {
+            'id': 'movie_scenes',
+            'title': 'Movie Scenes',
+            'description': 'Hot and sexy scenes from "regular" movies... '
+                           'Beautiful actresses fully nude... A looot of '
+                           'skin! :)Enjoy!',
+        },
+        'playlist_mincount': 662,
+    }, {
+        'url': 'http://motherless.com/gv/sex_must_be_funny',
+        'info_dict': {
+            'id': 'sex_must_be_funny',
+            'title': 'Sex must be funny',
+            'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
+                           'any kind!'
+        },
+        'playlist_mincount': 9,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if MotherlessIE.suitable(url)
+                else super(MotherlessGroupIE, cls).suitable(url))
+
+    def _extract_entries(self, webpage, base):
+        entries = []
+        for mobj in re.finditer(
+                r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
+                webpage):
+            video_url = compat_urlparse.urljoin(base, mobj.group('href'))
+            if not MotherlessIE.suitable(video_url):
+                continue
+            video_id = MotherlessIE._match_id(video_url)
+            title = mobj.group('title')
+            entries.append(self.url_result(
+                video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
+                video_title=title))
+        # Alternative fallback
+        if not entries:
+            entries = [
+                self.url_result(
+                    compat_urlparse.urljoin(base, '/' + entry_id),
+                    ie=MotherlessIE.ie_key(), video_id=entry_id)
+                for entry_id in orderedSet(re.findall(
+                    r'data-codename=["\']([A-Z0-9]+)', webpage))]
+        return entries
+
+    def _real_extract(self, url):
+        group_id = self._match_id(url)
+        page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id)
+        webpage = self._download_webpage(page_url, group_id)
+        title = self._search_regex(
+            r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
+        description = self._html_search_meta(
+            'description', webpage, fatal=False)
+        page_count = self._int(self._search_regex(
+            r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT',
+            webpage, 'page_count'), 'page_count')
+        PAGE_SIZE = 80
+
+        def _get_page(idx):
+            webpage = self._download_webpage(
+                page_url, group_id, query={'page': idx + 1},
+                note='Downloading page %d/%d' % (idx + 1, page_count)
+            )
+            for entry in self._extract_entries(webpage, url):
+                yield entry
+
+        playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
+
+        return {
+            '_type': 'playlist',
+            'id': group_id,
+            'title': title,
+            'description': description,
+            'entries': playlist
+        }