]> Raphaël G. Git Repositories - youtubedl/blobdiff - youtube_dl/extractor/crunchyroll.py
debian/control: Update list of extractors in long description.
[youtubedl] / youtube_dl / extractor / crunchyroll.py
index 8bdaf0c2c5af7aa1a562033487a16867c3304f19..ba8b9fa7eff3105fb657cf322fdacab17da632eb 100644 (file)
@@ -3,13 +3,14 @@ from __future__ import unicode_literals
 
 import re
 import json
 
 import re
 import json
-import base64
 import zlib
 
 from hashlib import sha1
 from math import pow, sqrt, floor
 from .common import InfoExtractor
 import zlib
 
 from hashlib import sha1
 from math import pow, sqrt, floor
 from .common import InfoExtractor
+from .vrv import VRVIE
 from ..compat import (
 from ..compat import (
+    compat_b64decode,
     compat_etree_fromstring,
     compat_urllib_parse_urlencode,
     compat_urllib_request,
     compat_etree_fromstring,
     compat_urllib_parse_urlencode,
     compat_urllib_request,
@@ -18,6 +19,8 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     bytes_to_intlist,
 from ..utils import (
     ExtractorError,
     bytes_to_intlist,
+    extract_attributes,
+    float_or_none,
     intlist_to_bytes,
     int_or_none,
     lowercase_escape,
     intlist_to_bytes,
     int_or_none,
     lowercase_escape,
@@ -26,7 +29,6 @@ from ..utils import (
     unified_strdate,
     urlencode_postdata,
     xpath_text,
     unified_strdate,
     urlencode_postdata,
     xpath_text,
-    extract_attributes,
 )
 from ..aes import (
     aes_cbc_decrypt,
 )
 from ..aes import (
     aes_cbc_decrypt,
@@ -38,11 +40,32 @@ class CrunchyrollBaseIE(InfoExtractor):
     _LOGIN_FORM = 'login_form'
     _NETRC_MACHINE = 'crunchyroll'
 
     _LOGIN_FORM = 'login_form'
     _NETRC_MACHINE = 'crunchyroll'
 
+    def _call_rpc_api(self, method, video_id, note=None, data=None):
+        data = data or {}
+        data['req'] = 'RpcApi' + method
+        data = compat_urllib_parse_urlencode(data).encode('utf-8')
+        return self._download_xml(
+            'http://www.crunchyroll.com/xml/',
+            video_id, note, fatal=False, data=data, headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+            })
+
     def _login(self):
     def _login(self):
-        (username, password) = self._get_login_info()
+        username, password = self._get_login_info()
         if username is None:
             return
 
         if username is None:
             return
 
+        self._download_webpage(
+            'https://www.crunchyroll.com/?a=formhandler',
+            None, 'Logging in', 'Wrong login info',
+            data=urlencode_postdata({
+                'formname': 'RpcApiUser_Login',
+                'next_url': 'https://www.crunchyroll.com/acct/membership',
+                'name': username,
+                'password': password,
+            }))
+
+        '''
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page')
 
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page')
 
@@ -86,6 +109,7 @@ class CrunchyrollBaseIE(InfoExtractor):
             raise ExtractorError('Unable to login: %s' % error, expected=True)
 
         raise ExtractorError('Unable to log in')
             raise ExtractorError('Unable to login: %s' % error, expected=True)
 
         raise ExtractorError('Unable to log in')
+        '''
 
     def _real_initialize(self):
         self._login()
 
     def _real_initialize(self):
         self._login()
@@ -117,7 +141,8 @@ class CrunchyrollBaseIE(InfoExtractor):
             parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
 
 
             parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
 
 
-class CrunchyrollIE(CrunchyrollBaseIE):
+class CrunchyrollIE(CrunchyrollBaseIE, VRVIE):
+    IE_NAME = 'crunchyroll'
     _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
     _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
@@ -126,7 +151,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
             'ext': 'mp4',
             'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
             'description': 'md5:2d17137920c64f2f49981a7797d275ef',
             'ext': 'mp4',
             'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
             'description': 'md5:2d17137920c64f2f49981a7797d275ef',
-            'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
+            'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'Yomiuri Telecasting Corporation (YTV)',
             'upload_date': '20131013',
             'url': 're:(?!.*&amp)',
             'uploader': 'Yomiuri Telecasting Corporation (YTV)',
             'upload_date': '20131013',
             'url': 're:(?!.*&amp)',
@@ -199,7 +224,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
         'info_dict': {
             'id': '535080',
             'ext': 'mp4',
         'info_dict': {
             'id': '535080',
             'ext': 'mp4',
-            'title': '11eyes Episode 1 – Piros éjszaka - Red Night',
+            'title': '11eyes Episode 1 – Red Night ~ Piros éjszaka',
             'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".',
             'uploader': 'Marvelous AQL Inc.',
             'upload_date': '20091021',
             'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".',
             'uploader': 'Marvelous AQL Inc.',
             'upload_date': '20091021',
@@ -240,6 +265,9 @@ class CrunchyrollIE(CrunchyrollBaseIE):
             # Just test metadata extraction
             'skip_download': True,
         },
             # Just test metadata extraction
             'skip_download': True,
         },
+    }, {
+        'url': 'http://www.crunchyroll.com/media-723735',
+        'only_matching': True,
     }]
 
     _FORMAT_IDS = {
     }]
 
     _FORMAT_IDS = {
@@ -250,8 +278,8 @@ class CrunchyrollIE(CrunchyrollBaseIE):
     }
 
     def _decrypt_subtitles(self, data, iv, id):
     }
 
     def _decrypt_subtitles(self, data, iv, id):
-        data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
-        iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
+        data = bytes_to_intlist(compat_b64decode(data))
+        iv = bytes_to_intlist(compat_b64decode(iv))
         id = int(id)
 
         def obfuscate_key_aux(count, modulo, start):
         id = int(id)
 
         def obfuscate_key_aux(count, modulo, start):
@@ -365,15 +393,19 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
     def _get_subtitles(self, video_id, webpage):
         subtitles = {}
         for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage):
     def _get_subtitles(self, video_id, webpage):
         subtitles = {}
         for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage):
-            sub_page = self._download_webpage(
-                'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id,
-                video_id, note='Downloading subtitles for ' + sub_name)
-            id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False)
-            iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False)
-            data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
-            if not id or not iv or not data:
+            sub_doc = self._call_rpc_api(
+                'Subtitle_GetXml', video_id,
+                'Downloading subtitles for ' + sub_name, data={
+                    'subtitle_script_id': sub_id,
+                })
+            if sub_doc is None:
+                continue
+            sid = sub_doc.get('id')
+            iv = xpath_text(sub_doc, 'iv', 'subtitle iv')
+            data = xpath_text(sub_doc, 'data', 'subtitle data')
+            if not sid or not iv or not data:
                 continue
                 continue
-            subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
+            subtitle = self._decrypt_subtitles(data, iv, sid).decode('utf-8')
             lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
             if not lang_code:
                 continue
             lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
             if not lang_code:
                 continue
@@ -408,13 +440,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
         if 'To view this, please log in to verify you are 18 or older.' in webpage:
             self.raise_login_required()
 
         if 'To view this, please log in to verify you are 18 or older.' in webpage:
             self.raise_login_required()
 
+        media = self._parse_json(self._search_regex(
+            r'vilos\.config\.media\s*=\s*({.+?});',
+            webpage, 'vilos media', default='{}'), video_id)
+        media_metadata = media.get('metadata') or {}
+
         video_title = self._html_search_regex(
             r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',
             webpage, 'video_title')
         video_title = re.sub(r' {2,}', ' ', video_title)
         video_title = self._html_search_regex(
             r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',
             webpage, 'video_title')
         video_title = re.sub(r' {2,}', ' ', video_title)
-        video_description = self._parse_json(self._html_search_regex(
+        video_description = (self._parse_json(self._html_search_regex(
             r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
             r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
-            webpage, 'description', default='{}'), video_id).get('description')
+            webpage, 'description', default='{}'), video_id) or media_metadata).get('description')
         if video_description:
             video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
         video_upload_date = self._html_search_regex(
         if video_description:
             video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
         video_upload_date = self._html_search_regex(
@@ -427,86 +464,118 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
             webpage, 'video_uploader', fatal=False)
 
             [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
             webpage, 'video_uploader', fatal=False)
 
-        available_fmts = []
-        for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
-            attrs = extract_attributes(a)
-            href = attrs.get('href')
-            if href and '/freetrial' in href:
-                continue
-            available_fmts.append(fmt)
-        if not available_fmts:
-            for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
-                available_fmts = re.findall(p, webpage)
-                if available_fmts:
-                    break
-        video_encode_ids = []
         formats = []
         formats = []
-        for fmt in available_fmts:
-            stream_quality, stream_format = self._FORMAT_IDS[fmt]
-            video_format = fmt + 'p'
-            streamdata_req = sanitized_Request(
-                'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s'
-                % (video_id, stream_format, stream_quality),
-                compat_urllib_parse_urlencode({'current_page': url}).encode('utf-8'))
-            streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
-            streamdata = self._download_xml(
-                streamdata_req, video_id,
-                note='Downloading media info for %s' % video_format)
-            stream_info = streamdata.find('./{default}preload/stream_info')
-            video_encode_id = xpath_text(stream_info, './video_encode_id')
-            if video_encode_id in video_encode_ids:
-                continue
-            video_encode_ids.append(video_encode_id)
-
-            video_file = xpath_text(stream_info, './file')
-            if not video_file:
-                continue
-            if video_file.startswith('http'):
-                formats.extend(self._extract_m3u8_formats(
-                    video_file, video_id, 'mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls', fatal=False))
-                continue
+        for stream in media.get('streams', []):
+            formats.extend(self._extract_vrv_formats(
+                stream.get('url'), video_id, stream.get('format'),
+                stream.get('audio_lang'), stream.get('hardsub_lang')))
+        if not formats:
+            available_fmts = []
+            for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
+                attrs = extract_attributes(a)
+                href = attrs.get('href')
+                if href and '/freetrial' in href:
+                    continue
+                available_fmts.append(fmt)
+            if not available_fmts:
+                for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
+                    available_fmts = re.findall(p, webpage)
+                    if available_fmts:
+                        break
+            if not available_fmts:
+                available_fmts = self._FORMAT_IDS.keys()
+            video_encode_ids = []
+
+            for fmt in available_fmts:
+                stream_quality, stream_format = self._FORMAT_IDS[fmt]
+                video_format = fmt + 'p'
+                stream_infos = []
+                streamdata = self._call_rpc_api(
+                    'VideoPlayer_GetStandardConfig', video_id,
+                    'Downloading media info for %s' % video_format, data={
+                        'media_id': video_id,
+                        'video_format': stream_format,
+                        'video_quality': stream_quality,
+                        'current_page': url,
+                    })
+                if streamdata is not None:
+                    stream_info = streamdata.find('./{default}preload/stream_info')
+                    if stream_info is not None:
+                        stream_infos.append(stream_info)
+                stream_info = self._call_rpc_api(
+                    'VideoEncode_GetStreamInfo', video_id,
+                    'Downloading stream info for %s' % video_format, data={
+                        'media_id': video_id,
+                        'video_format': stream_format,
+                        'video_encode_quality': stream_quality,
+                    })
+                if stream_info is not None:
+                    stream_infos.append(stream_info)
+                for stream_info in stream_infos:
+                    video_encode_id = xpath_text(stream_info, './video_encode_id')
+                    if video_encode_id in video_encode_ids:
+                        continue
+                    video_encode_ids.append(video_encode_id)
+
+                    video_file = xpath_text(stream_info, './file')
+                    if not video_file:
+                        continue
+                    if video_file.startswith('http'):
+                        formats.extend(self._extract_m3u8_formats(
+                            video_file, video_id, 'mp4', entry_protocol='m3u8_native',
+                            m3u8_id='hls', fatal=False))
+                        continue
+
+                    video_url = xpath_text(stream_info, './host')
+                    if not video_url:
+                        continue
+                    metadata = stream_info.find('./metadata')
+                    format_info = {
+                        'format': video_format,
+                        'height': int_or_none(xpath_text(metadata, './height')),
+                        'width': int_or_none(xpath_text(metadata, './width')),
+                    }
+
+                    if '.fplive.net/' in video_url:
+                        video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
+                        parsed_video_url = compat_urlparse.urlparse(video_url)
+                        direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
+                            netloc='v.lvlt.crcdn.net',
+                            path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
+                        if self._is_valid_url(direct_video_url, video_id, video_format):
+                            format_info.update({
+                                'format_id': 'http-' + video_format,
+                                'url': direct_video_url,
+                            })
+                            formats.append(format_info)
+                            continue
 
 
-            video_url = xpath_text(stream_info, './host')
-            if not video_url:
-                continue
-            metadata = stream_info.find('./metadata')
-            format_info = {
-                'format': video_format,
-                'format_id': video_format,
-                'height': int_or_none(xpath_text(metadata, './height')),
-                'width': int_or_none(xpath_text(metadata, './width')),
-            }
-
-            if '.fplive.net/' in video_url:
-                video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
-                parsed_video_url = compat_urlparse.urlparse(video_url)
-                direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
-                    netloc='v.lvlt.crcdn.net',
-                    path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
-                if self._is_valid_url(direct_video_url, video_id, video_format):
                     format_info.update({
                     format_info.update({
-                        'url': direct_video_url,
+                        'format_id': 'rtmp-' + video_format,
+                        'url': video_url,
+                        'play_path': video_file,
+                        'ext': 'flv',
                     })
                     formats.append(format_info)
                     })
                     formats.append(format_info)
-                    continue
-
-            format_info.update({
-                'url': video_url,
-                'play_path': video_file,
-                'ext': 'flv',
-            })
-            formats.append(format_info)
-        self._sort_formats(formats)
+        self._sort_formats(formats, ('height', 'width', 'tbr', 'fps'))
 
 
-        metadata = self._download_xml(
-            'http://www.crunchyroll.com/xml', video_id,
-            note='Downloading media info', query={
-                'req': 'RpcApiVideoPlayer_GetMediaMetadata',
+        metadata = self._call_rpc_api(
+            'VideoPlayer_GetMediaMetadata', video_id,
+            note='Downloading media info', data={
                 'media_id': video_id,
             })
 
                 'media_id': video_id,
             })
 
-        subtitles = self.extract_subtitles(video_id, webpage)
+        subtitles = {}
+        for subtitle in media.get('subtitles', []):
+            subtitle_url = subtitle.get('url')
+            if not subtitle_url:
+                continue
+            subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({
+                'url': subtitle_url,
+                'ext': subtitle.get('format', 'ass'),
+            })
+        if not subtitles:
+            subtitles = self.extract_subtitles(video_id, webpage)
 
         # webpage provide more accurate data than series_title from XML
         series = self._html_search_regex(
 
         # webpage provide more accurate data than series_title from XML
         series = self._html_search_regex(
@@ -514,8 +583,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             webpage, 'series', fatal=False)
         season = xpath_text(metadata, 'series_title')
 
             webpage, 'series', fatal=False)
         season = xpath_text(metadata, 'series_title')
 
-        episode = xpath_text(metadata, 'episode_title')
-        episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
+        episode = xpath_text(metadata, 'episode_title') or media_metadata.get('title')
+        episode_number = int_or_none(xpath_text(metadata, 'episode_number') or media_metadata.get('episode_number'))
 
         season_number = int_or_none(self._search_regex(
             r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
 
         season_number = int_or_none(self._search_regex(
             r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
@@ -525,7 +594,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             'id': video_id,
             'title': video_title,
             'description': video_description,
             'id': video_id,
             'title': video_title,
             'description': video_description,
-            'thumbnail': xpath_text(metadata, 'episode_image_url'),
+            'duration': float_or_none(media_metadata.get('duration'), 1000),
+            'thumbnail': xpath_text(metadata, 'episode_image_url') or media_metadata.get('thumbnail', {}).get('url'),
             'uploader': video_uploader,
             'upload_date': video_upload_date,
             'series': series,
             'uploader': video_uploader,
             'upload_date': video_upload_date,
             'series': series,
@@ -540,7 +610,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 
 class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
     IE_NAME = 'crunchyroll:playlist'
 
 class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
     IE_NAME = 'crunchyroll:playlist'
-    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)'
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)'
 
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
 
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',