]> Raphaël G. Git Repositories - youtubedl/blobdiff - youtube_dl/extractor/crunchyroll.py
Update upstream source from tag 'upstream/2018.09.10'
[youtubedl] / youtube_dl / extractor / crunchyroll.py
index b92f2544799769abc9690a04d4d1090af435ca79..ba8b9fa7eff3105fb657cf322fdacab17da632eb 100644 (file)
@@ -3,13 +3,14 @@ from __future__ import unicode_literals
 
 import re
 import json
 
 import re
 import json
-import base64
 import zlib
 
 from hashlib import sha1
 from math import pow, sqrt, floor
 from .common import InfoExtractor
 import zlib
 
 from hashlib import sha1
 from math import pow, sqrt, floor
 from .common import InfoExtractor
+from .vrv import VRVIE
 from ..compat import (
 from ..compat import (
+    compat_b64decode,
     compat_etree_fromstring,
     compat_urllib_parse_urlencode,
     compat_urllib_request,
     compat_etree_fromstring,
     compat_urllib_parse_urlencode,
     compat_urllib_request,
@@ -18,6 +19,8 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     bytes_to_intlist,
 from ..utils import (
     ExtractorError,
     bytes_to_intlist,
+    extract_attributes,
+    float_or_none,
     intlist_to_bytes,
     int_or_none,
     lowercase_escape,
     intlist_to_bytes,
     int_or_none,
     lowercase_escape,
@@ -26,7 +29,6 @@ from ..utils import (
     unified_strdate,
     urlencode_postdata,
     xpath_text,
     unified_strdate,
     urlencode_postdata,
     xpath_text,
-    extract_attributes,
 )
 from ..aes import (
     aes_cbc_decrypt,
 )
 from ..aes import (
     aes_cbc_decrypt,
@@ -49,7 +51,7 @@ class CrunchyrollBaseIE(InfoExtractor):
             })
 
     def _login(self):
             })
 
     def _login(self):
-        (username, password) = self._get_login_info()
+        username, password = self._get_login_info()
         if username is None:
             return
 
         if username is None:
             return
 
@@ -139,7 +141,8 @@ class CrunchyrollBaseIE(InfoExtractor):
             parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
 
 
             parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
 
 
-class CrunchyrollIE(CrunchyrollBaseIE):
+class CrunchyrollIE(CrunchyrollBaseIE, VRVIE):
+    IE_NAME = 'crunchyroll'
     _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
     _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
@@ -148,7 +151,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
             'ext': 'mp4',
             'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
             'description': 'md5:2d17137920c64f2f49981a7797d275ef',
             'ext': 'mp4',
             'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
             'description': 'md5:2d17137920c64f2f49981a7797d275ef',
-            'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
+            'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'Yomiuri Telecasting Corporation (YTV)',
             'upload_date': '20131013',
             'url': 're:(?!.*&amp)',
             'uploader': 'Yomiuri Telecasting Corporation (YTV)',
             'upload_date': '20131013',
             'url': 're:(?!.*&amp)',
@@ -221,7 +224,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
         'info_dict': {
             'id': '535080',
             'ext': 'mp4',
         'info_dict': {
             'id': '535080',
             'ext': 'mp4',
-            'title': '11eyes Episode 1 – Piros éjszaka - Red Night',
+            'title': '11eyes Episode 1 – Red Night ~ Piros éjszaka',
             'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".',
             'uploader': 'Marvelous AQL Inc.',
             'upload_date': '20091021',
             'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".',
             'uploader': 'Marvelous AQL Inc.',
             'upload_date': '20091021',
@@ -262,6 +265,9 @@ class CrunchyrollIE(CrunchyrollBaseIE):
             # Just test metadata extraction
             'skip_download': True,
         },
             # Just test metadata extraction
             'skip_download': True,
         },
+    }, {
+        'url': 'http://www.crunchyroll.com/media-723735',
+        'only_matching': True,
     }]
 
     _FORMAT_IDS = {
     }]
 
     _FORMAT_IDS = {
@@ -272,8 +278,8 @@ class CrunchyrollIE(CrunchyrollBaseIE):
     }
 
     def _decrypt_subtitles(self, data, iv, id):
     }
 
     def _decrypt_subtitles(self, data, iv, id):
-        data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
-        iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
+        data = bytes_to_intlist(compat_b64decode(data))
+        iv = bytes_to_intlist(compat_b64decode(iv))
         id = int(id)
 
         def obfuscate_key_aux(count, modulo, start):
         id = int(id)
 
         def obfuscate_key_aux(count, modulo, start):
@@ -434,13 +440,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
         if 'To view this, please log in to verify you are 18 or older.' in webpage:
             self.raise_login_required()
 
         if 'To view this, please log in to verify you are 18 or older.' in webpage:
             self.raise_login_required()
 
+        media = self._parse_json(self._search_regex(
+            r'vilos\.config\.media\s*=\s*({.+?});',
+            webpage, 'vilos media', default='{}'), video_id)
+        media_metadata = media.get('metadata') or {}
+
         video_title = self._html_search_regex(
             r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',
             webpage, 'video_title')
         video_title = re.sub(r' {2,}', ' ', video_title)
         video_title = self._html_search_regex(
             r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',
             webpage, 'video_title')
         video_title = re.sub(r' {2,}', ' ', video_title)
-        video_description = self._parse_json(self._html_search_regex(
+        video_description = (self._parse_json(self._html_search_regex(
             r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
             r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
-            webpage, 'description', default='{}'), video_id).get('description')
+            webpage, 'description', default='{}'), video_id) or media_metadata).get('description')
         if video_description:
             video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
         video_upload_date = self._html_search_regex(
         if video_description:
             video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
         video_upload_date = self._html_search_regex(
@@ -453,91 +464,99 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
             webpage, 'video_uploader', fatal=False)
 
             [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
             webpage, 'video_uploader', fatal=False)
 
-        available_fmts = []
-        for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
-            attrs = extract_attributes(a)
-            href = attrs.get('href')
-            if href and '/freetrial' in href:
-                continue
-            available_fmts.append(fmt)
-        if not available_fmts:
-            for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
-                available_fmts = re.findall(p, webpage)
-                if available_fmts:
-                    break
-        video_encode_ids = []
         formats = []
         formats = []
-        for fmt in available_fmts:
-            stream_quality, stream_format = self._FORMAT_IDS[fmt]
-            video_format = fmt + 'p'
-            stream_infos = []
-            streamdata = self._call_rpc_api(
-                'VideoPlayer_GetStandardConfig', video_id,
-                'Downloading media info for %s' % video_format, data={
-                    'media_id': video_id,
-                    'video_format': stream_format,
-                    'video_quality': stream_quality,
-                    'current_page': url,
-                })
-            if streamdata is not None:
-                stream_info = streamdata.find('./{default}preload/stream_info')
+        for stream in media.get('streams', []):
+            formats.extend(self._extract_vrv_formats(
+                stream.get('url'), video_id, stream.get('format'),
+                stream.get('audio_lang'), stream.get('hardsub_lang')))
+        if not formats:
+            available_fmts = []
+            for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
+                attrs = extract_attributes(a)
+                href = attrs.get('href')
+                if href and '/freetrial' in href:
+                    continue
+                available_fmts.append(fmt)
+            if not available_fmts:
+                for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
+                    available_fmts = re.findall(p, webpage)
+                    if available_fmts:
+                        break
+            if not available_fmts:
+                available_fmts = self._FORMAT_IDS.keys()
+            video_encode_ids = []
+
+            for fmt in available_fmts:
+                stream_quality, stream_format = self._FORMAT_IDS[fmt]
+                video_format = fmt + 'p'
+                stream_infos = []
+                streamdata = self._call_rpc_api(
+                    'VideoPlayer_GetStandardConfig', video_id,
+                    'Downloading media info for %s' % video_format, data={
+                        'media_id': video_id,
+                        'video_format': stream_format,
+                        'video_quality': stream_quality,
+                        'current_page': url,
+                    })
+                if streamdata is not None:
+                    stream_info = streamdata.find('./{default}preload/stream_info')
+                    if stream_info is not None:
+                        stream_infos.append(stream_info)
+                stream_info = self._call_rpc_api(
+                    'VideoEncode_GetStreamInfo', video_id,
+                    'Downloading stream info for %s' % video_format, data={
+                        'media_id': video_id,
+                        'video_format': stream_format,
+                        'video_encode_quality': stream_quality,
+                    })
                 if stream_info is not None:
                     stream_infos.append(stream_info)
                 if stream_info is not None:
                     stream_infos.append(stream_info)
-            stream_info = self._call_rpc_api(
-                'VideoEncode_GetStreamInfo', video_id,
-                'Downloading stream info for %s' % video_format, data={
-                    'media_id': video_id,
-                    'video_format': stream_format,
-                    'video_encode_quality': stream_quality,
-                })
-            if stream_info is not None:
-                stream_infos.append(stream_info)
-            for stream_info in stream_infos:
-                video_encode_id = xpath_text(stream_info, './video_encode_id')
-                if video_encode_id in video_encode_ids:
-                    continue
-                video_encode_ids.append(video_encode_id)
-
-                video_file = xpath_text(stream_info, './file')
-                if not video_file:
-                    continue
-                if video_file.startswith('http'):
-                    formats.extend(self._extract_m3u8_formats(
-                        video_file, video_id, 'mp4', entry_protocol='m3u8_native',
-                        m3u8_id='hls', fatal=False))
-                    continue
+                for stream_info in stream_infos:
+                    video_encode_id = xpath_text(stream_info, './video_encode_id')
+                    if video_encode_id in video_encode_ids:
+                        continue
+                    video_encode_ids.append(video_encode_id)
 
 
-                video_url = xpath_text(stream_info, './host')
-                if not video_url:
-                    continue
-                metadata = stream_info.find('./metadata')
-                format_info = {
-                    'format': video_format,
-                    'height': int_or_none(xpath_text(metadata, './height')),
-                    'width': int_or_none(xpath_text(metadata, './width')),
-                }
-
-                if '.fplive.net/' in video_url:
-                    video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
-                    parsed_video_url = compat_urlparse.urlparse(video_url)
-                    direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
-                        netloc='v.lvlt.crcdn.net',
-                        path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
-                    if self._is_valid_url(direct_video_url, video_id, video_format):
-                        format_info.update({
-                            'format_id': 'http-' + video_format,
-                            'url': direct_video_url,
-                        })
-                        formats.append(format_info)
+                    video_file = xpath_text(stream_info, './file')
+                    if not video_file:
+                        continue
+                    if video_file.startswith('http'):
+                        formats.extend(self._extract_m3u8_formats(
+                            video_file, video_id, 'mp4', entry_protocol='m3u8_native',
+                            m3u8_id='hls', fatal=False))
                         continue
 
                         continue
 
-                format_info.update({
-                    'format_id': 'rtmp-' + video_format,
-                    'url': video_url,
-                    'play_path': video_file,
-                    'ext': 'flv',
-                })
-                formats.append(format_info)
+                    video_url = xpath_text(stream_info, './host')
+                    if not video_url:
+                        continue
+                    metadata = stream_info.find('./metadata')
+                    format_info = {
+                        'format': video_format,
+                        'height': int_or_none(xpath_text(metadata, './height')),
+                        'width': int_or_none(xpath_text(metadata, './width')),
+                    }
+
+                    if '.fplive.net/' in video_url:
+                        video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
+                        parsed_video_url = compat_urlparse.urlparse(video_url)
+                        direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
+                            netloc='v.lvlt.crcdn.net',
+                            path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
+                        if self._is_valid_url(direct_video_url, video_id, video_format):
+                            format_info.update({
+                                'format_id': 'http-' + video_format,
+                                'url': direct_video_url,
+                            })
+                            formats.append(format_info)
+                            continue
+
+                    format_info.update({
+                        'format_id': 'rtmp-' + video_format,
+                        'url': video_url,
+                        'play_path': video_file,
+                        'ext': 'flv',
+                    })
+                    formats.append(format_info)
         self._sort_formats(formats, ('height', 'width', 'tbr', 'fps'))
 
         metadata = self._call_rpc_api(
         self._sort_formats(formats, ('height', 'width', 'tbr', 'fps'))
 
         metadata = self._call_rpc_api(
@@ -546,7 +565,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
                 'media_id': video_id,
             })
 
                 'media_id': video_id,
             })
 
-        subtitles = self.extract_subtitles(video_id, webpage)
+        subtitles = {}
+        for subtitle in media.get('subtitles', []):
+            subtitle_url = subtitle.get('url')
+            if not subtitle_url:
+                continue
+            subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({
+                'url': subtitle_url,
+                'ext': subtitle.get('format', 'ass'),
+            })
+        if not subtitles:
+            subtitles = self.extract_subtitles(video_id, webpage)
 
         # webpage provide more accurate data than series_title from XML
         series = self._html_search_regex(
 
         # webpage provide more accurate data than series_title from XML
         series = self._html_search_regex(
@@ -554,8 +583,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             webpage, 'series', fatal=False)
         season = xpath_text(metadata, 'series_title')
 
             webpage, 'series', fatal=False)
         season = xpath_text(metadata, 'series_title')
 
-        episode = xpath_text(metadata, 'episode_title')
-        episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
+        episode = xpath_text(metadata, 'episode_title') or media_metadata.get('title')
+        episode_number = int_or_none(xpath_text(metadata, 'episode_number') or media_metadata.get('episode_number'))
 
         season_number = int_or_none(self._search_regex(
             r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
 
         season_number = int_or_none(self._search_regex(
             r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
@@ -565,7 +594,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             'id': video_id,
             'title': video_title,
             'description': video_description,
             'id': video_id,
             'title': video_title,
             'description': video_description,
-            'thumbnail': xpath_text(metadata, 'episode_image_url'),
+            'duration': float_or_none(media_metadata.get('duration'), 1000),
+            'thumbnail': xpath_text(metadata, 'episode_image_url') or media_metadata.get('thumbnail', {}).get('url'),
             'uploader': video_uploader,
             'upload_date': video_upload_date,
             'series': series,
             'uploader': video_uploader,
             'upload_date': video_upload_date,
             'series': series,
@@ -580,7 +610,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 
 class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
     IE_NAME = 'crunchyroll:playlist'
 
 class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
     IE_NAME = 'crunchyroll:playlist'
-    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)'
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)'
 
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
 
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',