X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/87a0165ca7e39af4dacb7ec637063b2cd35ae40b..f8df414a4abcde0ddd39325dac26ca071d2d15c6:/youtube_dl/extractor/crunchyroll.py?ds=inline diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 8d5b69f..8bdaf0c 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -123,7 +123,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { 'id': '645513', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', 'description': 'md5:2d17137920c64f2f49981a7797d275ef', 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', @@ -142,7 +142,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'ext': 'flv', 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Danny Choo Network', 'upload_date': '20120213', }, @@ -158,7 +158,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'ext': 'mp4', 'title': 'Re:ZERO -Starting Life in Another World- Episode 5 – The Morning of Our Promise Is Still Distant', 'description': 'md5:97664de1ab24bbf77a9c01918cb7dca9', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'TV TOKYO', 'upload_date': '20160508', }, @@ -166,6 +166,26 @@ class CrunchyrollIE(CrunchyrollBaseIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'http://www.crunchyroll.com/konosuba-gods-blessing-on-this-wonderful-world/episode-1-give-me-deliverance-from-this-judicial-injustice-727589', + 'info_dict': { + 'id': '727589', + 'ext': 'mp4', + 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!", + 'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Kadokawa Pictures Inc.', + 'upload_date': '20170118', + 'series': "KONOSUBA -God's blessing on this wonderful world!", + 'season': "KONOSUBA -God's blessing on this wonderful world! 2", + 'season_number': 2, + 'episode': 'Give Me Deliverance From This Judicial Injustice!', + 'episode_number': 1, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, @@ -173,6 +193,53 @@ class CrunchyrollIE(CrunchyrollBaseIE): # geo-restricted (US), 18+ maturity wall, non-premium available 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617', 'only_matching': True, + }, { + # A description with double quotes + 'url': 'http://www.crunchyroll.com/11eyes/episode-1-piros-jszaka-red-night-535080', + 'info_dict': { + 'id': '535080', + 'ext': 'mp4', + 'title': '11eyes Episode 1 – Piros éjszaka - Red Night', + 'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".', + 'uploader': 'Marvelous AQL Inc.', + 'upload_date': '20091021', + }, + 'params': { + # Just test metadata extraction + 'skip_download': True, + }, + }, { + # make sure we can extract an uploader name that's not a link + 'url': 'http://www.crunchyroll.com/hakuoki-reimeiroku/episode-1-dawn-of-the-divine-warriors-606899', + 'info_dict': { + 'id': '606899', + 'ext': 'mp4', + 'title': 'Hakuoki Reimeiroku Episode 1 – Dawn of the Divine Warriors', + 'description': 'Ryunosuke was left to die, but Serizawa-san asked him a simple question "Do you want to live?"', + 'uploader': 'Geneon Entertainment', + 'upload_date': '20120717', + }, + 'params': { + # just test metadata extraction + 'skip_download': True, + }, + }, { + # A video with a vastly different season name compared to the series name + 'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532', + 'info_dict': { + 'id': '590532', + 'ext': 'mp4', + 'title': 'Haiyoru! Nyaruani (ONA) Episode 1 – Test', + 'description': 'Mahiro and Nyaruko talk about official certification.', + 'uploader': 'TV TOKYO', + 'upload_date': '20120305', + 'series': 'Nyarko-san: Another Crawling Chaos', + 'season': 'Haiyoru! Nyaruani (ONA)', + }, + 'params': { + # Just test metadata extraction + 'skip_download': True, + }, }] _FORMAT_IDS = { @@ -236,8 +303,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style'] output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x'] output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y'] - output += """ScaledBorderAndShadow: no - + output += """ [V4+ Styles] Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding """ @@ -324,7 +390,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text else: webpage_url = 'http://www.' + mobj.group('url') - webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage') + webpage = self._download_webpage( + self._add_skip_wall(webpage_url), video_id, + headers=self.geo_verification_headers()) note_m = self._html_search_regex( r'
(.+?)
', webpage, 'trailer-notice', default='') @@ -344,9 +412,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'(?s)]*>((?:(?!]+itemprop=["\']title["\'][^>]*>(?:(?!', webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) - video_description = self._html_search_regex( - r']*>\s*.+?\[media_id=%s\].+?"description"\s*:\s*"([^"]+)' % video_id, - webpage, 'description', default=None) + video_description = self._parse_json(self._html_search_regex( + r']*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, + webpage, 'description', default='{}'), video_id).get('description') if video_description: video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_upload_date = self._html_search_regex( @@ -355,8 +423,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if video_upload_date: video_upload_date = unified_strdate(video_upload_date) video_uploader = self._html_search_regex( - r']+href="/publisher/[^"]+"[^>]*>([^<]+)', webpage, - 'video_uploader', fatal=False) + # try looking for both an uploader that's a link and one that's not + [r']+href="/publisher/[^"]+"[^>]*>([^<]+)', r'
\s*Publisher:\s*\s*(.+?)\s*\s*
'], + webpage, 'video_uploader', fatal=False) available_fmts = [] for a, fmt in re.findall(r'(]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): @@ -439,6 +508,19 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text subtitles = self.extract_subtitles(video_id, webpage) + # webpage provide more accurate data than series_title from XML + series = self._html_search_regex( + r'(?s)]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)]+id=["\']showmedia_about_episode_num[^>]+>.+?\s*

\s*Season (\d+)', + webpage, 'season number', default=None)) + return { 'id': video_id, 'title': video_title, @@ -446,9 +528,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'thumbnail': xpath_text(metadata, 'episode_image_url'), 'uploader': video_uploader, 'upload_date': video_upload_date, - 'series': xpath_text(metadata, 'series_title'), - 'episode': xpath_text(metadata, 'episode_title'), - 'episode_number': int_or_none(xpath_text(metadata, 'episode_number')), + 'series': series, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'subtitles': subtitles, 'formats': formats, } @@ -483,16 +567,18 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): def _real_extract(self, url): show_id = self._match_id(url) - webpage = self._download_webpage(self._add_skip_wall(url), show_id) + webpage = self._download_webpage( + self._add_skip_wall(url), show_id, + headers=self.geo_verification_headers()) title = self._html_search_regex( r'(?s)]*>\s*(.*?)', webpage, 'title') episode_paths = re.findall( - r'(?s)
  • ]+>.*?]+>.*?