X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/a316b1d93c357b5edf19d5e3100526a94191c029..eeb287ed58128659e5de200bba1e0762fb972560:/youtube_dl/extractor/sohu.py?ds=inline diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 77bb0a8..c047919 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -1,81 +1,100 @@ # encoding: utf-8 +from __future__ import unicode_literals -import json import re from .common import InfoExtractor -from ..utils import ExtractorError +from .common import compat_str class SohuIE(InfoExtractor): - _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?' + _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' _TEST = { - u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super', - u'file': u'382479172.mp4', - u'md5': u'bde8d9a6ffd82c63a1eefaef4eeefec7', - u'info_dict': { - u'title': u'MV:Far East Movement《The Illest》', + 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', + 'md5': 'bde8d9a6ffd82c63a1eefaef4eeefec7', + 'info_dict': { + 'id': '382479172', + 'ext': 'mp4', + 'title': 'MV:Far East Movement《The Illest》', }, + 'skip': 'Only available from China', } def _real_extract(self, url): - def _fetch_data(vid_id): - base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid=' - data_url = base_data_url + str(vid_id) - data_json = self._download_webpage( - data_url, video_id, - note=u'Downloading JSON data for ' + str(vid_id)) - return json.loads(data_json) + def _fetch_data(vid_id, mytv=False): + if mytv: + base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid=' + else: + base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + + return self._download_json( + base_data_url + vid_id, video_id, + 'Downloading JSON data for %s' % vid_id) mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + mytv = mobj.group('mytv') is not None webpage = self._download_webpage(url, video_id) - raw_title = 
self._html_search_regex(r'(?s)<title>(.+?)</title>', - webpage, u'video title') + raw_title = self._html_search_regex( + r'(?s)<title>(.+?)</title>', + webpage, 'video title') title = raw_title.partition('-')[0].strip() - vid = self._html_search_regex(r'var vid="(\d+)"', webpage, - u'video path') - data = _fetch_data(vid) - - QUALITIES = ('ori', 'super', 'high', 'nor') - vid_ids = [data['data'][q + 'Vid'] - for q in QUALITIES - if data['data'][q + 'Vid'] != 0] - if not vid_ids: - raise ExtractorError(u'No formats available for this video') + vid = self._html_search_regex( + r'var vid ?= ?["\'](\d+)["\']', + webpage, 'video path') + vid_data = _fetch_data(vid, mytv) - # For now, we just pick the highest available quality - vid_id = vid_ids[-1] + formats_json = {} + for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): + vid_id = vid_data['data'].get('%sVid' % format_id) + if not vid_id: + continue + vid_id = compat_str(vid_id) + formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv) - format_data = data if vid == vid_id else _fetch_data(vid_id) - part_count = format_data['data']['totalBlocks'] - allot = format_data['allot'] - prot = format_data['prot'] - clipsURL = format_data['data']['clipsURL'] - su = format_data['data']['su'] + part_count = vid_data['data']['totalBlocks'] playlist = [] for i in range(part_count): - part_url = ('http://%s/?prot=%s&file=%s&new=%s' % - (allot, prot, clipsURL[i], su[i])) - part_str = self._download_webpage( - part_url, video_id, - note=u'Downloading part %d of %d' % (i+1, part_count)) - - part_info = part_str.split('|') - video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) - - video_info = { - 'id': '%s_part%02d' % (video_id, i + 1), + formats = [] + for format_id, format_data in formats_json.items(): + allot = format_data['allot'] + prot = format_data['prot'] + + data = format_data['data'] + clips_url = data['clipsURL'] + su = data['su'] + + part_str = self._download_webpage( + 
'http://%s/?prot=%s&file=%s&new=%s' % + (allot, prot, clips_url[i], su[i]), + video_id, + 'Downloading %s video URL part %d of %d' + % (format_id, i + 1, part_count)) + + part_info = part_str.split('|') + video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'filesize': data['clipsBytes'][i], + 'width': data['width'], + 'height': data['height'], + 'fps': data['fps'], + }) + self._sort_formats(formats) + + playlist.append({ + 'id': '%s_part%d' % (video_id, i + 1), 'title': title, - 'url': video_url, - 'ext': 'mp4', - } - playlist.append(video_info) + 'duration': vid_data['data']['clipsDuration'][i], + 'formats': formats, + }) if len(playlist) == 1: info = playlist[0]