X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/46113edab215c2211a604c06245c16d5d4e57dcf..9865587cf7b579793ba274067c3e44065d5ff77d:/youtube_dl/extractor/aparat.py?ds=sidebyside diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 7486088..883dcee 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -1,59 +1,95 @@ -#coding: utf-8 - +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( - ExtractorError, - HEADRequest, + int_or_none, + merge_dicts, + mimetype2ext, + url_or_none, ) class AparatIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.aparat.com/v/wP8On', - 'md5': '6714e0af7e0d875c5a39c4dc4ab46ad1', + 'md5': '131aca2e14fe7c4dcb3c4877ba300c89', 'info_dict': { 'id': 'wP8On', 'ext': 'mp4', 'title': 'تیم گلکسی 11 - زومیت', + 'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028', + 'duration': 231, + 'timestamp': 1387394859, + 'upload_date': '20131218', + 'view_count': int, }, - # 'skip': 'Extremely unreliable', - } + }, { + # multiple formats + 'url': 'https://www.aparat.com/v/8dflw/', + 'only_matching': True, + }] def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') - - # Note: There is an easier-to-parse configuration at - # http://www.aparat.com/video/video/config/videohash/%video_id - # but the URL in there does not work - embed_url = ('http://www.aparat.com/video/video/embed/videohash/' + - video_id + '/vt/frame') - webpage = self._download_webpage(embed_url, video_id) - - video_urls = re.findall(r'fileList\[[0-9]+\]\s*=\s*"([^"]+)"', webpage) - for i, video_url in enumerate(video_urls): - req = HEADRequest(video_url) - res = self._request_webpage( - req, video_id, note=u'Testing video URL %d' % i, errnote=False) - if res: - break - else: - raise ExtractorError(u'No working video URLs found') - - title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, u'title') - thumbnail = self._search_regex( - r'\s+image:\s*"([^"]+)"', webpage, u'thumbnail', fatal=False) - - return { + video_id = self._match_id(url) + + # Provides more metadata + webpage = self._download_webpage(url, video_id, fatal=False) + + if not webpage: + # Note: There is an easier-to-parse configuration at + # http://www.aparat.com/video/video/config/videohash/%video_id + # but the URL in there does not work + webpage = self._download_webpage( + 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, + video_id) + + options = self._parse_json( + self._search_regex( + r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P(?:(?!\1).)+)\1\s*\)', + webpage, 'options', group='value'), + video_id) + + player = options['plugins']['sabaPlayerPlugin'] + + formats = [] + for sources in player['multiSRC']: + for item in sources: + if not isinstance(item, dict): + continue + file_url = url_or_none(item.get('src')) + if not file_url: + continue + item_type = item.get('type') + if item_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': 'http-%s' % (label or ext), + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', + default=None)), + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + info = self._search_json_ld(webpage, video_id, default={}) + + if not info.get('title'): + info['title'] = player['title'] + + return merge_dicts(info, { 'id': video_id, - 'title': title, - 'url': video_url, - 'ext': 'mp4', - 'thumbnail': thumbnail, - } + 'thumbnail': url_or_none(options.get('poster')), + 'duration': int_or_none(player.get('duration')), + 'formats': formats, + })