X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/47d80ec0b18245caeb97018d4c1af18d0b5b972b..4d073ced87f1fff7fd6fdfd82462444769b9f90b:/youtube_dl/extractor/teamcoco.py diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 7534639..73469cc 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,33 +1,34 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 -import binascii -import re import json -from .common import InfoExtractor +from .turner import TurnerBaseIE from ..utils import ( + determine_ext, ExtractorError, + int_or_none, + mimetype2ext, + parse_duration, + parse_iso8601, qualities, - determine_ext, ) -from ..compat import compat_ord -class TeamcocoIE(InfoExtractor): - _VALID_URL = r'https?://teamcoco\.com/video/(?P[0-9]+)?/?(?P.*)' +class TeamcocoIE(TurnerBaseIE): + _VALID_URL = r'https?://teamcoco\.com/(?P([^/]+/)*[^/?#]+)' _TESTS = [ { - 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', - 'md5': '3f7746aa0dc86de18df7539903d399ea', + 'url': 'http://teamcoco.com/video/mary-kay-remote', + 'md5': '55d532f81992f5c92046ad02fec34d7d', 'info_dict': { 'id': '80187', 'ext': 'mp4', 'title': 'Conan Becomes A Mary Kay Beauty Consultant', 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.', - 'duration': 504, - 'age_limit': 0, + 'duration': 495.0, + 'upload_date': '20140402', + 'timestamp': 1396407600, } }, { 'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush', @@ -38,7 +39,8 @@ class TeamcocoIE(InfoExtractor): 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.', 'title': 'Louis C.K. Interview Pt. 1 11/3/11', 'duration': 288, - 'age_limit': 0, + 'upload_date': '20111104', + 'timestamp': 1320405840, } }, { 'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey', @@ -47,6 +49,8 @@ class TeamcocoIE(InfoExtractor): 'ext': 'mp4', 'title': 'Timothy Olyphant Raises A Toast To “Justified”', 'description': 'md5:15501f23f020e793aeca761205e42c24', + 'upload_date': '20150415', + 'timestamp': 1429088400, }, 'params': { 'skip_download': True, # m3u8 downloads @@ -61,110 +65,125 @@ class TeamcocoIE(InfoExtractor): }, 'params': { 'skip_download': True, # m3u8 downloads - } + }, + 'skip': 'This video is no longer available.', + }, { + 'url': 'http://teamcoco.com/video/the-conan-audiencey-awards-for-04/25/18', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/italy/conan-jordan-schlansky-hit-the-streets-of-florence', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/haiti/conan-s-haitian-history-lesson', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/israel/conan-hits-the-streets-beaches-of-tel-aviv', + 'only_matching': True, } ] - _VIDEO_ID_REGEXES = ( - r'"eVar42"\s*:\s*(\d+)', - r'Ginger\.TeamCoco\.openInApp\("video",\s*"([^"]+)"', - r'"id_not"\s*:\s*(\d+)' - ) - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - display_id = mobj.group('display_id') - webpage, urlh = self._download_webpage_handle(url, display_id) - if 'src=expired' in urlh.geturl(): - raise ExtractorError('This video is expired.', expected=True) - - video_id = mobj.group('video_id') - if not video_id: - video_id = self._html_search_regex( - self._VIDEO_ID_REGEXES, webpage, 'video id') - - data = None + def _graphql_call(self, query_template, object_type, object_id): + find_object = 'find' + object_type + return self._download_json( + 'http://teamcoco.com/graphql/', object_id, data=json.dumps({ + 'query': query_template % (find_object, object_id) + }))['data'][find_object] - preload_codes = self._html_search_regex( - r'(function.+)setTimeout\(function\(\)\{playlist', - webpage, 'preload codes') - base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes) - base64_fragments.remove('init') + def _real_extract(self, url): + display_id = self._match_id(url) + + response = self._graphql_call('''{ + %s(slug: "%s") { + ... on RecordSlug { + record { + id + title + teaser + publishOn + thumb { + preview + } + file { + url + } + tags { + name + } + duration + turnerMediaId + turnerMediaAuthToken + } + } + ... on NotFoundSlug { + status + } + } +}''', 'Slug', display_id) + if response.get('status'): + raise ExtractorError('This video is no longer available.', expected=True) + + record = response['record'] + video_id = record['id'] + + info = { + 'id': video_id, + 'display_id': display_id, + 'title': record['title'], + 'thumbnail': record.get('thumb', {}).get('preview'), + 'description': record.get('teaser'), + 'duration': parse_duration(record.get('duration')), + 'timestamp': parse_iso8601(record.get('publishOn')), + } - def _check_sequence(cur_fragments): - if not cur_fragments: - return - for i in range(len(cur_fragments)): - cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii') - try: - raw_data = base64.b64decode(cur_sequence) - if compat_ord(raw_data[0]) == compat_ord('{'): - return json.loads(raw_data.decode('utf-8')) - except (TypeError, binascii.Error, UnicodeDecodeError, ValueError): + media_id = record.get('turnerMediaId') + if media_id: + self._initialize_geo_bypass({ + 'countries': ['US'], + }) + info.update(self._extract_ngtv_info(media_id, { + 'accessToken': record['turnerMediaAuthToken'], + 'accessTokenType': 'jws', + })) + else: + video_sources = self._graphql_call('''{ + %s(id: "%s") { + src + } +}''', 'RecordVideoSource', video_id) or {} + + formats = [] + get_quality = qualities(['low', 'sd', 'hd', 'uhd']) + for format_id, src in video_sources.get('src', {}).items(): + if not isinstance(src, dict): continue - - def _check_data(): - for i in range(len(base64_fragments) + 1): - for j in range(i, len(base64_fragments) + 1): - data = _check_sequence(base64_fragments[:i] + base64_fragments[j:]) - if data: - return data - - self.to_screen('Try to compute possible data sequence. This may take some time.') - data = _check_data() - - if not data: - raise ExtractorError( - 'Preload information could not be extracted', expected=True) - - formats = [] - get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) - for filed in data['files']: - if determine_ext(filed['url']) == 'm3u8': - # compat_urllib_parse.urljoin does not work here - if filed['url'].startswith('/'): - m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url'] - else: - m3u8_url = filed['url'] - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4') - for m3u8_format in m3u8_formats: - if m3u8_format not in formats: - formats.append(m3u8_format) - elif determine_ext(filed['url']) == 'f4m': - # TODO Correct f4m extraction - continue - else: - if filed['url'].startswith('/mp4:protected/'): - # TODO Correct extraction for these files + src_url = src.get('src') + if not src_url: continue - m_format = re.search(r'(\d+(k|p))\.mp4', filed['url']) - if m_format is not None: - format_id = m_format.group(1) + ext = determine_ext(src_url, mimetype2ext(src.get('type'))) + if format_id == 'hls' or ext == 'm3u8': + # compat_urllib_parse.urljoin does not work here + if src_url.startswith('/'): + src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url + formats.extend(self._extract_m3u8_formats( + src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) else: - format_id = filed['bitrate'] - tbr = ( - int(filed['bitrate']) - if filed['bitrate'].isdigit() - else None) - - formats.append({ - 'url': filed['url'], - 'ext': 'mp4', - 'tbr': tbr, - 'format_id': format_id, - 'quality': get_quality(format_id), - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'formats': formats, - 'title': data['title'], - 'thumbnail': data.get('thumb', {}).get('href'), - 'description': data.get('teaser'), - 'duration': data.get('duration'), - 'age_limit': self._family_friendly_search(webpage), - } + if src_url.startswith('/mp4:protected/'): + # TODO Correct extraction for these files + continue + tbr = int_or_none(self._search_regex( + r'(\d+)k\.mp4', src_url, 'tbr', default=None)) + + formats.append({ + 'url': src_url, + 'ext': ext, + 'tbr': tbr, + 'format_id': format_id, + 'quality': get_quality(format_id), + }) + if not formats: + formats = self._extract_m3u8_formats( + record['file']['url'], video_id, 'mp4', fatal=False) + self._sort_formats(formats) + info['formats'] = formats + + return info