X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/f19349c26118065acbe21509383c63465df794fe..99b46abf79266c2bf5eabc291dc4f366e61b5dde:/youtube_dl/extractor/vimeo.py diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index f392ccf..7e854f3 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,13 +8,15 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_str, compat_urlparse, ) from ..utils import ( - encode_dict, + determine_ext, ExtractorError, InAdvancePagedList, int_or_none, + NO_DEFAULT, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -23,6 +25,8 @@ from ..utils import ( unsmuggle_url, urlencode_postdata, unescapeHTML, + parse_filesize, + try_get, ) @@ -40,22 +44,42 @@ class VimeoBaseInfoExtractor(InfoExtractor): self.report_login() webpage = self._download_webpage(self._LOGIN_URL, None, False) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata(encode_dict({ + data = urlencode_postdata({ 'action': 'login', 'email': username, 'password': password, 'service': 'vimeo', 'token': token, - })) + }) login_request = sanitized_Request(self._LOGIN_URL, data) login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_request.add_header('Referer', self._LOGIN_URL) self._set_vimeo_cookie('vuid', vuid) self._download_webpage(login_request, None, False, 'Wrong login info') + def _verify_video_password(self, url, video_id, webpage): + password = self._downloader.params.get('videopassword') + if password is None: + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + token, vuid = self._extract_xsrft_and_vuid(webpage) + data = urlencode_postdata({ + 'password': password, + 'token': token, + }) + if url.startswith('http://'): + # vimeo only supports https now, but the user can give an http url + url = url.replace('http://', 'https://') + password_request = sanitized_Request(url + '/password', data) + password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Referer', url) + self._set_vimeo_cookie('vuid', vuid) + return self._download_webpage( + password_request, video_id, + 'Verifying the password', 'Wrong password') + def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( - r'xsrft\s*[=:]\s*(?P["\'])(?P.+?)(?P=q)', + r'(?:(?P["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P["\'])(?P.+?)(?P=q)', webpage, 'login token', group='xsrft') vuid = self._search_regex( r'["\']vuid["\']\s*:\s*(["\'])(?P.+?)\1', @@ -65,21 +89,96 @@ class VimeoBaseInfoExtractor(InfoExtractor): def _set_vimeo_cookie(self, name, value): self._set_cookie('vimeo.com', name, value) + def _vimeo_sort_formats(self, formats): + # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps + # at the same time without actual units specified. This lead to wrong sorting. + self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id')) + + def _parse_config(self, config, video_id): + # Extract title + video_title = config['video']['title'] + + # Extract uploader, uploader_url and uploader_id + video_uploader = config['video'].get('owner', {}).get('name') + video_uploader_url = config['video'].get('owner', {}).get('url') + video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None + + # Extract video thumbnail + video_thumbnail = config['video'].get('thumbnail') + if video_thumbnail is None: + video_thumbs = config['video'].get('thumbs') + if video_thumbs and isinstance(video_thumbs, dict): + _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] + + # Extract video duration + video_duration = int_or_none(config['video'].get('duration')) + + formats = [] + config_files = config['video'].get('files') or config['request'].get('files', {}) + for f in config_files.get('progressive', []): + video_url = f.get('url') + if not video_url: + continue + formats.append({ + 'url': video_url, + 'format_id': 'http-%s' % f.get('quality'), + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'fps': int_or_none(f.get('fps')), + 'tbr': int_or_none(f.get('bitrate')), + }) + m3u8_url = config_files.get('hls', {}).get('url') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + subtitles = {} + text_tracks = config['request'].get('text_tracks') + if text_tracks: + for tt in text_tracks: + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': 'https://vimeo.com' + tt['url'], + }] + + return { + 'title': video_title, + 'uploader': video_uploader, + 'uploader_id': video_uploader_id, + 'uploader_url': video_uploader_url, + 'thumbnail': video_thumbnail, + 'duration': video_duration, + 'formats': formats, + 'subtitles': subtitles, + } + class VimeoIE(VimeoBaseInfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs _VALID_URL = r'''(?x) - https?:// - (?:(?:www|(?Pplayer))\.)? - vimeo(?Ppro)?\.com/ - (?!channels/[^/?#]+/?(?:$|[?#])|album/) - (?:.*?/)? - (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)? - (?:videos?/)? - (?P[0-9]+) - /?(?:[?&].*)?(?:[#].*)?$''' + https?:// + (?: + (?: + www| + (?Pplayer) + ) + \. + )? + vimeo(?Ppro)?\.com/ + (?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) + (?:.*?/)? + (?: + (?: + play_redirect_hls| + moogaloop\.swf)\?clip_id= + )? + (?:videos?/)? + (?P[0-9]+) + (?:/[\da-f]+)? + /?(?:[?&].*)?(?:[#].*)?$ + ''' IE_NAME = 'vimeo' _TESTS = [ { @@ -91,6 +190,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", 'description': 'md5:2d3305bad981a06ff79f027f19865021', 'upload_date': '20121220', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user7108434', 'uploader_id': 'user7108434', 'uploader': 'Filippo Valsorda', 'duration': 10, @@ -103,6 +203,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'info_dict': { 'id': '68093876', 'ext': 'mp4', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/openstreetmapus', 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', @@ -119,6 +220,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', 'uploader': 'The BLN & Business of Software', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware', 'uploader_id': 'theblnbusinessofsoftware', 'duration': 3610, 'description': None, @@ -133,10 +235,11 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'youtube-dl password protected test video', 'upload_date': '20130614', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people\u2026', + 'description': 'This is "youtube-dl password protected test video" by on Vimeo, the home for high quality videos and the people who love them.', }, 'params': { 'videopassword': 'youtube-dl', @@ -145,13 +248,12 @@ class VimeoIE(VimeoBaseInfoExtractor): { 'url': 'http://vimeo.com/channels/keypeele/75629013', 'md5': '2f86a05afe9d7abc0b9126d229bbe15d', - 'note': 'Video is freely available via original URL ' - 'and protected with password when accessed via http://vimeo.com/75629013', 'info_dict': { 'id': '75629013', 'ext': 'mp4', 'title': 'Key & Peele: Terrorist Interrogation', 'description': 'md5:8678b246399b070816b12313e8b4eb5c', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/atencio', 'uploader_id': 'atencio', 'uploader': 'Peter Atencio', 'upload_date': '20130927', @@ -167,6 +269,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'title': 'The New Vimeo Player (You Know, For Videos)', 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', 'upload_date': '20131015', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', 'uploader': 'Vimeo Staff', 'duration': 62, @@ -181,9 +284,48 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'Pier Solar OUYA Official Trailer', 'uploader': 'Tulio Gonçalves', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user28849593', 'uploader_id': 'user28849593', }, }, + { + # contains original format + 'url': 'https://vimeo.com/33951933', + 'md5': '2d9f5475e0537f013d0073e812ab89e6', + 'info_dict': { + 'id': '33951933', + 'ext': 'mp4', + 'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute', + 'uploader': 'The DMCI', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/dmci', + 'uploader_id': 'dmci', + 'upload_date': '20111220', + 'description': 'md5:ae23671e82d05415868f7ad1aec21147', + }, + }, + { + # only available via https://vimeo.com/channels/tributes/6213729 and + # not via https://vimeo.com/6213729 + 'url': 'https://vimeo.com/channels/tributes/6213729', + 'info_dict': { + 'id': '6213729', + 'ext': 'mp4', + 'title': 'Vimeo Tribute: The Shining', + 'uploader': 'Casey Donahue', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/caseydonahue', + 'uploader_id': 'caseydonahue', + 'upload_date': '20090821', + 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, + { + 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', + 'only_matching': True, + }, { 'url': 'https://vimeo.com/109815029', 'note': 'Video not completely processed, "failed" seed status', @@ -193,6 +335,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/groups/travelhd/videos/22439234', 'only_matching': True, }, + { + 'url': 'https://vimeo.com/album/2632481/video/79010983', + 'only_matching': True, + }, + { + # source file returns 403: Forbidden + 'url': 'https://vimeo.com/7809605', + 'only_matching': True, + }, + { + 'url': 'https://vimeo.com/160743502/abd0e13fb4', + 'only_matching': True, + } ] @staticmethod @@ -202,56 +357,40 @@ class VimeoIE(VimeoBaseInfoExtractor): r']+?src=(["\'])(?P(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) if mobj: player_url = unescapeHTML(mobj.group('url')) - surl = smuggle_url(player_url, {'Referer': url}) + surl = smuggle_url(player_url, {'http_headers': {'Referer': url}}) return surl # Look for embedded (swf embed) Vimeo player mobj = re.search( r']+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) if mobj: return mobj.group(1) - - def _verify_video_password(self, url, video_id, webpage): - password = self._downloader.params.get('videopassword', None) - if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata(encode_dict({ - 'password': password, - 'token': token, - })) - if url.startswith('http://'): - # vimeo only supports https now, but the user can give an http url - url = url.replace('http://', 'https://') - password_request = sanitized_Request(url + '/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) - self._set_vimeo_cookie('vuid', vuid) - return self._download_webpage( - password_request, video_id, - 'Verifying the password', 'Wrong password') + # Look more for non-standard embedded Vimeo player + mobj = re.search( + r']+src=(?P[\'"])(?P(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)(?P=q1)', webpage) + if mobj: + return mobj.group('url') def _verify_player_video_password(self, url, video_id): - password = self._downloader.params.get('videopassword', None) + password = self._downloader.params.get('videopassword') if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') - data = urlencode_postdata(encode_dict({'password': password})) + data = urlencode_postdata({'password': password}) pass_url = url + '/check-password' password_request = sanitized_Request(pass_url, data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Referer', url) return self._download_json( password_request, video_id, - 'Verifying the password', - 'Wrong password') + 'Verifying the password', 'Wrong password') def _real_initialize(self): self._login() def _real_extract(self, url): - url, data = unsmuggle_url(url) - headers = std_headers - if data is not None: - headers = headers.copy() - headers.update(data) + url, data = unsmuggle_url(url, {}) + headers = std_headers.copy() + if 'http_headers' in data: + headers.update(data['http_headers']) if 'Referer' not in headers: headers['Referer'] = url @@ -261,11 +400,11 @@ class VimeoIE(VimeoBaseInfoExtractor): orig_url = url if mobj.group('pro') or mobj.group('player'): url = 'https://player.vimeo.com/video/' + video_id - else: + elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information - request = sanitized_Request(url, None, headers) + request = sanitized_Request(url, headers=headers) try: webpage = self._download_webpage(request, video_id) except ExtractorError as ee: @@ -327,7 +466,7 @@ class VimeoIE(VimeoBaseInfoExtractor): raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option') if re.search(r']+?id="pw_form"', webpage) is not None: - if data and '_video_password_verified' in data: + if '_video_password_verified' in data: raise ExtractorError('video password verification failed!') self._verify_video_password(url, video_id, webpage) return self._real_extract( @@ -339,19 +478,23 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password(url, video_id) - # Extract title - video_title = config["video"]["title"] - - # Extract uploader and uploader_id - video_uploader = config["video"]["owner"]["name"] - video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None - - # Extract video thumbnail - video_thumbnail = config["video"].get("thumbnail") - if video_thumbnail is None: - video_thumbs = config["video"].get("thumbs") - if video_thumbs and isinstance(video_thumbs, dict): - _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] + def is_rented(): + if '>You rented this title.<' in webpage: + return True + if config.get('user', {}).get('purchased'): + return True + label = try_get( + config, lambda x: x['video']['vod']['purchase_options'][0]['label_string'], compat_str) + if label and label.startswith('You rented this'): + return True + return False + + if is_rented(): + feature_id = config.get('video', {}).get('vod', {}).get('feature_id') + if feature_id and not data.get('force_feature_id', False): + return self.url_result(smuggle_url( + 'https://player.vimeo.com/player/%s' % feature_id, + {'force_feature_id': True}), 'Vimeo') # Extract video description @@ -372,9 +515,6 @@ class VimeoIE(VimeoBaseInfoExtractor): if not video_description and not mobj.group('player'): self._downloader.report_warning('Cannot find video description') - # Extract video duration - video_duration = int_or_none(config["video"].get("duration")) - # Extract upload date video_upload_date = None mobj = re.search(r']+datetime="([^"]+)"', webpage) @@ -392,54 +532,74 @@ class VimeoIE(VimeoBaseInfoExtractor): comment_count = None formats = [] - config_files = config['video'].get('files') or config['request'].get('files', {}) - for f in config_files.get('progressive', []): - video_url = f.get('url') - if not video_url: - continue - formats.append({ - 'url': video_url, - 'format_id': 'http-%s' % f.get('quality'), - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'fps': int_or_none(f.get('fps')), - 'tbr': int_or_none(f.get('bitrate')), - }) - m3u8_url = config_files.get('hls', {}).get('url') - if m3u8_url: - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) - # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps - # at the same time without actual units specified. This lead to wrong sorting. - self._sort_formats(formats, field_preference=('height', 'width', 'fps', 'format_id')) - - subtitles = {} - text_tracks = config['request'].get('text_tracks') - if text_tracks: - for tt in text_tracks: - subtitles[tt['lang']] = [{ - 'ext': 'vtt', - 'url': 'https://vimeo.com' + tt['url'], - }] - - return { + download_request = sanitized_Request('https://vimeo.com/%s?action=load_download_config' % video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest'}) + download_data = self._download_json(download_request, video_id, fatal=False) + if download_data: + source_file = download_data.get('source_file') + if isinstance(source_file, dict): + download_url = source_file.get('download_url') + if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): + source_name = source_file.get('public_name', 'Original') + if self._is_valid_url(download_url, video_id, '%s video' % source_name): + ext = source_file.get('extension', determine_ext(download_url)).lower() + formats.append({ + 'url': download_url, + 'ext': ext, + 'width': int_or_none(source_file.get('width')), + 'height': int_or_none(source_file.get('height')), + 'filesize': parse_filesize(source_file.get('size')), + 'format_id': source_name, + 'preference': 1, + }) + + info_dict = self._parse_config(config, video_id) + formats.extend(info_dict['formats']) + self._vimeo_sort_formats(formats) + info_dict.update({ 'id': video_id, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, + 'formats': formats, 'upload_date': video_upload_date, - 'title': video_title, - 'thumbnail': video_thumbnail, 'description': video_description, - 'duration': video_duration, - 'formats': formats, 'webpage_url': url, 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, - 'subtitles': subtitles, - } + }) + + return info_dict + + +class VimeoOndemandIE(VimeoBaseInfoExtractor): + IE_NAME = 'vimeo:ondemand' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P[^/?#&]+)' + _TESTS = [{ + # ondemand video not available via https://vimeo.com/id + 'url': 'https://vimeo.com/ondemand/20704', + 'md5': 'c424deda8c7f73c1dfb3edd7630e2f35', + 'info_dict': { + 'id': '105442900', + 'ext': 'mp4', + 'title': 'המעבדה - במאי יותם פלדמן', + 'uploader': 'גם סרטים', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/gumfilms', + 'uploader_id': 'gumfilms', + }, + }, { + 'url': 'https://vimeo.com/ondemand/nazmaalik', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/ondemand/141692381', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/ondemand/thelastcolony/150274832', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + return self.url_result(self._og_search_video_url(webpage), VimeoIE.ie_key()) class VimeoChannelIE(VimeoBaseInfoExtractor): @@ -470,14 +630,14 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): if not login_form: return webpage - password = self._downloader.params.get('videopassword', None) + password = self._downloader.params.get('videopassword') if password is None: raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True) fields = self._hidden_inputs(login_form) token, vuid = self._extract_xsrft_and_vuid(webpage) fields['token'] = token fields['password'] = password - post = urlencode_postdata(encode_dict(fields)) + post = urlencode_postdata(fields) password_path = self._search_regex( r'action="([^"]+)"', login_form, 'password URL') password_url = compat_urlparse.urljoin(page_url, password_path) @@ -501,8 +661,21 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): webpage = self._login_list_password(page_url, list_id, webpage) yield self._extract_list_title(webpage) - for video_id in re.findall(r'id="clip_(\d+?)"', webpage): - yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo') + # Try extracting href first since not all videos are available via + # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729) + clips = re.findall( + r'id="clip_(\d+)"[^>]*>\s*]+href="(/(?:[^/]+/)*\1)', webpage) + if clips: + for video_id, video_url in clips: + yield self.url_result( + compat_urlparse.urljoin(base_url, video_url), + VimeoIE.ie_key(), video_id=video_id) + # More relaxed fallback + else: + for video_id in re.findall(r'id=["\']clip_(\d+)', webpage): + yield self.url_result( + 'https://vimeo.com/%s' % video_id, + VimeoIE.ie_key(), video_id=video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break @@ -539,7 +712,7 @@ class VimeoUserIE(VimeoChannelIE): class VimeoAlbumIE(VimeoChannelIE): IE_NAME = 'vimeo:album' - _VALID_URL = r'https://vimeo\.com/album/(?P\d+)' + _VALID_URL = r'https://vimeo\.com/album/(?P\d+)(?:$|[?#]|/(?!video))' _TITLE_RE = r'