X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/00368b4c3a5d4e909e1b7ecfc4030bf28da020f3..bf9a12f869d6384a1245eef0b3167d05cffe37c1:/youtube_dl/extractor/vk.py diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 105e172..f57ed22 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -3,13 +3,9 @@ from __future__ import unicode_literals import collections import re -import sys from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( clean_html, ExtractorError, @@ -17,9 +13,11 @@ from ..utils import ( int_or_none, orderedSet, remove_start, + str_or_none, str_to_int, unescapeHTML, unified_timestamp, + url_or_none, urlencode_postdata, ) from .dailymotion import DailymotionIE @@ -32,7 +30,7 @@ class VKBaseIE(InfoExtractor): _NETRC_MACHINE = 'vk' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -46,28 +44,13 @@ class VKBaseIE(InfoExtractor): 'pass': password.encode('cp1251'), }) - # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header - # and expects the first one to be set rather than second (see - # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201). - # As of RFC6265 the newer one cookie should be set into cookie store - # what actually happens. - # We will workaround this VK issue by resetting the remixlhk cookie to - # the first one manually. - for header, cookies in url_handle.headers.items(): - if header.lower() != 'set-cookie': - continue - if sys.version_info[0] >= 3: - cookies = cookies.encode('iso-8859-1') - cookies = cookies.decode('utf-8') - remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies) - if remixlhk: - value, domain = remixlhk.groups() - self._set_cookie(domain, 'remixlhk', value) - break + # vk serves two same remixlhk cookies in Set-Cookie header and expects + # first one to be actually set + self._apply_first_set_cookie_header(url_handle, 'remixlhk') login_page = self._download_webpage( 'https://login.vk.com/?act=login', None, - note='Logging in as %s' % username, + note='Logging in', data=urlencode_postdata(login_form)) if re.search(r'onLoginFailed', login_page): @@ -99,30 +82,30 @@ class VKIE(VKBaseIE): _TESTS = [ { 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', - 'md5': '0deae91935c54e00003c2a00646315f0', + 'md5': '7babad3b85ea2e91948005b1b8b0cb84', 'info_dict': { - 'id': '162222515', - 'ext': 'flv', + 'id': '-77521_162222515', + 'ext': 'mp4', 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', + 'uploader_id': '-77521', 'duration': 195, - 'timestamp': 1329060660, + 'timestamp': 1329049880, 'upload_date': '20120212', - 'view_count': int, }, }, { 'url': 'http://vk.com/video205387401_165548505', 'md5': '6c0aeb2e90396ba97035b9cbde548700', 'info_dict': { - 'id': '165548505', + 'id': '205387401_165548505', 'ext': 'mp4', - 'uploader': 'Tom Cruise', 'title': 'No name', + 'uploader': 'Tom Cruise', + 'uploader_id': '205387401', 'duration': 9, - 'timestamp': 1374374880, - 'upload_date': '20130721', - 'view_count': int, + 'timestamp': 1374364108, + 'upload_date': '20130720', } }, { @@ -130,7 +113,7 @@ class VKIE(VKBaseIE): 'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', 'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', 'info_dict': { - 'id': '162925554', + 'id': '32194266_162925554', 'ext': 'mp4', 'uploader': 'Vladimir Gavrin', 'title': 'Lin Dan', @@ -147,7 +130,7 @@ class VKIE(VKBaseIE): 'md5': 'a590bcaf3d543576c9bd162812387666', 'note': 'Only available for registered users', 'info_dict': { - 'id': '164049491', + 'id': '-8871596_164049491', 'ext': 'mp4', 'uploader': 'Триллеры', 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', @@ -161,7 +144,7 @@ class VKIE(VKBaseIE): 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', 'md5': '4d7a5ef8cf114dfa09577e57b2993202', 'info_dict': { - 'id': '168067957', + 'id': '-43215063_168067957', 'ext': 'mp4', 'uploader': 'Киномания - лучшее из мира кино', 'title': ' ', @@ -175,7 +158,7 @@ class VKIE(VKBaseIE): 'md5': '0c45586baa71b7cb1d0784ee3f4e00a6', 'note': 'ivi.ru embed', 'info_dict': { - 'id': '60690', + 'id': '-43215063_169084319', 'ext': 'mp4', 'title': 'Книга Илая', 'duration': 6771, @@ -189,7 +172,7 @@ class VKIE(VKBaseIE): 'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4', 'md5': '091287af5402239a1051c37ec7b92913', 'info_dict': { - 'id': '171201961', + 'id': '30481095_171201961', 'ext': 'mp4', 'title': 'ТюменцевВВ_09.07.2015', 'uploader': 'Anton Ivanov', @@ -204,12 +187,12 @@ class VKIE(VKBaseIE): 'url': 'https://vk.com/video276849682_170681728', 'info_dict': { 'id': 'V3K4mi0SYkc', - 'ext': 'webm', + 'ext': 'mp4', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", - 'description': 'md5:d9903938abdc74c738af77f527ca0596', + 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', 'duration': 178, 'upload_date': '20130116', - 'uploader': "Children's Joy Foundation", + 'uploader': "Children's Joy Foundation Inc.", 'uploader_id': 'thecjf', 'view_count': int, }, @@ -221,6 +204,7 @@ class VKIE(VKBaseIE): 'id': 'k3lz2cmXyRuJQSjGHUv', 'ext': 'mp4', 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', + # TODO: fix test by fixing dailymotion description extraction 'description': 'md5:c651358f03c56f1150b555c26d90a0fd', 'uploader': 'AniLibria.Tv', 'upload_date': '20160914', @@ -236,26 +220,32 @@ class VKIE(VKBaseIE): 'url': 'http://vk.com/video-110305615_171782105', 'md5': 'e13fcda136f99764872e739d13fac1d1', 'info_dict': { - 'id': '171782105', + 'id': '-110305615_171782105', 'ext': 'mp4', 'title': 'S-Dance, репетиции к The way show', 'uploader': 'THE WAY SHOW | 17 апреля', - 'timestamp': 1454870100, + 'uploader_id': '-110305615', + 'timestamp': 1454859345, 'upload_date': '20160207', - 'view_count': int, + }, + 'params': { + 'skip_download': True, }, }, { # finished live stream, postlive_mp4 'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2', - 'md5': '90d22d051fccbbe9becfccc615be6791', 'info_dict': { - 'id': '456242764', + 'id': '-387766_456242764', 'ext': 'mp4', - 'title': 'ИгроМир 2016 — день 1', + 'title': 'ИгроМир 2016 День 1 — Игромания Утром', 'uploader': 'Игромания', 'duration': 5239, - 'view_count': int, + # TODO: use act=show to extract view_count + # 'view_count': int, + 'upload_date': '20160929', + 'uploader_id': '-387766', + 'timestamp': 1475137527, }, }, { @@ -287,15 +277,19 @@ class VKIE(VKBaseIE): # This video is no longer available, because its author has been blocked. 'url': 'https://vk.com/video-10639516_456240611', 'only_matching': True, - } - ] + }, + { + # The video is not available in your region. + 'url': 'https://vk.com/video-51812607_171445436', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') if video_id: - info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id + info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id # Some videos (removed?) can only be downloaded with list id specified list_id = mobj.group('list_id') if list_id: @@ -318,9 +312,14 @@ class VKIE(VKBaseIE): 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', expected=True) + ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' + ERRORS = { r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': - 'Video %s has been removed from public access due to rightholder complaint.', + ERROR_COPYRIGHT, + + r'>The video .*? was removed from public access by request of the copyright holder.<': + ERROR_COPYRIGHT, r'Please log in or <': 'Video %s is only available for registered users, ' @@ -340,6 +339,12 @@ class VKIE(VKBaseIE): r'This video is no longer available, because its author has been blocked.': 'Video %s is no longer available, because its author has been blocked.', + + r'This video is no longer available, because it has been deleted.': + 'Video %s is no longer available, because it has been deleted.', + + r'The video .+? is not available in your region.': + 'Video %s is not available in your region.', } for error_re, error_msg in ERRORS.items(): @@ -388,7 +393,8 @@ class VKIE(VKBaseIE): if not data: data = self._parse_json( self._search_regex( - r'\s*({.+?})\s*', info_page, 'json', default='{}'), + [r'\s*({.+?})\s*', r'\s*({.+})'], + info_page, 'json', default='{}'), video_id) if data: data = data['player']['params'][0] @@ -410,18 +416,19 @@ class VKIE(VKBaseIE): timestamp = unified_timestamp(self._html_search_regex( r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, - 'upload date', fatal=False)) + 'upload date', default=None)) or int_or_none(data.get('date')) view_count = str_to_int(self._search_regex( r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', - info_page, 'view count', fatal=False)) + info_page, 'view count', default=None)) formats = [] for format_id, format_url in data.items(): - if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//', 'rtmp')): continue - if (format_id.startswith(('url', 'cache')) or - format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): + if (format_id.startswith(('url', 'cache')) + or format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): height = int_or_none(self._search_regex( r'^(?:url|cache)(\d+)', format_id, 'height', default=None)) formats.append({ @@ -442,14 +449,17 @@ class VKIE(VKBaseIE): self._sort_formats(formats) return { - 'id': compat_str(data.get('vid') or video_id), + 'id': video_id, 'formats': formats, 'title': title, 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), + 'uploader_id': str_or_none(data.get('author_id')), 'duration': data.get('duration'), 'timestamp': timestamp, 'view_count': view_count, + 'like_count': int_or_none(data.get('liked')), + 'dislike_count': int_or_none(data.get('nolikes')), 'is_live': is_live, }