]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/vk.py
   2 from __future__ 
import unicode_literals
 
   8 from .common 
import InfoExtractor
 
  27 from .dailymotion 
import DailymotionIE
 
  28 from .pladform 
import PladformIE
 
  29 from .vimeo 
import VimeoIE
 
  30 from .youtube 
import YoutubeIE
 
  33 class VKBaseIE(InfoExtractor
): 
  37         username
, password 
= self
._get
_login
_info
() 
  41         login_page
, url_handle 
= self
._download
_webpage
_handle
( 
  42             'https://vk.com', None, 'Downloading login page') 
  44         login_form 
= self
._hidden
_inputs
(login_page
) 
  47             'email': username
.encode('cp1251'), 
  48             'pass': password
.encode('cp1251'), 
  51         # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header 
  52         # and expects the first one to be set rather than second (see 
  53         # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201). 
  54         # As of RFC6265 the newer one cookie should be set into cookie store 
  55         # what actually happens. 
  56         # We will workaround this VK issue by resetting the remixlhk cookie to 
  57         # the first one manually. 
  58         for header
, cookies 
in url_handle
.headers
.items(): 
  59             if header
.lower() != 'set-cookie': 
  61             if sys
.version_info
[0] >= 3: 
  62                 cookies 
= cookies
.encode('iso-8859-1') 
  63             cookies 
= cookies
.decode('utf-8') 
  64             remixlhk 
= re
.search(r
'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies
) 
  66                 value
, domain 
= remixlhk
.groups() 
  67                 self
._set
_cookie
(domain
, 'remixlhk', value
) 
  70         login_page 
= self
._download
_webpage
( 
  71             'https://login.vk.com/?act=login', None, 
  73             data
=urlencode_postdata(login_form
)) 
  75         if re
.search(r
'onLoginFailed', login_page
): 
  77                 'Unable to login, incorrect username and/or password', expected
=True) 
  79     def _real_initialize(self
): 
  90                                 (?:(?:m|new)\.)?vk\.com/video_| 
  93                             ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)| 
  95                                 (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video| 
  96                                 (?:www\.)?daxab.com/embed/ 
  98                             (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? 
 103             'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', 
 104             'md5': '7babad3b85ea2e91948005b1b8b0cb84', 
 108                 'title': 'ProtivoGunz - Хуёвая песня', 
 109                 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 
 110                 'uploader_id': '-77521', 
 112                 'timestamp': 1329049880, 
 113                 'upload_date': '20120212', 
 117             'url': 'http://vk.com/video205387401_165548505', 
 118             'md5': '6c0aeb2e90396ba97035b9cbde548700', 
 123                 'uploader': 'Tom Cruise', 
 124                 'uploader_id': '205387401', 
 126                 'timestamp': 1374364108, 
 127                 'upload_date': '20130720', 
 131             'note': 'Embedded video', 
 132             'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', 
 133             'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', 
 137                 'uploader': 'Vladimir Gavrin', 
 140                 'upload_date': '20120730', 
 143             'skip': 'This video has been removed from public access.', 
 147             # please update if you find a video whose URL follows the same pattern 
 148             'url': 'http://vk.com/video-8871596_164049491', 
 149             'md5': 'a590bcaf3d543576c9bd162812387666', 
 150             'note': 'Only available for registered users', 
 154                 'uploader': 'Триллеры', 
 155                 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', 
 157                 'upload_date': '20121218', 
 160             'skip': 'Requires vk account credentials', 
 163             'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', 
 164             'md5': '4d7a5ef8cf114dfa09577e57b2993202', 
 168                 'uploader': 'Киномания - лучшее из мира кино', 
 171                 'upload_date': '20140328', 
 173             'skip': 'Requires vk account credentials', 
 176             'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540', 
 177             'md5': '0c45586baa71b7cb1d0784ee3f4e00a6', 
 178             'note': 'ivi.ru embed', 
 182                 'title': 'Книга Илая', 
 184                 'upload_date': '20140626', 
 187             'skip': 'Only works from Russia', 
 190             # video (removed?) only available with list id 
 191             'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4', 
 192             'md5': '091287af5402239a1051c37ec7b92913', 
 196                 'title': 'ТюменцевВВ_09.07.2015', 
 197                 'uploader': 'Anton Ivanov', 
 199                 'upload_date': '20150709', 
 206             'url': 'https://vk.com/video276849682_170681728', 
 210                 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", 
 211                 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', 
 213                 'upload_date': '20130116', 
 214                 'uploader': "Children's Joy Foundation Inc.", 
 215                 'uploader_id': 'thecjf', 
 221             'url': 'https://vk.com/video-37468416_456239855', 
 223                 'id': 'k3lz2cmXyRuJQSjGHUv', 
 225                 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', 
 226                 # TODO: fix test by fixing dailymotion description extraction 
 227                 'description': 'md5:c651358f03c56f1150b555c26d90a0fd', 
 228                 'uploader': 'AniLibria.Tv', 
 229                 'upload_date': '20160914', 
 230                 'uploader_id': 'x1p5vl5', 
 231                 'timestamp': 1473877246, 
 234                 'skip_download': True, 
 238             # video key is extra_data not url\d+ 
 239             'url': 'http://vk.com/video-110305615_171782105', 
 240             'md5': 'e13fcda136f99764872e739d13fac1d1', 
 244                 'title': 'S-Dance, репетиции к The way show', 
 245                 'uploader': 'THE WAY SHOW | 17 апреля', 
 246                 'uploader_id': '-110305615', 
 247                 'timestamp': 1454859345, 
 248                 'upload_date': '20160207', 
 251                 'skip_download': True, 
 255             # finished live stream, postlive_mp4 
 256             'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2', 
 257             'md5': '90d22d051fccbbe9becfccc615be6791', 
 261                 'title': 'ИгроМир 2016 — день 1', 
 262                 'uploader': 'Игромания', 
 268             # live stream, hls and rtmp links, most likely already finished live 
 269             # stream by the time you are reading this comment 
 270             'url': 'https://vk.com/video-140332_456239111', 
 271             'only_matching': True, 
 274             # removed video, just testing that we match the pattern 
 275             'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', 
 276             'only_matching': True, 
 279             # age restricted video, requires vk account credentials 
 280             'url': 'https://vk.com/video205387401_164765225', 
 281             'only_matching': True, 
 285             'url': 'https://vk.com/video-76116461_171554880', 
 286             'only_matching': True, 
 289             'url': 'http://new.vk.com/video205387401_165548505', 
 290             'only_matching': True, 
 293             # This video is no longer available, because its author has been blocked. 
 294             'url': 'https://vk.com/video-10639516_456240611', 
 295             'only_matching': True, 
 298             # The video is not available in your region. 
 299             'url': 'https://vk.com/video-51812607_171445436', 
 300             'only_matching': True, 
 303     def _real_extract(self
, url
): 
 304         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 305         video_id 
= mobj
.group('videoid') 
 308             info_url 
= 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id
 
 309             # Some videos (removed?) can only be downloaded with list id specified 
 310             list_id 
= mobj
.group('list_id') 
 312                 info_url 
+= '&list=%s' % list_id
 
 314             info_url 
= 'http://vk.com/video_ext.php?' + mobj
.group('embed_query') 
 315             video_id 
= '%s_%s' % (mobj
.group('oid'), mobj
.group('id')) 
 317         info_page 
= self
._download
_webpage
(info_url
, video_id
) 
 319         error_message 
= self
._html
_search
_regex
( 
 320             [r
'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', 
 321                 r
'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], 
 322             info_page
, 'error message', default
=None) 
 324             raise ExtractorError(error_message
, expected
=True) 
 326         if re
.search(r
'<!>/login\.php\?.*\bact=security_check', info_page
): 
 327             raise ExtractorError( 
 328                 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', 
 331         ERROR_COPYRIGHT 
= 'Video %s has been removed from public access due to rightholder complaint.' 
 334             r
'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': 
 337             r
'>The video .*? was removed from public access by request of the copyright holder.<': 
 340             r
'<!>Please log in or <': 
 341             'Video %s is only available for registered users, ' 
 342             'use --username and --password options to provide account credentials.', 
 345             'Video %s does not exist.', 
 347             r
'<!>Видео временно недоступно': 
 348             'Video %s is temporarily unavailable.', 
 351             'Access denied to video %s.', 
 353             r
'<!>Видеозапись недоступна, так как её автор был заблокирован.': 
 354             'Video %s is no longer available, because its author has been blocked.', 
 356             r
'<!>This video is no longer available, because its author has been blocked.': 
 357             'Video %s is no longer available, because its author has been blocked.', 
 359             r
'<!>This video is no longer available, because it has been deleted.': 
 360             'Video %s is no longer available, because it has been deleted.', 
 362             r
'<!>The video .+? is not available in your region.': 
 363             'Video %s is not available in your region.', 
 366         for error_re
, error_msg 
in ERRORS
.items(): 
 367             if re
.search(error_re
, info_page
): 
 368                 raise ExtractorError(error_msg 
% video_id
, expected
=True) 
 370         youtube_url 
= YoutubeIE
._extract
_url
(info_page
) 
 372             return self
.url_result(youtube_url
, ie
=YoutubeIE
.ie_key()) 
 374         vimeo_url 
= VimeoIE
._extract
_url
(url
, info_page
) 
 375         if vimeo_url 
is not None: 
 376             return self
.url_result(vimeo_url
) 
 378         pladform_url 
= PladformIE
._extract
_url
(info_page
) 
 380             return self
.url_result(pladform_url
) 
 382         m_rutube 
= re
.search( 
 383             r
'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page
) 
 384         if m_rutube 
is not None: 
 385             rutube_url 
= self
._proto
_relative
_url
( 
 386                 m_rutube
.group(1).replace('\\', '')) 
 387             return self
.url_result(rutube_url
) 
 389         dailymotion_urls 
= DailymotionIE
._extract
_urls
(info_page
) 
 391             return self
.url_result(dailymotion_urls
[0], DailymotionIE
.ie_key()) 
 393         m_opts 
= re
.search(r
'(?s)var\s+opts\s*=\s*({.+?});', info_page
) 
 395             m_opts_url 
= re
.search(r
"url\s*:\s*'((?!/\b)[^']+)", m_opts
.group(1)) 
 397                 opts_url 
= m_opts_url
.group(1) 
 398                 if opts_url
.startswith('//'): 
 399                     opts_url 
= 'http:' + opts_url
 
 400                 return self
.url_result(opts_url
) 
 402         # vars does not look to be served anymore since 24.10.2016 
 403         data 
= self
._parse
_json
( 
 405                 r
'var\s+vars\s*=\s*({.+?});', info_page
, 'vars', default
='{}'), 
 406             video_id
, fatal
=False) 
 408         # <!json> is served instead 
 410             data 
= self
._parse
_json
( 
 412                     [r
'<!json>\s*({.+?})\s*<!>', r
'<!json>\s*({.+})'], 
 413                     info_page
, 'json', default
='{}'), 
 416                 data 
= data
['player']['params'][0] 
 419             data 
= self
._parse
_json
( 
 421                     r
'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page
, 
 423                 video_id
)['params'][0] 
 425         title 
= unescapeHTML(data
['md_title']) 
 428         # 3 = post live (finished live) 
 429         is_live 
= data
.get('live') == 2 
 431             title 
= self
._live
_title
(title
) 
 433         timestamp 
= unified_timestamp(self
._html
_search
_regex
( 
 434             r
'class=["\']mv_info_date
[^
>]+>([^
<]+)(?
:<|
from)', info_page, 
 435             'upload date
', default=None)) or int_or_none(data.get('date
')) 
 437         view_count = str_to_int(self._search_regex( 
 438             r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', 
 439             info_page, 'view count', default=None)) 
 442         for format_id, format_url in data.items(): 
 443             format_url = url_or_none(format_url) 
 444             if not format_url or not format_url.startswith(('http', '//', 'rtmp')): 
 446             if (format_id.startswith(('url', 'cache')) or 
 447                     format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): 
 448                 height = int_or_none(self._search_regex( 
 449                     r'^(?:url|cache)(\d+)', format_id, 'height', default=None)) 
 451                     'format_id': format_id, 
 455             elif format_id == 'hls': 
 456                 formats.extend(self._extract_m3u8_formats( 
 457                     format_url, video_id, 'mp4', 'm3u8_native', 
 458                     m3u8_id=format_id, fatal=False, live=is_live)) 
 459             elif format_id == 'rtmp': 
 461                     'format_id': format_id, 
 465         self._sort_formats(formats) 
 468             'id': compat_str(data.get('vid') or video_id), 
 471             'thumbnail': data.get('jpg'), 
 472             'uploader': data.get('md_author'), 
 473             'uploader_id': str_or_none(data.get('author_id')), 
 474             'duration': data.get('duration'), 
 475             'timestamp': timestamp, 
 476             'view_count': view_count, 
 477             'like_count': int_or_none(data.get('liked')), 
 478             'dislike_count': int_or_none(data.get('nolikes')), 
 483 class VKUserVideosIE(VKBaseIE): 
 484     IE_NAME = 'vk:uservideos' 
 485     IE_DESC = "VK 
- User
's Videos" 
 486     _VALID_URL = r'https?
://(?
:(?
:m|new
)\
.)?vk\
.com
/videos(?P
<id>-?
[0-9]+)(?
!\?.*\bz
=video
)(?
:[/?
#&]|$)' 
 487     _TEMPLATE_URL 
= 'https://vk.com/videos' 
 489         'url': 'http://vk.com/videos205387401', 
 492             'title': "Tom Cruise's Videos", 
 494         'playlist_mincount': 4, 
 496         'url': 'http://vk.com/videos-77521', 
 497         'only_matching': True, 
 499         'url': 'http://vk.com/videos-97664626?section=all', 
 500         'only_matching': True, 
 502         'url': 'http://m.vk.com/videos205387401', 
 503         'only_matching': True, 
 505         'url': 'http://new.vk.com/videos205387401', 
 506         'only_matching': True, 
 509     def _real_extract(self
, url
): 
 510         page_id 
= self
._match
_id
(url
) 
 512         webpage 
= self
._download
_webpage
(url
, page_id
) 
 516                 'http://vk.com/video' + video_id
, 'VK', video_id
=video_id
) 
 517             for video_id 
in orderedSet(re
.findall(r
'href="/video(-?[0-9_]+)"', webpage
))] 
 519         title 
= unescapeHTML(self
._search
_regex
( 
 520             r
'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos', 
 521             webpage
, 'title', default
=page_id
)) 
 523         return self
.playlist_result(entries
, page_id
, title
) 
 526 class VKWallPostIE(VKBaseIE
): 
 527     IE_NAME 
= 'vk:wallpost' 
 528     _VALID_URL 
= r
'https?://(?:(?:(?:(?:m|new)\.)?vk\.com/(?:[^?]+\?.*\bw=)?wall(?P<id>-?\d+_\d+)))' 
 530         # public page URL, audio playlist 
 531         'url': 'https://vk.com/bs.official?w=wall-23538238_35', 
 534             'title': 'Black Shadow - Wall post 23538238_35', 
 535             'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', 
 538             'md5': '5ba93864ec5b85f7ce19a9af4af080f6', 
 540                 'id': '135220665_111806521', 
 542                 'title': 'Black Shadow - Слепое Верование', 
 544                 'uploader': 'Black Shadow', 
 545                 'artist': 'Black Shadow', 
 546                 'track': 'Слепое Верование', 
 549             'md5': '4cc7e804579122b17ea95af7834c9233', 
 551                 'id': '135220665_111802303', 
 553                 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', 
 555                 'uploader': 'Black Shadow', 
 556                 'artist': 'Black Shadow', 
 557                 'track': 'Война - Негасимое Бездны Пламя!', 
 560                 'skip_download': True, 
 566         'skip': 'Requires vk account credentials', 
 568         # single YouTube embed, no leading - 
 569         'url': 'https://vk.com/wall85155021_6319', 
 571             'id': '85155021_6319', 
 572             'title': 'Sergey Gorbunov - Wall post 85155021_6319', 
 578         'skip': 'Requires vk account credentials', 
 581         'url': 'https://vk.com/wall-23538238_35', 
 582         'only_matching': True, 
 584         # mobile wall page URL 
 585         'url': 'https://m.vk.com/wall-23538238_35', 
 586         'only_matching': True, 
 589     def _real_extract(self
, url
): 
 590         post_id 
= self
._match
_id
(url
) 
 592         wall_url 
= 'https://vk.com/wall%s' % post_id
 
 594         post_id 
= remove_start(post_id
, '-') 
 596         webpage 
= self
._download
_webpage
(wall_url
, post_id
) 
 598         error 
= self
._html
_search
_regex
( 
 599             r
'>Error</div>\s*<div[^>]+class=["\']body
["\'][^>]*>([^<]+)', 
 600             webpage, 'error', default=None) 
 602             raise ExtractorError('VK said: %s' % error, expected=True) 
 604         description = clean_html(get_element_by_class('wall_post_text', webpage)) 
 605         uploader = clean_html(get_element_by_class('author', webpage)) 
 606         thumbnail = self._og_search_thumbnail(webpage) 
 610         audio_ids = re.findall(r'data-full-id=["\'](\d
+_\d
+)', webpage) 
 612             al_audio = self._download_webpage( 
 613                 'https
://vk
.com
/al_audio
.php
', post_id, 
 614                 note='Downloading audio info
', fatal=False, 
 615                 data=urlencode_postdata({ 
 616                     'act
': 'reload_audio
', 
 618                     'ids
': ','.join(audio_ids) 
 621                 Audio = collections.namedtuple( 
 622                     'Audio
', ['id', 'user_id
', 'url
', 'track
', 'artist
', 'duration
']) 
 623                 audios = self._parse_json( 
 625                         r'<!json
>(.+?
)<!>', al_audio, 'audios
', default='[]'), 
 626                     post_id, fatal=False, transform_source=unescapeHTML) 
 627                 if isinstance(audios, list): 
 629                         a = Audio._make(audio[:6]) 
 631                             'id': '%s_%s' % (a.user_id, a.id), 
 633                             'title
': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id, 
 634                             'thumbnail
': thumbnail, 
 635                             'duration
': a.duration, 
 636                             'uploader
': uploader, 
 641         for video in re.finditer( 
 642                 r'<a
[^
>]+href
=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): 
 643             entries.append(self.url_result( 
 644                 compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key())) 
 646         title = 'Wall post %s' % post_id 
 648         return self.playlist_result( 
 649             orderedSet(entries), post_id, 
 650             '%s - %s' % (uploader, title) if uploader else title,