]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/vk.py
   2 from __future__ 
import unicode_literals
 
   8 from .common 
import InfoExtractor
 
  26 from .vimeo 
import VimeoIE
 
  27 from .pladform 
import PladformIE
 
  30 class VKBaseIE(InfoExtractor
): 
  34         (username
, password
) = self
._get
_login
_info
() 
  38         login_page
, url_handle 
= self
._download
_webpage
_handle
( 
  39             'https://vk.com', None, 'Downloading login page') 
  41         login_form 
= self
._hidden
_inputs
(login_page
) 
  44             'email': username
.encode('cp1251'), 
  45             'pass': password
.encode('cp1251'), 
  48         # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header 
  49         # and expects the first one to be set rather than second (see 
  50         # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201). 
  51         # As of RFC6265 the newer one cookie should be set into cookie store 
  52         # what actually happens. 
  53         # We will workaround this VK issue by resetting the remixlhk cookie to 
  54         # the first one manually. 
  55         cookies 
= url_handle
.headers
.get('Set-Cookie') 
  57             if sys
.version_info
[0] >= 3: 
  58                 cookies 
= cookies
.encode('iso-8859-1') 
  59             cookies 
= cookies
.decode('utf-8') 
  60             remixlhk 
= re
.search(r
'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies
) 
  62                 value
, domain 
= remixlhk
.groups() 
  63                 self
._set
_cookie
(domain
, 'remixlhk', value
) 
  65         login_page 
= self
._download
_webpage
( 
  66             'https://login.vk.com/?act=login', None, 
  67             note
='Logging in as %s' % username
, 
  68             data
=urlencode_postdata(login_form
)) 
  70         if re
.search(r
'onLoginFailed', login_page
): 
  72                 'Unable to login, incorrect username and/or password', expected
=True) 
  74     def _real_initialize(self
): 
  85                                 (?:(?:m|new)\.)?vk\.com/video_| 
  88                             ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)| 
  90                                 (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video| 
  91                                 (?:www\.)?daxab.com/embed/ 
  93                             (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? 
  98             'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', 
  99             'md5': '0deae91935c54e00003c2a00646315f0', 
 103                 'title': 'ProtivoGunz - Хуёвая песня', 
 104                 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 
 106                 'upload_date': '20120212', 
 111             'url': 'http://vk.com/video205387401_165548505', 
 112             'md5': '6c0aeb2e90396ba97035b9cbde548700', 
 116                 'uploader': 'Tom Cruise', 
 119                 'upload_date': '20130721', 
 124             'note': 'Embedded video', 
 125             'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', 
 126             'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', 
 130                 'uploader': 'Vladimir Gavrin', 
 133                 'upload_date': '20120730', 
 136             'skip': 'This video has been removed from public access.', 
 140             # please update if you find a video whose URL follows the same pattern 
 141             'url': 'http://vk.com/video-8871596_164049491', 
 142             'md5': 'a590bcaf3d543576c9bd162812387666', 
 143             'note': 'Only available for registered users', 
 147                 'uploader': 'Триллеры', 
 148                 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', 
 150                 'upload_date': '20121218', 
 153             'skip': 'Requires vk account credentials', 
 156             'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', 
 157             'md5': '4d7a5ef8cf114dfa09577e57b2993202', 
 161                 'uploader': 'Киномания - лучшее из мира кино', 
 164                 'upload_date': '20140328', 
 166             'skip': 'Requires vk account credentials', 
 169             'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540', 
 170             'md5': '0c45586baa71b7cb1d0784ee3f4e00a6', 
 171             'note': 'ivi.ru embed', 
 175                 'title': 'Книга Илая', 
 177                 'upload_date': '20140626', 
 180             'skip': 'Only works from Russia', 
 183             # video (removed?) only available with list id 
 184             'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4', 
 185             'md5': '091287af5402239a1051c37ec7b92913', 
 189                 'title': 'ТюменцевВВ_09.07.2015', 
 190                 'uploader': 'Anton Ivanov', 
 192                 'upload_date': '20150709', 
 198             'url': 'https://vk.com/video276849682_170681728', 
 202                 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", 
 203                 'description': 'md5:d9903938abdc74c738af77f527ca0596', 
 205                 'upload_date': '20130116', 
 206                 'uploader': "Children's Joy Foundation", 
 207                 'uploader_id': 'thecjf', 
 212             # video key is extra_data not url\d+ 
 213             'url': 'http://vk.com/video-110305615_171782105', 
 214             'md5': 'e13fcda136f99764872e739d13fac1d1', 
 218                 'title': 'S-Dance, репетиции к The way show', 
 219                 'uploader': 'THE WAY SHOW | 17 апреля', 
 220                 'upload_date': '20160207', 
 225             # removed video, just testing that we match the pattern 
 226             'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', 
 227             'only_matching': True, 
 230             # age restricted video, requires vk account credentials 
 231             'url': 'https://vk.com/video205387401_164765225', 
 232             'only_matching': True, 
 236             'url': 'https://vk.com/video-76116461_171554880', 
 237             'only_matching': True, 
 240             'url': 'http://new.vk.com/video205387401_165548505', 
 241             'only_matching': True, 
 245     def _real_extract(self
, url
): 
 246         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 247         video_id 
= mobj
.group('videoid') 
 250             info_url 
= 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
 
 251             # Some videos (removed?) can only be downloaded with list id specified 
 252             list_id 
= mobj
.group('list_id') 
 254                 info_url 
+= '&list=%s' % list_id
 
 256             info_url 
= 'http://vk.com/video_ext.php?' + mobj
.group('embed_query') 
 257             video_id 
= '%s_%s' % (mobj
.group('oid'), mobj
.group('id')) 
 259         info_page 
= self
._download
_webpage
(info_url
, video_id
) 
 261         error_message 
= self
._html
_search
_regex
( 
 262             [r
'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', 
 263                 r
'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], 
 264             info_page
, 'error message', default
=None) 
 266             raise ExtractorError(error_message
, expected
=True) 
 268         if re
.search(r
'<!>/login\.php\?.*\bact=security_check', info_page
): 
 269             raise ExtractorError( 
 270                 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', 
 274             r
'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': 
 275             'Video %s has been removed from public access due to rightholder complaint.', 
 277             r
'<!>Please log in or <': 
 278             'Video %s is only available for registered users, ' 
 279             'use --username and --password options to provide account credentials.', 
 282             'Video %s does not exist.', 
 284             r
'<!>Видео временно недоступно': 
 285             'Video %s is temporarily unavailable.', 
 288             'Access denied to video %s.', 
 291         for error_re
, error_msg 
in ERRORS
.items(): 
 292             if re
.search(error_re
, info_page
): 
 293                 raise ExtractorError(error_msg 
% video_id
, expected
=True) 
 295         youtube_url 
= self
._search
_regex
( 
 296             r
'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', 
 297             info_page
, 'youtube iframe', default
=None) 
 299             return self
.url_result(youtube_url
, 'Youtube') 
 301         vimeo_url 
= VimeoIE
._extract
_vimeo
_url
(url
, info_page
) 
 302         if vimeo_url 
is not None: 
 303             return self
.url_result(vimeo_url
) 
 305         pladform_url 
= PladformIE
._extract
_url
(info_page
) 
 307             return self
.url_result(pladform_url
) 
 309         m_rutube 
= re
.search( 
 310             r
'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page
) 
 311         if m_rutube 
is not None: 
 312             rutube_url 
= self
._proto
_relative
_url
( 
 313                 m_rutube
.group(1).replace('\\', '')) 
 314             return self
.url_result(rutube_url
) 
 316         m_opts 
= re
.search(r
'(?s)var\s+opts\s*=\s*({.+?});', info_page
) 
 318             m_opts_url 
= re
.search(r
"url\s*:\s*'((?!/\b)[^']+)", m_opts
.group(1)) 
 320                 opts_url 
= m_opts_url
.group(1) 
 321                 if opts_url
.startswith('//'): 
 322                     opts_url 
= 'http:' + opts_url
 
 323                 return self
.url_result(opts_url
) 
 325         data_json 
= self
._search
_regex
(r
'var\s+vars\s*=\s*({.+?});', info_page
, 'vars') 
 326         data 
= json
.loads(data_json
) 
 328         # Extract upload date 
 330         mobj 
= re
.search(r
'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page
) 
 332             mobj
.group(1) + ' ' + mobj
.group(2) 
 333             upload_date 
= unified_strdate(mobj
.group(1) + ' ' + mobj
.group(2)) 
 336         views 
= self
._html
_search
_regex
( 
 337             r
'"mv_views_count_number"[^>]*>(.+?\bviews?)<', 
 338             info_page
, 'view count', default
=None) 
 340             view_count 
= str_to_int(self
._search
_regex
( 
 341                 r
'([\d,.]+)', views
, 'view count', fatal
=False)) 
 344         for k
, v 
in data
.items(): 
 345             if not k
.startswith('url') and not k
.startswith('cache') and k 
!= 'extra_data' or not v
: 
 347             height 
= int_or_none(self
._search
_regex
( 
 348                 r
'^(?:url|cache)(\d+)', k
, 'height', default
=None)) 
 354         self
._sort
_formats
(formats
) 
 357             'id': compat_str(data
['vid']), 
 359             'title': unescapeHTML(data
['md_title']), 
 360             'thumbnail': data
.get('jpg'), 
 361             'uploader': data
.get('md_author'), 
 362             'duration': data
.get('duration'), 
 363             'upload_date': upload_date
, 
 364             'view_count': view_count
, 
 368 class VKUserVideosIE(VKBaseIE
): 
 369     IE_NAME 
= 'vk:uservideos' 
 370     IE_DESC 
= "VK - User's Videos" 
 371     _VALID_URL 
= r
'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' 
 372     _TEMPLATE_URL 
= 'https://vk.com/videos' 
 374         'url': 'http://vk.com/videos205387401', 
 377             'title': "Tom Cruise's Videos", 
 379         'playlist_mincount': 4, 
 381         'url': 'http://vk.com/videos-77521', 
 382         'only_matching': True, 
 384         'url': 'http://vk.com/videos-97664626?section=all', 
 385         'only_matching': True, 
 387         'url': 'http://m.vk.com/videos205387401', 
 388         'only_matching': True, 
 390         'url': 'http://new.vk.com/videos205387401', 
 391         'only_matching': True, 
 394     def _real_extract(self
, url
): 
 395         page_id 
= self
._match
_id
(url
) 
 397         webpage 
= self
._download
_webpage
(url
, page_id
) 
 401                 'http://vk.com/video' + video_id
, 'VK', video_id
=video_id
) 
 402             for video_id 
in orderedSet(re
.findall(r
'href="/video(-?[0-9_]+)"', webpage
))] 
 404         title 
= unescapeHTML(self
._search
_regex
( 
 405             r
'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos', 
 406             webpage
, 'title', default
=page_id
)) 
 408         return self
.playlist_result(entries
, page_id
, title
) 
 411 class VKWallPostIE(VKBaseIE
): 
 412     IE_NAME 
= 'vk:wallpost' 
 413     _VALID_URL 
= r
'https?://(?:(?:(?:(?:m|new)\.)?vk\.com/(?:[^?]+\?.*\bw=)?wall(?P<id>-?\d+_\d+)))' 
 415         # public page URL, audio playlist 
 416         'url': 'https://vk.com/bs.official?w=wall-23538238_35', 
 419             'title': 'Black Shadow - Wall post 23538238_35', 
 420             'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', 
 423             'md5': '5ba93864ec5b85f7ce19a9af4af080f6', 
 425                 'id': '135220665_111806521', 
 427                 'title': 'Black Shadow - Слепое Верование', 
 429                 'uploader': 'Black Shadow', 
 430                 'artist': 'Black Shadow', 
 431                 'track': 'Слепое Верование', 
 434             'md5': '4cc7e804579122b17ea95af7834c9233', 
 436                 'id': '135220665_111802303', 
 438                 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', 
 440                 'uploader': 'Black Shadow', 
 441                 'artist': 'Black Shadow', 
 442                 'track': 'Война - Негасимое Бездны Пламя!', 
 445                 'skip_download': True, 
 448         'skip': 'Requires vk account credentials', 
 450         # single YouTube embed, no leading - 
 451         'url': 'https://vk.com/wall85155021_6319', 
 453             'id': '85155021_6319', 
 454             'title': 'Sergey Gorbunov - Wall post 85155021_6319', 
 457         'skip': 'Requires vk account credentials', 
 460         'url': 'https://vk.com/wall-23538238_35', 
 461         'only_matching': True, 
 463         # mobile wall page URL 
 464         'url': 'https://m.vk.com/wall-23538238_35', 
 465         'only_matching': True, 
 468     def _real_extract(self
, url
): 
 469         post_id 
= self
._match
_id
(url
) 
 471         wall_url 
= 'https://vk.com/wall%s' % post_id
 
 473         post_id 
= remove_start(post_id
, '-') 
 475         webpage 
= self
._download
_webpage
(wall_url
, post_id
) 
 477         error 
= self
._html
_search
_regex
( 
 478             r
'>Error</div>\s*<div[^>]+class=["\']body
["\'][^>]*>([^<]+)', 
 479             webpage, 'error', default=None) 
 481             raise ExtractorError('VK said: %s' % error, expected=True) 
 483         description = clean_html(get_element_by_class('wall_post_text', webpage)) 
 484         uploader = clean_html(get_element_by_class( 
 485             'fw_post_author', webpage)) or self._og_search_description(webpage) 
 486         thumbnail = self._og_search_thumbnail(webpage) 
 490         for audio in re.finditer(r'''(?sx) 
 492                                 id=(?P<q1>["\'])audio_info(?P
<id>\d
+_\d
+).*?
(?P
=q1
)[^
>]+ 
 493                                 value
=(?P
<q2
>["\'])(?P<url>http.+?)(?P=q2) 
 495                             </table>''', webpage): 
 496             audio_html = audio.group(0) 
 497             audio_id = audio.group('id') 
 498             duration = parse_duration(get_element_by_class('duration', audio_html)) 
 499             track = self._html_search_regex( 
 500                 r'<span[^>]+id=["\']title
%s[^
>]*>([^
<]+)' % audio_id, 
 501                 audio_html, 'title
', default=None) 
 502             artist = self._html_search_regex( 
 503                 r'>([^
<]+)</a
></b
>\s
*&ndash
', audio_html, 
 504                 'artist
', default=None) 
 507                 'url
': audio.group('url
'), 
 508                 'title
': '%s - %s' % (artist, track) if artist and track else audio_id, 
 509                 'thumbnail
': thumbnail, 
 510                 'duration
': duration, 
 511                 'uploader
': uploader, 
 516         for video in re.finditer( 
 517                 r'<a
[^
>]+href
=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): 
 518             entries.append(self.url_result( 
 519                 compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key())) 
 521         title = 'Wall post %s' % post_id 
 523         return self.playlist_result( 
 524             orderedSet(entries), post_id, 
 525             '%s - %s' % (uploader, title) if uploader else title,