]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/vk.py
   2 from __future__ 
import unicode_literals
 
   8 from .common 
import InfoExtractor
 
   9 from ..compat 
import compat_str
 
  19 from .vimeo 
import VimeoIE
 
  20 from .pladform 
import PladformIE
 
  23 class VKIE(InfoExtractor
): 
  30                                 (?:m\.)?vk\.com/video_| 
  33                             ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)| 
  35                                 (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| 
  36                                 (?:www\.)?daxab.com/embed/ 
  38                             (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? 
  45             'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', 
  46             'md5': '0deae91935c54e00003c2a00646315f0', 
  50                 'title': 'ProtivoGunz - Хуёвая песня', 
  51                 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 
  53                 'upload_date': '20120212', 
  58             'url': 'http://vk.com/video205387401_165548505', 
  59             'md5': '6c0aeb2e90396ba97035b9cbde548700', 
  63                 'uploader': 'Tom Cruise', 
  66                 'upload_date': '20130721', 
  71             'note': 'Embedded video', 
  72             'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', 
  73             'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', 
  77                 'uploader': 'Vladimir Gavrin', 
  80                 'upload_date': '20120730', 
  83             'skip': 'This video has been removed from public access.', 
  87             # please update if you find a video whose URL follows the same pattern 
  88             'url': 'http://vk.com/video-8871596_164049491', 
  89             'md5': 'a590bcaf3d543576c9bd162812387666', 
  90             'note': 'Only available for registered users', 
  94                 'uploader': 'Триллеры', 
  95                 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', 
  97                 'upload_date': '20121218', 
 100             'skip': 'Requires vk account credentials', 
 103             'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', 
 104             'md5': '4d7a5ef8cf114dfa09577e57b2993202', 
 108                 'uploader': 'Киномания - лучшее из мира кино', 
 111                 'upload_date': '20140328', 
 113             'skip': 'Requires vk account credentials', 
 116             'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540', 
 117             'md5': '0c45586baa71b7cb1d0784ee3f4e00a6', 
 118             'note': 'ivi.ru embed', 
 122                 'title': 'Книга Илая', 
 124                 'upload_date': '20140626', 
 127             'skip': 'Only works from Russia', 
 130             # video (removed?) only available with list id 
 131             'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4', 
 132             'md5': '091287af5402239a1051c37ec7b92913', 
 136                 'title': 'ТюменцевВВ_09.07.2015', 
 137                 'uploader': 'Anton Ivanov', 
 139                 'upload_date': '20150709', 
 145             'url': 'https://vk.com/video276849682_170681728', 
 149                 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", 
 150                 'description': 'md5:d9903938abdc74c738af77f527ca0596', 
 152                 'upload_date': '20130116', 
 153                 'uploader': "Children's Joy Foundation", 
 154                 'uploader_id': 'thecjf', 
 159             # video key is extra_data not url\d+ 
 160             'url': 'http://vk.com/video-110305615_171782105', 
 161             'md5': 'e13fcda136f99764872e739d13fac1d1', 
 165                 'title': 'S-Dance, репетиции к The way show', 
 166                 'uploader': 'THE WAY SHOW | 17 апреля', 
 167                 'upload_date': '20160207', 
 172             # removed video, just testing that we match the pattern 
 173             'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', 
 174             'only_matching': True, 
 177             # age restricted video, requires vk account credentials 
 178             'url': 'https://vk.com/video205387401_164765225', 
 179             'only_matching': True, 
 183             'url': 'https://vk.com/video-76116461_171554880', 
 184             'only_matching': True, 
 189         (username
, password
) = self
._get
_login
_info
() 
 193         login_page
, url_handle 
= self
._download
_webpage
_handle
( 
 194             'https://vk.com', None, 'Downloading login page') 
 196         login_form 
= self
._hidden
_inputs
(login_page
) 
 199             'email': username
.encode('cp1251'), 
 200             'pass': password
.encode('cp1251'), 
 203         # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header 
 204         # and expects the first one to be set rather than second (see 
 205         # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201). 
 206         # As of RFC6265 the newer one cookie should be set into cookie store 
 207         # what actually happens. 
 208         # We will workaround this VK issue by resetting the remixlhk cookie to 
 209         # the first one manually. 
 210         cookies 
= url_handle
.headers
.get('Set-Cookie') 
 211         if sys
.version_info
[0] >= 3: 
 212             cookies 
= cookies
.encode('iso-8859-1') 
 213         cookies 
= cookies
.decode('utf-8') 
 214         remixlhk 
= re
.search(r
'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies
) 
 216             value
, domain 
= remixlhk
.groups() 
 217             self
._set
_cookie
(domain
, 'remixlhk', value
) 
 219         login_page 
= self
._download
_webpage
( 
 220             'https://login.vk.com/?act=login', None, 
 221             note
='Logging in as %s' % username
, 
 222             data
=urlencode_postdata(login_form
)) 
 224         if re
.search(r
'onLoginFailed', login_page
): 
 225             raise ExtractorError( 
 226                 'Unable to login, incorrect username and/or password', expected
=True) 
 228     def _real_initialize(self
): 
 231     def _real_extract(self
, url
): 
 232         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 233         video_id 
= mobj
.group('videoid') 
 236             info_url 
= 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
 
 237             # Some videos (removed?) can only be downloaded with list id specified 
 238             list_id 
= mobj
.group('list_id') 
 240                 info_url 
+= '&list=%s' % list_id
 
 242             info_url 
= 'http://vk.com/video_ext.php?' + mobj
.group('embed_query') 
 243             video_id 
= '%s_%s' % (mobj
.group('oid'), mobj
.group('id')) 
 245         info_page 
= self
._download
_webpage
(info_url
, video_id
) 
 247         error_message 
= self
._html
_search
_regex
( 
 248             [r
'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', 
 249                 r
'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], 
 250             info_page
, 'error message', default
=None) 
 252             raise ExtractorError(error_message
, expected
=True) 
 254         if re
.search(r
'<!>/login\.php\?.*\bact=security_check', info_page
): 
 255             raise ExtractorError( 
 256                 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', 
 260             r
'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': 
 261             'Video %s has been removed from public access due to rightholder complaint.', 
 263             r
'<!>Please log in or <': 
 264             'Video %s is only available for registered users, ' 
 265             'use --username and --password options to provide account credentials.', 
 268             'Video %s does not exist.', 
 270             r
'<!>Видео временно недоступно': 
 271             'Video %s is temporarily unavailable.', 
 274             'Access denied to video %s.', 
 277         for error_re
, error_msg 
in ERRORS
.items(): 
 278             if re
.search(error_re
, info_page
): 
 279                 raise ExtractorError(error_msg 
% video_id
, expected
=True) 
 281         youtube_url 
= self
._search
_regex
( 
 282             r
'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', 
 283             info_page
, 'youtube iframe', default
=None) 
 285             return self
.url_result(youtube_url
, 'Youtube') 
 287         vimeo_url 
= VimeoIE
._extract
_vimeo
_url
(url
, info_page
) 
 288         if vimeo_url 
is not None: 
 289             return self
.url_result(vimeo_url
) 
 291         pladform_url 
= PladformIE
._extract
_url
(info_page
) 
 293             return self
.url_result(pladform_url
) 
 295         m_rutube 
= re
.search( 
 296             r
'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page
) 
 297         if m_rutube 
is not None: 
 298             rutube_url 
= self
._proto
_relative
_url
( 
 299                 m_rutube
.group(1).replace('\\', '')) 
 300             return self
.url_result(rutube_url
) 
 302         m_opts 
= re
.search(r
'(?s)var\s+opts\s*=\s*({.+?});', info_page
) 
 304             m_opts_url 
= re
.search(r
"url\s*:\s*'((?!/\b)[^']+)", m_opts
.group(1)) 
 306                 opts_url 
= m_opts_url
.group(1) 
 307                 if opts_url
.startswith('//'): 
 308                     opts_url 
= 'http:' + opts_url
 
 309                 return self
.url_result(opts_url
) 
 311         data_json 
= self
._search
_regex
(r
'var\s+vars\s*=\s*({.+?});', info_page
, 'vars') 
 312         data 
= json
.loads(data_json
) 
 314         # Extract upload date 
 316         mobj 
= re
.search(r
'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page
) 
 318             mobj
.group(1) + ' ' + mobj
.group(2) 
 319             upload_date 
= unified_strdate(mobj
.group(1) + ' ' + mobj
.group(2)) 
 322         views 
= self
._html
_search
_regex
( 
 323             r
'"mv_views_count_number"[^>]*>(.+?\bviews?)<', 
 324             info_page
, 'view count', default
=None) 
 326             view_count 
= str_to_int(self
._search
_regex
( 
 327                 r
'([\d,.]+)', views
, 'view count', fatal
=False)) 
 330         for k
, v 
in data
.items(): 
 331             if not k
.startswith('url') and not k
.startswith('cache') and k 
!= 'extra_data' or not v
: 
 333             height 
= int_or_none(self
._search
_regex
( 
 334                 r
'^(?:url|cache)(\d+)', k
, 'height', default
=None)) 
 340         self
._sort
_formats
(formats
) 
 343             'id': compat_str(data
['vid']), 
 345             'title': unescapeHTML(data
['md_title']), 
 346             'thumbnail': data
.get('jpg'), 
 347             'uploader': data
.get('md_author'), 
 348             'duration': data
.get('duration'), 
 349             'upload_date': upload_date
, 
 350             'view_count': view_count
, 
 354 class VKUserVideosIE(InfoExtractor
): 
 355     IE_NAME 
= 'vk:uservideos' 
 356     IE_DESC 
= "VK - User's Videos" 
 357     _VALID_URL 
= r
'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' 
 358     _TEMPLATE_URL 
= 'https://vk.com/videos' 
 360         'url': 'http://vk.com/videos205387401', 
 363             'title': "Tom Cruise's Videos", 
 365         'playlist_mincount': 4, 
 367         'url': 'http://vk.com/videos-77521', 
 368         'only_matching': True, 
 370         'url': 'http://vk.com/videos-97664626?section=all', 
 371         'only_matching': True, 
 374     def _real_extract(self
, url
): 
 375         page_id 
= self
._match
_id
(url
) 
 377         webpage 
= self
._download
_webpage
(url
, page_id
) 
 381                 'http://vk.com/video' + video_id
, 'VK', video_id
=video_id
) 
 382             for video_id 
in orderedSet(re
.findall(r
'href="/video(-?[0-9_]+)"', webpage
))] 
 384         title 
= unescapeHTML(self
._search
_regex
( 
 385             r
'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos', 
 386             webpage
, 'title', default
=page_id
)) 
 388         return self
.playlist_result(entries
, page_id
, title
)