Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/vlive.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5 import time
   6 import itertools
   7
   8 from .common import InfoExtractor
   9 from ..compat import (
  10     compat_urllib_parse_urlencode,
  11     compat_str,
  12 )
  13 from ..utils import (
  14     dict_get,
  15     ExtractorError,
  16     float_or_none,
  17     int_or_none,
  18     remove_start,
  19     try_get,
  20     urlencode_postdata,
  21 )
  22
  23
  24 class VLiveIE(InfoExtractor):
  25     IE_NAME = 'vlive'
  26     _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
  27     _NETRC_MACHINE = 'vlive'
  28     _TESTS = [{
  29         'url': 'http://www.vlive.tv/video/1326',
  30         'md5': 'cc7314812855ce56de70a06a27314983',
  31         'info_dict': {
  32             'id': '1326',
  33             'ext': 'mp4',
  34             'title': "[V LIVE] Girl's Day's Broadcast",
  35             'creator': "Girl's Day",
  36             'view_count': int,
  37         },
  38     }, {
  39         'url': 'http://www.vlive.tv/video/16937',
  40         'info_dict': {
  41             'id': '16937',
  42             'ext': 'mp4',
  43             'title': '[V LIVE] 첸백시 걍방',
  44             'creator': 'EXO',
  45             'view_count': int,
  46             'subtitles': 'mincount:12',
  47         },
  48         'params': {
  49             'skip_download': True,
  50         },
  51     }, {
  52         'url': 'https://www.vlive.tv/video/129100',
  53         'md5': 'ca2569453b79d66e5b919e5d308bff6b',
  54         'info_dict': {
  55             'id': '129100',
  56             'ext': 'mp4',
  57             'title': '[V LIVE] [BTS+] Run BTS! 2019 - EP.71 :: Behind the scene',
  58             'creator': 'BTS+',
  59             'view_count': int,
  60             'subtitles': 'mincount:10',
  61         },
  62         'skip': 'This video is only available for CH+ subscribers',
  63     }]
  64
  65     @classmethod
  66     def suitable(cls, url):
  67         return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url)
  68
  69     def _real_initialize(self):
  70         self._login()
  71
  72     def _login(self):
  73         email, password = self._get_login_info()
  74         if None in (email, password):
  75             return
  76
  77         def is_logged_in():
  78             login_info = self._download_json(
  79                 'https://www.vlive.tv/auth/loginInfo', None,
  80                 note='Downloading login info',
  81                 headers={'Referer': 'https://www.vlive.tv/home'})
  82             return try_get(
  83                 login_info, lambda x: x['message']['login'], bool) or False
  84
  85         LOGIN_URL = 'https://www.vlive.tv/auth/email/login'
  86         self._request_webpage(
  87             LOGIN_URL, None, note='Downloading login cookies')
  88
  89         self._download_webpage(
  90             LOGIN_URL, None, note='Logging in',
  91             data=urlencode_postdata({'email': email, 'pwd': password}),
  92             headers={
  93                 'Referer': LOGIN_URL,
  94                 'Content-Type': 'application/x-www-form-urlencoded'
  95             })
  96
  97         if not is_logged_in():
  98             raise ExtractorError('Unable to log in', expected=True)
  99
 100     def _real_extract(self, url):
 101         video_id = self._match_id(url)
 102
 103         webpage = self._download_webpage(
 104             'https://www.vlive.tv/video/%s' % video_id, video_id)
 105
 106         VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)'
 107         VIDEO_PARAMS_FIELD = 'video params'
 108
 109         params = self._parse_json(self._search_regex(
 110             VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id,
 111             transform_source=lambda s: '[' + s + ']', fatal=False)
 112
 113         if not params or len(params) < 7:
 114             params = self._search_regex(
 115                 VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD)
 116             params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)]
 117
 118         status, long_video_id, key = params[2], params[5], params[6]
 119         status = remove_start(status, 'PRODUCT_')
 120
 121         if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'):
 122             return self._live(video_id, webpage)
 123         elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'):
 124             return self._replay(video_id, webpage, long_video_id, key)
 125
 126         if status == 'LIVE_END':
 127             raise ExtractorError('Uploading for replay. Please wait...',
 128                                  expected=True)
 129         elif status == 'COMING_SOON':
 130             raise ExtractorError('Coming soon!', expected=True)
 131         elif status == 'CANCELED':
 132             raise ExtractorError('We are sorry, '
 133                                  'but the live broadcast has been canceled.',
 134                                  expected=True)
 135         elif status == 'ONLY_APP':
 136             raise ExtractorError('Unsupported video type', expected=True)
 137         else:
 138             raise ExtractorError('Unknown status %s' % status)
 139
 140     def _get_common_fields(self, webpage):
 141         title = self._og_search_title(webpage)
 142         creator = self._html_search_regex(
 143             r'<div[^>]+class="info_area"[^>]*>\s*(?:<em[^>]*>.*?</em\s*>\s*)?<a\s+[^>]*>([^<]+)',
 144             webpage, 'creator', fatal=False)
 145         thumbnail = self._og_search_thumbnail(webpage)
 146         return {
 147             'title': title,
 148             'creator': creator,
 149             'thumbnail': thumbnail,
 150         }
 151
 152     def _live(self, video_id, webpage):
 153         init_page = self._download_init_page(video_id)
 154
 155         live_params = self._search_regex(
 156             r'"liveStreamInfo"\s*:\s*(".*"),',
 157             init_page, 'live stream info')
 158         live_params = self._parse_json(live_params, video_id)
 159         live_params = self._parse_json(live_params, video_id)
 160
 161         formats = []
 162         for vid in live_params.get('resolutions', []):
 163             formats.extend(self._extract_m3u8_formats(
 164                 vid['cdnUrl'], video_id, 'mp4',
 165                 m3u8_id=vid.get('name'),
 166                 fatal=False, live=True))
 167         self._sort_formats(formats)
 168
 169         info = self._get_common_fields(webpage)
 170         info.update({
 171             'title': self._live_title(info['title']),
 172             'id': video_id,
 173             'formats': formats,
 174             'is_live': True,
 175         })
 176         return info
 177
 178     def _replay(self, video_id, webpage, long_video_id, key):
 179         if '' in (long_video_id, key):
 180             init_page = self._download_init_page(video_id)
 181             video_info = self._parse_json(self._search_regex(
 182                 (r'(?s)oVideoStatus\s*=\s*({.+?})\s*</script',
 183                  r'(?s)oVideoStatus\s*=\s*({.+})'), init_page, 'video info'),
 184                 video_id)
 185             if video_info.get('status') == 'NEED_CHANNEL_PLUS':
 186                 self.raise_login_required(
 187                     'This video is only available for CH+ subscribers')
 188             long_video_id, key = video_info['vid'], video_info['inkey']
 189
 190         playinfo = self._download_json(
 191             'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s'
 192             % compat_urllib_parse_urlencode({
 193                 'videoId': long_video_id,
 194                 'key': key,
 195                 'ptc': 'http',
 196                 'doct': 'json',  # document type (xml or json)
 197                 'cpt': 'vtt',  # captions type (vtt or ttml)
 198             }), video_id)
 199
 200         formats = [{
 201             'url': vid['source'],
 202             'format_id': vid.get('encodingOption', {}).get('name'),
 203             'abr': float_or_none(vid.get('bitrate', {}).get('audio')),
 204             'vbr': float_or_none(vid.get('bitrate', {}).get('video')),
 205             'width': int_or_none(vid.get('encodingOption', {}).get('width')),
 206             'height': int_or_none(vid.get('encodingOption', {}).get('height')),
 207             'filesize': int_or_none(vid.get('size')),
 208         } for vid in playinfo.get('videos', {}).get('list', []) if vid.get('source')]
 209         self._sort_formats(formats)
 210
 211         view_count = int_or_none(playinfo.get('meta', {}).get('count'))
 212
 213         subtitles = {}
 214         for caption in playinfo.get('captions', {}).get('list', []):
 215             lang = dict_get(caption, ('locale', 'language', 'country', 'label'))
 216             if lang and caption.get('source'):
 217                 subtitles[lang] = [{
 218                     'ext': 'vtt',
 219                     'url': caption['source']}]
 220
 221         info = self._get_common_fields(webpage)
 222         info.update({
 223             'id': video_id,
 224             'formats': formats,
 225             'view_count': view_count,
 226             'subtitles': subtitles,
 227         })
 228         return info
 229
 230     def _download_init_page(self, video_id):
 231         return self._download_webpage(
 232             'https://www.vlive.tv/video/init/view',
 233             video_id, note='Downloading live webpage',
 234             data=urlencode_postdata({'videoSeq': video_id}),
 235             headers={
 236                 'Referer': 'https://www.vlive.tv/video/%s' % video_id,
 237                 'Content-Type': 'application/x-www-form-urlencoded'
 238             })
 239
 240
 241 class VLiveChannelIE(InfoExtractor):
 242     IE_NAME = 'vlive:channel'
 243     _VALID_URL = r'https?://channels\.vlive\.tv/(?P<id>[0-9A-Z]+)'
 244     _TEST = {
 245         'url': 'http://channels.vlive.tv/FCD4B',
 246         'info_dict': {
 247             'id': 'FCD4B',
 248             'title': 'MAMAMOO',
 249         },
 250         'playlist_mincount': 110
 251     }
 252     _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
 253
 254     def _real_extract(self, url):
 255         channel_code = self._match_id(url)
 256
 257         webpage = self._download_webpage(
 258             'http://channels.vlive.tv/%s/video' % channel_code, channel_code)
 259
 260         app_id = None
 261
 262         app_js_url = self._search_regex(
 263             r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1',
 264             webpage, 'app js', default=None, group='url')
 265
 266         if app_js_url:
 267             app_js = self._download_webpage(
 268                 app_js_url, channel_code, 'Downloading app JS', fatal=False)
 269             if app_js:
 270                 app_id = self._search_regex(
 271                     r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]',
 272                     app_js, 'app id', default=None)
 273
 274         app_id = app_id or self._APP_ID
 275
 276         channel_info = self._download_json(
 277             'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode',
 278             channel_code, note='Downloading decode channel code',
 279             query={
 280                 'app_id': app_id,
 281                 'channelCode': channel_code,
 282                 '_': int(time.time())
 283             })
 284
 285         channel_seq = channel_info['result']['channelSeq']
 286         channel_name = None
 287         entries = []
 288
 289         for page_num in itertools.count(1):
 290             video_list = self._download_json(
 291                 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList',
 292                 channel_code, note='Downloading channel list page #%d' % page_num,
 293                 query={
 294                     'app_id': app_id,
 295                     'channelSeq': channel_seq,
 296                     # Large values of maxNumOfRows (~300 or above) may cause
 297                     # empty responses (see [1]), e.g. this happens for [2] that
 298                     # has more than 300 videos.
 299                     # 1. https://github.com/ytdl-org/youtube-dl/issues/13830
 300                     # 2. http://channels.vlive.tv/EDBF.
 301                     'maxNumOfRows': 100,
 302                     '_': int(time.time()),
 303                     'pageNo': page_num
 304                 }
 305             )
 306
 307             if not channel_name:
 308                 channel_name = try_get(
 309                     video_list,
 310                     lambda x: x['result']['channelInfo']['channelName'],
 311                     compat_str)
 312
 313             videos = try_get(
 314                 video_list, lambda x: x['result']['videoList'], list)
 315             if not videos:
 316                 break
 317
 318             for video in videos:
 319                 video_id = video.get('videoSeq')
 320                 if not video_id:
 321                     continue
 322                 video_id = compat_str(video_id)
 323                 entries.append(
 324                     self.url_result(
 325                         'http://www.vlive.tv/video/%s' % video_id,
 326                         ie=VLiveIE.ie_key(), video_id=video_id))
 327
 328         return self.playlist_result(
 329             entries, channel_code, channel_name)
 330
 331
 332 class VLivePlaylistIE(InfoExtractor):
 333     IE_NAME = 'vlive:playlist'
 334     _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'
 335     _VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s'
 336     _TESTS = [{
 337         # regular working playlist
 338         'url': 'https://www.vlive.tv/video/117956/playlist/117963',
 339         'info_dict': {
 340             'id': '117963',
 341             'title': '아이돌룸(IDOL ROOM) 41회 - (여자)아이들'
 342         },
 343         'playlist_mincount': 10
 344     }, {
 345         # playlist with no playlistVideoSeqs
 346         'url': 'http://www.vlive.tv/video/22867/playlist/22912',
 347         'info_dict': {
 348             'id': '22867',
 349             'ext': 'mp4',
 350             'title': '[V LIVE] Valentine Day Message from MINA',
 351             'creator': 'TWICE',
 352             'view_count': int
 353         },
 354         'params': {
 355             'skip_download': True,
 356         }
 357     }]
 358
 359     def _build_video_result(self, video_id, message):
 360         self.to_screen(message)
 361         return self.url_result(
 362             self._VIDEO_URL_TEMPLATE % video_id,
 363             ie=VLiveIE.ie_key(), video_id=video_id)
 364
 365     def _real_extract(self, url):
 366         mobj = re.match(self._VALID_URL, url)
 367         video_id, playlist_id = mobj.group('video_id', 'id')
 368
 369         if self._downloader.params.get('noplaylist'):
 370             return self._build_video_result(
 371                 video_id,
 372                 'Downloading just video %s because of --no-playlist'
 373                 % video_id)
 374
 375         self.to_screen(
 376             'Downloading playlist %s - add --no-playlist to just download video'
 377             % playlist_id)
 378
 379         webpage = self._download_webpage(
 380             'http://www.vlive.tv/video/%s/playlist/%s'
 381             % (video_id, playlist_id), playlist_id)
 382
 383         raw_item_ids = self._search_regex(
 384             r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage,
 385             'playlist video seqs', default=None, fatal=False)
 386
 387         if not raw_item_ids:
 388             return self._build_video_result(
 389                 video_id,
 390                 'Downloading just video %s because no playlist was found'
 391                 % video_id)
 392
 393         item_ids = self._parse_json(raw_item_ids, playlist_id)
 394
 395         entries = [
 396             self.url_result(
 397                 self._VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(),
 398                 video_id=compat_str(item_id))
 399             for item_id in item_ids]
 400
 401         playlist_name = self._html_search_regex(
 402             r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)',
 403             webpage, 'playlist title', fatal=False)
 404
 405         return self.playlist_result(entries, playlist_id, playlist_name)