Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/bilibili.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import hashlib
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..compat import (
   9     compat_parse_qs,
  10     compat_urlparse,
  11 )
  12 from ..utils import (
  13     ExtractorError,
  14     int_or_none,
  15     float_or_none,
  16     parse_iso8601,
  17     smuggle_url,
  18     str_or_none,
  19     strip_jsonp,
  20     unified_timestamp,
  21     unsmuggle_url,
  22     urlencode_postdata,
  23 )
  24
  25
  26 class BiliBiliIE(InfoExtractor):
  27     _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\d+)/play#)(?P<id>\d+)'
  28
  29     _TESTS = [{
  30         'url': 'http://www.bilibili.tv/video/av1074402/',
  31         'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
  32         'info_dict': {
  33             'id': '1074402',
  34             'ext': 'flv',
  35             'title': '【金坷垃】金泡沫',
  36             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
  37             'duration': 308.067,
  38             'timestamp': 1398012678,
  39             'upload_date': '20140420',
  40             'thumbnail': r're:^https?://.+\.jpg',
  41             'uploader': '菊子桑',
  42             'uploader_id': '156160',
  43         },
  44     }, {
  45         # Tested in BiliBiliBangumiIE
  46         'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
  47         'only_matching': True,
  48     }, {
  49         'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
  50         'md5': '3f721ad1e75030cc06faf73587cfec57',
  51         'info_dict': {
  52             'id': '100643',
  53             'ext': 'mp4',
  54             'title': 'CHAOS;CHILD',
  55             'description': '如果你是神明，并且能够让妄想成为现实。那你会进行怎么样的妄想？是淫靡的世界？独裁社会？毁灭性的制裁？还是……2015年，涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
  56         },
  57         'skip': 'Geo-restricted to China',
  58     }, {
  59         # Title with double quotes
  60         'url': 'http://www.bilibili.com/video/av8903802/',
  61         'info_dict': {
  62             'id': '8903802',
  63             'title': '阿滴英文｜英文歌分享#6 "Closer',
  64             'description': '滴妹今天唱Closer給你聽! 有史以来，被推最多次也是最久的歌曲，其实歌词跟我原本想像差蛮多的，不过还是好听！ 微博@阿滴英文',
  65         },
  66         'playlist': [{
  67             'info_dict': {
  68                 'id': '8903802_part1',
  69                 'ext': 'flv',
  70                 'title': '阿滴英文｜英文歌分享#6 "Closer',
  71                 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
  72                 'uploader': '阿滴英文',
  73                 'uploader_id': '65880958',
  74                 'timestamp': 1488382634,
  75                 'upload_date': '20170301',
  76             },
  77             'params': {
  78                 'skip_download': True,  # Test metadata only
  79             },
  80         }, {
  81             'info_dict': {
  82                 'id': '8903802_part2',
  83                 'ext': 'flv',
  84                 'title': '阿滴英文｜英文歌分享#6 "Closer',
  85                 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
  86                 'uploader': '阿滴英文',
  87                 'uploader_id': '65880958',
  88                 'timestamp': 1488382634,
  89                 'upload_date': '20170301',
  90             },
  91             'params': {
  92                 'skip_download': True,  # Test metadata only
  93             },
  94         }]
  95     }]
  96
  97     _APP_KEY = 'iVGUTjsxvpLeuDCf'
  98     _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'
  99
 100     def _report_error(self, result):
 101         if 'message' in result:
 102             raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True)
 103         elif 'code' in result:
 104             raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True)
 105         else:
 106             raise ExtractorError('Can\'t extract Bangumi episode ID')
 107
 108     def _real_extract(self, url):
 109         url, smuggled_data = unsmuggle_url(url, {})
 110
 111         mobj = re.match(self._VALID_URL, url)
 112         video_id = mobj.group('id')
 113         anime_id = mobj.group('anime_id')
 114         webpage = self._download_webpage(url, video_id)
 115
 116         if 'anime/' not in url:
 117             cid = self._search_regex(
 118                 r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
 119                 default=None
 120             ) or compat_parse_qs(self._search_regex(
 121                 [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
 122                  r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)',
 123                  r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
 124                 webpage, 'player parameters'))['cid'][0]
 125         else:
 126             if 'no_bangumi_tip' not in smuggled_data:
 127                 self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dl with %s' % (
 128                     video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id)))
 129             headers = {
 130                 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
 131                 'Referer': url
 132             }
 133             headers.update(self.geo_verification_headers())
 134
 135             js = self._download_json(
 136                 'http://bangumi.bilibili.com/web_api/get_source', video_id,
 137                 data=urlencode_postdata({'episode_id': video_id}),
 138                 headers=headers)
 139             if 'result' not in js:
 140                 self._report_error(js)
 141             cid = js['result']['cid']
 142
 143         headers = {
 144             'Referer': url
 145         }
 146         headers.update(self.geo_verification_headers())
 147
 148         entries = []
 149
 150         RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
 151         for num, rendition in enumerate(RENDITIONS, start=1):
 152             payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
 153             sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
 154
 155             video_info = self._download_json(
 156                 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
 157                 video_id, note='Downloading video info page',
 158                 headers=headers, fatal=num == len(RENDITIONS))
 159
 160             if not video_info:
 161                 continue
 162
 163             if 'durl' not in video_info:
 164                 if num < len(RENDITIONS):
 165                     continue
 166                 self._report_error(video_info)
 167
 168             for idx, durl in enumerate(video_info['durl']):
 169                 formats = [{
 170                     'url': durl['url'],
 171                     'filesize': int_or_none(durl['size']),
 172                 }]
 173                 for backup_url in durl.get('backup_url', []):
 174                     formats.append({
 175                         'url': backup_url,
 176                         # backup URLs have lower priorities
 177                         'preference': -2 if 'hd.mp4' in backup_url else -3,
 178                     })
 179
 180                 for a_format in formats:
 181                     a_format.setdefault('http_headers', {}).update({
 182                         'Referer': url,
 183                     })
 184
 185                 self._sort_formats(formats)
 186
 187                 entries.append({
 188                     'id': '%s_part%s' % (video_id, idx),
 189                     'duration': float_or_none(durl.get('length'), 1000),
 190                     'formats': formats,
 191                 })
 192             break
 193
 194         title = self._html_search_regex(
 195             ('<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
 196              '(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
 197             group='title')
 198         description = self._html_search_meta('description', webpage)
 199         timestamp = unified_timestamp(self._html_search_regex(
 200             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
 201             default=None) or self._html_search_meta(
 202             'uploadDate', webpage, 'timestamp', default=None))
 203         thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
 204
 205         # TODO 'view_count' requires deobfuscating Javascript
 206         info = {
 207             'id': video_id,
 208             'title': title,
 209             'description': description,
 210             'timestamp': timestamp,
 211             'thumbnail': thumbnail,
 212             'duration': float_or_none(video_info.get('timelength'), scale=1000),
 213         }
 214
 215         uploader_mobj = re.search(
 216             r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)',
 217             webpage)
 218         if uploader_mobj:
 219             info.update({
 220                 'uploader': uploader_mobj.group('name'),
 221                 'uploader_id': uploader_mobj.group('id'),
 222             })
 223         if not info.get('uploader'):
 224             info['uploader'] = self._html_search_meta(
 225                 'author', webpage, 'uploader', default=None)
 226
 227         for entry in entries:
 228             entry.update(info)
 229
 230         if len(entries) == 1:
 231             return entries[0]
 232         else:
 233             for idx, entry in enumerate(entries):
 234                 entry['id'] = '%s_part%d' % (video_id, (idx + 1))
 235
 236             return {
 237                 '_type': 'multi_video',
 238                 'id': video_id,
 239                 'title': title,
 240                 'description': description,
 241                 'entries': entries,
 242             }
 243
 244
 245 class BiliBiliBangumiIE(InfoExtractor):
 246     _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)'
 247
 248     IE_NAME = 'bangumi.bilibili.com'
 249     IE_DESC = 'BiliBili番剧'
 250
 251     _TESTS = [{
 252         'url': 'http://bangumi.bilibili.com/anime/1869',
 253         'info_dict': {
 254             'id': '1869',
 255             'title': '混沌武士',
 256             'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
 257         },
 258         'playlist_count': 26,
 259     }, {
 260         'url': 'http://bangumi.bilibili.com/anime/1869',
 261         'info_dict': {
 262             'id': '1869',
 263             'title': '混沌武士',
 264             'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
 265         },
 266         'playlist': [{
 267             'md5': '91da8621454dd58316851c27c68b0c13',
 268             'info_dict': {
 269                 'id': '40062',
 270                 'ext': 'mp4',
 271                 'title': '混沌武士',
 272                 'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日，酒馆里来了一群恶霸，虽然他们的举动令风十分不满，但是毕竟风只是一届女流，无法对他们采取什么行动，只能在心里嘟哝。这时，酒家里又进来了个“不良份子...',
 273                 'timestamp': 1414538739,
 274                 'upload_date': '20141028',
 275                 'episode': '疾风怒涛 Tempestuous Temperaments',
 276                 'episode_number': 1,
 277             },
 278         }],
 279         'params': {
 280             'playlist_items': '1',
 281         },
 282     }]
 283
 284     @classmethod
 285     def suitable(cls, url):
 286         return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url)
 287
 288     def _real_extract(self, url):
 289         bangumi_id = self._match_id(url)
 290
 291         # Sometimes this API returns a JSONP response
 292         season_info = self._download_json(
 293             'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id,
 294             bangumi_id, transform_source=strip_jsonp)['result']
 295
 296         entries = [{
 297             '_type': 'url_transparent',
 298             'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}),
 299             'ie_key': BiliBiliIE.ie_key(),
 300             'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '),
 301             'episode': episode.get('index_title'),
 302             'episode_number': int_or_none(episode.get('index')),
 303         } for episode in season_info['episodes']]
 304
 305         entries = sorted(entries, key=lambda entry: entry.get('episode_number'))
 306
 307         return self.playlist_result(
 308             entries, bangumi_id,
 309             season_info.get('bangumi_title'), season_info.get('evaluate'))
 310
 311
 312 class BilibiliAudioBaseIE(InfoExtractor):
 313     def _call_api(self, path, sid, query=None):
 314         if not query:
 315             query = {'sid': sid}
 316         return self._download_json(
 317             'https://www.bilibili.com/audio/music-service-c/web/' + path,
 318             sid, query=query)['data']
 319
 320
 321 class BilibiliAudioIE(BilibiliAudioBaseIE):
 322     _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)'
 323     _TEST = {
 324         'url': 'https://www.bilibili.com/audio/au1003142',
 325         'md5': 'fec4987014ec94ef9e666d4d158ad03b',
 326         'info_dict': {
 327             'id': '1003142',
 328             'ext': 'm4a',
 329             'title': '【tsukimi】YELLOW / 神山羊',
 330             'artist': 'tsukimi',
 331             'comment_count': int,
 332             'description': 'YELLOW的mp3版！',
 333             'duration': 183,
 334             'subtitles': {
 335                 'origin': [{
 336                     'ext': 'lrc',
 337                 }],
 338             },
 339             'thumbnail': r're:^https?://.+\.jpg',
 340             'timestamp': 1564836614,
 341             'upload_date': '20190803',
 342             'uploader': 'tsukimi-つきみぐー',
 343             'view_count': int,
 344         },
 345     }
 346
 347     def _real_extract(self, url):
 348         au_id = self._match_id(url)
 349
 350         play_data = self._call_api('url', au_id)
 351         formats = [{
 352             'url': play_data['cdns'][0],
 353             'filesize': int_or_none(play_data.get('size')),
 354         }]
 355
 356         song = self._call_api('song/info', au_id)
 357         title = song['title']
 358         statistic = song.get('statistic') or {}
 359
 360         subtitles = None
 361         lyric = song.get('lyric')
 362         if lyric:
 363             subtitles = {
 364                 'origin': [{
 365                     'url': lyric,
 366                 }]
 367             }
 368
 369         return {
 370             'id': au_id,
 371             'title': title,
 372             'formats': formats,
 373             'artist': song.get('author'),
 374             'comment_count': int_or_none(statistic.get('comment')),
 375             'description': song.get('intro'),
 376             'duration': int_or_none(song.get('duration')),
 377             'subtitles': subtitles,
 378             'thumbnail': song.get('cover'),
 379             'timestamp': int_or_none(song.get('passtime')),
 380             'uploader': song.get('uname'),
 381             'view_count': int_or_none(statistic.get('play')),
 382         }
 383
 384
 385 class BilibiliAudioAlbumIE(BilibiliAudioBaseIE):
 386     _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)'
 387     _TEST = {
 388         'url': 'https://www.bilibili.com/audio/am10624',
 389         'info_dict': {
 390             'id': '10624',
 391             'title': '每日新曲推荐（每日11:00更新）',
 392             'description': '每天11:00更新，为你推送最新音乐',
 393         },
 394         'playlist_count': 19,
 395     }
 396
 397     def _real_extract(self, url):
 398         am_id = self._match_id(url)
 399
 400         songs = self._call_api(
 401             'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data']
 402
 403         entries = []
 404         for song in songs:
 405             sid = str_or_none(song.get('id'))
 406             if not sid:
 407                 continue
 408             entries.append(self.url_result(
 409                 'https://www.bilibili.com/audio/au' + sid,
 410                 BilibiliAudioIE.ie_key(), sid))
 411
 412         if entries:
 413             album_data = self._call_api('menu/info', am_id) or {}
 414             album_title = album_data.get('title')
 415             if album_title:
 416                 for entry in entries:
 417                     entry['album'] = album_title
 418                 return self.playlist_result(
 419                     entries, am_id, album_title, album_data.get('intro'))
 420
 421         return self.playlist_result(entries, am_id)