Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/bilibili.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import calendar
   5 import datetime
   6 import re
   7
   8 from .common import InfoExtractor
   9 from ..compat import (
  10     compat_etree_fromstring,
  11     compat_str,
  12     compat_parse_qs,
  13     compat_xml_parse_error,
  14 )
  15 from ..utils import (
  16     ExtractorError,
  17     int_or_none,
  18     float_or_none,
  19     xpath_text,
  20 )
  21
  22
  23 class BiliBiliIE(InfoExtractor):
  24     _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)'
  25
  26     _TESTS = [{
  27         'url': 'http://www.bilibili.tv/video/av1074402/',
  28         'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
  29         'info_dict': {
  30             'id': '1554319',
  31             'ext': 'flv',
  32             'title': '【金坷垃】金泡沫',
  33             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
  34             'duration': 308.067,
  35             'timestamp': 1398012660,
  36             'upload_date': '20140420',
  37             'thumbnail': 're:^https?://.+\.jpg',
  38             'uploader': '菊子桑',
  39             'uploader_id': '156160',
  40         },
  41     }, {
  42         'url': 'http://www.bilibili.com/video/av1041170/',
  43         'info_dict': {
  44             'id': '1041170',
  45             'title': '【BD1080P】刀语【诸神&异域】',
  46             'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦！~',
  47         },
  48         'playlist_count': 9,
  49     }, {
  50         'url': 'http://www.bilibili.com/video/av4808130/',
  51         'info_dict': {
  52             'id': '4808130',
  53             'title': '【长篇】哆啦A梦443【钉铛】',
  54             'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉，又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
  55         },
  56         'playlist': [{
  57             'md5': '55cdadedf3254caaa0d5d27cf20a8f9c',
  58             'info_dict': {
  59                 'id': '4808130_part1',
  60                 'ext': 'flv',
  61                 'title': '【长篇】哆啦A梦443【钉铛】',
  62                 'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉，又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
  63                 'timestamp': 1464564180,
  64                 'upload_date': '20160529',
  65                 'uploader': '喜欢拉面',
  66                 'uploader_id': '151066',
  67             },
  68         }, {
  69             'md5': '926f9f67d0c482091872fbd8eca7ea3d',
  70             'info_dict': {
  71                 'id': '4808130_part2',
  72                 'ext': 'flv',
  73                 'title': '【长篇】哆啦A梦443【钉铛】',
  74                 'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉，又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
  75                 'timestamp': 1464564180,
  76                 'upload_date': '20160529',
  77                 'uploader': '喜欢拉面',
  78                 'uploader_id': '151066',
  79             },
  80         }, {
  81             'md5': '4b7b225b968402d7c32348c646f1fd83',
  82             'info_dict': {
  83                 'id': '4808130_part3',
  84                 'ext': 'flv',
  85                 'title': '【长篇】哆啦A梦443【钉铛】',
  86                 'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉，又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
  87                 'timestamp': 1464564180,
  88                 'upload_date': '20160529',
  89                 'uploader': '喜欢拉面',
  90                 'uploader_id': '151066',
  91             },
  92         }, {
  93             'md5': '7b795e214166501e9141139eea236e91',
  94             'info_dict': {
  95                 'id': '4808130_part4',
  96                 'ext': 'flv',
  97                 'title': '【长篇】哆啦A梦443【钉铛】',
  98                 'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉，又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
  99                 'timestamp': 1464564180,
 100                 'upload_date': '20160529',
 101                 'uploader': '喜欢拉面',
 102                 'uploader_id': '151066',
 103             },
 104         }],
 105     }, {
 106         # Missing upload time
 107         'url': 'http://www.bilibili.com/video/av1867637/',
 108         'info_dict': {
 109             'id': '2880301',
 110             'ext': 'flv',
 111             'title': '【HDTV】【喜剧】岳父岳母真难当 （2014）【法国票房冠军】',
 112             'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫，老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人，结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】',
 113             'uploader': '黑夜为猫',
 114             'uploader_id': '610729',
 115         },
 116         'params': {
 117             # Just to test metadata extraction
 118             'skip_download': True,
 119         },
 120         'expected_warnings': ['upload time'],
 121     }]
 122
 123     # BiliBili blocks keys from time to time. The current key is extracted from
 124     # the Android client
 125     # TODO: find the sign algorithm used in the flash player
 126     _APP_KEY = '86385cdc024c0f6c'
 127
 128     def _real_extract(self, url):
 129         mobj = re.match(self._VALID_URL, url)
 130         video_id = mobj.group('id')
 131
 132         webpage = self._download_webpage(url, video_id)
 133
 134         params = compat_parse_qs(self._search_regex(
 135             [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
 136              r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
 137             webpage, 'player parameters'))
 138         cid = params['cid'][0]
 139
 140         info_xml_str = self._download_webpage(
 141             'http://interface.bilibili.com/v_cdn_play',
 142             cid, query={'appkey': self._APP_KEY, 'cid': cid},
 143             note='Downloading video info page')
 144
 145         err_msg = None
 146         durls = None
 147         info_xml = None
 148         try:
 149             info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8'))
 150         except compat_xml_parse_error:
 151             info_json = self._parse_json(info_xml_str, video_id, fatal=False)
 152             err_msg = (info_json or {}).get('error_text')
 153         else:
 154             err_msg = xpath_text(info_xml, './message')
 155
 156         if info_xml is not None:
 157             durls = info_xml.findall('./durl')
 158         if not durls:
 159             if err_msg:
 160                 raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True)
 161             else:
 162                 raise ExtractorError('No videos found!')
 163
 164         entries = []
 165
 166         for durl in durls:
 167             size = xpath_text(durl, ['./filesize', './size'])
 168             formats = [{
 169                 'url': durl.find('./url').text,
 170                 'filesize': int_or_none(size),
 171             }]
 172             for backup_url in durl.findall('./backup_url/url'):
 173                 formats.append({
 174                     'url': backup_url.text,
 175                     # backup URLs have lower priorities
 176                     'preference': -2 if 'hd.mp4' in backup_url.text else -3,
 177                 })
 178
 179             self._sort_formats(formats)
 180
 181             entries.append({
 182                 'id': '%s_part%s' % (cid, xpath_text(durl, './order')),
 183                 'duration': int_or_none(xpath_text(durl, './length'), 1000),
 184                 'formats': formats,
 185             })
 186
 187         title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title')
 188         description = self._html_search_meta('description', webpage)
 189         datetime_str = self._html_search_regex(
 190             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False)
 191         timestamp = None
 192         if datetime_str:
 193             timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple())
 194
 195         # TODO 'view_count' requires deobfuscating Javascript
 196         info = {
 197             'id': compat_str(cid),
 198             'title': title,
 199             'description': description,
 200             'timestamp': timestamp,
 201             'thumbnail': self._html_search_meta('thumbnailUrl', webpage),
 202             'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000),
 203         }
 204
 205         uploader_mobj = re.search(
 206             r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"',
 207             webpage)
 208         if uploader_mobj:
 209             info.update({
 210                 'uploader': uploader_mobj.group('name'),
 211                 'uploader_id': uploader_mobj.group('id'),
 212             })
 213
 214         for entry in entries:
 215             entry.update(info)
 216
 217         if len(entries) == 1:
 218             return entries[0]
 219         else:
 220             for idx, entry in enumerate(entries):
 221                 entry['id'] = '%s_part%d' % (video_id, (idx + 1))
 222
 223             return {
 224                 '_type': 'multi_video',
 225                 'id': video_id,
 226                 'title': title,
 227                 'description': description,
 228                 'entries': entries,
 229             }