]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/kuwo.py
   2 from __future__ 
import unicode_literals
 
   6 from .common 
import InfoExtractor
 
   7 from ..compat 
import compat_urlparse
 
  17 class KuwoBaseIE(InfoExtractor
): 
  19         {'format': 'ape', 'ext': 'ape', 'preference': 100}, 
  20         {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80}, 
  21         {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70}, 
  22         {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60}, 
  23         {'format': 'wma', 'ext': 'wma', 'preference': 20}, 
  24         {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} 
  27     def _get_formats(self
, song_id
, tolerate_ip_deny
=False): 
  29         for file_format 
in self
._FORMATS
: 
  31                 'format': file_format
['ext'], 
  32                 'br': file_format
.get('br', ''), 
  33                 'rid': 'MUSIC_%s' % song_id
, 
  34                 'type': 'convert_url', 
  38             song_url 
= self
._download
_webpage
( 
  39                 'http://antiserver.kuwo.cn/anti.s', 
  40                 song_id
, note
='Download %s url info' % file_format
['format'], 
  41                 query
=query
, headers
=self
.geo_verification_headers(), 
  44             if song_url 
== 'IPDeny' and not tolerate_ip_deny
: 
  45                 raise ExtractorError('This song is blocked in this region', expected
=True) 
  47             if song_url
.startswith('http://') or song_url
.startswith('https://'): 
  50                     'format_id': file_format
['format'], 
  51                     'format': file_format
['format'], 
  52                     'preference': file_format
['preference'], 
  53                     'abr': file_format
.get('abr'), 
  59 class KuwoIE(KuwoBaseIE
): 
  62     _VALID_URL 
= r
'https?://(?:www\.)?kuwo\.cn/yinyue/(?P<id>\d+)' 
  64         'url': 'http://www.kuwo.cn/yinyue/635632/', 
  70             'upload_date': '20080122', 
  71             'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' 
  73         'skip': 'this song has been offline because of copyright issues', 
  75         'url': 'http://www.kuwo.cn/yinyue/6446136/', 
  80             'description': 'md5:5d0e947b242c35dc0eb1d2fce9fbf02c', 
  82             'upload_date': '20150518', 
  88         'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016', 
  89         'only_matching': True, 
  92     def _real_extract(self
, url
): 
  93         song_id 
= self
._match
_id
(url
) 
  94         webpage
, urlh 
= self
._download
_webpage
_handle
( 
  95             url
, song_id
, note
='Download song detail info', 
  96             errnote
='Unable to get song detail info') 
  97         if song_id 
not in urlh
.geturl() or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage
: 
  98             raise ExtractorError('this song has been offline because of copyright issues', expected
=True) 
 100         song_name 
= self
._html
_search
_regex
( 
 101             r
'<p[^>]+id="lrcName">([^<]+)</p>', webpage
, 'song name') 
 102         singer_name 
= remove_start(self
._html
_search
_regex
( 
 103             r
'<a[^>]+href="http://www\.kuwo\.cn/artist/content\?name=([^"]+)">', 
 104             webpage
, 'singer name', fatal
=False), '歌手') 
 105         lrc_content 
= clean_html(get_element_by_id('lrcContent', webpage
)) 
 106         if lrc_content 
== '暂无':     # indicates no lyrics 
 109         formats 
= self
._get
_formats
(song_id
) 
 110         self
._sort
_formats
(formats
) 
 112         album_id 
= self
._html
_search
_regex
( 
 113             r
'<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"', 
 114             webpage
, 'album id', fatal
=False) 
 117         if album_id 
is not None: 
 118             album_info_page 
= self
._download
_webpage
( 
 119                 'http://www.kuwo.cn/album/%s/' % album_id
, song_id
, 
 120                 note
='Download album detail info', 
 121                 errnote
='Unable to get album detail info') 
 123             publish_time 
= self
._html
_search
_regex
( 
 124                 r
'发行时间:(\d{4}-\d{2}-\d{2})', album_info_page
, 
 125                 'publish time', fatal
=False) 
 127                 publish_time 
= publish_time
.replace('-', '') 
 132             'creator': singer_name
, 
 133             'upload_date': publish_time
, 
 134             'description': lrc_content
, 
 139 class KuwoAlbumIE(InfoExtractor
): 
 140     IE_NAME 
= 'kuwo:album' 
 141     IE_DESC 
= '酷我音乐 - 专辑' 
 142     _VALID_URL 
= r
'https?://(?:www\.)?kuwo\.cn/album/(?P<id>\d+?)/' 
 144         'url': 'http://www.kuwo.cn/album/502294/', 
 147             'title': 'Made\xa0Series\xa0《M》', 
 148             'description': 'md5:d463f0d8a0ff3c3ea3d6ed7452a9483f', 
 153     def _real_extract(self
, url
): 
 154         album_id 
= self
._match
_id
(url
) 
 156         webpage 
= self
._download
_webpage
( 
 157             url
, album_id
, note
='Download album info', 
 158             errnote
='Unable to get album info') 
 160         album_name 
= self
._html
_search
_regex
( 
 161             r
'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage
, 
 163         album_intro 
= remove_start( 
 164             clean_html(get_element_by_id('intro', webpage
)), 
 165             '%s简介:' % album_name
) 
 168             self
.url_result(song_url
, 'Kuwo') for song_url 
in re
.findall( 
 169                 r
'<p[^>]+class="listen"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"', 
 172         return self
.playlist_result(entries
, album_id
, album_name
, album_intro
) 
 175 class KuwoChartIE(InfoExtractor
): 
 176     IE_NAME 
= 'kuwo:chart' 
 177     IE_DESC 
= '酷我音乐 - 排行榜' 
 178     _VALID_URL 
= r
'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' 
 180         'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', 
 184         'playlist_mincount': 7, 
 187     def _real_extract(self
, url
): 
 188         chart_id 
= self
._match
_id
(url
) 
 189         webpage 
= self
._download
_webpage
( 
 190             url
, chart_id
, note
='Download chart info', 
 191             errnote
='Unable to get chart info') 
 194             self
.url_result(song_url
, 'Kuwo') for song_url 
in re
.findall( 
 195                 r
'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage
) 
 197         return self
.playlist_result(entries
, chart_id
) 
 200 class KuwoSingerIE(InfoExtractor
): 
 201     IE_NAME 
= 'kuwo:singer' 
 202     IE_DESC 
= '酷我音乐 - 歌手' 
 203     _VALID_URL 
= r
'https?://(?:www\.)?kuwo\.cn/mingxing/(?P<id>[^/]+)' 
 205         'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 
 208             'title': 'Bruno\xa0Mars', 
 210         'playlist_mincount': 329, 
 212         'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', 
 217         'playlist_mincount': 95, 
 218         'skip': 'Regularly stalls travis build',  # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540 
 223     def _real_extract(self
, url
): 
 224         singer_id 
= self
._match
_id
(url
) 
 225         webpage 
= self
._download
_webpage
( 
 226             url
, singer_id
, note
='Download singer info', 
 227             errnote
='Unable to get singer info') 
 229         singer_name 
= self
._html
_search
_regex
( 
 230             r
'<h1>([^<]+)</h1>', webpage
, 'singer name') 
 232         artist_id 
= self
._html
_search
_regex
( 
 233             r
'data-artistid="(\d+)"', webpage
, 'artist id') 
 235         page_count 
= int(self
._html
_search
_regex
( 
 236             r
'data-page="(\d+)"', webpage
, 'page count')) 
 238         def page_func(page_num
): 
 239             webpage 
= self
._download
_webpage
( 
 240                 'http://www.kuwo.cn/artist/contentMusicsAjax', 
 241                 singer_id
, note
='Download song list page #%d' % (page_num 
+ 1), 
 242                 errnote
='Unable to get song list page #%d' % (page_num 
+ 1), 
 243                 query
={'artistId': artist_id
, 'pn': page_num
, 'rn': self
.PAGE_SIZE
}) 
 246                 self
.url_result(compat_urlparse
.urljoin(url
, song_url
), 'Kuwo') 
 247                 for song_url 
in re
.findall( 
 248                     r
'<div[^>]+class="name"><a[^>]+href="(/yinyue/\d+)', 
 252         entries 
= InAdvancePagedList(page_func
, page_count
, self
.PAGE_SIZE
) 
 254         return self
.playlist_result(entries
, singer_id
, singer_name
) 
 257 class KuwoCategoryIE(InfoExtractor
): 
 258     IE_NAME 
= 'kuwo:category' 
 259     IE_DESC 
= '酷我音乐 - 分类' 
 260     _VALID_URL 
= r
'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' 
 262         'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', 
 266             'description': '这些都是属于八十年代的回忆!', 
 268         'playlist_mincount': 24, 
 271     def _real_extract(self
, url
): 
 272         category_id 
= self
._match
_id
(url
) 
 273         webpage 
= self
._download
_webpage
( 
 274             url
, category_id
, note
='Download category info', 
 275             errnote
='Unable to get category info') 
 277         category_name 
= self
._html
_search
_regex
( 
 278             r
'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage
, 'category name') 
 280         category_desc 
= remove_start( 
 281             get_element_by_id('intro', webpage
).strip(), 
 282             '%s简介:' % category_name
) 
 283         if category_desc 
== '暂无': 
 286         jsonm 
= self
._parse
_json
(self
._html
_search
_regex
( 
 287             r
'var\s+jsonm\s*=\s*([^;]+);', webpage
, 'category songs'), category_id
) 
 290             self
.url_result('http://www.kuwo.cn/yinyue/%s/' % song
['musicrid'], 'Kuwo') 
 291             for song 
in jsonm
['musiclist'] 
 293         return self
.playlist_result(entries
, category_id
, category_name
, category_desc
) 
 296 class KuwoMvIE(KuwoBaseIE
): 
 298     IE_DESC 
= '酷我音乐 - MV' 
 299     _VALID_URL 
= r
'https?://(?:www\.)?kuwo\.cn/mv/(?P<id>\d+?)/' 
 301         'url': 'http://www.kuwo.cn/mv/6480076/', 
 305             'title': 'My HouseMV', 
 308         # In this video, music URLs (anti.s) are blocked outside China and 
 309         # USA, while the MV URL (mvurl) is available globally, so force the MV 
 310         # URL for consistent results in different countries 
 315     _FORMATS 
= KuwoBaseIE
._FORMATS 
+ [ 
 316         {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, 
 317         {'format': 'mp4', 'ext': 'mp4', 'preference': 200}, 
 320     def _real_extract(self
, url
): 
 321         song_id 
= self
._match
_id
(url
) 
 322         webpage 
= self
._download
_webpage
( 
 323             url
, song_id
, note
='Download mv detail info: %s' % song_id
, 
 324             errnote
='Unable to get mv detail info: %s' % song_id
) 
 327             r
'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"', 
 330             song_name 
= mobj
.group('song') 
 331             singer_name 
= mobj
.group('singer') 
 333             raise ExtractorError('Unable to find song or singer names') 
 335         formats 
= self
._get
_formats
(song_id
, tolerate_ip_deny
=True) 
 337         mv_url 
= self
._download
_webpage
( 
 338             'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id
, 
 339             song_id
, note
='Download %s MV URL' % song_id
) 
 345         self
._sort
_formats
(formats
) 
 350             'creator': singer_name
,