]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/bandcamp.py
1 from __future__
import unicode_literals
8 from .common
import InfoExtractor
23 class BandcampIE(InfoExtractor
):
24 _VALID_URL
= r
'https?://.*?\.bandcamp\.com/track/(?P<title>.*)'
26 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
27 'md5': 'c557841d5e50261777a6585648adf439',
31 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
34 '_skip': 'There is a limit of 200 free downloads / month for the test song'
36 'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
37 'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
41 'title': 'Ben Prunty - Lanius (Battle)',
42 'uploader': 'Ben Prunty',
46 def _real_extract(self
, url
):
47 mobj
= re
.match(self
._VALID
_URL
, url
)
48 title
= mobj
.group('title')
49 webpage
= self
._download
_webpage
(url
, title
)
50 thumbnail
= self
._html
_search
_meta
('og:image', webpage
, default
=None)
51 m_download
= re
.search(r
'freeDownloadPage: "(.*?)"', webpage
)
53 m_trackinfo
= re
.search(r
'trackinfo: (.+),\s*?\n', webpage
)
55 json_code
= m_trackinfo
.group(1)
56 data
= json
.loads(json_code
)[0]
57 track_id
= compat_str(data
['id'])
59 if not data
.get('file'):
60 raise ExtractorError('Not streamable', video_id
=track_id
, expected
=True)
63 for format_id
, format_url
in data
['file'].items():
64 ext
, abr_str
= format_id
.split('-', 1)
66 'format_id': format_id
,
67 'url': self
._proto
_relative
_url
(format_url
, 'http:'),
71 'abr': int_or_none(abr_str
),
74 self
._sort
_formats
(formats
)
78 'title': data
['title'],
79 'thumbnail': thumbnail
,
81 'duration': float_or_none(data
.get('duration')),
84 raise ExtractorError('No free songs found')
86 download_link
= m_download
.group(1)
87 video_id
= self
._search
_regex
(
88 r
'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
91 download_webpage
= self
._download
_webpage
(
92 download_link
, video_id
, 'Downloading free downloads page')
94 blob
= self
._parse
_json
(
96 r
'data-blob=(["\'])(?P
<blob
>{.+?
})\
1', download_webpage,
97 'blob
', group='blob
'),
98 video_id, transform_source=unescapeHTML)
100 info = blob['digital_items
'][0]
102 downloads = info['downloads
']
103 track = info['title
']
105 artist = info.get('artist
')
106 title = '%s - %s' % (artist, track) if artist else track
108 download_formats = {}
109 for f in blob['download_formats
']:
110 name, ext = f.get('name
'), f.get('file_extension
')
111 if all(isinstance(x, compat_str) for x in (name, ext)):
112 download_formats[name] = ext.strip('.')
115 for format_id, f in downloads.items():
116 format_url = f.get('url
')
119 # Stat URL generation algorithm is reverse engineered from
120 # download_*_bundle_*.js
121 stat_url = update_url_query(
122 format_url.replace('/download
/', '/statdownload
/'), {
123 '.rand
': int(time.time() * 1000 * random.random()),
125 format_id = f.get('encoding_name
') or format_id
126 stat = self._download_json(
127 stat_url, video_id, 'Downloading
%s JSON
' % format_id,
128 transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
132 retry_url = stat.get('retry_url
')
133 if not isinstance(retry_url, compat_str):
136 'url
': self._proto_relative_url(retry_url, 'http
:'),
137 'ext
': download_formats.get(format_id),
138 'format_id
': format_id,
139 'format_note
': f.get('description
'),
140 'filesize
': parse_filesize(f.get('size_mb
')),
143 self._sort_formats(formats)
148 'thumbnail
': info.get('thumb_url
') or thumbnail,
149 'uploader
': info.get('artist
'),
156 class BandcampAlbumIE(InfoExtractor):
157 IE_NAME = 'Bandcamp
:album
'
158 _VALID_URL = r'https?
://(?
:(?P
<subdomain
>[^
.]+)\
.)?bandcamp\
.com(?
:/album
/(?P
<album_id
>[^?
#]+)|/?(?:$|[?#]))'
161 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
164 'md5': '39bc1eded3476e927c724321ddf116cf',
172 'md5': '1a2c32e2691474643e912cc6cd4bffaa',
176 'title': 'Kero One - Keep It Alive (Blazo remix)',
181 'title': 'Jazz Format Mixtape vol.1',
182 'id': 'jazz-format-mixtape-vol-1',
183 'uploader_id': 'blazo',
188 'skip': 'Bandcamp imposes download limits.'
190 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
192 'title': 'Hierophany of the Open Grave',
193 'uploader_id': 'nightbringer',
194 'id': 'hierophany-of-the-open-grave',
196 'playlist_mincount': 9,
198 'url': 'http://dotscale.bandcamp.com',
202 'uploader_id': 'dotscale',
204 'playlist_mincount': 7,
206 # with escaped quote in title
207 'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
209 'title': '"Entropy" EP',
210 'uploader_id': 'jstrecords',
213 'playlist_mincount': 3,
215 # not all tracks have songs
216 'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
218 'id': 'we-are-the-plague',
219 'title': 'WE ARE THE PLAGUE',
220 'uploader_id': 'insulters',
225 def _real_extract(self
, url
):
226 mobj
= re
.match(self
._VALID
_URL
, url
)
227 uploader_id
= mobj
.group('subdomain')
228 album_id
= mobj
.group('album_id')
229 playlist_id
= album_id
or uploader_id
230 webpage
= self
._download
_webpage
(url
, playlist_id
)
231 track_elements
= re
.findall(
232 r
'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage
)
233 if not track_elements
:
234 raise ExtractorError('The page doesn\'t contain any tracks')
235 # Only tracks with duration info have songs
237 self
.url_result(compat_urlparse
.urljoin(url
, t_path
), ie
=BandcampIE
.ie_key())
238 for elem_content
, t_path
in track_elements
239 if self
._html
_search
_meta
('duration', elem_content
, default
=None)]
241 title
= self
._html
_search
_regex
(
242 r
'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
243 webpage
, 'title', fatal
=False)
245 title
= title
.replace(r
'\"', '"')
248 'uploader_id': uploader_id
,