]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/grooveshark.py
2 from __future__
import unicode_literals
10 from .common
import InfoExtractor
11 from ..utils
import ExtractorError
, compat_urllib_request
, compat_html_parser
19 class GroovesharkHtmlParser(compat_html_parser
.HTMLParser
):
21 self
._current
_object
= None
23 compat_html_parser
.HTMLParser
.__init
__(self
)
25 def handle_starttag(self
, tag
, attrs
):
26 attrs
= dict((k
, v
) for k
, v
in attrs
)
28 self
._current
_object
= {'attrs': attrs
, 'params': []}
30 self
._current
_object
['params'].append(attrs
)
32 def handle_endtag(self
, tag
):
34 self
.objects
.append(self
._current
_object
)
35 self
._current
_object
= None
38 def extract_object_tags(cls
, html
):
45 class GroovesharkIE(InfoExtractor
):
46 _VALID_URL
= r
'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)'
48 'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5',
49 'md5': '7ecf8aefa59d6b2098517e1baa530023',
52 'title': 'Jolene (Tenth Key Remix ft. Will Sessions)',
58 do_playerpage_request
= True
59 do_bootstrap_request
= True
61 def _parse_target(self
, target
):
62 uri
= compat_urlparse
.urlparse(target
)
63 hash = uri
.fragment
[1:].split('?')[0]
64 token
= os
.path
.basename(hash.rstrip('/'))
65 return (uri
, hash, token
)
def _build_bootstrap_url(self, target):
    """Build the ``/preload.php`` URL that asks for a communication token.

    Returns a ``(url, token)`` tuple, where ``token`` is the third
    element produced by ``_parse_target`` for *target*.
    """
    parsed, fragment_path, token = self._parse_target(target)
    quoted_hash = compat_urllib_parse.quote(fragment_path, safe='')
    # self.ts is appended as a bare trailing query component.
    query = 'getCommunicationToken=1&hash=%s&%d' % (quoted_hash, self.ts)
    url_parts = (parsed.scheme, parsed.netloc, '/preload.php', None, query, None)
    return (compat_urlparse.urlunparse(url_parts), token)
def _build_meta_url(self, target):
    """Build the ``/preload.php`` URL used to request track metadata.

    Returns a ``(url, token)`` tuple, where ``token`` is the third
    element produced by ``_parse_target`` for *target*.
    """
    parsed, fragment_path, token = self._parse_target(target)
    quoted_hash = compat_urllib_parse.quote(fragment_path, safe='')
    # self.ts is appended as a bare trailing query component.
    query = 'hash=%s&%d' % (quoted_hash, self.ts)
    url_parts = (parsed.scheme, parsed.netloc, '/preload.php', None, query, None)
    return (compat_urlparse.urlunparse(url_parts), token)
77 def _build_stream_url(self
, meta
):
78 return compat_urlparse
.urlunparse(('http', meta
['streamKey']['ip'], '/stream.php', None, None, None))
def _build_swf_referer(self, target, obj):
    """Return a URL on *target*'s scheme and host whose path is ``obj['attrs']['data']``."""
    parsed, _, _ = self._parse_target(target)
    swf_path = obj['attrs']['data']
    url_parts = (parsed.scheme, parsed.netloc, swf_path, None, None, None)
    return compat_urlparse.urlunparse(url_parts)
84 def _transform_bootstrap(self
, js
):
85 return re
.split('(?m)^\s*try\s*{', js
)[0] \
86 .split(' = ', 1)[1].strip().rstrip(';')
88 def _transform_meta(self
, js
):
89 return js
.split('\n')[0].split('=')[1].rstrip(';')
91 def _get_meta(self
, target
):
92 (meta_url
, token
) = self
._build
_meta
_url
(target
)
93 self
.to_screen('Metadata URL: %s' % meta_url
)
95 headers
= {'Referer': compat_urlparse
.urldefrag(target
)[0]}
96 req
= compat_urllib_request
.Request(meta_url
, headers
=headers
)
97 res
= self
._download
_json
(req
, token
,
98 transform_source
=self
._transform
_meta
)
100 if 'getStreamKeyWithSong' not in res
:
101 raise ExtractorError(
102 'Metadata not found. URL may be malformed, or Grooveshark API may have changed.')
104 if res
['getStreamKeyWithSong'] is None:
105 raise ExtractorError(
106 'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.',
109 return res
['getStreamKeyWithSong']
111 def _get_bootstrap(self
, target
):
112 (bootstrap_url
, token
) = self
._build
_bootstrap
_url
(target
)
114 headers
= {'Referer': compat_urlparse
.urldefrag(target
)[0]}
115 req
= compat_urllib_request
.Request(bootstrap_url
, headers
=headers
)
116 res
= self
._download
_json
(req
, token
, fatal
=False,
117 note
='Downloading player bootstrap data',
118 errnote
='Unable to download player bootstrap data',
119 transform_source
=self
._transform
_bootstrap
)
122 def _get_playerpage(self
, target
):
123 (_
, _
, token
) = self
._parse
_target
(target
)
125 webpage
= self
._download
_webpage
(
127 note
='Downloading player page',
128 errnote
='Unable to download player page',
131 if webpage
is not None:
132 # Search (for example German) error message
133 error_msg
= self
._html
_search
_regex
(
134 r
'<div id="content">\s*<h2>(.*?)</h2>', webpage
,
135 'error message', default
=None)
136 if error_msg
is not None:
137 error_msg
= error_msg
.replace('\n', ' ')
138 raise ExtractorError('Grooveshark said: %s' % error_msg
)
140 if webpage
is not None:
141 o
= GroovesharkHtmlParser
.extract_object_tags(webpage
)
142 return (webpage
, [x
for x
in o
if x
['attrs']['id'] == 'jsPlayerEmbed'])
144 return (webpage
, None)
146 def _real_initialize(self
):
147 self
.ts
= int(time
.time() * 1000) # timestamp in millis
149 def _real_extract(self
, url
):
150 (target_uri
, _
, token
) = self
._parse
_target
(url
)
152 # 1. Fill cookiejar by making a request to the player page
154 if self
.do_playerpage_request
:
155 (_
, player_objs
) = self
._get
_playerpage
(url
)
156 if player_objs
is not None:
157 swf_referer
= self
._build
_swf
_referer
(url
, player_objs
[0])
158 self
.to_screen('SWF Referer: %s' % swf_referer
)
160 # 2. Ask preload.php for swf bootstrap data to better mimic webapp
161 if self
.do_bootstrap_request
:
162 bootstrap
= self
._get
_bootstrap
(url
)
163 self
.to_screen('CommunicationToken: %s' % bootstrap
['getCommunicationToken'])
165 # 3. Ask preload.php for track metadata.
166 meta
= self
._get
_meta
(url
)
168 # 4. Construct stream request for track.
169 stream_url
= self
._build
_stream
_url
(meta
)
170 duration
= int(math
.ceil(float(meta
['streamKey']['uSecs']) / 1000000))
171 post_dict
= {'streamKey': meta
['streamKey']['streamKey']}
172 post_data
= compat_urllib_parse
.urlencode(post_dict
).encode('utf-8')
174 'Content-Length': len(post_data
),
175 'Content-Type': 'application/x-www-form-urlencoded'
177 if swf_referer
is not None:
178 headers
['Referer'] = swf_referer
182 'title': meta
['song']['Name'],
183 'http_method': 'POST',
186 'format': 'mp3 audio',
187 'duration': duration
,
188 'http_post_data': post_data
,
189 'http_headers': headers
,