]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/grooveshark.py
2 from __future__
import unicode_literals
10 from .common
import InfoExtractor
11 from ..compat
import (
14 compat_urllib_request
,
17 from ..utils
import ExtractorError
20 class GroovesharkHtmlParser(compat_html_parser
.HTMLParser
):
22 self
._current
_object
= None
24 compat_html_parser
.HTMLParser
.__init
__(self
)
26 def handle_starttag(self
, tag
, attrs
):
27 attrs
= dict((k
, v
) for k
, v
in attrs
)
29 self
._current
_object
= {'attrs': attrs
, 'params': []}
31 self
._current
_object
['params'].append(attrs
)
33 def handle_endtag(self
, tag
):
35 self
.objects
.append(self
._current
_object
)
36 self
._current
_object
= None
39 def extract_object_tags(cls
, html
):
46 class GroovesharkIE(InfoExtractor
):
47 _VALID_URL
= r
'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)'
49 'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5',
50 'md5': '7ecf8aefa59d6b2098517e1baa530023',
53 'title': 'Jolene (Tenth Key Remix ft. Will Sessions)',
59 do_playerpage_request
= True
60 do_bootstrap_request
= True
62 def _parse_target(self
, target
):
63 uri
= compat_urlparse
.urlparse(target
)
64 hash = uri
.fragment
[1:].split('?')[0]
65 token
= os
.path
.basename(hash.rstrip('/'))
66 return (uri
, hash, token
)
68 def _build_bootstrap_url(self
, target
):
69 (uri
, hash, token
) = self
._parse
_target
(target
)
70 query
= 'getCommunicationToken=1&hash=%s&%d' % (compat_urllib_parse
.quote(hash, safe
=''), self
.ts
)
71 return (compat_urlparse
.urlunparse((uri
.scheme
, uri
.netloc
, '/preload.php', None, query
, None)), token
)
73 def _build_meta_url(self
, target
):
74 (uri
, hash, token
) = self
._parse
_target
(target
)
75 query
= 'hash=%s&%d' % (compat_urllib_parse
.quote(hash, safe
=''), self
.ts
)
76 return (compat_urlparse
.urlunparse((uri
.scheme
, uri
.netloc
, '/preload.php', None, query
, None)), token
)
78 def _build_stream_url(self
, meta
):
79 return compat_urlparse
.urlunparse(('http', meta
['streamKey']['ip'], '/stream.php', None, None, None))
81 def _build_swf_referer(self
, target
, obj
):
82 (uri
, _
, _
) = self
._parse
_target
(target
)
83 return compat_urlparse
.urlunparse((uri
.scheme
, uri
.netloc
, obj
['attrs']['data'], None, None, None))
85 def _transform_bootstrap(self
, js
):
86 return re
.split('(?m)^\s*try\s*\{', js
)[0] \
87 .split(' = ', 1)[1].strip().rstrip(';')
89 def _transform_meta(self
, js
):
90 return js
.split('\n')[0].split('=')[1].rstrip(';')
92 def _get_meta(self
, target
):
93 (meta_url
, token
) = self
._build
_meta
_url
(target
)
94 self
.to_screen('Metadata URL: %s' % meta_url
)
96 headers
= {'Referer': compat_urlparse
.urldefrag(target
)[0]}
97 req
= compat_urllib_request
.Request(meta_url
, headers
=headers
)
98 res
= self
._download
_json
(req
, token
,
99 transform_source
=self
._transform
_meta
)
101 if 'getStreamKeyWithSong' not in res
:
102 raise ExtractorError(
103 'Metadata not found. URL may be malformed, or Grooveshark API may have changed.')
105 if res
['getStreamKeyWithSong'] is None:
106 raise ExtractorError(
107 'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.',
110 return res
['getStreamKeyWithSong']
112 def _get_bootstrap(self
, target
):
113 (bootstrap_url
, token
) = self
._build
_bootstrap
_url
(target
)
115 headers
= {'Referer': compat_urlparse
.urldefrag(target
)[0]}
116 req
= compat_urllib_request
.Request(bootstrap_url
, headers
=headers
)
117 res
= self
._download
_json
(req
, token
, fatal
=False,
118 note
='Downloading player bootstrap data',
119 errnote
='Unable to download player bootstrap data',
120 transform_source
=self
._transform
_bootstrap
)
123 def _get_playerpage(self
, target
):
124 (_
, _
, token
) = self
._parse
_target
(target
)
126 webpage
= self
._download
_webpage
(
128 note
='Downloading player page',
129 errnote
='Unable to download player page',
132 if webpage
is not None:
133 # Search (for example German) error message
134 error_msg
= self
._html
_search
_regex
(
135 r
'<div id="content">\s*<h2>(.*?)</h2>', webpage
,
136 'error message', default
=None)
137 if error_msg
is not None:
138 error_msg
= error_msg
.replace('\n', ' ')
139 raise ExtractorError('Grooveshark said: %s' % error_msg
)
141 if webpage
is not None:
142 o
= GroovesharkHtmlParser
.extract_object_tags(webpage
)
143 return (webpage
, [x
for x
in o
if x
['attrs']['id'] == 'jsPlayerEmbed'])
145 return (webpage
, None)
147 def _real_initialize(self
):
148 self
.ts
= int(time
.time() * 1000) # timestamp in millis
150 def _real_extract(self
, url
):
151 (target_uri
, _
, token
) = self
._parse
_target
(url
)
153 # 1. Fill cookiejar by making a request to the player page
155 if self
.do_playerpage_request
:
156 (_
, player_objs
) = self
._get
_playerpage
(url
)
157 if player_objs
is not None:
158 swf_referer
= self
._build
_swf
_referer
(url
, player_objs
[0])
159 self
.to_screen('SWF Referer: %s' % swf_referer
)
161 # 2. Ask preload.php for swf bootstrap data to better mimic webapp
162 if self
.do_bootstrap_request
:
163 bootstrap
= self
._get
_bootstrap
(url
)
164 self
.to_screen('CommunicationToken: %s' % bootstrap
['getCommunicationToken'])
166 # 3. Ask preload.php for track metadata.
167 meta
= self
._get
_meta
(url
)
169 # 4. Construct stream request for track.
170 stream_url
= self
._build
_stream
_url
(meta
)
171 duration
= int(math
.ceil(float(meta
['streamKey']['uSecs']) / 1000000))
172 post_dict
= {'streamKey': meta
['streamKey']['streamKey']}
173 post_data
= compat_urllib_parse
.urlencode(post_dict
).encode('utf-8')
175 'Content-Length': len(post_data
),
176 'Content-Type': 'application/x-www-form-urlencoded'
178 if swf_referer
is not None:
179 headers
['Referer'] = swf_referer
183 'title': meta
['song']['Name'],
184 'http_method': 'POST',
187 'format': 'mp3 audio',
188 'duration': duration
,
189 'http_post_data': post_data
,
190 'http_headers': headers
,