| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  | # coding: utf-8 | 
					
						
							|  |  |  | from __future__ import unicode_literals | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import time | 
					
						
							|  |  |  | import math | 
					
						
							| 
									
										
										
										
											2014-08-24 02:06:59 +02:00
										 |  |  | import os.path | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  | import re | 
					
						
							| 
									
										
										
										
											2014-05-20 22:28:32 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | from .common import InfoExtractor | 
					
						
							| 
									
										
										
										
											2014-11-04 23:51:01 +01:00
										 |  |  | from ..compat import ( | 
					
						
							|  |  |  |     compat_html_parser, | 
					
						
							| 
									
										
										
										
											2014-08-24 02:06:59 +02:00
										 |  |  |     compat_urllib_parse, | 
					
						
							| 
									
										
										
										
											2014-11-04 23:51:01 +01:00
										 |  |  |     compat_urllib_request, | 
					
						
							| 
									
										
										
										
											2014-08-24 02:06:59 +02:00
										 |  |  |     compat_urlparse, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2014-11-04 23:51:01 +01:00
										 |  |  | from ..utils import ExtractorError | 
					
						
							| 
									
										
										
										
											2014-05-20 22:28:32 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  | class GroovesharkHtmlParser(compat_html_parser.HTMLParser): | 
					
						
							|  |  |  |     def __init__(self): | 
					
						
							|  |  |  |         self._current_object = None | 
					
						
							|  |  |  |         self.objects = [] | 
					
						
							|  |  |  |         compat_html_parser.HTMLParser.__init__(self) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def handle_starttag(self, tag, attrs): | 
					
						
							|  |  |  |         attrs = dict((k, v) for k, v in attrs) | 
					
						
							|  |  |  |         if tag == 'object': | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  |             self._current_object = {'attrs': attrs, 'params': []} | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         elif tag == 'param': | 
					
						
							|  |  |  |             self._current_object['params'].append(attrs) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |     def handle_endtag(self, tag): | 
					
						
							|  |  |  |         if tag == 'object': | 
					
						
							|  |  |  |             self.objects.append(self._current_object) | 
					
						
							|  |  |  |             self._current_object = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @classmethod | 
					
						
							|  |  |  |     def extract_object_tags(cls, html): | 
					
						
							|  |  |  |         p = cls() | 
					
						
							|  |  |  |         p.feed(html) | 
					
						
							|  |  |  |         p.close() | 
					
						
							|  |  |  |         return p.objects | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  | class GroovesharkIE(InfoExtractor): | 
					
						
							|  |  |  |     _VALID_URL = r'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)' | 
					
						
							|  |  |  |     _TEST = { | 
					
						
							|  |  |  |         'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5', | 
					
						
							| 
									
										
										
										
											2014-08-24 01:32:12 +02:00
										 |  |  |         'md5': '7ecf8aefa59d6b2098517e1baa530023', | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         'info_dict': { | 
					
						
							|  |  |  |             'id': '6SS1DW', | 
					
						
							|  |  |  |             'title': 'Jolene (Tenth Key Remix ft. Will Sessions)', | 
					
						
							|  |  |  |             'ext': 'mp3', | 
					
						
							| 
									
										
										
										
											2014-08-24 01:31:35 +02:00
										 |  |  |             'duration': 227, | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     do_playerpage_request = True | 
					
						
							|  |  |  |     do_bootstrap_request = True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def _parse_target(self, target): | 
					
						
							| 
									
										
										
										
											2014-08-24 01:31:35 +02:00
										 |  |  |         uri = compat_urlparse.urlparse(target) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         hash = uri.fragment[1:].split('?')[0] | 
					
						
							| 
									
										
										
										
											2014-08-24 02:06:59 +02:00
										 |  |  |         token = os.path.basename(hash.rstrip('/')) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         return (uri, hash, token) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def _build_bootstrap_url(self, target): | 
					
						
							|  |  |  |         (uri, hash, token) = self._parse_target(target) | 
					
						
							| 
									
										
										
										
											2014-08-24 02:06:59 +02:00
										 |  |  |         query = 'getCommunicationToken=1&hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts) | 
					
						
							|  |  |  |         return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def _build_meta_url(self, target): | 
					
						
							|  |  |  |         (uri, hash, token) = self._parse_target(target) | 
					
						
							| 
									
										
										
										
											2014-08-24 02:06:59 +02:00
										 |  |  |         query = 'hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts) | 
					
						
							|  |  |  |         return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def _build_stream_url(self, meta): | 
					
						
							| 
									
										
										
										
											2014-08-24 02:06:59 +02:00
										 |  |  |         return compat_urlparse.urlunparse(('http', meta['streamKey']['ip'], '/stream.php', None, None, None)) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def _build_swf_referer(self, target, obj): | 
					
						
							|  |  |  |         (uri, _, _) = self._parse_target(target) | 
					
						
							| 
									
										
										
										
											2014-08-24 02:06:59 +02:00
										 |  |  |         return compat_urlparse.urlunparse((uri.scheme, uri.netloc, obj['attrs']['data'], None, None, None)) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def _transform_bootstrap(self, js): | 
					
						
							| 
									
										
										
										
											2014-05-20 22:28:32 -04:00
										 |  |  |         return re.split('(?m)^\s*try\s*{', js)[0] \ | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |                  .split(' = ', 1)[1].strip().rstrip(';') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def _transform_meta(self, js): | 
					
						
							|  |  |  |         return js.split('\n')[0].split('=')[1].rstrip(';') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def _get_meta(self, target): | 
					
						
							|  |  |  |         (meta_url, token) = self._build_meta_url(target) | 
					
						
							|  |  |  |         self.to_screen('Metadata URL: %s' % meta_url) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-08-24 02:06:59 +02:00
										 |  |  |         headers = {'Referer': compat_urlparse.urldefrag(target)[0]} | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         req = compat_urllib_request.Request(meta_url, headers=headers) | 
					
						
							|  |  |  |         res = self._download_json(req, token, | 
					
						
							|  |  |  |                                   transform_source=self._transform_meta) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if 'getStreamKeyWithSong' not in res: | 
					
						
							|  |  |  |             raise ExtractorError( | 
					
						
							|  |  |  |                 'Metadata not found. URL may be malformed, or Grooveshark API may have changed.') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if res['getStreamKeyWithSong'] is None: | 
					
						
							|  |  |  |             raise ExtractorError( | 
					
						
							|  |  |  |                 'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.', | 
					
						
							|  |  |  |                 expected=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return res['getStreamKeyWithSong'] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def _get_bootstrap(self, target): | 
					
						
							|  |  |  |         (bootstrap_url, token) = self._build_bootstrap_url(target) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-08-24 02:06:59 +02:00
										 |  |  |         headers = {'Referer': compat_urlparse.urldefrag(target)[0]} | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         req = compat_urllib_request.Request(bootstrap_url, headers=headers) | 
					
						
							|  |  |  |         res = self._download_json(req, token, fatal=False, | 
					
						
							|  |  |  |                                   note='Downloading player bootstrap data', | 
					
						
							|  |  |  |                                   errnote='Unable to download player bootstrap data', | 
					
						
							|  |  |  |                                   transform_source=self._transform_bootstrap) | 
					
						
							|  |  |  |         return res | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |     def _get_playerpage(self, target): | 
					
						
							|  |  |  |         (_, _, token) = self._parse_target(target) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-08-24 01:31:35 +02:00
										 |  |  |         webpage = self._download_webpage( | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |             target, token, | 
					
						
							|  |  |  |             note='Downloading player page', | 
					
						
							|  |  |  |             errnote='Unable to download player page', | 
					
						
							|  |  |  |             fatal=False) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-08-24 01:31:35 +02:00
										 |  |  |         if webpage is not None: | 
					
						
							|  |  |  |             # Search (for example German) error message | 
					
						
							|  |  |  |             error_msg = self._html_search_regex( | 
					
						
							|  |  |  |                 r'<div id="content">\s*<h2>(.*?)</h2>', webpage, | 
					
						
							|  |  |  |                 'error message', default=None) | 
					
						
							|  |  |  |             if error_msg is not None: | 
					
						
							|  |  |  |                 error_msg = error_msg.replace('\n', ' ') | 
					
						
							|  |  |  |                 raise ExtractorError('Grooveshark said: %s' % error_msg) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-08-24 01:31:35 +02:00
										 |  |  |         if webpage is not None: | 
					
						
							|  |  |  |             o = GroovesharkHtmlParser.extract_object_tags(webpage) | 
					
						
							|  |  |  |             return (webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed']) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return (webpage, None) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def _real_initialize(self): | 
					
						
							|  |  |  |         self.ts = int(time.time() * 1000)  # timestamp in millis | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |     def _real_extract(self, url): | 
					
						
							|  |  |  |         (target_uri, _, token) = self._parse_target(url) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         # 1. Fill cookiejar by making a request to the player page | 
					
						
							| 
									
										
										
										
											2014-08-24 01:31:35 +02:00
										 |  |  |         swf_referer = None | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         if self.do_playerpage_request: | 
					
						
							|  |  |  |             (_, player_objs) = self._get_playerpage(url) | 
					
						
							|  |  |  |             if player_objs is not None: | 
					
						
							|  |  |  |                 swf_referer = self._build_swf_referer(url, player_objs[0]) | 
					
						
							|  |  |  |                 self.to_screen('SWF Referer: %s' % swf_referer) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # 2. Ask preload.php for swf bootstrap data to better mimic webapp | 
					
						
							|  |  |  |         if self.do_bootstrap_request: | 
					
						
							|  |  |  |             bootstrap = self._get_bootstrap(url) | 
					
						
							|  |  |  |             self.to_screen('CommunicationToken: %s' % bootstrap['getCommunicationToken']) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         # 3. Ask preload.php for track metadata. | 
					
						
							|  |  |  |         meta = self._get_meta(url) | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         # 4. Construct stream request for track. | 
					
						
							|  |  |  |         stream_url = self._build_stream_url(meta) | 
					
						
							|  |  |  |         duration = int(math.ceil(float(meta['streamKey']['uSecs']) / 1000000)) | 
					
						
							|  |  |  |         post_dict = {'streamKey': meta['streamKey']['streamKey']} | 
					
						
							| 
									
										
										
										
											2014-08-24 02:06:59 +02:00
										 |  |  |         post_data = compat_urllib_parse.urlencode(post_dict).encode('utf-8') | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |         headers = { | 
					
						
							|  |  |  |             'Content-Length': len(post_data), | 
					
						
							|  |  |  |             'Content-Type': 'application/x-www-form-urlencoded' | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2014-08-24 01:31:35 +02:00
										 |  |  |         if swf_referer is not None: | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  |             headers['Referer'] = swf_referer | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-08-24 01:31:35 +02:00
										 |  |  |         return { | 
					
						
							| 
									
										
										
										
											2014-05-20 02:47:34 -04:00
										 |  |  |             'id': token, | 
					
						
							|  |  |  |             'title': meta['song']['Name'], | 
					
						
							|  |  |  |             'http_method': 'POST', | 
					
						
							|  |  |  |             'url': stream_url, | 
					
						
							|  |  |  |             'ext': 'mp3', | 
					
						
							|  |  |  |             'format': 'mp3 audio', | 
					
						
							|  |  |  |             'duration': duration, | 
					
						
							| 
									
										
										
										
											2014-08-24 01:31:35 +02:00
										 |  |  |             'http_post_data': post_data, | 
					
						
							|  |  |  |             'http_headers': headers, | 
					
						
							| 
									
										
										
										
											2014-05-20 02:55:21 -04:00
										 |  |  |         } |