| 
									
										
										
										
											2014-01-17 04:06:18 +01:00
										 |  |  | from __future__ import unicode_literals | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-06-23 21:59:15 +02:00
										 |  |  | import re | 
					
						
							| 
									
										
										
										
											2015-03-16 00:20:06 +01:00
										 |  |  | import itertools | 
					
						
							| 
									
										
										
										
											2013-06-23 21:59:15 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | from .common import InfoExtractor | 
					
						
							| 
									
										
										
										
											2014-12-13 12:24:42 +01:00
										 |  |  | from ..compat import ( | 
					
						
							| 
									
										
										
										
											2014-02-27 18:58:09 +01:00
										 |  |  |     compat_urllib_parse, | 
					
						
							| 
									
										
										
										
											2014-12-13 12:24:42 +01:00
										 |  |  | ) | 
					
						
							|  |  |  | from ..utils import ( | 
					
						
							| 
									
										
										
										
											2013-12-08 22:24:55 +01:00
										 |  |  |     ExtractorError, | 
					
						
							| 
									
										
										
										
											2014-08-26 14:55:15 +02:00
										 |  |  |     HEADRequest, | 
					
						
							| 
									
										
										
										
											2015-01-30 23:21:44 +06:00
										 |  |  |     str_to_int, | 
					
						
							| 
									
										
										
										
											2013-06-23 21:59:15 +02:00
										 |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class MixcloudIE(InfoExtractor): | 
					
						
							| 
									
										
										
										
											2014-02-26 00:04:03 +01:00
										 |  |  |     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)' | 
					
						
							| 
									
										
										
										
											2014-01-17 04:06:18 +01:00
										 |  |  |     IE_NAME = 'mixcloud' | 
					
						
							| 
									
										
										
										
											2013-06-23 21:59:15 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-02-04 19:47:55 +06:00
										 |  |  |     _TESTS = [{ | 
					
						
							| 
									
										
										
										
											2014-01-17 04:06:18 +01:00
										 |  |  |         'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/', | 
					
						
							|  |  |  |         'info_dict': { | 
					
						
							| 
									
										
										
										
											2014-02-27 18:58:09 +01:00
										 |  |  |             'id': 'dholbach-cryptkeeper', | 
					
						
							|  |  |  |             'ext': 'mp3', | 
					
						
							| 
									
										
										
										
											2014-01-17 04:06:18 +01:00
										 |  |  |             'title': 'Cryptkeeper', | 
					
						
							|  |  |  |             'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', | 
					
						
							|  |  |  |             'uploader': 'Daniel Holbach', | 
					
						
							|  |  |  |             'uploader_id': 'dholbach', | 
					
						
							| 
									
										
										
										
											2014-05-13 09:42:38 +02:00
										 |  |  |             'thumbnail': 're:https?://.*\.jpg', | 
					
						
							|  |  |  |             'view_count': int, | 
					
						
							|  |  |  |             'like_count': int, | 
					
						
							| 
									
										
										
										
											2013-09-14 14:26:42 +02:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2015-02-04 19:47:55 +06:00
										 |  |  |     }, { | 
					
						
							|  |  |  |         'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', | 
					
						
							|  |  |  |         'info_dict': { | 
					
						
							|  |  |  |             'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat', | 
					
						
							| 
									
										
										
										
											2015-03-18 16:50:23 +01:00
										 |  |  |             'ext': 'mp3', | 
					
						
							|  |  |  |             'title': 'Caribou 7 inch Vinyl Mix & Chat', | 
					
						
							| 
									
										
										
										
											2015-02-04 19:47:55 +06:00
										 |  |  |             'description': 'md5:2b8aec6adce69f9d41724647c65875e8', | 
					
						
							| 
									
										
										
										
											2015-03-18 16:50:23 +01:00
										 |  |  |             'uploader': 'Gilles Peterson Worldwide', | 
					
						
							| 
									
										
										
										
											2015-02-04 19:47:55 +06:00
										 |  |  |             'uploader_id': 'gillespeterson', | 
					
						
							| 
									
										
										
										
											2015-03-18 16:50:23 +01:00
										 |  |  |             'thumbnail': 're:https?://.*/images/', | 
					
						
							| 
									
										
										
										
											2015-02-04 19:47:55 +06:00
										 |  |  |             'view_count': int, | 
					
						
							|  |  |  |             'like_count': int, | 
					
						
							|  |  |  |         }, | 
					
						
							|  |  |  |     }] | 
					
						
							| 
									
										
										
										
											2013-06-23 21:59:15 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-03-16 00:20:06 +01:00
										 |  |  |     def _get_url(self, track_id, template_url, server_number): | 
					
						
							|  |  |  |         boundaries = (1, 30) | 
					
						
							|  |  |  |         for nr in server_numbers(server_number, boundaries): | 
					
						
							|  |  |  |             url = template_url % nr | 
					
						
							| 
									
										
										
										
											2013-06-23 21:59:15 +02:00
										 |  |  |             try: | 
					
						
							| 
									
										
										
										
											2013-12-08 22:24:55 +01:00
										 |  |  |                 # We only want to know if the request succeed | 
					
						
							|  |  |  |                 # don't download the whole file | 
					
						
							| 
									
										
										
										
											2014-10-15 00:53:54 +02:00
										 |  |  |                 self._request_webpage( | 
					
						
							|  |  |  |                     HEADRequest(url), track_id, | 
					
						
							| 
									
										
										
										
											2015-03-16 00:20:06 +01:00
										 |  |  |                     'Checking URL %d/%d ...' % (nr, boundaries[-1])) | 
					
						
							| 
									
										
										
										
											2013-06-23 21:59:15 +02:00
										 |  |  |                 return url | 
					
						
							| 
									
										
										
										
											2013-12-08 22:24:55 +01:00
										 |  |  |             except ExtractorError: | 
					
						
							| 
									
										
										
										
											2014-10-15 00:53:54 +02:00
										 |  |  |                 pass | 
					
						
							| 
									
										
										
										
											2013-06-23 21:59:15 +02:00
										 |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def _real_extract(self, url): | 
					
						
							|  |  |  |         mobj = re.match(self._VALID_URL, url) | 
					
						
							| 
									
										
										
										
											2013-09-14 14:26:42 +02:00
										 |  |  |         uploader = mobj.group(1) | 
					
						
							|  |  |  |         cloudcast_name = mobj.group(2) | 
					
						
							| 
									
										
										
										
											2014-02-27 18:58:09 +01:00
										 |  |  |         track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name))) | 
					
						
							| 
									
										
										
										
											2014-01-17 04:05:15 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-09-14 14:26:42 +02:00
										 |  |  |         webpage = self._download_webpage(url, track_id) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-17 04:05:15 +01:00
										 |  |  |         preview_url = self._search_regex( | 
					
						
							| 
									
										
										
										
											2015-02-04 19:47:55 +06:00
										 |  |  |             r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url') | 
					
						
							| 
									
										
										
										
											2014-01-01 21:07:55 +01:00
										 |  |  |         song_url = preview_url.replace('/previews/', '/c/originals/') | 
					
						
							| 
									
										
										
										
											2015-03-16 00:20:06 +01:00
										 |  |  |         server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number')) | 
					
						
							| 
									
										
										
										
											2013-09-14 14:26:42 +02:00
										 |  |  |         template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) | 
					
						
							| 
									
										
										
										
											2015-03-16 00:20:06 +01:00
										 |  |  |         final_song_url = self._get_url(track_id, template_url, server_number) | 
					
						
							| 
									
										
										
										
											2013-12-10 13:42:41 +01:00
										 |  |  |         if final_song_url is None: | 
					
						
							|  |  |  |             self.to_screen('Trying with m4a extension') | 
					
						
							|  |  |  |             template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') | 
					
						
							| 
									
										
										
										
											2015-03-16 00:20:06 +01:00
										 |  |  |             final_song_url = self._get_url(track_id, template_url, server_number) | 
					
						
							| 
									
										
										
										
											2013-12-10 13:42:41 +01:00
										 |  |  |         if final_song_url is None: | 
					
						
							| 
									
										
										
										
											2014-05-13 09:42:38 +02:00
										 |  |  |             raise ExtractorError('Unable to extract track url') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         PREFIX = ( | 
					
						
							| 
									
										
										
										
											2015-03-18 16:50:23 +01:00
										 |  |  |             r'm-play-on-spacebar[^>]+' | 
					
						
							| 
									
										
										
										
											2014-05-13 09:42:38 +02:00
										 |  |  |             r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+') | 
					
						
							|  |  |  |         title = self._html_search_regex( | 
					
						
							|  |  |  |             PREFIX + r'm-title="([^"]+)"', webpage, 'title') | 
					
						
							|  |  |  |         thumbnail = self._proto_relative_url(self._html_search_regex( | 
					
						
							|  |  |  |             PREFIX + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', | 
					
						
							|  |  |  |             fatal=False)) | 
					
						
							|  |  |  |         uploader = self._html_search_regex( | 
					
						
							|  |  |  |             PREFIX + r'm-owner-name="([^"]+)"', | 
					
						
							|  |  |  |             webpage, 'uploader', fatal=False) | 
					
						
							|  |  |  |         uploader_id = self._search_regex( | 
					
						
							|  |  |  |             r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) | 
					
						
							|  |  |  |         description = self._og_search_description(webpage) | 
					
						
							| 
									
										
										
										
											2015-01-30 23:21:44 +06:00
										 |  |  |         like_count = str_to_int(self._search_regex( | 
					
						
							| 
									
										
										
										
											2015-03-15 22:32:06 +01:00
										 |  |  |             r'\bbutton-favorite\b.+m-ajax-toggle-count="([^"]+)"', | 
					
						
							| 
									
										
										
										
											2014-05-13 09:42:38 +02:00
										 |  |  |             webpage, 'like count', fatal=False)) | 
					
						
							| 
									
										
										
										
											2015-01-30 23:21:44 +06:00
										 |  |  |         view_count = str_to_int(self._search_regex( | 
					
						
							|  |  |  |             [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', | 
					
						
							|  |  |  |              r'/listeners/?">([0-9,.]+)</a>'], | 
					
						
							| 
									
										
										
										
											2014-05-13 09:42:38 +02:00
										 |  |  |             webpage, 'play count', fatal=False)) | 
					
						
							| 
									
										
										
										
											2013-09-14 14:26:42 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         return { | 
					
						
							|  |  |  |             'id': track_id, | 
					
						
							| 
									
										
										
										
											2014-05-13 09:42:38 +02:00
										 |  |  |             'title': title, | 
					
						
							| 
									
										
										
										
											2013-09-14 14:26:42 +02:00
										 |  |  |             'url': final_song_url, | 
					
						
							| 
									
										
										
										
											2014-05-13 09:42:38 +02:00
										 |  |  |             'description': description, | 
					
						
							|  |  |  |             'thumbnail': thumbnail, | 
					
						
							|  |  |  |             'uploader': uploader, | 
					
						
							|  |  |  |             'uploader_id': uploader_id, | 
					
						
							|  |  |  |             'view_count': view_count, | 
					
						
							|  |  |  |             'like_count': like_count, | 
					
						
							| 
									
										
										
										
											2013-09-14 14:26:42 +02:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2015-03-16 00:20:06 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def server_numbers(first, boundaries): | 
					
						
							|  |  |  |     """ Server numbers to try in descending order of probable availability.
 | 
					
						
							|  |  |  |     Starting from first (i.e. the number of the server hosting the preview file) | 
					
						
							|  |  |  |     and going further and further up to the higher boundary and down to the | 
					
						
							|  |  |  |     lower one in an alternating fashion. Namely: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         server_numbers(2, (1, 5)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Where the preview server is 2, min number is 1 and max is 5. | 
					
						
							|  |  |  |         # Yields: 2, 3, 1, 4, 5 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Why not random numbers or increasing sequences? Since from what I've seen, | 
					
						
							|  |  |  |     full length files seem to be hosted on servers whose number is closer to | 
					
						
							|  |  |  |     that of the preview; to be confirmed. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     zip_longest = getattr(itertools, 'zip_longest', None) | 
					
						
							|  |  |  |     if zip_longest is None: | 
					
						
							|  |  |  |         # python 2.x | 
					
						
							|  |  |  |         zip_longest = itertools.izip_longest | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if len(boundaries) != 2: | 
					
						
							|  |  |  |         raise ValueError("boundaries should be a two-element tuple") | 
					
						
							|  |  |  |     min, max = boundaries | 
					
						
							|  |  |  |     highs = range(first + 1, max + 1) | 
					
						
							|  |  |  |     lows = range(first - 1, min - 1, -1) | 
					
						
							|  |  |  |     rest = filter( | 
					
						
							|  |  |  |         None, itertools.chain.from_iterable(zip_longest(highs, lows))) | 
					
						
							|  |  |  |     yield first | 
					
						
							|  |  |  |     for n in rest: | 
					
						
							|  |  |  |         yield n |