| 
									
										
										
										
											2014-01-23 03:52:59 +01:00
										 |  |  | from __future__ import unicode_literals | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-06-23 22:32:44 +02:00
										 |  |  | import re | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from .common import InfoExtractor | 
					
						
							|  |  |  | from ..utils import ( | 
					
						
							|  |  |  |     compat_urllib_parse, | 
					
						
							| 
									
										
										
										
											2013-08-23 16:40:20 +02:00
										 |  |  |     unescapeHTML, | 
					
						
							| 
									
										
										
										
											2013-06-23 22:32:44 +02:00
										 |  |  |     ExtractorError, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class XHamsterIE(InfoExtractor): | 
					
						
							|  |  |  |     """Information Extractor for xHamster""" | 
					
						
							| 
									
										
										
										
											2013-09-17 06:24:20 +02:00
										 |  |  |     _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' | 
					
						
							|  |  |  |     _TESTS = [{ | 
					
						
							| 
									
										
										
										
											2014-01-23 03:52:59 +01:00
										 |  |  |         'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |         'file': '1509445.mp4', | 
					
						
							|  |  |  |         'md5': '8281348b8d3c53d39fffb377d24eac4e', | 
					
						
							| 
									
										
										
										
											2014-01-23 03:52:59 +01:00
										 |  |  |         'info_dict': { | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |             "upload_date": "20121014", | 
					
						
							|  |  |  |             "uploader_id": "Ruseful2011", | 
					
						
							| 
									
										
										
										
											2014-01-23 03:52:59 +01:00
										 |  |  |             "title": "FemaleAgent Shy beauty takes the bait", | 
					
						
							|  |  |  |             "age_limit": 18, | 
					
						
							| 
									
										
										
										
											2013-06-27 20:46:46 +02:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2013-09-17 06:24:20 +02:00
										 |  |  |     }, | 
					
						
							|  |  |  |     { | 
					
						
							| 
									
										
										
										
											2014-01-23 03:52:59 +01:00
										 |  |  |         'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', | 
					
						
							|  |  |  |         'file': '2221348.flv', | 
					
						
							|  |  |  |         'md5': 'e767b9475de189320f691f49c679c4c7', | 
					
						
							|  |  |  |         'info_dict': { | 
					
						
							|  |  |  |             "upload_date": "20130914", | 
					
						
							|  |  |  |             "uploader_id": "jojo747400", | 
					
						
							|  |  |  |             "title": "Britney Spears  Sexy Booty", | 
					
						
							|  |  |  |             "age_limit": 18, | 
					
						
							| 
									
										
										
										
											2013-09-17 06:24:20 +02:00
										 |  |  |         } | 
					
						
							|  |  |  |     }] | 
					
						
							| 
									
										
										
										
											2013-06-23 22:32:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def _real_extract(self,url): | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |         def extract_video_url(webpage): | 
					
						
							|  |  |  |             mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) | 
					
						
							|  |  |  |             if mobj is None: | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |                 raise ExtractorError('Unable to extract media URL') | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |             if len(mobj.group('server')) == 0: | 
					
						
							|  |  |  |                 return compat_urllib_parse.unquote(mobj.group('file')) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 return mobj.group('server')+'/key='+mobj.group('file') | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-23 03:51:09 +01:00
										 |  |  |         def extract_mp4_video_url(webpage): | 
					
						
							|  |  |  |             mp4 = re.search(r'<a href=\"(.+?)\" class=\"mp4Play\"',webpage) | 
					
						
							|  |  |  |             if mp4 is None: | 
					
						
							|  |  |  |                 return None | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 return mp4.group(1) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |         def is_hd(webpage): | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |             return '<div class=\'icon iconHD\'' in webpage | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-06-23 22:32:44 +02:00
										 |  |  |         mobj = re.match(self._VALID_URL, url) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         video_id = mobj.group('id') | 
					
						
							| 
									
										
										
										
											2013-09-17 06:24:20 +02:00
										 |  |  |         seo = mobj.group('seo') | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |         mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo) | 
					
						
							| 
									
										
										
										
											2013-06-23 22:32:44 +02:00
										 |  |  |         webpage = self._download_webpage(mrss_url, video_id) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |         video_title = self._html_search_regex( | 
					
						
							|  |  |  |             r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title') | 
					
						
							| 
									
										
										
										
											2013-06-23 22:32:44 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-08-23 16:40:20 +02:00
										 |  |  |         # Only a few videos have an description | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |         mobj = re.search(r'<span>Description: </span>([^<]+)', webpage) | 
					
						
							|  |  |  |         video_description = mobj.group(1) if mobj else None | 
					
						
							| 
									
										
										
										
											2013-06-23 22:32:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) | 
					
						
							|  |  |  |         if mobj: | 
					
						
							|  |  |  |             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             video_upload_date = None | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |             self._downloader.report_warning('Unable to extract upload date') | 
					
						
							| 
									
										
										
										
											2013-06-23 22:32:44 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |         video_uploader_id = self._html_search_regex( | 
					
						
							|  |  |  |             r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)', | 
					
						
							|  |  |  |             webpage, 'uploader id', default='anonymous') | 
					
						
							| 
									
										
										
										
											2013-06-23 22:32:44 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |         video_thumbnail = self._search_regex( | 
					
						
							|  |  |  |             r'\'image\':\'(?P<thumbnail>[^\']+)\'', | 
					
						
							| 
									
										
										
										
											2014-01-23 03:52:59 +01:00
										 |  |  |             webpage, 'thumbnail', fatal=False) | 
					
						
							| 
									
										
										
										
											2013-06-23 22:32:44 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-19 21:09:48 +02:00
										 |  |  |         age_limit = self._rta_search(webpage) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |         hd = is_hd(webpage) | 
					
						
							| 
									
										
										
										
											2014-01-23 03:51:09 +01:00
										 |  |  |         video_url = extract_video_url(webpage) | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |         formats = [{ | 
					
						
							|  |  |  |             'url': video_url, | 
					
						
							|  |  |  |             'format_id': 'hd' if hd else 'sd', | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |             'preference': 0, | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |         }] | 
					
						
							| 
									
										
										
										
											2014-01-23 03:51:09 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |         video_mp4_url = extract_mp4_video_url(webpage) | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |         if video_mp4_url is not None: | 
					
						
							| 
									
										
										
										
											2014-01-23 03:52:59 +01:00
										 |  |  |             formats.append({ | 
					
						
							|  |  |  |                 'url': video_mp4_url, | 
					
						
							|  |  |  |                 'ext': 'mp4', | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |                 'format_id': 'mp4-hd' if hd else 'mp4-sd', | 
					
						
							|  |  |  |                 'preference': 1, | 
					
						
							| 
									
										
										
										
											2014-01-23 03:52:59 +01:00
										 |  |  |             }) | 
					
						
							| 
									
										
										
										
											2014-01-23 03:51:09 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |         if not hd: | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |             webpage = self._download_webpage( | 
					
						
							|  |  |  |                 mrss_url + '?hd', video_id, note='Downloading HD webpage') | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |             if is_hd(webpage): | 
					
						
							|  |  |  |                 video_url = extract_video_url(webpage) | 
					
						
							|  |  |  |                 formats.append({ | 
					
						
							|  |  |  |                     'url': video_url, | 
					
						
							|  |  |  |                     'format_id': 'hd', | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |                     'preference': 2, | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |                 }) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-23 04:04:35 +01:00
										 |  |  |         self._sort_formats(formats) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |         return { | 
					
						
							|  |  |  |             'id': video_id, | 
					
						
							|  |  |  |             'title': video_title, | 
					
						
							|  |  |  |             'formats': formats, | 
					
						
							| 
									
										
										
										
											2013-08-23 16:40:20 +02:00
										 |  |  |             'description': video_description, | 
					
						
							| 
									
										
										
										
											2013-06-23 22:32:44 +02:00
										 |  |  |             'upload_date': video_upload_date, | 
					
						
							|  |  |  |             'uploader_id': video_uploader_id, | 
					
						
							| 
									
										
										
										
											2013-10-19 21:09:48 +02:00
										 |  |  |             'thumbnail': video_thumbnail, | 
					
						
							|  |  |  |             'age_limit': age_limit, | 
					
						
							| 
									
										
										
										
											2013-10-26 20:38:54 +02:00
										 |  |  |         } |