diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 166bcf443..549bdd8ca 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -3,13 +3,20 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_str +) from ..utils import ( clean_html, determine_ext, ExtractorError, int_or_none, parse_duration, + get_element_by_class, + get_elements_by_class, + js_to_json, + try_get ) @@ -32,7 +39,80 @@ class XVideosIE(InfoExtractor): 'title': 'Biker Takes his Girl', 'duration': 108, 'age_limit': 18, - } + 'uploader': "Kandys Kisses" + }, + }, { + 'url': 'https://www.xvideos.com/video43548989/petite_brooke_haze_is_so_cute', + 'md5': 'b629ee68705da901dbd60c3b3a7c16bc', + 'info_dict': { + 'id': '43548989', + 'ext': 'mp4', + 'title': 'Petite Brooke Haze is so cute', + 'duration': 521, + 'uploader': 'Amkempire', + 'uploader_channel': 'amkempire', + 'performers': [ + 'Brooke Haze' + ], + 'performer_channels': [ + 'brooke-haze' + ], + 'related_categories': list, + 'tags': list, + 'sponsor': [{ + 'name': 'AMKEmpire', + 'desc': ('AMKingdom caters to a plethora of niches, while offering high-end, quality content. ' + 'ATK/AMK features Hairy, Natural, Mature, and other Fetish categories, since 1996!') + } + ], + 'age_limit': 18, + }, + }, { + # multiple performers + 'url': 'https://www.xvideos.com/video721515/simon_says_fuck_4', + 'md5': '68f64996a6a53ab834e14a36f2357038', + 'info_dict': { + 'id': '721515', + 'ext': 'mp4', + 'title': 'Simon says fuck! 4', + 'duration': 310, + 'uploader': 'Latgpxxx', + 'uploader_channel': 'latgpxxx', + 'performers': [ + 'Jenna Haze', + 'Billy Glide' + ], + 'performer_channels': [ + 'jenna-haze', + 'billy-glide-1' + ], + 'related_categories': list, + 'tags': list, + 'sponsor': list, + 'age_limit': 18, + }, + }, { + # anonymous / unavailable uploader + 'url': 'https://www.xvideos.com/video91/3_young_school_girls', + 'md5': '86f6a54c5f3ad45a01c5daac512c59a6', + 'info_dict': { + 'id': '91', + 'ext': 'mp4', + 'title': '3 young school girls', + 'duration': 35, + 'uploader': None, + 'uploader_channel': None, + 'performers': [ + 'Alisha Klass' + ], + 'performer_channels': [ + 'alisha-klass' + ], + 'related_categories': list, + 'tags': list, + 'sponsor': list, + 'age_limit': 18, + }, }, { 'url': 'https://flashservice.xvideos.com/embedframe/4588838', 'only_matching': True, @@ -100,11 +180,59 @@ class XVideosIE(InfoExtractor): self._sort_formats(formats) + metadata_node = get_element_by_class("video-metadata", webpage) + uploader_node = get_element_by_class("uploader-tag", metadata_node) + performer_nodes = get_elements_by_class("profile", metadata_node) + base_pattern = r']+class=["\']name["\'][^>]*>.*?(?P[^<]+)' + + uploader = None + if uploader_node is not None: + uploader = self._search_regex( + base_pattern, uploader_node, 'name', default=None, group='name', fatal=False) + + performers = [] + if performer_nodes is not None: + for node in performer_nodes: + performer = self._search_regex(base_pattern, node, 'name', default=None, group='name', fatal=False) + if performer is not None: + performers.append(performer) + + uploader_channel = self._search_regex( + r']+href=["\']/?(?:profiles|channels)/(?P[^"]+)', metadata_node, + 'channel', default=None, group='channel', fatal=False) + + performer_channels = re.findall( + r']+href=["\']/?(?:pornstar-channels|models|pornstars)/(?P[^"]+)', metadata_node) + + tags = [item.replace("-", " ") for item in re.findall(r']+href=["\']/?tags/(?P[^"]+)', + metadata_node)] + + raw_conf = self._search_regex( + r'conf\s?=\s?(?P.+);', webpage, 'json', default=None, group='json', fatal=False) + + parsed_conf = self._parse_json( + raw_conf, video_id, transform_source=js_to_json, fatal=False) + + rc_list = try_get(parsed_conf, lambda x: x['data']['related_keywords']) or [] + rc_list_alt = try_get(parsed_conf, lambda x: x['dyn']['ads']['categories'], compat_str) or None + related_categories = rc_list or [item.replace('_', ' ') for item in rc_list_alt.split(',')] + + sponsor_dict = try_get(parsed_conf, lambda x: x['data']['sponsors']) or {} + sponsor = [{'name': sponsor.get('n'), 'desc': sponsor.get('d')} + for sponsor in sponsor_dict] + return { 'id': video_id, 'formats': formats, 'title': title, 'duration': duration, 'thumbnails': thumbnails, + 'uploader': uploader, + 'performers': performers, + 'uploader_channel': uploader_channel, + 'performer_channels': performer_channels, + 'related_categories': related_categories, + 'tags': tags, + 'sponsor': sponsor, 'age_limit': 18, }