Merge branch 'master' into BlenderCloud-issue-13282

This commit is contained in:
Parmjit Virk 2017-07-07 15:49:44 -05:00
commit a7a3bf1fbc

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
clean_html, clean_html,
dict_get, dict_get,
@ -14,12 +15,21 @@ from ..utils import (
class XHamsterIE(InfoExtractor): class XHamsterIE(InfoExtractor):
_VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?' _VALID_URL = r'''(?x)
https?://
(?:.+?\.)?xhamster\.com/
(?:
movies/(?P<id>\d+)/(?P<display_id>[^/]*)\.html|
videos/(?P<display_id_2>[^/]*)-(?P<id_2>\d+)
)
'''
_TESTS = [{ _TESTS = [{
'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
'md5': '8281348b8d3c53d39fffb377d24eac4e', 'md5': '8281348b8d3c53d39fffb377d24eac4e',
'info_dict': { 'info_dict': {
'id': '1509445', 'id': '1509445',
'display_id': 'femaleagent_shy_beauty_takes_the_bait',
'ext': 'mp4', 'ext': 'mp4',
'title': 'FemaleAgent Shy beauty takes the bait', 'title': 'FemaleAgent Shy beauty takes the bait',
'upload_date': '20121014', 'upload_date': '20121014',
@ -32,6 +42,7 @@ class XHamsterIE(InfoExtractor):
'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
'info_dict': { 'info_dict': {
'id': '2221348', 'id': '2221348',
'display_id': 'britney_spears_sexy_booty',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Britney Spears Sexy Booty', 'title': 'Britney Spears Sexy Booty',
'upload_date': '20130914', 'upload_date': '20130914',
@ -66,26 +77,18 @@ class XHamsterIE(InfoExtractor):
# This video is visible for marcoalfa123456's friends only # This video is visible for marcoalfa123456's friends only
'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html', 'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html',
'only_matching': True, 'only_matching': True,
}, {
# new URL schema
'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
def extract_video_url(webpage, name):
return self._search_regex(
[r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
webpage, name, group='mp4')
def is_hd(webpage):
return '<div class=\'icon iconHD\'' in webpage
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') or mobj.group('id_2')
display_id = mobj.group('display_id') or mobj.group('display_id_2')
video_id = mobj.group('id') webpage = self._download_webpage(url, video_id)
seo = mobj.group('seo')
proto = mobj.group('proto')
mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)
webpage = self._download_webpage(mrss_url, video_id)
error = self._html_search_regex( error = self._html_search_regex(
r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>', r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>',
@ -99,6 +102,39 @@ class XHamsterIE(InfoExtractor):
r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'], r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'],
webpage, 'title') webpage, 'title')
formats = []
format_urls = set()
sources = self._parse_json(
self._search_regex(
r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources',
default='{}'),
video_id, fatal=False)
for format_id, format_url in sources.items():
if not isinstance(format_url, compat_str):
continue
if format_url in format_urls:
continue
format_urls.add(format_url)
formats.append({
'format_id': format_id,
'url': format_url,
'height': int_or_none(self._search_regex(
r'^(\d+)[pP]', format_id, 'height', default=None))
})
video_url = self._search_regex(
[r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
webpage, 'video url', group='mp4', default=None)
if video_url and video_url not in format_urls:
formats.append({
'url': video_url,
})
self._sort_formats(formats)
# Only a few videos have an description # Only a few videos have an description
mobj = re.search(r'<span>Description: </span>([^<]+)', webpage) mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
description = mobj.group(1) if mobj else None description = mobj.group(1) if mobj else None
@ -117,7 +153,8 @@ class XHamsterIE(InfoExtractor):
webpage, 'thumbnail', fatal=False, group='thumbnail') webpage, 'thumbnail', fatal=False, group='thumbnail')
duration = parse_duration(self._search_regex( duration = parse_duration(self._search_regex(
r'Runtime:\s*</span>\s*([\d:]+)', webpage, [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']',
r'Runtime:\s*</span>\s*([\d:]+)'], webpage,
'duration', fatal=False)) 'duration', fatal=False))
view_count = int_or_none(self._search_regex( view_count = int_or_none(self._search_regex(
@ -132,30 +169,6 @@ class XHamsterIE(InfoExtractor):
age_limit = self._rta_search(webpage) age_limit = self._rta_search(webpage)
hd = is_hd(webpage)
format_id = 'hd' if hd else 'sd'
video_url = extract_video_url(webpage, format_id)
formats = [{
'url': video_url,
'format_id': 'hd' if hd else 'sd',
'preference': 1,
}]
if not hd:
mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
if is_hd(webpage):
video_url = extract_video_url(webpage, 'hd')
formats.append({
'url': video_url,
'format_id': 'hd',
'preference': 2,
})
self._sort_formats(formats)
categories_html = self._search_regex( categories_html = self._search_regex(
r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage, r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage,
'categories', default=None) 'categories', default=None)
@ -164,6 +177,7 @@ class XHamsterIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'display_id': display_id,
'title': title, 'title': title,
'description': description, 'description': description,
'upload_date': upload_date, 'upload_date': upload_date,