add xmovies8 extractor and bypass the Cloudflare challenge

This commit is contained in:
yc0 2017-10-11 10:50:39 +08:00
parent cf5f6ed5be
commit 6aa6cfc113
2 changed files with 331 additions and 0 deletions

View File

@ -1307,6 +1307,7 @@ from .xiami import (
    XiamiCollectionIE
)
from .xminus import XMinusIE
from .xmovies8 import XMovies8IE
from .xnxx import XNXXIE
from .xstream import XstreamIE
from .xtube import XTubeUserIE, XTubeIE

View File

@ -0,0 +1,330 @@
# coding: utf-8
from __future__ import unicode_literals
import re, time,operator
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_str,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
)
from ..utils import (
clean_html,
urljoin,
compat_urlparse,
ExtractorError,
sanitized_Request,
update_Request
)
def cookie_to_dict(cookie):
    """Convert an http.cookiejar/cookielib Cookie into a plain dict.

    'name' and 'value' are always present; the remaining attributes are
    copied only when the cookie actually carries them (guarded by the
    ``*_specified`` flags or a not-None value).
    """
    result = {'name': cookie.name, 'value': cookie.value}
    # Attributes that come with an explicit *_specified flag.
    if cookie.port_specified:
        result['port'] = cookie.port
    if cookie.domain_specified:
        result['domain'] = cookie.domain
    if cookie.path_specified:
        result['path'] = cookie.path
    # Attributes copied whenever they are set at all.
    for attr in ('expires', 'secure', 'discard'):
        attr_value = getattr(cookie, attr)
        if attr_value is not None:
            result[attr] = attr_value
    try:
        # HttpOnly is a non-standard attribute whose capitalization varies
        # between servers, so probe the common spellings.
        if any(cookie.has_nonstandard_attr(spelling)
               for spelling in ('httpOnly', 'httponly', 'HttpOnly')):
            result['httponly'] = True
    except TypeError:
        pass
    return result
# Lookup tables for the obfuscated JavaScript arithmetic that Cloudflare's
# challenge page emits.  Each "[]"-delimited token of a sub-expression
# evaluates to 0 or 1, and the compound-assignment operators found in the
# challenge script map directly onto their Python counterparts.
operator_functions = {
    "+": operator.add,
    "-": operator.sub,
    "*": operator.mul,
}
expression_values = {
    "": 0,
    "+": 0,
    "!+": 1,
    "+!!": 1,
}


def evaluate_expression(expr):
    """Evaluate a JavaScript challenge expression and return its value.

    Every parenthesised sub-expression that is nested inside another pair
    of parentheses contributes one decimal digit: the sum of its
    "[]"-delimited tokens (looked up in ``expression_values``).  The
    digits are concatenated and parsed as an integer.  An expression with
    no nested parentheses is evaluated as a single digit.
    """
    open_positions = []
    inner_spans = []
    for pos, ch in enumerate(expr):
        if ch == "(":
            open_positions.append(pos + 1)
        elif ch == ")":
            start = open_positions.pop()
            # Only sub-expressions still enclosed by an outer "(" count.
            if open_positions:
                inner_spans.append((start, pos))
    pieces = [expr[start:stop] for start, stop in inner_spans] or (expr,)
    digits = "".join(
        str(sum(expression_values[token] for token in piece.split("[]")))
        for piece in pieces
    )
    return int(digits)
class XMovies8IE(InfoExtractor):
    """Extractor for xmovies8.tv / xmovies8.es movie pages.

    Tries to pass Cloudflare's "I'm Under Attack Mode" JavaScript
    challenge before extracting.

    NOTE(review): this extractor is clearly work in progress -- it still
    contains debug print() calls, large blocks of commented-out
    exploration code, and a _TEST info_dict that appears to be copied
    from a different extractor (afreecatv).  Verify before relying on it.
    """

    # Fixed desktop UA so Cloudflare serves a predictable challenge page
    # (the challenge markup can vary per User-Agent).
    _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
    # Movie pages on both TLDs; an optional "watching" suffix marks the
    # player page, with an optional ".html" extension.
    _VALID_URL = r'''(?x)
        https?://(?:www\.)?xmovies8\.(?:tv|es)/movie/
        (?P<id>[a-zA-Z\-\.0-9]+)/?
        (?P<isWatching>watching)?
        (?:\.html)?
    '''
    _TEST = {
        'url': 'https://xmovies8.es/movie/the-hitman-s-bodyguard-2017.58852',
        # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
        'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
        # NOTE(review): the info_dict below does not match the URL above --
        # it looks copied from the afreecatv extractor; confirm/replace.
        'info_dict': {
            'id': '36164052',
            'ext': 'flv',
            'title': '데일리 에이프릴 요정들의 시상식!',
            'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
            'uploader': 'dailyapril',
            'uploader_id': 'dailyapril',
            'upload_date': '20160503',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }

    def _get_cv(self, ct, host_name):
        """Parse a Cloudflare challenge page.

        Args:
            ct: the HTML of the 503 challenge page.
            host_name: the hostname of the challenged URL (its length is
                added to the computed answer, as the challenge JS does).
        Returns:
            Tuple (hidden, v, url, wait): url-encoded hidden form fields,
            the computed integer answer, the form action URL, and the
            setTimeout delay (milliseconds, as a string).
        """
        #ct = ct.replace('\n', '').replace('\r', '')
        # collect every hidden form field as name=value pairs
        hidden = re.findall('<input type="hidden" name="([^"]+)" value="([^\"]+)"', ct)
        hidden = '&'.join(map(lambda x: '='.join(x), hidden))
        # get challenge endpoint url (the form's action attribute)
        url = re.findall('<form id="[^"]+" action="([^"]+)" method="get">', ct)[0]
        # get var name, e.g.:
        # var t,r,a,f, kMuTlpA={"t":+((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]))};
        _, n, m, v = re.findall('var (:?[^,]+,)+ ([^=]+)={"([^"]+)":([^}]+)};', ct, re.DOTALL)[0]
        v = self._calc_symbol(v)
        # apply each compound assignment (+=, -=, *=, ...) to the seed value
        # NOTE(review): eval() on attacker-controlled page content is unsafe;
        # both operands are integers here, but an operator-table dispatch
        # (see operator_functions) would avoid eval entirely.
        for op, arg in re.findall('%s\.%s(.)=([^;]+);' % (n, m), ct):
            v = eval('%d %s %d' % (v, op, self._calc_symbol(arg)))
        # t = re.findall('\+\s*([^\.]+)\.length', ct, re.DOTALL)[0]
        # print '%s\.innerHTML\s*=\s*"([^"])";' % t
        # new_len = len(re.findall('%s\.innerHTML\s*=\s*"([^"]+)";' % t, ct, re.DOTALL)[0])
        # here we assume the meaning of t is definitely the hostname length;
        # Cloudflare may change this in the future
        v += len(host_name)
        # get wait time (the setTimeout delay before submitting the form)
        wait = re.findall('}, (\d+)\);', ct, re.DOTALL)[0]
        return hidden, v, url, wait

    def _calc_symbol(self, s):
        """Evaluate one obfuscated JS numeric literal to an int.

        Handles two shapes: ``+((..)+(..))`` producing a two-digit number
        (each half evaluated recursively as one digit), and a plain
        ``+``-joined token run summed via a lookup table.
        """
        _ = re.findall('\+?\(\(([^\)]+)\)\+\(([^\)]+)\)\)', s)
        # type 1: +((...)+(...)) -> 2-digit number
        if _:
            v1, v2 = map(self._calc_symbol, _[0])
            return int(str(v1) + str(v2))
        # type 2: plain token run
        else:
            # use look-up table to replace each token with 0 or 1
            vmap = {'!': 1, '[]': 0, '!![]': 1, '': 0}
            return sum(map(lambda x: vmap[x], s.split('+')))

    def _pycfl(self, s):
        """Alternative challenge-expression evaluator.

        NOTE(review): not referenced by the live code path in this file
        (only by the commented-out block in _real_extract).
        Token values:
        # !+[] 1
        # !![] 1
        # ![] 0
        # [] 0
        """
        result = ''
        # print(s) # DEBUG
        ss = re.split('\(|\)', s)
        for s in ss:
            if s in ('+', ''):
                continue
            elif s[0] == '+':
                s = s[1:]
            s = s.replace('!+[]', '1')
            s = s.replace('!![]', '1')
            s = s.replace('![]', '0')
            s = s.replace('[]', '0')
            # NOTE(review): this replacement can never fire -- '+!![]' was
            # already consumed by the '!![]' replacement above.
            s = s.replace('+!![]', '10')
            result += str(sum([int(i) for i in s.split('+')]))
        return result

    def _extract_all(self, txt, rules, pos=0, values=None):
        """Call _extract for each rule and return the results in a dict.

        Each rule is a (key, begin, end) triple; rules with a falsy key
        are matched (advancing pos) but not stored.  Returns
        (values, pos) where pos is the position after the last match.
        """
        if values is None:
            values = {}
        for key, begin, end in rules:
            result, pos = self._extract(txt, begin, end, pos)
            if key:
                values[key] = result
        return values, pos

    def _extract(self, txt, begin, end, pos=0):
        """Extract the text between 'begin' and 'end' from 'txt'.

        Args:
            txt: String to search in
            begin: First string to be searched for
            end: Second string to be searched for after 'begin'
            pos: Starting position for searches in 'txt'
        Returns:
            The string between the two search-strings 'begin' and 'end'
            beginning with position 'pos' in 'txt', as well as the
            position after 'end'.
            If at least one of 'begin' or 'end' is not found, None and
            the original value of 'pos' are returned.
        Examples:
            extract("abcde", "b", "d") -> "c" , 4
            extract("abcde", "b", "d", 3) -> None, 3
        """
        try:
            first = txt.index(begin, pos) + len(begin)
            last = txt.index(end, first)
            return txt[first:last], last + len(end)
        except ValueError:
            return None, pos

    def _solve_challenge(self, req, headers=None):
        """Detect and answer the Cloudflare challenge for *req*.

        Issues the request; on a 503 challenge response, parses the
        hidden form fields, computes jschl_answer, waits the mandated
        delay, and re-requests via /cdn-cgi/l/chk_jschl so the clearance
        cookie lands in the cookiejar.  Non-503 errors are re-raised.
        """
        try:
            self._request_webpage(
                req, None, note='Solve Challenge', headers=headers)
        except ExtractorError as ee:
            if not isinstance(ee.cause, compat_HTTPError) or ee.cause.code != 503:
                raise
            page = ee.cause.read().decode('utf-8')
            params = self._extract_all(page, (
                ('jschl_vc', 'name="jschl_vc" value="', '"'),
                ('pass', 'name="pass" value="', '"'),
            ))[0]
            # NOTE(review): req.full_url is Python-3-only; youtube-dl compat
            # helpers usually wrap this -- confirm against supported Pythons.
            params["jschl_answer"] = self._solve_jschl(req.full_url, page)
            # Cloudflare rejects answers submitted before its delay elapses.
            time.sleep(4)
            # NOTE(review): debug print -- replace with self.to_screen().
            print("params : ", params)
            req = update_Request(req, urljoin(req.full_url, "/cdn-cgi/l/chk_jschl"), query=params)
            self._request_webpage(
                req, None, note='Downloading redirect page', headers=headers, fatal=False)
        return req
        # session.get(urllib.parse.urljoin(url, "/cdn-cgi/l/chk_jschl"), params=params)
        # return session.cookies

    def _solve_jschl(self, url, page):
        """Solve the challenge to get the 'jschl_answer' value.

        Seeds the answer from the inline variable declaration, applies
        each compound-assignment statement of the challenge form script
        via operator_functions/evaluate_expression, and finally adds the
        hostname length (as the challenge JS does).  Returns None if no
        "a.value" assignment is found in the script.
        """
        data, pos = self._extract_all(page, (
            ('var', ',f, ', '='),
            ('key', '"', '"'),
            ('expr', ':', '}')
        ))
        solution = evaluate_expression(data["expr"])
        variable = "{}.{}".format(data["var"], data["key"])
        vlength = len(variable)
        expressions = self._extract(page, "'challenge-form');", "f.submit();", pos)[0]
        for expr in expressions.split(";")[1:]:
            if expr.startswith(variable):
                # expr looks like "<var>.<key><op>=<rhs>"; pick the operator
                # character right after the variable name.
                func = operator_functions[expr[vlength]]
                value = evaluate_expression(expr[vlength + 2:])
                solution = func(solution, value)
            elif expr.startswith("a.value"):
                return solution + len(compat_urllib_parse_urlparse(url).netloc)

    def _real_extract(self, url):
        """Extract metadata for a movie page, passing Cloudflare if needed.

        NOTE(review): work in progress -- on the no-challenge path this
        method returns None (the return dict below sits in the 503
        branch), and the challenge branch references `webpage` before it
        is assigned.  Both need fixing before this extractor can work.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        isWatching = mobj.group('isWatching')
        # Normalize to the player ("watching") page unless already on it.
        full_video_url = compat_urlparse.urljoin(url, "/watching.html") if not isWatching else url
        parsed_url = compat_urllib_parse_urlparse(full_video_url)
        headers = {
            'User-Agent': self._USER_AGENT,
            # 'Cookie':'__cfduid='+cfduid,
            'Referer': 'http://' + parsed_url.netloc + '/',
            # 'Host':parsed_url.netloc
        }
        req = sanitized_Request(full_video_url)
        # Pre-solve the Cloudflare challenge so the clearance cookie is set.
        self._solve_challenge(req, headers)
        try:
            webpage = self._download_webpage(req, video_id, headers=headers)
            # NOTE(review): leftover debug print -- remove.
            print("??????")
        except ExtractorError as ee:
            # print(ee)
            if not isinstance(ee.cause, compat_HTTPError) or \
                    ee.cause.code != 503:
                raise
            # Still challenged: parse the 503 page and retry manually.
            redir_webpage = ee.cause.read().decode('utf-8')
            cfduid = self._get_cookies(parsed_url.netloc).get('__cfduid').value
            self._set_cookie(parsed_url.netloc, '__cfduid', cfduid)
            c, v, u, w = self._get_cv(redir_webpage, parsed_url.netloc)
            # NOTE(review): leftover debug print -- remove.
            print(c, v, u, w)
            # action = self._search_regex(
            #     r'<form id="challenge-form" action="([^"]+)"',
            #     redir_webpage, 'Redirect form')
            # vc = self._search_regex(
            #     r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
            #     redir_webpage, 'redirect vc value')
            # pwd = self._search_regex(
            #     r'<input type="hidden" name="pass" value="([^"]+)"/>',
            #     redir_webpage, 'redirect pass value')
            # av = re.search(
            #     r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
            #     redir_webpage)
            # init = re.search(
            #     r'''
            #     (?sx)setTimeout\((?:.)*var\s+(?:[a-z],)*\s+(?P<dict>[a-zA-Z]*)={\"(?P<key>[a-zA-Z]*)\":(?P<init>[\(\)!\[\]\+]*)
            #     '''
            #     ,redir_webpage)
            # ans = int(self._pycfl(init.group('init')))
            # for content in re.finditer(r''+init.group('dict')+'\.'+init.group('key')+'(?P<oper>[+\-\*/])=(?P<val>[\(\)!\[\]\+]*);',redir_webpage):
            #     if '*' == content.group('oper'):
            #         ans *= int(self._pycfl(content.group('val')))
            #     elif '+' == content.group('oper'):
            #         ans += int(self._pycfl(content.group('val')))
            #     elif '-' == content.group('oper'):
            #         ans -= int(self._pycfl(content.group('val')))
            #     elif '/' == content.group('oper'):
            #         ans /= int(self._pycfl(content.group('val')))
            # ans += len(parsed_url.netloc)
            # confirm_url = (
            #     parsed_url.scheme + '://' + parsed_url.netloc +
            #     action + '?' +
            #     compat_urllib_parse_urlencode({
            #         'jschl_vc': vc,
            #         # 'pass': pwd,
            #         'jschl_answer': compat_str(ans)
            #     })
            # )
            try:
                # Honour the challenge's setTimeout delay (w is in ms).
                time.sleep(int(w) // 1000)
                # NOTE(review): urlh is assigned but never used.
                urlh = self._request_webpage(
                    req, None, note='Downloading redirect page', headers=headers, fatal=False)
                # print('%s://%s%s?%s&jschl_answer=%s' % (parsed_url.scheme, parsed_url.netloc,u, c, v))
                # print(confirm_url)
                # webpage, url_handle = self._download_webpage_handle(
                #     confirm_url, None, 'Downloading login page',headers=headers)
                # webpage = self._download_webpage(
                #     confirm_url, video_id,
                #     note='Confirming after redirect',
                #     headers=headers)
                # NOTE(review): `webpage` is undefined on this branch (it is
                # only set in the try above) -- this raises NameError.
                self.to_screen(webpage)
                # title = self._html_search_regex(r'<div class="info_movie(?:\sfull)?"[^>]+<div class="tit full"><h1>(.+?)</h1>', webpage, 'title', fatal=False)
                # print(title)
                return {
                    'id': video_id,
                    # 'title': title,
                    'description': self._og_search_description(webpage),
                    # 'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False),
                    # TODO more properties (see youtube_dl/extractor/common.py)
                }
            except ExtractorError as ee:
                if not isinstance(ee.cause, compat_HTTPError) or \
                        ee.cause.code != 503:
                    raise
                # NOTE(review): dead end -- webpage is read but the method
                # then falls off the end, returning None.
                webpage = ee.cause.read().decode('utf-8')