Add xmovies8 extractor and bypass the Cloudflare challenge
This commit is contained in:
parent
cf5f6ed5be
commit
6aa6cfc113
@ -1307,6 +1307,7 @@ from .xiami import (
|
|||||||
XiamiCollectionIE
|
XiamiCollectionIE
|
||||||
)
|
)
|
||||||
from .xminus import XMinusIE
|
from .xminus import XMinusIE
|
||||||
|
from .xmovies8 import XMovies8IE
|
||||||
from .xnxx import XNXXIE
|
from .xnxx import XNXXIE
|
||||||
from .xstream import XstreamIE
|
from .xstream import XstreamIE
|
||||||
from .xtube import XTubeUserIE, XTubeIE
|
from .xtube import XTubeUserIE, XTubeIE
|
||||||
|
330
youtube_dl/extractor/xmovies8.py
Normal file
330
youtube_dl/extractor/xmovies8.py
Normal file
@ -0,0 +1,330 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import re, time,operator
|
||||||
|
from .common import InfoExtractor
|
||||||
|
from ..compat import (
|
||||||
|
compat_HTTPError,
|
||||||
|
compat_str,
|
||||||
|
compat_urllib_parse_urlencode,
|
||||||
|
compat_urllib_parse_urlparse,
|
||||||
|
)
|
||||||
|
from ..utils import (
|
||||||
|
clean_html,
|
||||||
|
urljoin,
|
||||||
|
compat_urlparse,
|
||||||
|
ExtractorError,
|
||||||
|
sanitized_Request,
|
||||||
|
update_Request
|
||||||
|
)
|
||||||
|
def cookie_to_dict(cookie):
|
||||||
|
cookie_dict = {
|
||||||
|
'name': cookie.name,
|
||||||
|
'value': cookie.value,
|
||||||
|
}
|
||||||
|
if cookie.port_specified:
|
||||||
|
cookie_dict['port'] = cookie.port
|
||||||
|
if cookie.domain_specified:
|
||||||
|
cookie_dict['domain'] = cookie.domain
|
||||||
|
if cookie.path_specified:
|
||||||
|
cookie_dict['path'] = cookie.path
|
||||||
|
if cookie.expires is not None:
|
||||||
|
cookie_dict['expires'] = cookie.expires
|
||||||
|
if cookie.secure is not None:
|
||||||
|
cookie_dict['secure'] = cookie.secure
|
||||||
|
if cookie.discard is not None:
|
||||||
|
cookie_dict['discard'] = cookie.discard
|
||||||
|
try:
|
||||||
|
if (cookie.has_nonstandard_attr('httpOnly') or
|
||||||
|
cookie.has_nonstandard_attr('httponly') or
|
||||||
|
cookie.has_nonstandard_attr('HttpOnly')):
|
||||||
|
cookie_dict['httponly'] = True
|
||||||
|
except TypeError:
|
||||||
|
pass
|
||||||
|
return cookie_dict
|
||||||
|
def evaluate_expression(expr):
|
||||||
|
"""Evaluate a Javascript expression for the challange and return its value"""
|
||||||
|
stack = []
|
||||||
|
ranges = []
|
||||||
|
value = ""
|
||||||
|
for index, char in enumerate(expr):
|
||||||
|
if char == "(":
|
||||||
|
stack.append(index+1)
|
||||||
|
elif char == ")":
|
||||||
|
begin = stack.pop()
|
||||||
|
if stack:
|
||||||
|
ranges.append((begin, index))
|
||||||
|
for subexpr in [expr[begin:end] for begin, end in ranges] or (expr,):
|
||||||
|
num = 0
|
||||||
|
for part in subexpr.split("[]"):
|
||||||
|
num += expression_values[part]
|
||||||
|
value += str(num)
|
||||||
|
return int(value)
|
||||||
|
|
||||||
|
operator_functions = {
|
||||||
|
"+": operator.add,
|
||||||
|
"-": operator.sub,
|
||||||
|
"*": operator.mul,
|
||||||
|
}
|
||||||
|
|
||||||
|
expression_values = {
|
||||||
|
"": 0,
|
||||||
|
"+": 0,
|
||||||
|
"!+": 1,
|
||||||
|
"+!!": 1,
|
||||||
|
}
|
||||||
|
class XMovies8IE(InfoExtractor):
|
||||||
|
_USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
|
||||||
|
_VALID_URL = r'''(?x)
|
||||||
|
https?://(?:www\.)?xmovies8\.(?:tv|es)/movie/
|
||||||
|
(?P<id>[a-zA-Z\-\.0-9]+)/?
|
||||||
|
(?P<isWatching>watching)?
|
||||||
|
(?:\.html)?
|
||||||
|
'''
|
||||||
|
_TEST = {
|
||||||
|
'url': 'https://xmovies8.es/movie/the-hitman-s-bodyguard-2017.58852',
|
||||||
|
|
||||||
|
# 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
|
||||||
|
'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '36164052',
|
||||||
|
'ext': 'flv',
|
||||||
|
'title': '데일리 에이프릴 요정들의 시상식!',
|
||||||
|
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
|
||||||
|
'uploader': 'dailyapril',
|
||||||
|
'uploader_id': 'dailyapril',
|
||||||
|
'upload_date': '20160503',
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
# m3u8 download
|
||||||
|
'skip_download': True,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
def _get_cv(self,ct, host_name):
|
||||||
|
#ct = ct.replace('\n', '').replace('\r', '')
|
||||||
|
#find all hidden form value
|
||||||
|
hidden = re.findall('<input type="hidden" name="([^"]+)" value="([^\"]+)"', ct)
|
||||||
|
hidden = '&'.join(map(lambda x:'='.join(x), hidden))
|
||||||
|
# get challange endpoint url
|
||||||
|
url = re.findall('<form id="[^"]+" action="([^"]+)" method="get">', ct)[0]
|
||||||
|
# get var name
|
||||||
|
# var t,r,a,f, kMuTlpA={"t":+((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]))};
|
||||||
|
_, n, m, v = re.findall('var (:?[^,]+,)+ ([^=]+)={"([^"]+)":([^}]+)};', ct, re.DOTALL)[0]
|
||||||
|
v = self._calc_symbol(v)
|
||||||
|
# call eval() to calc expression
|
||||||
|
for op, arg in re.findall('%s\.%s(.)=([^;]+);' % (n, m), ct):
|
||||||
|
v = eval('%d %s %d' % (v, op, self._calc_symbol(arg)))
|
||||||
|
# t = re.findall('\+\s*([^\.]+)\.length', ct, re.DOTALL)[0]
|
||||||
|
# print '%s\.innerHTML\s*=\s*"([^"])";' % t
|
||||||
|
# new_len = len(re.findall('%s\.innerHTML\s*=\s*"([^"]+)";' % t, ct, re.DOTALL)[0])
|
||||||
|
# here we assume the meaning of t in defintely hostname, cf may change in the future
|
||||||
|
v += len(host_name)
|
||||||
|
# get wait time
|
||||||
|
wait = re.findall('}, (\d+)\);', ct, re.DOTALL)[0]
|
||||||
|
return hidden, v, url, wait
|
||||||
|
def _calc_symbol(self,s):
|
||||||
|
_ = re.findall('\+?\(\(([^\)]+)\)\+\(([^\)]+)\)\)', s)
|
||||||
|
#type 1 +((...)+(...)) 2-digit num
|
||||||
|
if _:
|
||||||
|
v1, v2 = map(self._calc_symbol, _[0])
|
||||||
|
return int(str(v1)+str(v2))
|
||||||
|
#type 2 plain
|
||||||
|
else:
|
||||||
|
# use look-up table to replace
|
||||||
|
vmap = {'!':1, '[]':0, '!![]':1, '':0}
|
||||||
|
return sum(map(lambda x:vmap[x], s.split('+')))
|
||||||
|
def _pycfl(self,s):
|
||||||
|
# !+[] 1
|
||||||
|
# !![] 1
|
||||||
|
# ![] 0
|
||||||
|
# [] 0
|
||||||
|
result = ''
|
||||||
|
# print(s) # DEBUG
|
||||||
|
ss = re.split('\(|\)', s)
|
||||||
|
for s in ss:
|
||||||
|
if s in ('+', ''):
|
||||||
|
continue
|
||||||
|
elif s[0] == '+':
|
||||||
|
s = s[1:]
|
||||||
|
s = s.replace('!+[]', '1')
|
||||||
|
s = s.replace('!![]', '1')
|
||||||
|
s = s.replace('![]', '0')
|
||||||
|
s = s.replace('[]', '0')
|
||||||
|
s = s.replace('+!![]', '10')
|
||||||
|
result += str(sum([int(i) for i in s.split('+')]))
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _extract_all(self,txt, rules, pos=0, values=None):
|
||||||
|
"""Calls extract for each rule and returns the result in a dict"""
|
||||||
|
if values is None:
|
||||||
|
values = {}
|
||||||
|
for key, begin, end in rules:
|
||||||
|
result, pos = self._extract(txt, begin, end, pos)
|
||||||
|
if key:
|
||||||
|
values[key] = result
|
||||||
|
return values, pos
|
||||||
|
def _extract(self,txt, begin, end, pos=0):
|
||||||
|
"""Extract the text between 'begin' and 'end' from 'txt'
|
||||||
|
|
||||||
|
Args:
|
||||||
|
txt: String to search in
|
||||||
|
begin: First string to be searched for
|
||||||
|
end: Second string to be searched for after 'begin'
|
||||||
|
pos: Starting position for searches in 'txt'
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The string between the two search-strings 'begin' and 'end' beginning
|
||||||
|
with position 'pos' in 'txt' as well as the position after 'end'.
|
||||||
|
|
||||||
|
If at least one of 'begin' or 'end' is not found, None and the original
|
||||||
|
value of 'pos' is returned
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
extract("abcde", "b", "d") -> "c" , 4
|
||||||
|
extract("abcde", "b", "d", 3) -> None, 3
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
first = txt.index(begin, pos) + len(begin)
|
||||||
|
last = txt.index(end, first)
|
||||||
|
return txt[first:last], last+len(end)
|
||||||
|
except ValueError:
|
||||||
|
return None, pos
|
||||||
|
|
||||||
|
def _solve_challenge(self, req,headers=None):
|
||||||
|
try:
|
||||||
|
self._request_webpage(
|
||||||
|
req, None, note='Solve Challenge',headers=headers)
|
||||||
|
except ExtractorError as ee:
|
||||||
|
if not isinstance(ee.cause, compat_HTTPError) or ee.cause.code != 503:
|
||||||
|
raise
|
||||||
|
page = ee.cause.read().decode('utf-8')
|
||||||
|
params = self._extract_all(page, (
|
||||||
|
('jschl_vc', 'name="jschl_vc" value="', '"'),
|
||||||
|
('pass' , 'name="pass" value="', '"'),
|
||||||
|
))[0]
|
||||||
|
params["jschl_answer"] = self._solve_jschl(req.full_url, page)
|
||||||
|
time.sleep(4)
|
||||||
|
print("params : ",params)
|
||||||
|
req = update_Request(req,urljoin(req.full_url,"/cdn-cgi/l/chk_jschl"),query=params)
|
||||||
|
self._request_webpage(
|
||||||
|
req, None, note='Downloading redirect page',headers=headers,fatal=False)
|
||||||
|
return req
|
||||||
|
# session.get(urllib.parse.urljoin(url, "/cdn-cgi/l/chk_jschl"), params=params)
|
||||||
|
# return session.cookies
|
||||||
|
def _solve_jschl(self,url, page):
|
||||||
|
"""Solve challenge to get 'jschl_answer' value"""
|
||||||
|
data, pos = self._extract_all(page, (
|
||||||
|
('var' , ',f, ', '='),
|
||||||
|
('key' , '"', '"'),
|
||||||
|
('expr', ':', '}')
|
||||||
|
))
|
||||||
|
solution = evaluate_expression(data["expr"])
|
||||||
|
variable = "{}.{}".format(data["var"], data["key"])
|
||||||
|
vlength = len(variable)
|
||||||
|
expressions = self._extract(page, "'challenge-form');", "f.submit();", pos)[0]
|
||||||
|
for expr in expressions.split(";")[1:]:
|
||||||
|
if expr.startswith(variable):
|
||||||
|
func = operator_functions[expr[vlength]]
|
||||||
|
value = evaluate_expression(expr[vlength+2:])
|
||||||
|
solution = func(solution, value)
|
||||||
|
elif expr.startswith("a.value"):
|
||||||
|
return solution + len(compat_urllib_parse_urlparse(url).netloc)
|
||||||
|
def _real_extract(self, url):
|
||||||
|
mobj = re.match(self._VALID_URL, url)
|
||||||
|
video_id = mobj.group('id')
|
||||||
|
isWatching = mobj.group('isWatching')
|
||||||
|
|
||||||
|
|
||||||
|
full_video_url = compat_urlparse.urljoin(url, "/watching.html") if not isWatching else url
|
||||||
|
parsed_url = compat_urllib_parse_urlparse(full_video_url)
|
||||||
|
headers = {
|
||||||
|
'User-Agent': self._USER_AGENT,
|
||||||
|
# 'Cookie':'__cfduid='+cfduid,
|
||||||
|
'Referer':'http://'+parsed_url.netloc+'/',
|
||||||
|
# 'Host':parsed_url.netloc
|
||||||
|
}
|
||||||
|
req = sanitized_Request(full_video_url)
|
||||||
|
self._solve_challenge(req,headers)
|
||||||
|
try:
|
||||||
|
webpage = self._download_webpage(req, video_id, headers=headers)
|
||||||
|
print("??????")
|
||||||
|
except ExtractorError as ee:
|
||||||
|
# print(ee)
|
||||||
|
if not isinstance(ee.cause, compat_HTTPError) or \
|
||||||
|
ee.cause.code != 503:
|
||||||
|
raise
|
||||||
|
redir_webpage = ee.cause.read().decode('utf-8')
|
||||||
|
cfduid = self._get_cookies(parsed_url.netloc).get('__cfduid').value
|
||||||
|
self._set_cookie(parsed_url.netloc,'__cfduid',cfduid)
|
||||||
|
|
||||||
|
c, v, u, w = self._get_cv(redir_webpage, parsed_url.netloc)
|
||||||
|
print(c,v,u,w)
|
||||||
|
# action = self._search_regex(
|
||||||
|
# r'<form id="challenge-form" action="([^"]+)"',
|
||||||
|
# redir_webpage, 'Redirect form')
|
||||||
|
# vc = self._search_regex(
|
||||||
|
# r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
|
||||||
|
# redir_webpage, 'redirect vc value')
|
||||||
|
# pwd = self._search_regex(
|
||||||
|
# r'<input type="hidden" name="pass" value="([^"]+)"/>',
|
||||||
|
# redir_webpage, 'redirect pass value')
|
||||||
|
# av = re.search(
|
||||||
|
# r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
|
||||||
|
# redir_webpage)
|
||||||
|
# init = re.search(
|
||||||
|
# r'''
|
||||||
|
# (?sx)setTimeout\((?:.)*var\s+(?:[a-z],)*\s+(?P<dict>[a-zA-Z]*)={\"(?P<key>[a-zA-Z]*)\":(?P<init>[\(\)!\[\]\+]*)
|
||||||
|
# '''
|
||||||
|
# ,redir_webpage)
|
||||||
|
|
||||||
|
# ans = int(self._pycfl(init.group('init')))
|
||||||
|
# for content in re.finditer(r''+init.group('dict')+'\.'+init.group('key')+'(?P<oper>[+\-\*/])=(?P<val>[\(\)!\[\]\+]*);',redir_webpage):
|
||||||
|
# if '*' == content.group('oper'):
|
||||||
|
# ans *= int(self._pycfl(content.group('val')))
|
||||||
|
# elif '+' == content.group('oper'):
|
||||||
|
# ans += int(self._pycfl(content.group('val')))
|
||||||
|
# elif '-' == content.group('oper'):
|
||||||
|
# ans -= int(self._pycfl(content.group('val')))
|
||||||
|
# elif '/' == content.group('oper'):
|
||||||
|
# ans /= int(self._pycfl(content.group('val')))
|
||||||
|
|
||||||
|
# ans += len(parsed_url.netloc)
|
||||||
|
# confirm_url = (
|
||||||
|
# parsed_url.scheme + '://' + parsed_url.netloc +
|
||||||
|
# action + '?' +
|
||||||
|
# compat_urllib_parse_urlencode({
|
||||||
|
# 'jschl_vc': vc,
|
||||||
|
# # 'pass': pwd,
|
||||||
|
# 'jschl_answer': compat_str(ans)
|
||||||
|
# })
|
||||||
|
# )
|
||||||
|
try:
|
||||||
|
time.sleep(int(w)//1000)
|
||||||
|
urlh = self._request_webpage(
|
||||||
|
req, None, note='Downloading redirect page',headers=headers,fatal=False)
|
||||||
|
# print('%s://%s%s?%s&jschl_answer=%s' % (parsed_url.scheme, parsed_url.netloc,u, c, v))
|
||||||
|
# print(confirm_url)
|
||||||
|
|
||||||
|
# webpage, url_handle = self._download_webpage_handle(
|
||||||
|
# confirm_url, None, 'Downloading login page',headers=headers)
|
||||||
|
# webpage = self._download_webpage(
|
||||||
|
# confirm_url, video_id,
|
||||||
|
# note='Confirming after redirect',
|
||||||
|
# headers=headers)
|
||||||
|
|
||||||
|
self.to_screen(webpage)
|
||||||
|
# title = self._html_search_regex(r'<div class="info_movie(?:\sfull)?"[^>]+<div class="tit full"><h1>(.+?)</h1>', webpage, 'title', fatal=False)
|
||||||
|
# print(title)
|
||||||
|
return {
|
||||||
|
'id': video_id,
|
||||||
|
# 'title': title,
|
||||||
|
'description': self._og_search_description(webpage),
|
||||||
|
# 'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False),
|
||||||
|
# TODO more properties (see youtube_dl/extractor/common.py)
|
||||||
|
}
|
||||||
|
except ExtractorError as ee:
|
||||||
|
if not isinstance(ee.cause, compat_HTTPError) or \
|
||||||
|
ee.cause.code != 503:
|
||||||
|
raise
|
||||||
|
webpage = ee.cause.read().decode('utf-8')
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user