[XMovies8IE] Add new extractor

yc0 2017-10-11 23:41:17 +08:00
parent f9045dfb04
commit a998cb2f7c


@@ -1,53 +1,33 @@
 # coding: utf-8
 from __future__ import unicode_literals
-import re, time,operator
+import re
+import time
+import operator
+
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
-    compat_str,
-    compat_urllib_parse_urlencode,
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
-    clean_html,
-    urljoin,
     compat_urlparse,
-    ExtractorError,
-    sanitized_Request,
-    update_Request
+    ExtractorError
 )
+
+
 def urljoin(*args):
     """
     Joins given arguments into a url. Trailing but not leading slashes are
     stripped for each argument.
-    The urljoin in utils is not suitable for me.
-    I do not want to join url with the base url.
-    I only want to concat two paths without duplicate slashs
    """
    return "/".join(map(lambda x: str(x).rstrip('/'), args))
-def cookie_to_dict(cookie):
-    cookie_dict = {
-        'name': cookie.name,
-        'value': cookie.value,
-    }
-    if cookie.port_specified:
-        cookie_dict['port'] = cookie.port
-    if cookie.domain_specified:
-        cookie_dict['domain'] = cookie.domain
-    if cookie.path_specified:
-        cookie_dict['path'] = cookie.path
-    if cookie.expires is not None:
-        cookie_dict['expires'] = cookie.expires
-    if cookie.secure is not None:
-        cookie_dict['secure'] = cookie.secure
-    if cookie.discard is not None:
-        cookie_dict['discard'] = cookie.discard
-    try:
-        if (cookie.has_nonstandard_attr('httpOnly') or
-                cookie.has_nonstandard_attr('httponly') or
-                cookie.has_nonstandard_attr('HttpOnly')):
-            cookie_dict['httponly'] = True
-    except TypeError:
-        pass
-    return cookie_dict
+
+
 def evaluate_expression(expr):
     """Evaluate a Javascript expression for the challange and return its value"""
     stack = []
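
Note on the module-level urljoin helper kept above: it only concatenates path
segments, stripping trailing (not leading) slashes, and does not resolve against a
base URL the way youtube_dl.utils.urljoin does. A minimal sketch of its behaviour;
example.invalid is a placeholder host, not something taken from the site:

    def urljoin(*args):  # copy of the helper in this diff
        return "/".join(str(x).rstrip('/') for x in args)

    print(urljoin('http://example.invalid/movie/', 'watching.html'))
    # -> http://example.invalid/movie/watching.html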
@@ -67,18 +47,22 @@ def evaluate_expression(expr):
         value += str(num)
     return int(value)
+
+
 operator_functions = {
     "+": operator.add,
     "-": operator.sub,
     "*": operator.mul,
 }
 
 expression_values = {
     "": 0,
     "+": 0,
     "!+": 1,
     "+!!": 1,
 }
+
+
 class XMovies8IE(InfoExtractor):
     _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
     _VALID_URL = r'''(?x)
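
Side note on the operator_functions / expression_values tables above: CloudFlare's
challenge arithmetic is built from JavaScript tokens such as !+[] and !![] that each
evaluate to 0 or 1; the '+'-separated tokens of one parenthesised group sum to a
single digit, and the digits of all groups are concatenated. A self-contained sketch
of that counting idea (the token table mirrors the values used in this diff; the
sample expression is the one quoted in the removed _get_cv comment):

    vmap = {'': 0, '!': 1, '[]': 0, '!![]': 1, '![]': 0}

    def decode_group(group):
        # one parenthesised group -> one digit
        return sum(vmap[tok] for tok in group.split('+'))

    groups = ('!+[]+!![]+!![]+[]', '!+[]+!![]+!![]+!![]+!![]+!![]')
    print(int(''.join(str(decode_group(g)) for g in groups)))  # -> 36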
@@ -112,73 +96,10 @@ class XMovies8IE(InfoExtractor):
             'vcodec': 'avc1.64001f',
             'acodec': 'mp4a.40.2'}]
         },
-        # 'info_dict': {
-        #     'id': '36164052',
-        #     'ext': 'flv',
-        #     'title': '데일리 에이프릴 요정들의 시상식!',
-        #     'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
-        #     'uploader': 'dailyapril',
-        #     'uploader_id': 'dailyapril',
-        #     'upload_date': '20160503',
-        # },
         'params': {
-            # m3u8 download
             'skip_download': True,
         }
     }
 
-    def _get_cv(self,ct, host_name):
-        #ct = ct.replace('\n', '').replace('\r', '')
-        #find all hidden form value
-        hidden = re.findall('<input type="hidden" name="([^"]+)" value="([^\"]+)"', ct)
-        hidden = '&'.join(map(lambda x:'='.join(x), hidden))
-        # get challange endpoint url
-        url = re.findall('<form id="[^"]+" action="([^"]+)" method="get">', ct)[0]
-        # get var name
-        # var t,r,a,f, kMuTlpA={"t":+((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]))};
-        _, n, m, v = re.findall('var (:?[^,]+,)+ ([^=]+)={"([^"]+)":([^}]+)};', ct, re.DOTALL)[0]
-        v = self._calc_symbol(v)
-        # call eval() to calc expression
-        for op, arg in re.findall('%s\.%s(.)=([^;]+);' % (n, m), ct):
-            v = eval('%d %s %d' % (v, op, self._calc_symbol(arg)))
-        # t = re.findall('\+\s*([^\.]+)\.length', ct, re.DOTALL)[0]
-        # print '%s\.innerHTML\s*=\s*"([^"])";' % t
-        # new_len = len(re.findall('%s\.innerHTML\s*=\s*"([^"]+)";' % t, ct, re.DOTALL)[0])
-        # here we assume the meaning of t in defintely hostname, cf may change in the future
-        v += len(host_name)
-        # get wait time
-        wait = re.findall('}, (\d+)\);', ct, re.DOTALL)[0]
-        return hidden, v, url, wait
-    def _calc_symbol(self,s):
-        _ = re.findall('\+?\(\(([^\)]+)\)\+\(([^\)]+)\)\)', s)
-        #type 1 +((...)+(...)) 2-digit num
-        if _:
-            v1, v2 = map(self._calc_symbol, _[0])
-            return int(str(v1)+str(v2))
-        #type 2 plain
-        else:
-            # use look-up table to replace
-            vmap = {'!':1, '[]':0, '!![]':1, '':0}
-            return sum(map(lambda x:vmap[x], s.split('+')))
-    def _pycfl(self,s):
-        # !+[] 1
-        # !![] 1
-        # ![] 0
-        # [] 0
-        result = ''
-        # print(s) # DEBUG
-        ss = re.split('\(|\)', s)
-        for s in ss:
-            if s in ('+', ''):
-                continue
-            elif s[0] == '+':
-                s = s[1:]
-            s = s.replace('!+[]', '1')
-            s = s.replace('!![]', '1')
-            s = s.replace('![]', '0')
-            s = s.replace('[]', '0')
-            s = s.replace('+!![]', '10')
-            result += str(sum([int(i) for i in s.split('+')]))
-        return result
+
     def _extract_all(self, txt, rules, pos=0, values=None):
         """Calls extract for each rule and returns the result in a dict"""
@@ -189,6 +110,7 @@ class XMovies8IE(InfoExtractor):
             if key:
                 values[key] = result
         return values, pos
 
+
     def _extract(self, txt, begin, end, pos=0):
         """Extract the text between 'begin' and 'end' from 'txt'
@@ -201,7 +123,6 @@ class XMovies8IE(InfoExtractor):
         Returns:
             The string between the two search-strings 'begin' and 'end' beginning
             with position 'pos' in 'txt' as well as the position after 'end'.
-
         If at least one of 'begin' or 'end' is not found, None and the original
         value of 'pos' is returned
@@ -216,10 +137,10 @@ class XMovies8IE(InfoExtractor):
         except ValueError:
             return None, pos
 
-    def _solve_challenge(self, req,headers=None):
+    def _solve_challenge(self, url, headers=None):
         try:
             self._request_webpage(
-                req, None, note='Solve Challenge',headers=headers)
+                url, None, note='Solving Challenge', headers=headers)
         except ExtractorError as ee:
             if not isinstance(ee.cause, compat_HTTPError) or ee.cause.code != 503:
                 raise
@@ -228,15 +149,13 @@ class XMovies8IE(InfoExtractor):
                 ('jschl_vc', 'name="jschl_vc" value="', '"'),
                 ('pass', 'name="pass" value="', '"'),
             ))[0]
-            params["jschl_answer"] = self._solve_jschl(req.full_url, page)
+            params["jschl_answer"] = self._solve_jschl(url, page)
             time.sleep(4)
-            print("params : ",params)
-            req = update_Request(req,urljoin(req.full_url,"/cdn-cgi/l/chk_jschl"),query=params)
-            self._request_webpage(
-                req, None, note='Downloading redirect page',headers=headers,fatal=False)
-            return req
-            # session.get(urllib.parse.urljoin(url, "/cdn-cgi/l/chk_jschl"), params=params)
-            # return session.cookies
+            # print("params : ", params)
+            rst = self._request_webpage(
+                urljoin(url, "/cdn-cgi/l/chk_jschl"), None, note='Downloading redirect page', headers=headers, fatal=False, query=params)
+            return rst
+
     def _solve_jschl(self, url, page):
         """Solve challenge to get 'jschl_answer' value"""
         data, pos = self._extract_all(page, (
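
For context, the handshake implemented by _solve_challenge above is: request the page,
receive an HTTP 503 carrying a hidden form, read jschl_vc and pass out of it, compute
jschl_answer via _solve_jschl, wait about four seconds, then call /cdn-cgi/l/chk_jschl
with all three parameters. A rough standalone sketch of the same flow using requests;
solve_jschl is a stand-in callable, and none of this is youtube-dl API:

    import re
    import time
    import requests
    from urllib.parse import urlparse

    def pass_iuam(session, url, solve_jschl):
        resp = session.get(url)
        if resp.status_code != 503:  # no challenge page was served
            return resp
        page = resp.text
        params = {
            'jschl_vc': re.search(r'name="jschl_vc" value="([^"]+)"', page).group(1),
            'pass': re.search(r'name="pass" value="([^"]+)"', page).group(1),
            'jschl_answer': solve_jschl(url, page),
        }
        time.sleep(4)  # the challenge page enforces a delay before submission
        root = '{0.scheme}://{0.netloc}'.format(urlparse(url))
        return session.get(root + '/cdn-cgi/l/chk_jschl', params=params)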
@@ -255,70 +174,60 @@ class XMovies8IE(InfoExtractor):
                 solution = func(solution, value)
             elif expr.startswith("a.value"):
                 return solution + len(compat_urllib_parse_urlparse(url).netloc)
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
         isWatching = mobj.group('isWatching')
-        print("original :", url)
-        # url = compat_urlparse.urljoin(url, "/watching") if not isWatching else url
+        # print("original :", url)
         base_url = compat_urlparse.urljoin(url, "/")
-        print("base :", base_url)
+        # print("base :", base_url)
         parsed_url = compat_urllib_parse_urlparse(url)
-        print("after parsed:", parsed_url)
+        # print("after parsed:", parsed_url)
         headers = {
             'User-Agent': self._USER_AGENT,
-            # 'Cookie':'__cfduid='+cfduid,
             'Referer': 'http://' + parsed_url.netloc + '/',
-            # 'Host':parsed_url.netloc
         }
-        req = sanitized_Request(base_url)
-        self._solve_challenge(req,headers)
+        self._solve_challenge(base_url, headers)
         try:
             path = urljoin(parsed_url.path, "watching.html") if not isWatching else parsed_url.path
-            #print(path)
-            print(compat_urlparse.urljoin(base_url,path))
+            # print(compat_urlparse.urljoin(base_url, path))
             webpage = self._download_webpage(compat_urlparse.urljoin(base_url, path), video_id, headers=headers)
-            # self.to_screen(webpage)
-            # title = self._html_search_regex(r'<div class="info_movie(?:\sfull.*)[^<]+class="full desc.*<h1>(.+)</h1>',webpage,'title', fatal=False)
-            # self.to_screen(webpage)
             title = self._html_search_regex(r'(?is)<meta[^>]+prop="name" content="([^"]+)', webpage, 'title', fatal=False)
             description = self._html_search_regex(r'(?is)<meta[^>]+prop="description" content="([^"]+)', webpage, 'description', fatal=False)
-            duration = self._html_search_regex(r'(?is)<meta[^>]+prop="duration" content="([^"]+)',webpage,'duration', fatal=False)
+            # duration = self._html_search_regex(r'(?is)<meta[^>]+prop="duration" content="([^"]+)', webpage, 'duration', fatal=False)
             thumbnailUrl = self._html_search_regex(r'(?is)<link[^>]+prop="thumbnailUrl" href="([^"]+)', webpage, 'thumbnailUrl', fatal=False)
             player_id = self._html_search_regex(r'[^}]+else[^{]+{.*load_player\(\'(\d+)\'[^\)]*', webpage, 'player_id', fatal=False)
             movie_id = self._html_search_regex(r'<script[^>]+/javascript\"> var movie = { id: (\d+),', webpage, 'movie_id', fatal=False)
-            print(compat_urlparse.urljoin(base_url,"/ajax/movie/load_player_v3"))
-            load_player_v3 = self._download_json(compat_urlparse.urljoin(base_url,"/ajax/movie/load_player_v3"),video_id,headers=headers,query={'id':player_id})
-            print(title)
-            print(player_id)
-            print(load_player_v3)
-            print(load_player_v3.get('value'))
-            playlist = self._download_json(parsed_url.scheme+":"+load_player_v3.get('value'),video_id,headers=headers)
-            print(playlist)
+            # print(compat_urlparse.urljoin(base_url, "/ajax/movie/load_player_v3"))
+            load_player_v3 = self._download_json(compat_urlparse.urljoin(base_url, "/ajax/movie/load_player_v3"), video_id, note="Downloading player v3", headers=headers, query={'id': player_id})
+            # print(title)
+            # print(player_id)
+            # print(load_player_v3)
+            # print(load_player_v3.get('value'))
+            playlist = self._download_json(parsed_url.scheme + ":" + load_player_v3.get('value'), video_id, note="Downloading video format", headers=headers)
+            # print(playlist)
             formats = None
             for play in playlist.get('playlist'):
-                print(play.get('file'))
+                # print(play.get('file'))
                 # m3u8_formats = self._extract_m3u8_formats(play.get('file'),video_id)
                 formats = self._extract_m3u8_formats(play.get('file'), video_id, "mp4")
-            print(formats)
-            if not formats and error:
-                raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+            # print(formats)
             self._sort_formats(formats)
-            print({
-                'id': movie_id,
-                'title': title,
-                'ext':formats[0].get('ext'),
-                'description': description,
-                'thumbnail': thumbnailUrl,
-                'formats': formats
-            })
+            # print({
+            #     'id': movie_id,
+            #     'title': title,
+            #     'ext': formats[0].get('ext'),
+            #     'description': description,
+            #     'thumbnail': thumbnailUrl,
+            #     'formats': formats
+            # })
             return {
                 'id': movie_id,
                 'title': title,
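
A note on the data flow in _real_extract: it chains three requests, namely the
watching.html page (metadata plus player_id and movie_id), the /ajax/movie/load_player_v3
JSON, and the playlist JSON whose entries carry the m3u8 URLs. The response shapes
sketched below are inferred from the .get() accesses in this diff, not from any
documented API:

    # /ajax/movie/load_player_v3?id=<player_id>
    #   -> {"value": "//some-host/path/to/playlist"}    (scheme-relative URL)
    # <scheme>:<value>
    #   -> {"playlist": [{"file": "https://.../master.m3u8"}, ...]}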
@@ -328,84 +237,7 @@ class XMovies8IE(InfoExtractor):
                 'formats': formats
             }
         except ExtractorError as ee:
-            print("OOOOOO")
-            print(ee)
             if not isinstance(ee.cause, compat_HTTPError) or \
                     ee.cause.code != 503:
                 self.to_screen(ee.cause.read().decode('utf-8'))
                 raise
-            redir_webpage = ee.cause.read().decode('utf-8')
-            cfduid = self._get_cookies(parsed_url.netloc).get('__cfduid').value
-            self._set_cookie(parsed_url.netloc,'__cfduid',cfduid)
-            c, v, u, w = self._get_cv(redir_webpage, parsed_url.netloc)
-            print(c,v,u,w)
-            # action = self._search_regex(
-            #     r'<form id="challenge-form" action="([^"]+)"',
-            #     redir_webpage, 'Redirect form')
-            # vc = self._search_regex(
-            #     r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
-            #     redir_webpage, 'redirect vc value')
-            # pwd = self._search_regex(
-            #     r'<input type="hidden" name="pass" value="([^"]+)"/>',
-            #     redir_webpage, 'redirect pass value')
-            # av = re.search(
-            #     r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
-            #     redir_webpage)
-            # init = re.search(
-            #     r'''
-            #     (?sx)setTimeout\((?:.)*var\s+(?:[a-z],)*\s+(?P<dict>[a-zA-Z]*)={\"(?P<key>[a-zA-Z]*)\":(?P<init>[\(\)!\[\]\+]*)
-            #     '''
-            #     ,redir_webpage)
-            # ans = int(self._pycfl(init.group('init')))
-            # for content in re.finditer(r''+init.group('dict')+'\.'+init.group('key')+'(?P<oper>[+\-\*/])=(?P<val>[\(\)!\[\]\+]*);',redir_webpage):
-            #     if '*' == content.group('oper'):
-            #         ans *= int(self._pycfl(content.group('val')))
-            #     elif '+' == content.group('oper'):
-            #         ans += int(self._pycfl(content.group('val')))
-            #     elif '-' == content.group('oper'):
-            #         ans -= int(self._pycfl(content.group('val')))
-            #     elif '/' == content.group('oper'):
-            #         ans /= int(self._pycfl(content.group('val')))
-            # ans += len(parsed_url.netloc)
-            # confirm_url = (
-            #     parsed_url.scheme + '://' + parsed_url.netloc +
-            #     action + '?' +
-            #     compat_urllib_parse_urlencode({
-            #         'jschl_vc': vc,
-            #         # 'pass': pwd,
-            #         'jschl_answer': compat_str(ans)
-            #     })
-            # )
-            try:
-                time.sleep(int(w)//1000)
-                urlh = self._request_webpage(
-                    req, None, note='Downloading redirect page',headers=headers,fatal=False)
-                # print('%s://%s%s?%s&jschl_answer=%s' % (parsed_url.scheme, parsed_url.netloc,u, c, v))
-                # print(confirm_url)
-                # webpage, url_handle = self._download_webpage_handle(
-                #     confirm_url, None, 'Downloading login page',headers=headers)
-                # webpage = self._download_webpage(
-                #     confirm_url, video_id,
-                #     note='Confirming after redirect',
-                #     headers=headers)
-                self.to_screen(webpage)
-                # title = self._html_search_regex(r'<div class="info_movie(?:\sfull)?"[^>]+<div class="tit full"><h1>(.+?)</h1>', webpage, 'title', fatal=False)
-                # print(title)
-                return {
-                    'id': video_id,
-                    # 'title': title,
-                    'description': self._og_search_description(webpage),
-                    # 'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False),
-                    # TODO more properties (see youtube_dl/extractor/common.py)
-                }
-            except ExtractorError as ee:
-                if not isinstance(ee.cause, compat_HTTPError) or \
-                        ee.cause.code != 503:
-                    raise
-                webpage = ee.cause.read().decode('utf-8')