add xmovie8 extractor, and bypass the cloudflare challenge

2017-10-11 10:50:39 +08:00 · 2017-10-11 10:50:39 +08:00 · 6aa6cfc113
commit 6aa6cfc113
parent cf5f6ed5be
2 changed files with 331 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1307,6 +1307,7 @@ from .xiami import (
    XiamiCollectionIE
 )
 from .xminus import XMinusIE
+from .xmovies8 import XMovies8IE
 from .xnxx import XNXXIE
 from .xstream import XstreamIE
 from .xtube import XTubeUserIE, XTubeIE
--- a/youtube_dl/extractor/xmovies8.py
+++ b/youtube_dl/extractor/xmovies8.py
@ -0,0 +1,330 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re, time,operator
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+    compat_urllib_parse_urlencode,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    clean_html,
+    urljoin,
+    compat_urlparse,
+    ExtractorError,
+    sanitized_Request,
+    update_Request
+)
+def cookie_to_dict(cookie):
+    cookie_dict = {
+        'name': cookie.name,
+        'value': cookie.value,
+    }
+    if cookie.port_specified:
+        cookie_dict['port'] = cookie.port
+    if cookie.domain_specified:
+        cookie_dict['domain'] = cookie.domain
+    if cookie.path_specified:
+        cookie_dict['path'] = cookie.path
+    if cookie.expires is not None:
+        cookie_dict['expires'] = cookie.expires
+    if cookie.secure is not None:
+        cookie_dict['secure'] = cookie.secure
+    if cookie.discard is not None:
+        cookie_dict['discard'] = cookie.discard
+    try:
+        if (cookie.has_nonstandard_attr('httpOnly') or
+                cookie.has_nonstandard_attr('httponly') or
+                cookie.has_nonstandard_attr('HttpOnly')):
+            cookie_dict['httponly'] = True
+    except TypeError:
+        pass
+    return cookie_dict
+def evaluate_expression(expr):
+    """Evaluate a Javascript expression for the challange and return its value"""
+    stack = []
+    ranges = []
+    value = ""
+    for index, char in enumerate(expr):
+        if char == "(":
+            stack.append(index+1)
+        elif char == ")":
+            begin = stack.pop()
+            if stack:
+                ranges.append((begin, index))
+    for subexpr in [expr[begin:end] for begin, end in ranges] or (expr,):
+        num = 0
+        for part in subexpr.split("[]"):
+            num += expression_values[part]
+        value += str(num)
+    return int(value)
+ 
+operator_functions = {
+    "+": operator.add,
+    "-": operator.sub,
+    "*": operator.mul,
+}
+ 
+expression_values = {
+    "": 0,
+    "+": 0,
+    "!+": 1,
+    "+!!": 1,
+}
+class XMovies8IE(InfoExtractor):
+    _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
+    _VALID_URL = r'''(?x)
+        https?://(?:www\.)?xmovies8\.(?:tv|es)/movie/
+        (?P<id>[a-zA-Z\-\.0-9]+)/?
+        (?P<isWatching>watching)?
+        (?:\.html)?
+        '''
+    _TEST = {
+        'url': 'https://xmovies8.es/movie/the-hitman-s-bodyguard-2017.58852',
+       
+        # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
+        'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
+        'info_dict': {
+            'id': '36164052',
+            'ext': 'flv',
+            'title': '데일리 에이프릴 요정들의 시상식!',
+            'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+            'uploader': 'dailyapril',
+            'uploader_id': 'dailyapril',
+            'upload_date': '20160503',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }
+    def _get_cv(self,ct, host_name):
+        #ct = ct.replace('\n', '').replace('\r', '')
+        #find all hidden form value
+        hidden = re.findall('<input type="hidden" name="([^"]+)" value="([^\"]+)"', ct)
+        hidden = '&'.join(map(lambda x:'='.join(x), hidden))
+        # get challange endpoint url
+        url = re.findall('<form id="[^"]+" action="([^"]+)" method="get">', ct)[0]
+        # get var name
+        # var t,r,a,f, kMuTlpA={"t":+((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]))};
+        _, n, m, v = re.findall('var (:?[^,]+,)+ ([^=]+)={"([^"]+)":([^}]+)};', ct, re.DOTALL)[0]
+        v = self._calc_symbol(v)
+        # call eval() to calc expression
+        for op, arg in re.findall('%s\.%s(.)=([^;]+);' % (n, m), ct):
+            v = eval('%d %s %d' % (v, op, self._calc_symbol(arg)))
+        # t = re.findall('\+\s*([^\.]+)\.length', ct, re.DOTALL)[0]
+        # print '%s\.innerHTML\s*=\s*"([^"])";' % t
+        # new_len = len(re.findall('%s\.innerHTML\s*=\s*"([^"]+)";' % t, ct, re.DOTALL)[0])
+        # here we assume the meaning of t in defintely hostname, cf may change in the future
+        v += len(host_name)
+        # get wait time
+        wait = re.findall('}, (\d+)\);', ct, re.DOTALL)[0]
+        return hidden, v, url, wait
+    def _calc_symbol(self,s):
+        _ = re.findall('\+?\(\(([^\)]+)\)\+\(([^\)]+)\)\)', s)
+        #type 1 +((...)+(...)) 2-digit num
+        if _:
+            v1, v2 = map(self._calc_symbol, _[0])
+            return int(str(v1)+str(v2))
+        #type 2 plain
+        else:
+            # use look-up table to replace
+            vmap = {'!':1, '[]':0, '!![]':1, '':0}
+            return sum(map(lambda x:vmap[x], s.split('+')))
+    def _pycfl(self,s):
+        # !+[]  1
+        # !![]  1
+        # ![]   0
+        # []    0
+        result = ''
+        # print(s)  # DEBUG
+        ss = re.split('\(|\)', s)
+        for s in ss:
+            if s in ('+', ''):
+                continue
+            elif s[0] == '+':
+                s = s[1:]
+            s = s.replace('!+[]', '1')
+            s = s.replace('!![]', '1')
+            s = s.replace('![]', '0')
+            s = s.replace('[]', '0')
+            s = s.replace('+!![]', '10')
+            result += str(sum([int(i) for i in s.split('+')]))
+        return result
+
+    def _extract_all(self,txt, rules, pos=0, values=None):
+        """Calls extract for each rule and returns the result in a dict"""
+        if values is None:
+            values = {}
+        for key, begin, end in rules:
+            result, pos = self._extract(txt, begin, end, pos)
+            if key:
+                values[key] = result
+        return values, pos
+    def _extract(self,txt, begin, end, pos=0):
+        """Extract the text between 'begin' and 'end' from 'txt'
+    
+        Args:
+            txt: String to search in
+            begin: First string to be searched for
+            end: Second string to be searched for after 'begin'
+            pos: Starting position for searches in 'txt'
+    
+        Returns:
+            The string between the two search-strings 'begin' and 'end' beginning
+            with position 'pos' in 'txt' as well as the position after 'end'.
+    
+            If at least one of 'begin' or 'end' is not found, None and the original
+            value of 'pos' is returned
+    
+        Examples:
+            extract("abcde", "b", "d")    -> "c" , 4
+            extract("abcde", "b", "d", 3) -> None, 3
+        """
+        try:
+            first = txt.index(begin, pos) + len(begin)
+            last = txt.index(end, first)
+            return txt[first:last], last+len(end)
+        except ValueError:
+            return None, pos
+    
+    def _solve_challenge(self, req,headers=None):
+        try:
+            self._request_webpage(
+                req, None, note='Solve Challenge',headers=headers)
+        except ExtractorError as ee:
+            if not isinstance(ee.cause, compat_HTTPError) or ee.cause.code != 503:
+                raise
+            page = ee.cause.read().decode('utf-8')
+            params = self._extract_all(page, (
+                ('jschl_vc', 'name="jschl_vc" value="', '"'),
+                ('pass'    , 'name="pass" value="', '"'),
+            ))[0]
+            params["jschl_answer"] = self._solve_jschl(req.full_url, page)
+            time.sleep(4)
+            print("params : ",params)
+            req = update_Request(req,urljoin(req.full_url,"/cdn-cgi/l/chk_jschl"),query=params)
+            self._request_webpage(
+                req, None, note='Downloading redirect page',headers=headers,fatal=False)
+            return req
+            # session.get(urllib.parse.urljoin(url, "/cdn-cgi/l/chk_jschl"), params=params)
+            # return session.cookies
+    def _solve_jschl(self,url, page):
+        """Solve challenge to get 'jschl_answer' value"""
+        data, pos = self._extract_all(page, (
+            ('var' , ',f, ', '='),
+            ('key' , '"', '"'),
+            ('expr', ':', '}')
+        ))
+        solution = evaluate_expression(data["expr"])
+        variable = "{}.{}".format(data["var"], data["key"])
+        vlength = len(variable)
+        expressions = self._extract(page, "'challenge-form');", "f.submit();", pos)[0]
+        for expr in expressions.split(";")[1:]:
+            if expr.startswith(variable):
+                func = operator_functions[expr[vlength]]
+                value = evaluate_expression(expr[vlength+2:])
+                solution = func(solution, value)
+            elif expr.startswith("a.value"):
+                return solution + len(compat_urllib_parse_urlparse(url).netloc)
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        isWatching = mobj.group('isWatching')
+
+        
+        full_video_url = compat_urlparse.urljoin(url, "/watching.html") if not isWatching else url
+        parsed_url = compat_urllib_parse_urlparse(full_video_url)
+        headers = {
+            'User-Agent': self._USER_AGENT,
+            # 'Cookie':'__cfduid='+cfduid,
+            'Referer':'http://'+parsed_url.netloc+'/',
+            # 'Host':parsed_url.netloc
+        }
+        req = sanitized_Request(full_video_url)
+        self._solve_challenge(req,headers)
+        try:
+            webpage = self._download_webpage(req, video_id, headers=headers)
+            print("??????")
+        except ExtractorError as ee:
+            # print(ee)
+            if not isinstance(ee.cause, compat_HTTPError) or \
+               ee.cause.code != 503:
+                raise
+            redir_webpage = ee.cause.read().decode('utf-8')
+            cfduid = self._get_cookies(parsed_url.netloc).get('__cfduid').value
+            self._set_cookie(parsed_url.netloc,'__cfduid',cfduid)
+            
+            c, v, u, w = self._get_cv(redir_webpage, parsed_url.netloc)
+            print(c,v,u,w)
+            # action = self._search_regex(
+            #     r'<form id="challenge-form" action="([^"]+)"',
+            #     redir_webpage, 'Redirect form')
+            # vc = self._search_regex(
+            #     r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
+            #     redir_webpage, 'redirect vc value')
+            # pwd = self._search_regex(
+            #     r'<input type="hidden" name="pass" value="([^"]+)"/>',
+            #     redir_webpage, 'redirect pass value')
+            # av = re.search(
+            #     r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
+            #     redir_webpage)
+            # init = re.search(
+            #     r'''
+            #     (?sx)setTimeout\((?:.)*var\s+(?:[a-z],)*\s+(?P<dict>[a-zA-Z]*)={\"(?P<key>[a-zA-Z]*)\":(?P<init>[\(\)!\[\]\+]*)
+            #     '''
+            # ,redir_webpage)
+
+            # ans = int(self._pycfl(init.group('init')))
+            # for content in re.finditer(r''+init.group('dict')+'\.'+init.group('key')+'(?P<oper>[+\-\*/])=(?P<val>[\(\)!\[\]\+]*);',redir_webpage):
+            #     if '*' == content.group('oper'):
+            #         ans *= int(self._pycfl(content.group('val')))
+            #     elif '+' == content.group('oper'):
+            #         ans += int(self._pycfl(content.group('val')))
+            #     elif '-' == content.group('oper'):
+            #         ans -= int(self._pycfl(content.group('val')))
+            #     elif '/' == content.group('oper'):
+            #         ans /= int(self._pycfl(content.group('val')))
+            
+            # ans += len(parsed_url.netloc)
+            # confirm_url = (
+            #     parsed_url.scheme + '://' + parsed_url.netloc +
+            #     action + '?' +
+            #     compat_urllib_parse_urlencode({
+            #         'jschl_vc': vc,
+            #         # 'pass': pwd,
+            #         'jschl_answer': compat_str(ans)                  
+            #         })
+            # )
+            try:
+                time.sleep(int(w)//1000)
+                urlh = self._request_webpage(
+                    req, None, note='Downloading redirect page',headers=headers,fatal=False)
+                # print('%s://%s%s?%s&jschl_answer=%s' % (parsed_url.scheme, parsed_url.netloc,u, c, v))
+                # print(confirm_url)
+                
+                # webpage, url_handle = self._download_webpage_handle(
+                    # confirm_url, None, 'Downloading login page',headers=headers)
+                # webpage = self._download_webpage(
+                #     confirm_url, video_id,
+                #     note='Confirming after redirect',
+                #     headers=headers)
+
+                self.to_screen(webpage)
+                # title = self._html_search_regex(r'<div class="info_movie(?:\sfull)?"[^>]+<div class="tit full"><h1>(.+?)</h1>', webpage, 'title', fatal=False)
+                # print(title)
+                return {
+                    'id': video_id,
+                    # 'title': title,
+                    'description': self._og_search_description(webpage),
+                    # 'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False),
+                    # TODO more properties (see youtube_dl/extractor/common.py)
+                }
+            except ExtractorError as ee:
+                if not isinstance(ee.cause, compat_HTTPError) or \
+                    ee.cause.code != 503:
+                    raise
+                webpage = ee.cause.read().decode('utf-8')
+