from __future__ import unicode_literals

import re

from ..compat import compat_str
from ..utils import ExtractorError
from .jsparser import Parser
from .jsgrammar import TokenTypes, token_keys
from .jsbuilt_ins import global_obj
from .jsbuilt_ins.base import isprimitive
from .jsbuilt_ins.internals import to_string
from .jsbuilt_ins.utils import to_js

# ``re._pattern_type`` is gone on Python 3.7+; taking the type of a compiled
# pattern works on every supported interpreter.
_RE_PATTERN_TYPE = type(re.compile(''))


class Context(object):
    """Execution scope holding local variables and a "returned" flag.

    ``local_vars`` maps identifier -> ``Reference``; ``ended`` is set to True
    by the interpreter once a ``return`` statement has been executed in this
    scope.
    """

    def __init__(self, variables=None, ended=False):
        super(Context, self).__init__()
        self.ended = ended
        self.no_in = True
        self.local_vars = {}
        if variables is not None:
            for k, v in dict(variables).items():
                # XXX validate identifiers
                self.local_vars[k] = Reference(v, (self.local_vars, k))


class Reference(object):
    """A value together with the (container, key) slot it was read from.

    A reference created without ``parent_key`` is read-only: ``putvalue``
    raises ``ExtractorError`` on it.
    """

    def __init__(self, value, parent_key=None):
        super(Reference, self).__init__()
        self._value = value
        if parent_key is not None:
            self._parent, self._name = parent_key
        else:
            self._parent = self._name = None

    def getvalue(self, deep=False):
        """Return the wrapped value.

        With ``deep=True`` the first level of a list/tuple/dict is unwrapped
        as well (elements that are not primitive are asked for their own
        ``getvalue()``); deeper nesting is not unwrapped.
        """
        value = self._value
        if deep:
            if isinstance(self._value, (list, tuple)):
                # TODO test nested arrays
                value = [elem if isprimitive(elem) else elem.getvalue()
                         for elem in self._value]
            elif isinstance(self._value, dict):
                value = {}
                for key, prop in self._value.items():
                    value[key] = prop.getvalue()
        return value

    def putvalue(self, value):
        """Store ``value`` in the parent slot and in this reference.

        Returns ``value``.  Raises ``ExtractorError`` when the reference has
        no parent or the parent does not support item assignment.
        """
        if self._parent is None:
            raise ExtractorError('Trying to set a read-only reference')
        if not hasattr(self._parent, '__setitem__'):
            raise ExtractorError('Unknown reference')
        # The slot receives a fresh Reference; this one is updated too so
        # that holders of the old object observe the new value.
        self._parent[self._name] = Reference(value, (self._parent, self._name))
        self._value = value
        return value

    def __repr__(self):
        if self._parent is not None:
            # _parent is the container itself, the key lives in _name.
            parent, key = self._parent, self._name
            return '<Reference value: %s, parent: %s@(0x%x), key: %s>' % (
                str(self._value), parent.__class__.__name__, id(parent), key)
        return '<Reference value: %s, parent: %s>' % (self._value, None)

    def __eq__(self, other):
        # Two references are equal when they designate the same slot,
        # regardless of the value currently stored.
        if isinstance(other, Reference):
            return self._parent is other._parent and self._name == other._name
        return False

    def __ne__(self, other):
        return not self.__eq__(other)


class JSInterpreter(object):
    """Interpreter for the JavaScript subset produced by ``Parser``.

    ``code`` is the JavaScript source; ``variables`` optionally pre-populates
    the global scope with Python values (dicts, sequences, numbers, strings,
    booleans, compiled patterns and ``None``).
    """
    # TODO support json

    def __init__(self, code, variables=None):
        super(JSInterpreter, self).__init__()
        self.code = code
        self.global_vars = {}
        if variables is not None:
            for k, v in dict(variables).items():
                # XXX validate identifiers
                self.global_vars[k] = self.create_reference(v, (self.global_vars, k))
        self._context = Context()
        self._context_stack = []

    @property
    def this(self):
        """Local variables of the currently active context."""
        return self._context.local_vars

    def create_reference(self, value, parent_key):
        """Recursively wrap a Python value into ``Reference`` objects.

        Dicts become dicts of references, lists/tuples/sets become lists of
        references, supported scalars are stored as they are.  Raises
        ``ExtractorError`` for any other type.
        """
        if isinstance(value, dict):
            o = {}
            for k, v in value.items():
                o[k] = self.create_reference(v, (o, k))
        elif isinstance(value, (list, tuple, set)):
            o = []
            for k, v in enumerate(value):
                # append, not o[k] = ...: index assignment on an empty list
                # raises IndexError.
                o.append(self.create_reference(v, (o, k)))
        elif (isinstance(value, (int, float, compat_str, bool, _RE_PATTERN_TYPE))
                or value is None):
            o = value
        else:
            raise ExtractorError('Unsupported type, %s in variables' % type(value))

        return Reference(o, parent_key)

    def interpret_statement(self, stmt):
        """Execute one parsed statement.

        Returns the ``Reference`` produced by the statement, or ``None`` when
        it produces nothing.  Raises ``ExtractorError`` for statement kinds
        that are not implemented.
        """
        if stmt is None:
            return None

        name = stmt[0]
        ref = None
        if name == TokenTypes.FUNC:
            func_name, args, body = stmt[1:]
            if func_name is None:
                raise ExtractorError('Function expression is not yet implemented')
            # Declarations made while a context is pushed are local to it,
            # otherwise they are global; the reference's parent must be the
            # very container the function is stored in.
            scope = self.this if self._context_stack else self.global_vars
            scope[func_name] = Reference(
                self.build_function(args, body), (scope, func_name))
        elif name is TokenTypes.BLOCK:
            for inner in stmt[1]:
                s = self.interpret_statement(inner)
                if s is not None:
                    # Keep the Reference: every caller calls getvalue() on
                    # the result of interpret_statement.
                    ref = s
                if self._context.ended:
                    # A return was executed inside the block.
                    break
        elif name is TokenTypes.VAR:
            for var_name, value in stmt[1]:
                value = (self.interpret_expression(value).getvalue()
                         if value is not None
                         else global_obj.get_prop('undefined'))
                self.this[var_name] = Reference(value, (self.this, var_name))
        elif name is TokenTypes.EXPR:
            for expr in stmt[1]:
                ref = self.interpret_expression(expr)
        # if
        # continue, break
        elif name is TokenTypes.RETURN:
            ref = self.interpret_statement(stmt[1])
            self._context.ended = True
        # with
        # label
        # switch
        # throw
        # try
        # debugger
        else:
            raise ExtractorError("Can't interpret statement called %s" % name)
        return ref

    def interpret_expression(self, expr):
        """Evaluate one parsed expression and return it as a ``Reference``.

        Returns ``None`` for a ``None`` expression.  Raises ``ExtractorError``
        for unsupported expression kinds, invalid assignment targets and
        identifiers that cannot be resolved.
        """
        if expr is None:
            return None

        name = expr[0]

        if name is TokenTypes.ASSIGN:
            op, left, right = expr[1:]
            if op is None:
                ref = self.interpret_expression(left)
            else:
                try:
                    leftref = self.interpret_expression(left)
                except ExtractorError:
                    # The target could not be resolved.  If it is a bare
                    # identifier, declare it in the current scope as
                    # undefined so the assignment can proceed.
                    lname = left[0]
                    key = None
                    if lname is TokenTypes.OPEXPR and len(left[1]) == 1:
                        lname = left[1][0][0]
                        if lname is TokenTypes.MEMBER:
                            lid, args, tail = left[1][0][1:]
                            if lid[0] is TokenTypes.ID and args is None and tail is None:
                                key = lid[1]
                    if key is None:
                        raise ExtractorError('Invalid left-hand side in assignment')
                    leftref = self.this[key] = Reference(
                        global_obj.get_prop('undefined'), (self.this, key))
                leftvalue = leftref.getvalue()
                rightvalue = self.interpret_expression(right).getvalue()
                leftref.putvalue(op(leftvalue, rightvalue))
                # XXX check specs what to return
                ref = leftref

        elif name is TokenTypes.EXPR:
            ref = self.interpret_statement(expr)

        elif name is TokenTypes.OPEXPR:
            # expr[1] is the expression in reverse Polish notation.
            stack = []
            postfix = []
            # FIXME support pre- and postfix operators
            for token in expr[1]:
                # XXX relation 'in' 'instanceof'
                if token[0] in (TokenTypes.OP, TokenTypes.AOP, TokenTypes.LOP, TokenTypes.REL):
                    right = stack.pop()
                    left = stack.pop()
                    stack.append(Reference(token[1](left.getvalue(), right.getvalue())))
                # XXX add unary operator 'delete', 'void', 'instanceof'
                elif token[0] is TokenTypes.UOP:
                    right = stack.pop()
                    stack.append(Reference(token[1](right.getvalue())))
                elif token[0] is TokenTypes.PREFIX:
                    right = stack.pop()
                    stack.append(Reference(right.putvalue(token[1](right.getvalue()))))
                elif token[0] is TokenTypes.POSTFIX:
                    # Applied after the whole expression has been evaluated.
                    postfix.append((stack[-1], token[1]))
                else:
                    stack.append(self.interpret_expression(token))
            result = stack.pop()
            if stack:
                raise ExtractorError('Expression has too many values')
            for operand, op in postfix:
                operand.putvalue(op(operand.getvalue()))
            ref = result

        elif name is TokenTypes.MEMBER:
            # TODO interpret member
            target, args, tail = expr[1:]
            target = self.interpret_expression(target)
            if args is not None:
                # TODO interpret NewExpression
                pass
            source = None
            # tail is a linked list of (kind, value, next) accessors.
            while tail is not None:
                tail_name, tail_value, tail = tail
                if tail_name is TokenTypes.FIELD:
                    source = to_js(target.getvalue())
                    target = source.get_prop(tail_value)
                elif tail_name is TokenTypes.ELEM:
                    prop = self.interpret_expression(tail_value).getvalue()
                    target = to_js(target.getvalue()).get_prop(to_string(to_js(prop)))
                elif tail_name is TokenTypes.CALL:
                    args = (self.interpret_expression(arg).getvalue() for arg in tail_value)
                    func = target if isprimitive(target) else target.getvalue()
                    # A preceding field access supplies the receiver as the
                    # first argument.
                    if source is None:
                        target = func(*args)
                    else:
                        target = func(source, *args)
                # Re-wrap so that the next accessor can call getvalue().
                if isprimitive(target):
                    target = Reference(target)
                else:
                    target = Reference(target.getvalue())
            ref = target

        elif name is TokenTypes.ID:
            # XXX error handling (unknown id)
            ident = expr[1]
            try:
                ref = (self.this[ident] if ident in self.this
                       else self.global_vars[ident])
            except KeyError:
                # Not a known variable: try to find an object or a function
                # definition with that name in the source.
                try:
                    ref = Reference(self.extract_object(ident))
                except AttributeError:
                    ref = Reference(self.extract_function(ident))

        # literal
        elif name in token_keys:
            ref = Reference(expr[1])

        elif name is TokenTypes.ARRAY:
            array = []
            for key, elem in enumerate(expr[1]):
                value = self.interpret_expression(elem).getvalue()
                array.append(Reference(value, (array, key)))
            ref = Reference(array)

        else:
            raise ExtractorError("Can't interpret expression called %s" % name)

        return ref

    def extract_object(self, objname):
        """Find ``objname = {...};`` in the source and build its methods.

        Returns a dict mapping field name -> Python callable.  Raises
        ``AttributeError`` when no such object literal is found (the
        identifier lookup in ``interpret_expression`` relies on that type).
        """
        obj = {}
        obj_m = re.search(
            (r'(?:var\s+)?%s\s*=\s*\{' % re.escape(objname)) +
            r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)' +
            r'\}\s*;',
            self.code)
        if obj_m is None:
            raise AttributeError('Could not find JS object %r' % objname)
        fields = obj_m.group('fields')
        # Currently, it only supports function definitions
        fields_m = re.finditer(
            r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function'
            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            fields)
        for f in fields_m:
            argnames = f.group('args').split(',')
            obj[f.group('key')] = self.build_function(
                argnames, Parser(f.group('code')).parse())

        return obj

    def extract_function(self, funcname):
        """Find the definition of ``funcname`` in the source and build it.

        Returns a Python callable.  Raises ``ExtractorError`` when no
        definition is found.
        """
        func_m = re.search(
            r'''(?x)
                (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
                \((?P<args>[^)]*)\)\s*
                \{(?P<code>[^}]+)\}''' % (
                re.escape(funcname), re.escape(funcname), re.escape(funcname)),
            self.code)
        if func_m is None:
            raise ExtractorError('Could not find JS function %r' % funcname)
        # The argument list may contain whitespace ("a, b") or be empty.
        argnames = [arg.strip() for arg in func_m.group('args').split(',')
                    if arg.strip()]

        return self.build_function(argnames, Parser(func_m.group('code')).parse())

    def push_context(self, cx):
        """Make ``cx`` the active context, saving the previous one."""
        self._context_stack.append(self._context)
        self._context = cx

    def pop_context(self):
        """Restore the most recently saved context."""
        # XXX check underflow
        self._context = self._context_stack.pop()

    def call_function(self, funcname, *args):
        """Call the JS function ``funcname`` with ``args``.

        The function is looked up in the current scope, then in the globals,
        and finally searched for in the source text.
        """
        if funcname in self.this:
            f = self.this[funcname].getvalue()
        elif funcname in self.global_vars:
            f = self.global_vars[funcname].getvalue()
        else:
            f = self.extract_function(funcname)
        return f(*args)

    def build_function(self, argnames, ast):
        """Return a Python callable executing ``ast`` with ``argnames`` bound.

        Each call runs in a fresh ``Context``; the result is the deep value
        of the last executed statement (``None`` if it produced nothing).
        """
        def resf(*args):
            self.push_context(Context(dict(zip(argnames, args))))
            try:
                res = None
                for stmt in ast:
                    res = self.interpret_statement(stmt)
                    res = None if res is None else res.getvalue(deep=True)
                    if self._context.ended:
                        break
                return res
            finally:
                # Always restore the caller's context, including when the
                # body has no return statement or raises.
                self.pop_context()
        return resf

    def run(self, cx=None):
        """Parse and execute ``self.code``, optionally inside context ``cx``.

        Returns the deep value of the last executed statement, or ``None``.
        """
        if cx is not None:
            self.push_context(cx)
        res = None
        for stmt in Parser(self.code).parse():
            res = self.interpret_statement(stmt)
            res = None if res is None else res.getvalue(deep=True)
            if self._context.ended:
                # NOTE(review): cx is popped only when a return was executed;
                # otherwise it stays active after run() -- confirm whether
                # callers rely on that before changing it.
                if cx is not None:
                    self.pop_context()
                break
        return res