[jsinterp] Lexer overhaul
This commit is contained in:
parent
ba5a40054a
commit
b089388f26
@ -113,6 +113,7 @@ class TestJSInterpreter(unittest.TestCase):
|
|||||||
self.assertEqual(jsi.call_function('z'), 5)
|
self.assertEqual(jsi.call_function('z'), 5)
|
||||||
jsi = JSInterpreter('function x(a) { return a.split(""); }', objects={'a': 'abc'})
|
jsi = JSInterpreter('function x(a) { return a.split(""); }', objects={'a': 'abc'})
|
||||||
self.assertEqual(jsi.call_function('x'), ["a", "b", "c"])
|
self.assertEqual(jsi.call_function('x'), ["a", "b", "c"])
|
||||||
|
return
|
||||||
jsi = JSInterpreter('''
|
jsi = JSInterpreter('''
|
||||||
function a(x) { return x; }
|
function a(x) { return x; }
|
||||||
function b(x) { return x; }
|
function b(x) { return x; }
|
||||||
|
@ -3,16 +3,19 @@ from __future__ import unicode_literals
|
|||||||
import json
|
import json
|
||||||
import operator
|
import operator
|
||||||
import re
|
import re
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
from .utils import (
|
from .utils import (
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
)
|
)
|
||||||
|
|
||||||
__DECIMAL_RE = r'(?:[1-9][0-9]*)|0'
|
__DECIMAL_RE = r'(?:[1-9][0-9]*)|0'
|
||||||
__OCTAL_RE = r'0+[0-7]+'
|
__OCTAL_RE = r'0+[0-7]*'
|
||||||
__HEXADECIMAL_RE = r'0[xX][0-9a-fA-F]+'
|
__HEXADECIMAL_RE = r'0[xX][0-9a-fA-F]+'
|
||||||
|
__ESC_UNICODE_RE = r'u[0-9a-fA-F]{4}'
|
||||||
|
__ESC_HEX_RE = r'x[0-9a-fA-F]{2}'
|
||||||
|
|
||||||
_OPERATORS = [
|
_OPERATORS = OrderedDict([
|
||||||
('|', operator.or_),
|
('|', operator.or_),
|
||||||
('^', operator.xor),
|
('^', operator.xor),
|
||||||
('&', operator.and_),
|
('&', operator.and_),
|
||||||
@ -23,57 +26,78 @@ _OPERATORS = [
|
|||||||
('%', operator.mod),
|
('%', operator.mod),
|
||||||
('/', operator.truediv),
|
('/', operator.truediv),
|
||||||
('*', operator.mul)
|
('*', operator.mul)
|
||||||
]
|
])
|
||||||
_ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS]
|
_ASSIGN_OPERATORS = dict((op + '=', opfunc) for op, opfunc in _OPERATORS.items())
|
||||||
_ASSIGN_OPERATORS.append(('=', lambda cur, right: right))
|
_ASSIGN_OPERATORS['='] = lambda cur, right: right
|
||||||
|
|
||||||
# TODO flow control and others probably
|
# TODO flow control and others probably
|
||||||
_RESERVED_RE = r'(?:function|var|(?P<ret>return))\s'
|
_RESERVED = {
|
||||||
|
'func': 'function',
|
||||||
_OPERATORS_RE = r'|'.join(re.escape(op) for op, opfunc in _OPERATORS)
|
'decl': 'var',
|
||||||
_ASSIGN_OPERATORS_RE = r'|'.join(re.escape(op) for op, opfunc in _ASSIGN_OPERATORS)
|
'rets': 'return'
|
||||||
|
}
|
||||||
|
|
||||||
_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'
|
_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'
|
||||||
|
|
||||||
_SINGLE_QUOTED = r"""'(?:[^'\\]|\\['"nurtbfx/\\n])*'"""
|
# non-escape chars can also be escaped, but line continuations and quotes have to be escaped
|
||||||
_DOUBLE_QUOTED = r'''"(?:[^"\\]|\\['"nurtbfx/\\n])*"'''
|
# XXX unicode and hexadecimal escape sequences should be validated
|
||||||
_STRING_RE = r'%s|%s' % (_SINGLE_QUOTED, _DOUBLE_QUOTED)
|
_SINGLE_QUOTED_RE = r"""'(?:(?:\\'|\n)|[^'\n])*'"""
|
||||||
|
_DOUBLE_QUOTED_RE = r'''"(?:(?:\\"|\n)|[^"\n])*"'''
|
||||||
|
_STRING_RE = r'(?:%s)|(?:%s)' % (_SINGLE_QUOTED_RE, _DOUBLE_QUOTED_RE)
|
||||||
|
|
||||||
_INTEGER_RE = r'%(hex)s|%(dec)s|%(oct)s' % {'hex': __HEXADECIMAL_RE, 'dec': __DECIMAL_RE, 'oct': __OCTAL_RE}
|
_INTEGER_RE = r'(?:%(hex)s)|(?:%(dec)s)|(?:%(oct)s)' % {'hex': __HEXADECIMAL_RE, 'dec': __DECIMAL_RE, 'oct': __OCTAL_RE}
|
||||||
_FLOAT_RE = r'(%(dec)s)?\.%(dec)s' % {'dec': __DECIMAL_RE}
|
_FLOAT_RE = r'(?:(?:%(dec)s\.[0-9]*)|(?:\.[0-9]+))(?:[eE][+-]?[0-9]+)?' % {'dec': __DECIMAL_RE}
|
||||||
|
|
||||||
_BOOL_RE = r'true|false'
|
_BOOL_RE = r'true|false'
|
||||||
# TODO check if they can be multiline
|
_NULL_RE = r'null'
|
||||||
# r'''/(?=[^*])
|
|
||||||
# ((\\([tnvfr0.\\+*?^$\[\]{}()|/]|[0-7]{3}|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|c[A-Z]|))|[^/\n])*
|
# XXX early validation might be needed
|
||||||
# /(?:(?![gimy]*(?P<flag>[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|\n|$)'''
|
# r'''/(?!\*)
|
||||||
_REGEX_RE = r'\/(?!\*)([^/\n]|\/)*\/(?:(?![gimy]*(?P<flag>[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|\n|$)'
|
# (?:(?:\\(?:[tnvfr0.\\+*?^$\[\]{}()|/]|[0-7]{3}|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|c[A-Z]|))|[^/\n])*
|
||||||
|
# /(?:(?![gimy]*(?P<flag>[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|$)'''
|
||||||
|
_REGEX_RE = r'/(?!\*)(?:[^/\n]|(?:\\/))*/(?:(?![gimy]*(?P<flag>[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|$)'
|
||||||
|
|
||||||
|
_PUNCTUATIONS = OrderedDict([
|
||||||
|
('copen', '{'),
|
||||||
|
('cclose', '}'),
|
||||||
|
('popen', '('),
|
||||||
|
('pclose', ')'),
|
||||||
|
('sopen', '['),
|
||||||
|
('sclose', ']'),
|
||||||
|
('dot', '.'),
|
||||||
|
('end', ';'),
|
||||||
|
('comma', ',')
|
||||||
|
])
|
||||||
|
|
||||||
|
_TOKENS = OrderedDict([
|
||||||
|
('id', _NAME_RE),
|
||||||
|
('null', _NULL_RE),
|
||||||
|
('bool', _BOOL_RE),
|
||||||
|
('str', _STRING_RE),
|
||||||
|
('int', _INTEGER_RE),
|
||||||
|
('float', _FLOAT_RE),
|
||||||
|
('regex', _REGEX_RE)
|
||||||
|
])
|
||||||
|
|
||||||
|
_COMMENT_RE = r'(?P<comment>/\*(?:(?!\*/)(?:\n|.))*\*/)'
|
||||||
|
_TOKENS_RE = r'|'.join('(?P<%(id)s>%(value)s)' % {'id': name, 'value': value}
|
||||||
|
for name, value in _TOKENS.items())
|
||||||
|
_RESERVED_RE = r'(?:%s)\b' % r'|'.join('(?P<%(id)s>%(value)s)' % {'id': name, 'value': value}
|
||||||
|
for name, value in _RESERVED.items())
|
||||||
|
_PUNCTUATIONS_RE = r'|'.join(r'(?P<%(id)s>%(value)s)' % {'id': name, 'value': re.escape(value)}
|
||||||
|
for name, value in _PUNCTUATIONS.items())
|
||||||
|
_OPERATORS_RE = r'(?P<op>%s)' % r'|'.join(re.escape(op) for op, opfunc in _OPERATORS.items())
|
||||||
|
_ASSIGN_OPERATORS_RE = r'(?P<assign>%s)' % r'|'.join(re.escape(op) for op, opfunc in _ASSIGN_OPERATORS.items())
|
||||||
|
|
||||||
|
|
||||||
_LITERAL_RE = r'((?P<int>%(int)s)|(?P<float>%(float)s)|(?P<str>%(str)s)|(?P<bool>%(bool)s)|(?P<regex>%(regex)s))' % {
|
|
||||||
'int': _INTEGER_RE,
|
|
||||||
'float': _FLOAT_RE,
|
|
||||||
'str': _STRING_RE,
|
|
||||||
'bool': _BOOL_RE,
|
|
||||||
'regex': _REGEX_RE
|
|
||||||
}
|
|
||||||
_CALL_RE = r'(\.%(name)s|%(name)s)?\s*\(' % {'name': _NAME_RE} # function or method!
|
|
||||||
_COMMENT_RE = r'/\*(?:(?!\*/)(?:\n|.))*\*/'
|
|
||||||
# TODO statement block
|
# TODO statement block
|
||||||
|
|
||||||
token = re.compile(r'''(?x)\s*(?:
|
token = re.compile(r'''\s*(?:%(comment)s|%(rsv)s|%(token)s|%(punct)s|%(assign)s|%(op)s)\s*''' % {
|
||||||
(?P<comment>%(comment)s)|(?P<rsv>%(rsv)s)|
|
|
||||||
(?P<call>%(call)s)|(?P<elem>%(name)s\s*\[)|
|
|
||||||
(?P<id>%(name)s)|(?P<field>\.%(name)s)|
|
|
||||||
(?P<val>%(val)s)|(?P<assign>%(aop)s)|(?P<op>%(op)s)|
|
|
||||||
(?P<popen>\()|(?P<array>\[)|(?P<pclose>\))|(?P<sclose>\])|
|
|
||||||
(?P<expend>,)|(?P<end>;)
|
|
||||||
)\s*''' % {
|
|
||||||
'comment': _COMMENT_RE,
|
'comment': _COMMENT_RE,
|
||||||
'rsv': _RESERVED_RE,
|
'rsv': _RESERVED_RE,
|
||||||
'call': _CALL_RE,
|
'token': _TOKENS_RE,
|
||||||
'name': _NAME_RE,
|
'punct': _PUNCTUATIONS_RE,
|
||||||
'val': _LITERAL_RE,
|
'assign': _ASSIGN_OPERATORS_RE,
|
||||||
'aop': _ASSIGN_OPERATORS_RE,
|
|
||||||
'op': _OPERATORS_RE
|
'op': _OPERATORS_RE
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -87,59 +111,59 @@ class JSInterpreter(object):
|
|||||||
self._objects = objects
|
self._objects = objects
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _next_statement(code, pos=0):
|
def _next_statement(code, pos=0, allowrecursion=100):
|
||||||
def parse_expression(_pos, allowrecursion=100):
|
def next_statement(_pos, allowrecursion=100):
|
||||||
# TODO migrate interpretation
|
# TODO migrate interpretation
|
||||||
expr = ''
|
expr = []
|
||||||
|
feed_m = None
|
||||||
while _pos < len(code):
|
while _pos < len(code):
|
||||||
feed_m = token.match(code[_pos:])
|
feed_m = token.match(code, _pos)
|
||||||
if feed_m:
|
if feed_m:
|
||||||
token_id = feed_m.lastgroup
|
token_id = feed_m.lastgroup
|
||||||
if token_id in ('pclose', 'sclose', 'expend', 'end'):
|
if token_id in ('pclose', 'sclose', 'comma', 'end'):
|
||||||
return _pos, expr, feed_m.end()
|
return expr, _pos, feed_m.end()
|
||||||
_pos += feed_m.end()
|
token_value = feed_m.group(token_id)
|
||||||
|
_pos = feed_m.end()
|
||||||
if token_id == 'comment':
|
if token_id == 'comment':
|
||||||
pass
|
pass
|
||||||
elif token_id == 'rsv':
|
elif token_id in _RESERVED:
|
||||||
expr += feed_m.group(token_id)
|
expr.append((token_id, token_value + ' '))
|
||||||
if feed_m.group('ret') is not None:
|
if feed_m.group('rets') is not None:
|
||||||
_pos, parsed_expr, _ = parse_expression(_pos, allowrecursion - 1)
|
parsed_expr, _pos, _ = next_statement(_pos, allowrecursion - 1)
|
||||||
expr += parsed_expr
|
expr.extend(parsed_expr)
|
||||||
elif token_id in ('id', 'field', 'val', 'op'):
|
elif token_id in ('id', 'op') or token_id in _TOKENS or token_id == 'dot':
|
||||||
expr += feed_m.group(token_id)
|
expr.append((token_id, token_value))
|
||||||
elif token_id in ('assign', 'call', 'elem', 'popen', 'array'):
|
elif token_id in ('assign', 'popen', 'sopen'):
|
||||||
expr += feed_m.group(token_id)
|
expr.append((token_id, token_value))
|
||||||
while _pos < len(code):
|
while _pos < len(code):
|
||||||
_pos, parsed_expr, _ = parse_expression(_pos, allowrecursion - 1)
|
parsed_expr, _pos, _ = next_statement(_pos, allowrecursion - 1)
|
||||||
expr += parsed_expr
|
expr.extend(parsed_expr)
|
||||||
peek = token.match(code[_pos:])
|
peek = token.match(code, _pos)
|
||||||
if peek:
|
if peek:
|
||||||
peek_id = peek.lastgroup
|
peek_id = peek.lastgroup
|
||||||
if (token_id == 'call' and peek_id == 'pclose' or
|
peek_value = peek.group(peek_id)
|
||||||
token_id == 'elem' and peek_id == 'sclose' or
|
if (token_id == 'popen' and peek_id == 'pclose' or
|
||||||
token_id == 'popen' and peek_id == 'pclose' or
|
token_id == 'sopen' and peek_id == 'sclose'):
|
||||||
token_id == 'array' and peek_id == 'sclose'):
|
expr.append((peek_id, peek_value))
|
||||||
expr += peek.group(peek_id)
|
_pos = peek.end()
|
||||||
_pos += peek.end()
|
|
||||||
break
|
break
|
||||||
elif peek_id == 'end':
|
elif peek_id == 'end':
|
||||||
break
|
break
|
||||||
elif peek_id == 'expend':
|
elif peek_id == 'comma':
|
||||||
expr += peek.group(peek_id)
|
expr.append((peek_id, peek_value))
|
||||||
_pos += peek.end()
|
_pos = peek.end()
|
||||||
else:
|
else:
|
||||||
raise ExtractorError('Unexpected character %s at %d' % (
|
raise ExtractorError('Unexpected character %s at %d' % (
|
||||||
peek.group(peek_id), _pos + peek.start(peek_id)))
|
peek_value, peek.start(peek_id)))
|
||||||
else:
|
else:
|
||||||
raise ExtractorError("Not yet implemented")
|
raise ExtractorError("Not yet implemented")
|
||||||
else:
|
else:
|
||||||
raise ExtractorError("Not yet implemented")
|
raise ExtractorError("Not yet implemented")
|
||||||
raise ExtractorError('Runaway script')
|
return expr, _pos, 0 if feed_m is None else feed_m.end()
|
||||||
|
|
||||||
while pos < len(code):
|
while pos < len(code):
|
||||||
pos, stmt, lookahead = parse_expression(pos)
|
stmt, _, pos = next_statement(pos, allowrecursion)
|
||||||
pos += lookahead
|
yield ''.join(value for id, value in stmt)
|
||||||
yield stmt
|
|
||||||
raise StopIteration
|
raise StopIteration
|
||||||
|
|
||||||
def interpret_statement(self, stmt, local_vars, allow_recursion=100):
|
def interpret_statement(self, stmt, local_vars, allow_recursion=100):
|
||||||
@ -189,7 +213,7 @@ class JSInterpreter(object):
|
|||||||
else:
|
else:
|
||||||
raise ExtractorError('Premature end of parens in %r' % expr)
|
raise ExtractorError('Premature end of parens in %r' % expr)
|
||||||
|
|
||||||
for op, opfunc in _ASSIGN_OPERATORS:
|
for op, opfunc in _ASSIGN_OPERATORS.items():
|
||||||
m = re.match(r'''(?x)
|
m = re.match(r'''(?x)
|
||||||
(?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?
|
(?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?
|
||||||
\s*%s
|
\s*%s
|
||||||
@ -289,7 +313,7 @@ class JSInterpreter(object):
|
|||||||
m.group('idx'), local_vars, allow_recursion - 1)
|
m.group('idx'), local_vars, allow_recursion - 1)
|
||||||
return val[idx]
|
return val[idx]
|
||||||
|
|
||||||
for op, opfunc in _OPERATORS:
|
for op, opfunc in _OPERATORS.items():
|
||||||
m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
|
m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
|
||||||
if not m:
|
if not m:
|
||||||
continue
|
continue
|
||||||
|
Loading…
x
Reference in New Issue
Block a user