from __future__ import unicode_literals

import re
import operator

from ..utils import ExtractorError
from .jsgrammar import (
    ASSIGN_OPERATORS_RE,
    COMMENT_RE,
    LINETERMINATORSEQ_RE,
    LOGICAL_OPERATORS_RE,
    OPERATORS_RE,
    TOKENS_RE,
    PUNCTUATIONS_RE,
    RELATIONS_RE,
    UNARY_OPERATORS_RE,
    TokenTypes
)


_PUNCTUATIONS = {
    '{': TokenTypes.COPEN,
    '}': TokenTypes.CCLOSE,
    '(': TokenTypes.POPEN,
    ')': TokenTypes.PCLOSE,
    '[': TokenTypes.SOPEN,
    ']': TokenTypes.SCLOSE,
    '.': TokenTypes.DOT,
    ';': TokenTypes.END,
    ',': TokenTypes.COMMA,
    '?': TokenTypes.HOOK,
    ':': TokenTypes.COLON
}
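# Each operator table below maps JavaScript source text to a pair of
# (token type, Python implementation); a None implementation marks an
# operator that is recognised but not yet supported.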
_LOGICAL_OPERATORS = {
    '&&': (TokenTypes.AND, lambda cur, right: cur and right),
    '||': (TokenTypes.OR, lambda cur, right: cur or right)
}
_UNARY_OPERATORS = {
    '+': (TokenTypes.PLUS, lambda cur: cur),
    '-': (TokenTypes.NEG, lambda cur: cur * -1),
    '++': (TokenTypes.INC, lambda cur: cur + 1),
    '--': (TokenTypes.DEC, lambda cur: cur - 1),
    '!': (TokenTypes.NOT, operator.not_),
    '~': (TokenTypes.BNOT, operator.inv),
    # XXX define these operators
    'delete': (TokenTypes.DEL, None),
    'void': (TokenTypes.VOID, None),
    'typeof': (TokenTypes.TYPE, lambda cur: type(cur))
}
_RELATIONS = {
    '<': (TokenTypes.LT, operator.lt),
    '>': (TokenTypes.GT, operator.gt),
    '<=': (TokenTypes.LE, operator.le),
    '>=': (TokenTypes.GE, operator.ge),
    # XXX check the difference between Python and JavaScript equality semantics
    '==': (TokenTypes.EQ, operator.eq),
    '!=': (TokenTypes.NE, operator.ne),
    '===': (TokenTypes.SEQ, lambda cur, right: cur == right and type(cur) == type(right)),
    '!==': (TokenTypes.SNE, lambda cur, right: not cur == right or not type(cur) == type(right)),
    # NOTE JavaScript evaluates 'key in obj' with the key on the left, which is
    # the reverse of operator.contains' (container, item) argument order
    'in': (TokenTypes.IN, lambda cur, right: cur in right),
    'instanceof': (TokenTypes.INSTANCEOF, lambda cur, right: isinstance(cur, right))
}
_OPERATORS = {
    '|': (TokenTypes.BOR, operator.or_),
    '^': (TokenTypes.BXOR, operator.xor),
    '&': (TokenTypes.BAND, operator.and_),
    # NOTE floats must be converted to int before shifting
    '>>': (TokenTypes.RSHIFT, operator.rshift),
    '<<': (TokenTypes.LSHIFT, operator.lshift),
    '>>>': (TokenTypes.URSHIFT, lambda cur, right: cur >> right if cur >= 0 else (cur + 0x100000000) >> right),
    '-': (TokenTypes.SUB, operator.sub),
    '+': (TokenTypes.ADD, operator.add),
    '%': (TokenTypes.MOD, operator.mod),
    '/': (TokenTypes.DIV, operator.truediv),
    '*': (TokenTypes.MUL, operator.mul)
}
_ASSIGN_OPERATORS = dict((op + '=', ('set_%s' % token[0], token[1])) for op, token in _OPERATORS.items())
_ASSIGN_OPERATORS['='] = ('set', lambda cur, right: right)
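# e.g. '+=' is derived from _OPERATORS['+'] as ('set_%s' % TokenTypes.ADD, operator.add),
# while plain '=' simply evaluates to its right-hand side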

_operator_lookup = {
    TokenTypes.OP: _OPERATORS,
    TokenTypes.AOP: _ASSIGN_OPERATORS,
    TokenTypes.UOP: _UNARY_OPERATORS,
    TokenTypes.LOP: _LOGICAL_OPERATORS,
    TokenTypes.REL: _RELATIONS
}
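# _next_token uses this lookup to attach the matching implementation table
# entry to each operator token it yields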
# used only for validating identifiers
_reserved_words = ('break', 'case', 'catch', 'continue', 'debugger', 'default', 'delete', 'do', 'else', 'finally',
                   'for', 'function', 'if', 'in', 'instanceof', 'new', 'return', 'switch', 'this', 'throw', 'try',
                   'typeof', 'var', 'void', 'while', 'with')
_input_element = re.compile(r'\s*(?:%(comment)s|%(token)s|%(lop)s|%(uop)s|%(aop)s|%(op)s|%(rel)s|%(punct)s)\s*' % {
    'comment': COMMENT_RE,
    'token': TOKENS_RE,
    'lop': LOGICAL_OPERATORS_RE,
    'uop': UNARY_OPERATORS_RE,
    'aop': ASSIGN_OPERATORS_RE,
    'op': OPERATORS_RE,
    'rel': RELATIONS_RE,
    'punct': PUNCTUATIONS_RE
})
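# NOTE the alternation order matters: Python's re takes the first branch that
# matches, so comments and multi-character operators have to be tried before
# their single-character prefixes (e.g. '//' before '/', '&&' before '&')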

_line_terminator = re.compile(LINETERMINATORSEQ_RE)


def convert_to_unary(token_value):
    # a binary ADD/SUB token reuses the same source character as unary plus/minus
    return {TokenTypes.ADD: _UNARY_OPERATORS['+'], TokenTypes.SUB: _UNARY_OPERATORS['-']}[token_value[0]]
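# e.g. convert_to_unary((TokenTypes.SUB, operator.sub)) returns
# _UNARY_OPERATORS['-'], letting the parser treat a binary minus as negation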


class Token(object):
    """A single lexical token: its type, parsed value and source position."""
    def __init__(self, token_type, token_value, pos, line, at):
        super(Token, self).__init__()
        self.id = token_type
        self.value = token_value
        self.pos = pos
        self.line = line
        self.at = at
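# Token instances are produced by TokenStream below; token.id drives the
# parser's dispatch, while pos/line/at are kept for error reporting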


class TokenStream(object):
    """A lazily tokenized stream of JavaScript tokens with peek/pop access."""
    def __init__(self, code, start=0):
        super(TokenStream, self).__init__()
        self.code = code
        self.ended = False
        self.peeked = []
        self._ts = self._next_token(start)
        self._last = None
        self._line = 1 + len(_line_terminator.findall(self.code[:start]))

    def _next_token(self, pos=0):
        while not self.ended:
            feed_m = _input_element.match(self.code, pos)
            if feed_m is not None:
                token_id = feed_m.lastgroup
                token_value = feed_m.group(token_id)
                pos = feed_m.start(token_id)
                token_id = TokenTypes[TokenTypes.index(token_id)]

                # TODO report the line instead of the position
                lt_count, lt_match = 0, None
                # start counting at 1 so that _line advances by the full number
                # of line terminators found inside the matched token
                for lt_count, lt_match in enumerate(_line_terminator.finditer(token_value), 1):
                    pass
                lt_last = pos if lt_match is None else pos + lt_match.start()
                at = pos - lt_last
                self._line += lt_count

                # set 'ended' before yielding: the generator only resumes on the
                # next call, and the flag must already be visible to the caller
                self.ended = feed_m.end() >= len(self.code)
                if token_id is TokenTypes.COMMENT:
                    pass
                # TODO date
                elif token_id is TokenTypes.NULL:
                    yield Token(token_id, None, pos, self._line, at)
                elif token_id is TokenTypes.BOOL:
                    yield Token(token_id, {'true': True, 'false': False}[token_value], pos, self._line, at)
                elif token_id is TokenTypes.STR:
                    yield Token(token_id, token_value[1:-1], pos, self._line, at)
                elif token_id is TokenTypes.INT:
                    # a leading '0x'/'0X' means hexadecimal, a bare leading '0' octal
                    base = ((16 if len(token_value) > 2 and token_value[1] in 'xX' else 8)
                            if token_value.startswith('0') else 10)
                    yield Token(token_id, int(token_value, base), pos, self._line, at)
                elif token_id is TokenTypes.FLOAT:
                    yield Token(token_id, float(token_value), pos, self._line, at)
                elif token_id is TokenTypes.REGEX:
                    # TODO error handling
                    regex = re.compile(feed_m.group('rebody'))
                    yield Token(token_id, (regex, feed_m.group('reflags')), pos, self._line, at)
                elif token_id is TokenTypes.ID:
                    yield Token(token_id, token_value, pos, self._line, at)
                elif token_id in _operator_lookup:
                    yield Token(token_id if token_value != 'in' else TokenTypes.IN,
                                _operator_lookup[token_id][token_value],
                                pos, self._line, at)
                elif token_id is TokenTypes.PUNCT:
                    yield Token(_PUNCTUATIONS[token_value], token_value, pos, self._line, at)
                else:
                    raise ExtractorError('Unexpected token at %d' % pos)
                pos = feed_m.end()
            elif pos >= len(self.code):
                self.ended = True
            else:
                raise ExtractorError('Unrecognised sequence at %d' % pos)

    def chk_id(self, last=False):
        if last:
            token = self._last
        else:
            token = self.peek()
        if token.id is not TokenTypes.ID or token.value in _reserved_words:
            raise ExtractorError('Invalid identifier at %d' % token.pos)

    def peek(self, count=1):
        for _ in range(count - len(self.peeked)):
            token = next(self._ts, None)
            if token is None:
                # the stream is exhausted: synthesize an END token at EOF
                pos = len(self.code)

                lt_count, lt_match = 0, None
                for lt_count, lt_match in enumerate(_line_terminator.finditer(self.code)):
                    pass
                # finditer runs over the whole source here, so lt_match.start()
                # is already an absolute offset and must not be shifted by pos
                lt_last = pos if lt_match is None else lt_match.start()
                at = pos - lt_last

                self.peeked.append(Token(TokenTypes.END, ';', pos, self._line, at))
            else:
                self.peeked.append(token)
        return self.peeked[count - 1]

    def pop(self, count=1):
        if count > len(self.peeked):
            self.peek(count)
            self.flush()
        else:
            self._last = self.peeked[count - 1]
            self.peeked = self.peeked[count:]
        return self._last

    def flush(self):
        if self.peeked:
            self._last = self.peeked[-1]
            self.peeked = []
        return self._last

    def last(self):
        return self._last