193 lines
6.8 KiB
Python
Raw Normal View History

2016-12-04 19:15:35 +01:00
from __future__ import unicode_literals
import re
import operator
from ..utils import ExtractorError
from .jsgrammar import (
COMMENT_RE,
TOKENS_RE,
PUNCTUATIONS_RE,
LOGICAL_OPERATORS_RE,
UNARY_OPERATORS_RE,
RELATIONS_RE,
ASSIGN_OPERATORS_RE,
2016-12-07 07:28:09 +01:00
OPERATORS_RE,
Token
2016-12-04 19:15:35 +01:00
)
# Single-character punctuators mapped to their token ids.
_PUNCTUATIONS = {
    '{': Token.COPEN, '}': Token.CCLOSE,   # curly braces
    '(': Token.POPEN, ')': Token.PCLOSE,   # parentheses
    '[': Token.SOPEN, ']': Token.SCLOSE,   # square brackets
    '.': Token.DOT,
    ';': Token.END,
    ',': Token.COMMA,
    '?': Token.HOOK,
    ':': Token.COLON,
}
# Short-circuiting logical operators: (token id, evaluator) pairs.
_LOGICAL_OPERATORS = {
    '&&': (Token.AND, lambda left, right: left and right),
    '||': (Token.OR, lambda left, right: left or right),
}
# Unary operators: (token id, evaluator taking the single operand).
_UNARY_OPERATORS = {
    '+': (Token.PLUS, lambda operand: operand),
    '-': (Token.NEG, lambda operand: operand * -1),
    '++': (Token.INC, lambda operand: operand + 1),
    '--': (Token.DEC, lambda operand: operand - 1),
    '!': (Token.NOT, operator.not_),
    '~': (Token.BNOT, operator.inv),
    # XXX define these operators
    'delete': (Token.DEL, None),
    'void': (Token.VOID, None),
    # NOTE(review): JS ``typeof`` yields a string; this returns a Python
    # type object -- confirm downstream consumers expect that
    'typeof': (Token.TYPE, lambda operand: type(operand)),
}
# Relational and equality operators: (token id, comparator) pairs.
_RELATIONS = {
    '<': (Token.LT, operator.lt),
    '<=': (Token.LE, operator.le),
    '>': (Token.GT, operator.gt),
    '>=': (Token.GE, operator.ge),
    # XXX check python and JavaScript equality difference
    '==': (Token.EQ, operator.eq),
    '!=': (Token.NE, operator.ne),
    # strict (in)equality additionally requires identical types
    '===': (Token.SEQ, lambda left, right: left == right and type(left) == type(right)),
    '!==': (Token.SNE, lambda left, right: not left == right or not type(left) == type(right)),
    'in': (Token.IN, operator.contains),
    'instanceof': (Token.INSTANCEOF, lambda left, right: isinstance(left, right)),
}
# Binary bitwise/arithmetic operators: (token id, evaluator) pairs.
_OPERATORS = {
    '|': (Token.BOR, operator.or_),
    '^': (Token.BXOR, operator.xor),
    '&': (Token.BAND, operator.and_),
    # NOTE convert to int before shift float
    '<<': (Token.LSHIFT, operator.lshift),
    '>>': (Token.RSHIFT, operator.rshift),
    # unsigned right shift: emulate 32-bit two's complement for negatives
    '>>>': (Token.URSHIFT,
            lambda cur, right: cur >> right if cur >= 0 else (cur + 0x100000000) >> right),
    '+': (Token.ADD, operator.add),
    '-': (Token.SUB, operator.sub),
    '*': (Token.MUL, operator.mul),
    '/': (Token.DIV, operator.truediv),
    '%': (Token.MOD, operator.mod),
}
# Compound assignment ('+=', '|=', ...) derived from the binary operator
# table, plus plain '='.  Values are ('set_<op-id>', evaluator) pairs.
_ASSIGN_OPERATORS = dict(
    (op + '=', ('set_%s' % name, func))
    for op, (name, func) in _OPERATORS.items())
_ASSIGN_OPERATORS['='] = ('set', lambda cur, right: right)
# Dispatch from token category to its operator table.
_operator_lookup = {
    Token.OP: _OPERATORS,
    Token.AOP: _ASSIGN_OPERATORS,
    Token.UOP: _UNARY_OPERATORS,
    Token.LOP: _LOGICAL_OPERATORS,
    Token.REL: _RELATIONS,
}
# Keywords that must never be accepted as identifiers (used by chk_id).
_reserved_words = (
    'break', 'case', 'catch', 'continue', 'debugger', 'default', 'delete',
    'do', 'else', 'finally', 'for', 'function', 'if', 'in', 'instanceof',
    'new', 'return', 'switch', 'this', 'throw', 'try', 'typeof', 'var',
    'void', 'while', 'with',
)
# Master scanner regex: one named alternative per token category.  The
# alternation order determines match priority, so keep it as-is.
_input_element = re.compile(
    r'\s*(?:%(comment)s|%(token)s|%(lop)s|%(uop)s|%(aop)s|%(op)s|%(rel)s|%(punct)s)\s*' % {
        'comment': COMMENT_RE,
        'token': TOKENS_RE,
        'lop': LOGICAL_OPERATORS_RE,
        'uop': UNARY_OPERATORS_RE,
        'aop': ASSIGN_OPERATORS_RE,
        'op': OPERATORS_RE,
        'rel': RELATIONS_RE,
        'punct': PUNCTUATIONS_RE,
    })
def convert_to_unary(token_value):
    """Convert a binary '+'/'-' operator token to its unary equivalent.

    ``token_value`` is an ``(id, evaluator)`` pair whose id must be
    ``Token.ADD`` or ``Token.SUB``; any other id raises ``KeyError``.
    """
    unary_of = {
        Token.ADD: _UNARY_OPERATORS['+'],
        Token.SUB: _UNARY_OPERATORS['-'],
    }
    return unary_of[token_value[0]]
class TokenStream(object):
    """Tokenizer for JavaScript source with arbitrary lookahead.

    Wraps the ``_input_element`` scanner in a generator and buffers
    tokens so callers can ``peek`` ahead without consuming, then
    ``pop``/``flush`` to consume.  Each token is a
    ``(token_id, value, pos)`` triple where ``pos`` is the character
    offset of the token in ``code``.
    """

    def __init__(self, code, start=0):
        super(TokenStream, self).__init__()
        self.code = code       # JavaScript source being tokenized
        self.ended = False     # set once the scanner reaches end of input
        self.peeked = []       # lookahead buffer of not-yet-consumed tokens
        self._ts = self._next_token(start)
        self._last = None      # most recently consumed token

    def _next_token(self, pos=0):
        """Generator yielding ``(token_id, value, pos)`` triples from ``pos``.

        Raises ExtractorError on input matching no token pattern or an
        unknown token category.
        """
        while not self.ended:
            feed_m = _input_element.match(self.code, pos)
            if feed_m is not None:
                token_id = feed_m.lastgroup
                token_value = feed_m.group(token_id)
                pos = feed_m.start(token_id)
                token_id = Token[Token.index(token_id)]
                # flag end-of-input *before* yielding: the caller may never
                # resume the generator after the final token
                self.ended = feed_m.end() >= len(self.code)  # because how yield works
                if token_id is Token.COMMENT:
                    pass  # comments produce no token
                # TODO date
                elif token_id is Token.NULL:
                    yield (token_id, None, pos)
                elif token_id is Token.BOOL:
                    yield (token_id, {'true': True, 'false': False}[token_value], pos)
                elif token_id is Token.STR:
                    # strip the surrounding quotes
                    yield (token_id, token_value[1:-1], pos)
                elif token_id is Token.INT:
                    # leading 0x/0X means hexadecimal, other leading 0 octal
                    root = ((16 if len(token_value) > 2 and token_value[1] in 'xX' else 8)
                            if token_value.startswith('0') else 10)
                    yield (token_id, int(token_value, root), pos)
                elif token_id is Token.FLOAT:
                    yield (token_id, float(token_value), pos)
                elif token_id is Token.REGEX:
                    # TODO error handling
                    regex = re.compile(feed_m.group('rebody'))
                    yield (token_id, (regex, feed_m.group('reflags')), pos)
                elif token_id is Token.ID:
                    yield (token_id, token_value, pos)
                elif token_id in _operator_lookup:
                    # 'in' matches the relation pattern but gets its own token id
                    yield (token_id if token_value != 'in' else Token.IN,
                           _operator_lookup[token_id][token_value],
                           pos)
                elif token_id is Token.PUNCT:
                    yield (_PUNCTUATIONS[token_value], token_value, pos)
                else:
                    raise ExtractorError('Unexpected token at %d' % pos)
                pos = feed_m.end()
            elif pos >= len(self.code):
                self.ended = True
            else:
                raise ExtractorError('Unrecognised sequence at %d' % pos)
        # NOTE: was ``raise StopIteration`` -- inside a generator PEP 479
        # (python 3.7+) turns that into RuntimeError; a plain return ends
        # iteration correctly on all versions.
        return

    def chk_id(self, last=False):
        """Raise ExtractorError unless the last (or next) token is a
        non-reserved identifier."""
        if last:
            name, value, pos = self._last
        else:
            name, value, pos = self.peek()
        if name is not Token.ID or value in _reserved_words:
            raise ExtractorError('Invalid identifier at %d' % pos)

    def peek(self, count=1):
        """Return the ``count``-th upcoming token without consuming it."""
        for _ in range(count - len(self.peeked)):
            token = next(self._ts, None)
            if token is None:
                # stream exhausted: pad with a virtual ';' end token
                self.peeked.append((Token.END, ';', len(self.code)))
            else:
                self.peeked.append(token)
        return self.peeked[count - 1]

    def pop(self, count=1):
        """Consume ``count`` tokens and return the last one consumed."""
        if count > len(self.peeked):
            self.peek(count)
            self.flush()
        else:
            self._last = self.peeked[count - 1]
            self.peeked = self.peeked[count:]
        return self._last

    def flush(self):
        """Consume all peeked tokens; return the last one (if any)."""
        if self.peeked:
            self._last = self.peeked[-1]
            self.peeked = []
        return self._last

    def last(self):
        """Return the most recently consumed token (or None)."""
        return self._last