[jsinterp] Refactoring and minor fixes

parent f605783764
commit f6ad8db133

youtube_dl/jsinterp/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .jsinterp import JSInterpreter

__all__ = ['JSInterpreter']
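
Callers keep importing the interpreter from youtube_dl.jsinterp; the package root simply re-exports the class. A minimal usage sketch, not part of the commit, with the constructor signature assumed unchanged from the previous single-module version:

from youtube_dl.jsinterp import JSInterpreter

jsi = JSInterpreter('function f(x) { return x + 1; }')  # hypothetical snippet; JSInterpreter(code) signature assumed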

youtube_dl/jsinterp/jsgrammar.py (new file, 62 lines)
@@ -0,0 +1,62 @@
from __future__ import unicode_literals

import re

__DECIMAL_RE = r'(?:[1-9][0-9]*)|0'
__OCTAL_RE = r'0[0-7]+'
__HEXADECIMAL_RE = r'0[xX][0-9a-fA-F]+'
__ESC_UNICODE_RE = r'u[0-9a-fA-F]{4}'
__ESC_HEX_RE = r'x[0-9a-fA-F]{2}'


# NOTE order is fixed due to regex matching, does not represent any precedence
_punctuations = ['{', '}', '(', ')', '[', ']', '.', ';', ',', '?', ':']
_logical_operator = ['||', '&&']
_unary_operator = ['++', '--', '!', '~', 'delete', 'void', 'typeof']
_relation = ['===', '!==', '==', '!=', '<=', '>=', '<', '>']
_operator = ['|', '^', '&', '>>>', '>>', '<<', '-', '+', '%', '/', '*']
_assign_operator = [op + '=' for op in _operator]
_assign_operator.append('=')

# XXX add support for unicode chars
_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'

# non-escape chars can also be escaped, but line continuations and quotes have to be
# XXX unicode and hexadecimal escape sequences should be validated
_SINGLE_QUOTED_RE = r"""'(?:(?:\\'|\n)|[^'\n])*'"""
_DOUBLE_QUOTED_RE = r'''"(?:(?:\\"|\n)|[^"\n])*"'''
_STRING_RE = r'(?:%s)|(?:%s)' % (_SINGLE_QUOTED_RE, _DOUBLE_QUOTED_RE)

_INTEGER_RE = r'(?:%(hex)s)|(?:%(dec)s)|(?:%(oct)s)' % {'hex': __HEXADECIMAL_RE, 'dec': __DECIMAL_RE, 'oct': __OCTAL_RE}
_FLOAT_RE = r'(?:(?:%(dec)s\.[0-9]*)|(?:\.[0-9]+))(?:[eE][+-]?[0-9]+)?' % {'dec': __DECIMAL_RE}

_BOOL_RE = r'true|false'
_NULL_RE = r'null'

# XXX early validation might be needed
# r'''/(?!\*)
# (?:(?:\\(?:[tnvfr0.\\+*?^$\[\]{}()|/]|[0-7]{3}|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|c[A-Z]|))|[^/\n])*
# /(?:(?![gimy]*(?P<flag>[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|$)'''
_REGEX_FLAGS_RE = r'(?![gimy]*(?P<reflag>[gimy])[gimy]*(?P=reflag))(?P<reflags>[gimy]{0,4}\b)'
_REGEX_RE = r'/(?!\*)(?P<rebody>(?:[^/\n]|(?:\\/))*)/(?:(?:%s)|(?:\s|$))' % _REGEX_FLAGS_RE

_TOKENS = [
    ('null', _NULL_RE),
    ('bool', _BOOL_RE),
    ('id', _NAME_RE),
    ('str', _STRING_RE),
    ('int', _INTEGER_RE),
    ('float', _FLOAT_RE),
    ('regex', _REGEX_RE)
]

COMMENT_RE = r'(?P<comment>/\*(?:(?!\*/)(?:\n|.))*\*/)'
TOKENS_RE = r'|'.join('(?P<%(id)s>%(value)s)' % {'id': name, 'value': value}
                      for name, value in _TOKENS)

PUNCTUATIONS_RE = r'(?P<punc>%s)' % r'|'.join(re.escape(value) for value in _punctuations)
LOGICAL_OPERATORS_RE = r'(?P<lop>%s)' % r'|'.join(re.escape(value) for value in _logical_operator)
UNARY_OPERATORS_RE = r'(?P<uop>%s)' % r'|'.join(re.escape(value) for value in _unary_operator)
RELATIONS_RE = r'(?P<rel>%s)' % r'|'.join(re.escape(value) for value in _relation)
OPERATORS_RE = r'(?P<op>%s)' % r'|'.join(re.escape(value) for value in _operator)
ASSIGN_OPERATORS_RE = r'(?P<aop>%s)' % r'|'.join(re.escape(value) for value in _assign_operator)
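
The module only exports regular-expression fragments; compiling TOKENS_RE is enough to classify a single token by the named group that matched. A minimal sketch, not part of the commit, assuming the new package layout is importable:

import re

from youtube_dl.jsinterp.jsgrammar import TOKENS_RE

token = re.compile(TOKENS_RE)
m = token.match('"abc"')
print(m.lastgroup, m.group(0))  # name of the matching token group ('str') and its text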
@@ -1,205 +1,14 @@
from __future__ import unicode_literals

import json
import operator
import re

from .utils import ExtractorError
from ..utils import ExtractorError
from .tstream import TokenStream

__DECIMAL_RE = r'(?:[1-9][0-9]*)|0'
__OCTAL_RE = r'0[0-7]+'
__HEXADECIMAL_RE = r'0[xX][0-9a-fA-F]+'
__ESC_UNICODE_RE = r'u[0-9a-fA-F]{4}'
__ESC_HEX_RE = r'x[0-9a-fA-F]{2}'

_PUNCTUATIONS = {
    'copen': '{',
    'cclose': '}',
    'popen': '(',
    'pclose': ')',
    'sopen': '[',
    'sclose': ']',
    'dot': '.',
    'end': ';',
    'comma': ',',
    'hook': '?',
    'colon': ':'
}

# TODO find a final storage solution (already)
_LOGICAL_OPERATORS = {
    '&&': ('and', lambda cur, right: cur and right),
    '||': ('or', lambda cur, right: cur or right)
}
_UNARY_OPERATORS = {
    '++': ('inc', lambda cur: cur + 1),
    '--': ('dec', lambda cur: cur - 1),
    '!': ('not', operator.not_),
    '~': ('bnot', lambda cur: cur ^ -1),
    # XXX define these operators
    'delete': ('del', None),
    'void': ('void', None),
    'typeof': ('type', lambda cur: type(cur))
}
_RELATIONS = {
    '<': ('lt', operator.lt),
    '>': ('gt', operator.gt),
    '<=': ('le', operator.le),
    '>=': ('ge', operator.ge),
    # XXX check python and JavaScript equality difference
    '==': ('eq', operator.eq),
    '!=': ('ne', operator.ne),
    '===': ('seq', lambda cur, right: cur == right and type(cur) == type(right)),
    '!==': ('sne', lambda cur, right: not cur == right or not type(cur) == type(right))
}
_OPERATORS = {
    '|': ('bor', operator.or_),
    '^': ('bxor', operator.xor),
    '&': ('band', operator.and_),
    # NOTE convert to int before shift float
    '>>': ('rshift', operator.rshift),
    '<<': ('lshift', operator.lshift),
    '>>>': ('urshift', lambda cur, right: cur >> right if cur >= 0 else (cur + 0x100000000) >> right),
    '-': ('sub', operator.sub),
    '+': ('add', operator.add),
    '%': ('mod', operator.mod),
    '/': ('div', operator.truediv),
    '*': ('mul', operator.mul)
}
_ASSIGN_OPERATORS = dict((op + '=', ('set_%s' % token[0], token[1])) for op, token in _OPERATORS.items())
_ASSIGN_OPERATORS['='] = ('set', lambda cur, right: right)

# NOTE merely fixed due to regex matching, does not represent any precedence
_logical_operator_order = _LOGICAL_OPERATORS.keys()  # whatever
_unary_operator_order = _UNARY_OPERATORS.keys()  # evs
_relation_order = ['===', '!==', '==', '!=', '<=', '>=', '<', '>']
_operator_order = ['|', '^', '&', '>>>', '>>', '<<', '-', '+', '%', '/', '*']
_assign_operator_order = [op + '=' for op in _operator_order]
_assign_operator_order.append('=')

# only to check ids
_RESERVED_WORDS = ('break', 'case', 'catch', 'continue', 'debugger', 'default', 'delete', 'do', 'else', 'finally',
                   'for', 'function', 'if', 'in', 'instanceof', 'new', 'return', 'switch', 'this', 'throw',
                   'try', 'typeof', 'var', 'void', 'while', 'with')

# XXX add support for unicode chars
_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'

# non-escape char also can be escaped, but line continuation and quotes has to be
# XXX unicode and hexadecimal escape sequences should be validated
_SINGLE_QUOTED_RE = r"""'(?:(?:\\'|\n)|[^'\n])*'"""
_DOUBLE_QUOTED_RE = r'''"(?:(?:\\"|\n)|[^"\n])*"'''
_STRING_RE = r'(?:%s)|(?:%s)' % (_SINGLE_QUOTED_RE, _DOUBLE_QUOTED_RE)

_INTEGER_RE = r'(?:%(hex)s)|(?:%(dec)s)|(?:%(oct)s)' % {'hex': __HEXADECIMAL_RE, 'dec': __DECIMAL_RE, 'oct': __OCTAL_RE}
_FLOAT_RE = r'(?:(?:%(dec)s\.[0-9]*)|(?:\.[0-9]+))(?:[eE][+-]?[0-9]+)?' % {'dec': __DECIMAL_RE}

_BOOL_RE = r'true|false'
_NULL_RE = r'null'

# XXX early validation might needed
# r'''/(?!\*)
# (?:(?:\\(?:[tnvfr0.\\+*?^$\[\]{}()|/]|[0-7]{3}|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|c[A-Z]|))|[^/\n])*
# /(?:(?![gimy]*(?P<flag>[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|$)'''
_REGEX_FLAGS_RE = r'(?![gimy]*(?P<reflag>[gimy])[gimy]*(?P=reflag))(?P<reflags>[gimy]{0,4}\b)'
_REGEX_RE = r'/(?!\*)(?P<rebody>(?:[^/\n]|(?:\\/))*)/(?:(?:%s)|(?:\s|$))' % _REGEX_FLAGS_RE

re.compile(_REGEX_RE)

_TOKENS = [
    ('null', _NULL_RE),
    ('bool', _BOOL_RE),
    ('id', _NAME_RE),
    ('str', _STRING_RE),
    ('int', _INTEGER_RE),
    ('float', _FLOAT_RE),
    ('regex', _REGEX_RE)
]

_token_keys = set(name for name, value in _TOKENS)

_COMMENT_RE = r'(?P<comment>/\*(?:(?!\*/)(?:\n|.))*\*/)'
_TOKENS_RE = r'|'.join('(?P<%(id)s>%(value)s)' % {'id': name, 'value': value}
                       for name, value in _TOKENS)
_PUNCTUATIONS_RE = r'|'.join(r'(?P<%(id)s>%(value)s)' % {'id': name, 'value': re.escape(value)}
                             for name, value in _PUNCTUATIONS.items())
_LOGICAL_OPERATORS_RE = r'(?P<lop>%s)' % r'|'.join(re.escape(value) for value in _logical_operator_order)
_UNARY_OPERATORS_RE = r'(?P<uop>%s)' % r'|'.join(re.escape(value) for value in _unary_operator_order)
_RELATIONS_RE = r'(?P<rel>%s)' % r'|'.join(re.escape(value) for value in _relation_order)
_OPERATORS_RE = r'(?P<op>%s)' % r'|'.join(re.escape(value) for value in _operator_order)
_ASSIGN_OPERATORS_RE = r'(?P<aop>%s)' % r'|'.join(re.escape(value) for value in _assign_operator_order)

input_element = re.compile(r'\s*(?:%(comment)s|%(token)s|%(punct)s|%(lop)s|%(uop)s|%(rel)s|%(aop)s|%(op)s)\s*' % {
    'comment': _COMMENT_RE,
    'token': _TOKENS_RE,
    'punct': _PUNCTUATIONS_RE,
    'lop': _LOGICAL_OPERATORS_RE,
    'uop': _UNARY_OPERATORS_RE,
    'rel': _RELATIONS_RE,
    'aop': _ASSIGN_OPERATORS_RE,
    'op': _OPERATORS_RE
})


class TokenStream(object):
    def __init__(self, code, start=0):
        self.code = code
        self.ended = False
        self.peeked = []
        self._ts = self._next_token(start)

    def _next_token(self, pos=0):
        while pos < len(self.code):
            feed_m = input_element.match(self.code, pos)
            if feed_m is not None:
                token_id = feed_m.lastgroup
                token_value = feed_m.group(token_id)
                pos = feed_m.start(token_id)
                if token_id == 'comment':
                    pass
                elif token_id in _token_keys:
                    # TODO date
                    if token_id == 'null':
                        yield (token_id, None, pos)
                    elif token_id == 'bool':
                        yield (token_id, {'true': True, 'false': False}[token_value], pos)
                    elif token_id == 'str':
                        yield (token_id, token_value, pos)
                    elif token_id == 'int':
                        yield (token_id, int(token_value), pos)
                    elif token_id == 'float':
                        yield (token_id, float(token_value), pos)
                    elif token_id == 'regex':
                        # TODO error handling
                        regex = re.compile(feed_m.group('rebody'))
                        yield (token_id, {'re': regex, 'flags': feed_m.group('reflags')}, pos)
                    elif token_id in ('lor', 'uop', 'rel', 'aop', 'op'):
                        yield (token_id, _LOGICAL_OPERATORS[token_value])
                    else:
                        yield (token_id, token_value, pos)
                else:
                    yield (token_id, token_value, pos)
                pos = feed_m.end()
            else:
                raise ExtractorError('Unexpected character sequence at %d' % pos)
        raise StopIteration

    def peek(self, count=1):
        for _ in range(count - len(self.peeked)):
            token = next(self._ts, None)
            if token is None:
                self.ended = True
                self.peeked.append(('end', ';', len(self.code)))
            else:
                self.peeked.append(token)
        return self.peeked[count - 1]

    def pop(self):
        if not self.peeked:
            self.peek()
        return self.peeked.pop(0)
_token_keys = 'null', 'bool', 'id', 'str', 'int', 'float', 'regex'


# TODO support json
class JSInterpreter(object):
    undefined = object()

@@ -210,11 +19,6 @@ class JSInterpreter(object):
        self._functions = {}
        self._objects = objects

    @staticmethod
    def _chk_id(name, at):
        if name in _RESERVED_WORDS:
            raise ExtractorError('Invalid identifier at %d' % at)

    def _next_statement(self, token_stream, stack_top):
        if stack_top < 0:
            raise ExtractorError('Recursion limit reached')
@@ -234,7 +38,7 @@
            # block
            token_stream.pop()
            statement_list = []
            for s in self._next_statement(token_stream, stack_top - 1):
            for s in self.statements(token_stream, stack_top - 1):
                statement_list.append(s)
            token_id, token_value, token_pos = token_stream.peek()
            if token_id == 'cclose':
@@ -251,7 +55,7 @@
            token_id, token_value, token_pos = token_stream.pop()
            if token_id != 'id':
                raise ExtractorError('Missing variable name at %d' % token_pos)
            self._chk_id(token_value, token_pos)
            token_stream.chk_id(last=True)
            variables.append(token_value)

            peek_id, peek_value, peek_pos = token_stream.peek()
@@ -355,12 +159,14 @@

        left = self._conditional_expression(token_stream, stack_top - 1)
        peek_id, peek_value, peek_pos = token_stream.peek()
        if peek_id in _assign_operator_order:
        if peek_id == 'aop':
            token_stream.pop()
            _, op = peek_value
            right = self._assign_expression(token_stream, stack_top - 1)
        else:
            op = None
            right = None
        return ('assign', left, right)
        return ('assign', op, left, right)

    def _member_expression(self, token_stream, stack_top):
        peek_id, peek_value, peek_pos = token_stream.peek()
@@ -422,7 +228,7 @@
                raise ExtractorError('Function expression is not yet supported at %d' % peek_pos)
            # id
            else:
                self._chk_id(peek_value, peek_pos)
                token_stream.chk_id()
                return ('id', peek_value)
        # literals
        else:
@@ -614,180 +420,10 @@
        return ('rpn', out)

    def interpret_statement(self, stmt, local_vars, allow_recursion=100):
        if allow_recursion < 0:
            raise ExtractorError('Recursion limit reached')

        should_abort = False
        stmt = stmt.lstrip()
        stmt_m = re.match(r'var\s', stmt)
        if stmt_m:
            expr = stmt[len(stmt_m.group(0)):]
        else:
            return_m = re.match(r'return(?:\s+|$)', stmt)
            if return_m:
                expr = stmt[len(return_m.group(0)):]
                should_abort = True
            else:
                # Try interpreting it as an expression
                expr = stmt

        v = self.interpret_expression(expr, local_vars, allow_recursion)
        return v, should_abort
        pass

    def interpret_expression(self, expr, local_vars, allow_recursion):
        expr = expr.strip()

        if expr == '':  # Empty expression
            return None

        if expr.startswith('('):
            parens_count = 0
            for m in re.finditer(r'[()]', expr):
                if m.group(0) == '(':
                    parens_count += 1
                else:
                    parens_count -= 1
                    if parens_count == 0:
                        sub_expr = expr[1:m.start()]
                        sub_result = self.interpret_expression(
                            sub_expr, local_vars, allow_recursion)
                        remaining_expr = expr[m.end():].strip()
                        if not remaining_expr:
                            return sub_result
                        else:
                            expr = json.dumps(sub_result) + remaining_expr
                        break
            else:
                raise ExtractorError('Premature end of parens in %r' % expr)

        for op, opfunc in _ASSIGN_OPERATORS:
            m = re.match(r'''(?x)
                (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?
                \s*%s
                (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr)
            if not m:
                continue
            right_val = self.interpret_expression(
                m.group('expr'), local_vars, allow_recursion - 1)

            if m.groupdict().get('index'):
                lvar = local_vars[m.group('out')]
                idx = self.interpret_expression(
                    m.group('index'), local_vars, allow_recursion)
                assert isinstance(idx, int)
                cur = lvar[idx]
                val = opfunc(cur, right_val)
                lvar[idx] = val
                return val
            else:
                cur = local_vars.get(m.group('out'))
                val = opfunc(cur, right_val)
                local_vars[m.group('out')] = val
                return val

        if expr.isdigit():
            return int(expr)

        var_m = re.match(
            r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE,
            expr)
        if var_m:
            return local_vars[var_m.group('name')]

        try:
            return json.loads(expr)
        except ValueError:
            pass

        m = re.match(
            r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
            expr)
        if m:
            variable = m.group('var')
            member = m.group('member')
            arg_str = m.group('args')

            if variable in local_vars:
                obj = local_vars[variable]
            else:
                if variable not in self._objects:
                    self._objects[variable] = self.extract_object(variable)
                obj = self._objects[variable]

            if arg_str is None:
                # Member access
                if member == 'length':
                    return len(obj)
                return obj[member]

            assert expr.endswith(')')
            # Function call
            if arg_str == '':
                argvals = tuple()
            else:
                argvals = tuple([
                    self.interpret_expression(v, local_vars, allow_recursion)
                    for v in arg_str.split(',')])

            if member == 'split':
                assert argvals == ('',)
                return list(obj)
            if member == 'join':
                assert len(argvals) == 1
                return argvals[0].join(obj)
            if member == 'reverse':
                assert len(argvals) == 0
                obj.reverse()
                return obj
            if member == 'slice':
                assert len(argvals) == 1
                return obj[argvals[0]:]
            if member == 'splice':
                assert isinstance(obj, list)
                index, howMany = argvals
                res = []
                for i in range(index, min(index + howMany, len(obj))):
                    res.append(obj.pop(index))
                return res

            return obj[member](argvals)

        m = re.match(
            r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
        if m:
            val = local_vars[m.group('in')]
            idx = self.interpret_expression(
                m.group('idx'), local_vars, allow_recursion - 1)
            return val[idx]

        for op, opfunc in _OPERATORS:
            m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
            if not m:
                continue
            x, abort = self.interpret_statement(
                m.group('x'), local_vars, allow_recursion - 1)
            if abort:
                raise ExtractorError(
                    'Premature left-side return of %s in %r' % (op, expr))
            y, abort = self.interpret_statement(
                m.group('y'), local_vars, allow_recursion - 1)
            if abort:
                raise ExtractorError(
                    'Premature right-side return of %s in %r' % (op, expr))
            return opfunc(x, y)

        m = re.match(
            r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr)
        if m:
            fname = m.group('func')
            argvals = tuple([
                int(v) if v.isdigit() else local_vars[v]
                for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple()
            if fname not in self._functions:
                self._functions[fname] = self.extract_function(fname)
            return self._functions[fname](argvals)

        raise ExtractorError('Unsupported JS expression %r' % expr)
        pass

    def extract_object(self, objname):
        obj = {}

youtube_dl/jsinterp/tstream.py (new file, 170 lines)
@@ -0,0 +1,170 @@
from __future__ import unicode_literals

import re
import operator

from ..utils import ExtractorError
from .jsgrammar import (
    COMMENT_RE,
    TOKENS_RE,
    PUNCTUATIONS_RE,
    LOGICAL_OPERATORS_RE,
    UNARY_OPERATORS_RE,
    RELATIONS_RE,
    ASSIGN_OPERATORS_RE,
    OPERATORS_RE
)


_PUNCTUATIONS = {
    '{': 'copen',
    '}': 'cclose',
    '(': 'popen',
    ')': 'pclose',
    '[': 'sopen',
    ']': 'sclose',
    '.': 'dot',
    ';': 'end',
    ',': 'comma',
    '?': 'hook',
    ':': 'colon'
}
_LOGICAL_OPERATORS = {
    '&&': ('and', lambda cur, right: cur and right),
    '||': ('or', lambda cur, right: cur or right)
}
_UNARY_OPERATORS = {
    '++': ('inc', lambda cur: cur + 1),
    '--': ('dec', lambda cur: cur - 1),
    '!': ('not', operator.not_),
    '~': ('bnot', lambda cur: cur ^ -1),
    # XXX define these operators
    'delete': ('del', None),
    'void': ('void', None),
    'typeof': ('type', lambda cur: type(cur))
}
_RELATIONS = {
    '<': ('lt', operator.lt),
    '>': ('gt', operator.gt),
    '<=': ('le', operator.le),
    '>=': ('ge', operator.ge),
    # XXX check python and JavaScript equality difference
    '==': ('eq', operator.eq),
    '!=': ('ne', operator.ne),
    '===': ('seq', lambda cur, right: cur == right and type(cur) == type(right)),
    '!==': ('sne', lambda cur, right: not cur == right or not type(cur) == type(right))
}
_OPERATORS = {
    '|': ('bor', operator.or_),
    '^': ('bxor', operator.xor),
    '&': ('band', operator.and_),
    # NOTE convert to int before shift float
    '>>': ('rshift', operator.rshift),
    '<<': ('lshift', operator.lshift),
    '>>>': ('urshift', lambda cur, right: cur >> right if cur >= 0 else (cur + 0x100000000) >> right),
    '-': ('sub', operator.sub),
    '+': ('add', operator.add),
    '%': ('mod', operator.mod),
    '/': ('div', operator.truediv),
    '*': ('mul', operator.mul)
}
_ASSIGN_OPERATORS = dict((op + '=', ('set_%s' % token[0], token[1])) for op, token in _OPERATORS.items())
_ASSIGN_OPERATORS['='] = ('set', lambda cur, right: right)

# only to check ids
_RESERVED_WORDS = ('break', 'case', 'catch', 'continue', 'debugger', 'default', 'delete', 'do', 'else', 'finally',
                   'for', 'function', 'if', 'in', 'instanceof', 'new', 'return', 'switch', 'this', 'throw', 'try',
                   'typeof', 'var', 'void', 'while', 'with')


_input_element = re.compile(r'\s*(?:%(comment)s|%(token)s|%(punct)s|%(lop)s|%(uop)s|%(rel)s|%(aop)s|%(op)s)\s*' % {
    'comment': COMMENT_RE,
    'token': TOKENS_RE,
    'punct': PUNCTUATIONS_RE,
    'lop': LOGICAL_OPERATORS_RE,
    'uop': UNARY_OPERATORS_RE,
    'rel': RELATIONS_RE,
    'aop': ASSIGN_OPERATORS_RE,
    'op': OPERATORS_RE
})


class TokenStream(object):
    def __init__(self, code, start=0):
        self.code = code
        self.ended = False
        self.peeked = []
        self._ts = self._next_token(start)
        self._last = None

    def _next_token(self, pos=0):
        while pos < len(self.code):
            feed_m = _input_element.match(self.code, pos)
            if feed_m is not None:
                token_id = feed_m.lastgroup
                token_value = feed_m.group(token_id)
                pos = feed_m.start(token_id)
                if token_id == 'comment':
                    pass
                # TODO date
                elif token_id == 'null':
                    yield (token_id, None, pos)
                elif token_id == 'bool':
                    yield (token_id, {'true': True, 'false': False}[token_value], pos)
                elif token_id == 'str':
                    yield (token_id, token_value, pos)
                elif token_id == 'int':
                    yield (token_id, int(token_value), pos)
                elif token_id == 'float':
                    yield (token_id, float(token_value), pos)
                elif token_id == 'regex':
                    # TODO error handling
                    regex = re.compile(feed_m.group('rebody'))
                    yield (token_id, {'re': regex, 'flags': feed_m.group('reflags')}, pos)
                elif token_id == 'id':
                    yield (token_id, token_value, pos)
                elif token_id == 'op':
                    yield (token_id, _OPERATORS[token_value])
                elif token_id == 'aop':
                    yield (token_id, _ASSIGN_OPERATORS[token_value])
                elif token_id == 'rel':
                    yield (token_id, _RELATIONS[token_value])
                elif token_id == 'uop':
                    yield (token_id, _UNARY_OPERATORS[token_value])
                elif token_id == 'lop':
                    yield (token_id, _LOGICAL_OPERATORS[token_value])
                elif token_id == 'punc':
                    yield (token_id, _PUNCTUATIONS[token_value], pos)
                else:
                    raise ExtractorError('Unexpected token at %d' % pos)
                pos = feed_m.end()
            else:
                raise ExtractorError('Unrecognised sequence at %d' % pos)
        raise StopIteration

    def chk_id(self, last=False):
        if last:
            name, value, pos = self._last
        else:
            name, value, pos = self.peek()
        if name in _RESERVED_WORDS:
            raise ExtractorError('Invalid identifier at %d' % pos)

    def peek(self, count=1):
        for _ in range(count - len(self.peeked)):
            token = next(self._ts, None)
            if token is None:
                self.ended = True
                self.peeked.append(('end', ';', len(self.code)))
            else:
                self.peeked.append(token)
        return self.peeked[count - 1]

    def pop(self):
        if not self.peeked:
            self.peek()
        self._last = self.peeked.pop(0)
        return self._last

    def last(self):
        return self._last
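
TokenStream feeds the parser lazily: peek() buffers tokens without consuming them, pop() consumes one and records it so chk_id(last=True) can run the reserved-word check afterwards, and operator tokens carry a (name, handler) pair from the tables above instead of a source position. A minimal sketch, not part of the commit, assuming the new package layout is importable:

from youtube_dl.jsinterp.tstream import TokenStream

ts = TokenStream('a = 1;')
print(ts.peek())      # ('id', 'a', 0) -- buffered, not consumed yet
print(ts.pop())       # the same ('id', 'a', 0) token, now consumed
ts.chk_id(last=True)  # reserved-word check on the last popped token
print(ts.pop())       # assignment operator token of the form ('aop', ('set', <handler>)), no position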