diff --git a/youtube_dl/jsinterp/__init__.py b/youtube_dl/jsinterp/__init__.py
new file mode 100644
index 000000000..700ab03db
--- /dev/null
+++ b/youtube_dl/jsinterp/__init__.py
@@ -0,0 +1,3 @@
+from .jsinterp import JSInterpreter
+
+__all__ = ['JSInterpreter']
\ No newline at end of file
diff --git a/youtube_dl/jsinterp/jsgrammar.py b/youtube_dl/jsinterp/jsgrammar.py
new file mode 100644
index 000000000..4d93e07d9
--- /dev/null
+++ b/youtube_dl/jsinterp/jsgrammar.py
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+import re
+
+# grouped so that the alternation survives interpolation into larger patterns
+__DECIMAL_RE = r'(?:[1-9][0-9]*|0)'
+__OCTAL_RE = r'0[0-7]+'
+__HEXADECIMAL_RE = r'0[xX][0-9a-fA-F]+'
+__ESC_UNICODE_RE = r'u[0-9a-fA-F]{4}'
+__ESC_HEX_RE = r'x[0-9a-fA-F]{2}'
+
+
+# NOTE order is fixed due to regex matching, does not represent any precedence
+_punctuations = ['{', '}', '(', ')', '[', ']', '.', ';', ',', '?', ':']
+_logical_operator = ['||', '&&']
+_unary_operator = ['++', '--', '!', '~', 'delete', 'void', 'typeof']
+_relation = ['===', '!==', '==', '!=', '<=', '>=', '<', '>']
+_operator = ['|', '^', '&', '>>>', '>>', '<<', '-', '+', '%', '/', '*']
+_assign_operator = [op + '=' for op in _operator]
+_assign_operator.append('=')
+
+# XXX add support for unicode chars
+_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'
+
+# a non-escape char can also be escaped, but line continuations and quotes have to be
+# XXX unicode and hexadecimal escape sequences should be validated
+_SINGLE_QUOTED_RE = r"""'(?:(?:\\'|\n)|[^'\n])*'"""
+_DOUBLE_QUOTED_RE = r'''"(?:(?:\\"|\n)|[^"\n])*"'''
+_STRING_RE = r'(?:%s)|(?:%s)' % (_SINGLE_QUOTED_RE, _DOUBLE_QUOTED_RE)
+
+_INTEGER_RE = r'(?:%(hex)s)|(?:%(dec)s)|(?:%(oct)s)' % {'hex': __HEXADECIMAL_RE, 'dec': __DECIMAL_RE, 'oct': __OCTAL_RE}
+_FLOAT_RE = r'(?:(?:%(dec)s\.[0-9]*)|(?:\.[0-9]+))(?:[eE][+-]?[0-9]+)?' % {'dec': __DECIMAL_RE}
+
+# the lookahead keeps identifiers such as `nullable` or `trueValue` from being split
+_BOOL_RE = r'(?:true|false)(?![a-zA-Z_$0-9])'
+_NULL_RE = r'null(?![a-zA-Z_$0-9])'
+
+# XXX early validation might be needed
+# r'''/(?!\*)
+#     (?:(?:\\(?:[tnvfr0.\\+*?^$\[\]{}()|/]|[0-7]{3}|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|c[A-Z]|))|[^/\n])*
+#     /(?:(?![gimy]*(?P<flag>[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|$)'''
+_REGEX_FLAGS_RE = r'(?![gimy]*(?P<reflag>[gimy])[gimy]*(?P=reflag))(?P<reflags>[gimy]{0,4}\b)'
+_REGEX_RE = r'/(?!\*)(?P<rebody>(?:[^/\n]|(?:\\/))*)/(?:(?:%s)|(?:\s|$))' % _REGEX_FLAGS_RE
+
+# NOTE 'float' has to precede 'int', otherwise `1.5` is split into `1` and `.5`
+_TOKENS = [
+    ('null', _NULL_RE),
+    ('bool', _BOOL_RE),
+    ('id', _NAME_RE),
+    ('str', _STRING_RE),
+    ('float', _FLOAT_RE),
+    ('int', _INTEGER_RE),
+    ('regex', _REGEX_RE)
+]
+
+COMMENT_RE = r'(?P<comment>/\*(?:(?!\*/)(?:\n|.))*\*/)'
+TOKENS_RE = r'|'.join('(?P<%(id)s>%(value)s)' % {'id': name, 'value': value}
+                      for name, value in _TOKENS)
+
+PUNCTUATIONS_RE = r'(?P<punc>%s)' % r'|'.join(re.escape(value) for value in _punctuations)
+LOGICAL_OPERATORS_RE = r'(?P<lop>%s)' % r'|'.join(re.escape(value) for value in _logical_operator)
+UNARY_OPERATORS_RE = r'(?P<uop>%s)' % r'|'.join(re.escape(value) for value in _unary_operator)
+RELATIONS_RE = r'(?P<rel>%s)' % r'|'.join(re.escape(value) for value in _relation)
+OPERATORS_RE = r'(?P<op>%s)' % r'|'.join(re.escape(value) for value in _operator)
+ASSIGN_OPERATORS_RE = r'(?P<aop>%s)' % r'|'.join(re.escape(value) for value in _assign_operator)
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp/jsinterp.py similarity index 56% rename from youtube_dl/jsinterp.py rename to youtube_dl/jsinterp/jsinterp.py index 2f11a6c91..3ff0fc7bc 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp/jsinterp.py @@ -1,205 +1,14 @@ from __future__ import unicode_literals -import json -import operator import re -from .utils import ExtractorError +from ..utils import ExtractorError +from .tstream import TokenStream -__DECIMAL_RE = r'(?:[1-9][0-9]*)|0' -__OCTAL_RE = r'0[0-7]+' -__HEXADECIMAL_RE = r'0[xX][0-9a-fA-F]+' -__ESC_UNICODE_RE = r'u[0-9a-fA-F]{4}' -__ESC_HEX_RE = r'x[0-9a-fA-F]{2}' - -_PUNCTUATIONS = { - 'copen':
'{', - 'cclose': '}', - 'popen': '(', - 'pclose': ')', - 'sopen': '[', - 'sclose': ']', - 'dot': '.', - 'end': ';', - 'comma': ',', - 'hook': '?', - 'colon': ':' -} - -# TODO find a final storage solution (already) -_LOGICAL_OPERATORS = { - '&&': ('and', lambda cur, right: cur and right), - '||': ('or', lambda cur, right: cur or right) -} -_UNARY_OPERATORS = { - '++': ('inc', lambda cur: cur + 1), - '--': ('dec', lambda cur: cur - 1), - '!': ('not', operator.not_), - '~': ('bnot', lambda cur: cur ^ -1), - # XXX define these operators - 'delete': ('del', None), - 'void': ('void', None), - 'typeof': ('type', lambda cur: type(cur)) -} -_RELATIONS = { - '<': ('lt', operator.lt), - '>': ('gt', operator.gt), - '<=': ('le', operator.le), - '>=': ('ge', operator.ge), - # XXX check python and JavaScript equality difference - '==': ('eq', operator.eq), - '!=': ('ne', operator.ne), - '===': ('seq', lambda cur, right: cur == right and type(cur) == type(right)), - '!==': ('sne', lambda cur, right: not cur == right or not type(cur) == type(right)) -} -_OPERATORS = { - '|': ('bor', operator.or_), - '^': ('bxor', operator.xor), - '&': ('band', operator.and_), - # NOTE convert to int before shift float - '>>': ('rshift', operator.rshift), - '<<': ('lshift', operator.lshift), - '>>>': ('urshift', lambda cur, right: cur >> right if cur >= 0 else (cur + 0x100000000) >> right), - '-': ('sub', operator.sub), - '+': ('add', operator.add), - '%': ('mod', operator.mod), - '/': ('div', operator.truediv), - '*': ('mul', operator.mul) -} -_ASSIGN_OPERATORS = dict((op + '=', ('set_%s' % token[0], token[1])) for op, token in _OPERATORS.items()) -_ASSIGN_OPERATORS['='] = ('set', lambda cur, right: right) - -# NOTE merely fixed due to regex matching, does not represent any precedence -_logical_operator_order = _LOGICAL_OPERATORS.keys() # whatever -_unary_operator_order = _UNARY_OPERATORS.keys() # evs -_relation_order = ['===', '!==', '==', '!=', '<=', '>=', '<', '>'] -_operator_order = ['|', '^', 
'&', '>>>', '>>', '<<', '-', '+', '%', '/', '*'] -_assign_operator_order = [op + '=' for op in _operator_order] -_assign_operator_order.append('=') - -# only to check ids -_RESERVED_WORDS = ('break', 'case', 'catch', 'continue', 'debugger', 'default', 'delete', 'do', 'else', 'finally', - 'for', 'function', 'if', 'in', 'instanceof', 'new', 'return', 'switch', 'this', 'throw', - 'try', 'typeof', 'var', 'void', 'while', 'with') - -# XXX add support for unicode chars -_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' - -# non-escape char also can be escaped, but line continuation and quotes has to be -# XXX unicode and hexadecimal escape sequences should be validated -_SINGLE_QUOTED_RE = r"""'(?:(?:\\'|\n)|[^'\n])*'""" -_DOUBLE_QUOTED_RE = r'''"(?:(?:\\"|\n)|[^"\n])*"''' -_STRING_RE = r'(?:%s)|(?:%s)' % (_SINGLE_QUOTED_RE, _DOUBLE_QUOTED_RE) - -_INTEGER_RE = r'(?:%(hex)s)|(?:%(dec)s)|(?:%(oct)s)' % {'hex': __HEXADECIMAL_RE, 'dec': __DECIMAL_RE, 'oct': __OCTAL_RE} -_FLOAT_RE = r'(?:(?:%(dec)s\.[0-9]*)|(?:\.[0-9]+))(?:[eE][+-]?[0-9]+)?' 
% {'dec': __DECIMAL_RE} - -_BOOL_RE = r'true|false' -_NULL_RE = r'null' - -# XXX early validation might needed -# r'''/(?!\*) -# (?:(?:\\(?:[tnvfr0.\\+*?^$\[\]{}()|/]|[0-7]{3}|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|c[A-Z]|))|[^/\n])* -# /(?:(?![gimy]*(?P[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|$)''' -_REGEX_FLAGS_RE = r'(?![gimy]*(?P[gimy])[gimy]*(?P=reflag))(?P[gimy]{0,4}\b)' -_REGEX_RE = r'/(?!\*)(?P(?:[^/\n]|(?:\\/))*)/(?:(?:%s)|(?:\s|$))' % _REGEX_FLAGS_RE - -re.compile(_REGEX_RE) - -_TOKENS = [ - ('null', _NULL_RE), - ('bool', _BOOL_RE), - ('id', _NAME_RE), - ('str', _STRING_RE), - ('int', _INTEGER_RE), - ('float', _FLOAT_RE), - ('regex', _REGEX_RE) -] - -_token_keys = set(name for name, value in _TOKENS) - -_COMMENT_RE = r'(?P/\*(?:(?!\*/)(?:\n|.))*\*/)' -_TOKENS_RE = r'|'.join('(?P<%(id)s>%(value)s)' % {'id': name, 'value': value} - for name, value in _TOKENS) -_PUNCTUATIONS_RE = r'|'.join(r'(?P<%(id)s>%(value)s)' % {'id': name, 'value': re.escape(value)} - for name, value in _PUNCTUATIONS.items()) -_LOGICAL_OPERATORS_RE = r'(?P%s)' % r'|'.join(re.escape(value) for value in _logical_operator_order) -_UNARY_OPERATORS_RE = r'(?P%s)' % r'|'.join(re.escape(value) for value in _unary_operator_order) -_RELATIONS_RE = r'(?P%s)' % r'|'.join(re.escape(value) for value in _relation_order) -_OPERATORS_RE = r'(?P%s)' % r'|'.join(re.escape(value) for value in _operator_order) -_ASSIGN_OPERATORS_RE = r'(?P%s)' % r'|'.join(re.escape(value) for value in _assign_operator_order) - -input_element = re.compile(r'\s*(?:%(comment)s|%(token)s|%(punct)s|%(lop)s|%(uop)s|%(rel)s|%(aop)s|%(op)s)\s*' % { - 'comment': _COMMENT_RE, - 'token': _TOKENS_RE, - 'punct': _PUNCTUATIONS_RE, - 'lop': _LOGICAL_OPERATORS_RE, - 'uop': _UNARY_OPERATORS_RE, - 'rel': _RELATIONS_RE, - 'aop': _ASSIGN_OPERATORS_RE, - 'op': _OPERATORS_RE -}) - - -class TokenStream(object): - def __init__(self, code, start=0): - self.code = code - self.ended = False - self.peeked = [] - self._ts = self._next_token(start) - - def 
_next_token(self, pos=0): - while pos < len(self.code): - feed_m = input_element.match(self.code, pos) - if feed_m is not None: - token_id = feed_m.lastgroup - token_value = feed_m.group(token_id) - pos = feed_m.start(token_id) - if token_id == 'comment': - pass - elif token_id in _token_keys: - # TODO date - if token_id == 'null': - yield (token_id, None, pos) - elif token_id == 'bool': - yield (token_id, {'true': True, 'false': False}[token_value], pos) - elif token_id == 'str': - yield (token_id, token_value, pos) - elif token_id == 'int': - yield (token_id, int(token_value), pos) - elif token_id == 'float': - yield (token_id, float(token_value), pos) - elif token_id == 'regex': - # TODO error handling - regex = re.compile(feed_m.group('rebody')) - yield (token_id, {'re': regex, 'flags': feed_m.group('reflags')}, pos) - elif token_id in ('lor', 'uop', 'rel', 'aop', 'op'): - yield (token_id, _LOGICAL_OPERATORS[token_value]) - else: - yield (token_id, token_value, pos) - else: - yield (token_id, token_value, pos) - pos = feed_m.end() - else: - raise ExtractorError('Unexpected character sequence at %d' % pos) - raise StopIteration - - def peek(self, count=1): - for _ in range(count - len(self.peeked)): - token = next(self._ts, None) - if token is None: - self.ended = True - self.peeked.append(('end', ';', len(self.code))) - else: - self.peeked.append(token) - return self.peeked[count - 1] - - def pop(self): - if not self.peeked: - self.peek() - return self.peeked.pop(0) +_token_keys = 'null', 'bool', 'id', 'str', 'int', 'float', 'regex' +# TODO support json class JSInterpreter(object): undefined = object() @@ -210,11 +19,6 @@ class JSInterpreter(object): self._functions = {} self._objects = objects - @staticmethod - def _chk_id(name, at): - if name in _RESERVED_WORDS: - raise ExtractorError('Invalid identifier at %d' % at) - def _next_statement(self, token_stream, stack_top): if stack_top < 0: raise ExtractorError('Recursion limit reached') @@ -234,7 +38,7 @@ class 
JSInterpreter(object): # block token_stream.pop() statement_list = [] - for s in self._next_statement(token_stream, stack_top - 1): + for s in self.statements(token_stream, stack_top - 1): statement_list.append(s) token_id, token_value, token_pos = token_stream.peek() if token_id == 'cclose': @@ -251,7 +55,7 @@ class JSInterpreter(object): token_id, token_value, token_pos = token_stream.pop() if token_id != 'id': raise ExtractorError('Missing variable name at %d' % token_pos) - self._chk_id(token_value, token_pos) + token_stream.chk_id(last=True) variables.append(token_value) peek_id, peek_value, peek_pos = token_stream.peek() @@ -355,12 +159,14 @@ class JSInterpreter(object): left = self._conditional_expression(token_stream, stack_top - 1) peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id in _assign_operator_order: + if peek_id == 'aop': token_stream.pop() + _, op = peek_value right = self._assign_expression(token_stream, stack_top - 1) else: + op = None right = None - return ('assign', left, right) + return ('assign', op, left, right) def _member_expression(self, token_stream, stack_top): peek_id, peek_value, peek_pos = token_stream.peek() @@ -422,7 +228,7 @@ class JSInterpreter(object): raise ExtractorError('Function expression is not yet supported at %d' % peek_pos) # id else: - self._chk_id(peek_value, peek_pos) + token_stream.chk_id() return ('id', peek_value) # literals else: @@ -614,180 +420,10 @@ class JSInterpreter(object): return ('rpn', out) def interpret_statement(self, stmt, local_vars, allow_recursion=100): - if allow_recursion < 0: - raise ExtractorError('Recursion limit reached') - - should_abort = False - stmt = stmt.lstrip() - stmt_m = re.match(r'var\s', stmt) - if stmt_m: - expr = stmt[len(stmt_m.group(0)):] - else: - return_m = re.match(r'return(?:\s+|$)', stmt) - if return_m: - expr = stmt[len(return_m.group(0)):] - should_abort = True - else: - # Try interpreting it as an expression - expr = stmt - - v = 
self.interpret_expression(expr, local_vars, allow_recursion) - return v, should_abort + pass def interpret_expression(self, expr, local_vars, allow_recursion): - expr = expr.strip() - - if expr == '': # Empty expression - return None - - if expr.startswith('('): - parens_count = 0 - for m in re.finditer(r'[()]', expr): - if m.group(0) == '(': - parens_count += 1 - else: - parens_count -= 1 - if parens_count == 0: - sub_expr = expr[1:m.start()] - sub_result = self.interpret_expression( - sub_expr, local_vars, allow_recursion) - remaining_expr = expr[m.end():].strip() - if not remaining_expr: - return sub_result - else: - expr = json.dumps(sub_result) + remaining_expr - break - else: - raise ExtractorError('Premature end of parens in %r' % expr) - - for op, opfunc in _ASSIGN_OPERATORS: - m = re.match(r'''(?x) - (?P%s)(?:\[(?P[^\]]+?)\])? - \s*%s - (?P.*)$''' % (_NAME_RE, re.escape(op)), expr) - if not m: - continue - right_val = self.interpret_expression( - m.group('expr'), local_vars, allow_recursion - 1) - - if m.groupdict().get('index'): - lvar = local_vars[m.group('out')] - idx = self.interpret_expression( - m.group('index'), local_vars, allow_recursion) - assert isinstance(idx, int) - cur = lvar[idx] - val = opfunc(cur, right_val) - lvar[idx] = val - return val - else: - cur = local_vars.get(m.group('out')) - val = opfunc(cur, right_val) - local_vars[m.group('out')] = val - return val - - if expr.isdigit(): - return int(expr) - - var_m = re.match( - r'(?!if|return|true|false)(?P%s)$' % _NAME_RE, - expr) - if var_m: - return local_vars[var_m.group('name')] - - try: - return json.loads(expr) - except ValueError: - pass - - m = re.match( - r'(?P%s)\.(?P[^(]+)(?:\(+(?P[^()]*)\))?$' % _NAME_RE, - expr) - if m: - variable = m.group('var') - member = m.group('member') - arg_str = m.group('args') - - if variable in local_vars: - obj = local_vars[variable] - else: - if variable not in self._objects: - self._objects[variable] = self.extract_object(variable) - obj = 
self._objects[variable] - - if arg_str is None: - # Member access - if member == 'length': - return len(obj) - return obj[member] - - assert expr.endswith(')') - # Function call - if arg_str == '': - argvals = tuple() - else: - argvals = tuple([ - self.interpret_expression(v, local_vars, allow_recursion) - for v in arg_str.split(',')]) - - if member == 'split': - assert argvals == ('',) - return list(obj) - if member == 'join': - assert len(argvals) == 1 - return argvals[0].join(obj) - if member == 'reverse': - assert len(argvals) == 0 - obj.reverse() - return obj - if member == 'slice': - assert len(argvals) == 1 - return obj[argvals[0]:] - if member == 'splice': - assert isinstance(obj, list) - index, howMany = argvals - res = [] - for i in range(index, min(index + howMany, len(obj))): - res.append(obj.pop(index)) - return res - - return obj[member](argvals) - - m = re.match( - r'(?P%s)\[(?P.+)\]$' % _NAME_RE, expr) - if m: - val = local_vars[m.group('in')] - idx = self.interpret_expression( - m.group('idx'), local_vars, allow_recursion - 1) - return val[idx] - - for op, opfunc in _OPERATORS: - m = re.match(r'(?P.+?)%s(?P.+)' % re.escape(op), expr) - if not m: - continue - x, abort = self.interpret_statement( - m.group('x'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature left-side return of %s in %r' % (op, expr)) - y, abort = self.interpret_statement( - m.group('y'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature right-side return of %s in %r' % (op, expr)) - return opfunc(x, y) - - m = re.match( - r'^(?P%s)\((?P[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) - if m: - fname = m.group('func') - argvals = tuple([ - int(v) if v.isdigit() else local_vars[v] - for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple() - if fname not in self._functions: - self._functions[fname] = self.extract_function(fname) - return self._functions[fname](argvals) - - raise ExtractorError('Unsupported 
JS expression %r' % expr) + pass def extract_object(self, objname): obj = {}
diff --git a/youtube_dl/jsinterp/tstream.py b/youtube_dl/jsinterp/tstream.py
new file mode 100644
index 000000000..fd4ec99cf
--- /dev/null
+++ b/youtube_dl/jsinterp/tstream.py
@@ -0,0 +1,194 @@
+from __future__ import unicode_literals
+
+import re
+import operator
+
+from ..utils import ExtractorError
+from .jsgrammar import (
+    COMMENT_RE,
+    TOKENS_RE,
+    PUNCTUATIONS_RE,
+    LOGICAL_OPERATORS_RE,
+    UNARY_OPERATORS_RE,
+    RELATIONS_RE,
+    ASSIGN_OPERATORS_RE,
+    OPERATORS_RE
+)
+
+
+# punctuation character -> token id the parser matches on
+_PUNCTUATIONS = {
+    '{': 'copen',
+    '}': 'cclose',
+    '(': 'popen',
+    ')': 'pclose',
+    '[': 'sopen',
+    ']': 'sclose',
+    '.': 'dot',
+    ';': 'end',
+    ',': 'comma',
+    '?': 'hook',
+    ':': 'colon'
+}
+# operator tables: source text -> (operator id, implementation)
+_LOGICAL_OPERATORS = {
+    '&&': ('and', lambda cur, right: cur and right),
+    '||': ('or', lambda cur, right: cur or right)
+}
+_UNARY_OPERATORS = {
+    '++': ('inc', lambda cur: cur + 1),
+    '--': ('dec', lambda cur: cur - 1),
+    '!': ('not', operator.not_),
+    '~': ('bnot', lambda cur: cur ^ -1),
+    # XXX define these operators
+    'delete': ('del', None),
+    'void': ('void', None),
+    'typeof': ('type', lambda cur: type(cur))
+}
+_RELATIONS = {
+    '<': ('lt', operator.lt),
+    '>': ('gt', operator.gt),
+    '<=': ('le', operator.le),
+    '>=': ('ge', operator.ge),
+    # XXX check python and JavaScript equality difference
+    '==': ('eq', operator.eq),
+    '!=': ('ne', operator.ne),
+    '===': ('seq', lambda cur, right: cur == right and type(cur) == type(right)),
+    '!==': ('sne', lambda cur, right: not cur == right or not type(cur) == type(right))
+}
+_OPERATORS = {
+    '|': ('bor', operator.or_),
+    '^': ('bxor', operator.xor),
+    '&': ('band', operator.and_),
+    # NOTE convert to int before shift float
+    '>>': ('rshift', operator.rshift),
+    '<<': ('lshift', operator.lshift),
+    '>>>': ('urshift', lambda cur, right: cur >> right if cur >= 0 else (cur + 0x100000000) >> right),
+    '-': ('sub', operator.sub),
+    '+': ('add', operator.add),
+    '%': ('mod', operator.mod),
+    '/': ('div', operator.truediv),
+    '*': ('mul', operator.mul)
+}
+_ASSIGN_OPERATORS = dict((op + '=', ('set_%s' % token[0], token[1])) for op, token in _OPERATORS.items())
+_ASSIGN_OPERATORS['='] = ('set', lambda cur, right: right)
+
+# token id of each operator class -> table its tokens are looked up in
+_OPERATOR_TABLES = {
+    'op': _OPERATORS,
+    'aop': _ASSIGN_OPERATORS,
+    'rel': _RELATIONS,
+    'uop': _UNARY_OPERATORS,
+    'lop': _LOGICAL_OPERATORS
+}
+
+# only to check ids
+_RESERVED_WORDS = ('break', 'case', 'catch', 'continue', 'debugger', 'default', 'delete', 'do', 'else', 'finally',
+                   'for', 'function', 'if', 'in', 'instanceof', 'new', 'return', 'switch', 'this', 'throw', 'try',
+                   'typeof', 'var', 'void', 'while', 'with')
+
+
+_input_element = re.compile(r'\s*(?:%(comment)s|%(token)s|%(punct)s|%(lop)s|%(uop)s|%(rel)s|%(aop)s|%(op)s)\s*' % {
+    'comment': COMMENT_RE,
+    'token': TOKENS_RE,
+    'punct': PUNCTUATIONS_RE,
+    'lop': LOGICAL_OPERATORS_RE,
+    'uop': UNARY_OPERATORS_RE,
+    'rel': RELATIONS_RE,
+    'aop': ASSIGN_OPERATORS_RE,
+    'op': OPERATORS_RE
+})
+
+
+class TokenStream(object):
+    """Lazy tokenizer over a string of JavaScript source.
+
+    Every token is a (token_id, token_value, position) triple; once the input is
+    exhausted, peek() keeps returning a synthetic ('end', ';', len(code)) token.
+    """
+
+    def __init__(self, code, start=0):
+        self.code = code
+        self.ended = False
+        self.peeked = []
+        self._ts = self._next_token(start)
+        self._last = None
+
+    def _next_token(self, pos=0):
+        """Yield the tokens of self.code, scanning from offset pos.
+
+        Comments are skipped. Raises ExtractorError if the input at the current
+        offset matches no known input element.
+        """
+        while pos < len(self.code):
+            feed_m = _input_element.match(self.code, pos)
+            if feed_m is None:
+                raise ExtractorError('Unrecognised sequence at %d' % pos)
+            token_id = feed_m.lastgroup
+            token_value = feed_m.group(token_id)
+            pos = feed_m.start(token_id)
+            if token_id == 'comment':
+                pass
+            # TODO date
+            elif token_id == 'null':
+                yield (token_id, None, pos)
+            elif token_id == 'bool':
+                yield (token_id, {'true': True, 'false': False}[token_value], pos)
+            elif token_id in ('str', 'id'):
+                yield (token_id, token_value, pos)
+            elif token_id == 'int':
+                # int() with the default base rejects the 0x/0X prefix
+                base = 16 if token_value[:2] in ('0x', '0X') else 10
+                yield (token_id, int(token_value, base), pos)
+            elif token_id == 'float':
+                yield (token_id, float(token_value), pos)
+            elif token_id == 'regex':
+                # TODO error handling
+                regex = re.compile(feed_m.group('rebody'))
+                yield (token_id, {'re': regex, 'flags': feed_m.group('reflags')}, pos)
+            elif token_id in _OPERATOR_TABLES:
+                # operators carry their position like every other token
+                yield (token_id, _OPERATOR_TABLES[token_id][token_value], pos)
+            elif token_id == 'punc':
+                # the parser matches on the punctuation's name ('copen', 'end', ...)
+                yield (_PUNCTUATIONS[token_value], token_value, pos)
+            else:
+                raise ExtractorError('Unexpected token at %d' % pos)
+            pos = feed_m.end()
+        # falling off the end finishes the generator; an explicit
+        # `raise StopIteration` becomes a RuntimeError under PEP 479
+
+    def chk_id(self, last=False):
+        """Raise ExtractorError if the checked token's value is a reserved word.
+
+        Checks the most recently popped token when last is true, the next
+        unconsumed token otherwise.
+        """
+        token = self._last if last else self.peek()
+        if token is None:
+            raise ExtractorError('No token has been consumed yet')
+        _, value, pos = token
+        if value in _RESERVED_WORDS:
+            raise ExtractorError('Invalid identifier at %d' % pos)
+
+    def peek(self, count=1):
+        """Return the count-th upcoming token without consuming any."""
+        for _ in range(count - len(self.peeked)):
+            token = next(self._ts, None)
+            if token is None:
+                self.ended = True
+                self.peeked.append(('end', ';', len(self.code)))
+            else:
+                self.peeked.append(token)
+        return self.peeked[count - 1]
+
+    def pop(self):
+        """Consume and return the next token, remembering it for last()."""
+        if not self.peeked:
+            self.peek()
+        self._last = self.peeked.pop(0)
+        return self._last
+
+    def last(self):
+        """Return the most recently popped token (None before the first pop)."""
+        return self._last