From 3d0252aee00b978d1867a87aa6b9a79a0594a543 Mon Sep 17 00:00:00 2001 From: sulyi Date: Thu, 29 Dec 2016 00:36:24 +0100 Subject: [PATCH] [jsinterp] Refactoring jsparser --- test/test_jsinterp_parse.py | 6 +- youtube_dl/jsinterp/jsgrammar.py | 32 +- youtube_dl/jsinterp/jsinterp.py | 836 +------------------------------ youtube_dl/jsinterp/jsparser.py | 834 ++++++++++++++++++++++++++++++ 4 files changed, 860 insertions(+), 848 deletions(-) create mode 100644 youtube_dl/jsinterp/jsparser.py diff --git a/test/test_jsinterp_parse.py b/test/test_jsinterp_parse.py index 9aaf3f44b..f984d04d3 100644 --- a/test/test_jsinterp_parse.py +++ b/test/test_jsinterp_parse.py @@ -14,7 +14,7 @@ else: import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.jsinterp import JSInterpreter +from youtube_dl.jsinterp.jsparser import Parser from .jstests import gettestcases @@ -43,8 +43,8 @@ class TestJSInterpreterParse(unittest.TestCase): def generator(test_case, name): def test_template(self): for a in test_case['subtests']: - jsi = JSInterpreter(a['code'], variables=a.get('globals')) - parsed = list(jsi.parse()) + jsp = Parser(a['code']) + parsed = list(jsp.parse()) if 'ast' in a: self.assertEqual(traverse(parsed), traverse(a['ast'])) else: diff --git a/youtube_dl/jsinterp/jsgrammar.py b/youtube_dl/jsinterp/jsgrammar.py index 77f6a1175..44bf15603 100644 --- a/youtube_dl/jsinterp/jsgrammar.py +++ b/youtube_dl/jsinterp/jsgrammar.py @@ -4,23 +4,23 @@ import re from collections import namedtuple -_token_keys = ('COPEN', 'CCLOSE', 'POPEN', 'PCLOSE', 'SOPEN', 'SCLOSE', - 'DOT', 'END', 'COMMA', 'HOOK', 'COLON', - 'AND', 'OR', 'PLUS', 'NEG', 'INC', 'DEC', 'NOT', 'BNOT', 'DEL', 'VOID', 'TYPE', - 'LT', 'GT', 'LE', 'GE', 'EQ', 'NE', 'SEQ', 'SNE', 'IN', 'INSTANCEOF', - 'BOR', 'BXOR', 'BAND', 'RSHIFT', 'LSHIFT', 'URSHIFT', 'SUB', 'ADD', 'MOD', 'DIV', 'MUL', - 'OP', 'AOP', 'UOP', 'LOP', 'REL', 'PREFIX', 'POSTFIX', - 'COMMENT', 'TOKEN', 'PUNCT', - 'NULL', 'BOOL', 'ID', 'STR', 'INT', 'FLOAT', 'REGEX', 'OBJECT', - 'REFLAGS', 'REBODY', - 'FUNC', - 'BLOCK', 'VAR', 'EXPR', 'IF', 'FOR', 'DO', 'WHILE', 'CONTINUE', 'BREAK', 'RETURN', - 'WITH', 'LABEL', 'SWITCH', 'THROW', 'TRY', 'DEBUG', - 'ASSIGN', 'MEMBER', 'FIELD', 'ELEM', 'CALL', 'ARRAY', 'COND', 'OPEXPR', - 'PROPGET', 'PROPSET', 'PROPVALUE', - 'RSV') +_token_names = ('COPEN', 'CCLOSE', 'POPEN', 'PCLOSE', 'SOPEN', 'SCLOSE', + 'DOT', 'END', 'COMMA', 'HOOK', 'COLON', + 'AND', 'OR', 'PLUS', 'NEG', 'INC', 'DEC', 'NOT', 'BNOT', 'DEL', 'VOID', 'TYPE', + 'LT', 'GT', 'LE', 'GE', 'EQ', 'NE', 'SEQ', 'SNE', 'IN', 'INSTANCEOF', + 'BOR', 'BXOR', 'BAND', 'RSHIFT', 'LSHIFT', 'URSHIFT', 'SUB', 'ADD', 'MOD', 'DIV', 'MUL', + 'OP', 'AOP', 'UOP', 'LOP', 'REL', 'PREFIX', 'POSTFIX', + 'COMMENT', 'TOKEN', 'PUNCT', + 'NULL', 'BOOL', 'ID', 'STR', 'INT', 'FLOAT', 'REGEX', 'OBJECT', + 'REFLAGS', 'REBODY', + 'FUNC', + 'BLOCK', 'VAR', 'EXPR', 'IF', 'FOR', 'DO', 'WHILE', 'CONTINUE', 'BREAK', 'RETURN', + 'WITH', 'LABEL', 'SWITCH', 'THROW', 'TRY', 'DEBUG', + 'ASSIGN', 'MEMBER', 'FIELD', 'ELEM', 'CALL', 'ARRAY', 'COND', 'OPEXPR', + 'PROPGET', 'PROPSET', 'PROPVALUE', + 'RSV') -Token = namedtuple('Token', _token_keys)._make(_token_keys) +Token = namedtuple('Token', _token_names)._make(_token_names) __DECIMAL_RE = r'(?:[1-9][0-9]*)|0' __OCTAL_RE = r'0[0-7]+' diff --git a/youtube_dl/jsinterp/jsinterp.py b/youtube_dl/jsinterp/jsinterp.py index c4c949e97..e527b2ee4 100644 --- a/youtube_dl/jsinterp/jsinterp.py +++ b/youtube_dl/jsinterp/jsinterp.py @@ -4,7 +4,7 @@ 
import re from ..compat import compat_str from ..utils import ExtractorError -from .tstream import TokenStream, convert_to_unary +from .jsparser import Parser from .jsgrammar import Token, token_keys @@ -57,6 +57,7 @@ class Reference(object): class JSInterpreter(object): # TODO support json + undefined = object() def __init__(self, code, variables=None): @@ -73,14 +74,6 @@ class JSInterpreter(object): def this(self): return self._context.local_vars - def parse(self, code=None, pos=0, stack_size=100): - if code is None: - code = self.code - ts = TokenStream(code, pos) - while not ts.ended: - yield self._source_element(ts, stack_size) - raise StopIteration - def create_reference(self, value, parent_key): if isinstance(value, dict): o = {} @@ -97,821 +90,6 @@ class JSInterpreter(object): return Reference(o, parent_key) - def _source_element(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_id, token_value, token_pos = token_stream.peek() - if token_id is Token.ID and token_value == 'function': - source_element = self._function(token_stream, stack_top - 1) - else: - source_element = self._statement(token_stream, stack_top - 1) - - return source_element - - def _statement(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - statement = None - token_id, token_value, token_pos = token_stream.peek() - if token_id is Token.END: - # empty statement goes straight here - token_stream.pop() - return statement - - # block - elif token_id is Token.COPEN: - # XXX refactor will deprecate some _statement calls - open_pos = token_pos - token_stream.pop() - block = [] - while True: - token_id, token_value, token_pos = token_stream.peek() - if token_id is Token.CCLOSE: - token_stream.pop() - break - elif token_id is Token.END and token_stream.ended: - raise ExtractorError('Unbalanced parentheses at %d' % open_pos) - block.append(self._statement(token_stream, stack_top - 1)) - - statement = (Token.BLOCK, block) - - elif token_id is Token.ID: - if token_value == 'var': - token_stream.pop() - variables = [] - init = [] - has_another = True - while has_another: - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.ID: - raise ExtractorError('Missing variable name at %d' % token_pos) - token_stream.chk_id(last=True) - variables.append(token_value) - - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.AOP: - token_stream.pop() - init.append(self._assign_expression(token_stream, stack_top - 1)) - peek_id, peek_value, peek_pos = token_stream.peek() - else: - init.append(JSInterpreter.undefined) - - if peek_id is Token.END: - if self._context.no_in: - token_stream.pop() - has_another = False - elif peek_id is Token.COMMA: - pass - else: - # FIXME automatic end insertion - # - token_id is Token.CCLOSE - # - check line terminator - # - restricted token - raise ExtractorError('Unexpected sequence at %d' % peek_pos) - statement = (Token.VAR, zip(variables, init)) - - elif token_value == 'if': - statement = self._if_statement(token_stream, stack_top - 1) - - elif token_value == 'for': - statement = self._for_loop(token_stream, stack_top - 1) - - elif token_value == 'do': - statement = self._do_loop(token_stream, stack_top - 1) - - elif token_value == 'while': - statement = self._while_loop(token_stream, stack_top - 1) - - elif token_value in ('break', 'continue'): - token_stream.pop() - token = {'break': Token.BREAK, 'continue': Token.CONTINUE}[token_value] - 
peek_id, peek_value, peek_pos = token_stream.peek() - # XXX no line break here - label_name = None - if peek_id is not Token.END: - token_stream.chk_id() - label_name = peek_value - token_stream.pop() - statement = (token, label_name) - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.END: - token_stream.pop() - else: - # FIXME automatic end insertion - raise ExtractorError('Unexpected sequence at %d' % peek_pos) - - elif token_value == 'return': - statement = self._return_statement(token_stream, stack_top - 1) - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.END: - token_stream.pop() - else: - # FIXME automatic end insertion - raise ExtractorError('Unexpected sequence at %d' % peek_pos) - - elif token_value == 'with': - statement = self._with_statement(token_stream, stack_top - 1) - - elif token_value == 'switch': - statement = self._switch_statement(token_stream, stack_top - 1) - - elif token_value == 'throw': - token_stream.pop() - # XXX no line break here - expr = self._expression(token_stream, stack_top - 1) - statement = (Token.RETURN, expr) - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.END: - token_stream.pop() - else: - # FIXME automatic end insertion - raise ExtractorError('Unexpected sequence at %d' % peek_pos) - - elif token_value == 'try': - statement = self._try_statement(token_stream, stack_top - 1) - - elif token_value == 'debugger': - token_stream.pop() - statement = (Token.DEBUG) - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.END: - token_stream.pop() - else: - # FIXME automatic end insertion - raise ExtractorError('Unexpected sequence at %d' % peek_pos) - else: # label - # XXX possible refactoring (this is the only branch not poping) - token_id, token_value, token_pos = token_stream.peek(2) - if token_id is Token.COLON: - token_id, label_name, token_pos = token_stream.pop(2) - token_stream.chk_id(last=True) - statement = (Token.LABEL, label_name, self._statement(token_stream, stack_top - 1)) - - # expr - if statement is None: - statement = self._expression(token_stream, stack_top - 1) - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.END: - token_stream.pop() - else: - # FIXME automatic end insertion - raise ExtractorError('Unexpected sequence at %d' % peek_pos) - - return statement - - def _if_statement(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_stream.pop() - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.POPEN: - raise ExtractorError('Missing condition at %d' % token_pos) - cond_expr = self._expression(token_stream, stack_top - 1) - token_stream.pop() # Token.PCLOSE - true_stmt = self._statement(token_stream, stack_top - 1) - false_stmt = None - token_id, token_value, token_pos = token_stream.peek() - if token_id is Token.ID and token_value == 'else': - token_stream.pop() - false_stmt = self._statement(token_stream, stack_top - 1) - return (Token.IF, cond_expr, true_stmt, false_stmt) - - def _for_loop(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_stream.pop() - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.POPEN: - raise ExtractorError('''Expected '(' at %d''' % token_pos) - - # FIXME set infor True (checked by variable declaration and relation expression) - self._context.no_in = False - token_id, token_value, token_pos = 
token_stream.peek() - if token_id is Token.END: - init = None - elif token_id is Token.ID and token_value == 'var': - # XXX change it on refactoring variable declaration list - init = self._statement(token_stream, stack_top - 1) - else: - init = self._expression(token_stream, stack_top - 1) - self._context.no_in = True - - token_id, token_value, token_pos = token_stream.pop() - if token_id is Token.ID and token_value == 'in': - cond = self._expression(token_stream, stack_top - 1) - # FIXME further processing of operator 'in' needed for interpretation - incr = None - # NOTE ES6 has 'of' operator - elif token_id is Token.END: - token_id, token_value, token_pos = token_stream.peek() - cond = None if token_id is Token.END else self._expression(token_stream, stack_top - 1) - - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.END: - raise ExtractorError('''Expected ';' at %d''' % token_pos) - - token_id, token_value, token_pos = token_stream.peek() - incr = None if token_id is Token.END else self._expression(token_stream, stack_top - 1) - else: - raise ExtractorError('Invalid condition in for loop initialization at %d' % token_pos) - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.PCLOSE: - raise ExtractorError('''Expected ')' at %d''' % token_pos) - body = self._statement(token_stream, stack_top - 1) - return (Token.FOR, init, cond, incr, body) - - def _do_loop(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_stream.pop() - body = self._statement(token_stream, stack_top - 1) - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.ID and token_value != 'while': - raise ExtractorError('''Expected 'while' at %d''' % token_pos) - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.POPEN: - raise ExtractorError('''Expected '(' at %d''' % token_pos) - expr = self._expression(token_stream, stack_top - 1) - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.PCLOSE: - raise ExtractorError('''Expected ')' at %d''' % token_pos) - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.END: - token_stream.pop() - else: - # FIXME automatic end insertion - raise ExtractorError('''Expected ';' at %d''' % peek_pos) - return (Token.DO, expr, body) - - def _while_loop(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_stream.pop() - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.POPEN: - raise ExtractorError('''Expected '(' at %d''' % token_pos) - expr = self._expression(token_stream, stack_top - 1) - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.PCLOSE: - raise ExtractorError('''Expected ')' at %d''' % token_pos) - body = self._statement(token_stream, stack_top) - return (Token.WHILE, expr, body) - - def _return_statement(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_stream.pop() - peek_id, peek_value, peek_pos = token_stream.peek() - # XXX no line break here - expr = self._expression(token_stream, stack_top - 1) if peek_id is not Token.END else None - return (Token.RETURN, expr) - - def _with_statement(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_stream.pop() - token_id, token_value, token_pos = token_stream.pop() - if 
token_id is not Token.POPEN: - raise ExtractorError('Missing expression at %d' % token_pos) - expr = self._expression(token_stream, stack_top - 1) - token_stream.pop() # Token.PCLOSE - return (Token.WITH, expr, self._statement(token_stream, stack_top - 1)) - - def _switch_statement(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_stream.pop() - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.POPEN: - raise ExtractorError('Missing expression at %d' % token_pos) - discriminant = self._expression(token_stream, stack_top - 1) - token_stream.pop() # Token.PCLOSE - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.COPEN: - raise ExtractorError('Missing case block at %d' % token_pos) - open_pos = token_pos - has_default = False - block = [] - while True: - token_id, token_value, token_pos = token_stream.peek() - if token_id is Token.CCLOSE: - break - elif token_id is Token.ID and token_value == 'case': - token_stream.pop() - expr = self._expression(token_stream, stack_top - 1) - - elif token_id is Token.ID and token_value == 'default': - if has_default: - raise ExtractorError('Multiple default clause') - token_stream.pop() - has_default = True - expr = None - - elif token_id is Token.END and token_stream.ended: - raise ExtractorError('Unbalanced parentheses at %d' % open_pos) - else: - raise ExtractorError('Unexpected sequence at %d, default or case clause is expected' % - token_pos) - - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.COLON: - raise ExtractorError('''Unexpected sequence at %d, ':' is expected''' % token_pos) - - statement_list = [] - while True: - token_id, token_value, token_pos = token_stream.peek() - if token_id == Token.CCLOSE or (token_id is Token.ID and (token_value in ('default', 'case'))): - break - elif token_id is Token.END and token_stream.ended: - raise ExtractorError('Unbalanced parentheses at %d' % open_pos) - statement_list.append(self._statement(token_stream, stack_top - 1)) - - block.append((expr, statement_list)) - token_stream.pop() - return (Token.SWITCH, discriminant, block) - - def _try_statement(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_stream.pop() - token_id, token_value, token_pos = token_stream.peek() - if token_id is not Token.COPEN: - raise ExtractorError('Block is expected at %d' % token_pos) - try_block = self._statement(token_stream, stack_top - 1) - token_id, token_value, token_pos = token_stream.pop() - catch_block = None - if token_id is Token.ID and token_value == 'catch': - token_id, token_value, token_pos = token_stream.peek() - if token_id is not Token.POPEN: - raise ExtractorError('Catch clause is missing an identifier at %d' % token_pos) - token_stream.pop() - token_stream.chk_id() - token_id, error_name, token_pos = token_stream.pop() - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.PCLOSE: - raise ExtractorError('Catch clause expects a single identifier at %d' % token_pos) - token_id, token_value, token_pos = token_stream.peek() - if token_id is not Token.COPEN: - raise ExtractorError('Block is expected at %d' % token_pos) - catch_block = (error_name, self._statement(token_stream, stack_top - 1)) - finally_block = None - if token_id is Token.ID and token_value == 'finally': - token_id, token_value, token_pos = token_stream.peek() - if token_id is not Token.COPEN: - raise 
ExtractorError('Block is expected at %d' % token_pos) - finally_block = self._statement(token_stream, stack_top - 1) - if catch_block is None and finally_block is None: - raise ExtractorError('Try statement is expecting catch or finally at %d' % token_pos) - return (Token.TRY, try_block, catch_block, finally_block) - - def _expression(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - expr_list = [] - has_another = True - while has_another: - expr_list.append(self._assign_expression(token_stream, stack_top - 1)) - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.COMMA: - token_stream.pop() - elif peek_id is Token.ID and peek_value == 'yield': - # TODO parse yield - raise ExtractorError('Yield statement is not yet supported at %d' % peek_pos) - else: - has_another = False - return (Token.EXPR, expr_list) - - def _assign_expression(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - left = self._conditional_expression(token_stream, stack_top - 1) - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.AOP: - token_stream.pop() - _, op = peek_value - right = self._assign_expression(token_stream, stack_top - 1) - else: - op = None - right = None - return (Token.ASSIGN, op, left, right) - - def _member_expression(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.ID and peek_value == 'new': - token_stream.pop() - target = self._member_expression(token_stream, stack_top - 1) - args = self._arguments(token_stream, stack_top - 1) - # Rhino has check for args length - # Rhino has experimental syntax allowing an object literal to follow a new expression - else: - target = self._primary_expression(token_stream, stack_top) - args = None - - return (Token.MEMBER, target, args, self._member_tail(token_stream, stack_top - 1)) - - def _member_tail(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.DOT: - token_stream.pop() - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.DOT: - token_stream.pop() - peek_id, peek_value, peek_pos = token_stream.peek() - elif peek_id is Token.POPEN: - # TODO parse field query - raise ExtractorError('Field query is not yet supported at %d' % peek_pos) - - if peek_id is Token.ID: - token_stream.pop() - return (Token.FIELD, peek_value, self._member_tail(token_stream, stack_top - 1)) - else: - raise ExtractorError('Identifier name expected at %d' % peek_pos) - elif peek_id is Token.SOPEN: - token_stream.pop() - index = self._expression(token_stream, stack_top - 1) - token_id, token_value, token_pos = token_stream.pop() - if token_id is Token.SCLOSE: - return (Token.ELEM, index, self._member_tail(token_stream, stack_top - 1)) - else: - raise ExtractorError('Unexpected sequence at %d' % token_pos) - elif peek_id is Token.POPEN: - args = self._arguments(token_stream, stack_top - 1) - return (Token.CALL, args, self._member_tail(token_stream, stack_top - 1)) - else: - return None - - def _primary_expression(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - # TODO support let - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id in token_keys: - if peek_id is Token.ID: - # this - if 
peek_value == 'this': - token_stream.pop() - return (Token.RSV, 'this') - # function expr - elif peek_value == 'function': - return self._function(token_stream, stack_top - 1, True) - # id - else: - token_stream.chk_id() - token_stream.pop() - return (Token.ID, peek_value) - # literals - else: - token_stream.pop() - return (peek_id, peek_value) - # array - elif peek_id is Token.SOPEN: - return self._array_literal(token_stream, stack_top - 1) - # object - elif peek_id is Token.COPEN: - return self._object_literal(token_stream, stack_top) - # expr - elif peek_id is Token.POPEN: - token_stream.pop() - open_pos = peek_pos - expr = self._expression(token_stream, stack_top - 1) - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is not Token.PCLOSE: - raise ExtractorError('Unbalanced parentheses at %d' % open_pos) - token_stream.pop() - return expr - else: - raise ExtractorError('Syntax error at %d' % peek_pos) - - def _function(self, token_stream, stack_top, is_expr=False): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_stream.pop() - token_id, token_value, token_pos = token_stream.peek() - - name = None - if token_id is Token.ID: - token_stream.chk_id() - token_id, name, token_pos = token_stream.pop() - token_id, token_value, token_pos = token_stream.peek() - elif not is_expr: - raise ExtractorError('Function declaration at %d is missing identifier' % token_pos) - - if token_id is not Token.POPEN: - raise ExtractorError('Expected argument list at %d' % token_pos) - - # args - token_stream.pop() - open_pos = token_pos - args = [] - while True: - token_id, token_value, token_pos = token_stream.peek() - if token_id is Token.PCLOSE: - token_stream.pop() - break - token_stream.chk_id() - token_stream.pop() - args.append(token_value) - token_id, token_value, token_pos = token_stream.peek() - if token_id is Token.COMMA: - token_stream.pop() - elif token_id is Token.PCLOSE: - pass - elif token_id is Token.END and token_stream.ended: - raise ExtractorError('Unbalanced parentheses at %d' % open_pos) - else: - raise ExtractorError('Expected , separator at %d' % token_pos) - - token_id, token_value, token_pos = token_stream.peek() - if token_id is not Token.COPEN: - raise ExtractorError('Expected function body at %d' % token_pos) - - return (Token.FUNC, name, args, (self._function_body(token_stream, stack_top - 1))) - - def _function_body(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_id, token_value, open_pos = token_stream.pop() - body = [] - while True: - token_id, token_value, token_pos = token_stream.peek() - if token_id is Token.CCLOSE: - token_stream.pop() - break - elif token_id is Token.END and token_stream.ended: - raise ExtractorError('Unbalanced parentheses at %d' % open_pos) - body.append(self._source_element(token_stream, stack_top - 1)) - - return body - - def _arguments(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.POPEN: - token_stream.pop() - open_pos = peek_pos - else: - return None - args = [] - while True: - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.PCLOSE: - token_stream.pop() - return args - # FIXME handle infor - args.append(self._assign_expression(token_stream, stack_top - 1)) - # TODO parse generator expression - peek_id, peek_value, peek_pos = token_stream.peek() - - if peek_id is Token.COMMA: - 
token_stream.pop() - elif peek_id is Token.PCLOSE: - pass - elif peek_id is Token.END and token_stream.ended: - raise ExtractorError('Unbalanced parentheses at %d' % open_pos) - else: - raise ExtractorError('''Expected ',' separator at %d''' % peek_pos) - - def _array_literal(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - # XXX check no linebreak here - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is not Token.SOPEN: - raise ExtractorError('Array expected at %d' % peek_pos) - token_stream.pop() - elements = [] - - has_another = True - while has_another: - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.COMMA: - token_stream.pop() - elements.append(None) - elif peek_id is Token.SCLOSE: - token_stream.pop() - has_another = False - elif peek_id is Token.ID and peek_value == 'for': - # TODO parse array comprehension - raise ExtractorError('Array comprehension is not yet supported at %d' % peek_pos) - else: - elements.append(self._assign_expression(token_stream, stack_top - 1)) - peek_id, peek_value, peek_pos = token_stream.pop() - if peek_id is Token.SCLOSE: - has_another = False - elif peek_id is not Token.COMMA: - raise ExtractorError('''Expected ',' after element at %d''' % peek_pos) - - return (Token.ARRAY, elements) - - def _object_literal(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - token_id, token_value, open_pos = token_stream.pop() - property_list = [] - while True: - token_id, token_value, token_pos = token_stream.pop() - if token_id is Token.CCLOSE: - break - elif token_id is Token.COMMA: - continue - elif token_id is Token.ID and token_value in ('get', 'set'): - is_set = token_id is Token.ID and token_value == 'set' - - token_id, token_value, token_pos = token_stream.pop() - if token_id not in (Token.ID, Token.STR, Token.INT, Token.FLOAT): - raise ExtractorError('Property name is expected at %d' % token_pos) - property_name = token_value - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.POPEN: - raise ExtractorError('''Expected '(' at %d''' % token_pos) - - if is_set: - token_stream.chk_id() - token_id, arg, token_pos = token_stream.pop() - - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.PCLOSE: - raise ExtractorError('''Expected ')' at %d''' % token_pos) - - if is_set: - desc = (Token.PROPSET, arg, self._function_body(token_stream, stack_top - 1)) - else: - desc = (Token.PROPGET, self._function_body(token_stream, stack_top - 1)) - - elif token_id in (Token.ID, Token.STR, Token.INT, Token.FLOAT): - property_name = token_value - token_id, token_value, token_pos = token_stream.pop() - if token_id is not Token.COLON: - raise ExtractorError('Property name is expected at %d' % token_pos) - - desc = (Token.PROPVALUE, self._assign_expression(token_stream, stack_top - 1)) - - elif token_stream.ended: - raise ExtractorError('Unmatched parentheses at %d' % open_pos) - else: - raise ExtractorError('Property assignment is expected at %d' % token_pos) - - property_list.append((property_name, desc)) - - return (Token.OBJECT, property_list) - - def _conditional_expression(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - expr = self._operator_expression(token_stream, stack_top - 1) - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.HOOK: - hook_pos = peek_pos - true_expr = 
self._assign_expression(token_stream, stack_top - 1) - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.COLON: - false_expr = self._assign_expression(token_stream, stack_top - 1) - else: - raise ExtractorError('Missing : in conditional expression at %d' % hook_pos) - return (Token.COND, expr, true_expr, false_expr) - return expr - - def _operator_expression(self, token_stream, stack_top): - if stack_top < 0: - raise ExtractorError('Recursion limit reached') - - # --<---------------------------------<-- op --<--------------------------<---- - # | | - # | --<-- prefix --<-- -->-- postfix -->-- | - # | | ^ ^ | ^ - # v v | | v | - # ->------------>----------->-- lefthand-side expression -->----------->------------>---| - # - # 20 grouping - # ... # handled by lefthandside_expression - # 17 postfix - # 16 unary - # 15 exponentiation # not yet found in grammar - # 14 mul - # 13 add - # 12 shift - # 11 rel - # 10 eq - # 9 band - # 8 bxor - # 7 bor - # 6 land - # 5 lor - # 4 cond # handled by conditional_expression - - out = [] - stack = [] - - while True: - had_inc = False - has_prefix = True - while has_prefix: - peek_id, peek_value, peek_pos = token_stream.peek() - if peek_id is Token.OP and peek_value[0] in (Token.ADD, Token.SUB): - # any binary operators will be consumed later - peek_id = Token.UOP - peek_value = convert_to_unary(peek_value) - if peek_id is Token.UOP: - name, op = peek_value - had_inc = name in (Token.INC, Token.DEC) - if had_inc: - peek_id = Token.PREFIX - while stack and stack[-1][0] > 16: - _, stack_id, stack_op = stack.pop() - out.append((stack_id, stack_op)) - stack.append((16, peek_id, op)) - token_stream.pop() - peek_id, peek_value, peek_pos = token_stream.peek() - if had_inc and peek_id is not Token.ID: - raise ExtractorError('Prefix operator has to be followed by an identifier at %d' % peek_pos) - has_prefix = peek_id is Token.UOP - else: - has_prefix = False - - left = self._member_expression(token_stream, stack_top - 1) - out.append(left) - - peek_id, peek_value, peek_pos = token_stream.peek() - # postfix - if peek_id is Token.UOP: - if had_inc: - raise ExtractorError('''Can't have prefix and postfix operator at the same time at %d''' % peek_pos) - name, op = peek_value - if name in (Token.INC, Token.DEC): - peek_id = Token.POSTFIX - prec = 17 - else: - raise ExtractorError('Unexpected operator at %d' % peek_pos) - while stack and stack[-1][0] >= 17: - _, stack_id, stack_op = stack.pop() - out.append((stack_id, stack_op)) - stack.append((prec, peek_id, op)) - token_stream.pop() - peek_id, peek_value, peek_pos = token_stream.peek() - - if peek_id is Token.REL: - name, op = peek_value - prec = 11 - elif peek_id is Token.OP: - name, op = peek_value - if name in (Token.MUL, Token.DIV, Token.MOD): - prec = 14 - elif name in (Token.ADD, Token.SUB): - prec = 13 - elif name in (Token.RSHIFT, Token.LSHIFT, Token.URSHIFT): - prec = 12 - elif name is Token.BAND: - prec = 9 - elif name is Token.BXOR: - prec = 8 - elif name is Token.BOR: - prec = 7 - else: - raise ExtractorError('Unexpected operator at %d' % peek_pos) - elif peek_id is Token.LOP: - name, op = peek_value - prec = {Token.OR: 5, Token.AND: 6}[name] - else: - op = None - prec = 4 # empties stack - - while stack and stack[-1][0] >= prec: - _, stack_id, stack_op = stack.pop() - out.append((stack_id, stack_op)) - if op is None: - break - else: - stack.append((prec, peek_id, op)) - token_stream.pop() - - return (Token.OPEXPR, out) - def interpret_statement(self, stmt): if stmt is None: return 
None @@ -935,8 +113,8 @@ class JSInterpreter(object): ref = s.getvalue() elif name is Token.VAR: for name, value in stmt[1]: - self.this[name] = Reference(self.interpret_expression(value).getvalue(), - (self.this, name)) + value = self.interpret_expression(value).getvalue() if value is not None else self.undefined + self.this[name] = Reference(value, (self.this, name)) elif name is Token.EXPR: for expr in stmt[1]: ref = self.interpret_expression(expr) @@ -1076,7 +254,7 @@ class JSInterpreter(object): fields) for f in fields_m: argnames = f.group('args').split(',') - obj[f.group('key')] = self.build_function(argnames, self.parse(f.group('code'))) + obj[f.group('key')] = self.build_function(argnames, Parser(f.group('code')).parse()) return obj @@ -1092,7 +270,7 @@ class JSInterpreter(object): raise ExtractorError('Could not find JS function %r' % funcname) argnames = func_m.group('args').split(',') - return self.build_function(argnames, self.parse(func_m.group('code'))) + return self.build_function(argnames, Parser(func_m.group('code')).parse()) def push_context(self, cx): self._context_stack.append(self._context) @@ -1125,7 +303,7 @@ class JSInterpreter(object): if cx is not None: self.push_context(cx) res = None - for stmt in self.parse(): + for stmt in Parser(self.code).parse(): res = self.interpret_statement(stmt) res = None if res is None else res.getvalue(deep=True) if self._context.ended: diff --git a/youtube_dl/jsinterp/jsparser.py b/youtube_dl/jsinterp/jsparser.py new file mode 100644 index 000000000..68f856b11 --- /dev/null +++ b/youtube_dl/jsinterp/jsparser.py @@ -0,0 +1,834 @@ +from __future__ import unicode_literals + +from ..utils import ExtractorError +from .jsgrammar import Token, token_keys +from .tstream import TokenStream, convert_to_unary + + +class Parser(object): + + def __init__(self, code, pos=0, stack_size=100): + self.token_stream = TokenStream(code, pos) + self.stack_top = stack_size + self._no_in = True + + def parse(self): + while not self.token_stream.ended: + yield self._source_element(self.stack_top) + raise StopIteration + + def _source_element(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is Token.ID and token_value == 'function': + source_element = self._function(stack_top - 1) + else: + source_element = self._statement(stack_top - 1) + + return source_element + + def _statement(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + statement = None + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is Token.END: + # empty statement goes straight here + self.token_stream.pop() + return statement + + # block + elif token_id is Token.COPEN: + # XXX refactor will deprecate some _statement calls + open_pos = token_pos + self.token_stream.pop() + block = [] + while True: + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is Token.CCLOSE: + self.token_stream.pop() + break + elif token_id is Token.END and self.token_stream.ended: + raise ExtractorError('Unbalanced parentheses at %d' % open_pos) + block.append(self._statement(stack_top - 1)) + + statement = (Token.BLOCK, block) + + elif token_id is Token.ID: + if token_value == 'var': + self.token_stream.pop() + variables = [] + init = [] + has_another = True + while has_another: + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.ID: + raise ExtractorError('Missing variable 
name at %d' % token_pos) + self.token_stream.chk_id(last=True) + variables.append(token_value) + + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.AOP: + self.token_stream.pop() + init.append(self._assign_expression(stack_top - 1)) + peek_id, peek_value, peek_pos = self.token_stream.peek() + else: + init.append(None) + + if peek_id is Token.END: + if self._no_in: + self.token_stream.pop() + has_another = False + elif peek_id is Token.COMMA: + # TODO for not NoIn + pass + else: + # FIXME automatic end insertion + # - token_id is Token.CCLOSE + # - check line terminator + # - restricted token + raise ExtractorError('Unexpected sequence at %d' % peek_pos) + statement = (Token.VAR, zip(variables, init)) + + elif token_value == 'if': + statement = self._if_statement(stack_top - 1) + + elif token_value == 'for': + statement = self._for_loop(stack_top - 1) + + elif token_value == 'do': + statement = self._do_loop(stack_top - 1) + + elif token_value == 'while': + statement = self._while_loop(stack_top - 1) + + elif token_value in ('break', 'continue'): + self.token_stream.pop() + token = {'break': Token.BREAK, 'continue': Token.CONTINUE}[token_value] + peek_id, peek_value, peek_pos = self.token_stream.peek() + # XXX no line break here + label_name = None + if peek_id is not Token.END: + self.token_stream.chk_id() + label_name = peek_value + self.token_stream.pop() + statement = (token, label_name) + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.END: + self.token_stream.pop() + else: + # FIXME automatic end insertion + raise ExtractorError('Unexpected sequence at %d' % peek_pos) + + elif token_value == 'return': + statement = self._return_statement(stack_top - 1) + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.END: + self.token_stream.pop() + else: + # FIXME automatic end insertion + raise ExtractorError('Unexpected sequence at %d' % peek_pos) + + elif token_value == 'with': + statement = self._with_statement(stack_top - 1) + + elif token_value == 'switch': + statement = self._switch_statement(stack_top - 1) + + elif token_value == 'throw': + self.token_stream.pop() + # XXX no line break here + expr = self._expression(stack_top - 1) + statement = (Token.RETURN, expr) + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.END: + self.token_stream.pop() + else: + # FIXME automatic end insertion + raise ExtractorError('Unexpected sequence at %d' % peek_pos) + + elif token_value == 'try': + statement = self._try_statement(stack_top - 1) + + elif token_value == 'debugger': + self.token_stream.pop() + statement = (Token.DEBUG) + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.END: + self.token_stream.pop() + else: + # FIXME automatic end insertion + raise ExtractorError('Unexpected sequence at %d' % peek_pos) + else: # label + # XXX possible refactoring (this is the only branch not poping) + token_id, token_value, token_pos = self.token_stream.peek(2) + if token_id is Token.COLON: + token_id, label_name, token_pos = self.token_stream.pop(2) + self.token_stream.chk_id(last=True) + statement = (Token.LABEL, label_name, self._statement(stack_top - 1)) + + # expr + if statement is None: + statement = self._expression(stack_top - 1) + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.END: + self.token_stream.pop() + else: + # FIXME automatic end insertion + raise ExtractorError('Unexpected sequence at %d' % peek_pos) + + return 
statement + + def _if_statement(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + self.token_stream.pop() + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.POPEN: + raise ExtractorError('Missing condition at %d' % token_pos) + cond_expr = self._expression(stack_top - 1) + self.token_stream.pop() # Token.PCLOSE + true_stmt = self._statement(stack_top - 1) + false_stmt = None + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is Token.ID and token_value == 'else': + self.token_stream.pop() + false_stmt = self._statement(stack_top - 1) + return (Token.IF, cond_expr, true_stmt, false_stmt) + + def _for_loop(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + self.token_stream.pop() + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.POPEN: + raise ExtractorError('''Expected '(' at %d''' % token_pos) + + # FIXME set infor True (checked by variable declaration and relation expression) + self._no_in = False + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is Token.END: + init = None + elif token_id is Token.ID and token_value == 'var': + # XXX change it on refactoring variable declaration list + init = self._statement(stack_top - 1) + else: + init = self._expression(stack_top - 1) + self._no_in = True + + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is Token.ID and token_value == 'in': + cond = self._expression(stack_top - 1) + # FIXME further processing of operator 'in' needed for interpretation + incr = None + # NOTE ES6 has 'of' operator + elif token_id is Token.END: + token_id, token_value, token_pos = self.token_stream.peek() + cond = None if token_id is Token.END else self._expression(stack_top - 1) + + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.END: + raise ExtractorError('''Expected ';' at %d''' % token_pos) + + token_id, token_value, token_pos = self.token_stream.peek() + incr = None if token_id is Token.END else self._expression(stack_top - 1) + else: + raise ExtractorError('Invalid condition in for loop initialization at %d' % token_pos) + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.PCLOSE: + raise ExtractorError('''Expected ')' at %d''' % token_pos) + body = self._statement(stack_top - 1) + return (Token.FOR, init, cond, incr, body) + + def _do_loop(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + self.token_stream.pop() + body = self._statement(stack_top - 1) + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.ID and token_value != 'while': + raise ExtractorError('''Expected 'while' at %d''' % token_pos) + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.POPEN: + raise ExtractorError('''Expected '(' at %d''' % token_pos) + expr = self._expression(stack_top - 1) + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.PCLOSE: + raise ExtractorError('''Expected ')' at %d''' % token_pos) + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.END: + self.token_stream.pop() + else: + # FIXME automatic end insertion + raise ExtractorError('''Expected ';' at %d''' % peek_pos) + return (Token.DO, expr, body) + + def _while_loop(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit 
reached') + + self.token_stream.pop() + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.POPEN: + raise ExtractorError('''Expected '(' at %d''' % token_pos) + expr = self._expression(stack_top - 1) + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.PCLOSE: + raise ExtractorError('''Expected ')' at %d''' % token_pos) + body = self._statement(stack_top) + return (Token.WHILE, expr, body) + + def _return_statement(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + self.token_stream.pop() + peek_id, peek_value, peek_pos = self.token_stream.peek() + # XXX no line break here + expr = self._expression(stack_top - 1) if peek_id is not Token.END else None + return (Token.RETURN, expr) + + def _with_statement(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + self.token_stream.pop() + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.POPEN: + raise ExtractorError('Missing expression at %d' % token_pos) + expr = self._expression(stack_top - 1) + self.token_stream.pop() # Token.PCLOSE + return (Token.WITH, expr, self._statement(stack_top - 1)) + + def _switch_statement(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + self.token_stream.pop() + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.POPEN: + raise ExtractorError('Missing expression at %d' % token_pos) + discriminant = self._expression(stack_top - 1) + self.token_stream.pop() # Token.PCLOSE + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.COPEN: + raise ExtractorError('Missing case block at %d' % token_pos) + open_pos = token_pos + has_default = False + block = [] + while True: + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is Token.CCLOSE: + break + elif token_id is Token.ID and token_value == 'case': + self.token_stream.pop() + expr = self._expression(stack_top - 1) + + elif token_id is Token.ID and token_value == 'default': + if has_default: + raise ExtractorError('Multiple default clause') + self.token_stream.pop() + has_default = True + expr = None + + elif token_id is Token.END and self.token_stream.ended: + raise ExtractorError('Unbalanced parentheses at %d' % open_pos) + else: + raise ExtractorError('Unexpected sequence at %d, default or case clause is expected' % + token_pos) + + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.COLON: + raise ExtractorError('''Unexpected sequence at %d, ':' is expected''' % token_pos) + + statement_list = [] + while True: + token_id, token_value, token_pos = self.token_stream.peek() + if token_id == Token.CCLOSE or (token_id is Token.ID and (token_value in ('default', 'case'))): + break + elif token_id is Token.END and self.token_stream.ended: + raise ExtractorError('Unbalanced parentheses at %d' % open_pos) + statement_list.append(self._statement(stack_top - 1)) + + block.append((expr, statement_list)) + self.token_stream.pop() + return (Token.SWITCH, discriminant, block) + + def _try_statement(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + self.token_stream.pop() + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is not Token.COPEN: + raise ExtractorError('Block is expected at %d' % token_pos) + try_block = self._statement(stack_top - 1) + token_id, token_value, 
token_pos = self.token_stream.pop() + catch_block = None + if token_id is Token.ID and token_value == 'catch': + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is not Token.POPEN: + raise ExtractorError('Catch clause is missing an identifier at %d' % token_pos) + self.token_stream.pop() + self.token_stream.chk_id() + token_id, error_name, token_pos = self.token_stream.pop() + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.PCLOSE: + raise ExtractorError('Catch clause expects a single identifier at %d' % token_pos) + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is not Token.COPEN: + raise ExtractorError('Block is expected at %d' % token_pos) + catch_block = (error_name, self._statement(stack_top - 1)) + finally_block = None + if token_id is Token.ID and token_value == 'finally': + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is not Token.COPEN: + raise ExtractorError('Block is expected at %d' % token_pos) + finally_block = self._statement(stack_top - 1) + if catch_block is None and finally_block is None: + raise ExtractorError('Try statement is expecting catch or finally at %d' % token_pos) + return (Token.TRY, try_block, catch_block, finally_block) + + def _expression(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + expr_list = [] + has_another = True + while has_another: + expr_list.append(self._assign_expression(stack_top - 1)) + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.COMMA: + self.token_stream.pop() + elif peek_id is Token.ID and peek_value == 'yield': + # TODO parse yield + raise ExtractorError('Yield statement is not yet supported at %d' % peek_pos) + else: + has_another = False + return (Token.EXPR, expr_list) + + def _assign_expression(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + left = self._conditional_expression(stack_top - 1) + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.AOP: + self.token_stream.pop() + _, op = peek_value + right = self._assign_expression(stack_top - 1) + else: + op = None + right = None + return (Token.ASSIGN, op, left, right) + + def _member_expression(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.ID and peek_value == 'new': + self.token_stream.pop() + target = self._member_expression(stack_top - 1) + args = self._arguments(stack_top - 1) + # Rhino has check for args length + # Rhino has experimental syntax allowing an object literal to follow a new expression + else: + target = self._primary_expression(stack_top) + args = None + + return (Token.MEMBER, target, args, self._member_tail(stack_top - 1)) + + def _member_tail(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.DOT: + self.token_stream.pop() + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.DOT: + self.token_stream.pop() + peek_id, peek_value, peek_pos = self.token_stream.peek() + elif peek_id is Token.POPEN: + # TODO parse field query + raise ExtractorError('Field query is not yet supported at %d' % peek_pos) + + if peek_id is Token.ID: + self.token_stream.pop() + return (Token.FIELD, peek_value, self._member_tail(stack_top - 1)) + else: + raise 
ExtractorError('Identifier name expected at %d' % peek_pos) + elif peek_id is Token.SOPEN: + self.token_stream.pop() + index = self._expression(stack_top - 1) + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is Token.SCLOSE: + return (Token.ELEM, index, self._member_tail(stack_top - 1)) + else: + raise ExtractorError('Unexpected sequence at %d' % token_pos) + elif peek_id is Token.POPEN: + args = self._arguments(stack_top - 1) + return (Token.CALL, args, self._member_tail(stack_top - 1)) + else: + return None + + def _primary_expression(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + # TODO support let + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id in token_keys: + if peek_id is Token.ID: + # this + if peek_value == 'this': + self.token_stream.pop() + return (Token.RSV, 'this') + # function expr + elif peek_value == 'function': + return self._function(stack_top - 1, True) + # id + else: + self.token_stream.chk_id() + self.token_stream.pop() + return (Token.ID, peek_value) + # literals + else: + self.token_stream.pop() + return (peek_id, peek_value) + # array + elif peek_id is Token.SOPEN: + return self._array_literal(stack_top - 1) + # object + elif peek_id is Token.COPEN: + return self._object_literal(stack_top) + # expr + elif peek_id is Token.POPEN: + self.token_stream.pop() + open_pos = peek_pos + expr = self._expression(stack_top - 1) + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is not Token.PCLOSE: + raise ExtractorError('Unbalanced parentheses at %d' % open_pos) + self.token_stream.pop() + return expr + else: + raise ExtractorError('Syntax error at %d' % peek_pos) + + def _function(self, stack_top, is_expr=False): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + self.token_stream.pop() + token_id, token_value, token_pos = self.token_stream.peek() + + name = None + if token_id is Token.ID: + self.token_stream.chk_id() + token_id, name, token_pos = self.token_stream.pop() + token_id, token_value, token_pos = self.token_stream.peek() + elif not is_expr: + raise ExtractorError('Function declaration at %d is missing identifier' % token_pos) + + if token_id is not Token.POPEN: + raise ExtractorError('Expected argument list at %d' % token_pos) + + # args + self.token_stream.pop() + open_pos = token_pos + args = [] + while True: + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is Token.PCLOSE: + self.token_stream.pop() + break + self.token_stream.chk_id() + self.token_stream.pop() + args.append(token_value) + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is Token.COMMA: + self.token_stream.pop() + elif token_id is Token.PCLOSE: + pass + elif token_id is Token.END and self.token_stream.ended: + raise ExtractorError('Unbalanced parentheses at %d' % open_pos) + else: + raise ExtractorError('Expected , separator at %d' % token_pos) + + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is not Token.COPEN: + raise ExtractorError('Expected function body at %d' % token_pos) + + return (Token.FUNC, name, args, (self._function_body(stack_top - 1))) + + def _function_body(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + token_id, token_value, open_pos = self.token_stream.pop() + body = [] + while True: + token_id, token_value, token_pos = self.token_stream.peek() + if token_id is Token.CCLOSE: + self.token_stream.pop() + break + elif 
token_id is Token.END and self.token_stream.ended: + raise ExtractorError('Unbalanced parentheses at %d' % open_pos) + body.append(self._source_element(stack_top - 1)) + + return body + + def _arguments(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.POPEN: + self.token_stream.pop() + open_pos = peek_pos + else: + return None + args = [] + while True: + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.PCLOSE: + self.token_stream.pop() + return args + # FIXME handle infor + args.append(self._assign_expression(stack_top - 1)) + # TODO parse generator expression + peek_id, peek_value, peek_pos = self.token_stream.peek() + + if peek_id is Token.COMMA: + self.token_stream.pop() + elif peek_id is Token.PCLOSE: + pass + elif peek_id is Token.END and self.token_stream.ended: + raise ExtractorError('Unbalanced parentheses at %d' % open_pos) + else: + raise ExtractorError('''Expected ',' separator at %d''' % peek_pos) + + def _array_literal(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + # XXX check no linebreak here + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is not Token.SOPEN: + raise ExtractorError('Array expected at %d' % peek_pos) + self.token_stream.pop() + elements = [] + + has_another = True + while has_another: + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.COMMA: + self.token_stream.pop() + elements.append(None) + elif peek_id is Token.SCLOSE: + self.token_stream.pop() + has_another = False + elif peek_id is Token.ID and peek_value == 'for': + # TODO parse array comprehension + raise ExtractorError('Array comprehension is not yet supported at %d' % peek_pos) + else: + elements.append(self._assign_expression(stack_top - 1)) + peek_id, peek_value, peek_pos = self.token_stream.pop() + if peek_id is Token.SCLOSE: + has_another = False + elif peek_id is not Token.COMMA: + raise ExtractorError('''Expected ',' after element at %d''' % peek_pos) + + return (Token.ARRAY, elements) + + def _object_literal(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + token_id, token_value, open_pos = self.token_stream.pop() + property_list = [] + while True: + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is Token.CCLOSE: + break + elif token_id is Token.COMMA: + continue + elif token_id is Token.ID and token_value in ('get', 'set'): + is_set = token_id is Token.ID and token_value == 'set' + + token_id, token_value, token_pos = self.token_stream.pop() + if token_id not in (Token.ID, Token.STR, Token.INT, Token.FLOAT): + raise ExtractorError('Property name is expected at %d' % token_pos) + property_name = token_value + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.POPEN: + raise ExtractorError('''Expected '(' at %d''' % token_pos) + + if is_set: + self.token_stream.chk_id() + token_id, arg, token_pos = self.token_stream.pop() + + token_id, token_value, token_pos = self.token_stream.pop() + if token_id is not Token.PCLOSE: + raise ExtractorError('''Expected ')' at %d''' % token_pos) + + if is_set: + desc = (Token.PROPSET, arg, self._function_body(stack_top - 1)) + else: + desc = (Token.PROPGET, self._function_body(stack_top - 1)) + + elif token_id in (Token.ID, Token.STR, Token.INT, Token.FLOAT): + property_name = token_value + token_id, token_value, 
token_pos = self.token_stream.pop() + if token_id is not Token.COLON: + raise ExtractorError('Property name is expected at %d' % token_pos) + + desc = (Token.PROPVALUE, self._assign_expression(stack_top - 1)) + + elif self.token_stream.ended: + raise ExtractorError('Unmatched parentheses at %d' % open_pos) + else: + raise ExtractorError('Property assignment is expected at %d' % token_pos) + + property_list.append((property_name, desc)) + + return (Token.OBJECT, property_list) + + def _conditional_expression(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + expr = self._operator_expression(stack_top - 1) + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.HOOK: + hook_pos = peek_pos + true_expr = self._assign_expression(stack_top - 1) + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.COLON: + false_expr = self._assign_expression(stack_top - 1) + else: + raise ExtractorError('Missing : in conditional expression at %d' % hook_pos) + return (Token.COND, expr, true_expr, false_expr) + return expr + + def _operator_expression(self, stack_top): + if stack_top < 0: + raise ExtractorError('Recursion limit reached') + + # --<---------------------------------<-- op --<--------------------------<---- + # | | + # | --<-- prefix --<-- -->-- postfix -->-- | + # | | ^ ^ | ^ + # v v | | v | + # ->------------>----------->-- lefthand-side expression -->----------->------------>---| + # + # 20 grouping + # ... # handled by lefthandside_expression + # 17 postfix + # 16 unary + # 15 exponentiation # not yet found in grammar + # 14 mul + # 13 add + # 12 shift + # 11 rel + # 10 eq + # 9 band + # 8 bxor + # 7 bor + # 6 land + # 5 lor + # 4 cond # handled by conditional_expression + + out = [] + stack = [] + + while True: + had_inc = False + has_prefix = True + while has_prefix: + peek_id, peek_value, peek_pos = self.token_stream.peek() + if peek_id is Token.OP and peek_value[0] in (Token.ADD, Token.SUB): + # any binary operators will be consumed later + peek_id = Token.UOP + peek_value = convert_to_unary(peek_value) + if peek_id is Token.UOP: + name, op = peek_value + had_inc = name in (Token.INC, Token.DEC) + if had_inc: + peek_id = Token.PREFIX + while stack and stack[-1][0] > 16: + _, stack_id, stack_op = stack.pop() + out.append((stack_id, stack_op)) + stack.append((16, peek_id, op)) + self.token_stream.pop() + peek_id, peek_value, peek_pos = self.token_stream.peek() + if had_inc and peek_id is not Token.ID: + raise ExtractorError('Prefix operator has to be followed by an identifier at %d' % peek_pos) + has_prefix = peek_id is Token.UOP + else: + has_prefix = False + + left = self._member_expression(stack_top - 1) + out.append(left) + + peek_id, peek_value, peek_pos = self.token_stream.peek() + # postfix + if peek_id is Token.UOP: + if had_inc: + raise ExtractorError('''Can't have prefix and postfix operator at the same time at %d''' % peek_pos) + name, op = peek_value + if name in (Token.INC, Token.DEC): + peek_id = Token.POSTFIX + prec = 17 + else: + raise ExtractorError('Unexpected operator at %d' % peek_pos) + while stack and stack[-1][0] >= 17: + _, stack_id, stack_op = stack.pop() + out.append((stack_id, stack_op)) + stack.append((prec, peek_id, op)) + self.token_stream.pop() + peek_id, peek_value, peek_pos = self.token_stream.peek() + + if peek_id is Token.REL: + name, op = peek_value + prec = 11 + elif peek_id is Token.OP: + name, op = peek_value + if name in (Token.MUL, Token.DIV, Token.MOD): + prec 
= 14 + elif name in (Token.ADD, Token.SUB): + prec = 13 + elif name in (Token.RSHIFT, Token.LSHIFT, Token.URSHIFT): + prec = 12 + elif name is Token.BAND: + prec = 9 + elif name is Token.BXOR: + prec = 8 + elif name is Token.BOR: + prec = 7 + else: + raise ExtractorError('Unexpected operator at %d' % peek_pos) + elif peek_id is Token.LOP: + name, op = peek_value + prec = {Token.OR: 5, Token.AND: 6}[name] + else: + op = None + prec = 4 # empties stack + + while stack and stack[-1][0] >= prec: + _, stack_id, stack_op = stack.pop() + out.append((stack_id, stack_op)) + if op is None: + break + else: + stack.append((prec, peek_id, op)) + self.token_stream.pop() + + return (Token.OPEXPR, out)
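
Usage note (not part of the patch itself): after this change the parser is driven as a standalone object, mirroring the updated test_jsinterp_parse.py and the Parser(...).parse() calls left in jsinterp.py. A minimal sketch, assuming a checkout with this patch applied is importable and using an illustrative source string:

    from youtube_dl.jsinterp.jsparser import Parser

    code = 'function f(x) { return x + 1; }'

    # Parser.parse() is a generator yielding one AST node per source element,
    # so callers materialize it with list(), as the updated test does.
    jsp = Parser(code, stack_size=100)
    ast = list(jsp.parse())

    for node in ast:
        print(node)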
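
A note on _operator_expression (moved verbatim into Parser): it flattens operator chains into postfix order, returned as (Token.OPEXPR, out), using an operator-precedence stack with the numeric levels from its comment block (mul 14, add 13, shift 12, rel 11, band 9, bxor 8, bor 7, land 6, lor 5); pending operators are flushed to the output whenever an operator of lower or equal precedence arrives. Below is a standalone sketch of the same technique on plain strings (the token list, PRECEDENCE table, and to_postfix name are illustrative, not identifiers from this patch):

    # Illustrative precedence levels, matching the ordering used in
    # _operator_expression (higher number = binds tighter).
    PRECEDENCE = {'*': 14, '/': 14, '%': 14, '+': 13, '-': 13,
                  '<<': 12, '>>': 12, '<': 11, '>': 11,
                  '&': 9, '^': 8, '|': 7, '&&': 6, '||': 5}

    def to_postfix(tokens):
        out, stack = [], []
        for tok in tokens:
            if tok in PRECEDENCE:
                prec = PRECEDENCE[tok]
                # left-associative: flush operators that bind at least as tightly
                while stack and PRECEDENCE[stack[-1]] >= prec:
                    out.append(stack.pop())
                stack.append(tok)
            else:
                out.append(tok)  # operands go straight to the output
        while stack:
            out.append(stack.pop())
        return out

    print(to_postfix(['a', '+', 'b', '*', 'c', '<<', '2']))
    # prints ['a', 'b', 'c', '*', '+', '2', '<<']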