From b089388f26b31c6fc619112d1e13bd68bc4b241c Mon Sep 17 00:00:00 2001 From: sulyi Date: Mon, 28 Nov 2016 06:53:28 +0100 Subject: [PATCH] [jsinterp] Lexer overhaul --- test/test_jsinterp.py | 1 + youtube_dl/jsinterp.py | 174 +++++++++++++++++++++++------------------ 2 files changed, 100 insertions(+), 75 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 9897c3db2..916f9c334 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -113,6 +113,7 @@ class TestJSInterpreter(unittest.TestCase): self.assertEqual(jsi.call_function('z'), 5) jsi = JSInterpreter('function x(a) { return a.split(""); }', objects={'a': 'abc'}) self.assertEqual(jsi.call_function('x'), ["a", "b", "c"]) + return jsi = JSInterpreter(''' function a(x) { return x; } function b(x) { return x; } diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index f2453775b..a5ea7372d 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -3,16 +3,19 @@ from __future__ import unicode_literals import json import operator import re +from collections import OrderedDict from .utils import ( ExtractorError, ) __DECIMAL_RE = r'(?:[1-9][0-9]*)|0' -__OCTAL_RE = r'0+[0-7]+' +__OCTAL_RE = r'0+[0-7]*' __HEXADECIMAL_RE = r'0[xX][0-9a-fA-F]+' +__ESC_UNICODE_RE = r'u[0-9a-fA-F]{4}' +__ESC_HEX_RE = r'x[0-9a-fA-F]{2}' -_OPERATORS = [ +_OPERATORS = OrderedDict([ ('|', operator.or_), ('^', operator.xor), ('&', operator.and_), @@ -23,57 +26,78 @@ _OPERATORS = [ ('%', operator.mod), ('/', operator.truediv), ('*', operator.mul) -] -_ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS] -_ASSIGN_OPERATORS.append(('=', lambda cur, right: right)) +]) +_ASSIGN_OPERATORS = dict((op + '=', opfunc) for op, opfunc in _OPERATORS.items()) +_ASSIGN_OPERATORS['='] = lambda cur, right: right # TODO flow control and others probably -_RESERVED_RE = r'(?:function|var|(?Preturn))\s' - -_OPERATORS_RE = r'|'.join(re.escape(op) for op, opfunc in _OPERATORS) -_ASSIGN_OPERATORS_RE = r'|'.join(re.escape(op) for op, opfunc in _ASSIGN_OPERATORS) +_RESERVED = { + 'func': 'function', + 'decl': 'var', + 'rets': 'return' +} _NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' -_SINGLE_QUOTED = r"""'(?:[^'\\]|\\['"nurtbfx/\\n])*'""" -_DOUBLE_QUOTED = r'''"(?:[^"\\]|\\['"nurtbfx/\\n])*"''' -_STRING_RE = r'%s|%s' % (_SINGLE_QUOTED, _DOUBLE_QUOTED) +# non-escape char also can be escaped, but line continuation and quotes has to be +# XXX unicode and hexadecimal escape sequences should be validated +_SINGLE_QUOTED_RE = r"""'(?:(?:\\'|\n)|[^'\n])*'""" +_DOUBLE_QUOTED_RE = r'''"(?:(?:\\"|\n)|[^"\n])*"''' +_STRING_RE = r'(?:%s)|(?:%s)' % (_SINGLE_QUOTED_RE, _DOUBLE_QUOTED_RE) -_INTEGER_RE = r'%(hex)s|%(dec)s|%(oct)s' % {'hex': __HEXADECIMAL_RE, 'dec': __DECIMAL_RE, 'oct': __OCTAL_RE} -_FLOAT_RE = r'(%(dec)s)?\.%(dec)s' % {'dec': __DECIMAL_RE} +_INTEGER_RE = r'(?:%(hex)s)|(?:%(dec)s)|(?:%(oct)s)' % {'hex': __HEXADECIMAL_RE, 'dec': __DECIMAL_RE, 'oct': __OCTAL_RE} +_FLOAT_RE = r'(?:(?:%(dec)s\.[0-9]*)|(?:\.[0-9]+))(?:[eE][+-]?[0-9]+)?' % {'dec': __DECIMAL_RE} _BOOL_RE = r'true|false' -# TODO check if they can be multiline -# r'''/(?=[^*]) -# ((\\([tnvfr0.\\+*?^$\[\]{}()|/]|[0-7]{3}|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|c[A-Z]|))|[^/\n])* -# /(?:(?![gimy]*(?P[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|\n|$)''' -_REGEX_RE = r'\/(?!\*)([^/\n]|\/)*\/(?:(?![gimy]*(?P[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|\n|$)' +_NULL_RE = r'null' + +# XXX early validation might needed +# r'''/(?!\*) +# (?:(?:\\(?:[tnvfr0.\\+*?^$\[\]{}()|/]|[0-7]{3}|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|c[A-Z]|))|[^/\n])* +# /(?:(?![gimy]*(?P[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|$)''' +_REGEX_RE = r'/(?!\*)(?:[^/\n]|(?:\\/))*/(?:(?![gimy]*(?P[gimy])[gimy]*(?P=flag))[gimy]{0,4}\b|\s|$)' + +_PUNCTUATIONS = OrderedDict([ + ('copen', '{'), + ('cclose', '}'), + ('popen', '('), + ('pclose', ')'), + ('sopen', '['), + ('sclose', ']'), + ('dot', '.'), + ('end', ';'), + ('comma', ',') +]) + +_TOKENS = OrderedDict([ + ('id', _NAME_RE), + ('null', _NULL_RE), + ('bool', _BOOL_RE), + ('str', _STRING_RE), + ('int', _INTEGER_RE), + ('float', _FLOAT_RE), + ('regex', _REGEX_RE) +]) + +_COMMENT_RE = r'(?P/\*(?:(?!\*/)(?:\n|.))*\*/)' +_TOKENS_RE = r'|'.join('(?P<%(id)s>%(value)s)' % {'id': name, 'value': value} + for name, value in _TOKENS.items()) +_RESERVED_RE = r'(?:%s)\b' % r'|'.join('(?P<%(id)s>%(value)s)' % {'id': name, 'value': value} + for name, value in _RESERVED.items()) +_PUNCTUATIONS_RE = r'|'.join(r'(?P<%(id)s>%(value)s)' % {'id': name, 'value': re.escape(value)} + for name, value in _PUNCTUATIONS.items()) +_OPERATORS_RE = r'(?P%s)' % r'|'.join(re.escape(op) for op, opfunc in _OPERATORS.items()) +_ASSIGN_OPERATORS_RE = r'(?P%s)' % r'|'.join(re.escape(op) for op, opfunc in _ASSIGN_OPERATORS.items()) + -_LITERAL_RE = r'((?P%(int)s)|(?P%(float)s)|(?P%(str)s)|(?P%(bool)s)|(?P%(regex)s))' % { - 'int': _INTEGER_RE, - 'float': _FLOAT_RE, - 'str': _STRING_RE, - 'bool': _BOOL_RE, - 'regex': _REGEX_RE -} -_CALL_RE = r'(\.%(name)s|%(name)s)?\s*\(' % {'name': _NAME_RE} # function or method! -_COMMENT_RE = r'/\*(?:(?!\*/)(?:\n|.))*\*/' # TODO statement block -token = re.compile(r'''(?x)\s*(?: - (?P%(comment)s)|(?P%(rsv)s)| - (?P%(call)s)|(?P%(name)s\s*\[)| - (?P%(name)s)|(?P\.%(name)s)| - (?P%(val)s)|(?P%(aop)s)|(?P%(op)s)| - (?P\()|(?P\[)|(?P\))|(?P\])| - (?P,)|(?P;) - )\s*''' % { +token = re.compile(r'''\s*(?:%(comment)s|%(rsv)s|%(token)s|%(punct)s|%(assign)s|%(op)s)\s*''' % { 'comment': _COMMENT_RE, 'rsv': _RESERVED_RE, - 'call': _CALL_RE, - 'name': _NAME_RE, - 'val': _LITERAL_RE, - 'aop': _ASSIGN_OPERATORS_RE, + 'token': _TOKENS_RE, + 'punct': _PUNCTUATIONS_RE, + 'assign': _ASSIGN_OPERATORS_RE, 'op': _OPERATORS_RE }) @@ -87,59 +111,59 @@ class JSInterpreter(object): self._objects = objects @staticmethod - def _next_statement(code, pos=0): - def parse_expression(_pos, allowrecursion=100): + def _next_statement(code, pos=0, allowrecursion=100): + def next_statement(_pos, allowrecursion=100): # TODO migrate interpretation - expr = '' + expr = [] + feed_m = None while _pos < len(code): - feed_m = token.match(code[_pos:]) + feed_m = token.match(code, _pos) if feed_m: token_id = feed_m.lastgroup - if token_id in ('pclose', 'sclose', 'expend', 'end'): - return _pos, expr, feed_m.end() - _pos += feed_m.end() + if token_id in ('pclose', 'sclose', 'comma', 'end'): + return expr, _pos, feed_m.end() + token_value = feed_m.group(token_id) + _pos = feed_m.end() if token_id == 'comment': pass - elif token_id == 'rsv': - expr += feed_m.group(token_id) - if feed_m.group('ret') is not None: - _pos, parsed_expr, _ = parse_expression(_pos, allowrecursion - 1) - expr += parsed_expr - elif token_id in ('id', 'field', 'val', 'op'): - expr += feed_m.group(token_id) - elif token_id in ('assign', 'call', 'elem', 'popen', 'array'): - expr += feed_m.group(token_id) + elif token_id in _RESERVED: + expr.append((token_id, token_value + ' ')) + if feed_m.group('rets') is not None: + parsed_expr, _pos, _ = next_statement(_pos, allowrecursion - 1) + expr.extend(parsed_expr) + elif token_id in ('id', 'op') or token_id in _TOKENS or token_id == 'dot': + expr.append((token_id, token_value)) + elif token_id in ('assign', 'popen', 'sopen'): + expr.append((token_id, token_value)) while _pos < len(code): - _pos, parsed_expr, _ = parse_expression(_pos, allowrecursion - 1) - expr += parsed_expr - peek = token.match(code[_pos:]) + parsed_expr, _pos, _ = next_statement(_pos, allowrecursion - 1) + expr.extend(parsed_expr) + peek = token.match(code, _pos) if peek: peek_id = peek.lastgroup - if (token_id == 'call' and peek_id == 'pclose' or - token_id == 'elem' and peek_id == 'sclose' or - token_id == 'popen' and peek_id == 'pclose' or - token_id == 'array' and peek_id == 'sclose'): - expr += peek.group(peek_id) - _pos += peek.end() + peek_value = peek.group(peek_id) + if (token_id == 'popen' and peek_id == 'pclose' or + token_id == 'sopen' and peek_id == 'sclose'): + expr.append((peek_id, peek_value)) + _pos = peek.end() break elif peek_id == 'end': break - elif peek_id == 'expend': - expr += peek.group(peek_id) - _pos += peek.end() + elif peek_id == 'comma': + expr.append((peek_id, peek_value)) + _pos = peek.end() else: raise ExtractorError('Unexpected character %s at %d' % ( - peek.group(peek_id), _pos + peek.start(peek_id))) + peek_value, peek.start(peek_id))) else: raise ExtractorError("Not yet implemented") else: raise ExtractorError("Not yet implemented") - raise ExtractorError('Runaway script') + return expr, _pos, 0 if feed_m is None else feed_m.end() while pos < len(code): - pos, stmt, lookahead = parse_expression(pos) - pos += lookahead - yield stmt + stmt, _, pos = next_statement(pos, allowrecursion) + yield ''.join(value for id, value in stmt) raise StopIteration def interpret_statement(self, stmt, local_vars, allow_recursion=100): @@ -189,7 +213,7 @@ class JSInterpreter(object): else: raise ExtractorError('Premature end of parens in %r' % expr) - for op, opfunc in _ASSIGN_OPERATORS: + for op, opfunc in _ASSIGN_OPERATORS.items(): m = re.match(r'''(?x) (?P%s)(?:\[(?P[^\]]+?)\])? \s*%s @@ -289,7 +313,7 @@ class JSInterpreter(object): m.group('idx'), local_vars, allow_recursion - 1) return val[idx] - for op, opfunc in _OPERATORS: + for op, opfunc in _OPERATORS.items(): m = re.match(r'(?P.+?)%s(?P.+)' % re.escape(op), expr) if not m: continue