diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a8df4aef0..35d2c0096 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -8,6 +8,10 @@ from .utils import ( ExtractorError, ) +__DECIMAL_RE = r'([1-9][0-9]*)|0' +__OCTAL_RE = r'0+[0-7]+' +__HEXADECIMAL_RE = r'(0[xX])[0-9a-fA-F]+' + _OPERATORS = [ ('|', operator.or_), ('^', operator.xor), @@ -18,13 +22,59 @@ _OPERATORS = [ ('+', operator.add), ('%', operator.mod), ('/', operator.truediv), - ('*', operator.mul), + ('*', operator.mul) ] _ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS] _ASSIGN_OPERATORS.append(('=', lambda cur, right: right)) +_RESERVED_RE = r'(function|var|return)\s' + +_OPERATORS_RE = r'|'.join(re.escape(op) for op, opfunc in _OPERATORS) +_ASSIGN_OPERATORS_RE = r'|'.join(re.escape(op) for op, opfunc in _ASSIGN_OPERATORS) + _NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' +# can't use raw string, starts with " and end with ' +_STRING_RE = '''"(?:[^"\\\\]*(?:\\\\\\\\|\\\\[\'"nurtbfx/\\n]))*[^"\\\\]*"| + \'(?:[^\'\\\\]*(?:\\\\\\\\|\\\\[\'"nurtbfx/\\n]))*[^\'\\\\]*\'''' + +_INTEGER_RE = r'%(hex)s|%(dec)s|%(oct)s' % {'hex': __HEXADECIMAL_RE, 'dec': __DECIMAL_RE, 'oct': __OCTAL_RE} +_FLOAT_RE = r'%(dec)s\.%(dec)s' % {'dec': __DECIMAL_RE} + +_BOOL_RE = r'true|false' +_REGEX_RE = r'/[^/]*/' # TODO make validation work + +_LITERAL_RE = r'(%(int)s|%(float)s|%(str)s|%(bool)s|%(regex)s)' % { + 'int': _INTEGER_RE, + 'float': _FLOAT_RE, + 'str': _STRING_RE, + 'bool': _BOOL_RE, + 'regex': _REGEX_RE +} +_ARRAY_RE = r'\[(%(literal)s\s*,\s*)*(%(literal)s\s*)?\]' % {'literal': _LITERAL_RE} # TODO nested array + +_VALUE_RE = r'(%(literal)s)|(%(array)s)' % {'literal': _LITERAL_RE, 'array': _ARRAY_RE} +_CALL_RE = r'%(name)s\s*\((%(val)s\s*,\s*)*(%(val)s\s*)?\)' % {'name': _NAME_RE, 'val': _VALUE_RE} +_PARENTHESES_RE = r'(?P\()|(?P\))|(?P\[)|(?P\])' +_EXP_RE = r'''(?P%(name)s)|(?P%(val)s)|(?P%(op)s)|%(par)s''' % { + 'name': _NAME_RE, + 'val': _VALUE_RE, + 'op': _OPERATORS_RE, + 'par': _PARENTHESES_RE +} # TODO validate expression (it's probably recursive!) +_ARRAY_ELEMENT_RE = r'%(name)s\s*\[\s*(%(index)s)\s*\]' % {'name': _NAME_RE, 'index': _EXP_RE} + +token = re.compile(r'''(?x)\s* + ((?P%(rsv)s)|(?P%(call)s)|(?P\.%(name)s)| + (?P%(aop)s)|(%(exp)s)| + (?P;))\s*''' % { + 'rsv': _RESERVED_RE, + 'call': _CALL_RE, + 'name': _NAME_RE, + 'aop': _ASSIGN_OPERATORS_RE, + 'exp': _EXP_RE +}) + class JSInterpreter(object): def __init__(self, code, objects=None): @@ -34,6 +84,41 @@ class JSInterpreter(object): self._functions = {} self._objects = objects + @staticmethod + def _next_statement(code, pos=0): + stmt = '' + while pos < len(code): + feed_m = token.match(code[pos:]) + if feed_m: + for token_id, token_value in feed_m.groupdict().items(): + if token_value is not None: + pos += feed_m.end() + if token_id == 'end': + yield stmt + stmt = '' + else: + if token_id == 'rsv': + pass + if token_id == 'call': + pass + if token_id == 'field': + pass + if token_id == 'id': + pass + if token_id == 'val': + pass + if token_id == 'popen': + pass + if token_id == 'pclose': + pass + if token_id == 'op': + pass + if token_id == 'assign': + pass + stmt += token_value + else: + raise NotImplemented("Possibly I've missed something") + def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: raise ExtractorError('Recursion limit reached') @@ -250,7 +335,7 @@ class JSInterpreter(object): def build_function(self, argnames, code): def resf(args): local_vars = dict(zip(argnames, args)) - for stmt in code.split(';'): + for stmt in self._next_statement(code): res, abort = self.interpret_statement(stmt, local_vars) if abort: break