1204 lines
41 KiB
Python
Executable file
1204 lines
41 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
|
|
import io
|
|
import re
|
|
import unittest
|
|
import typing
|
|
|
|
import jsparagus
|
|
from jsparagus import gen, lexer, rewrites
|
|
from jsparagus.grammar import (Grammar, Production, CallMethod, Nt,
|
|
Optional, LookaheadRule, NtDef, Var)
|
|
from js_parser.parse_esgrammar import parse_esgrammar
|
|
|
|
|
|
LispTokenizer = lexer.LexicalGrammar("( )", SYMBOL=r'[!%&*+:<=>?@A-Z^_a-z~]+')
|
|
|
|
|
|
def prod(body, method_name):
|
|
return Production(body, CallMethod(method_name, list(range(len(body)))))
|
|
|
|
|
|
class GenTestCase(unittest.TestCase):
|
|
def compile(self, tokenize, grammar, **kwargs):
|
|
"""Compile a grammar. Use this when you expect compilation to
|
|
succeed."""
|
|
self.tokenize = tokenize
|
|
self.parser_class = gen.compile(grammar, **kwargs)
|
|
|
|
def parse(self, text, goal=None):
|
|
if goal is None:
|
|
parser = self.parser_class()
|
|
else:
|
|
parser = self.parser_class(goal=goal)
|
|
lexer = self.tokenize(parser)
|
|
lexer.write(text)
|
|
return lexer.close()
|
|
|
|
def compile_multi(self, tokenize, grammar):
|
|
self.tokenize = tokenize
|
|
obj = gen.compile_multi(grammar)
|
|
for attr in dir(obj):
|
|
if attr.startswith("parse_"):
|
|
setattr(self, attr, getattr(obj, attr))
|
|
|
|
def assertParse(self, s, expected=None, *, goal=None):
|
|
result = self.parse(s, goal=goal)
|
|
if expected is not None:
|
|
self.assertEqual(expected, result)
|
|
|
|
def assertNoParse(self, s, *, goal=None, message="banana"):
|
|
if goal is None:
|
|
kwargs = {}
|
|
else:
|
|
kwargs = {"goal": goal}
|
|
self.assertRaisesRegex(
|
|
SyntaxError,
|
|
re.escape(message),
|
|
lambda: self.parse(s, **kwargs))
|
|
|
|
def testSimple(self):
|
|
grammar = parse_esgrammar(
|
|
"""
|
|
expr :
|
|
SYMBOL => $0
|
|
`(` tail
|
|
|
|
tail :
|
|
`)` => $0
|
|
expr tail
|
|
""",
|
|
terminal_names=["SYMBOL"]
|
|
)
|
|
self.compile(LispTokenizer, grammar)
|
|
|
|
self.assertParse(
|
|
"(lambda (x) (* x x))",
|
|
('expr_1',
|
|
'(',
|
|
('tail_1',
|
|
'lambda',
|
|
('tail_1',
|
|
('expr_1', '(', ('tail_1', 'x', ')')),
|
|
('tail_1',
|
|
('expr_1',
|
|
'(',
|
|
('tail_1',
|
|
'*',
|
|
('tail_1',
|
|
'x',
|
|
('tail_1', 'x', ')')))),
|
|
')')))))
|
|
|
|
def testEnd(self):
|
|
self.compile(
|
|
lexer.LexicalGrammar("ONE TWO"),
|
|
Grammar({
|
|
'goal': [
|
|
['ONE', 'TWO']
|
|
]
|
|
})
|
|
)
|
|
self.assertNoParse("ONE TWO TWO",
|
|
message="expected 'end of input', got 'TWO'")
|
|
|
|
def testList(self):
|
|
list_grammar = Grammar({
|
|
'prelist': [
|
|
['word', 'list']
|
|
],
|
|
'list': [
|
|
['word'],
|
|
['list', 'word'],
|
|
],
|
|
'word': [
|
|
['SYMBOL']
|
|
],
|
|
})
|
|
self.compile(LispTokenizer, list_grammar)
|
|
self.assertParse(
|
|
"the quick brown fox jumped over the lazy dog",
|
|
('prelist',
|
|
'the',
|
|
('list_1',
|
|
('list_1',
|
|
('list_1',
|
|
('list_1',
|
|
('list_1',
|
|
('list_1',
|
|
('list_1',
|
|
'quick',
|
|
'brown'),
|
|
'fox'),
|
|
'jumped'),
|
|
'over'),
|
|
'the'),
|
|
'lazy'),
|
|
'dog')))
|
|
|
|
def testArithmetic(self):
|
|
tokenize = lexer.LexicalGrammar(
|
|
"+ - * / ( )",
|
|
NUM=r'[0-9]\w*',
|
|
VAR=r'[A-Za-z]\w*')
|
|
arith_grammar = Grammar({
|
|
'expr': [
|
|
['term'],
|
|
['expr', '+', 'term'],
|
|
['expr', '-', 'term'],
|
|
],
|
|
'term': [
|
|
['prim'],
|
|
['term', '*', 'prim'],
|
|
['term', '/', 'prim'],
|
|
],
|
|
'prim': [
|
|
['NUM'],
|
|
['VAR'],
|
|
['(', 'expr', ')'],
|
|
],
|
|
})
|
|
self.compile(tokenize, arith_grammar)
|
|
|
|
self.assertParse(
|
|
'2 * 3 + 4 * (5 + 7)',
|
|
('expr_1',
|
|
('term_1', '2', '*', '3'),
|
|
'+',
|
|
('term_1',
|
|
'4',
|
|
'*',
|
|
('prim_2',
|
|
'(',
|
|
('expr_1', '5', '+', '7'),
|
|
')'))))
|
|
|
|
self.assertNoParse(
|
|
"(",
|
|
message="unexpected end of input")
|
|
self.assertNoParse(
|
|
")",
|
|
message="expected one of ['(', 'NUM', 'VAR'], got ')'")
|
|
|
|
def testAmbiguous(self):
|
|
# This grammar should fail verification.
|
|
# It's ambiguous: is ABC s(A)y(BC) or s(AB)y(C)?
|
|
grammar = Grammar({
|
|
'goal': [
|
|
['s', 'y'],
|
|
],
|
|
's': [
|
|
['A'],
|
|
['s', 'B'],
|
|
],
|
|
'y': [
|
|
['C'],
|
|
['B', 'C'],
|
|
],
|
|
})
|
|
|
|
out = io.StringIO()
|
|
self.assertRaisesRegex(ValueError, r"conflict",
|
|
lambda: gen.generate_parser(out, grammar))
|
|
|
|
def testAmbiguousEmpty(self):
|
|
"""Reject grammars that are ambiguous due to empty productions.
|
|
|
|
(Empty productions are ones that match the empty string.)"""
|
|
|
|
def check(rules):
|
|
grammar = Grammar(rules, goal_nts=['goal'])
|
|
out = io.StringIO()
|
|
self.assertRaisesRegex(
|
|
ValueError,
|
|
r"ambiguous grammar|conflict",
|
|
lambda: gen.generate_parser(out, grammar))
|
|
|
|
check({'goal': [[], []]})
|
|
check({'goal': [[Optional('X')], []]})
|
|
check({'goal': [[Optional('X')], [Optional('Y')]]})
|
|
check({'goal': [[Optional('X'), Optional('Y')], [Optional('Z')]]})
|
|
|
|
# Issue #3: This also has an abiguity; empty string matches either
|
|
# `goal ::= [empty]` or `goal ::= phrase, phrase ::= [empty]`.
|
|
check({
|
|
'goal': [[Optional('phrase')]],
|
|
'phrase': [[Optional('X')]],
|
|
})
|
|
|
|
# Input "X" is ambiguous, could be ('goal', ('a', None), ('a', 'X'))
|
|
# or the other 'a' could be the one that's missing.
|
|
check({
|
|
'goal': [['a', 'a']],
|
|
'a': [[Optional('X')]],
|
|
})
|
|
|
|
def testLeftFactor(self):
|
|
"""Most basic left-factoring test."""
|
|
tokenize = lexer.LexicalGrammar("A B")
|
|
grammar = Grammar({
|
|
'goal': [
|
|
['A'],
|
|
['A', 'B'],
|
|
],
|
|
})
|
|
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse("A", 'A')
|
|
self.assertParse("A B", ('goal_1', 'A', 'B'))
|
|
|
|
def testLeftFactorMulti(self):
|
|
"""Test left-factoring with common prefix of length >1."""
|
|
tokenize = lexer.LexicalGrammar("A B C D E")
|
|
grammar = Grammar({
|
|
'goal': [
|
|
['A', 'B', 'C', 'D'],
|
|
['A', 'B', 'C', 'E'],
|
|
],
|
|
})
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse(
|
|
"A B C D",
|
|
('goal_0', 'A', 'B', 'C', 'D'))
|
|
self.assertParse(
|
|
"A B C E",
|
|
('goal_1', 'A', 'B', 'C', 'E'))
|
|
|
|
def testLeftFactorMultiLevel(self):
|
|
"""Test left-factoring again on a nonterminal introduced by
|
|
left-factoring."""
|
|
tokenize = lexer.LexicalGrammar("FOR IN TO BY ( ) = ;",
|
|
VAR=r'[A-Za-z]+')
|
|
|
|
# The first left-factoring pass on `stmt` will left-factor `FOR ( VAR`.
|
|
# A second pass is needed to left-factor `= expr TO expr`.
|
|
grammar = Grammar({
|
|
'stmt': [
|
|
['expr', ';'],
|
|
['FOR', '(', 'VAR', 'IN', 'expr', ')', 'stmt'],
|
|
['FOR', '(', 'VAR', '=', 'expr', 'TO', 'expr', ')', 'stmt'],
|
|
['FOR', '(', 'VAR', '=', 'expr', 'TO', 'expr',
|
|
'BY', 'expr', ')', 'stmt'],
|
|
['IF', '(', 'expr', ')', 'stmt'],
|
|
],
|
|
'expr': [
|
|
['VAR'],
|
|
],
|
|
})
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse(
|
|
"FOR (x IN y) z;",
|
|
('stmt_1', 'FOR', '(', 'x', 'IN', 'y', ')',
|
|
('stmt_0', 'z', ';')))
|
|
self.assertParse(
|
|
"FOR (x = y TO z) x;",
|
|
('stmt_2', 'FOR', '(', 'x', '=', 'y', 'TO', 'z', ')',
|
|
('stmt_0', 'x', ';')))
|
|
self.assertParse(
|
|
"FOR (x = y TO z BY w) x;",
|
|
('stmt_3', 'FOR', '(', 'x', '=', 'y', 'TO', 'z', 'BY', 'w', ')',
|
|
('stmt_0', 'x', ';')))
|
|
|
|
def testFirstFirstConflict(self):
|
|
"""This grammar is unambiguous, but is not LL(1) due to a first/first conflict.
|
|
|
|
Cribbed from: https://stackoverflow.com/a/17047370/94977
|
|
"""
|
|
|
|
tokenize = lexer.LexicalGrammar("A B C")
|
|
grammar = Grammar({
|
|
's': [
|
|
['x', 'B'],
|
|
['y', 'C'],
|
|
],
|
|
'x': [
|
|
prod(['A'], "x"),
|
|
],
|
|
'y': [
|
|
prod(['A'], "y"),
|
|
],
|
|
})
|
|
self.compile(tokenize, grammar)
|
|
|
|
self.assertParse("A B", ('s_0', ('x', 'A'), 'B'))
|
|
self.assertParse("A C", ('s_1', ('y', 'A'), 'C'))
|
|
|
|
def testLeftHandSideExpression(self):
|
|
"""Example of a grammar that's in SLR(1) but hard to smoosh into an LL(1) form.
|
|
|
|
This is taken from the ECMAScript grammar.
|
|
|
|
...Of course, it's not really possible to enforce the desired syntactic
|
|
restrictions in LR(k) either; the ES grammar matches `(x + y) = z` and
|
|
an additional attribute grammar (IsValidSimpleAssignmentTarget) is
|
|
necessary to rule it out.
|
|
"""
|
|
self.compile(
|
|
lexer.LexicalGrammar("= +", VAR=r'[a-z]+\b'),
|
|
Grammar({
|
|
'AssignmentExpression': [
|
|
['AdditiveExpression'],
|
|
['LeftHandSideExpression', '=', 'AssignmentExpression'],
|
|
],
|
|
'AdditiveExpression': [
|
|
['LeftHandSideExpression'],
|
|
['AdditiveExpression', '+', 'LeftHandSideExpression'],
|
|
],
|
|
'LeftHandSideExpression': [
|
|
['VAR'],
|
|
]
|
|
})
|
|
)
|
|
self.assertParse("z = x + y")
|
|
self.assertNoParse(
|
|
"x + y = z",
|
|
message="expected one of ['+', 'end of input'], got '='")
|
|
|
|
def testDeepRecursion(self):
|
|
grammar = Grammar({
|
|
'expr': [
|
|
['SYMBOL'],
|
|
['(', ')'],
|
|
['(', 'exprs', ')'],
|
|
],
|
|
'exprs': [
|
|
['expr'],
|
|
['exprs', 'expr'],
|
|
],
|
|
})
|
|
self.compile(LispTokenizer, grammar)
|
|
|
|
N = 3000
|
|
s = "x"
|
|
t = ('expr_0', 'x')
|
|
for i in range(N):
|
|
s = "(" + s + ")"
|
|
t = ('expr_2', '(', t, ')')
|
|
|
|
result = self.parse(s)
|
|
|
|
# Python can't check that result == t; it causes a RecursionError.
|
|
# Testing that repr(result) == repr(t), same deal. So:
|
|
for i in range(N):
|
|
self.assertIsInstance(result, tuple)
|
|
self.assertEqual(len(result), 4)
|
|
self.assertEqual(result[0], 'expr_2')
|
|
self.assertEqual(result[1], '(')
|
|
self.assertEqual(result[3], ')')
|
|
result = result[2]
|
|
|
|
def testExpandOptional(self):
|
|
grammar = Grammar({'goal': [[]]})
|
|
empties = {}
|
|
# Unit test for rewrites.expand_optional_symbols_in_rhs
|
|
self.assertEqual(
|
|
list(rewrites.expand_optional_symbols_in_rhs(['ONE', 'TWO', '3'],
|
|
grammar, empties)),
|
|
[(['ONE', 'TWO', '3'], {})])
|
|
self.assertEqual(
|
|
list(rewrites.expand_optional_symbols_in_rhs(
|
|
['a', 'b', Optional('c')], grammar, empties)),
|
|
[(['a', 'b'], {2: None}),
|
|
(['a', 'b', 'c'], {})])
|
|
self.assertEqual(
|
|
list(rewrites.expand_optional_symbols_in_rhs(
|
|
[Optional('a'), Optional('b')], grammar, empties)),
|
|
[([], {0: None, 1: None}),
|
|
(['a'], {1: None}),
|
|
(['b'], {0: None}),
|
|
(['a', 'b'], {})])
|
|
|
|
def testEmptyGrammar(self):
|
|
tokenize = lexer.LexicalGrammar("X")
|
|
self.compile(tokenize, Grammar({'goal': [[]]}))
|
|
self.assertParse("", ('goal',))
|
|
self.assertNoParse(
|
|
"X",
|
|
message="expected 'end of input', got 'X' (line 1)")
|
|
|
|
def testOptionalEmpty(self):
|
|
tokenize = lexer.LexicalGrammar("X Y")
|
|
grammar = Grammar({
|
|
'a': [
|
|
[Optional('b'), Optional('c')],
|
|
],
|
|
'b': [
|
|
prod(['X'], 'b'),
|
|
],
|
|
'c': [
|
|
prod(['Y'], 'c'),
|
|
]
|
|
})
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse("", ('a', None, None))
|
|
self.assertParse("X", ('a', ('b', 'X'), None))
|
|
self.assertParse("Y", ('a', None, ('c', 'Y')))
|
|
self.assertParse("X Y", ('a', ('b', 'X'), ('c', 'Y')))
|
|
|
|
def testOptional(self):
|
|
tokenize = lexer.LexicalGrammar('[ ] , X')
|
|
grammar = Grammar({
|
|
'array': [
|
|
['[', Optional('elision'), ']'],
|
|
['[', 'elements', ']'],
|
|
['[', 'elements', ',', Optional('elision'), ']']
|
|
],
|
|
'elements': [
|
|
[Optional('elision'), 'X'],
|
|
['elements', ',', Optional('elision'), 'X']
|
|
],
|
|
'elision': [
|
|
[','],
|
|
['elision', ',']
|
|
]
|
|
})
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse("[]",
|
|
('array_0', '[', None, ']'))
|
|
self.assertParse("[,]",
|
|
('array_0', '[', ',', ']'))
|
|
self.assertParse(
|
|
"[,,X,,X,]",
|
|
('array_2',
|
|
'[',
|
|
('elements_1',
|
|
('elements_0',
|
|
('elision_1',
|
|
',',
|
|
','),
|
|
'X'),
|
|
',',
|
|
',',
|
|
'X'),
|
|
',',
|
|
None,
|
|
']'))
|
|
|
|
def testPositiveLookahead(self):
|
|
self.compile(
|
|
lexer.LexicalGrammar('A B + ( )'),
|
|
Grammar({
|
|
'goal': [
|
|
[LookaheadRule(frozenset({'A', 'B'}), True), 'expr'],
|
|
],
|
|
'expr': [
|
|
['term'],
|
|
['expr', '+', 'term'],
|
|
],
|
|
'term': [
|
|
['A'],
|
|
['B'],
|
|
['(', 'expr', ')'],
|
|
]
|
|
})
|
|
)
|
|
self.assertNoParse(
|
|
"(A)",
|
|
message="expected one of ['A', 'B'], got '('")
|
|
self.assertParse("A + B")
|
|
|
|
def testNegativeLookahead(self):
|
|
tokenize = lexer.LexicalGrammar('a b')
|
|
rules = {
|
|
'goal': [
|
|
[LookaheadRule(frozenset({'a'}), False), 'abs'],
|
|
],
|
|
'abs': [
|
|
['a'],
|
|
['b'],
|
|
['abs', 'a'],
|
|
['abs', 'b'],
|
|
],
|
|
}
|
|
|
|
self.compile(tokenize, Grammar(rules))
|
|
self.assertNoParse("a b", message="expected 'b', got 'a'")
|
|
self.assertParse(
|
|
'b a',
|
|
('goal', ('abs_2', 'b', 'a')))
|
|
|
|
# In simple cases like this, the lookahead restriction can even
|
|
# disambiguate a grammar that would otherwise be ambiguous.
|
|
rules['goal'].append(prod(['a'], 'goal_a'))
|
|
self.compile(tokenize, Grammar(rules))
|
|
self.assertParse('a', ('goal_a', 'a'))
|
|
|
|
def disabledNegativeLookaheadDisambiguation(self):
|
|
tokenize = lexer.LexicalGrammar(
|
|
'( ) { } ; function =',
|
|
IDENT=r'[A-Za-z_][A-Za-z_0-9]*')
|
|
grammar = Grammar({
|
|
'stmts': [
|
|
['stmt'],
|
|
['stmts', 'stmt'],
|
|
],
|
|
'stmt': [
|
|
[LookaheadRule(set=frozenset({'function'}), positive=False),
|
|
'expr', ';'],
|
|
['fndecl'],
|
|
],
|
|
'fndecl': [
|
|
['function', 'IDENT', '(', ')', '{', Optional('stmt'), '}'],
|
|
],
|
|
'expr': [
|
|
['term'],
|
|
['IDENT', '=', 'expr'],
|
|
],
|
|
'term': [
|
|
['(', 'expr', ')'],
|
|
['fndecl'],
|
|
['term', '(', 'expr', ')'],
|
|
],
|
|
})
|
|
self.compile(tokenize, grammar)
|
|
|
|
# Test that without the lookahead restriction, we reject this grammar
|
|
# (it's ambiguous):
|
|
del grammar['stmt'][0][0]
|
|
self.assertRaisesRegex(ValueError,
|
|
'banana',
|
|
lambda: gen.compile(grammar))
|
|
|
|
self.assertParse(
|
|
'function f() { x = function y() {}; }',
|
|
('stmt', 1,
|
|
('fndecl',
|
|
'function', 'f', '(', ')', '{',
|
|
('stmt', 0,
|
|
('expr', 1,
|
|
'x',
|
|
'=',
|
|
('expr', 0,
|
|
('term', 1,
|
|
('fndecl',
|
|
'function', 'y', '(', ')',
|
|
'{', None, '}')))),
|
|
';'))))
|
|
|
|
self.assertParse(
|
|
'(function g(){});',
|
|
('stmts', 0,
|
|
('stmt', 0,
|
|
('term', 1,
|
|
('fndecl',
|
|
'function', 'g', '(', ')', '{', None, '}')),
|
|
';')))
|
|
|
|
def testTrailingLookahead(self):
|
|
"""Lookahead at the end of a production is banned."""
|
|
tokenize = lexer.LexicalGrammar('IF ( X ) ELSE OTHER ;')
|
|
grammar = gen.Grammar({
|
|
'goal': [['stmt']],
|
|
'stmt': [
|
|
['OTHER', ';'],
|
|
['IF', '(', 'X', ')', 'stmt',
|
|
LookaheadRule(frozenset({'ELSE'}), False)],
|
|
['IF', '(', 'X', ')', 'stmt', 'ELSE', 'stmt'],
|
|
],
|
|
})
|
|
|
|
def stmt_0():
|
|
return ('stmt_0', 'OTHER', ';')
|
|
|
|
def stmt_1(t):
|
|
return ('stmt_1', 'IF', '(', 'X', ')', t)
|
|
|
|
def stmt_2(t, e):
|
|
return ('stmt_2', 'IF', '(', 'X', ')', t, 'ELSE', e)
|
|
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse('IF(X) OTHER;', stmt_1(stmt_0()))
|
|
self.assertParse('IF(X) OTHER; ELSE OTHER;',
|
|
stmt_2(stmt_0(), stmt_0()))
|
|
self.assertParse('IF(X) IF(X) OTHER; ELSE OTHER; ELSE OTHER;',
|
|
stmt_2(stmt_2(stmt_0(), stmt_0()), stmt_0()))
|
|
self.assertParse('IF(X) OTHER; ELSE IF(X) OTHER; ELSE OTHER;',
|
|
stmt_2(stmt_0(), stmt_2(stmt_0(), stmt_0())))
|
|
self.assertParse('IF(X) IF(X) OTHER; ELSE OTHER;',
|
|
stmt_1(stmt_2(stmt_0(), stmt_0())))
|
|
|
|
def testLookaheadBeforeOptional(self):
|
|
self.compile(
|
|
lexer.LexicalGrammar(
|
|
'= : _',
|
|
PUBLIC=r'public\b',
|
|
IDENT=r'[a-z]+\b',
|
|
NUM=r'[0-9]\b'),
|
|
Grammar({
|
|
'decl': [
|
|
[
|
|
LookaheadRule(frozenset({'IDENT'}), True),
|
|
Optional('attrs'),
|
|
'pat', '=', 'NUM'
|
|
],
|
|
],
|
|
'attrs': [
|
|
['attr'],
|
|
['attrs', 'attr'],
|
|
],
|
|
'attr': [
|
|
['PUBLIC', ':'],
|
|
['IDENT', ':'],
|
|
],
|
|
'pat': [
|
|
['IDENT'],
|
|
['_'],
|
|
],
|
|
})
|
|
)
|
|
self.assertEqual(
|
|
self.parse("x = 0"),
|
|
("decl", None, "x", "=", "0"))
|
|
self.assertParse("thread: x = 0")
|
|
self.assertNoParse(
|
|
"public: x = 0",
|
|
message="expected 'IDENT', got 'public'")
|
|
self.assertNoParse("_ = 0", message="expected 'IDENT', got '_'")
|
|
self.assertParse("funny: public: x = 0")
|
|
self.assertParse("funny: _ = 0")
|
|
|
|
def testForLookahead(self):
|
|
grammar = Grammar({
|
|
'Stmt': [
|
|
[';'],
|
|
['ForStmt'],
|
|
],
|
|
'ForStmt': [
|
|
["for", "(", LookaheadRule(frozenset({"let"}), False),
|
|
"Expr", ";", ";", ")", "Stmt"],
|
|
],
|
|
'Expr': [
|
|
["0"],
|
|
["let"],
|
|
],
|
|
})
|
|
self.compile(lexer.LexicalGrammar("for ( let ; ) 0"), grammar)
|
|
self.assertParse("for (0;;) ;")
|
|
self.assertNoParse("for (let;;) ;", message="expected '0', got 'let'")
|
|
|
|
def testLookaheadDisambiguation(self):
|
|
"""A lookahead restriction should be able to rule out certain nonterminals entirely."""
|
|
|
|
grammar = Grammar({
|
|
'Script': [
|
|
['Statement'],
|
|
['Statement', 'Statement'],
|
|
],
|
|
'Statement': [
|
|
[LookaheadRule(frozenset({'function'}), False), 'Expression', ';'],
|
|
['Function'],
|
|
],
|
|
'Function': [
|
|
['function', 'x', '(', ')', '{', '}'],
|
|
],
|
|
'Expression': [
|
|
['Primary'],
|
|
['++', 'Primary'],
|
|
['Primary', '++'],
|
|
],
|
|
'Primary': [
|
|
['Function'],
|
|
['x'],
|
|
],
|
|
})
|
|
|
|
self.compile(lexer.LexicalGrammar("function x ( ) { } ++ ;"), grammar)
|
|
self.assertParse("function x() {}")
|
|
self.assertParse("++function x() {};")
|
|
self.assertNoParse("++function x() {}", message="unexpected end")
|
|
# TODO: The parser generator fails to handle this case because it does
|
|
# not forward the restriction from producting a Function to the
|
|
# Primitive rule. Therefore, `Function [lookahead: ;]` is incorrectly
|
|
# reduced to a `Primitive [lookahead: ;]`
|
|
# self.assertNoParse("function x() {}++;", message="got ';'")
|
|
self.assertParse("function x() {} ++x;")
|
|
|
|
# XXX to test: combination of lookaheads, ++, +-, -+, --
|
|
# XXX todo: find an example where lookahead canonicalization matters
|
|
|
|
def testHugeExample(self):
|
|
grammar = Grammar(
|
|
{
|
|
'grammar': [['nt_def_or_blank_line'],
|
|
['grammar', 'nt_def_or_blank_line']],
|
|
'arg': [['sigil', 'NT']],
|
|
'args': [['arg'], ['args', ',', 'arg']],
|
|
'definite_sigil': [['~'], ['+']],
|
|
'exclusion': [['terminal'],
|
|
['nonterminal'],
|
|
['CHR', 'through', 'CHR']],
|
|
'exclusion_list': [['exclusion'],
|
|
['exclusion_list', 'or', 'exclusion']],
|
|
'ifdef': [['[', 'definite_sigil', 'NT', ']']],
|
|
'line_terminator': [['NT'], ['NTALT']],
|
|
'lookahead_assertion': [
|
|
['==', 'terminal'],
|
|
['!=', 'terminal'],
|
|
['<!', 'NT'],
|
|
['<!', '{', 'lookahead_exclusions', '}']],
|
|
'lookahead_exclusion': [['lookahead_exclusion_element'],
|
|
['lookahead_exclusion',
|
|
'lookahead_exclusion_element']],
|
|
'lookahead_exclusion_element': [['terminal'],
|
|
['no_line_terminator_here']],
|
|
'lookahead_exclusions': [['lookahead_exclusion'],
|
|
['lookahead_exclusions', ',',
|
|
'lookahead_exclusion']],
|
|
'no_line_terminator_here': [
|
|
['[', 'no', 'line_terminator', 'here', ']']],
|
|
'nonterminal': [['NT'], ['NTCALL', '[', 'args', ']']],
|
|
'nt_def': [['nt_lhs', 'EQ', 'NL', 'rhs_lines', 'NL'],
|
|
['nt_lhs', 'EQ', 'one', 'of', 'NL',
|
|
't_list_lines', 'NL']],
|
|
'nt_def_or_blank_line': [['NL'], ['nt_def']],
|
|
'nt_lhs': [['NT'], ['NTCALL', '[', 'params', ']']],
|
|
'param': [['NT']],
|
|
'params': [['param'], ['params', ',', 'param']],
|
|
'rhs': [['symbols'], ['[', 'empty', ']']],
|
|
'rhs_line': [[Optional(inner='ifdef'), 'rhs',
|
|
Optional(inner='PRODID'), 'NL'],
|
|
['PROSE', 'NL']],
|
|
'rhs_lines': [['rhs_line'], ['rhs_lines', 'rhs_line']],
|
|
'sigil': [['definite_sigil'], ['?']],
|
|
'symbol': [['terminal'],
|
|
['nonterminal'],
|
|
['nonterminal', '?'],
|
|
['nonterminal', 'but', 'not', 'exclusion'],
|
|
['nonterminal', 'but', 'not', 'one', 'of',
|
|
'exclusion_list'],
|
|
['[', 'lookahead', 'lookahead_assertion', ']'],
|
|
['no_line_terminator_here'],
|
|
['WPROSE']],
|
|
'symbols': [['symbol'], ['symbols', 'symbol']],
|
|
't_list_line': [['terminal_seq', 'NL']],
|
|
't_list_lines': [['t_list_line'],
|
|
['t_list_lines', 't_list_line']],
|
|
'terminal': [['T'], ['CHR']],
|
|
'terminal_seq': [['terminal'], ['terminal_seq', 'terminal']]
|
|
},
|
|
variable_terminals='EQ T CHR NTCALL NT NTALT '
|
|
'PRODID PROSE WPROSE'.split()
|
|
)
|
|
|
|
# Note: This lexical grammar is not suitable for use with incremental
|
|
# parsing.
|
|
emu_grammar_lexer = lexer.LexicalGrammar(
|
|
# the operators and keywords:
|
|
"[ ] { } , ~ + ? <! == != "
|
|
"but empty here lookahead no not of one or through",
|
|
NL="\n",
|
|
# any number of colons together
|
|
EQ=r':+',
|
|
# terminals of the ES grammar, quoted with backticks
|
|
T=r'`[^` \n]+`|```',
|
|
# also terminals, denoting control characters
|
|
CHR=r'<[A-Z]+>|U\+[0-9A-f]{4}',
|
|
# nonterminals that will be followed by boolean parameters
|
|
NTCALL=r'(?:uri|[A-Z])\w*(?=\[)',
|
|
# nonterminals (also, boolean parameters)
|
|
NT=r'(?:uri|[A-Z])\w*',
|
|
# nonterminals wrapped in vertical bars for no apparent reason
|
|
NTALT=r'\|[A-Z]\w+\|',
|
|
# the spec also gives a few productions names
|
|
PRODID=r'#[A-Za-z]\w*',
|
|
# prose to the end of the line
|
|
PROSE=r'>.*',
|
|
# prose wrapped in square brackets
|
|
WPROSE=r'\[>[^]]*\]',
|
|
)
|
|
|
|
self.compile(emu_grammar_lexer, grammar)
|
|
|
|
source = """\
|
|
IdentifierReference[Yield, Await] :
|
|
Identifier
|
|
[~Yield] `yield`
|
|
[~Await] `await`
|
|
|
|
"""
|
|
|
|
self.assertParse(source)
|
|
|
|
def testParameterizedProductions(self):
|
|
passthru = ('Yield', Var('Yield')),
|
|
name = Nt("name", passthru)
|
|
stmt = Nt("stmt", passthru)
|
|
stmts = Nt("stmts", passthru)
|
|
grammar = Grammar({
|
|
'script': [
|
|
['def'],
|
|
['script', 'def'],
|
|
],
|
|
'def': [
|
|
[
|
|
'function', 'IDENT', '(', ')', '{',
|
|
Nt('stmts', (('Yield', False),)), '}'
|
|
],
|
|
[
|
|
'function', '*', 'IDENT', '(', ')', '{',
|
|
Nt('stmts', (('Yield', True),)), '}'
|
|
],
|
|
],
|
|
'stmts': NtDef(('Yield',), [
|
|
[stmt],
|
|
[stmts, stmt],
|
|
], None),
|
|
'stmt': NtDef(('Yield',), [
|
|
[name, "(", ")", ";"],
|
|
[name, "=", name, ";"],
|
|
Production(["yield", name, ";"],
|
|
reducer=CallMethod("yield_stmt", [1]),
|
|
condition=('Yield', True)),
|
|
], None),
|
|
'name': NtDef(('Yield',), [
|
|
["IDENT"],
|
|
# Specifically ask for a method here, because otherwise we
|
|
# wouldn't get one and then type checking would fail.
|
|
Production(["yield"],
|
|
CallMethod("yield_as_name", []),
|
|
condition=('Yield', False)),
|
|
], None),
|
|
}, variable_terminals=["IDENT"])
|
|
self.compile(lexer.LexicalGrammar("( ) { } ; * = function yield",
|
|
IDENT=r'[A-Za-z]\w*'),
|
|
grammar)
|
|
self.assertParse("function* farm() { cow = pig; yield cow; }")
|
|
self.assertNoParse(
|
|
"function city() { yield toOncomingTraffic; }",
|
|
message="expected one of ['(', '='], got 'toOncomingTraffic'")
|
|
self.assertNoParse(
|
|
"function* farm() { yield = corn; yield yield; }",
|
|
message="expected 'IDENT', got '='")
|
|
|
|
def testMissingParameterError(self):
|
|
grammar = {
|
|
'Foo': [
|
|
['Bar'],
|
|
],
|
|
'Bar': NtDef(('Arg',), [
|
|
['NUM'],
|
|
Production(['STR'],
|
|
reducer=0,
|
|
condition=('Arg', True)),
|
|
], None),
|
|
}
|
|
|
|
self.assertRaisesRegex(ValueError, "missing parameters for 'Bar'",
|
|
lambda: Grammar(grammar))
|
|
|
|
def testCanonicalLR(self):
|
|
"""Example 4.39 (grammar 4.20) from the book."""
|
|
|
|
# Modified as marked below
|
|
grammar = Grammar({
|
|
"S": [
|
|
["L", "=", "R"],
|
|
["R"],
|
|
],
|
|
"L": [
|
|
["*", "R"],
|
|
["id"],
|
|
],
|
|
"R": [
|
|
["L"],
|
|
# added so we can have a negative test, showing that
|
|
# `R = R` is not an S:
|
|
["7"],
|
|
],
|
|
})
|
|
self.compile(lexer.LexicalGrammar("id = * 7"), grammar)
|
|
self.assertParse("id = *id")
|
|
self.assertParse("*id = id")
|
|
self.assertParse("id = 7")
|
|
self.assertNoParse("7 = id",
|
|
message="expected 'end of input', got '='")
|
|
|
|
def testLookaheadWithCanonicalLR(self):
|
|
"""Only a lookahead assertion makes this grammar unambiguous."""
|
|
tokenize = lexer.LexicalGrammar("async => { } ;", Identifier=r'\w+')
|
|
grammar = Grammar({
|
|
"script": [
|
|
["Expression", ";"],
|
|
],
|
|
"Expression": [
|
|
["PrimaryExpression"],
|
|
["async", "Identifier", "=>", "AsyncConciseBody"],
|
|
],
|
|
"AsyncConciseBody": [
|
|
[LookaheadRule(set=frozenset(["{"]), positive=False),
|
|
"Expression"],
|
|
["{", "}"],
|
|
],
|
|
"PrimaryExpression": [
|
|
["{", "}"],
|
|
],
|
|
})
|
|
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse("{};")
|
|
self.assertParse("async x => {};")
|
|
self.assertParse("async x => async y => {};")
|
|
|
|
def testMultiGoal(self):
|
|
tokenize = lexer.LexicalGrammar("WHILE DEF FN { } ( ) -> ;", ID=r'\w+')
|
|
grammar = Grammar({
|
|
"stmt": [
|
|
["expr", ";"],
|
|
["{", "stmts", "}"],
|
|
["WHILE", "(", "expr", ")", "stmt"],
|
|
["DEF", "ID", "(", "ID", ")", "{", Optional("stmts"), "}"],
|
|
],
|
|
"stmts": [
|
|
["stmt"],
|
|
["stmts", "stmt"],
|
|
],
|
|
"expr": [
|
|
["FN", "ID", "->", "expr"],
|
|
["call_expr"],
|
|
],
|
|
"call_expr": [
|
|
["ID"],
|
|
["call_expr", "(", "expr", ")"],
|
|
["(", "expr", ")"],
|
|
],
|
|
}, goal_nts=["stmts", "expr"])
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse("WHILE ( x ) { decx ( x ) ; }", goal="stmts")
|
|
self.assertNoParse(
|
|
"WHILE ( x ) { decx ( x ) ; }", goal="expr",
|
|
message="expected one of ['(', 'FN', 'ID'], got 'WHILE'")
|
|
self.assertParse("f(x);", goal="stmts")
|
|
self.assertNoParse("f(x);", goal="expr",
|
|
message="expected 'end of input', got ';'")
|
|
self.assertParse("(FN x -> f ( x ))(x)", goal="expr")
|
|
self.assertNoParse("(FN x -> f ( x ))(x)", goal="stmts",
|
|
message="unexpected end of input")
|
|
|
|
def testStaggeredItems(self):
|
|
"""Items in a state can have different amounts of leading context."""
|
|
# In this example grammar, after "A" "B", we're in a state that
|
|
# contains these two items (ignoring lookahead):
|
|
# goal ::= "A" "B" · y
|
|
# x ::= "B" · stars "X"
|
|
#
|
|
# Likewise, after `"A" "B" stars`, we have:
|
|
# x ::= "B" stars · "X"
|
|
# y ::= stars · "Y"
|
|
# stars ::= stars · "*"
|
|
tokenize = lexer.LexicalGrammar("A B * X Y")
|
|
grammar = Grammar({
|
|
"goal": [
|
|
["A", "x"],
|
|
["A", "B", "y"],
|
|
],
|
|
"x": [
|
|
["B", "stars", "X"],
|
|
],
|
|
"y": [
|
|
["stars", "Y"],
|
|
],
|
|
"stars": [
|
|
["*"],
|
|
["stars", "*"],
|
|
],
|
|
})
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse("A B * * * X")
|
|
self.assertParse("A B * * * Y")
|
|
|
|
def testCheckCycleFree(self):
|
|
tokenize = lexer.LexicalGrammar("!")
|
|
grammar = Grammar({
|
|
"problem": [
|
|
["one", "two"],
|
|
],
|
|
"one": [
|
|
["!"],
|
|
],
|
|
"two": [
|
|
[Optional("problem")],
|
|
],
|
|
})
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse("! ! ! ! !")
|
|
|
|
def testReduceActions(self):
|
|
tokenize = lexer.LexicalGrammar("+ - * / ( )",
|
|
NUM=r'[0-9]\w*',
|
|
VAR=r'[A-Za-z]\w*')
|
|
grammar = Grammar({
|
|
"expr": [
|
|
["term"],
|
|
prod(["expr", "+", "term"], "add"),
|
|
prod(["expr", "-", "term"], "sub"),
|
|
],
|
|
"term": [
|
|
["unary"],
|
|
prod(["term", "*", "unary"], "mul"),
|
|
prod(["term", "/", "unary"], "div"),
|
|
],
|
|
"unary": [
|
|
["prim"],
|
|
prod(["-", "prim"], "neg"),
|
|
],
|
|
"prim": [
|
|
prod(["(", "expr", ")"], "parens"),
|
|
prod(["NUM"], "num"),
|
|
prod(["VAR"], "var"),
|
|
],
|
|
}, goal_nts=['expr'])
|
|
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse("X", ('var', 'X'))
|
|
self.assertParse("3 + 4", ('add', ('num', '3'), '+', ('num', '4')))
|
|
self.assertParse(
|
|
"2 * 3 + 4 * (5 + 7)",
|
|
(
|
|
'add',
|
|
('mul', ('num', '2'), '*', ('num', '3')),
|
|
'+',
|
|
(
|
|
'mul',
|
|
('num', '4'),
|
|
'*',
|
|
('parens', '(',
|
|
('add', ('num', '5'), '+', ('num', '7')), ')'))))
|
|
self.assertParse(
|
|
"1 / (1 + 1 / (1 + 1 / (1 + 1)))",
|
|
(
|
|
'div', ('num', '1'), '/', (
|
|
'parens', '(', (
|
|
'add', ('num', '1'), '+', (
|
|
'div', ('num', '1'), '/', (
|
|
'parens', '(', (
|
|
'add', ('num', '1'), '+', (
|
|
'div', ('num', '1'), '/', (
|
|
'parens', '(', (
|
|
'add', ('num', '1'), '+',
|
|
('num', '1')),
|
|
')'))),
|
|
')'))),
|
|
')')))
|
|
|
|
def testConvenienceMethodTypeInference(self):
|
|
"""A method can be called only in an intermediate reduce expression."""
|
|
|
|
# The reduce expression `f(g($0))`.
|
|
reducer = CallMethod("f", [CallMethod("g", [0])])
|
|
|
|
# The grammar `goal ::= NAME => f(g($1))`.
|
|
grammar = Grammar(
|
|
{
|
|
'goal': [Production(['NAME'], reducer)],
|
|
},
|
|
variable_terminals=['NAME'])
|
|
|
|
# Since the return value of f() is used as the value of a `goal`,
|
|
# we infer that f() returns a goal.
|
|
self.assertEqual(
|
|
grammar.methods['f'].return_type,
|
|
jsparagus.types.Type('goal'))
|
|
|
|
# Since the return value of g() isn't used except as an argument, we
|
|
# just give it the type `g`.
|
|
self.assertEqual(
|
|
grammar.methods['g'].return_type,
|
|
jsparagus.types.Type('g'))
|
|
|
|
# Since g() is passed to f(), we infer this:
|
|
self.assertEqual(
|
|
grammar.methods['f'].argument_types,
|
|
[jsparagus.types.Type('g')])
|
|
|
|
def testEpsilonFreeTransform(self):
|
|
tokenize = lexer.LexicalGrammar('{ } X')
|
|
grammar = Grammar({
|
|
'goal': [
|
|
['{', 'xlist', '}'],
|
|
],
|
|
'xlist': [
|
|
[],
|
|
['xlist', 'X'],
|
|
],
|
|
})
|
|
self.compile(tokenize, grammar)
|
|
self.assertParse("{}", ('goal', '{', ('xlist_0',), '}'))
|
|
|
|
def compile_as_js(
|
|
self,
|
|
grammar_source: str,
|
|
goals: typing.Optional[typing.Iterable[str]] = None,
|
|
verbose: bool = False,
|
|
) -> None:
|
|
"""Like self.compile(), but generate a parser from ESGrammar,
|
|
with ASI support, using the JS lexer.
|
|
"""
|
|
from js_parser.lexer import JSLexer
|
|
from js_parser import load_es_grammar
|
|
from js_parser import generate_js_parser_tables
|
|
|
|
grammar = parse_esgrammar(
|
|
grammar_source,
|
|
filename="es-simplified.esgrammar",
|
|
extensions=[],
|
|
goals=goals,
|
|
synthetic_terminals=load_es_grammar.ECMASCRIPT_SYNTHETIC_TERMINALS,
|
|
terminal_names=load_es_grammar.TERMINAL_NAMES_FOR_SYNTACTIC_GRAMMAR)
|
|
grammar = generate_js_parser_tables.hack_grammar(grammar)
|
|
base_parser_class = gen.compile(grammar, verbose=verbose)
|
|
|
|
# "type: ignore" because poor mypy can't cope with the runtime codegen
|
|
# we're doing here.
|
|
class JSParser(base_parser_class): # type: ignore
|
|
def __init__(self, goal='Script', builder=None):
|
|
super().__init__(goal, builder)
|
|
self._goal = goal
|
|
# self.debug = True
|
|
|
|
def clone(self):
|
|
return JSParser(self._goal, self.methods)
|
|
|
|
def on_recover(self, error_code, lexer, stv):
|
|
"""Check that ASI error recovery is really acceptable."""
|
|
if error_code == 'asi':
|
|
if not self.closed and stv.term != '}' and not lexer.saw_line_terminator():
|
|
lexer.throw("missing semicolon")
|
|
else:
|
|
assert error_code == 'do_while_asi'
|
|
|
|
self.tokenize = JSLexer
|
|
self.parser_class = JSParser
|
|
|
|
def testExtraGoal(self):
|
|
|
|
grammar_source = """
|
|
StuffToIgnore_ThisWorksAroundAnUnrelatedBug:
|
|
Identifier
|
|
IdentifierName
|
|
|
|
Hat :
|
|
`^`
|
|
|
|
ArrowFunction :
|
|
`^` `=>`
|
|
Hat `*` `=>`
|
|
|
|
Script :
|
|
`?` `?` ArrowFunction
|
|
`?` `?` [lookahead <! {`async`} ] Hat `of`
|
|
|
|
LazyArrowFunction :
|
|
ArrowFunction
|
|
"""
|
|
|
|
def try_it(goals):
|
|
self.compile_as_js(grammar_source, goals=goals)
|
|
self.assertParse("? ? ^ =>", goal='Script')
|
|
self.assertParse("? ? ^ of", goal='Script')
|
|
|
|
try_it(['Script', 'LazyArrowFunction'])
|
|
try_it(['Script'])
|
|
|
|
|
|
if __name__ == '__main__':
|
|
unittest.main()
|