if polyglot#init#is_disabled(expand(':p'), 'smt2', 'autoload/smt2/parser.vim') finish endif vim9script const debug = false set maxfuncdepth=100000000 # SMT files tend to be highly nested # TODO: Retry iterative parsing now that we have a scanner and simpler grammar # TODO: Refer to token kind by name, e.g. token_comment instead of 8 # TODO: Change Ast.kind type from string to enum/number? # ------------------------------------------------------------------------------ # AST nodes -- essentially named token wrappers # # Note: pos_from, pos_to and contains_comment were only introduced to allow for # a fast FitsOneLine(ast) function in the formatter. # Here, pos_from and pos_to refer to indices of characters -- not tokens # ------------------------------------------------------------------------------ def Ast(kind: string, value: any, pos_from: number, pos_to: number, contains_comment: bool): dict return {kind: kind, value: value, pos_from: pos_from, pos_to: pos_to, contains_comment: contains_comment} enddef def ParagraphAst(exprs: list>, pos_from: number, pos_to: number): dict var contains_comment = false for expr in exprs if expr.contains_comment contains_comment = true break endif endfor return Ast('Paragraph', exprs, pos_from, pos_to, contains_comment) enddef def SExprAst(exprs: list>, pos_from: number, pos_to: number): dict var contains_comment = false for expr in exprs if expr.contains_comment contains_comment = true break endif endfor return Ast('SExpr', exprs, pos_from, pos_to, contains_comment) enddef def AtomAst(token: dict): dict return Ast('Atom', token, token.pos, token.pos + len(token.lexeme), token.kind == 8) enddef def PrintAst(ast: dict, indent = 0) echo repeat(' ', indent * 2) .. '[' .. ast.kind .. '] ' if ast.kind ==# 'Atom' echon ast.value.lexeme elseif ast.kind ==# 'SExpr' for v in ast.value call PrintAst(v, indent + 1) endfor elseif ast.kind ==# 'Paragraph' for v in ast.value call PrintAst(v, indent + 1) endfor endif enddef # ------------------------------------------------------------------------------ # Grammar # ------------------------------------------------------------------------------ # Paragraph ::= Expr+ # Expr ::= SExpr | Atom # SExpr ::= '(' Expr* ')' # ------------------------------------------------------------------------------ # LParen # ------------------------------------------------------------------------------ def AtStartOfLParen(scanner: dict): bool return scanner.cur_token.kind == 0 # token_lparen enddef def ParseLParen(scanner: dict) # consumes token; no return if debug scanner->smt2#scanner#Enforce(scanner->AtStartOfLParen(), "ParseLParen called but not at start of LParen", scanner.cur_token.pos) endif scanner->smt2#scanner#NextToken() enddef # ------------------------------------------------------------------------------ # RParen # ------------------------------------------------------------------------------ def AtStartOfRParen(scanner: dict): bool return scanner.cur_token.kind == 1 # token_rparen enddef def ParseRParen(scanner: dict) # consumes token; no return if debug scanner->smt2#scanner#Enforce(scanner->AtStartOfRParen(), "ParseRParen called but not at start of RParen", scanner.cur_token.pos) endif scanner->smt2#scanner#NextToken() enddef # ------------------------------------------------------------------------------ # Atom # ------------------------------------------------------------------------------ def AtStartOfAtom(scanner: dict): bool return 2 <= scanner.cur_token.kind && scanner.cur_token.kind <= 8 enddef def ParseAtom(scanner: dict): dict if debug scanner->smt2#scanner#Enforce(scanner->AtStartOfAtom(), "ParseAtom called but not at start of Atom", scanner.cur_token.pos) endif const ast = AtomAst(scanner.cur_token) scanner->smt2#scanner#NextToken() return ast enddef # ------------------------------------------------------------------------------ # Expr # ------------------------------------------------------------------------------ def AtStartOfExpr(scanner: dict): bool return scanner->AtStartOfSExpr() || scanner->AtStartOfAtom() enddef def ParseExpr(scanner: dict): dict if debug scanner->smt2#scanner#Enforce(scanner->AtStartOfExpr(), "ParseExpr called but not at start of Expr", scanner.cur_token.pos) endif if scanner->AtStartOfSExpr() return scanner->ParseSExpr() endif return scanner->ParseAtom() enddef # ------------------------------------------------------------------------------ # SExpr # ------------------------------------------------------------------------------ const AtStartOfSExpr = funcref(AtStartOfLParen) def ParseSExpr(scanner: dict): dict const pos_from = scanner.cur_token.pos if debug scanner->smt2#scanner#Enforce(scanner->AtStartOfSExpr(), "ParseSExpr called but not at start of SExpr", pos_from) endif scanner->ParseLParen() # Expr* var exprs: list> while scanner->AtStartOfExpr() exprs->add(scanner->ParseExpr()) endwhile scanner->smt2#scanner#Enforce(scanner->AtStartOfRParen(), printf("Expected RParen but got %s", scanner.cur_token.kind->smt2#scanner#TokenKind2Str()), scanner.cur_token.pos) scanner->ParseRParen() const pos_to = scanner.cur_token.pos return SExprAst(exprs, pos_from, pos_to) enddef # ------------------------------------------------------------------------------ # Paragraph # ------------------------------------------------------------------------------ def ParseParagraph(scanner: dict): dict const pos_from = scanner.cur_token.pos # Expr+ scanner->smt2#scanner#Enforce(scanner->AtStartOfExpr(), printf("Expected Expr but got %s", scanner.cur_token.kind->smt2#scanner#TokenKind2Str()), pos_from) var exprs = [scanner->ParseExpr()] while scanner->AtStartOfExpr() && !scanner.at_new_paragraph exprs->add(scanner->ParseExpr()) endwhile const pos_to = scanner.cur_token.pos return ParagraphAst(exprs, pos_from, pos_to) enddef # ------------------------------------------------------------------------------ # Public functions # ------------------------------------------------------------------------------ def smt2#parser#ParseCurrentParagraph(): dict # source = [start of current paragraph, EOF] # Note: This is needed since `silent! normal! {y}` may not yank full paragraphs # in the context of multiline expressions const cursor = getpos('.') silent! normal! { const line_offset = line('.') const source = join(getline('.', '$'), "\n") call setpos('.', cursor) var scanner = smt2#scanner#Scanner(source, line_offset) const ast = scanner->ParseParagraph() if debug | ast->PrintAst() | endif return ast enddef def smt2#parser#ParseAllParagraphs(): list> # source = current buffer const source = join(getline(1, '$'), "\n") var scanner = smt2#scanner#Scanner(source) var asts = [] while scanner.cur_token.kind != 9 # token_eof const ast = scanner->ParseParagraph() asts->add(ast) if debug | ast->PrintAst() | endif endwhile return asts enddef