lang: massive amounts of parser and ast pain

This commit is contained in:
Schrottkatze 2024-04-11 03:23:03 +02:00
parent 881a987b2f
commit 9da157ff4a
Signed by: schrottkatze
SSH key fingerprint: SHA256:hXb3t1vINBFCiDCmhRABHX5ocdbLiKyCdKI4HK2Rbbc
16 changed files with 900 additions and 170 deletions

View file

@ -2,51 +2,14 @@ use std::collections::{BTreeMap, HashMap};
use indexmap::IndexMap;
use super::{Span, Spanned};
use super::Spanned;
/// A parsed source file, holding a map from spanned string keys to expressions.
// NOTE(review): two `decls` lines are visible because this is a diff view; presumably the
// first is the pre-change field and the second (using `raw_ast::RawExpression`) replaces it.
#[derive(Debug, PartialEq)]
pub struct File<'src> {
pub decls: IndexMap<Spanned<&'src str>, Expression<'src>>,
pub decls: IndexMap<Spanned<&'src str>, raw_ast::RawExpression<'src>>,
}
/// An [`Expr`] paired with a [`Span`].
#[derive(Debug, PartialEq)]
pub struct Expression<'src> {
/// The expression itself.
pub expr: Expr<'src>,
/// Span attached to the expression (presumably its location in the source; confirm).
pub span: Span,
}
impl<'src> Expression<'src> {
pub fn new(expr: Expr<'src>, span: Span) -> Self {
Self { expr, span }
}
}
/// The different shapes an [`Expression`] can take.
#[derive(Debug, PartialEq)]
pub enum Expr<'src> {
/// A spanned name with an optional spanned map of named sub-expressions.
Node(
Spanned<&'src str>,
Option<Spanned<IndexMap<Spanned<&'src str>, Expression<'src>>>>,
),
/// Two boxed expressions joined by a simple pipe.
SimplePipe(Box<Expression<'src>>, Box<Expression<'src>>),
// NamingPipe(
// Box<Expression<'src>>,
// (Vec<Spanned<&'src str>>, Vec<Spanned<&'src str>>),
// Box<Expression<'src>>,
// ),
/// Two boxed expressions joined by a mapping pipe.
MappingPipe(Box<Expression<'src>>, Box<Expression<'src>>),
/// Two boxed expressions joined by a null pipe.
NullPipe(Box<Expression<'src>>, Box<Expression<'src>>),
/// A map of spanned names to expressions.
MultiPipe(IndexMap<Spanned<&'src str>, Expression<'src>>),
// LetIn(
// IndexMap<Spanned<&'src str>, Box<Expression<'src>>>,
// Box<Expression<'src>>,
// ),
// Presumably written with a leading `$` in source; the stored text's exact form is not
// visible here.
Var(&'src str),
// Presumably written with a leading `@` in source; the stored text's exact form is not
// visible here.
InputVar(&'src str),
/// A spanned map of spanned names to expressions.
AttrSet(Spanned<IndexMap<Spanned<&'src str>, Expression<'src>>>),
/// A literal value.
Lit(Lit<'src>),
}
pub mod raw_ast;
#[derive(Debug, PartialEq)]
pub enum Lit<'src> {
@ -55,3 +18,7 @@ pub enum Lit<'src> {
Float(f64),
String(&'src str),
}
pub mod lossless;
pub mod ast_tree;

View file

@ -0,0 +1,31 @@
use ego_tree::Tree;
use crate::parser::Spanned;
use super::{File, Lit};
/// Syntax tree stored as an `ego_tree::Tree` of [`AstNode`]s.
pub struct Ast<'src> {
// Private: nothing in this chunk exposes the tree outside this module.
tree: Tree<AstNode<'src>>,
}
/// A single node of the [`Ast`] tree; currently only carries its [`NodeKind`].
struct AstNode<'src> {
kind: NodeKind<'src>,
}
/// Kind tag of an [`AstNode`].
///
/// Some variants carry a payload (a borrowed string slice, a [`Lit`], or a pair of `u16`s);
/// the remaining ones are bare tags.
enum NodeKind<'src> {
Decl,
/// Carries the identifier text.
Ident(&'src str),
Instr,
Expr,
MappingPipe,
NullPipe,
MultiPipe,
/// Carries a borrowed string slice (presumably the variable name; confirm).
Var(&'src str),
/// Carries a borrowed string slice (presumably the input variable name; confirm).
InputVar(&'src str),
AttrSet,
Attr,
/// Carries the literal value.
Lit(Lit<'src>),
Matrix,
// NOTE(review): the order/meaning of the two `u16`s (rows vs. columns) is not visible
// here — confirm against the code that builds this variant.
Dimensions(u16, u16),
MatrixRow,
}

View file

@ -0,0 +1,19 @@
use self::lex::SyntaxKind;
pub mod parser;
pub mod lex;
/// Uninhabited marker type that implements `rowan::Language` with `SyntaxKind` as its kind.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum Lang {}
impl rowan::Language for Lang {
    type Kind = SyntaxKind;

    /// Converts a raw rowan kind back into a [`SyntaxKind`].
    ///
    /// # Panics
    /// Panics if the raw value is greater than `SyntaxKind::ROOT as u16`.
    #[allow(unsafe_code)]
    fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
        let rowan::SyntaxKind(discriminant) = raw;
        assert!(discriminant <= SyntaxKind::ROOT as u16);
        // SAFETY: relies on `SyntaxKind` being `#[repr(u16)]` with discriminants that start
        // at 0 and run without gaps up to `ROOT`; the assert above rejects anything larger.
        unsafe { std::mem::transmute::<u16, SyntaxKind>(discriminant) }
    }

    /// Converts a [`SyntaxKind`] into the raw rowan kind via its `From` impl.
    fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
        rowan::SyntaxKind::from(kind)
    }
}

View file

@ -0,0 +1,118 @@
use logos::Logos;
use crate::parser::Span;
/// Tokenizes `src` into `(kind, text)` pairs.
///
/// Input the lexer cannot match is emitted with kind `SyntaxKind::LEX_ERR`. The returned
/// vector is in *reverse* source order, so the first token of the input is the last element.
pub fn lex(src: &str) -> Vec<(SyntaxKind, &str)> {
    let mut lexer = SyntaxKind::lexer(src);
    let mut tokens: Vec<_> = std::iter::from_fn(|| {
        let kind = lexer.next()?.unwrap_or(SyntaxKind::LEX_ERR);
        Some((kind, lexer.slice()))
    })
    .collect();
    tokens.reverse();
    tokens
}
/// Every kind of token and node that can appear in the lossless syntax tree.
///
/// Variants carrying a `#[token]`/`#[regex]` attribute are produced by the lexer; the others
/// are only used as node kinds (or error markers) by the parser.
///
/// The discriminants must stay contiguous, starting at `0` with `ROOT` as the last variant:
/// the `rowan::Language` implementation converts raw `u16` values back into this enum based
/// on that layout. Add new variants *before* `ROOT` and never assign explicit gaps.
#[derive(Logos, Debug, PartialEq, Eq, Clone, Copy, Hash, PartialOrd, Ord)]
#[repr(u16)]
#[allow(non_camel_case_types)]
pub enum SyntaxKind {
    #[token("def")]
    DEF_KW = 0,
    #[token("let")]
    LET_KW,
    #[token("in")]
    IN_KW,
    #[token("mat")]
    MAT_KW,
    #[regex("[\\d]+x[\\d]+")]
    PAT_DIMENSIONS,
    #[regex("[\\d]+")]
    INT_NUM,
    #[regex("[+-]?([\\d]+\\.[\\d]*|[\\d]*\\.[\\d]+)")]
    FLOAT_NUM,
    // The unicode escape alternative needs its own leading backslash: without it the
    // alternative is dead (a bare `u` is already matched by `[^"\\]`) and `\uXXXX` escapes
    // are rejected by the lexer.
    #[regex(r#""([^"\\]|\\["\\bnfrt]|\\u[a-fA-F0-9]{4})*""#)]
    STRING,
    MATRIX,
    DECL,
    LIST,
    MAT_BODY,
    PARENTHESIZED_EXPR,
    EXPR,
    #[token("(")]
    L_PAREN,
    #[token(")")]
    R_PAREN,
    #[token("{")]
    L_CURLY,
    #[token("}")]
    R_CURLY,
    #[token("[")]
    L_BRACK,
    #[token("]")]
    R_BRACK,
    #[token("<")]
    L_ANGLE,
    #[token(">")]
    R_ANGLE,
    #[token("+")]
    PLUS,
    #[token("-")]
    MINUS,
    #[token("*")]
    STAR,
    #[token("/")]
    SLASH,
    #[token("%")]
    PERCENT,
    #[token("^")]
    CARET,
    INSTR,
    INSTR_NAME,
    INSTR_PARAMS,
    ATTR_SET,
    ATTR,
    ATTR_NAME,
    ATTR_VALUE,
    #[regex("[a-zA-Z_]+[a-zA-Z_\\-\\d]*")]
    IDENT,
    #[regex("\\$[a-zA-Z0-9_\\-]+")]
    VAR,
    #[regex("\\@[a-zA-Z0-9_\\-]+")]
    INPUT_VAR,
    #[token("$")]
    DOLLAR,
    #[token("@")]
    AT,
    #[token(",")]
    COMMA,
    #[token("|")]
    PIPE,
    #[token("@|")]
    MAPPING_PIPE,
    #[token("!|")]
    NULL_PIPE,
    #[token("=")]
    EQ,
    #[token(":")]
    COLON,
    #[token(";")]
    SEMICOLON,
    #[token(".")]
    DOT,
    #[token("!")]
    BANG,
    #[regex("[ \\t\\f]+")]
    WHITESPACE,
    #[token("\n")]
    NEWLINE,
    /// Marks tokens/nodes the parser flagged as erroneous.
    PARSE_ERR,
    /// Marks input the lexer could not match.
    LEX_ERR,
    /// Root node of the tree; must remain the last variant.
    ROOT,
}
impl From<SyntaxKind> for rowan::SyntaxKind {
    /// Wraps the `u16` discriminant of `kind` in a raw rowan kind.
    fn from(kind: SyntaxKind) -> Self {
        let raw = kind as u16;
        rowan::SyntaxKind(raw)
    }
}

View file

@ -0,0 +1,437 @@
use std::borrow::Borrow;
use chumsky::container::Container;
use rowan::{
Checkpoint, GreenNode, GreenNodeBuilder, GreenNodeData, GreenTokenData, Language, NodeOrToken,
};
use crate::parser::{
ast::lossless::{lex::SyntaxKind::*, Lang},
Span,
};
use super::lex::{self, SyntaxKind};
/// Result of parsing: the root green node of the lossless syntax tree.
#[derive(PartialEq, Eq)]
pub struct Parse {
/// Root green node produced by the tree builder.
pub green_node: GreenNode,
}
impl std::fmt::Debug for Parse {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
debug_print_green_node(NodeOrToken::Node(self.green_node.borrow()), f, 0)
}
}
/// Recursively writes an indented dump of a green tree element to `f`.
///
/// Nodes are written as `KIND {`, followed by their children one level deeper, followed by a
/// closing `}`; tokens are written as `KIND "text";`. `lvl` is the nesting depth, with one
/// indent unit written per level.
///
/// # Errors
/// Returns the first error reported by the formatter.
fn debug_print_green_node(
    node: NodeOrToken<&GreenNodeData, &GreenTokenData>,
    f: &mut std::fmt::Formatter<'_>,
    lvl: i32,
) -> std::fmt::Result {
    for _ in 0..lvl {
        f.write_str(" ")?;
    }
    match node {
        NodeOrToken::Node(n) => {
            // Propagate the write error instead of silently dropping it.
            writeln!(f, "{:?} {{", Lang::kind_from_raw(n.kind()))?;
            for c in n.children() {
                debug_print_green_node(c, f, lvl + 1)?;
            }
            for _ in 0..lvl {
                f.write_str(" ")?;
            }
            f.write_str("}\n")
        }
        NodeOrToken::Token(t) => {
            writeln!(f, "{:?} {:?};", Lang::kind_from_raw(t.kind()), t.text())
        }
    }
}
/// Parser state: remaining tokens, the green tree builder and the collected errors.
#[derive(Debug)]
struct Parser<'src> {
/// Remaining tokens in reverse source order; the next token to process is the last element.
tokens: Vec<(SyntaxKind, &'src str)>,
/// Builder that accumulates the green tree.
builder: GreenNodeBuilder<'src>,
/// Syntax errors recorded so far.
errors: Vec<SyntaxError>,
}
/// Errors the parser can record while building the tree.
#[derive(Debug, PartialEq, Eq)]
enum SyntaxError {
/// A token/node of the given kind was expected.
Expected(SyntaxKind),
/// An attribute had no value where one was expected.
AttrExpectedValue,
/// Guessed when an attribute is followed by a newline and another attribute on the next
/// line without a comma in between; should then suggest adding a comma after the attribute.
ExpectedCommaBetweenAttrs,
}
/// Lexes and parses `src` into a lossless green tree.
///
/// The token list returned by `lex::lex` is already reversed, which is the order the parser
/// expects (it pops tokens off the end).
///
/// Note that the returned [`Parse`] only carries the green node; the errors collected during
/// parsing are not part of it.
pub fn parse(src: &str) -> Parse {
    // The binding is never mutated, so it does not need to be `mut`.
    let tokens = lex::lex(src);
    Parser {
        tokens,
        builder: GreenNodeBuilder::new(),
        errors: Vec::new(),
    }
    .parse()
}
impl Parser<'_> {
fn parse(mut self) -> Parse {
self.start_node(ROOT);
match self.expr(None) {
expr::ExprRes::Ok => (),
expr::ExprRes::Eof => (),
expr::ExprRes::NoExpr => todo!(),
}
self.builder.finish_node();
Parse {
green_node: self.builder.finish(),
}
}
fn start_node(&mut self, kind: SyntaxKind) {
self.builder.start_node(kind.into());
}
fn finish_node(&mut self) {
self.builder.finish_node();
}
/// Advance one token, adding it to the current branch of the tree builder.
fn bump(&mut self) {
let (kind, text) = self.tokens.pop().unwrap();
self.builder.token(kind.into(), text);
}
fn syntax_err(&mut self, err: SyntaxError) {
let (_, text) = self.tokens.pop().unwrap();
self.builder.token(PARSE_ERR.into(), text);
self.errors.push(err);
}
fn syntax_err_by_checkpoint(&mut self, checkpoint: Checkpoint, err: SyntaxError) {
self.builder.start_node_at(checkpoint, PARSE_ERR.into());
self.finish_node();
self.errors.push(err);
}
fn expected(&mut self, expected: SyntaxKind) {
self.syntax_err(SyntaxError::Expected(expected))
}
/// Peek at the first unprocessed token
fn current(&self) -> Option<SyntaxKind> {
self.tokens.last().map(|(kind, _)| *kind)
}
fn next(&self) -> Option<SyntaxKind> {
self.tokens
.get(self.tokens.len() - 2)
.map(|(kind, _)| *kind)
}
fn skip_ws(&mut self) {
while self.current() == Some(WHITESPACE) || self.current() == Some(NEWLINE) {
self.bump()
}
}
fn skip_ws_without_newlines(&mut self) {
while self.current() == Some(WHITESPACE) {
self.bump()
}
}
}
mod expr {
    use rowan::Checkpoint;

    use super::{attrset::AttrsetRes, instr::NodeRes, Parser};
    use crate::parser::{ast::lossless::lex::SyntaxKind::*, Span};

    impl Parser<'_> {
        /// Parses an expression: an instruction if the current token is an identifier,
        /// otherwise an atom.
        ///
        /// `start` is the checkpoint at which a wrapping node should begin; when `None`, a
        /// fresh checkpoint is taken after skipping leading whitespace.
        pub(super) fn expr(&mut self, start: Option<Checkpoint>) -> ExprRes {
            self.skip_ws();
            let start = start.unwrap_or_else(|| self.builder.checkpoint());
            match self.current() {
                Some(IDENT) => {
                    let expr_res = match self.instr() {
                        NodeRes::Ok => ExprRes::Ok,
                        NodeRes::Eof => ExprRes::Eof,
                    };
                    self.builder.start_node_at(start, EXPR.into());
                    self.finish_node();
                    expr_res
                }
                Some(_) => self.atom(Some(start)),
                None => ExprRes::Eof,
            }
        }

        /// Parses an atom: a literal, an attribute set or a parenthesized expression.
        ///
        /// Returns [`ExprRes::NoExpr`] without consuming anything when the current token
        /// cannot start an atom.
        ///
        /// # Panics
        /// Hits `todo!()` when a parenthesized expression is followed by something other
        /// than `)`.
        pub(super) fn atom(&mut self, start: Option<Checkpoint>) -> ExprRes {
            self.skip_ws();
            let start = start.unwrap_or_else(|| self.builder.checkpoint());
            match self.current() {
                Some(INT_NUM | FLOAT_NUM | STRING) => {
                    self.bump();
                    self.builder.start_node_at(start, EXPR.into());
                    self.finish_node();
                    ExprRes::Ok
                }
                Some(L_CURLY) => match self.attrset(start) {
                    AttrsetRes::Ok => ExprRes::Ok,
                    AttrsetRes::Eof => ExprRes::Eof,
                },
                Some(L_PAREN) => {
                    self.builder.start_node_at(start, PARENTHESIZED_EXPR.into());
                    self.bump();
                    self.expr(None);
                    self.skip_ws();
                    let res = match self.current() {
                        Some(R_PAREN) => {
                            // The closing paren belongs to the parenthesized node.
                            self.bump();
                            ExprRes::Ok
                        }
                        Some(_) => todo!(),
                        None => ExprRes::Eof,
                    };
                    // Every started node must be finished, otherwise the builder is left
                    // unbalanced and later `finish_node` calls close the wrong node.
                    self.finish_node();
                    res
                }
                Some(_) => ExprRes::NoExpr,
                None => ExprRes::Eof,
            }
        }
    }

    /// Outcome of trying to parse an expression.
    pub enum ExprRes {
        /// An expression was parsed.
        Ok,
        /// The token stream ended.
        Eof,
        /// The current token isn't the start of an expression.
        NoExpr,
    }
}
mod attrset {
use chumsky::container::Container;
use rowan::Checkpoint;
use super::{expr::ExprRes, instr::NodeRes, Parser};
use crate::parser::{
ast::lossless::{lex::SyntaxKind::*, parser::SyntaxError},
Span,
};
impl Parser<'_> {
pub(super) fn attrset(&mut self, checkpoint: Checkpoint) -> AttrsetRes {
assert_eq!(self.current(), Some(L_CURLY));
self.bump();
self.skip_ws();
match self.current() {
Some(R_CURLY) => {
self.builder.start_node_at(checkpoint, ATTR_SET.into());
self.bump();
self.finish_node();
AttrsetRes::Ok
}
Some(_) => {
self.builder.start_node_at(checkpoint, ATTR_SET.into());
let res = match self.attrs() {
AttrRes::Eof => AttrsetRes::Eof,
AttrRes::RCurly | AttrRes::Ok => {
println!("curr: {:?}", self.current());
AttrsetRes::Ok
}
};
self.finish_node();
res
}
None => AttrsetRes::Eof,
}
// self.start_node(ATTR);
}
fn attrs(&mut self) -> AttrRes {
let mut res = AttrRes::Ok;
while res == AttrRes::Ok {
println!("it: {:?}", self.tokens.last());
match self.attr() {
AttrRes::Ok => {
self.skip_ws_without_newlines();
println!(
"a: {:?}, {:?}",
self.tokens.last(),
self.tokens.get(self.tokens.len() - 2)
);
println!("errs: {:?}", self.errors);
res = AttrRes::Ok;
let checkpoint_previous_end = self.builder.checkpoint();
res = match self.current() {
Some(COMMA) => {
self.bump();
AttrRes::Ok
}
Some(R_CURLY) => {
self.bump();
res = AttrRes::Ok;
break;
}
Some(NEWLINE) => {
self.skip_ws();
println!(
"b: {:?}, {:?}",
self.tokens.last(),
self.tokens.get(self.tokens.len() - 2)
);
match self.current() {
Some(COMMA) => {
self.bump();
AttrRes::Ok
}
Some(R_CURLY) => {
self.bump();
res = AttrRes::Ok;
break;
}
Some(IDENT) => {
println!("wtf");
self.syntax_err_by_checkpoint(
checkpoint_previous_end,
SyntaxError::ExpectedCommaBetweenAttrs,
);
// self.syntax_err(SyntaxError::ExpectedCommaBetweenAttrs);
AttrRes::Ok
}
Some(_) => {
self.bump();
AttrRes::Ok
}
None => {
res = AttrRes::Eof;
break;
}
}
}
Some(_) => {
self.bump();
println!(
"c: {:?}, {:?}",
self.tokens.last(),
self.tokens.get(self.tokens.len() - 2)
);
AttrRes::Ok
}
None => {
res = AttrRes::Eof;
break;
}
}
}
AttrRes::Eof => {
res = AttrRes::Eof;
break;
}
AttrRes::RCurly => {
res = AttrRes::RCurly;
break;
}
}
}
println!("toks_left: {:?}", self.tokens);
res
}
fn attr(&mut self) -> AttrRes {
self.skip_ws();
self.start_node(ATTR);
self.start_node(ATTR_NAME);
match self.current() {
Some(IDENT) => self.bump(),
Some(R_CURLY) => return AttrRes::Ok,
Some(_) => self.expected(IDENT),
None => return AttrRes::Eof,
}
self.finish_node();
self.skip_ws();
match self.current() {
Some(COLON) => self.bump(),
Some(R_CURLY) => {
self.expected(COLON);
return AttrRes::RCurly;
}
Some(_) => self.expected(COLON),
None => return AttrRes::Eof,
}
self.skip_ws();
self.start_node(ATTR_VALUE);
match self.expr(None) {
ExprRes::Ok => self.bump(),
ExprRes::Eof => return AttrRes::Eof,
ExprRes::NoExpr => match self.current() {
Some(COMMA) => self.syntax_err(SyntaxError::AttrExpectedValue),
Some(R_CURLY) => {
self.syntax_err(SyntaxError::AttrExpectedValue);
return AttrRes::RCurly;
}
Some(_) => self.expected(EXPR),
None => unreachable!(),
},
}
self.finish_node();
self.finish_node();
AttrRes::Ok
}
}
#[derive(PartialEq, Eq)]
pub enum AttrsetRes {
Ok,
Eof,
}
#[derive(PartialEq, Eq)]
enum AttrRes {
Ok,
Eof,
RCurly,
}
}
mod instr {
    use super::Parser;
    use crate::parser::{
        ast::lossless::{lex::SyntaxKind::*, parser::expr::ExprRes},
        Span,
    };

    impl Parser<'_> {
        /// Parses an instruction: its name followed by any number of expressions.
        ///
        /// The result is an `INSTR` node; if at least one expression follows the name,
        /// those expressions are grouped in an `INSTR_PARAMS` node.
        ///
        /// # Panics
        /// Panics if the current token is not `IDENT`.
        pub(super) fn instr(&mut self) -> NodeRes {
            assert_eq!(self.current(), Some(IDENT));
            self.skip_ws();

            self.start_node(INSTR);
            self.instr_name();

            let params_checkpoint = self.builder.checkpoint();
            // Keep parsing expressions until one fails or the input ends; only whether
            // there was at least one matters afterwards.
            let mut has_params = false;
            while matches!(self.expr(None), ExprRes::Ok) {
                has_params = true;
            }

            if has_params {
                self.builder
                    .start_node_at(params_checkpoint, INSTR_PARAMS.into());
                self.finish_node();
            }

            self.finish_node();
            NodeRes::Ok
        }

        /// Collects consecutive identifiers (and the whitespace after each, but not
        /// newlines) into an `INSTR_NAME` node.
        fn instr_name(&mut self) {
            self.start_node(INSTR_NAME);
            while matches!(self.current(), Some(IDENT)) {
                self.bump();
                self.skip_ws_without_newlines();
            }
            self.finish_node();
        }
    }

    /// Outcome of parsing an instruction.
    pub(super) enum NodeRes {
        Ok,
        Eof,
    }
}

View file

@ -0,0 +1,50 @@
use indexmap::IndexMap;
use super::super::Spanned;
use super::super::Span;
use super::Lit;
/// A boxed [`RawExpr`] paired with a [`Span`].
#[derive(Debug, PartialEq)]
pub struct RawExpression<'src> {
/// The expression itself, boxed (`RawExpr` variants contain `RawExpression`s by value).
pub expr: Box<RawExpr<'src>>,
/// Span attached to the expression (presumably its location in the source; confirm).
pub span: Span,
}
impl<'src> RawExpression<'src> {
pub fn new(expr: RawExpr<'src>, span: Span) -> Self {
Self {
expr: Box::new(expr),
span,
}
}
}
/// The different shapes a [`RawExpression`] can take.
#[derive(Debug, PartialEq)]
pub enum RawExpr<'src> {
/// A list of spanned name parts with an optional spanned map of named sub-expressions.
Node(
Vec<Spanned<&'src str>>,
Option<Spanned<IndexMap<Spanned<&'src str>, RawExpression<'src>>>>,
),
/// Two expressions joined by a simple pipe.
SimplePipe(RawExpression<'src>, RawExpression<'src>),
// NamingPipe(
// Box<Expression<'src>>,
// (Vec<Spanned<&'src str>>, Vec<Spanned<&'src str>>),
// Box<Expression<'src>>,
// ),
/// Two expressions joined by a mapping pipe.
MappingPipe(RawExpression<'src>, RawExpression<'src>),
/// Two expressions joined by a null pipe.
NullPipe(RawExpression<'src>, RawExpression<'src>),
/// A map of spanned names to expressions.
MultiPipe(IndexMap<Spanned<&'src str>, RawExpression<'src>>),
// LetIn(
// IndexMap<Spanned<&'src str>, Box<Expression<'src>>>,
// Box<Expression<'src>>,
// ),
// Presumably written with a leading `$` in source; the stored text's exact form is not
// visible here.
Var(&'src str),
// Presumably written with a leading `@` in source; the stored text's exact form is not
// visible here.
InputVar(&'src str),
/// A spanned map of spanned names to expressions.
AttrSet(Spanned<IndexMap<Spanned<&'src str>, RawExpression<'src>>>),
/// A literal value.
Lit(Lit<'src>),
// NOTE(review): the meaning/order of the two `u16`s (rows vs. columns) is not visible
// here — confirm against the parser that builds this variant.
Matrix(Spanned<(u16, u16)>, Vec<RawExpression<'src>>),
/// A list of expressions.
List(Vec<RawExpression<'src>>),
}

View file

@ -1,4 +1,4 @@
use crate::parser::ast::{Expr, File};
use crate::parser::ast::File;
use crate::parser::parse;
use crate::tokens::Token;
use chumsky::input::Stream;