From 1711d17fa6f89e6b9adc4884f0d35b9955b02acd Mon Sep 17 00:00:00 2001 From: Schrottkatze Date: Fri, 12 Apr 2024 20:55:55 +0200 Subject: [PATCH] lang: parsing to events now --- crates/lang/Cargo.toml | 2 +- crates/lang/src/lib.rs | 1 + crates/lang/src/main.rs | 4 +- crates/lang/src/parser/ast/lossless/lex.rs | 1 - crates/lang/src/parser/ast/lossless/parser.rs | 461 +++++------------- flake.lock | 54 +- flake.nix | 31 +- testfiles/test.owo | 5 +- 8 files changed, 176 insertions(+), 383 deletions(-) diff --git a/crates/lang/Cargo.toml b/crates/lang/Cargo.toml index 8c1d3db..3e03209 100644 --- a/crates/lang/Cargo.toml +++ b/crates/lang/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" [dependencies] logos = "0.14" -chumsky = {version= "1.0.0-alpha.7", features=["label"]} +chumsky = {version= "1.0.0-alpha.7", features=["label", "extension"]} petgraph = { workspace = true} indexmap = "2.2.6" clap = { version = "4", features = ["derive"] } diff --git a/crates/lang/src/lib.rs b/crates/lang/src/lib.rs index 94a1430..ebf22d3 100644 --- a/crates/lang/src/lib.rs +++ b/crates/lang/src/lib.rs @@ -1,3 +1,4 @@ +#![feature(type_alias_impl_trait)] pub mod err_reporting; pub mod parser; pub mod tokens; diff --git a/crates/lang/src/main.rs b/crates/lang/src/main.rs index 203e1eb..346862e 100644 --- a/crates/lang/src/main.rs +++ b/crates/lang/src/main.rs @@ -5,7 +5,7 @@ use lang::{ err_reporting::ErrorCollector, parser::ast::lossless::{ lex, - parser::{self, parser_to_events::to_events}, + parser::{self, parse}, }, }; @@ -20,7 +20,7 @@ fn main() { let n = args.file.clone(); let f = fs::read_to_string(n.clone()).expect("failed to read file"); println!("toks: {:?}", lex::lex(&f)); - println!("evs: {:?}", to_events(&f)); + println!("parse res: {:?}", parse(&f)); // let parse_res = parser::parse(&f); // println!("parse: {:?}", parse_res); diff --git a/crates/lang/src/parser/ast/lossless/lex.rs b/crates/lang/src/parser/ast/lossless/lex.rs index c25608a..e2a867b 100644 --- a/crates/lang/src/parser/ast/lossless/lex.rs +++ b/crates/lang/src/parser/ast/lossless/lex.rs @@ -10,7 +10,6 @@ pub fn lex(src: &str) -> Vec<(SyntaxKind, &str)> { r.push((tok_res.unwrap_or(SyntaxKind::LEX_ERR), lex.slice())) } - r.reverse(); r } diff --git a/crates/lang/src/parser/ast/lossless/parser.rs b/crates/lang/src/parser/ast/lossless/parser.rs index b5f5e85..adcb08e 100644 --- a/crates/lang/src/parser/ast/lossless/parser.rs +++ b/crates/lang/src/parser/ast/lossless/parser.rs @@ -9,24 +9,29 @@ use crate::parser::{ Span, }; +use self::parser_to_events::{to_events, Event}; + use super::lex::{self, SyntaxKind}; pub mod parser_to_events { - use chumsky::Parser; + use chumsky::prelude::*; - use crate::parser::ast::lossless::lex::SyntaxKind::{self, *}; + use crate::parser::ast::lossless::lex::{ + self, + SyntaxKind::{self, *}, + }; - #[derive(Debug, PartialEq, Eq)] - pub(super) enum Event { - StartNode, + #[derive(Debug, PartialEq, Eq, Clone, Copy)] + pub enum Event { + StartNode(SyntaxKind), StartErr(SyntaxError), EatToken, FinishNode, FinishErr, } - #[derive(Debug, PartialEq, Eq)] - enum SyntaxError { + #[derive(Debug, PartialEq, Eq, Clone, Copy)] + pub enum SyntaxError { Expected(SyntaxKind), AttrExpectedValue, /// guessed if there's a newline and attr on next line without comma @@ -34,14 +39,107 @@ pub mod parser_to_events { ExpectedCommaBetweenAttrs, } - pub fn to_events(src: &str) -> Vec { - let mut tokens = lex::lex(src); - parser().parse(tokens) + pub fn to_events(tokens: &[(SyntaxKind, &str)]) -> Vec { + let mut only_toks: Vec = tokens.iter().map(|(t, _)| *t).collect(); + let res = parser().parse(&only_toks); + res.unwrap() } - pub fn parser() -> impl Parser<'static, SyntaxKind, Vec> { - let whitespace = just(WHITESPACE).or(NEWLINE).repeated().collect::>(); - whitespace + macro_rules! padded { + ($parser:expr) => {{ + let ws = one_of([WHITESPACE, NEWLINE]) + .to(Event::EatToken) + .repeated() + .collect::>(); + ws.then($parser) + .then(ws) + .map(|((mut before, mut c), mut after)| { + before.append(&mut c); + before.append(&mut after); + before + }) + }}; + } + + pub fn parser<'toks>() -> impl Parser<'toks, &'toks [SyntaxKind], Vec> { + let ws = one_of([WHITESPACE, NEWLINE]) + .to(Event::EatToken) + .repeated() + .collect::>(); + // let ws_without_newlines = just(WHITESPACE) + // .to(Event::EatToken) + // .repeated() + // .collect::>(); + let parenthesized = |c| { + just(L_PAREN) + .to(vec![Event::EatToken]) + .then(c) + .then(just(R_PAREN).to(vec![Event::EatToken])) + .map(|((mut before, mut c), mut after)| { + before.append(&mut c); + before.append(&mut after); + before + }) + }; + + let expr = recursive(|expr| { + let lit = one_of([INT_NUM, FLOAT_NUM, STRING]).to(vec![ + Event::StartNode(EXPR), + Event::EatToken, + Event::FinishNode, + ]); + + let atom = lit.clone().or(parenthesized(expr)); + + let ident = just(IDENT).to(vec![Event::EatToken]); + let instr_name = ident + .clone() + .map(|mut v| { + v.insert(0, Event::StartNode(INSTR_NAME)); + v + }) + .foldl( + ws.then(ident).repeated(), + |mut ident, (mut ws, mut next)| { + ident.append(&mut ws); + ident.append(&mut next); + ident + }, + ) + .map(|mut v| { + v.push(Event::FinishNode); + v + }); + let instr = padded!(instr_name) + .then( + atom.clone() + .map(|mut v| { + v.insert(0, Event::StartNode(INSTR_PARAMS)); + v + }) + .foldl( + ws.then(atom.clone()).repeated(), + |mut cur, (mut ws, mut next)| { + cur.append(&mut ws); + cur.append(&mut next); + cur + }, + ) + .map(|mut v| { + v.push(Event::FinishNode); + v + }), + ) + .map(|(mut name, mut params)| { + name.insert(0, Event::StartNode(INSTR)); + name.append(&mut params); + name.push(Event::FinishNode); + name + }); + padded!(instr.or(lit).or(atom)) + }); + expr + // .map(|(lit, mut ev)| lit.append(&mut ev)); } } @@ -99,7 +197,7 @@ enum SyntaxError { } pub fn parse(src: &str) -> Parse { - let mut tokens = lex::lex(src); + let tokens = lex::lex(src); Parser { tokens, builder: GreenNodeBuilder::new(), @@ -110,12 +208,19 @@ pub fn parse(src: &str) -> Parse { impl Parser<'_> { fn parse(mut self) -> Parse { - self.start_node(ROOT); + let evs = to_events(&self.tokens); + self.builder.start_node(ROOT.into()); - match self.expr(None) { - expr::ExprRes::Ok => (), - expr::ExprRes::Eof => (), - expr::ExprRes::NoExpr => todo!(), + self.tokens.reverse(); + + for ev in evs { + match ev { + Event::StartNode(kind) => self.builder.start_node(kind.into()), + Event::StartErr(SyntaxError) => todo!(), + Event::EatToken => self.bump(), + Event::FinishNode => self.builder.finish_node(), + Event::FinishErr => todo!(), + } } self.builder.finish_node(); @@ -124,13 +229,6 @@ impl Parser<'_> { } } - fn start_node(&mut self, kind: SyntaxKind) { - self.builder.start_node(kind.into()); - } - fn finish_node(&mut self) { - self.builder.finish_node(); - } - /// Advance one token, adding it to the current branch of the tree builder. fn bump(&mut self) { let (kind, text) = self.tokens.pop().unwrap(); @@ -143,7 +241,7 @@ impl Parser<'_> { } fn syntax_err_by_checkpoint(&mut self, checkpoint: Checkpoint, err: SyntaxError) { self.builder.start_node_at(checkpoint, PARSE_ERR.into()); - self.finish_node(); + self.builder.finish_node(); self.errors.push(err); } fn expected(&mut self, expected: SyntaxKind) { @@ -158,313 +256,4 @@ impl Parser<'_> { .get(self.tokens.len() - 2) .map(|(kind, _)| *kind) } - fn skip_ws(&mut self) { - while self.current() == Some(WHITESPACE) || self.current() == Some(NEWLINE) { - self.bump() - } - } - fn skip_ws_without_newlines(&mut self) { - while self.current() == Some(WHITESPACE) { - self.bump() - } - } -} - -mod expr { - use rowan::Checkpoint; - - use super::{attrset::AttrsetRes, instr::NodeRes, Parser}; - use crate::parser::{ast::lossless::lex::SyntaxKind::*, Span}; - impl Parser<'_> { - pub(super) fn expr(&mut self, start: Option) -> ExprRes { - self.skip_ws(); - let start = start.unwrap_or_else(|| self.builder.checkpoint()); - match self.current() { - Some(IDENT) => { - let expr_res = match self.instr() { - NodeRes::Ok => ExprRes::Ok, - NodeRes::Eof => ExprRes::Eof, - }; - self.builder.start_node_at(start, EXPR.into()); - self.finish_node(); - expr_res - } - Some(_) => self.atom(Some(start)), - None => ExprRes::Eof, - } - } - - pub(super) fn atom(&mut self, start: Option) -> ExprRes { - self.skip_ws(); - let start = start.unwrap_or_else(|| self.builder.checkpoint()); - match self.current() { - Some(INT_NUM | FLOAT_NUM | STRING) => { - self.bump(); - self.builder.start_node_at(start, EXPR.into()); - self.finish_node(); - ExprRes::Ok - } - Some(L_CURLY) => match self.attrset(start) { - AttrsetRes::Ok => ExprRes::Ok, - AttrsetRes::Eof => ExprRes::Eof, - }, - Some(L_PAREN) => { - self.builder.start_node_at(start, PARENTHESIZED_EXPR.into()); - self.bump(); - self.expr(None); - self.skip_ws(); - match self.current() { - Some(R_PAREN) => ExprRes::Ok, - Some(_) => todo!(), - None => ExprRes::Eof, - } - } - Some(_) => ExprRes::NoExpr, - None => ExprRes::Eof, - } - } - } - - pub enum ExprRes { - Ok, - Eof, - /// isnt an expression - NoExpr, - } -} - -mod attrset { - use chumsky::container::Container; - use rowan::Checkpoint; - - use super::{expr::ExprRes, instr::NodeRes, Parser}; - use crate::parser::{ - ast::lossless::{lex::SyntaxKind::*, parser::SyntaxError}, - Span, - }; - impl Parser<'_> { - pub(super) fn attrset(&mut self, checkpoint: Checkpoint) -> AttrsetRes { - assert_eq!(self.current(), Some(L_CURLY)); - self.bump(); - self.skip_ws(); - match self.current() { - Some(R_CURLY) => { - self.builder.start_node_at(checkpoint, ATTR_SET.into()); - self.bump(); - self.finish_node(); - AttrsetRes::Ok - } - Some(_) => { - self.builder.start_node_at(checkpoint, ATTR_SET.into()); - let res = match self.attrs() { - AttrRes::Eof => AttrsetRes::Eof, - AttrRes::RCurly | AttrRes::Ok => { - println!("curr: {:?}", self.current()); - AttrsetRes::Ok - } - }; - self.finish_node(); - res - } - None => AttrsetRes::Eof, - } - // self.start_node(ATTR); - } - - fn attrs(&mut self) -> AttrRes { - let mut res = AttrRes::Ok; - - while res == AttrRes::Ok { - println!("it: {:?}", self.tokens.last()); - match self.attr() { - AttrRes::Ok => { - self.skip_ws_without_newlines(); - println!( - "a: {:?}, {:?}", - self.tokens.last(), - self.tokens.get(self.tokens.len() - 2) - ); - println!("errs: {:?}", self.errors); - res = AttrRes::Ok; - let checkpoint_previous_end = self.builder.checkpoint(); - res = match self.current() { - Some(COMMA) => { - self.bump(); - AttrRes::Ok - } - Some(R_CURLY) => { - self.bump(); - res = AttrRes::Ok; - break; - } - Some(NEWLINE) => { - self.skip_ws(); - println!( - "b: {:?}, {:?}", - self.tokens.last(), - self.tokens.get(self.tokens.len() - 2) - ); - match self.current() { - Some(COMMA) => { - self.bump(); - AttrRes::Ok - } - Some(R_CURLY) => { - self.bump(); - res = AttrRes::Ok; - break; - } - Some(IDENT) => { - println!("wtf"); - self.syntax_err_by_checkpoint( - checkpoint_previous_end, - SyntaxError::ExpectedCommaBetweenAttrs, - ); - // self.syntax_err(SyntaxError::ExpectedCommaBetweenAttrs); - AttrRes::Ok - } - Some(_) => { - self.bump(); - AttrRes::Ok - } - None => { - res = AttrRes::Eof; - break; - } - } - } - Some(_) => { - self.bump(); - println!( - "c: {:?}, {:?}", - self.tokens.last(), - self.tokens.get(self.tokens.len() - 2) - ); - AttrRes::Ok - } - None => { - res = AttrRes::Eof; - break; - } - } - } - AttrRes::Eof => { - res = AttrRes::Eof; - break; - } - AttrRes::RCurly => { - res = AttrRes::RCurly; - break; - } - } - } - println!("toks_left: {:?}", self.tokens); - res - } - - fn attr(&mut self) -> AttrRes { - self.skip_ws(); - self.start_node(ATTR); - self.start_node(ATTR_NAME); - match self.current() { - Some(IDENT) => self.bump(), - Some(R_CURLY) => return AttrRes::Ok, - Some(_) => self.expected(IDENT), - None => return AttrRes::Eof, - } - self.finish_node(); - self.skip_ws(); - match self.current() { - Some(COLON) => self.bump(), - Some(R_CURLY) => { - self.expected(COLON); - return AttrRes::RCurly; - } - Some(_) => self.expected(COLON), - None => return AttrRes::Eof, - } - self.skip_ws(); - self.start_node(ATTR_VALUE); - match self.expr(None) { - ExprRes::Ok => self.bump(), - ExprRes::Eof => return AttrRes::Eof, - ExprRes::NoExpr => match self.current() { - Some(COMMA) => self.syntax_err(SyntaxError::AttrExpectedValue), - Some(R_CURLY) => { - self.syntax_err(SyntaxError::AttrExpectedValue); - return AttrRes::RCurly; - } - Some(_) => self.expected(EXPR), - None => unreachable!(), - }, - } - self.finish_node(); - self.finish_node(); - AttrRes::Ok - } - } - - #[derive(PartialEq, Eq)] - pub enum AttrsetRes { - Ok, - Eof, - } - - #[derive(PartialEq, Eq)] - enum AttrRes { - Ok, - Eof, - RCurly, - } -} - -mod instr { - use super::Parser; - use crate::parser::{ - ast::lossless::{lex::SyntaxKind::*, parser::expr::ExprRes}, - Span, - }; - - impl Parser<'_> { - pub(super) fn instr(&mut self) -> NodeRes { - assert_eq!(self.current(), Some(IDENT)); - self.skip_ws(); - self.start_node(INSTR); - self.instr_name(); - - // used to count positionals - let mut i = 0; - let params_checkpoint = self.builder.checkpoint(); - loop { - match self.expr(None) { - ExprRes::Ok => { - i += 1; - continue; - } - ExprRes::NoExpr | ExprRes::Eof => break, - } - } - if i >= 1 { - self.builder - .start_node_at(params_checkpoint, INSTR_PARAMS.into()); - self.finish_node(); - } - self.finish_node(); - NodeRes::Ok - } - - fn instr_name(&mut self) { - self.start_node(INSTR_NAME); - while self.current() == Some(IDENT) { - self.bump(); - self.skip_ws_without_newlines(); - } - self.finish_node(); - } - } - - pub(super) enum NodeRes { - Ok, - Eof, - } } diff --git a/flake.lock b/flake.lock index 7c3a06d..9719001 100644 --- a/flake.lock +++ b/flake.lock @@ -11,11 +11,11 @@ "pre-commit-hooks": "pre-commit-hooks" }, "locked": { - "lastModified": 1710475558, - "narHash": "sha256-egKrPCKjy/cE+NqCj4hg2fNX/NwLCf0bRDInraYXDgs=", + "lastModified": 1712055811, + "narHash": "sha256-7FcfMm5A/f02yyzuavJe06zLa9hcMHsagE28ADcmQvk=", "owner": "cachix", "repo": "cachix", - "rev": "661bbb7f8b55722a0406456b15267b5426a3bda6", + "rev": "02e38da89851ec7fec3356a5c04bc8349cae0e30", "type": "github" }, "original": { @@ -33,11 +33,11 @@ "pre-commit-hooks": "pre-commit-hooks_2" }, "locked": { - "lastModified": 1712724616, - "narHash": "sha256-qs9uEbrOpp6oXcDOp5cpilyU52t78ZpEPATtaHRVLIU=", + "lastModified": 1712925466, + "narHash": "sha256-MJ6VxGNu/ftbn8SErJjBz80FUNXkZfcObHg/JP7wwAc=", "owner": "cachix", "repo": "devenv", - "rev": "d1a11d14dbe96a03c7f9068e4d3af05f283734e0", + "rev": "1af93652caf48bfeef6ba7d1cf59fc66e506e5c2", "type": "github" }, "original": { @@ -83,11 +83,11 @@ "rust-analyzer-src": "rust-analyzer-src" }, "locked": { - "lastModified": 1712730246, - "narHash": "sha256-iB8bFj+07RHpmt+XuGGvYQk2Iwm12u6+DklGq/+Tg5s=", + "lastModified": 1712903033, + "narHash": "sha256-KcvsEm0h1mIwBHFAzWFBjGihnbf2fxpAaXOdVbUfAI4=", "owner": "nix-community", "repo": "fenix", - "rev": "d402ae4a5e5676722290470f61a5e8e3155b5487", + "rev": "c739f83545e625227f4d0af7fe2a71e69931fa4c", "type": "github" }, "original": { @@ -335,11 +335,11 @@ "nixpkgs-regression": "nixpkgs-regression_2" }, "locked": { - "lastModified": 1710500156, - "narHash": "sha256-zvCqeUO2GLOm7jnU23G4EzTZR7eylcJN+HJ5svjmubI=", + "lastModified": 1712911606, + "narHash": "sha256-BGvBhepCufsjcUkXnEEXhEVjwdJAwPglCC2+bInc794=", "owner": "domenkozar", "repo": "nix", - "rev": "c5bbf14ecbd692eeabf4184cc8d50f79c2446549", + "rev": "b24a9318ea3f3600c1e24b4a00691ee912d4de12", "type": "github" }, "original": { @@ -431,11 +431,11 @@ }, "nixpkgs_2": { "locked": { - "lastModified": 1710236354, - "narHash": "sha256-vWrciFdq49vve43g4pbi7NjmL4cwG1ifXnQx+dU3T5E=", + "lastModified": 1710796454, + "narHash": "sha256-lQlICw60RhH8sHTDD/tJiiJrlAfNn8FDI9c+7G2F0SE=", "owner": "cachix", "repo": "devenv-nixpkgs", - "rev": "829e73affeadfb4198a7105cbe3a03153d13edc9", + "rev": "06fb0f1c643aee3ae6838dda3b37ef0abc3c763b", "type": "github" }, "original": { @@ -447,11 +447,11 @@ }, "nixpkgs_3": { "locked": { - "lastModified": 1712608508, - "narHash": "sha256-vMZ5603yU0wxgyQeHJryOI+O61yrX2AHwY6LOFyV1gM=", + "lastModified": 1712791164, + "narHash": "sha256-3sbWO1mbpWsLepZGbWaMovSO7ndZeFqDSdX0hZ9nVyw=", "owner": "nixos", "repo": "nixpkgs", - "rev": "4cba8b53da471aea2ab2b0c1f30a81e7c451f4b6", + "rev": "1042fd8b148a9105f3c0aca3a6177fd1d9360ba5", "type": "github" }, "original": { @@ -463,11 +463,11 @@ }, "nixpkgs_4": { "locked": { - "lastModified": 1712608508, - "narHash": "sha256-vMZ5603yU0wxgyQeHJryOI+O61yrX2AHwY6LOFyV1gM=", + "lastModified": 1712791164, + "narHash": "sha256-3sbWO1mbpWsLepZGbWaMovSO7ndZeFqDSdX0hZ9nVyw=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "4cba8b53da471aea2ab2b0c1f30a81e7c451f4b6", + "rev": "1042fd8b148a9105f3c0aca3a6177fd1d9360ba5", "type": "github" }, "original": { @@ -543,11 +543,11 @@ "nixpkgs-stable": "nixpkgs-stable_2" }, "locked": { - "lastModified": 1712055707, - "narHash": "sha256-4XLvuSIDZJGS17xEwSrNuJLL7UjDYKGJSbK1WWX2AK8=", + "lastModified": 1712897695, + "narHash": "sha256-nMirxrGteNAl9sWiOhoN5tIHyjBbVi5e2tgZUgZlK3Y=", "owner": "cachix", "repo": "pre-commit-hooks.nix", - "rev": "e35aed5fda3cc79f88ed7f1795021e559582093a", + "rev": "40e6053ecb65fcbf12863338a6dcefb3f55f1bf8", "type": "github" }, "original": { @@ -567,11 +567,11 @@ "rust-analyzer-src": { "flake": false, "locked": { - "lastModified": 1712663608, - "narHash": "sha256-tN9ZL6kGppmHg84lxlpAlaN+kXWNctKK7Yitq/iXDEw=", + "lastModified": 1712818880, + "narHash": "sha256-VDxsvgj/bNypHq48tQWtc3VRbWvzlFjzKf9ZZIVO10Y=", "owner": "rust-lang", "repo": "rust-analyzer", - "rev": "a5feb4f05f09adca661c869b1bf2324898cbaa43", + "rev": "657b33b0cb9bd49085202e91ad5b4676532c9140", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 3c8d815..caad7d6 100644 --- a/flake.nix +++ b/flake.nix @@ -15,6 +15,7 @@ self, nixpkgs, devenv, + fenix, systems, ... } @ inputs: let @@ -24,6 +25,11 @@ forEachSystem (system: let pkgs = nixpkgs.legacyPackages.${system}; + toolchain = with fenix.packages.${system}; + combine [ + default.toolchain + rust-analyzer + ]; in { default = devenv.lib.mkShell { inherit inputs pkgs; @@ -33,17 +39,18 @@ config, ... }: { - languages.rust = { - enable = true; - channel = "nightly"; - components = [ - "rustc" - "cargo" - "clippy" - "rustfmt" - "rust-src" - ]; - }; + # languages.rust = { + # enable = true; + # channel = "nightly"; + # components = [ + # "rustc" + # "cargo" + # "clippy" + # "rustfmt" + # "rust-src" + # "rust-analyzer" + # ]; + # }; pre-commit.hooks = { clippy.enable = false; @@ -59,7 +66,7 @@ mold cargo-nextest cargo-watch - rust-analyzer + toolchain ]; }) ]; diff --git a/testfiles/test.owo b/testfiles/test.owo index 3662b45..78cbbda 100644 --- a/testfiles/test.owo +++ b/testfiles/test.owo @@ -1,4 +1 @@ -meow mew meow 5 3.14 "uwu" { - meow: test 24 - another: hi "hello", -} "awa" +hello world test 42 3.14 "uwu"