lang: add pipelines and rename parser to lst_parser

This commit is contained in:
Schrottkatze 2024-04-30 12:21:06 +02:00
parent db2643359c
commit 30f17773a8
Signed by: schrottkatze
SSH key fingerprint: SHA256:hXb3t1vINBFCiDCmhRABHX5ocdbLiKyCdKI4HK2Rbbc
18 changed files with 108 additions and 47 deletions

View file

@ -0,0 +1,7 @@
use crate::lst_parser::syntax_kind::SyntaxKind;
#[derive(Debug)]
pub enum SyntaxError {
Expected(Vec<SyntaxKind>),
PipelineNeedsSink,
}

View file

@ -0,0 +1,23 @@
use crate::lst_parser::syntax_kind::SyntaxKind;
#[derive(Debug)]
pub enum Event {
Start {
kind: SyntaxKind,
forward_parent: Option<usize>,
},
Finish,
Eat {
count: usize,
},
Error,
}
impl Event {
pub(crate) fn tombstone() -> Self {
Self::Start {
kind: SyntaxKind::TOMBSTONE,
forward_parent: None,
}
}
}

View file

@ -0,0 +1,34 @@
use std::fmt::Debug;
use crate::lst_parser::syntax_kind::SyntaxKind::*;
use super::{
input::Input,
output::Output,
syntax_kind::{self, lex},
Parser,
};
mod expression;
pub fn source_file(p: &mut Parser) {
let root = p.start("root");
expression::expression(p, false);
p.eat_succeeding_ws();
root.complete(p, ROOT);
}
fn check_parser(input: &str, parser_fn: fn(&mut Parser), output: &str) {
let toks = lex(input);
let mut parser = Parser::new(Input::new(&toks));
parser_fn(&mut parser);
let p_out = dbg!(parser.finish());
let o = Output::from_parser_output(toks, p_out);
let s = format!("{o:?}");
assert_eq!(&s, output);
}

View file

@ -0,0 +1,81 @@
use crate::lst_parser::{error::SyntaxError, syntax_kind::SyntaxKind::*, CompletedMarker, Parser};
use self::{collection::collection, instruction::instr, lit::literal, pipeline::PIPES};
mod collection;
mod instruction;
mod lit;
mod pipeline {
use enumset::enum_set;
use crate::lst_parser::{
error::SyntaxError,
syntax_kind::{SyntaxKind::*, TokenSet},
CompletedMarker, Parser,
};
use super::expression;
pub fn pipeline(p: &mut Parser, start_expr: CompletedMarker) -> Option<CompletedMarker> {
if !pipe(p) {
return Some(start_expr);
}
let pipeline_marker = start_expr.precede(p, "pipeline_start");
loop {
if expression(p, true).is_none() {
return Some(pipeline_marker.complete_err(p, SyntaxError::PipelineNeedsSink));
}
if !pipe(p) {
return Some(pipeline_marker.complete(p, PIPELINE));
}
}
}
pub const PIPES: TokenSet = enum_set!(PIPE | MAPPING_PIPE | NULL_PIPE);
fn pipe(p: &mut Parser) -> bool {
if PIPES.contains(p.current()) {
p.do_bump();
true
} else {
false
}
}
}
pub fn expression(p: &mut Parser, in_pipe: bool) -> Option<CompletedMarker> {
let expr = p.start("expr");
if atom(p).or_else(|| instr(p)).is_none() {
expr.abandon(p);
return None;
}
let r = expr.complete(p, EXPR);
if PIPES.contains(p.current()) && !in_pipe {
pipeline::pipeline(p, r)
} else {
Some(r)
}
}
pub fn atom(p: &mut Parser) -> Option<CompletedMarker> {
literal(p)
.or_else(|| collection(p))
.or_else(|| parenthesized_expr(p))
}
pub fn parenthesized_expr(p: &mut Parser) -> Option<CompletedMarker> {
if p.eat(L_PAREN) {
let par_expr = p.start("parenthesized");
expression(p, false);
if !p.eat(R_PAREN) {
return Some(par_expr.complete_err(p, SyntaxError::Expected(vec![R_PAREN])));
}
return Some(par_expr.complete(p, PARENTHESIZED_EXPR));
}
None
}

View file

@ -0,0 +1,25 @@
use enumset::enum_set;
use crate::lst_parser::{
syntax_kind::{SyntaxKind::*, TokenSet},
CompletedMarker, Parser,
};
use self::{attr_set::attr_set, vec::vec_matrix_list};
mod attr_set;
mod vec;
const COLLECTION_START: TokenSet = enum_set!(L_BRACK | L_BRACE);
pub fn collection(p: &mut Parser) -> Option<CompletedMarker> {
if !COLLECTION_START.contains(p.current()) {
return None;
}
Some(match p.current() {
L_BRACK => vec_matrix_list(p),
L_BRACE => attr_set(p),
_ => unreachable!(),
})
}

View file

@ -0,0 +1,45 @@
use crate::lst_parser::{
error::SyntaxError,
grammar::expression::{atom, expression},
CompletedMarker, Marker, Parser,
SyntaxKind::*,
};
pub fn attr_set(p: &mut Parser) -> CompletedMarker {
let start = p.start("attr_set_start");
assert!(p.eat(L_BRACE));
loop {
if attr(p).is_some() {
// TODO: handle others
if p.eat(COMMA) {
continue;
} else if p.eat(R_BRACE) {
return start.complete(p, ATTR_SET);
}
// TODO: check for newline and stuff following that for recov of others
} else if p.eat(R_BRACE) {
return start.complete(p, ATTR_SET);
}
}
}
fn attr(p: &mut Parser) -> Option<CompletedMarker> {
if p.at(IDENT) {
let attr_start = p.start("attr");
let attr_name_start = p.start("attr_name");
p.do_bump();
attr_name_start.complete(p, ATTR_NAME);
// TODO: handle comma, expr/atom, other
p.eat(COLON);
// TODO: handle failed expr parser too
let attr_value = p.start("attr_value");
let _ = expression(p, false);
attr_value.complete(p, ATTR_VALUE);
Some(attr_start.complete(p, ATTR))
} else {
None
}
}

View file

@ -0,0 +1,73 @@
use crate::lst_parser::{
error::SyntaxError, grammar::expression::atom, CompletedMarker, Marker, Parser, SyntaxKind::*,
};
pub fn vec_matrix_list(p: &mut Parser) -> CompletedMarker {
let start = p.start("vec_matrix_list_start");
assert!(p.eat(L_BRACK));
let row_start = p.start("matrix_row_start");
if let Some(item) = atom(p) {
item.precede(p, "coll_item_start")
.complete(p, COLLECTION_ITEM);
if p.at(COMMA) {
row_start.abandon(p);
return finish_list(p, start);
}
finish_mat_or_vec(p, start, row_start)
} else if p.eat(R_BRACK) {
start.complete(p, LIST)
} else {
start.complete_err(p, SyntaxError::Expected(vec![EXPR, R_BRACK]))
}
}
// TODO: handle semicolons, other wrong toks
fn finish_list(p: &mut Parser, list_start: Marker) -> CompletedMarker {
loop {
if p.eat(COMMA) {
if let Some(item) = atom(p) {
item.precede(p, "coll_item_start")
.complete(p, COLLECTION_ITEM);
} else if p.eat(R_BRACK) {
return list_start.complete(p, LIST);
}
} else if p.eat(R_BRACK) {
return list_start.complete(p, LIST);
}
}
}
// TODO: handle commas, general other wrong toks
fn finish_mat_or_vec(p: &mut Parser, coll_start: Marker, mut row_start: Marker) -> CompletedMarker {
let mut is_matrix = false;
let mut row_item_count = 1;
loop {
if let Some(item) = atom(p) {
item.precede(p, "coll_item_start")
.complete(p, COLLECTION_ITEM);
row_item_count += 1;
} else if p.at(SEMICOLON) {
is_matrix = true;
row_start.complete(p, MAT_ROW);
p.eat(SEMICOLON);
row_start = p.start("matrix_row_start");
row_item_count = 0;
} else if p.at(R_BRACK) {
if is_matrix && row_item_count == 0 {
row_start.abandon(p);
p.eat(R_BRACK);
return coll_start.complete(p, MATRIX);
} else if is_matrix {
row_start.complete(p, MAT_ROW);
p.eat(R_BRACK);
return coll_start.complete(p, MATRIX);
} else {
row_start.abandon(p);
p.eat(R_BRACK);
return coll_start.complete(p, VEC);
}
}
}
}

View file

@ -0,0 +1,34 @@
use crate::lst_parser::{syntax_kind::SyntaxKind::*, CompletedMarker, Parser};
use super::{atom, lit::literal};
pub fn instr(p: &mut Parser) -> Option<CompletedMarker> {
if !p.at(IDENT) {
return None;
}
let instr = p.start("instr");
instr_name(p);
instr_params(p);
Some(instr.complete(p, INSTR))
}
fn instr_name(p: &mut Parser) {
let instr_name = p.start("instr_name");
while p.at(IDENT) {
p.do_bump();
}
instr_name.complete(p, INSTR_NAME);
}
fn instr_params(p: &mut Parser) {
if let Some(start) = atom(p) {
while atom(p).is_some() {}
start.precede(p, "params_start").complete(p, INSTR_PARAMS);
}
}

View file

@ -0,0 +1,59 @@
use enumset::enum_set;
use indoc::indoc;
use crate::lst_parser::{
grammar::check_parser,
syntax_kind::{SyntaxKind::*, TokenSet},
CompletedMarker, Parser,
};
const LIT_TOKENS: TokenSet = enum_set!(INT_NUM | FLOAT_NUM | STRING);
pub fn literal(p: &mut Parser) -> Option<CompletedMarker> {
if !LIT_TOKENS.contains(p.current()) {
return None;
}
let lit = p.start("lit");
p.do_bump();
Some(lit.complete(p, LITERAL))
}
#[test]
fn test_parse_lst_lit() {
check_parser(
"42",
|p| {
literal(p);
},
indoc! {r#"
LITERAL {
INT_NUM "42";
}
"#},
);
check_parser(
"3.14",
|p| {
literal(p);
},
indoc! {r#"
LITERAL {
FLOAT_NUM "3.14";
}
"#},
);
check_parser(
r#""Meow""#,
|p| {
literal(p);
},
indoc! {r#"
LITERAL {
STRING "\"Meow\"";
}
"#},
);
}

View file

@ -0,0 +1,61 @@
use crate::lst_parser::syntax_kind::SyntaxKind;
pub struct Input<'src, 'toks> {
raw: &'toks Vec<(SyntaxKind, &'src str)>,
/// indices of the "meaningful" tokens (not whitespace etc)
/// includes newlines because those might indeed help with finding errors
meaningful: Vec<usize>,
/// indices of newlines for the purpose of easily querying them
/// can be helpful with missing commas etc
newlines: Vec<usize>,
}
impl<'src, 'toks> Input<'src, 'toks> {
pub fn new(raw_toks: &'toks Vec<(SyntaxKind, &'src str)>) -> Self {
let meaningful = raw_toks
.iter()
.enumerate()
.filter_map(|(i, tok)| match tok.0 {
SyntaxKind::WHITESPACE | SyntaxKind::NEWLINE => None,
_ => Some(i),
})
.collect();
let newlines = raw_toks
.iter()
.enumerate()
.filter_map(|(i, tok)| match tok.0 {
SyntaxKind::NEWLINE => Some(i),
_ => None,
})
.collect();
Self {
raw: raw_toks,
meaningful,
newlines,
}
}
#[allow(clippy::unwrap_used, reason = "meaningful indices cannot be invalid")]
pub(crate) fn kind(&self, idx: usize) -> SyntaxKind {
let Some(meaningful_idx) = self.meaningful.get(idx) else {
return SyntaxKind::EOF;
};
self.raw.get(*meaningful_idx).unwrap().0
}
pub(crate) fn preceding_meaningless(&self, idx: usize) -> usize {
assert!(self.meaningful.len() > idx);
if idx == 0 {
1
} else {
self.meaningful[idx] - self.meaningful[idx - 1]
}
}
pub(crate) fn meaningless_tail_len(&self) -> usize {
self.raw.len() - (self.meaningful.last().unwrap() + 1)
}
}

View file

@ -0,0 +1,113 @@
use rowan::{GreenNode, GreenNodeBuilder, GreenNodeData, GreenTokenData, Language, NodeOrToken};
use std::mem;
use crate::lst_parser::syntax_kind::{Lang, SyntaxKind};
use super::{error::SyntaxError, events::Event};
pub struct Output {
pub green_node: GreenNode,
pub errors: Vec<SyntaxError>,
}
impl std::fmt::Debug for Output {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
debug_print_green_node(NodeOrToken::Node(&self.green_node), f, 0)
}
}
fn debug_print_green_node(
node: NodeOrToken<&GreenNodeData, &GreenTokenData>,
f: &mut std::fmt::Formatter<'_>,
lvl: i32,
) -> std::fmt::Result {
for _ in 0..lvl {
f.write_str(" ")?;
}
match node {
NodeOrToken::Node(n) => {
writeln!(f, "{:?} {{", Lang::kind_from_raw(node.kind()))?;
for c in n.children() {
debug_print_green_node(c, f, lvl + 1)?;
}
for _ in 0..lvl {
f.write_str(" ")?;
}
f.write_str("}\n")
}
NodeOrToken::Token(t) => {
writeln!(f, "{:?} {:?};", Lang::kind_from_raw(t.kind()), t.text())
}
}
}
impl Output {
pub fn from_parser_output(
mut raw_toks: Vec<(SyntaxKind, &str)>,
(mut events, errs): (Vec<Event>, Vec<SyntaxError>),
) -> Self {
let mut builder = GreenNodeBuilder::new();
let mut fw_parents = Vec::new();
raw_toks.reverse();
for i in 0..events.len() {
match mem::replace(&mut events[i], Event::tombstone()) {
Event::Start {
kind,
forward_parent,
} => {
if kind == SyntaxKind::TOMBSTONE && forward_parent.is_none() {
continue;
}
fw_parents.push(kind);
let mut idx = i;
let mut fp = forward_parent;
while let Some(fwd) = fp {
idx += fwd as usize;
fp = match mem::replace(&mut events[idx], Event::tombstone()) {
Event::Start {
kind,
forward_parent,
} => {
fw_parents.push(kind);
forward_parent
}
_ => unreachable!(),
}
}
// remove whitespace bc it's ugly
while let Some((SyntaxKind::WHITESPACE | SyntaxKind::NEWLINE, _)) =
raw_toks.last()
{
match events.iter_mut().find(|ev| matches!(ev, Event::Eat { .. })) {
Some(Event::Eat { count }) => *count -= 1,
_ => unreachable!(),
}
let (tok, text): (SyntaxKind, &str) = raw_toks.pop().unwrap();
builder.token(tok.into(), text);
}
for kind in fw_parents.drain(..).rev() {
if kind != SyntaxKind::TOMBSTONE {
builder.start_node(kind.into());
}
}
}
Event::Finish => builder.finish_node(),
Event::Eat { count } => (0..count).for_each(|_| {
let (tok, text): (SyntaxKind, &str) = raw_toks.pop().unwrap();
builder.token(tok.into(), text);
}),
Event::Error => todo!(),
}
}
Self {
green_node: builder.finish(),
errors: errs,
}
}
}

View file

@ -0,0 +1,138 @@
use enumset::EnumSet;
use logos::Logos;
pub fn lex(src: &str) -> Vec<(SyntaxKind, &str)> {
let mut lex = SyntaxKind::lexer(src);
let mut r = Vec::new();
while let Some(tok_res) = lex.next() {
r.push((tok_res.unwrap_or(SyntaxKind::LEX_ERR), lex.slice()))
}
r
}
#[derive(enumset::EnumSetType, Logos, Debug, PartialEq, Eq, Clone, Copy, Hash, PartialOrd, Ord)]
#[repr(u16)]
#[enumset(no_super_impls)]
#[allow(non_camel_case_types)]
pub enum SyntaxKind {
#[token("def")]
DEF_KW = 0,
#[token("let")]
LET_KW,
#[token("in")]
IN_KW,
#[regex("[\\d]+")]
INT_NUM,
#[regex("[+-]?([\\d]+\\.[\\d]*|[\\d]*\\.[\\d]+)")]
FLOAT_NUM,
#[regex(r#""([^"\\]|\\["\\bnfrt]|u[a-fA-F0-9]{4})*""#)]
STRING,
MATRIX,
MAT_ROW,
VEC,
LIST,
// either of a vec, a matrix or a list
COLLECTION_ITEM,
DECL,
PARENTHESIZED_EXPR,
EXPR,
LITERAL,
#[token("(")]
L_PAREN,
#[token(")")]
R_PAREN,
#[token("{")]
L_BRACE,
#[token("}")]
R_BRACE,
#[token("[")]
L_BRACK,
#[token("]")]
R_BRACK,
#[token("<")]
L_ANGLE,
#[token(">")]
R_ANGLE,
#[token("+")]
PLUS,
#[token("-")]
MINUS,
#[token("*")]
STAR,
#[token("/")]
SLASH,
#[token("%")]
PERCENT,
#[token("^")]
CARET,
INSTR,
INSTR_NAME,
INSTR_PARAMS,
ATTR_SET,
ATTR,
ATTR_NAME,
ATTR_VALUE,
#[regex("[a-zA-Z_]+[a-zA-Z_\\-\\d]*")]
IDENT,
#[regex("\\$[a-zA-Z0-9_\\-]+")]
VAR,
#[regex("\\@[a-zA-Z0-9_\\-]+")]
INPUT_VAR,
#[token("$")]
DOLLAR,
#[token("@")]
AT,
#[token(",")]
COMMA,
#[token("|")]
PIPE,
#[token("@|")]
MAPPING_PIPE,
#[token("!|")]
NULL_PIPE,
PIPELINE,
#[token("=")]
EQ,
#[token(":")]
COLON,
#[token(";")]
SEMICOLON,
#[token(".")]
DOT,
#[token("!")]
BANG,
#[regex("[ \\t\\f]+")]
WHITESPACE,
#[token("\n")]
NEWLINE,
PARSE_ERR,
LEX_ERR,
ROOT,
EOF,
TOMBSTONE,
ERROR,
}
pub type TokenSet = EnumSet<SyntaxKind>;
impl From<SyntaxKind> for rowan::SyntaxKind {
fn from(kind: SyntaxKind) -> Self {
Self(kind as u16)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Lang {}
impl rowan::Language for Lang {
type Kind = SyntaxKind;
#[allow(unsafe_code)]
fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
assert!(raw.0 <= SyntaxKind::ROOT as u16);
unsafe { std::mem::transmute::<u16, SyntaxKind>(raw.0) }
}
fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
kind.into()
}
}

View file

@ -0,0 +1 @@