diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..f5b1a03 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake --log-format multiline-with-logs \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6abfe1b --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +/.direnv diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..d0ebf8a --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,217 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "beef" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + +[[package]] +name = "countme" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7704b5fdd17b18ae31c4c1da5a2e0305a2bf17b5249300a9ee9ed7b72114c636" + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "drop_bomb" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bda8e21c04aca2ae33ffc2fd8c23134f3cac46db123ba97bd9d3f3b8a4a85e1" + +[[package]] +name = "enumset" 
+version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11a6b7c3d347de0a9f7bfd2f853be43fe32fa6fac30c70f6d6d67a1e936b87ee" +dependencies = [ + "enumset_derive", +] + +[[package]] +name = "enumset_derive" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6da3ea9e1d1a3b1593e15781f930120e72aa7501610b2f82e5b6739c72e8eac5" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "logos" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7251356ef8cb7aec833ddf598c6cb24d17b689d20b993f9d11a3d764e34e6458" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59f80069600c0d66734f5ff52cc42f2dabd6b29d205f333d61fd7832e9e9963f" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax", + "syn", +] + +[[package]] +name = "logos-derive" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"24fb722b06a9dc12adb0963ed585f19fc61dc5413e6a9be9422ef92c091e731d" +dependencies = [ + "logos-codegen", +] + +[[package]] +name = "lopal_core" +version = "0.1.0" +dependencies = [ + "drop_bomb", + "enumset", + "rowan", +] + +[[package]] +name = "lopal_json" +version = "0.1.0" +dependencies = [ + "enumset", + "logos", + "lopal_core", + "rowan", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rowan" +version = "0.15.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a542b0253fa46e632d27a1dc5cf7b930de4df8659dc6e720b647fc72147ae3d" +dependencies = [ + "countme", + "hashbrown", + "rustc-hash", + "text-size", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "text-size" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f18aa187839b2bdb1ad2fa35ead8c4c2976b64e4363c386d45ac0f7ee85c9233" + +[[package]] +name = 
"unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..810b8e5 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,46 @@ +[workspace] +members = [ + "crates/lopal_core", + "crates/lopal_json" +] +resolver = "2" + + +[workspace.lints.rust] +unsafe_code = "deny" +variant_size_differences = "warn" + +[workspace.lints.clippy] +branches_sharing_code = "warn" +clone_on_ref_ptr = "warn" +cognitive_complexity = "warn" +derive_partial_eq_without_eq = "warn" +equatable_if_let = "warn" +filetype_is_file = "warn" +format_push_string = "warn" +if_then_some_else_none = "warn" +integer_division = "warn" +let_underscore_must_use = "warn" +manual_clamp = "warn" +pedantic = "warn" +str_to_string = "warn" +unneeded_field_pattern = "warn" +unnested_or_patterns = "warn" + +allow_attributes_without_reason = "deny" +cast_lossless = "deny" +fallible_impl_from = "deny" +unnecessary_cast = "deny" +unwrap_used = "deny" +# allowed, since you can give reasons +expect_used = "allow" + +# must be allowed with clearly documented reasons +indexing_slicing = "allow" + +module_name_repetitions = "allow" +must_use_candidate = "allow" + +# TODO: more granular and clean +missing_panics_doc = "allow" +missing_errors_doc = "allow" diff --git a/crates/lopal_core/Cargo.toml b/crates/lopal_core/Cargo.toml new file mode 100644 index 0000000..5e29277 --- /dev/null +++ b/crates/lopal_core/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "lopal_core" +version = "0.1.0" +edition = "2021" + +[dependencies] +rowan = "0.15.15" +drop_bomb = "0.1.5" +enumset = "1.1.3" + +[lints] +workspace = true diff --git a/crates/lopal_core/src/lib.rs b/crates/lopal_core/src/lib.rs new file mode 100644 index 0000000..26d8679 --- /dev/null +++ b/crates/lopal_core/src/lib.rs @@ -0,0 +1,8 @@ +#![feature(iter_collect_into)] +pub mod parser; + +pub use 
parser::{ + error::SyntaxError, + marker::{CompletedMarker, Marker}, + Parser, SyntaxElement, +}; diff --git a/crates/lopal_core/src/parser.rs b/crates/lopal_core/src/parser.rs new file mode 100644 index 0000000..c592cdb --- /dev/null +++ b/crates/lopal_core/src/parser.rs @@ -0,0 +1,253 @@ +use std::{cell::Cell, fmt, marker::PhantomData, mem}; + +use enumset::{EnumSet, EnumSetType}; +use rowan::{GreenNode, GreenNodeBuilder}; + +use crate::parser::event::NodeKind; + +use self::{event::Event, input::Input, marker::Marker}; +pub use {error::SyntaxError, output::ParserOutput}; + +pub mod error; +mod event; +mod input; +pub mod marker; +pub mod output; + +/// this is used to define some required SyntaxKinds like an EOF token or an error token +pub trait SyntaxElement +where + Self: EnumSetType + + Into + + From + + fmt::Debug + + Clone + + PartialEq + + Eq, +{ + /// EOF value. This will be used by the rest of the parser library to represent an EOF. + const SYNTAX_EOF: Self; + /// Error value. This will be used as a placeholder for associated respective errors. + const SYNTAX_ERROR: Self; + const SYNTAX_ROOT: Self; +} + +pub struct Parser<'src, SyntaxKind: SyntaxElement, SyntaxErr: SyntaxError> { + input: Input<'src, SyntaxKind>, + pos: usize, + events: Vec>, + step_limit: u32, + steps: Cell, +} + +impl<'src, 'toks, SyntaxKind: SyntaxElement, SyntaxErr: SyntaxError> + Parser<'src, SyntaxKind, SyntaxErr> +{ + /// eat all meaningless tokens at the end of the file. + pub fn eat_succeeding_meaningless(&mut self) { + self.push_ev(Event::Eat { + count: self.input.meaningless_tail_len(), + }); + } + + /// Get token from current position of the parser. + pub fn current(&self) -> SyntaxKind { + self.step(); + self.input.kind(self.pos) + } + + pub fn start(&mut self, name: &str) -> Marker { + let pos = self.events.len(); + self.push_ev(Event::tombstone()); + Marker::new(pos, name) + } + + /// Eat next token if it's of kind `kind` and return `true`. + /// Otherwise, `false`. 
+ pub fn eat(&mut self, kind: SyntaxKind) -> bool { + if !self.at(kind) { + return false; + } + + self.do_bump(); + true + } + + pub fn do_bump(&mut self) { + self.push_ev(Event::Eat { + count: self.input.preceding_meaningless(self.pos), + }); + self.pos += 1; + // a token was consumed, so the parser made progress: restart the stuck-detection counter + self.steps.set(0); + } + + /// Check if the token at the current parser position is of `kind` + pub fn at(&self, kind: SyntaxKind) -> bool { + self.nth_at(0, kind) + } + + /// Check if the token that is `n` ahead is of `kind` + pub fn nth_at(&self, n: usize, kind: SyntaxKind) -> bool { + self.nth(n) == kind + } + + pub fn nth(&self, n: usize) -> SyntaxKind { + self.step(); + self.input.kind(self.pos + n) + } + + fn push_ev(&mut self, event: Event) { + self.events.push(event); + } + + fn step(&self) { + let steps = self.steps.get(); + assert!(steps <= self.step_limit, "the parser seems stuck."); + self.steps.set(steps + 1); + } + + pub fn finish(self) -> ParserOutput { + let Self { + input, + mut events, + .. + } = self; + let (mut raw_toks, meaningless_tokens) = input.dissolve(); + let mut builder = GreenNodeBuilder::new(); + // forward parents: nodes started later via `CompletedMarker::precede` that must wrap an earlier-started node + let mut fw_parents = Vec::new(); + let mut errors: Vec = Vec::new(); + raw_toks.reverse(); + + // always have an implicit root node to avoid [`GreenNodeBuilder::finish()`] panicking due to multiple root elements. 
+ builder.start_node(SyntaxKind::SYNTAX_ROOT.into()); + + for i in 0..events.len() { + match mem::replace(&mut events[i], Event::tombstone()) { + Event::Start { + kind, + forward_parent, + } => { + if kind == NodeKind::Tombstone && forward_parent.is_none() { + continue; + } + + // resolving forward parents + // temporarily jump around with the parser index and replace them with tombstones + fw_parents.push(kind); + let mut idx = i; + let mut fp = forward_parent; + while let Some(fwd) = fp { + idx += fwd; + fp = match mem::replace(&mut events[idx], Event::tombstone()) { + Event::Start { + kind, + forward_parent, + } => { + fw_parents.push(kind); + forward_parent + } + _ => unreachable!(), + } + } + + // clear semantically meaningless tokens before the new tree node for aesthetic reasons + while raw_toks + .last() + .is_some_and(|v| meaningless_tokens.contains(v.0)) + { + // update first next Eat event + match events.iter_mut().find(|ev| matches!(ev, Event::Eat { .. })) { + Some(Event::Eat { count }) => *count -= 1, + _ => unreachable!(), + } + + // put whitespace into lst + let (tok, text) = raw_toks.pop().expect("loop condition just saw a pending token"); + builder.token(tok.into(), text); + } + + // insert forward parents into the tree in correct order + for kind in fw_parents.drain(..).rev() { + match kind { + NodeKind::Syntax(kind) => builder.start_node(kind.into()), + NodeKind::Error(err) => { + errors.push(err); + builder.start_node(SyntaxKind::SYNTAX_ERROR.into()) + } + _ => {} + } + } + } + Event::Finish => builder.finish_node(), + Event::Eat { count } => (0..count).for_each(|_| { + let (tok, text) = raw_toks.pop().expect("Eat events must not cover more tokens than the input holds"); + builder.token(tok.into(), text); + }), + } + } + + // finish SYNTAX_ROOT + builder.finish_node(); + + ParserOutput { + green_node: builder.finish(), + errors, + _syntax_kind: PhantomData::, + } + } +} + +pub struct ParserBuilder< + 'src, + SyntaxKind: SyntaxElement, + // SyntaxErr: SyntaxError, +> { + raw_toks: Vec<(SyntaxKind, &'src str)>, + meaningless_token_kinds: 
EnumSet, + step_limit: u32, +} + +impl<'src, SyntaxKind: SyntaxElement> ParserBuilder<'src, SyntaxKind> { + pub fn new(raw_toks: Vec<(SyntaxKind, &'src str)>) -> Self { + Self { + raw_toks, + meaningless_token_kinds: EnumSet::new(), + step_limit: 4096, + } + } + + /// Sets the parser step limit. + /// Defaults to 4096 + pub fn step_limit(mut self, new: u32) -> Self { + self.step_limit = new; + self + } + + pub fn add_meaningless(mut self, kind: SyntaxKind) -> Self { + self.meaningless_token_kinds.insert(kind); + self + } + + pub fn add_meaningless_many(mut self, kind: Vec) -> Self { + self.meaningless_token_kinds + .insert_all(kind.into_iter().collect()); + self + } + + pub fn build(self) -> Parser<'src, SyntaxKind, SyntaxErr> { + let Self { + raw_toks, + meaningless_token_kinds, + step_limit, + } = self; + Parser { + input: Input::new(raw_toks, Some(meaningless_token_kinds)), + pos: 0, + events: Vec::new(), + step_limit, + steps: Cell::new(0), + } + } +} diff --git a/crates/lopal_core/src/parser/error.rs b/crates/lopal_core/src/parser/error.rs new file mode 100644 index 0000000..9c9d893 --- /dev/null +++ b/crates/lopal_core/src/parser/error.rs @@ -0,0 +1,9 @@ +use std::fmt; + +/// A marker trait... for now! 
+// TODO: constrain that conversion to `NodeKind::Error` is enforced to be possible +pub trait SyntaxError +where + Self: fmt::Debug + Clone + PartialEq + Eq, +{ +} diff --git a/crates/lopal_core/src/parser/event.rs b/crates/lopal_core/src/parser/event.rs new file mode 100644 index 0000000..1b71d8e --- /dev/null +++ b/crates/lopal_core/src/parser/event.rs @@ -0,0 +1,42 @@ +use enumset::EnumSetType; + +use super::{error::SyntaxError, SyntaxElement}; + +pub enum Event { + Start { + kind: NodeKind, + forward_parent: Option, + }, + Finish, + Eat { + count: usize, + }, +} + +impl Event { + pub fn tombstone() -> Self { + Self::Start { + kind: NodeKind::Tombstone, + forward_parent: None, + } + } +} + +#[derive(Clone, PartialEq, Eq)] +pub enum NodeKind { + Tombstone, + Syntax(SyntaxKind), + Error(SyntaxErr), +} + +impl NodeKind { + pub fn is_tombstone(&self) -> bool { + matches!(self, Self::Tombstone) + } + pub fn is_syntax(&self) -> bool { + matches!(self, Self::Syntax(_)) + } + pub fn is_error(&self) -> bool { + matches!(self, Self::Error(_)) + } +} diff --git a/crates/lopal_core/src/parser/input.rs b/crates/lopal_core/src/parser/input.rs new file mode 100644 index 0000000..d7e14b3 --- /dev/null +++ b/crates/lopal_core/src/parser/input.rs @@ -0,0 +1,67 @@ +use enumset::{EnumSet, EnumSetType}; + +use super::SyntaxElement; + +pub struct Input<'src, SyntaxKind: SyntaxElement> { + raw: Vec<(SyntaxKind, &'src str)>, + // enumset of meaningless tokens + semantically_meaningless: EnumSet, + // indices of non-meaningless tokens + meaningful_toks: Vec, +} + +impl<'src, SyntaxKind: SyntaxElement> Input<'src, SyntaxKind> { + pub fn new( + raw_toks: Vec<(SyntaxKind, &'src str)>, + meaningless: Option>, + ) -> Self { + // `None` means no kind is meaningless, so every token must count as meaningful + let meaningless_kinds = meaningless.unwrap_or_default(); + + // indices of all tokens the grammar gets to see; plain `collect` needs no nightly feature + let meaningful_toks = raw_toks + .iter() + .enumerate() + .filter_map(|(i, tok)| (!meaningless_kinds.contains(tok.0)).then_some(i)) + .collect(); + + Self { + 
raw: raw_toks, + semantically_meaningless: meaningless.unwrap_or_default(), + meaningful_toks, + } + } + + pub fn kind(&self, idx: usize) -> SyntaxKind { + let Some(meaningful_idx) = self.meaningful_toks.get(idx) else { + return SyntaxKind::SYNTAX_EOF; + }; + + self.raw.get(*meaningful_idx).expect("meaningful_toks only holds indices into raw").0 + } + + pub fn preceding_meaningless(&self, idx: usize) -> usize { + assert!(self.meaningful_toks.len() > idx); + + if idx == 0 { + // all leading meaningless tokens plus the token itself, mirroring the `else` branch + self.meaningful_toks[idx] + 1 + } else { + self.meaningful_toks[idx] - self.meaningful_toks[idx - 1] + } + } + + /// get the count of meaningless tokens at the end of the file (the whole input if nothing is meaningful). + pub fn meaningless_tail_len(&self) -> usize { + self.meaningful_toks.last().map_or(self.raw.len(), |last| self.raw.len() - (last + 1)) + } + + pub fn dissolve(self) -> (Vec<(SyntaxKind, &'src str)>, EnumSet) { + let Self { + raw, + semantically_meaningless, + .. + } = self; + (raw, semantically_meaningless) + } +} diff --git a/crates/lopal_core/src/parser/marker.rs b/crates/lopal_core/src/parser/marker.rs new file mode 100644 index 0000000..2e1244d --- /dev/null +++ b/crates/lopal_core/src/parser/marker.rs @@ -0,0 +1,97 @@ +use drop_bomb::DropBomb; +use rowan::SyntaxKind; + +use super::{ + error::SyntaxError, + event::{Event, NodeKind}, + Parser, SyntaxElement, +}; + +pub struct Marker { + pos: usize, + bomb: DropBomb, +} + +impl Marker { + pub(super) fn new(pos: usize, name: &str) -> Self { + Self { + pos, + bomb: DropBomb::new(format!("Marker {name} must be completed or abandoned.")), + } + } + + fn close_node( + mut self, + p: &mut Parser, + kind: NodeKind, + ) -> CompletedMarker { + self.bomb.defuse(); + + match &mut p.events[self.pos] { + Event::Start { kind: slot, .. 
} => *slot = kind.clone(), + _ => unreachable!(), + } + + p.push_ev(Event::Finish); + CompletedMarker { + pos: self.pos, + kind, + } + } + + pub fn complete( + self, + p: &mut Parser, + kind: SyntaxKind, + ) -> CompletedMarker { + self.close_node(p, NodeKind::Syntax(kind)) + } + + pub fn error( + self, + p: &mut Parser, + kind: SyntaxErr, + ) -> CompletedMarker { + self.close_node(p, NodeKind::Error(kind)) + } + + pub fn abandon( + mut self, + p: &mut Parser, + ) { + self.bomb.defuse(); + + // clean up empty tombstone event from marker + if self.pos == p.events.len() - 1 { + match p.events.pop() { + Some(Event::Start { + kind: NodeKind::Tombstone, + forward_parent: None, + }) => (), + _ => unreachable!(), + } + } + } +} + +pub struct CompletedMarker { + pos: usize, + kind: NodeKind, +} + +impl CompletedMarker { + pub fn precede(self, p: &mut Parser, name: &str) -> Marker { + let new_pos = p.start(name); + + match &mut p.events[self.pos] { + Event::Start { forward_parent, .. } => { + // point forward parent of the node this marker completed to the new node + // will later be used to make the new node a parent of the current node. 
+ *forward_parent = Some(new_pos.pos - self.pos) + } + _ => unreachable!(), + } + + new_pos + } +} diff --git a/crates/lopal_core/src/parser/output.rs b/crates/lopal_core/src/parser/output.rs new file mode 100644 index 0000000..76c3cf7 --- /dev/null +++ b/crates/lopal_core/src/parser/output.rs @@ -0,0 +1,73 @@ +use std::{fmt, marker::PhantomData}; + +use rowan::{GreenNode, GreenNodeData, GreenTokenData, NodeOrToken}; + +use crate::{SyntaxElement, SyntaxError}; + +pub struct ParserOutput { + pub green_node: GreenNode, + pub errors: Vec, + pub(super) _syntax_kind: PhantomData, +} + +impl std::fmt::Debug + for ParserOutput +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut errs: Vec<&SyntaxErr> = self.errors.iter().collect(); + errs.reverse(); + debug_print_output::( + NodeOrToken::Node(&self.green_node), + f, + 0, + &mut errs, + ) + } +} + +fn debug_print_output( + node: NodeOrToken<&GreenNodeData, &GreenTokenData>, + f: &mut std::fmt::Formatter<'_>, + lvl: i32, + errs: &mut Vec<&SyntaxErr>, +) -> std::fmt::Result { + if f.alternate() { + for _ in 0..lvl { + f.write_str(" ")?; + } + } + let maybe_newline = if f.alternate() { "\n" } else { " " }; + + match node { + NodeOrToken::Node(n) => { + let kind: SyntaxKind = node.kind().into(); + if kind != SyntaxKind::SYNTAX_ERROR { + write!(f, "{:?} {{{maybe_newline}", kind)?; + } else { + let err = errs + .pop() + .expect("all error syntax nodes should correspond to an error"); + + write!(f, "{:?}: {err:?} {{{maybe_newline}", kind)?; + } + for c in n.children() { + debug_print_output::(c, f, lvl + 1, errs)?; + } + + if f.alternate() { + for _ in 0..lvl { + f.write_str(" ")?; + } + } + write!(f, "}}{maybe_newline}") + } + NodeOrToken::Token(t) => { + write!( + f, + "{:?} {:?};{maybe_newline}", + Into::::into(t.kind()), + t.text() + ) + } + } +} diff --git a/crates/lopal_json/Cargo.toml b/crates/lopal_json/Cargo.toml new file mode 100644 index 0000000..05c8324 --- /dev/null +++ 
b/crates/lopal_json/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "lopal_json" +version = "0.1.0" +edition = "2021" + +[dependencies] +logos = "0.14.2" +enumset = "1.1.3" +rowan = "0.15.15" +lopal_core = { path = "../lopal_core" } + +[lints] +workspace = true diff --git a/crates/lopal_json/src/grammar.rs b/crates/lopal_json/src/grammar.rs new file mode 100644 index 0000000..3011e03 --- /dev/null +++ b/crates/lopal_json/src/grammar.rs @@ -0,0 +1,78 @@ +use array::array; +use enumset::{enum_set, EnumSet}; +use lopal_core::parser::ParserBuilder; + +use crate::{ + syntax_error::SyntaxError, + syntax_kind::{lex, SyntaxKind}, +}; + +use self::object::object; + +mod array; +mod object; + +pub(crate) type Parser<'src> = lopal_core::Parser<'src, SyntaxKind, SyntaxError>; +pub(crate) type CompletedMarker = lopal_core::CompletedMarker; + +const BASIC_VALUE_TOKENS: EnumSet = + enum_set!(SyntaxKind::BOOL | SyntaxKind::NULL | SyntaxKind::NUMBER | SyntaxKind::STRING); + +pub fn value(p: &mut Parser) -> bool { + if BASIC_VALUE_TOKENS.contains(p.current()) { + p.do_bump(); + return true; + } else { + object(p).or_else(|| array(p)).is_some() + } +} + +#[cfg(test)] +mod tests { + use super::{ + test_utils::{check_parser, gen_checks}, + value, + }; + + #[test] + fn value_lit() { + gen_checks! {value; + r#""helo world""# => r#"ROOT { STRING "\"helo world\""; }"#, + "42" => r#"ROOT { NUMBER "42"; }"#, + "null" => r#"ROOT { NULL "null"; }"#, + "true" => r#"ROOT { BOOL "true"; }"#, + "false" => r#"ROOT { BOOL "false"; }"# + }; + } +} + +#[cfg(test)] +mod test_utils { + use lopal_core::parser::ParserBuilder; + + use crate::syntax_kind::{lex, SyntaxKind}; + + use super::Parser; + + macro_rules! 
gen_checks { + ($fn_to_test:ident; $($in:literal => $out:literal),+) => { + $(crate::grammar::test_utils::check_parser($in, |p| { $fn_to_test(p); }, $out);)+ + } + } + + pub(super) use gen_checks; + + pub(super) fn check_parser(input: &str, parser_fn: fn(&mut Parser), expected_output: &str) { + let toks = lex(input); + let mut p: Parser = ParserBuilder::new(toks) + .add_meaningless(SyntaxKind::WHITESPACE) + .add_meaningless(SyntaxKind::NEWLINE) + .build(); + + parser_fn(&mut p); + + let out = p.finish(); + + assert_eq!(format!("{out:?}").trim_end(), expected_output); + } +} diff --git a/crates/lopal_json/src/grammar/array.rs b/crates/lopal_json/src/grammar/array.rs new file mode 100644 index 0000000..3ae1726 --- /dev/null +++ b/crates/lopal_json/src/grammar/array.rs @@ -0,0 +1,56 @@ +use crate::{syntax_error::SyntaxError, syntax_kind::SyntaxKind}; + +use super::{value, CompletedMarker, Parser}; + +pub(super) fn array(p: &mut Parser) -> Option { + let array_start = p.start("array"); + + if !p.eat(SyntaxKind::BRACKET_OPEN) { + array_start.abandon(p); + return None; + } + + // `[]` has no first element: only emit an ELEMENT node when a value was actually parsed + let el = p.start("arr_el"); + if value(p) { + el.complete(p, SyntaxKind::ELEMENT); + } else { + el.abandon(p); + } + + while p.at(SyntaxKind::COMMA) { + let potential_trailing_comma = p.start("potential_trailing_comma"); + + p.eat(SyntaxKind::COMMA); + let maybe_el = p.start("arr_el"); + if !value(p) { + maybe_el.abandon(p); + potential_trailing_comma.complete(p, SyntaxKind::TRAILING_COMMA); + } else { + maybe_el.complete(p, SyntaxKind::ELEMENT); + potential_trailing_comma.abandon(p); + } + } + + Some(if !p.eat(SyntaxKind::BRACKET_CLOSE) { + array_start.error(p, SyntaxError::UnclosedArray) + } else { + array_start.complete(p, SyntaxKind::ARRAY) + }) +} + +#[cfg(test)] +mod tests { + use crate::grammar::{array::array, test_utils::gen_checks}; + + #[test] + fn array_basic() { + gen_checks! 
{array; + r#"[1,2,3]"# => r#"ROOT { ARRAY { BRACKET_OPEN "["; ELEMENT { NUMBER "1"; } COMMA ","; ELEMENT { NUMBER "2"; } COMMA ","; ELEMENT { NUMBER "3"; } BRACKET_CLOSE "]"; } }"#, + r#"[1,2,]"# => r#"ROOT { ARRAY { BRACKET_OPEN "["; ELEMENT { NUMBER "1"; } COMMA ","; ELEMENT { NUMBER "2"; } TRAILING_COMMA { COMMA ","; } BRACKET_CLOSE "]"; } }"#, + r#"[1,2"# => r#"ROOT { PARSE_ERR: UnclosedArray { BRACKET_OPEN "["; ELEMENT { NUMBER "1"; } COMMA ","; ELEMENT { NUMBER "2"; } } }"#, + r#"[1,2,"# => r#"ROOT { PARSE_ERR: UnclosedArray { BRACKET_OPEN "["; ELEMENT { NUMBER "1"; } COMMA ","; ELEMENT { NUMBER "2"; } TRAILING_COMMA { COMMA ","; } } }"#, + r#"[{"hello":"world""# => r#"ROOT { PARSE_ERR: UnclosedArray { BRACKET_OPEN "["; ELEMENT { PARSE_ERR: UnclosedObject { BRACE_OPEN "{"; MEMBER { MEMBER_NAME { STRING "\"hello\""; } COLON ":"; MEMBER_VALUE { STRING "\"world\""; } } } } } }"# + } + } +} diff --git a/crates/lopal_json/src/grammar/object.rs b/crates/lopal_json/src/grammar/object.rs new file mode 100644 index 0000000..02d9e73 --- /dev/null +++ b/crates/lopal_json/src/grammar/object.rs @@ -0,0 +1,92 @@ +use crate::{grammar::value, syntax_error::SyntaxError, syntax_kind::SyntaxKind}; + +use super::{CompletedMarker, Parser, BASIC_VALUE_TOKENS}; + +pub(super) fn object(p: &mut Parser) -> Option { + let obj_start = p.start("object"); + + if !p.eat(SyntaxKind::BRACE_OPEN) { + obj_start.abandon(p); + return None; + } + + member(p); + while p.at(SyntaxKind::COMMA) { + // not always an error, later configurable + let potential_trailing_comma = p.start("potential_trailing_comma"); + p.eat(SyntaxKind::COMMA); + + if member(p).is_none() { + potential_trailing_comma.complete(p, SyntaxKind::TRAILING_COMMA); + } else { + potential_trailing_comma.abandon(p); + } + } + + Some(if p.eat(SyntaxKind::BRACE_CLOSE) { + obj_start.complete(p, SyntaxKind::OBJECT) + } else { + obj_start.error(p, SyntaxError::UnclosedObject) + }) +} + +fn member(p: &mut Parser) -> Option { + let 
member_start = p.start("member"); + + if p.at(SyntaxKind::BRACE_CLOSE) { + member_start.abandon(p); + return None; + } else if p.at(SyntaxKind::STRING) { + let member_name_start = p.start("member_name"); + p.eat(SyntaxKind::STRING); + member_name_start.complete(p, SyntaxKind::MEMBER_NAME); + } else { + return todo!("handle other tokens: {:?}", p.current()); + } + + if !p.eat(SyntaxKind::COLON) { + todo!("handle wrong tokens") + } + + let member_value_start = p.start("member_value_start"); + if value(p) { + member_value_start.complete(p, SyntaxKind::MEMBER_VALUE); + Some(member_start.complete(p, SyntaxKind::MEMBER)) + } else { + member_value_start.abandon(p); + let e = member_start.error(p, SyntaxError::MemberMissingValue); + Some( + e.precede(p, "member but failed already") + .complete(p, SyntaxKind::MEMBER), + ) + } +} + +#[cfg(test)] +mod tests { + use crate::grammar::{ + object::{member, object}, + test_utils::gen_checks, + }; + + #[test] + fn object_basic() { + gen_checks! {object; + r#"{"a": "b"}"# => r#"ROOT { OBJECT { BRACE_OPEN "{"; MEMBER { MEMBER_NAME { STRING "\"a\""; } COLON ":"; WHITESPACE " "; MEMBER_VALUE { STRING "\"b\""; } } BRACE_CLOSE "}"; } }"#, + r#"{"a": 42}"# => r#"ROOT { OBJECT { BRACE_OPEN "{"; MEMBER { MEMBER_NAME { STRING "\"a\""; } COLON ":"; WHITESPACE " "; MEMBER_VALUE { NUMBER "42"; } } BRACE_CLOSE "}"; } }"#, + r#"{"a": "b""# => r#"ROOT { PARSE_ERR: UnclosedObject { BRACE_OPEN "{"; MEMBER { MEMBER_NAME { STRING "\"a\""; } COLON ":"; WHITESPACE " "; MEMBER_VALUE { STRING "\"b\""; } } } }"#, + r#"{"a": }"# => r#"ROOT { OBJECT { BRACE_OPEN "{"; MEMBER { PARSE_ERR: MemberMissingValue { MEMBER_NAME { STRING "\"a\""; } COLON ":"; } } WHITESPACE " "; BRACE_CLOSE "}"; } }"#, + r#"{"a":"# => r#"ROOT { PARSE_ERR: UnclosedObject { BRACE_OPEN "{"; MEMBER { PARSE_ERR: MemberMissingValue { MEMBER_NAME { STRING "\"a\""; } COLON ":"; } } } }"#, + r#"{"a":true,}"# => r#"ROOT { OBJECT { BRACE_OPEN "{"; MEMBER { MEMBER_NAME { STRING "\"a\""; } COLON 
":"; MEMBER_VALUE { BOOL "true"; } } TRAILING_COMMA { COMMA ","; } BRACE_CLOSE "}"; } }"# + } + } + + #[test] + fn member_basic() { + gen_checks! {member; + r#""a": "b""# => r#"ROOT { MEMBER { MEMBER_NAME { STRING "\"a\""; } COLON ":"; WHITESPACE " "; MEMBER_VALUE { STRING "\"b\""; } } }"#, + r#""a": 42"# => r#"ROOT { MEMBER { MEMBER_NAME { STRING "\"a\""; } COLON ":"; WHITESPACE " "; MEMBER_VALUE { NUMBER "42"; } } }"#, + r#""a":"# => r#"ROOT { MEMBER { PARSE_ERR: MemberMissingValue { MEMBER_NAME { STRING "\"a\""; } COLON ":"; } } }"# + } + } +} diff --git a/crates/lopal_json/src/lib.rs b/crates/lopal_json/src/lib.rs new file mode 100644 index 0000000..89160be --- /dev/null +++ b/crates/lopal_json/src/lib.rs @@ -0,0 +1,3 @@ +mod grammar; +mod syntax_error; +mod syntax_kind; diff --git a/crates/lopal_json/src/syntax_error.rs b/crates/lopal_json/src/syntax_error.rs new file mode 100644 index 0000000..5b039cc --- /dev/null +++ b/crates/lopal_json/src/syntax_error.rs @@ -0,0 +1,11 @@ +use crate::syntax_kind::SyntaxKind; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum SyntaxError { + UnclosedObject, + UnclosedArray, + DisallowedKeyType(SyntaxKind), + MemberMissingValue, + UnexpectedTrailingComma, +} +impl lopal_core::parser::SyntaxError for SyntaxError {} diff --git a/crates/lopal_json/src/syntax_kind.rs b/crates/lopal_json/src/syntax_kind.rs new file mode 100644 index 0000000..dda5b8d --- /dev/null +++ b/crates/lopal_json/src/syntax_kind.rs @@ -0,0 +1,117 @@ +use logos::Logos; + +pub fn lex(src: &str) -> Vec<(SyntaxKind, &str)> { + let mut lex = SyntaxKind::lexer(src); + let mut r = Vec::new(); + + while let Some(tok_res) = lex.next() { + r.push((tok_res.unwrap_or(SyntaxKind::LEX_ERR), lex.slice())) + } + + r +} + +#[derive(enumset::EnumSetType, Debug, Logos, PartialEq, Eq, Clone, Copy, Hash)] +#[repr(u16)] +#[enumset(no_super_impls)] +#[allow(non_camel_case_types)] +pub enum SyntaxKind { + OBJECT, + MEMBER, + MEMBER_NAME, + MEMBER_VALUE, + + ARRAY, + ELEMENT, + + 
// SyntaxKinds for future json5/etc support + TRAILING_COMMA, + + // Tokens + // Regexes adapted from [the logos handbook](https://logos.maciej.codes/examples/json_borrowed.html) + #[token("true")] + #[token("false")] + BOOL, + #[token("{")] + BRACE_OPEN, + #[token("}")] + BRACE_CLOSE, + #[token("[")] + BRACKET_OPEN, + #[token("]")] + BRACKET_CLOSE, + #[token(":")] + COLON, + #[token(",")] + COMMA, + #[token("null")] + NULL, + #[regex(r"-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?")] + NUMBER, + #[regex(r#""([^"\\]|\\["\\bnfrt/]|\\u[a-fA-F0-9]{4})*""#)] + STRING, + + // Whitespace tokens + #[regex("[ \\t\\f]+")] + WHITESPACE, + #[token("\n")] + NEWLINE, + + // Error SyntaxKinds + LEX_ERR, + PARSE_ERR, + + // Meta SyntaxKinds + ROOT, + EOF, +} + +impl lopal_core::parser::SyntaxElement for SyntaxKind { + const SYNTAX_EOF: Self = Self::EOF; + + const SYNTAX_ERROR: Self = Self::PARSE_ERR; + const SYNTAX_ROOT: Self = Self::ROOT; +} + +impl From for rowan::SyntaxKind { + fn from(kind: SyntaxKind) -> Self { + Self(kind as u16) + } +} + +impl From for SyntaxKind { + fn from(raw: rowan::SyntaxKind) -> Self { + assert!(raw.0 <= SyntaxKind::EOF as u16); + #[allow(unsafe_code, reason = "The transmute is necessary here")] + unsafe { + std::mem::transmute::(raw.0) + } + } +} + +#[cfg(test)] +mod tests { + use crate::syntax_kind::{lex, SyntaxKind}; + + #[test] + fn simple_object() { + const TEST_DATA: &str = r#"{"hello_world": "meow", "some_num":7.42}"#; + + assert_eq!( + dbg!(lex(TEST_DATA)), + vec![ + (SyntaxKind::BRACE_OPEN, "{"), + (SyntaxKind::STRING, "\"hello_world\""), + (SyntaxKind::COLON, ":"), + (SyntaxKind::WHITESPACE, " "), + (SyntaxKind::STRING, "\"meow\""), + (SyntaxKind::COMMA, ","), + (SyntaxKind::WHITESPACE, " "), + (SyntaxKind::STRING, "\"some_num\""), + (SyntaxKind::COLON, ":"), + (SyntaxKind::NUMBER, "7.42"), + (SyntaxKind::BRACE_CLOSE, "}") + ] + ); + } +} diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..473fdc2 --- /dev/null +++ 
b/flake.lock @@ -0,0 +1,82 @@ +{ + "nodes": { + "fenix": { + "inputs": { + "nixpkgs": [ + "nixpkgs" + ], + "rust-analyzer-src": "rust-analyzer-src" + }, + "locked": { + "lastModified": 1747032090, + "narHash": "sha256-htgrHIR/P7V8WeRW/XDWJHXBzbTSWCDYZHsxPAzDuUY=", + "owner": "nix-community", + "repo": "fenix", + "rev": "1436bb8b85b35ca3ba64ad97df31a3b23c7610a3", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "fenix", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1746904237, + "narHash": "sha256-3e+AVBczosP5dCLQmMoMEogM57gmZ2qrVSrmq9aResQ=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "d89fc19e405cb2d55ce7cc114356846a0ee5e956", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "fenix": "fenix", + "nixpkgs": "nixpkgs", + "systems": "systems" + } + }, + "rust-analyzer-src": { + "flake": false, + "locked": { + "lastModified": 1746889290, + "narHash": "sha256-h3LQYZgyv2l3U7r+mcsrEOGRldaK0zJFwAAva4hV/6g=", + "owner": "rust-lang", + "repo": "rust-analyzer", + "rev": "2bafe9d96c6734aacfd49e115f6cf61e7adc68bc", + "type": "github" + }, + "original": { + "owner": "rust-lang", + "ref": "nightly", + "repo": "rust-analyzer", + "type": "github" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..1519a03 --- /dev/null +++ b/flake.nix @@ -0,0 +1,40 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + systems.url = "github:nix-systems/default"; + fenix.url = 
"github:nix-community/fenix"; + fenix.inputs.nixpkgs.follows = "nixpkgs"; + }; + + outputs = + { + nixpkgs, + fenix, + systems, + ... + }: + let + forEachSystem = nixpkgs.lib.genAttrs (import systems); + in + { + devShells = forEachSystem ( + system: + let + pkgs = nixpkgs.legacyPackages.${system}; + toolchain = + with fenix.packages.${system}; + combine [ + complete.toolchain + ]; + in + { + default = pkgs.mkShell rec { + buildInputs = with pkgs; [ + toolchain + ]; + LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath buildInputs; + }; + } + ); + }; +}