From 9d24571b40bb4e91daf5e2d51f9e526edc62e8bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20H=C3=B6lting?= <87192362+moritz-hoelting@users.noreply.github.com> Date: Wed, 27 Mar 2024 21:39:56 +0100 Subject: [PATCH] Add parser for syntax tree --- Cargo.toml | 1 + src/base/error.rs | 2 + src/base/source_file.rs | 10 +- src/lexical/token.rs | 41 ++- src/lexical/token_stream.rs | 2 +- src/lib.rs | 22 +- src/syntax/error.rs | 84 +++++ src/syntax/mod.rs | 6 + src/syntax/parser.rs | 460 ++++++++++++++++++++++++++ src/syntax/syntax_tree/declaration.rs | 138 ++++++++ src/syntax/syntax_tree/expression.rs | 1 + src/syntax/syntax_tree/mod.rs | 191 +++++++++++ src/syntax/syntax_tree/program.rs | 48 +++ src/syntax/syntax_tree/statement.rs | 148 +++++++++ 14 files changed, 1145 insertions(+), 9 deletions(-) create mode 100644 src/syntax/error.rs create mode 100644 src/syntax/mod.rs create mode 100644 src/syntax/parser.rs create mode 100644 src/syntax/syntax_tree/declaration.rs create mode 100644 src/syntax/syntax_tree/expression.rs create mode 100644 src/syntax/syntax_tree/mod.rs create mode 100644 src/syntax/syntax_tree/program.rs create mode 100644 src/syntax/syntax_tree/statement.rs diff --git a/Cargo.toml b/Cargo.toml index 3b6b0f0..0acfe81 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ edition = "2021" [dependencies] colored = "2.1.0" derive_more = { version = "0.99.17", default-features = false, features = ["deref", "from", "deref_mut"] } +enum-as-inner = "0.6.0" getset = "0.1.2" strum = { version = "0.26.2", features = ["derive"] } strum_macros = "0.26.2" diff --git a/src/base/error.rs b/src/base/error.rs index 53484e6..2e3dfe3 100644 --- a/src/base/error.rs +++ b/src/base/error.rs @@ -8,6 +8,8 @@ pub enum Error { IoError(#[from] io::Error), #[error("An error occured while tokenizing the source code.")] TokenizeError(#[from] crate::lexical::token::TokenizeError), + #[error("An error occurred while parsing the source code.")] + ParseError(#[from] 
crate::syntax::error::Error), #[error("An error occurred")] Other(&'static str), } diff --git a/src/base/source_file.rs b/src/base/source_file.rs index 58a31b5..37be4ef 100644 --- a/src/base/source_file.rs +++ b/src/base/source_file.rs @@ -71,6 +71,7 @@ impl SourceFile { SourceIterator { source_file: self, iterator: self.content().char_indices().peekable(), + prev: None, } } @@ -299,6 +300,9 @@ pub struct SourceIterator<'a> { #[get_copy = "pub"] source_file: &'a Arc, iterator: Peekable>, + /// Get the previous character that was iterated over. + #[get_copy = "pub"] + prev: Option<(usize, char)>, } impl<'a> SourceIterator<'a> { /// Peek at the next character in the source file. @@ -310,7 +314,11 @@ impl<'a> Iterator for SourceIterator<'a> { type Item = (usize, char); fn next(&mut self) -> Option { - self.iterator.next() + let item = self.iterator.next(); + if item.is_some() { + self.prev = item; + } + item } } diff --git a/src/lexical/token.rs b/src/lexical/token.rs index 7871734..01979bd 100644 --- a/src/lexical/token.rs +++ b/src/lexical/token.rs @@ -74,6 +74,7 @@ pub enum Token { Punctuation(Punctuation), Numeric(Numeric), Comment(Comment), + LiteralCommand(LiteralCommand), } impl Token { @@ -87,6 +88,7 @@ impl Token { Self::Punctuation(token) => &token.span, Self::Numeric(token) => &token.span, Self::Comment(token) => &token.span, + Self::LiteralCommand(token) => &token.span, } } } @@ -100,6 +102,7 @@ impl SourceElement for Token { Self::Punctuation(token) => token.span(), Self::Numeric(token) => token.span(), Self::Comment(token) => token.span(), + Self::LiteralCommand(token) => token.span(), } } } @@ -200,6 +203,26 @@ impl SourceElement for Comment { } } +/// Represents a hardcoded literal command in the source code. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct LiteralCommand { + /// Span that makes up the token. 
+ pub span: Span, +} + +impl SourceElement for LiteralCommand { + fn span(&self) -> Span { + self.span.clone() + } +} +impl LiteralCommand { + /// Returns the command without the leading slash. + #[must_use] + pub fn clean_command(&self) -> &str { + &self.span.str().trim()[1..] + } +} + /// Is an error that can occur when invoking the [`Token::tokenize`] method. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, thiserror::Error, From)] #[allow(missing_docs)] @@ -283,6 +306,7 @@ impl Token { iter: &mut SourceIterator, start: usize, character: char, + prev_token: Option<&Self>, handler: &impl Handler, ) -> Result { // Single line comment @@ -341,6 +365,10 @@ impl Token { return Err(TokenizeError::FatalLexicalError); } } + // When there is no second slash and at the start of a line + else if prev_token.map_or(true, |token| token.span().str().contains('\n')) { + Ok(Self::handle_literal_command(iter, start)) + } // Just a single slash punctuation else { Ok(Punctuation { @@ -362,6 +390,16 @@ impl Token { .into() } + /// Handles a command that is preceeded by a slash + fn handle_literal_command(iter: &mut SourceIterator, start: usize) -> Self { + Self::walk_iter(iter, |c| !(c.is_whitespace() && c.is_ascii_control())); + + LiteralCommand { + span: Self::create_span(start, iter), + } + .into() + } + /// Lexes the source code from the given iterator. /// /// The tokenization starts at the current location of the iterator. 
The function moves the @@ -375,6 +413,7 @@ impl Token { pub fn tokenize( iter: &mut SourceIterator, handler: &impl Handler, + prev_token: Option<&Self>, ) -> Result { // Gets the first character let (start, character) = iter @@ -391,7 +430,7 @@ impl Token { } // Found comment/single slash punctuation else if character == '/' { - Self::handle_comment(iter, start, character, handler) + Self::handle_comment(iter, start, character, prev_token, handler) } // Found numeric literal else if character.is_ascii_digit() { diff --git a/src/lexical/token_stream.rs b/src/lexical/token_stream.rs index b43657a..9d285ac 100644 --- a/src/lexical/token_stream.rs +++ b/src/lexical/token_stream.rs @@ -47,7 +47,7 @@ impl TokenStream { // Tokenize the source code. loop { - match Token::tokenize(&mut source_file_iterator, handler) { + match Token::tokenize(&mut source_file_iterator, handler, tokens.last()) { Ok(token) => tokens.push(token), Err(TokenizeError::EndOfSourceCodeIteratorArgument) => { break; diff --git a/src/lib.rs b/src/lib.rs index 4d90758..fe0919d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,25 +3,24 @@ //! `ShulkerScript` is a simple, imperative scripting language for creating Minecraft data packs. #![deny( - missing_docs, missing_debug_implementations, missing_copy_implementations, - clippy::all, - clippy::pedantic, clippy::nursery, rustdoc::broken_intra_doc_links, clippy::missing_errors_doc )] +#![warn(missing_docs, clippy::all, clippy::pedantic)] #![allow(clippy::missing_panics_doc, clippy::missing_const_for_fn)] pub mod base; pub mod lexical; +pub mod syntax; use std::{cell::Cell, fmt::Display, path::PathBuf}; use base::{source_file::SourceFile, Handler, Result}; -use crate::{base::Error, lexical::token_stream::TokenStream}; +use crate::{base::Error, lexical::token_stream::TokenStream, syntax::parser::Parser}; /// Compiles the given source code. 
/// @@ -34,14 +33,25 @@ pub fn compile(path: PathBuf) -> Result<()> { let tokens = TokenStream::tokenize(&source_file, &printer); - println!("{tokens:#?}"); - if printer.has_printed() { return Err(Error::Other( "An error occurred while tokenizing the source code.", )); } + let mut parser = Parser::new(&tokens); + let result = parser.parse_program(&printer).ok_or(Error::Other( + "An error occured while parsing the source code.", + ))?; + + println!("result: {result:#?}"); + + if printer.has_printed() { + return Err(Error::Other( + "An error occurred while parsing the source code.", + )); + } + Ok(()) } diff --git a/src/syntax/error.rs b/src/syntax/error.rs new file mode 100644 index 0000000..0afbb0c --- /dev/null +++ b/src/syntax/error.rs @@ -0,0 +1,84 @@ +//! Contains the error types that can occur while parsing the syntax of the language. + +use std::fmt::Display; + +use crate::{ + base::log::{Message, Severity, SourceCodeDisplay}, + lexical::token::{KeywordKind, Token}, +}; + +/// Enumeration containing all kinds of syntax that can be failed to parse. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[allow(missing_docs)] +pub enum SyntaxKind { + Punctuation(char), + Keyword(KeywordKind), + Identifier, + Declaration, + Numeric, + Statement, + Expression, + Type, +} + +/// A syntax/token is expected but found an other invalid token. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct UnexpectedSyntax { + /// The kind of syntax that was expected. + pub expected: SyntaxKind, + + /// The invalid token that was found. 
+ pub found: Option, +} + +impl Display for UnexpectedSyntax { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let expected_binding = match self.expected { + SyntaxKind::Identifier => "an identifier token".to_string(), + SyntaxKind::Punctuation(char) => format!("a punctuation token `{char}`"), + SyntaxKind::Keyword(keyword) => format!("a keyword token `{}`", keyword.as_str()), + SyntaxKind::Declaration => "a declaration token".to_string(), + SyntaxKind::Numeric => "a numeric token".to_string(), + SyntaxKind::Statement => "a statement syntax".to_string(), + SyntaxKind::Expression => "an expression syntax".to_string(), + SyntaxKind::Type => "a type syntax".to_string(), + }; + let found_binding = match self.found.clone() { + Some(Token::Comment(..)) => "a comment token".to_string(), + Some(Token::Identifier(..)) => "an identifier token".to_string(), + Some(Token::Keyword(keyword)) => { + format!("a keyword token `{}`", keyword.keyword.as_str()) + } + Some(Token::WhiteSpaces(..)) => "a white spaces token".to_string(), + Some(Token::Punctuation(punctuation)) => { + format!("a punctuation token `{}`", punctuation.punctuation) + } + Some(Token::Numeric(..)) => "a numeric token".to_string(), + Some(Token::LiteralCommand(..)) => "a literal command token".to_string(), + + None => "EOF".to_string(), + }; + + let message = format!("expected {expected_binding}, but found {found_binding}"); + + write!(f, "{}", Message::new(Severity::Error, message))?; + + self.found.as_ref().map_or(Ok(()), |span| { + write!( + f, + "\n{}", + SourceCodeDisplay::new(span.span(), Option::::None) + ) + }) + } +} + +impl std::error::Error for UnexpectedSyntax {} + +/// An enumeration containing all kinds of syntactic errors that can occur while parsing the +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error { + #[error("{0}")] + UnexpectedSyntax(#[from] UnexpectedSyntax), +} diff --git a/src/syntax/mod.rs 
b/src/syntax/mod.rs new file mode 100644 index 0000000..84ceb17 --- /dev/null +++ b/src/syntax/mod.rs @@ -0,0 +1,6 @@ +//! This module contains the syntax tree and parser for the `ShulkerScript` language. + +pub mod error; +pub mod parser; +#[allow(clippy::module_name_repetitions)] +pub mod syntax_tree; diff --git a/src/syntax/parser.rs b/src/syntax/parser.rs new file mode 100644 index 0000000..2d8082e --- /dev/null +++ b/src/syntax/parser.rs @@ -0,0 +1,460 @@ +//! Provides a way to parse a token stream into an abstract syntax tree. + +use derive_more::{Deref, DerefMut}; +use enum_as_inner::EnumAsInner; + +use crate::{ + base::Handler, + lexical::{ + token::{Identifier, Keyword, KeywordKind, Numeric, Punctuation, Token}, + token_stream::{Delimited, Delimiter, TokenStream, TokenTree}, + }, +}; + +use super::error::{Error, SyntaxKind, UnexpectedSyntax}; + +/// Represents a parser that reads a token stream and constructs an abstract syntax tree. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deref, DerefMut)] +pub struct Parser<'a> { + #[deref] + #[deref_mut] + current_frame: Frame<'a>, + stack: Vec>, +} + +impl<'a> Parser<'a> { + /// Creates a new parser from the given token stream. + #[must_use] + pub fn new(token_stream: &'a TokenStream) -> Self { + Self { + current_frame: Frame { + token_provider: TokenProvider::TokenStream(token_stream), + current_index: 0, + }, + stack: Vec::new(), + } + } + + /// Steps into the [`Delimited`] token stream and parses the content within the delimiters. + /// + /// The parser's position must be at the delimited token stream. 
+ pub fn step_into( + &mut self, + delimiter: Delimiter, + f: impl FnOnce(&mut Self) -> Option, + handler: &dyn Handler, + ) -> Option> { + self.current_frame.stop_at_significant(); + let raw_token_tree = self + .current_frame + .token_provider + .token_stream() + .get(self.current_frame.current_index); + + // move after the whole delimited list + self.current_frame.forward(); + + let expected = match delimiter { + Delimiter::Parenthesis => '(', + Delimiter::Brace => '{', + Delimiter::Bracket => '[', + }; + + let delimited_stream = if let Some(token_tree) = raw_token_tree { + match token_tree { + TokenTree::Delimited(delimited_tree) if delimited_tree.delimiter == delimiter => { + delimited_tree + } + found => { + handler.receive(Error::UnexpectedSyntax(UnexpectedSyntax { + expected: SyntaxKind::Punctuation(expected), + found: Some(match found { + TokenTree::Token(token) => token.clone(), + TokenTree::Delimited(delimited_tree) => { + Token::Punctuation(delimited_tree.open.clone()) + } + }), + })); + + return None; + } + } + } else { + handler.receive(Error::UnexpectedSyntax(UnexpectedSyntax { + expected: SyntaxKind::Punctuation(expected), + found: self.get_reading(None).into_token(), + })); + + return None; + }; + + // creates a new frame + let new_frame = Frame { + token_provider: TokenProvider::Delimited(delimited_stream), + current_index: 0, + }; + + // pushes the current frame onto the stack and replaces the current frame with the new one + self.stack + .push(std::mem::replace(&mut self.current_frame, new_frame)); + + let open = delimited_stream.open.clone(); + + let tree = f(self); + + // pops the current frame off the stack + let new_frame = self.stack.pop()?; + + // the current frame must be at the end + if !self.current_frame.is_exhausted() { + let expected = match self + .current_frame + .token_provider + .as_delimited() + .unwrap() + .delimiter + { + Delimiter::Parenthesis => ')', + Delimiter::Brace => '}', + Delimiter::Bracket => ']', + }; + + 
handler.receive(Error::UnexpectedSyntax(UnexpectedSyntax { + expected: SyntaxKind::Punctuation(expected), + found: self.peek().into_token(), + })); + } + + let close_punctuation = self + .current_frame + .token_provider + .as_delimited() + .unwrap() + .close + .clone(); + + // replaces the current frame with the popped one + self.current_frame = new_frame; + + Some(DelimitedTree { + open, + tree, + close: close_punctuation, + }) + } +} + +/// Represents a result of [`Parser::step_into()`] function. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct DelimitedTree { + /// The opening delimiter. + pub open: Punctuation, + + /// The tree inside the delimiter. + pub tree: Option, + + /// The closing delimiter. + pub close: Punctuation, +} + +/// Provides a way to iterate over a token stream. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, EnumAsInner)] +pub enum TokenProvider<'a> { + /// Iterating at the top level of the token stream. + TokenStream(&'a TokenStream), + + /// Iterating inside a delimited token stream. + Delimited(&'a Delimited), +} + +impl<'a> TokenProvider<'a> { + /// Gets the token stream of the current token provider. + #[must_use] + pub fn token_stream(&self) -> &'a TokenStream { + match self { + TokenProvider::TokenStream(token_stream) => token_stream, + TokenProvider::Delimited(delimited) => &delimited.token_stream, + } + } +} + +/// Represents a single frame of the parser's stack, responsible for reading a token stream in +/// that given token stream level. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Frame<'a> { + token_provider: TokenProvider<'a>, + current_index: usize, +} + +impl<'a> Frame<'a> { + /// Checks if the current [`Frame`] doesn't have any more significant [`TokenTree`]s to + /// parse. 
+ #[must_use] + pub fn is_exhausted(&self) -> bool { + let token_stream = self.token_provider.token_stream(); + for i in self.current_index..self.token_provider.token_stream().len() { + if !matches!( + token_stream.get(i), + Some(TokenTree::Token( + Token::WhiteSpaces(..) | Token::Comment(..) + )) + ) { + return false; + } + } + true + } + + /// Checks if the current [`Frame`] has reached the end of the [`TokenStream`]. + #[must_use] + pub fn is_end(&self) -> bool { + self.current_index >= self.token_provider.token_stream().len() + } + + fn get_reading(&self, token: Option<&TokenTree>) -> Reading { + token.map_or_else( + || match self.token_provider { + // end of file + TokenProvider::TokenStream(..) => Reading::Eof, + TokenProvider::Delimited(delimited) => { + Reading::DelimitedEnd(delimited.close.clone()) + } + }, + |token| match token { + TokenTree::Token(token) => Reading::Atomic(token.clone()), + TokenTree::Delimited(delimited) => Reading::IntoDelimited(delimited.open.clone()), + }, + ) + } + + /// Returns a [`Token`] pointing by the `current_index` of the [`Frame`]. + #[must_use] + pub fn peek(&self) -> Reading { + self.get_reading(self.token_provider.token_stream().get(self.current_index)) + } + + /// Returns a [`Token`] pointing by the `current_index` with the given index offset of the + /// [`Frame`]. + /// + /// # Returns + /// + /// `None` if `offset + current_index` is less than zero or greter than + /// `self.token_provider.token_stream().len() + 1` + #[must_use] + pub fn peek_offset(&self, offset: isize) -> Option { + let index = self.current_index.checked_add(offset.try_into().ok()?)?; + + if index > self.token_provider.token_stream().len() + 1 { + return None; + } + + Some(self.get_reading(self.token_provider.token_stream().get(index))) + } + + /// Returns a [`Token`] pointing by the `current_index` of the [`Frame`] and increments the + /// `current_index` by 1. 
+ pub fn next_token(&mut self) -> Reading { + let token = self.peek(); + + // increment the index + self.forward(); + + token + } + + /// Forwards the `current_index` by 1 if the [`Frame`] is not exhausted. + pub fn forward(&mut self) { + // increment the index + if !self.is_end() { + self.current_index += 1; + } + } + + /// Skips any insignificant [`Token`]s, returns the next significant [`Token`] found, and + /// increments the `current_index` afterward. + pub fn next_significant_token(&mut self) -> Reading { + let token = self.stop_at_significant(); + + // increment the index + self.forward(); + + token + } + + /// Makes the current [`Frame`] point to the significant [`Token`] if currently not. + /// + /// # Returns + /// The significant [`Token`] if found, otherwise `None`. + pub fn stop_at_significant(&mut self) -> Reading { + while !self.is_end() { + let token = self.peek(); + + if !matches!( + token, + Reading::Atomic(Token::WhiteSpaces(..) | Token::Comment(..)) + ) { + return token; + } + + self.forward(); + } + + match self.token_provider { + TokenProvider::TokenStream(..) => Reading::Eof, + TokenProvider::Delimited(delimited) => Reading::DelimitedEnd(delimited.close.clone()), + } + } + + /// Makes the current position stops at the first token that satisfies the predicate. + pub fn stop_at(&mut self, predicate: impl Fn(&Reading) -> bool) -> Reading { + while !self.is_end() { + let token = self.peek(); + + if predicate(&token) { + return token; + } + + self.current_index += 1; + } + + match self.token_provider { + TokenProvider::TokenStream(..) => Reading::Eof, + TokenProvider::Delimited(delimited) => Reading::DelimitedEnd(delimited.close.clone()), + } + } + + /// Expects the next [`Token`] to be an [`Identifier`], and returns it. + /// + /// # Errors + /// If the next [`Token`] is not an [`Identifier`]. 
+ pub fn parse_identifier(&mut self, handler: &impl Handler) -> Option { + match self.next_significant_token() { + Reading::Atomic(Token::Identifier(ident)) => Some(ident), + found => { + handler.receive(Error::UnexpectedSyntax(UnexpectedSyntax { + expected: SyntaxKind::Identifier, + found: found.into_token(), + })); + None + } + } + } + + /// Expects the next [`Token`] to be an [`Numeric`], and returns it. + /// + /// # Errors + /// If the next [`Token`] is not an [`Identifier`]. + pub fn parse_numeric(&mut self, handler: &dyn Handler) -> Option { + match self.next_significant_token() { + Reading::Atomic(Token::Numeric(ident)) => Some(ident), + found => { + handler.receive(Error::UnexpectedSyntax(UnexpectedSyntax { + expected: SyntaxKind::Numeric, + found: found.into_token(), + })); + None + } + } + } + + /// Expects the next [`Token`] to be a [`Keyword`] of specific kind, and returns it. + /// + /// # Errors + /// If the next [`Token`] is not a [`Keyword`] of specific kind. + pub fn parse_keyword( + &mut self, + expected: KeywordKind, + handler: &dyn Handler, + ) -> Option { + match self.next_significant_token() { + Reading::Atomic(Token::Keyword(keyword_token)) if keyword_token.keyword == expected => { + Some(keyword_token) + } + found => { + handler.receive(Error::UnexpectedSyntax(UnexpectedSyntax { + expected: SyntaxKind::Keyword(expected), + found: found.into_token(), + })); + None + } + } + } + + /// Expects the next [`Token`] to be a [`Punctuation`] of specific kind, and returns it. + /// + /// # Errors + /// If the next [`Token`] is not a [`Punctuation`] of specific kind. 
+ pub fn parse_punctuation( + &mut self, + expected: char, + skip_insignificant: bool, + handler: &dyn Handler, + ) -> Option { + match if skip_insignificant { + self.next_significant_token() + } else { + self.next_token() + } { + Reading::Atomic(Token::Punctuation(punctuation_token)) + if punctuation_token.punctuation == expected => + { + Some(punctuation_token) + } + found => { + handler.receive(Error::UnexpectedSyntax(UnexpectedSyntax { + expected: SyntaxKind::Punctuation(expected), + found: found.into_token(), + })); + None + } + } + } + + /// Tries to parse the given function, and if it fails, resets the current index to the + /// `current_index` before the function call. + pub fn try_parse(&mut self, f: impl FnOnce(&mut Self) -> Option) -> Option { + let current_index = self.current_index; + + let result = f(self); + + if result.is_none() { + self.current_index = current_index; + } + + result + } +} + +/// Represents the read value of the [`Frame`]. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Reading { + /// A singular token. + Atomic(Token), + + /// Found an openning delimiter token, which means that the parser can step into a new + /// delimited frame. + IntoDelimited(Punctuation), + + /// Found a closing delimiter token, which means that the parser should step out of the current + /// delimited frame. + DelimitedEnd(Punctuation), + + /// End of file. + Eof, +} + +impl Reading { + /// Gets the read token inside the [`Reading`] as `Option` + /// + /// # Returns + /// + /// Returns `None` if the [`Reading`] is [`Reading::Eof`]. 
+ #[must_use] + pub fn into_token(self) -> Option { + match self { + Self::Atomic(token) => Some(token), + Self::IntoDelimited(punc) | Self::DelimitedEnd(punc) => Some(Token::Punctuation(punc)), + Self::Eof => None, + } + } +} diff --git a/src/syntax/syntax_tree/declaration.rs b/src/syntax/syntax_tree/declaration.rs new file mode 100644 index 0000000..895ef24 --- /dev/null +++ b/src/syntax/syntax_tree/declaration.rs @@ -0,0 +1,138 @@ +//! Syntax tree nodes for declarations. + +#![allow(missing_docs)] + +use getset::Getters; + +use crate::{ + base::{ + source_file::{SourceElement, Span}, + Handler, + }, + lexical::{ + token::{Identifier, Keyword, KeywordKind, Punctuation, Token}, + token_stream::Delimiter, + }, + syntax::{ + error::{Error, SyntaxKind, UnexpectedSyntax}, + parser::{Parser, Reading}, + }, +}; + +use super::{statement::Block, ConnectedList}; + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Declaration { + Function(Function), +} + +impl SourceElement for Declaration { + fn span(&self) -> Span { + match self { + Self::Function(function) => function.span(), + } + } +} + +/// Syntax Synopsis: +/// +/// ``` ebnf +/// Function: +/// 'function' Identifier '(' ParameterList? ')' Block +/// ; +/// +/// ParameterList: +/// Identifier (',' Identifier)* ','? +/// ; +/// ``` +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Getters)] +pub struct Function { + #[get = "pub"] + function_keyword: Keyword, + #[get = "pub"] + identifier: Identifier, + #[get = "pub"] + open_paren: Punctuation, + #[get = "pub"] + parameters: Option>, + #[get = "pub"] + close_paren: Punctuation, + #[get = "pub"] + block: Block, +} + +impl Function { + /// Dissolves the [`Function`] into its components. 
+ #[must_use] + pub fn dissolve( + self, + ) -> ( + Keyword, + Identifier, + Punctuation, + Option>, + Punctuation, + Block, + ) { + ( + self.function_keyword, + self.identifier, + self.open_paren, + self.parameters, + self.close_paren, + self.block, + ) + } +} + +impl SourceElement for Function { + fn span(&self) -> Span { + self.function_keyword.span.join(&self.block.span()).unwrap() + } +} + +impl<'a> Parser<'a> { + pub fn parse_declaration(&mut self, handler: &impl Handler) -> Option { + match self.stop_at_significant() { + Reading::Atomic(Token::Keyword(function_keyword)) + if function_keyword.keyword == KeywordKind::Function => + { + // eat the function keyword + self.forward(); + + // parse the identifier + let identifier = self.parse_identifier(handler)?; + let delimited_tree = self.parse_enclosed_list( + Delimiter::Parenthesis, + ',', + |parser: &mut Parser<'_>| parser.parse_identifier(handler), + handler, + )?; + + // parse the block + let block = self.parse_block(handler)?; + + Some(Declaration::Function(Function { + function_keyword, + identifier, + open_paren: delimited_tree.open, + parameters: delimited_tree.list, + close_paren: delimited_tree.close, + block, + })) + } + + unexpected => { + // make progress + self.forward(); + + handler.receive(Error::UnexpectedSyntax(UnexpectedSyntax { + expected: SyntaxKind::Declaration, + found: unexpected.into_token(), + })); + + None + } + } + } +} diff --git a/src/syntax/syntax_tree/expression.rs b/src/syntax/syntax_tree/expression.rs new file mode 100644 index 0000000..e1371f1 --- /dev/null +++ b/src/syntax/syntax_tree/expression.rs @@ -0,0 +1 @@ +//! Syntax tree nodes for expressions. diff --git a/src/syntax/syntax_tree/mod.rs b/src/syntax/syntax_tree/mod.rs new file mode 100644 index 0000000..4891a42 --- /dev/null +++ b/src/syntax/syntax_tree/mod.rs @@ -0,0 +1,191 @@ +//! Contains the syntax tree nodes that represent the structure of the source code. 
+ +use getset::Getters; + +use crate::{ + base::{ + source_file::{SourceElement, Span}, + Handler, + }, + lexical::{ + token::{Punctuation, Token}, + token_stream::Delimiter, + }, + syntax::parser::Reading, +}; + +use super::{error::Error, parser::Parser}; + +pub mod declaration; +pub mod expression; +pub mod program; +pub mod statement; + +/// Represents a syntax tree node with a pattern of syntax tree nodes separated by a separator. +/// +/// This struct is useful for representing syntax tree nodes that are separated by a separator. +/// For example, a comma separated list of expressions such as `1, 2, 3` can be represented by a +/// [`ConnectedList`] with the separator being a comma token and the elements being the expressions. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Getters)] +pub struct ConnectedList { + /// The first element of the list. + #[get = "pub"] + first: Element, + + /// The rest of the elements of the list. + /// + /// Each element of the list is a tuple containing the separator and the element. The separator + /// is the token/syntax tree node that separates the current element from the prior one. + #[get = "pub"] + rest: Vec<(Separator, Element)>, + + /// The trailing separator of the list. + #[get = "pub"] + trailing_separator: Option, +} + +/// Represents a syntax tree node with a pattern of having [`ConnectedList`] delimited by a pair of +/// punctuation like such `(a, b, c)`. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct DelimitedList { + /// The open punctuation of the list. + pub open: Punctuation, + + /// The list of elements of the list. + /// + /// If `None` then the list is empty (or immediately closed after the open punctuation). + pub list: Option>, + + /// The close punctuation of the list. + pub close: Punctuation, +} + +impl<'a> Parser<'a> { + /// Parses a list of elements enclosed by a pair of delimiters, separated by a separator. 
+    ///
+    /// The parser position must be at the delimited list of the given delimiter. It will
+    /// consume the whole delimited list and move to the next token after the list.
+    ///
+    /// # Errors
+    /// - if the parser position is not at the delimited list of the given delimiter.
+    /// - any error returned by the given parser function.
+    pub fn parse_enclosed_list<T>(
+        &mut self,
+        delimiter: Delimiter,
+        separator: char,
+        mut f: impl FnMut(&mut Self) -> Option<T>,
+        handler: &impl Handler<Error>,
+    ) -> Option<DelimitedList<T>> {
+        fn skip_to_next_separator(this: &mut Parser, separator: char) -> Option<Punctuation> {
+            if let Reading::Atomic(Token::Punctuation(punc)) = this.stop_at(|token| {
+                matches!(
+                    token, Reading::Atomic(Token::Punctuation(punc))
+                    if punc.punctuation == separator
+                )
+            }) {
+                this.forward();
+                Some(punc)
+            } else {
+                None
+            }
+        }
+
+        let delimited_tree = self.step_into(
+            delimiter,
+            |parser| {
+                let mut first = None;
+                let mut rest = Vec::new();
+                let mut trailing_separator: Option<Punctuation> = None;
+
+                while !parser.is_exhausted() {
+                    let Some(element) = f(parser) else {
+                        skip_to_next_separator(parser, separator);
+                        continue;
+                    };
+
+                    // store the element: as `first`, or paired with the separator before it
+                    match (&first, &trailing_separator) {
+                        (None, None) => {
+                            first = Some(element);
+                        }
+                        (Some(_), Some(separator)) => {
+                            rest.push((separator.clone(), element));
+                            trailing_separator = None;
+                        }
+                        _ => {
+                            unreachable!()
+                        }
+                    }
+
+                    // input remains, so expect a separator; if absent, resync at the next one
+                    if !parser.is_exhausted() {
+                        let Some(separator) = parser.parse_punctuation(separator, true, handler)
+                        else {
+                            if let Some(punctuation) = skip_to_next_separator(parser, separator) {
+                                trailing_separator = Some(punctuation);
+                            }
+
+                            continue;
+                        };
+
+                        trailing_separator = Some(separator);
+                    }
+                }
+
+                Some(first.map(|first| ConnectedList {
+                    first,
+                    rest,
+                    trailing_separator,
+                }))
+            },
+            handler,
+        )?;
+
+        Some(DelimitedList {
+            open: delimited_tree.open,
+            list: delimited_tree.tree?,
+            close: delimited_tree.close,
+        })
+    }
+}
+
+impl<Element: SourceElement, Separator: SourceElement> SourceElement
+    for ConnectedList<Element, Separator>
+{
+    fn span(&self) -> Span {
+        let end = self.trailing_separator.as_ref().map_or_else(
+            || {
+                self.rest
+                    .last()
+                    .map_or_else(|| self.first.span(), |(_, element)| element.span())
+            },
+            SourceElement::span,
+        );
+
+        self.first.span().join(&end).unwrap()
+    }
+}
+
+impl<Element, Separator> ConnectedList<Element, Separator> {
+    /// Returns an iterator over references to the elements of the list, separators omitted.
+    pub fn elements(&self) -> impl Iterator<Item = &Element> {
+        std::iter::once(&self.first).chain(self.rest.iter().map(|(_, element)| element))
+    }
+
+    /// Consumes the list and returns an iterator over its owned elements, separators omitted.
+    pub fn into_elements(self) -> impl Iterator<Item = Element> {
+        std::iter::once(self.first).chain(self.rest.into_iter().map(|(_, element)| element))
+    }
+
+    /// Gets the number of elements in the list.
+    pub fn len(&self) -> usize {
+        self.rest.len() + 1
+    }
+
+    /// Returns `true` if the list is empty.
+    ///
+    /// The list always holds its `first` element, so this function always returns `false`.
+    pub fn is_empty(&self) -> bool {
+        false
+    }
+}
diff --git a/src/syntax/syntax_tree/program.rs b/src/syntax/syntax_tree/program.rs
new file mode 100644
index 0000000..7195845
--- /dev/null
+++ b/src/syntax/syntax_tree/program.rs
@@ -0,0 +1,48 @@
+//! The program node of the syntax tree.
+
+use getset::Getters;
+
+use crate::{
+    base::Handler,
+    syntax::{
+        error::Error,
+        parser::{Parser, Reading},
+    },
+};
+
+use super::declaration::Declaration;
+
+/// A program: the [`Declaration`]s that were parsed successfully, in parse order.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Getters)]
+pub struct Program {
+    /// The declarations within the program.
+    #[get = "pub"]
+    declarations: Vec<Declaration>,
+}
+
+impl<'a> Parser<'a> {
+    /// Parses a [`Program`], resuming at the next `{` whenever a declaration fails to parse.
+    pub fn parse_program(&mut self, handler: &impl Handler<Error>) -> Option<Program> {
+        let mut declarations = Vec::new();
+
+        while !self.is_exhausted() {
+            let result = self.parse_declaration(handler);
+
+            #[allow(clippy::option_if_let_else)]
+            if let Some(x) = result {
+                declarations.push(x);
+            } else {
+                self.stop_at(|reading| {
+                    matches!(
+                        reading,
+                        Reading::IntoDelimited(x) if x.punctuation == '{'
+                    )
+                });
+
+                self.next_token();
+            }
+        }
+
+        Some(Program { declarations })
+    }
+}
diff --git a/src/syntax/syntax_tree/statement.rs b/src/syntax/syntax_tree/statement.rs
new file mode 100644
index 0000000..bf07b9e
--- /dev/null
+++ b/src/syntax/syntax_tree/statement.rs
@@ -0,0 +1,148 @@
+//! Syntax tree nodes for statements.
+
+use getset::Getters;
+
+use crate::{
+    base::{
+        source_file::{SourceElement, Span},
+        Handler,
+    },
+    lexical::{
+        token::{LiteralCommand, Punctuation, Token},
+        token_stream::Delimiter,
+    },
+    syntax::{
+        error::{Error, SyntaxKind, UnexpectedSyntax},
+        parser::{Parser, Reading},
+    },
+};
+
+/// Syntax Synopsis:
+///
+/// ``` ebnf
+/// Statement:
+///     Block
+///     | LiteralCommand
+///     ;
+/// ```
+#[allow(missing_docs)]
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Statement {
+    Block(Block),
+    LiteralCommand(LiteralCommand),
+    // Conditional(Conditional),
+}
+
+impl SourceElement for Statement {
+    fn span(&self) -> Span {
+        match self {
+            Self::Block(block) => block.span(),
+            Self::LiteralCommand(literal_command) => literal_command.span(),
+            //Self::Conditional(conditional) => conditional.span(),
+        }
+    }
+}
+
+/// Syntax Synopsis:
+///
+/// ``` ebnf
+/// Block:
+///     '{' Statement* '}'
+///     ;
+/// ```
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Getters)]
+pub struct Block {
+    /// The opening brace of the block.
+    #[get = "pub"]
+    pub open_brace: Punctuation,
+    /// The statements within the block.
+    #[get = "pub"]
+    pub statements: Vec<Statement>,
+    /// The closing brace of the block.
+    #[get = "pub"]
+    pub close_brace: Punctuation,
+}
+
+impl Block {
+    /// Dissolves the [`Block`] into its opening brace, statements and closing brace.
+    #[must_use]
+    pub fn dissolve(self) -> (Punctuation, Vec<Statement>, Punctuation) {
+        (self.open_brace, self.statements, self.close_brace)
+    }
+}
+
+impl SourceElement for Block {
+    fn span(&self) -> Span {
+        self.open_brace
+            .span()
+            .join(&self.close_brace.span())
+            .unwrap()
+    }
+}
+
+impl<'a> Parser<'a> {
+    /// Parses a `{ ... }` [`Block`], recovering at the next `;` or `{` when a statement fails.
+    pub fn parse_block(&mut self, handler: &impl Handler<Error>) -> Option<Block> {
+        let token_tree = self.step_into(
+            Delimiter::Brace,
+            |parser| {
+                let mut statements = Vec::new();
+
+                while !parser.is_exhausted() {
+                    parser.parse_statement(handler).map_or_else(
+                        || {
+                            // error recovery: skip ahead to the next `;` or `{`
+                            parser.stop_at(|reading| matches!(
+                                reading,
+                                Reading::Atomic(Token::Punctuation(punc)) if punc.punctuation == ';'
+                            ) || matches!(
+                                reading,
+                                Reading::IntoDelimited(punc) if punc.punctuation == '{'
+                            ));
+
+                            // goes after the semicolon or the open brace
+                            parser.forward();
+                        },
+                        |statement| statements.push(statement),
+                    );
+                }
+
+                Some(statements)
+            },
+            handler,
+        )?;
+
+        Some(Block {
+            open_brace: token_tree.open,
+            statements: token_tree.tree?,
+            close_brace: token_tree.close,
+        })
+    }
+
+    /// Parses a [`Statement`]; reports [`Error::UnexpectedSyntax`] and returns `None` otherwise.
+    pub fn parse_statement(&mut self, handler: &impl Handler<Error>) -> Option<Statement> {
+        match self.stop_at_significant() {
+            // literal command
+            Reading::Atomic(Token::LiteralCommand(command)) => {
+                self.forward();
+                Some(Statement::LiteralCommand(command))
+            }
+            // block statement
+            Reading::IntoDelimited(open_brace) if open_brace.punctuation == '{' => {
+                let block = self.parse_block(handler)?;
+
+                Some(Statement::Block(block))
+            }
+
+            // other
+            unexpected => {
+                handler.receive(Error::UnexpectedSyntax(UnexpectedSyntax {
+                    expected: SyntaxKind::Statement,
+                    found: unexpected.into_token(),
+                }));
+
+                None
+            }
+        }
+    }
+}