From 830b3b10d9eff513e46a2d4a81ed4b23c1bbadf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20H=C3=B6lting?= <87192362+moritz-hoelting@users.noreply.github.com> Date: Wed, 27 Mar 2024 19:27:11 +0100 Subject: [PATCH] Add tokenizing module --- .gitignore | 2 + Cargo.toml | 14 ++ src/base/diagnostic.rs | 5 + src/base/error.rs | 16 ++ src/base/log.rs | 75 +++++++ src/base/mod.rs | 12 ++ src/base/source_file.rs | 359 +++++++++++++++++++++++++++++++ src/lexical/error.rs | 62 ++++++ src/lexical/mod.rs | 8 + src/lexical/token.rs | 411 ++++++++++++++++++++++++++++++++++++ src/lexical/token_stream.rs | 195 +++++++++++++++++ src/lib.rs | 70 ++++++ 12 files changed, 1229 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 src/base/diagnostic.rs create mode 100644 src/base/error.rs create mode 100644 src/base/log.rs create mode 100644 src/base/mod.rs create mode 100644 src/base/source_file.rs create mode 100644 src/lexical/error.rs create mode 100644 src/lexical/mod.rs create mode 100644 src/lexical/token.rs create mode 100644 src/lexical/token_stream.rs create mode 100644 src/lib.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4fffb2f --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +/Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..3b6b0f0 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "shulkerscript-lang" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +colored = "2.1.0" +derive_more = { version = "0.99.17", default-features = false, features = ["deref", "from", "deref_mut"] } +getset = "0.1.2" +strum = { version = "0.26.2", features = ["derive"] } +strum_macros = "0.26.2" +thiserror = "1.0.58" diff --git a/src/base/diagnostic.rs b/src/base/diagnostic.rs new file mode 100644 index 0000000..420aa00 --- /dev/null +++ b/src/base/diagnostic.rs 
@@ -0,0 +1,5 @@ +/// Represents a trait responsible for handling diagnostics in the interpreter. +pub trait Handler { + /// Receive an error and handles it. + fn receive(&self, error: T); +} diff --git a/src/base/error.rs b/src/base/error.rs new file mode 100644 index 0000000..53484e6 --- /dev/null +++ b/src/base/error.rs @@ -0,0 +1,16 @@ +use std::io; + +/// An error that occurred during compilation. +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("An error occurred while reading the file.")] + IoError(#[from] io::Error), + #[error("An error occured while tokenizing the source code.")] + TokenizeError(#[from] crate::lexical::token::TokenizeError), + #[error("An error occurred")] + Other(&'static str), +} + +/// A specialized [`Result`] type for this crate. +pub type Result = std::result::Result; diff --git a/src/base/log.rs b/src/base/log.rs new file mode 100644 index 0000000..f11eb98 --- /dev/null +++ b/src/base/log.rs @@ -0,0 +1,75 @@ +//! Module containing structures and implementations for logging messages to the user. + +use colored::Colorize; +use std::fmt::Display; + +use super::source_file::Span; + +/// Represent the severity of a log message to be printed to the console. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[allow(missing_docs)] +pub enum Severity { + Error, + Info, + Warning, +} + +/// Struct implementing [`Display`] that represents a log message to be displayed to the user. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Message { + /// The severity of the log message. + pub severity: Severity, + + /// The message to be displayed. + pub display: T, +} +impl Message { + /// Create a new log message with the given severity and message to be displayed. 
+ pub fn new(severity: Severity, display: T) -> Self { + Self { severity, display } + } +} + +impl Display for Message { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let log_header = (match self.severity { + Severity::Error => "[error]:".red(), + Severity::Info => "[info]:".green(), + Severity::Warning => "[warning]:".yellow(), + }) + .bold(); + + let message_part = &self.display.to_string().bold(); + + write!(f, "{log_header} {message_part}") + } +} + +/// Structure implementing [`Display`] that prints the particular span of the source code. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct SourceCodeDisplay<'a, T> { + /// The span of the source code to be printed. + pub span: &'a Span, + + /// The help message to be displayed. + pub help_display: Option, +} + +impl<'a, T> SourceCodeDisplay<'a, T> { + /// Create a new source code display with the given span and help message to be displayed. + pub fn new(span: &'a Span, help_display: Option) -> Self { + Self { span, help_display } + } +} + +impl<'a, T: std::fmt::Display> Display for SourceCodeDisplay<'a, T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.span.str())?; + + if let Some(help_display) = &self.help_display { + write!(f, "\n\n{help_display}")?; + } + + Ok(()) + } +} diff --git a/src/base/mod.rs b/src/base/mod.rs new file mode 100644 index 0000000..80c4b19 --- /dev/null +++ b/src/base/mod.rs @@ -0,0 +1,12 @@ +//! The base module contains the core functionality of the `ShulkerScript` language. + +pub mod source_file; + +mod error; +#[doc(inline)] +pub use error::{Error, Result}; + +mod diagnostic; +pub use diagnostic::Handler; + +pub mod log; diff --git a/src/base/source_file.rs b/src/base/source_file.rs new file mode 100644 index 0000000..58a31b5 --- /dev/null +++ b/src/base/source_file.rs @@ -0,0 +1,359 @@ +//! Module for handling source files and their elements. 
+ +use std::{ + cmp::Ordering, + fmt::Debug, + fs, + iter::{Iterator, Peekable}, + ops::Range, + path::PathBuf, + str::CharIndices, + sync::Arc, +}; + +use getset::{CopyGetters, Getters}; + +use super::Error; + +/// Represents a source file that contains the source code. +#[derive(Clone)] +pub struct SourceFile { + path: PathBuf, + content: String, + lines: Vec>, +} + +#[allow(clippy::missing_fields_in_debug)] +impl Debug for SourceFile { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SourceFile") + .field("path", &self.path) + .field("lines", &self.lines) + .finish() + } +} + +impl SourceFile { + fn new(path: PathBuf, content: String) -> Arc { + let lines = get_line_byte_positions(&content); + + Arc::new(Self { + path, + content, + lines, + }) + } + + /// Get the content of the source file + #[must_use] + pub fn content(&self) -> &str { + &self.content + } + + /// Get the line of the source file at the given line number. + /// + /// Numbering starts at 1. + #[must_use] + pub fn get_line(&self, line: usize) -> Option<&str> { + if line == 0 { + return None; + } + + let line = line - 1; + self.lines + .get(line) + .map(|range| &self.content()[range.clone()]) + } + + /// Get the [`SourceIterator`] for the source file. + #[must_use] + pub fn iter<'a>(self: &'a Arc) -> SourceIterator<'a> { + SourceIterator { + source_file: self, + iterator: self.content().char_indices().peekable(), + } + } + + /// Get the number of lines in the source file. + #[must_use] + pub fn line_amount(&self) -> usize { + self.lines.len() + } + + /// Load the source file from the given file path. + /// + /// # Errors + /// - [`Error::IoError`]: Error occurred when reading the file contents. 
+ pub fn load(path: PathBuf) -> Result, Error> { + let source = fs::read_to_string(&path).map_err(Error::IoError)?; + Ok(Self::new(path, source)) + } + + /// Get the [`Location`] of a given byte index ([`None`] unless it is a char boundary inside a line) + #[must_use] + pub fn get_location(&self, byte_index: usize) -> Option { + if self.content.is_char_boundary(byte_index) { + // get the line number by binary searching the line ranges + let line = self + .lines + .binary_search_by(|range| { + if range.contains(&byte_index) { + Ordering::Equal + } else if byte_index < range.start { + Ordering::Greater + } else { + Ordering::Less + } + }) + .ok()?; + + let line_starting_byte_index = self.lines[line].start; + let line_str = self.get_line(line + 1).unwrap(); + + // get the column number by iterating through the utf-8 characters (starts at 1) + let column = line_str + .char_indices() + .take_while(|(i, _)| *i + line_starting_byte_index < byte_index) + .count() + + 1; + + Some(Location { + line: line + 1, + column, + }) + } else { + None + } + } +} + +/// Represents a range of characters in a source file. +#[derive(Clone, Getters, CopyGetters)] +pub struct Span { + /// Get the start byte index of the span. + #[get_copy = "pub"] + start: usize, + + /// Get the end byte index of the span (exclusive). + #[get_copy = "pub"] + end: usize, + + /// Get the source file that the span is located in. 
+ #[get = "pub"] + source_file: Arc, +} + +#[allow(clippy::missing_fields_in_debug)] +impl Debug for Span { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Span") + .field("start", &self.start) + .field("end", &self.end) + .field("content", &self.str()) + .finish() + } +} + +impl PartialEq for Span { + fn eq(&self, other: &Self) -> bool { + Arc::ptr_eq(&self.source_file, &other.source_file) + && self.start == other.start + && self.end == other.end + } +} + +impl Eq for Span {} + +#[allow(clippy::non_canonical_partial_ord_impl)] +impl PartialOrd for Span { + fn partial_cmp(&self, other: &Self) -> Option { + let self_ptr_value = Arc::as_ptr(&self.source_file) as usize; + let other_ptr_value = Arc::as_ptr(&other.source_file) as usize; + + Some(self_ptr_value.cmp(&other_ptr_value).then_with(|| { + self.start + .cmp(&other.start) + .then_with(|| self.end.cmp(&other.end)) + })) + } +} + +impl Ord for Span { + fn cmp(&self, other: &Self) -> Ordering { + let self_ptr_value = Arc::as_ptr(&self.source_file) as usize; + let other_ptr_value = Arc::as_ptr(&other.source_file) as usize; + + self_ptr_value + .cmp(&other_ptr_value) + .then_with(|| self.start.cmp(&other.start)) + .then_with(|| self.end.cmp(&other.end)) + } +} + +impl std::hash::Hash for Span { + fn hash(&self, state: &mut H) { + self.start.hash(state); + self.end.hash(state); + Arc::as_ptr(&self.source_file).hash(state); + } +} + +impl Span { + /// Create a span from the given start and end byte indices in the source file. + /// + /// # Parameters + /// - `start`: The start byte index of the span. + /// - `end`: The end byte index of the span (exclusive). 
+ #[must_use] + pub fn new(source_file: Arc, start: usize, end: usize) -> Option { + if start > end + || !source_file.content().is_char_boundary(start) + || source_file.content().len() < end + || (source_file.content().len() + 1 != end + && !source_file.content().is_char_boundary(end)) + { + return None; + } + + Some(Self { + start, + end, + source_file, + }) + } + + /// Create a span from the given start byte index to the end of the source file. + #[must_use] + pub fn to_end(source_file: Arc, start: usize) -> Option { + if !source_file.content().is_char_boundary(start) { + return None; + } + Some(Self { + start, + end: source_file.content().len(), + source_file, + }) + } + + /// Get the string slice of the source code that the span represents. + #[must_use] + pub fn str(&self) -> &str { + &self.source_file.content()[self.start..self.end] + } + + /// Get the starting [`Location`] of the span. + #[must_use] + pub fn start_location(&self) -> Location { + self.source_file.get_location(self.start).unwrap() + } + + /// Get the ending [`Location`] of the span. + /// + /// Returns [`None`] if the end of the span is the end of the source file. + #[must_use] + pub fn end_location(&self) -> Option { + self.source_file.get_location(self.end) + } + + /// Join the starting position of this span with the end position of the given span. + #[must_use] + pub fn join(&self, end: &Self) -> Option { + if !Arc::ptr_eq(&self.source_file, &end.source_file) || self.start > end.end { + return None; + } + + Some(Self { + start: self.start, + end: end.end, + source_file: self.source_file.clone(), + }) + } +} + +/// Pointing to a particular location in a source file. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] +pub struct Location { + /// Line number of the location (starts at 1). + pub line: usize, + + /// Column number of the location (starts at 1). + pub column: usize, +} + +/// Represents an element that is located within a source file. 
+pub trait SourceElement { + /// Get the span location of the element. + fn span(&self) -> Span; +} + +impl SourceElement for Box { + fn span(&self) -> Span { + self.as_ref().span() + } +} + +/// Iterator iterating over the characters in a source file that can be peeked at. +#[derive(Debug, Clone, CopyGetters)] +pub struct SourceIterator<'a> { + /// Get the source file that the iterator is iterating over. + #[get_copy = "pub"] + source_file: &'a Arc, + iterator: Peekable>, +} +impl<'a> SourceIterator<'a> { + /// Peek at the next character in the source file. + pub fn peek(&mut self) -> Option<(usize, char)> { + self.iterator.peek().copied() + } +} +impl<'a> Iterator for SourceIterator<'a> { + type Item = (usize, char); + + fn next(&mut self) -> Option { + self.iterator.next() + } +} + +/// Get the byte positions of the lines in the given text. +fn get_line_byte_positions(text: &str) -> Vec> { + let mut current_position = 0; + let mut results = Vec::new(); + + let mut skip = false; + + for (byte, char) in text.char_indices() { + if skip { + skip = false; + continue; + } + + // lf + if char == '\n' { + #[allow(clippy::range_plus_one)] + results.push(current_position..byte + 1); + + current_position = byte + 1; + } + + // crlf + if char == '\r' { + if text.as_bytes().get(byte + 1) == Some(&b'\n') { + results.push(current_position..byte + 2); + + current_position = byte + 2; + + skip = true; + } else { + #[allow(clippy::range_plus_one)] + results.push(current_position..byte + 1); + + current_position = byte + 1; + } + } + } + + // add the last line + results.push(current_position..text.len()); + + results +} diff --git a/src/lexical/error.rs b/src/lexical/error.rs new file mode 100644 index 0000000..d80dfd4 --- /dev/null +++ b/src/lexical/error.rs @@ -0,0 +1,62 @@ +use std::fmt::Display; + +use getset::Getters; + +use crate::base::{ + log::{Message, Severity, SourceCodeDisplay}, + source_file::Span, +}; + +use super::token_stream::Delimiter; + +/// Represents an error 
that occurred during the lexical analysis of the source code. +#[allow(missing_docs)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, thiserror::Error)] +pub enum Error { + #[error("Comment is not terminated.")] + UnterminatedDelimitedComment(#[from] UnterminatedDelimitedComment), + #[error("Delimiter is not terminated.")] + UndelimitedDelimiter(#[from] UndelimitedDelimiter), +} + +/// Source code contains an unclosed `/*` comment. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Getters, thiserror::Error)] +pub struct UnterminatedDelimitedComment { + /// Span of the unclosed `/*` that starts the comment. + pub span: Span, +} + +impl Display for UnterminatedDelimitedComment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}\n{}", + Message::new(Severity::Error, "found an unclosed `/*` comment"), + SourceCodeDisplay::new(&self.span, Option::::None) + ) + } +} + +/// Delimiter is not closed by its corresponding closing pair. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Getters, thiserror::Error)] +pub struct UndelimitedDelimiter { + /// Span of the opening delimiter. + pub opening_span: Span, + + /// Kind of the delimiter. + pub delimiter: Delimiter, +} + +impl Display for UndelimitedDelimiter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}\n{}", + Message::new(Severity::Error, "found an undelimited delimiter"), + SourceCodeDisplay::new( + &self.opening_span, + Some("this delimiter is not closed by its corresponding closing pair") + ) + ) + } +} diff --git a/src/lexical/mod.rs b/src/lexical/mod.rs new file mode 100644 index 0000000..440fef4 --- /dev/null +++ b/src/lexical/mod.rs @@ -0,0 +1,8 @@ +//! The lexical module is responsible for converting raw text into a stream of tokens that the parser can understand. 
+ +pub mod token_stream; + +pub mod token; + +mod error; +pub use error::Error; diff --git a/src/lexical/token.rs b/src/lexical/token.rs new file mode 100644 index 0000000..7871734 --- /dev/null +++ b/src/lexical/token.rs @@ -0,0 +1,411 @@ +//! Contains the [`Token`] struct and its related types. + +use std::{collections::HashMap, str::FromStr, sync::OnceLock}; + +use crate::base::{ + source_file::{SourceElement, SourceIterator, Span}, + Handler, +}; +use derive_more::From; +use strum::IntoEnumIterator; +use strum_macros::EnumIter; + +use super::{error::UnterminatedDelimitedComment, Error}; + +/// Is an enumeration representing keywords in shulkerscript. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, EnumIter)] +#[allow(missing_docs)] +pub enum KeywordKind { + Function, + If, + Else, +} + +impl std::fmt::Display for KeywordKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +/// Is an error that is returned when a string cannot be parsed into a [`Keyword`] in [`FromStr`] +/// trait implementation. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default, thiserror::Error)] +#[error("invalid string representation of keyword.")] +pub struct KeywordParseError; + +impl FromStr for KeywordKind { + type Err = KeywordParseError; + + fn from_str(s: &str) -> Result { + static STRING_KEYWORD_MAP: OnceLock> = OnceLock::new(); + let map = STRING_KEYWORD_MAP.get_or_init(|| { + let mut map = HashMap::new(); + + for keyword in Self::iter() { + map.insert(keyword.as_str(), keyword); + } + + map + }); + + map.get(s).copied().ok_or(KeywordParseError) + } +} + +impl KeywordKind { + /// Gets the string representation of the keyword as a `&str`. + #[must_use] + pub fn as_str(self) -> &'static str { + match self { + Self::Function => "fn", + Self::If => "if", + Self::Else => "else", + } + } +} + +/// Is an enumeration containing all kinds of tokens in the `ShulkerScript` language. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, From)] +#[allow(missing_docs)] +pub enum Token { + WhiteSpaces(WhiteSpaces), + Identifier(Identifier), + Keyword(Keyword), + Punctuation(Punctuation), + Numeric(Numeric), + Comment(Comment), +} + +impl Token { + /// Returns the span of the token. + #[must_use] + pub fn span(&self) -> &Span { + match self { + Self::WhiteSpaces(token) => &token.span, + Self::Identifier(token) => &token.span, + Self::Keyword(token) => &token.span, + Self::Punctuation(token) => &token.span, + Self::Numeric(token) => &token.span, + Self::Comment(token) => &token.span, + } + } +} + +impl SourceElement for Token { + fn span(&self) -> Span { + match self { + Self::WhiteSpaces(token) => token.span(), + Self::Identifier(token) => token.span(), + Self::Keyword(token) => token.span(), + Self::Punctuation(token) => token.span(), + Self::Numeric(token) => token.span(), + Self::Comment(token) => token.span(), + } + } +} + +/// Represents a contiguous sequence of whitespace characters. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct WhiteSpaces { + /// Is the span that makes up the token. + pub span: Span, +} + +impl SourceElement for WhiteSpaces { + fn span(&self) -> Span { + self.span.clone() + } +} +/// Represents a contiguous sequence of characters that are valid in an identifier. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Identifier { + /// Is the span that makes up the token. + pub span: Span, +} + +impl SourceElement for Identifier { + fn span(&self) -> Span { + self.span.clone() + } +} + +/// Represents a contiguous sequence of characters that are reserved for a keyword. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Keyword { + /// Is the span that makes up the token. + pub span: Span, + + /// Is the [`KeywordKind`] that the token represents. 
+ pub keyword: KeywordKind, +} + +impl SourceElement for Keyword { + fn span(&self) -> Span { + self.span.clone() + } +} + +/// Represents a single ASCII punctuation character. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Punctuation { + /// Is the span that makes up the token. + pub span: Span, + + /// Is the ASCII punctuation character that the token represents. + pub punctuation: char, +} + +impl SourceElement for Punctuation { + fn span(&self) -> Span { + self.span.clone() + } +} + +/// Represents a hardcoded numeric literal value in the source code. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Numeric { + /// Is the span that makes up the token. + pub span: Span, +} + +impl SourceElement for Numeric { + fn span(&self) -> Span { + self.span.clone() + } +} + +/// Is an enumeration representing the two kinds of comments in the `ShulkerScript` language. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum CommentKind { + /// A comment that starts with `//` and ends at the end of the line. + Line, + + /// A comment that starts with `/*` and ends with `*/`. + Delimited, +} + +/// Represents a portion of the source code that is ignored by the interpreter. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Comment { + /// Is the span that makes up the token. + pub span: Span, + + /// Is the kind of comment that the token represents. + pub kind: CommentKind, +} + +impl SourceElement for Comment { + fn span(&self) -> Span { + self.span.clone() + } +} + +/// Is an error that can occur when invoking the [`Token::tokenize`] method. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, thiserror::Error, From)] +#[allow(missing_docs)] +pub enum TokenizeError { + #[error("encountered a fatal lexical error that causes the process to stop.")] + FatalLexicalError, + + #[error("the iterator argument is at the end of the source code.")] + EndOfSourceCodeIteratorArgument, +} + +impl Token { + /// Increments the iterator while the predicate returns true. + pub fn walk_iter(iter: &mut SourceIterator, predicate: impl Fn(char) -> bool) { + while let Some((_, character)) = iter.peek() { + if !predicate(character) { + break; + } + + iter.next(); + } + } + + /// Creates a span from the given start location to the current location of the iterator. + fn create_span(start: usize, iter: &mut SourceIterator) -> Span { + iter.peek().map_or_else( + || Span::to_end(iter.source_file().clone(), start).unwrap(), + |(index, _)| Span::new(iter.source_file().clone(), start, index).unwrap(), + ) + } + + /// Checks if the given character is a valid first character of an identifier. + fn is_first_identifier_character(character: char) -> bool { + character == '_' + || (!character.is_control() + && !character.is_whitespace() + && !character.is_ascii_punctuation() + && !character.is_ascii_digit()) + } + + /// Checks if the given character is a valid character of an identifier. + fn is_identifier_character(character: char) -> bool { + character == '_' + || (!character.is_control() + && !character.is_whitespace() + && !character.is_ascii_punctuation()) + } + + /// Handles a contiguous sequence of whitespace characters. + fn handle_whitespace(iter: &mut SourceIterator, start: usize) -> Self { + Self::walk_iter(iter, char::is_whitespace); + + WhiteSpaces { + span: Self::create_span(start, iter), + } + .into() + } + + /// Handles a contiguous sequence of characters that are valid in an identifier. 
+ fn handle_identifier_and_keyword(iter: &mut SourceIterator, start: usize) -> Self { + Self::walk_iter(iter, Self::is_identifier_character); + + let span = Self::create_span(start, iter); + let word = span.str(); + + // Checks if the word is a keyword + KeywordKind::from_str(word).ok().map_or_else( + || Identifier { span: span.clone() }.into(), + |kw| { + Keyword { + span: span.clone(), + keyword: kw, + } + .into() + }, + ) + } + + /// Handles a sequence starting with a slash (line comment, delimited comment or punctuation) + fn handle_comment( + iter: &mut SourceIterator, + start: usize, + character: char, + handler: &impl Handler, + ) -> Result { + // Single line comment + if let Some((_, '/')) = iter.peek() { + iter.next(); + + Self::walk_iter(iter, |character| !(character == '\n' || character == '\r')); + + let is_cr = iter + .next() + .map_or(false, |(_, character)| character == '\r'); + + if let (true, Some((_, '\n'))) = (is_cr, iter.peek()) { + // the terminator was a cr: also consume the lf of a crlf pair + iter.next(); + } + + Ok(Comment { + span: Self::create_span(start, iter), + kind: CommentKind::Line, + } + .into()) + } + // Delimited comment + else if let Some((_, '*')) = iter.peek() { + iter.next(); + + let mut is_terminated = false; + + while let Some((_, character)) = iter.next() { + if character == '*' { + if let Some((_, '/')) = iter.peek() { + iter.next(); + + is_terminated = true; + + break; + } + } + } + + // Checks if the comment is terminated + if is_terminated { + Ok(Comment { + span: Self::create_span(start, iter), + kind: CommentKind::Delimited, + } + .into()) + } else { + handler.receive( + UnterminatedDelimitedComment { + span: Span::new(iter.source_file().clone(), start, start + 2).unwrap(), + } + .into(), + ); + return Err(TokenizeError::FatalLexicalError); + } + } + // Just a single slash punctuation + else { + Ok(Punctuation { + span: Self::create_span(start, iter), + punctuation: character, + } + .into()) + } + } + + /// Handles a sequence of digits + fn handle_numeric_literal(iter: &mut SourceIterator, start: usize) -> 
Self { + // Tokenizes the whole number part + Self::walk_iter(iter, |character| character.is_ascii_digit()); + + Numeric { + span: Self::create_span(start, iter), + } + .into() + } + + /// Lexes the source code from the given iterator. + /// + /// The tokenization starts at the current location of the iterator. The function moves the + /// iterator at least once and forwards it until it makes a token. After the token is made, the + /// iterator is left at the next character that is not part of the token. + /// + /// # Errors + /// - [`TokenizeError::EndOfSourceCodeIteratorArgument`] - The iterator argument is at the end of the + /// source code. + /// - [`TokenizeError::FatalLexicalError`] - A fatal lexical error occurred. + pub fn tokenize( + iter: &mut SourceIterator, + handler: &impl Handler, + ) -> Result { + // Gets the first character + let (start, character) = iter + .next() + .ok_or(TokenizeError::EndOfSourceCodeIteratorArgument)?; + + // Found white spaces + if character.is_whitespace() { + Ok(Self::handle_whitespace(iter, start)) + } + // Found identifier/keyword + else if Self::is_first_identifier_character(character) { + Ok(Self::handle_identifier_and_keyword(iter, start)) + } + // Found comment/single slash punctuation + else if character == '/' { + Self::handle_comment(iter, start, character, handler) + } + // Found numeric literal + else if character.is_ascii_digit() { + Ok(Self::handle_numeric_literal(iter, start)) + } + // Found a punctuation + else if character.is_ascii_punctuation() { + Ok(Punctuation { + span: Self::create_span(start, iter), + punctuation: character, + } + .into()) + } else { + unreachable!("all cases covered before") + } + } +} diff --git a/src/lexical/token_stream.rs b/src/lexical/token_stream.rs new file mode 100644 index 0000000..b43657a --- /dev/null +++ b/src/lexical/token_stream.rs @@ -0,0 +1,195 @@ +//! Contains the [`TokenStream`] struct and its related types. 
+ +use std::{fmt::Debug, sync::Arc}; + +use derive_more::{Deref, From}; + +use crate::base::{source_file::SourceFile, Handler}; + +use super::{ + error::{self, UndelimitedDelimiter}, + token::{Punctuation, Token, TokenizeError}, +}; + +/// Is a list of well structured [`TokenTree`]s. +/// +/// This struct is the final output of the lexical analysis phase and is meant to be used by the +/// next stage of the compilation process. +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Deref)] +pub struct TokenStream { + #[deref] + token_trees: Vec, +} + +impl Debug for TokenStream { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_list().entries(self.token_trees.iter()).finish() + } +} + +impl TokenStream { + /// Tokenizes the given source code. + /// + /// This function tokenizes the content of the given source file by calling the + /// [`Token::tokenize()`] repeatedly until the iterator is exhausted. + /// + /// # Parameters + /// - `source_file`: The source file to tokenize; `handler` receives the lexical errors. + /// + /// # Returns + /// The stream of token trees built from the successfully tokenized tokens; lexical errors are + /// reported to `handler` instead of being returned. + #[must_use] + pub fn tokenize(source_file: &Arc, handler: &impl Handler) -> Self { + // The flat list of tokens that is structured into token trees afterwards. + let mut tokens = Vec::new(); + let mut source_file_iterator = source_file.iter(); + + // Tokenize the source code. 
+ loop { + match Token::tokenize(&mut source_file_iterator, handler) { + Ok(token) => tokens.push(token), + Err(TokenizeError::EndOfSourceCodeIteratorArgument) => { + break; + } + Err(TokenizeError::FatalLexicalError) => (), + } + } + + // reverse to use pop() instead of remove(0) + tokens.reverse(); + + // stucture the tokens into a token stream + let mut token_trees = Vec::new(); + while let Some(token_tree) = Self::handle_token(&mut tokens, handler) { + token_trees.push(token_tree); + } + + Self { token_trees } + } + + /// Handles a token. + fn handle_token( + tokens: &mut Vec, + handler: &impl Handler, + ) -> Option { + tokens + .pop() + .and_then(|token| Self::handle_popped_token(tokens, token, handler)) + } + + /// Handles a token after it has been popped. + fn handle_popped_token( + tokens: &mut Vec, + popped_token: Token, + handler: &dyn Handler, + ) -> Option { + match popped_token { + Token::Punctuation(punc) if punc.punctuation == '{' => { + Self::handle_delimited(tokens, punc, Delimiter::Brace, handler) + .map(TokenTree::Delimited) + } + Token::Punctuation(punc) if punc.punctuation == '[' => { + Self::handle_delimited(tokens, punc, Delimiter::Bracket, handler) + .map(TokenTree::Delimited) + } + Token::Punctuation(punc) if punc.punctuation == '(' => { + Self::handle_delimited(tokens, punc, Delimiter::Parenthesis, handler) + .map(TokenTree::Delimited) + } + token => Some(TokenTree::Token(token)), + } + } + + /// Handles a delimited token. 
+ fn handle_delimited( + tokens: &mut Vec, + open: Punctuation, + delimiter: Delimiter, + handler: &dyn Handler, + ) -> Option { + let mut token_trees = Vec::new(); + + while let Some(token) = tokens.pop() { + match (token, delimiter) { + (Token::Punctuation(p), Delimiter::Brace) if p.punctuation == '}' => { + return Some(Delimited { + open, + token_stream: Self { token_trees }, + close: p, + delimiter, + }); + } + (Token::Punctuation(punc), Delimiter::Bracket) if punc.punctuation == ']' => { + return Some(Delimited { + open, + token_stream: Self { token_trees }, + close: punc, + delimiter, + }) + } + (Token::Punctuation(punc), Delimiter::Parenthesis) if punc.punctuation == ')' => { + return Some(Delimited { + open, + token_stream: Self { token_trees }, + close: punc, + delimiter, + }) + } + (token, _) => { + let Some(token_tree) = Self::handle_popped_token(tokens, token, handler) else { + break; + }; + + token_trees.push(token_tree); + } + } + } + + handler.receive(error::Error::UndelimitedDelimiter(UndelimitedDelimiter { + opening_span: open.span, + delimiter, + })); + + None + } + + /// Dissolves this struct into a tuple of its components. + #[must_use] + pub fn dissolve(self) -> Vec { + self.token_trees + } +} + +/// Is an enumeration of either a [`Token`] or a [`Delimited`]. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, From)] +#[allow(missing_docs)] +pub enum TokenTree { + Token(Token), + Delimited(Delimited), +} + +/// Is an enumeration of the different types of delimiters in the [`Delimited`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[allow(missing_docs)] +pub enum Delimiter { + Parenthesis, + Brace, + Bracket, +} + +/// Represents a list of tokens enclosed by a pair of delimiters. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Delimited { + /// The opening delimiter. + pub open: Punctuation, + + /// The stream of tokens inside the delimiter. 
+ pub token_stream: TokenStream, + + /// The closing delimiter. + pub close: Punctuation, + + /// The type of delimiter. + pub delimiter: Delimiter, +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..4d90758 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,70 @@ +//! The `ShulkerScript` language. +//! +//! `ShulkerScript` is a simple, imperative scripting language for creating Minecraft data packs. + +#![deny( + missing_docs, + missing_debug_implementations, + missing_copy_implementations, + clippy::all, + clippy::pedantic, + clippy::nursery, + rustdoc::broken_intra_doc_links, + clippy::missing_errors_doc +)] +#![allow(clippy::missing_panics_doc, clippy::missing_const_for_fn)] + +pub mod base; +pub mod lexical; + +use std::{cell::Cell, fmt::Display, path::PathBuf}; + +use base::{source_file::SourceFile, Handler, Result}; + +use crate::{base::Error, lexical::token_stream::TokenStream}; + +/// Compiles the given source code. +/// +/// # Errors +/// - If an error occurs while reading the file. +pub fn compile(path: PathBuf) -> Result<()> { + let source_file = SourceFile::load(path)?; + + let printer = Printer::new(); + + let tokens = TokenStream::tokenize(&source_file, &printer); + + println!("{tokens:#?}"); + + if printer.has_printed() { + return Err(Error::Other( + "An error occurred while tokenizing the source code.", + )); + } + + Ok(()) +} + +struct Printer { + printed: Cell, +} + +impl Printer { + /// Creates a new [`Printer`]. + fn new() -> Self { + Self { + printed: Cell::new(false), + } + } + + fn has_printed(&self) -> bool { + self.printed.get() + } +} + +impl Handler for Printer { + fn receive(&self, error: E) { + eprintln!("{error}"); + self.printed.set(true); + } +}