diff --git a/trixy/trixy-lang_parser/.gitignore b/trixy/trixy-lang_parser/.gitignore
new file mode 100644
index 0000000..72fc7e3
--- /dev/null
+++ b/trixy/trixy-lang_parser/.gitignore
@@ -0,0 +1,6 @@
+# build
+/target
+/result
+
+# trixy-lang_parser is a library
+Cargo.lock
diff --git a/trixy/trixy-lang_parser/Cargo.toml b/trixy/trixy-lang_parser/Cargo.toml
new file mode 100644
index 0000000..500ee94
--- /dev/null
+++ b/trixy/trixy-lang_parser/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "trixy-lang_parser"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+clap = { version = "4.4.11", features = ["derive"] }
+thiserror = "1.0.50"
+
+[dev-dependencies]
+pretty_assertions = "1.4.0"
diff --git a/trixy/trixy-lang_parser/example/example.tri b/trixy/trixy-lang_parser/example/example.tri
new file mode 100644
index 0000000..c9b5c9a
--- /dev/null
+++ b/trixy/trixy-lang_parser/example/example.tri
@@ -0,0 +1,9 @@
+fn print(message: CommandTransferValue);
+
+nasp trinitrix {
+    fn hi(name: String) -> String;
+}
+
+
+// That's a flat out lie, but it results in a rather nice syntax highlight compared to nothing:
+// vim: syntax=rust
diff --git a/trixy/trixy-lang_parser/example/example_simple.tri b/trixy/trixy-lang_parser/example/example_simple.tri
new file mode 100644
index 0000000..8cdb691
--- /dev/null
+++ b/trixy/trixy-lang_parser/example/example_simple.tri
@@ -0,0 +1,11 @@
+fn print(message: CommandTransferValue);
+
+nasp trinitrix {
+    fn hi(name: String) -> String;
+}
+namespace commands { >-
+}
+
+
+// That's a flat out lie, but it results in a rather nice syntax highlight compared to nothing:
+// vim: syntax=rust
diff --git a/trixy/trixy-lang_parser/src/command_spec/mod.rs b/trixy/trixy-lang_parser/src/command_spec/mod.rs
new file mode 100644
index 0000000..2832a12
--- /dev/null
+++ b/trixy/trixy-lang_parser/src/command_spec/mod.rs
@@ -0,0 +1,44 @@
+//! Data structures describing a parsed Trixy command specification.
+
+/// The top-level result of parsing: a list of declarations.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct CommandSpec {
+    pub(crate) declarations: Vec<Declaration>,
+}
+
+/// A declared item together with the namespaces it is nested in.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) struct Declaration {
+    pub(crate) namespace: Vec<Namespace>,
+    pub(crate) genus: Genus,
+}
+
+/// A namespace, identified by its name.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) struct Namespace {
+    pub(crate) name: String,
+}
+
+/// The kind of item a [`Declaration`] describes.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) enum Genus {
+    Function {
+        name: String,
+        inputs: Vec<NamedType>,
+        output: Type,
+    },
+}
+
+/// A name paired with its type, as used for function inputs.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) struct NamedType {
+    pub(crate) name: String,
+    pub(crate) base: Type,
+}
+
+/// The types usable in a declaration.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) enum Type {
+    String,
+    Void,
+}
diff --git a/trixy/trixy-lang_parser/src/error.rs b/trixy/trixy-lang_parser/src/error.rs
new file mode 100644
index 0000000..e0681a3
--- /dev/null
+++ b/trixy/trixy-lang_parser/src/error.rs
@@ -0,0 +1,11 @@
+use thiserror::Error;
+
+use crate::lexing::error::SpannedLexingError;
+
+/// The top-level error type returned by this crate.
+#[derive(Error, Debug)]
+pub enum TrixyError {
+    /// The input could not be turned into tokens.
+    #[error(transparent)]
+    Parsing(#[from] SpannedLexingError),
+}
diff --git a/trixy/trixy-lang_parser/src/lexing/error.rs b/trixy/trixy-lang_parser/src/lexing/error.rs
new file mode 100644
index 0000000..12177fb
--- /dev/null
+++ b/trixy/trixy-lang_parser/src/lexing/error.rs
@@ -0,0 +1,101 @@
+use std::{error::Error, fmt::Display};
+use thiserror::Error;
+
+/// The ways in which turning source text into tokens can fail.
+#[derive(Error, Debug)]
+pub enum LexingError {
+    #[error("No matches were found")]
+    NoMatchesTaken,
+    #[error("Expected a token, but reached end of file")]
+    UnexpectedEOF,
+    #[error("Char ('{0}') is not a known token!")]
+    UnknownCharacter(char),
+    #[error("The Arrow token must be of the form: ->")]
+    ExpectedArrow,
+}
+
+/// A [`LexingError`] together with the source context needed to render it.
+#[derive(Debug)]
+pub enum SpannedLexingError {
+    Error {
+        source: LexingError,
+        /// The starting offset of the error in the source file
+        start: usize,
+        /// The column of the error within the context line
+        contexted_start: usize,
+        /// The line above the error
+        line_above: String,
+        /// The line below the error
+        line_below: String,
+        /// The line in which the error occurred
+        line: String,
+        /// The line number of the main error line
+        line_number: usize,
+    },
+}
+
+impl Error for SpannedLexingError {
+    fn source(&self) -> Option<&(dyn Error + 'static)> {
+        let Self::Error { source, .. } = self;
+        Some(source)
+    }
+}
+
+impl Display for SpannedLexingError {
+    /// Render the error message, the offending line with its non-empty neighbours
+    /// and a marker line pointing at the offending column, using ANSI colours.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self::Error {
+            source,
+            line_above,
+            line_below,
+            line,
+            line_number,
+            contexted_start,
+            ..
+        } = self;
+
+        // Width of the `<line_number> | ` gutter that precedes the source line.
+        let gutter = line_number.to_string().len() + " | ".len();
+        // The dashes run from the marker to the end of the line. Saturating, so
+        // that an empty or short line cannot underflow and panic.
+        let dashes = line
+            .chars()
+            .count()
+            .saturating_sub(1)
+            .saturating_sub(*contexted_start);
+        let explanation = match source {
+            LexingError::NoMatchesTaken => "This token does not produce a possible match".to_owned(),
+            LexingError::UnexpectedEOF => "This eof was completely unexpected".to_owned(),
+            LexingError::UnknownCharacter(char) => format!("This char: `{char}`; is not a valid token"),
+            LexingError::ExpectedArrow => "The `-` token is interpreted as a started arrow (`->`), but we could not find the arrow tip (`>`)".to_owned(),
+        };
+
+        writeln!(f, "\x1b[31;1merror: \x1b[37;1m{}\x1b[0m", source)?;
+        if !line_above.is_empty() {
+            writeln!(
+                f,
+                "\x1b[32;1m{} |\x1b[0m {}",
+                line_number.saturating_sub(1),
+                line_above
+            )?;
+        }
+        writeln!(f, "\x1b[36;1m{} |\x1b[0m {}", line_number, line)?;
+        writeln!(
+            f,
+            "\x1b[92;1m{}^{} {}\x1b[0m",
+            " ".repeat(gutter + *contexted_start),
+            "-".repeat(dashes),
+            explanation
+        )?;
+        if !line_below.is_empty() {
+            writeln!(
+                f,
+                "\x1b[32;1m{} |\x1b[0m {}",
+                line_number + 1,
+                line_below
+            )?;
+        }
+        Ok(())
+    }
+}
diff --git a/trixy/trixy-lang_parser/src/lexing/mod.rs b/trixy/trixy-lang_parser/src/lexing/mod.rs
new file mode 100644
index 0000000..d601cb6
--- /dev/null
+++ b/trixy/trixy-lang_parser/src/lexing/mod.rs
@@ -0,0 +1,88 @@
+use self::{error::SpannedLexingError, tokenizer::Tokenizer};
+
+pub mod error;
+mod tokenizer;
+
+#[cfg(test)]
+mod test;
+
+/// The tokens lexed from one source text, together with that text.
+#[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
+pub struct TokenStream {
+    original_file: String,
+    tokens: Vec<Token>,
+}
+
+impl TokenStream {
+    /// Turn a string of valid Trixy code into a list of tokens, including the
+    /// location of that token's start and end point in the original source code.
+    ///
+    /// Note the token indices represent the half-open interval `[start, end)`,
+    /// equivalent to `start .. end` in Rust.
+    ///
+    /// # Errors
+    /// Returns a [`SpannedLexingError`] for the first piece of input that is not a token.
+    pub fn lex(src: &str) -> Result<Self, SpannedLexingError> {
+        let mut tokenizer = Tokenizer::new(src);
+        let mut tokens = Vec::new();
+
+        while let Some(tok) = tokenizer.next_token()? {
+            tokens.push(tok);
+        }
+
+        Ok(Self {
+            tokens,
+            original_file: src.to_owned(),
+        })
+    }
+}
+
+/// A token span is recorded in bytes starting from the beginning of the file:
+/// A token span like this, for example:
+/// ```ignore
+/// TokenSpan {
+///     start: 20,
+///     end: 23,
+/// }
+/// ```
+/// signals, that the token starts at byte 20 of the source file and ends just before byte 23.
+#[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
+pub struct TokenSpan {
+    /// The start of the token span; `end` is one past its last position
+    start: usize,
+    end: usize,
+}
+
+/// A Token
+#[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
+pub struct Token {
+    /// The token's original location in the source file
+    span: TokenSpan,
+    kind: TokenKind,
+}
+
+/// Possible kinds of tokens
+#[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
+pub enum TokenKind {
+    Keyword(Keyword),
+    Identifier(String),
+    Colon,
+    Semicolon,
+    Comma,
+    Arrow,
+    BraceOpen,
+    BraceClose,
+    ParenthesisOpen,
+    ParenthesisClose,
+}
+
+/// Keywords used in the language
+#[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
+pub enum Keyword {
+    /// Start a namespace declaration
+    #[allow(non_camel_case_types)]
+    nasp,
+    /// Start a function declaration
+    #[allow(non_camel_case_types)]
+    r#fn,
+}
diff --git a/trixy/trixy-lang_parser/src/lexing/test.rs b/trixy/trixy-lang_parser/src/lexing/test.rs
new file mode 100644
index 0000000..43665ad
--- /dev/null
+++ b/trixy/trixy-lang_parser/src/lexing/test.rs
@@ -0,0 +1,228 @@
+use crate::lexing::{error::SpannedLexingError, Keyword, Token, TokenKind, TokenSpan};
+
+use super::TokenStream;
+
+use pretty_assertions::assert_eq;
+
+#[test]
+fn test_lexing_trixy() {
+    let input = "
+nasp commands {
+    fn expect(event: String) -> String;
+}
+";
+    let token_stream = TokenStream::lex(input).unwrap();
+    let expected_token_stream = {
+        let tokens = vec![
+            Token {
+                span: TokenSpan { start: 1, end: 5 },
+                kind: TokenKind::Keyword(Keyword::nasp),
+            },
+            Token {
+                span: TokenSpan { start: 6, end: 14 },
+                kind: TokenKind::Identifier("commands".to_owned()),
+            },
+            Token {
+                span: TokenSpan { start: 15, end: 16 },
+                kind: TokenKind::BraceOpen,
+            },
+            Token {
+                span: TokenSpan { start: 21, end: 23 },
+                kind: TokenKind::Keyword(Keyword::r#fn),
+            },
+            Token {
+                span: TokenSpan { start: 24, end: 30 },
+                kind: TokenKind::Identifier("expect".to_owned()),
+            },
+            Token {
+                span: TokenSpan { start: 30, end: 31 },
+                kind: TokenKind::ParenthesisOpen,
+            },
+            Token {
+                span: TokenSpan { start: 31, end: 36 },
+                kind: TokenKind::Identifier("event".to_owned()),
+            },
+            Token {
+                span: TokenSpan { start: 36, end: 37 },
+                kind: TokenKind::Colon,
+            },
+            Token {
+                span: TokenSpan { start: 38, end: 44 },
+                kind: TokenKind::Identifier("String".to_owned()),
+            },
+            Token {
+                span: TokenSpan { start: 44, end: 45 },
+                kind: TokenKind::ParenthesisClose,
+            },
+            Token {
+                span: TokenSpan { start: 46, end: 48 },
+                kind: TokenKind::Arrow,
+            },
+            Token {
+                span: TokenSpan { start: 49, end: 55 },
+                kind: TokenKind::Identifier("String".to_owned()),
+            },
+            Token {
+                span: TokenSpan { start: 55, end: 56 },
+                kind: TokenKind::Semicolon,
+            },
+            Token {
+                span: TokenSpan { start: 57, end: 58 },
+                kind: TokenKind::BraceClose,
+            },
+        ];
+        TokenStream {
+            tokens,
+            original_file: input.to_owned(),
+        }
+    };
+    assert_eq!(token_stream, expected_token_stream)
+}
+
+#[test]
+fn test_failing_lexing() {
+    let input = "
+nasp trinitrix {
+    nasp - commands {
+        fn hi(strings: String) -> String;
+    }
+}
+";
+    let token_stream = TokenStream::lex(input);
+    eprintln!("{}", token_stream.as_ref().unwrap_err());
+
+    // uncomment the next line to see the error message, without having to remove cargo's output filter
+    // assert!(!token_stream.is_err());
+    assert!(token_stream.is_err());
+}
+
+#[test]
+fn test_multiple_tokens() {
+    let input = "
+nasp nasp {{
+}}
+";
+    let token_stream = TokenStream::lex(input).unwrap();
+    let expected_token_stream = {
+        let tokens = vec![
+            Token {
+                span: TokenSpan { start: 1, end: 5 },
+                kind: TokenKind::Keyword(Keyword::nasp),
+            },
+            Token {
+                span: TokenSpan { start: 6, end: 10 },
+                kind: TokenKind::Keyword(Keyword::nasp),
+            },
+            Token {
+                span: TokenSpan { start: 11, end: 12 },
+                kind: TokenKind::BraceOpen,
+            },
+            Token {
+                span: TokenSpan { start: 12, end: 13 },
+                kind: TokenKind::BraceOpen,
+            },
+            Token {
+                span: TokenSpan { start: 14, end: 15 },
+                kind: TokenKind::BraceClose,
+            },
+            Token {
+                span: TokenSpan { start: 15, end: 16 },
+                kind: TokenKind::BraceClose,
+            },
+        ];
+        TokenStream {
+            tokens,
+            original_file: input.to_owned(),
+        }
+    };
+    assert_eq!(token_stream, expected_token_stream)
+}
+
+#[test]
+fn test_comments() {
+    let input = "
+        // Some comment
+        nasp nasp {{
+
+        }}
+        // NOTE(@soispha): We do not support nested multi line comments <2023-12-16>
+        /* Some
+         * multi
+         * line
+         * comment
+         */
+";
+    let token_stream = TokenStream::lex(input)
+        .map_err(|e| {
+            eprintln!("{}", e);
+            panic!();
+        })
+        .unwrap();
+    let expected_token_stream = {
+        let tokens = vec![
+            Token {
+                span: TokenSpan { start: 33, end: 37 },
+                kind: TokenKind::Keyword(Keyword::nasp),
+            },
+            Token {
+                span: TokenSpan { start: 38, end: 42 },
+                kind: TokenKind::Keyword(Keyword::nasp),
+            },
+            Token {
+                span: TokenSpan { start: 43, end: 44 },
+                kind: TokenKind::BraceOpen,
+            },
+            Token {
+                span: TokenSpan { start: 44, end: 45 },
+                kind: TokenKind::BraceOpen,
+            },
+            Token {
+                span: TokenSpan { start: 55, end: 56 },
+                kind: TokenKind::BraceClose,
+            },
+            Token {
+                span: TokenSpan { start: 56, end: 57 },
+                kind: TokenKind::BraceClose,
+            },
+        ];
+        TokenStream {
+            tokens,
+            original_file: input.to_owned(),
+        }
+    };
+    assert_eq!(token_stream, expected_token_stream)
+}
+
+#[test]
+fn test_leading_underscore_identifier() {
+    let input = "_hidden";
+    let token_stream = TokenStream::lex(input).unwrap();
+    let expected_token_stream = TokenStream {
+        tokens: vec![Token {
+            span: TokenSpan { start: 0, end: 7 },
+            kind: TokenKind::Identifier("_hidden".to_owned()),
+        }],
+        original_file: input.to_owned(),
+    };
+    assert_eq!(token_stream, expected_token_stream)
+}
+
+#[test]
+fn test_unterminated_comments() {
+    // A comment running into the end of the file must not panic
+    assert!(TokenStream::lex("nasp // no trailing newline").is_ok());
+    assert!(TokenStream::lex("nasp /* never closed").is_ok());
+}
+
+#[test]
+fn test_error_position_after_multi_byte_chars() {
+    // `\u{e4}` takes two bytes, the error column is still counted in chars
+    match TokenStream::lex("\u{e4}\u{e4}?\n") {
+        Err(SpannedLexingError::Error {
+            line_number,
+            contexted_start,
+            ..
+        }) => assert_eq!((line_number, contexted_start), (0, 2)),
+        Ok(_) => panic!("`?` is not a valid token"),
+    }
+}
diff --git a/trixy/trixy-lang_parser/src/lexing/tokenizer.rs b/trixy/trixy-lang_parser/src/lexing/tokenizer.rs
new file mode 100644
index 0000000..67986e1
--- /dev/null
+++ b/trixy/trixy-lang_parser/src/lexing/tokenizer.rs
@@ -0,0 +1,196 @@
+// This code is heavily inspired by: https://michael-f-bryan.github.io/static-analyser-in-rust/book/lex.html
+
+use crate::lexing::{Keyword, TokenSpan};
+
+use super::{
+    error::{LexingError, SpannedLexingError},
+    Token, TokenKind,
+};
+
+/// Splits a source text into tokens, one [`Tokenizer::next_token`] call at a time.
+pub(super) struct Tokenizer<'a> {
+    /// The byte offset of `remaining_text` within `original_text`
+    current_index: usize,
+    remaining_text: &'a str,
+    original_text: &'a str,
+}
+
+impl<'a> Tokenizer<'a> {
+    pub(super) fn new(input: &'a str) -> Self {
+        Self {
+            current_index: 0,
+            remaining_text: input,
+            original_text: input,
+        }
+    }
+
+    /// Lex the next token, skipping the whitespace and comments in front of it.
+    ///
+    /// Returns `Ok(None)` once the input is exhausted.
+    pub(super) fn next_token(&mut self) -> Result<Option<Token>, SpannedLexingError> {
+        self.skip_ignored_tokens();
+        if self.remaining_text.is_empty() {
+            return Ok(None);
+        }
+
+        let start = self.current_index;
+        let (kind, length) = self
+            .get_next_tokenkind()
+            .map_err(|source| self.spanned_error(source, start))?;
+        self.chomp(length);
+
+        Ok(Some(Token {
+            span: TokenSpan {
+                start,
+                end: self.current_index,
+            },
+            kind,
+        }))
+    }
+
+    /// Attach the source context of the byte offset `start` to `source`.
+    fn spanned_error(&self, source: LexingError, start: usize) -> SpannedLexingError {
+        // `start` is a char boundary: the index only ever advances by whole chars.
+        let before = &self.original_text[..start];
+        let line_number = before.matches('\n').count();
+        // Locate the line through the error position itself; searching the file
+        // for the line's text finds the wrong place if that text is repeated.
+        let line_start = before.rfind('\n').map_or(0, |newline| newline + 1);
+        let contexted_start = before[line_start..].chars().count();
+
+        let lines: Vec<&str> = self.original_text.lines().collect();
+        let line_at = |index: usize| lines.get(index).copied().unwrap_or("").to_owned();
+        let line_above = match line_number.checked_sub(1) {
+            Some(above) => line_at(above),
+            None => String::new(),
+        };
+
+        SpannedLexingError::Error {
+            source,
+            start,
+            contexted_start,
+            line_above,
+            line_below: line_at(line_number + 1),
+            line_number,
+            line: line_at(line_number),
+        }
+    }
+
+    /// Determine the kind and the byte length of the token at the start of the
+    /// remaining text, without consuming it.
+    fn get_next_tokenkind(&self) -> Result<(TokenKind, usize), LexingError> {
+        let next = self
+            .remaining_text
+            .chars()
+            .next()
+            .ok_or(LexingError::UnexpectedEOF)?;
+
+        match next {
+            '(' => Ok((TokenKind::ParenthesisOpen, 1)),
+            ')' => Ok((TokenKind::ParenthesisClose, 1)),
+            '{' => Ok((TokenKind::BraceOpen, 1)),
+            '}' => Ok((TokenKind::BraceClose, 1)),
+            ':' => Ok((TokenKind::Colon, 1)),
+            ';' => Ok((TokenKind::Semicolon, 1)),
+            ',' => Ok((TokenKind::Comma, 1)),
+            '-' => tokenize_arrow(self.remaining_text),
+            // The guard has to test for `_` itself: written as `c @ '_' | c if ..`
+            // it applies to both alternatives and rejects the underscore.
+            c if c == '_' || c.is_alphanumeric() => tokenize_ident(self.remaining_text),
+            other => Err(LexingError::UnknownCharacter(other)),
+        }
+    }
+
+    /// Skip past any whitespace characters or comments.
+    fn skip_ignored_tokens(&mut self) {
+        loop {
+            let ws = self.skip_whitespace();
+            let comments = self.skip_comments();
+
+            if ws + comments == 0 {
+                return;
+            }
+        }
+    }
+
+    /// Skip leading whitespace and return the number of bytes skipped.
+    fn skip_whitespace(&mut self) -> usize {
+        let skip = self.remaining_text.len() - self.remaining_text.trim_start().len();
+        self.chomp(skip);
+        skip
+    }
+
+    /// Skip one leading comment, if there is one, and return the number of bytes skipped.
+    ///
+    /// A comment without its terminator extends to the end of the input.
+    fn skip_comments(&mut self) -> usize {
+        let remaining = self.remaining_text;
+        let pairs = [("//", "\n"), ("/*", "*/")];
+
+        let mut skip = 0;
+        for &(pattern, matcher) in &pairs {
+            if let Some(body) = remaining.strip_prefix(pattern) {
+                // Search behind the opener, so that `/*/` is not taken for a whole comment
+                let leftovers = skip_until(body, matcher);
+                skip = remaining.len() - leftovers.len();
+                break;
+            }
+        }
+        self.chomp(skip);
+        skip
+    }
+
+    /// Advance the tokenizer by `bytes_to_chomp` bytes.
+    fn chomp(&mut self, bytes_to_chomp: usize) {
+        self.remaining_text = &self.remaining_text[bytes_to_chomp..];
+        self.current_index += bytes_to_chomp;
+    }
+}
+
+/// Lex an identifier or keyword from the start of `text`.
+fn tokenize_ident(text: &str) -> Result<(TokenKind, usize), LexingError> {
+    let (got, bytes_read) = take_while(text, |ch| ch == '_' || ch.is_alphanumeric())?;
+
+    // Filter out keywords
+    let tokenkind = match got {
+        "nasp" => TokenKind::Keyword(Keyword::nasp),
+        "fn" => TokenKind::Keyword(Keyword::r#fn),
+        other => TokenKind::Identifier(other.to_string()),
+    };
+
+    Ok((tokenkind, bytes_read))
+}
+
+/// Lex the arrow token (`->`) from the start of `text`.
+fn tokenize_arrow(text: &str) -> Result<(TokenKind, usize), LexingError> {
+    if text.starts_with("->") {
+        Ok((TokenKind::Arrow, 2))
+    } else {
+        Err(LexingError::ExpectedArrow)
+    }
+}
+
+/// Consumes bytes while a predicate evaluates to true.
+fn take_while<F>(data: &str, mut pred: F) -> Result<(&str, usize), LexingError>
+where
+    F: FnMut(char) -> bool,
+{
+    // The offset of the first char failing the predicate, or all of `data`
+    let taken = data.find(|ch: char| !pred(ch)).unwrap_or(data.len());
+
+    if taken == 0 {
+        Err(LexingError::NoMatchesTaken)
+    } else {
+        Ok((&data[..taken], taken))
+    }
+}
+
+/// Skips input up to and including the first occurrence of `pattern`.
+///
+/// Returns the empty end of `src` if `pattern` does not occur at all.
+fn skip_until<'a>(src: &'a str, pattern: &str) -> &'a str {
+    match src.find(pattern) {
+        Some(position) => &src[position + pattern.len()..],
+        None => &src[src.len()..],
+    }
+}
diff --git a/trixy/trixy-lang_parser/src/lib.rs b/trixy/trixy-lang_parser/src/lib.rs
new file mode 100644
index 0000000..1167aea
--- /dev/null
+++ b/trixy/trixy-lang_parser/src/lib.rs
@@ -0,0 +1,67 @@
+//! A parser for the Trixy command specification language.
+
+use error::TrixyError;
+
+use crate::lexing::TokenStream;
+
+use self::command_spec::CommandSpec;
+
+mod command_spec;
+pub mod error;
+pub mod lexing;
+
+/// Parse Trixy source code into a `CommandSpec`.
+///
+/// # Errors
+/// Returns a [`TrixyError`] if `input` cannot be lexed.
+///
+/// # Panics
+/// Always panics on lexable input for now, as only the lexing stage is implemented.
+pub fn parse_trixy_lang(input: &str) -> Result<CommandSpec, TrixyError> {
+    let _input_tokens = TokenStream::lex(input)?;
+
+    todo!()
+}
+
+#[cfg(test)]
+mod test {
+    use crate::{
+        command_spec::{CommandSpec, Declaration, Genus, NamedType, Namespace, Type},
+        parse_trixy_lang,
+    };
+
+    #[test]
+    fn test_function_with_namespace() {
+        let parsed = parse_trixy_lang(
+            "
+    nasp commands {
+        fn say_something(name_to_greet: String, what_to_say: String) -> String;
+    }
+    ",
+        )
+        .unwrap();
+        let expected: CommandSpec = {
+            let declarations = vec![Declaration {
+                namespace: vec![Namespace {
+                    name: "commands".to_owned(),
+                }],
+                genus: Genus::Function {
+                    name: "say_something".to_owned(),
+                    inputs: vec![
+                        NamedType {
+                            name: "name_to_greet".to_owned(),
+                            base: Type::String,
+                        },
+                        NamedType {
+                            name: "what_to_say".to_owned(),
+                            base: Type::String,
+                        },
+                    ],
+                    output: Type::String,
+                },
+            }];
+            CommandSpec { declarations }
+        };
+        assert_eq!(parsed, expected);
+    }
+}
diff --git a/trixy/trixy-lang_parser/src/main.rs b/trixy/trixy-lang_parser/src/main.rs
new file mode 100644
index 0000000..c6f8104
--- /dev/null
+++ b/trixy/trixy-lang_parser/src/main.rs
@@ -0,0 +1,51 @@
+use std::{fs, process::exit};
+
+use trixy_lang_parser::lexing::TokenStream;
+
+use std::path::PathBuf;
+
+use clap::{Parser, Subcommand};
+
+/// A helper command for the trixy-lang_parser crate
+#[derive(Parser, Debug)]
+#[clap(author, version, about, long_about = None)]
+pub struct Args {
+    #[command(subcommand)]
+    /// The subcommand to execute
+    pub subcommand: Command,
+}
+#[derive(Subcommand, Debug)]
+pub enum Command {
+    #[clap(value_parser)]
+    /// Only try to tokenize the file
+    Tokenize {
+        #[clap(value_parser)]
+        /// The file containing the trixy code to tokenize
+        file: PathBuf,
+    },
+}
+
+pub fn main() {
+    let args = Args::parse();
+    match args.subcommand {
+        Command::Tokenize { file } => {
+            let input = match fs::read_to_string(&file) {
+                Ok(input) => input,
+                Err(err) => {
+                    eprintln!("Failed to read `{}`: {}", file.display(), err);
+                    exit(1);
+                }
+            };
+
+            let input_tokens = match TokenStream::lex(&input) {
+                Ok(tokens) => tokens,
+                Err(err) => {
+                    eprintln!("{}", err);
+                    exit(1);
+                }
+            };
+
+            println!("{:#?}", input_tokens);
+        }
+    }
+}