feat(trixy-lang_parser): Add a lexer with error handling for trixy code

This commit is contained in:
Benedikt Peetz 2023-12-16 11:45:23 +01:00
parent cd2dbc516a
commit 3da75f6913
Signed by: bpeetz
GPG Key ID: A5E94010C3A642AD
12 changed files with 798 additions and 0 deletions

6
trixy/trixy-lang_parser/.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
# build
/target
/result
# trixy-lang_parser is a library
Cargo.lock

View File

@ -0,0 +1,11 @@
[package]
name = "trixy-lang_parser"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
clap = { version = "4.4.11", features = ["derive"] }
pretty_assertions = "1.4.0"
thiserror = "1.0.50"

View File

@ -0,0 +1,9 @@
fn print(message: CommandTransferValue);
nasp trinitrix {
fn hi(name: String) -> String;
}
// That's a flat out lie, but it results in a rather nice syntax highlight compared to nothing:
// vim: syntax=rust

View File

@ -0,0 +1,11 @@
fn print(message: CommandTransferValue);
nasp trinitrix {
fn hi(name: String) -> String;
}
namespace commands { >-
}
// That's a flat out lie, but it results in a rather nice syntax highlight compared to nothing:
// vim: syntax=rust

View File

@ -0,0 +1,36 @@
/// The result type of `parse_trixy_lang`: a flat list of declarations.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct CommandSpec {
/// Every declaration contained in the specification.
pub(crate) declarations: Vec<Declaration>,
}
/// A single declared item together with the namespaces it lives in.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct Declaration {
/// The namespaces enclosing the item.
// NOTE(review): presumably ordered from the outermost to the innermost namespace; confirm
// once the parser exists.
pub(crate) namespace: Vec<Namespace>,
/// What kind of item is declared.
pub(crate) genus: Genus,
}
/// A named namespace.
// NOTE(review): presumably introduced by the `nasp` keyword; confirm once the parser exists.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct Namespace {
/// The identifier of the namespace.
pub(crate) name: String,
}
/// The kind of a [`Declaration`]; currently only functions exist.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) enum Genus {
/// A function declaration.
Function {
/// The identifier of the function.
name: String,
/// The named parameters, in declaration order.
inputs: Vec<NamedType>,
/// The return type.
output: Type,
},
}
/// A type with a name attached, e.g. a function parameter.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct NamedType {
/// The name given to the value.
pub(crate) name: String,
/// The type of the value.
pub(crate) base: Type,
}
/// The types known to the language.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) enum Type {
/// A string.
String,
/// No value.
// NOTE(review): presumably the output of a function declared without `->`; confirm once the
// parser exists.
Void,
}

View File

@ -0,0 +1,9 @@
use thiserror::Error;
use crate::lexing::error::SpannedLexingError;
/// The top-level error type returned by this crate.
#[derive(Error, Debug)]
pub enum TrixyError {
/// The input could not be tokenized.
// NOTE(review): the variant is called `Parsing` but carries a *lexing* error; consider
// renaming it when a real parsing error is added.
#[error(transparent)]
Parsing(#[from] SpannedLexingError),
}

View File

@ -0,0 +1,100 @@
use std::{error::Error, fmt::Display};
use thiserror::Error;
/// The reasons for which tokenizing can fail, without any location information.
///
/// See [`SpannedLexingError`] for the variant that carries source context.
#[derive(Error, Debug)]
pub enum LexingError {
/// A predicate-driven scan did not accept even a single character.
#[error("No matches were found")]
NoMatchesTaken,
/// A token was requested although no input was left.
#[error("Expected an token, but reached end of file")]
UnexpectedEOF,
/// The contained character cannot start any token.
#[error("Char ('{0}') is not a know token!")]
UnknownCharacter(char),
/// A `-` was found that is not directly followed by `>`.
#[error("The Arrow token must be of the form: ->")]
ExpectedArrow,
}
/// A [`LexingError`] enriched with the source context needed to render a diagnostic.
#[derive(Debug)]
pub enum SpannedLexingError {
/// The only variant: an error plus its location and the surrounding lines.
Error {
/// The underlying lexing error
source: LexingError,
/// The starting offset of the error in the source file (a byte offset, as the tokenizer
/// advances by UTF-8 byte lengths)
start: usize,
/// The starting offset of the error within the context line (`line`)
contexted_start: usize,
/// The line above the error (empty if there is none)
line_above: String,
/// The line below the error (empty if there is none)
line_below: String,
/// The line in which the error occurred
line: String,
/// The line number of the main error line (zero-based: the number of newlines before the
/// error)
line_number: usize,
},
}
impl Error for SpannedLexingError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
let Self::Error { source, .. } = self;
Some(source)
}
}
impl Display for SpannedLexingError {
    /// Renders the error as a coloured, multi-line diagnostic.
    ///
    /// The output consists of a headline with the underlying error, the line above the error
    /// (if any), the offending line, a marker line (`^---- explanation`) pointing at the
    /// offending column and the line below the error (if any). Colours are emitted as ANSI
    /// escape sequences. Line numbers are printed exactly as stored in `line_number`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self::Error {
            source,
            line_above,
            line_below,
            line,
            line_number,
            contexted_start,
            ..
        } = self;

        let explanation = match source {
            LexingError::NoMatchesTaken => {
                "This token does not produce a possible match".to_owned()
            }
            LexingError::UnexpectedEOF => "This eof was completely unexpected".to_owned(),
            LexingError::UnknownCharacter(char) => {
                format!("This char: `{char}`; is not a valid token")
            }
            LexingError::ExpectedArrow => "The `-` token is interpretet as a started arrow (`->`), but we could not find the arrow tip (`>`)".to_owned(),
        };

        let error_line = {
            // Pad by the error column plus the width of the printed line number.
            let padding = " ".repeat(*contexted_start + line_number.to_string().len());
            // Underline from the error column up to the end of the line. Saturating arithmetic
            // keeps an empty line (or a column past the end of the line) from underflowing.
            let underline = "-".repeat(
                line.len()
                    .saturating_sub(1)
                    .saturating_sub(*contexted_start),
            );
            format!("\x1b[92;1m{padding}^{underline} {explanation}\x1b[0m")
        };

        writeln!(f, "\x1b[31;1merror: \x1b[37;1m{}\x1b[0m", source)?;
        if !line_above.is_empty() {
            writeln!(
                f,
                "\x1b[32;1m{} |\x1b[0m {}",
                // A line above implies `line_number >= 1`; saturate so that a hand-constructed
                // error value cannot underflow either.
                line_number.saturating_sub(1),
                line_above
            )?;
        }
        writeln!(f, "\x1b[36;1m{} |\x1b[0m {}", line_number, line)?;
        writeln!(f, " {}", error_line)?;
        if !line_below.is_empty() {
            writeln!(
                f,
                "\x1b[32;1m{} |\x1b[0m {}",
                line_number + 1,
                line_below
            )?;
        }
        Ok(())
    }
}

View File

@ -0,0 +1,84 @@
use self::{error::SpannedLexingError, tokenizer::Tokenizer};
pub mod error;
mod tokenizer;
#[cfg(test)]
mod test;
/// The tokens of a Trixy source file, together with the text they were lexed from.
#[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
pub struct TokenStream {
/// A copy of the source text that was lexed
original_file: String,
/// The tokens in the order in which they appear in the source text
tokens: Vec<Token>,
}
impl TokenStream {
    /// Tokenizes Trixy source code.
    ///
    /// Every produced token records where it starts and ends in `src`; the indices describe
    /// the half-open interval `[start, end)`, i.e. `start .. end` in Rust notation.
    ///
    /// # Errors
    /// Stops at the first piece of input that is not a valid token and returns the
    /// corresponding [`SpannedLexingError`].
    pub fn lex(src: &str) -> Result<Self, SpannedLexingError> {
        let mut tokenizer = Tokenizer::new(src);

        // Pull tokens until the tokenizer signals the end of input (`Ok(None)`); the first
        // error aborts the collection.
        let tokens = std::iter::from_fn(|| tokenizer.next_token().transpose())
            .collect::<Result<Vec<_>, _>>()?;

        Ok(Self {
            original_file: src.to_owned(),
            tokens,
        })
    }
}
/// The location of a token in the source file.
///
/// Both offsets count bytes from the beginning of the file and describe the half-open
/// interval `[start, end)`. A token span like this, for example:
/// ```text
/// TokenSpan {
///     start: 20,
///     end: 23,
/// }
/// ```
/// signals that the token covers the three bytes at the offsets 20, 21 and 22.
#[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
pub struct TokenSpan {
/// The start of the token span (offset of its first byte)
start: usize,
/// The end of the token span (offset one past its last byte)
end: usize,
}
/// A Token: a [`TokenKind`] together with its location in the source file
#[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
pub struct Token {
/// The token's original location in the source file
span: TokenSpan,
/// What the token is
kind: TokenKind,
}
/// Possible kinds of tokens
#[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
pub enum TokenKind {
/// A reserved word, see [`Keyword`]
Keyword(Keyword),
/// A run of alphanumeric characters and `_` that is not a keyword
Identifier(String),
/// `:`
Colon,
/// `;`
Semicolon,
/// `,`
Comma,
/// `->`
Arrow,
/// `{`
BraceOpen,
/// `}`
BraceClose,
/// `(`
ParenthesisOpen,
/// `)`
ParenthesisClose,
}
/// Keywords used in the language
///
/// The variants are spelled exactly like the keyword they stand for, hence the
/// `non_camel_case_types` allowances.
#[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
pub enum Keyword {
/// Start a namespace declaration
#[allow(non_camel_case_types)]
nasp,
/// Start a function declaration
#[allow(non_camel_case_types)]
r#fn,
}

View File

@ -0,0 +1,194 @@
use crate::lexing::{Keyword, Token, TokenKind, TokenSpan};
use super::TokenStream;
use pretty_assertions::assert_eq;
/// Lexes a namespace containing one function and checks every token and span.
#[test]
fn test_lexing_trixy() {
    // The whitespace is spelled out line by line, so that the byte offsets below can be
    // checked against the literal and do not depend on how this file is indented.
    let input = concat!(
        "\n",
        "nasp commands {\n",
        "    fn expect(event: String) -> String;\n",
        "}\n",
    );
    let token = |start: usize, end: usize, kind: TokenKind| Token {
        span: TokenSpan { start, end },
        kind,
    };
    let ident = |name: &str| TokenKind::Identifier(name.to_owned());

    let token_stream = TokenStream::lex(input).unwrap();
    let expected_token_stream = TokenStream {
        tokens: vec![
            token(1, 5, TokenKind::Keyword(Keyword::nasp)),
            token(6, 14, ident("commands")),
            token(15, 16, TokenKind::BraceOpen),
            token(21, 23, TokenKind::Keyword(Keyword::r#fn)),
            token(24, 30, ident("expect")),
            token(30, 31, TokenKind::ParenthesisOpen),
            token(31, 36, ident("event")),
            token(36, 37, TokenKind::Colon),
            token(38, 44, ident("String")),
            token(44, 45, TokenKind::ParenthesisClose),
            token(46, 48, TokenKind::Arrow),
            token(49, 55, ident("String")),
            token(55, 56, TokenKind::Semicolon),
            token(57, 58, TokenKind::BraceClose),
        ],
        original_file: input.to_owned(),
    };
    assert_eq!(token_stream, expected_token_stream)
}
/// A lone `-` (without the `>` completing an arrow) must be rejected by the lexer.
#[test]
fn test_failing_lexing() {
    let input = concat!(
        "\n",
        "nasp trinitrix {\n",
        "nasp - commands {\n",
        "fn hi(strings: String) -> String;\n",
        "}\n",
        "}\n",
    );

    let lexing_result = TokenStream::lex(input);

    // Render the diagnostic as well. Cargo hides the output of passing tests; temporarily
    // invert the assertion below (or run with `--nocapture`) to look at it.
    eprintln!("{}", lexing_result.as_ref().unwrap_err());

    assert!(lexing_result.is_err());
}
/// Directly adjacent tokens (`{{`, `}}`) and repeated keywords are lexed as separate tokens.
#[test]
fn test_multiple_tokens() {
    let input = concat!("\n", "nasp nasp {{\n", "}}\n");
    let token = |start: usize, end: usize, kind: TokenKind| Token {
        span: TokenSpan { start, end },
        kind,
    };

    let token_stream = TokenStream::lex(input).unwrap();

    let expected_token_stream = TokenStream {
        tokens: vec![
            token(1, 5, TokenKind::Keyword(Keyword::nasp)),
            token(6, 10, TokenKind::Keyword(Keyword::nasp)),
            token(11, 12, TokenKind::BraceOpen),
            token(12, 13, TokenKind::BraceOpen),
            token(14, 15, TokenKind::BraceClose),
            token(15, 16, TokenKind::BraceClose),
        ],
        original_file: input.to_owned(),
    };
    assert_eq!(token_stream, expected_token_stream)
}
/// Line comments and (non-nested) block comments are skipped and produce no tokens.
#[test]
fn test_comments() {
    // The whitespace is spelled out line by line, so that the byte offsets below can be
    // checked against the literal and do not depend on how this file is indented.
    let input = concat!(
        "\n",
        "        // Some comment\n",
        "        nasp nasp {{\n",
        "\n",
        "        }}\n",
        "        // NOTE(@soispha): We do not support nested multi line comments <2023-12-16>\n",
        "        /* Some\n",
        "         * multi\n",
        "         * line\n",
        "         * comment\n",
        "         */\n",
    );
    let token = |start: usize, end: usize, kind: TokenKind| Token {
        span: TokenSpan { start, end },
        kind,
    };

    // Fail with the rendered diagnostic instead of its `Debug` representation.
    let token_stream = TokenStream::lex(input).unwrap_or_else(|e| panic!("{e}"));

    let expected_token_stream = TokenStream {
        tokens: vec![
            token(33, 37, TokenKind::Keyword(Keyword::nasp)),
            token(38, 42, TokenKind::Keyword(Keyword::nasp)),
            token(43, 44, TokenKind::BraceOpen),
            token(44, 45, TokenKind::BraceOpen),
            token(55, 56, TokenKind::BraceClose),
            token(56, 57, TokenKind::BraceClose),
        ],
        original_file: input.to_owned(),
    };
    assert_eq!(token_stream, expected_token_stream)
}

View File

@ -0,0 +1,235 @@
// This code is heavily inspired by: https://michael-f-bryan.github.io/static-analyser-in-rust/book/lex.html
use crate::lexing::{Keyword, TokenSpan};
use super::{
error::{LexingError, SpannedLexingError},
Token, TokenKind,
};
/// A cursor over a source text that hands out one token at a time.
pub(super) struct Tokenizer<'a> {
/// How many bytes of `original_text` have been consumed so far
current_index: usize,
/// The part of the input that has not been consumed yet
remaining_text: &'a str,
/// The complete input, kept around to build error context
original_text: &'a str,
}
impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer positioned at the very start of `input`.
    pub(super) fn new(input: &'a str) -> Self {
        Self {
            current_index: 0,
            remaining_text: input,
            original_text: input,
        }
    }

    /// Lexes the next token.
    ///
    /// Whitespace and comments in front of the token are skipped. Returns `Ok(None)` once the
    /// input is exhausted.
    ///
    /// # Errors
    /// Returns a [`SpannedLexingError`] (the [`LexingError`] plus the surrounding source
    /// lines) if the input does not start with a valid token.
    pub(super) fn next_token(&mut self) -> Result<Option<Token>, SpannedLexingError> {
        self.skip_ignored_tokens();
        if self.remaining_text.is_empty() {
            return Ok(None);
        }

        let start = self.current_index;
        let (kind, length) = self
            .get_next_tokenkind()
            .map_err(|source| self.spanned_error(source, start))?;
        self.chomp(length);
        let end = self.current_index;

        Ok(Some(Token {
            span: TokenSpan { start, end },
            kind,
        }))
    }

    /// Attaches the source context around the byte offset `start` to `source`.
    fn spanned_error(&self, source: LexingError, start: usize) -> SpannedLexingError {
        let text = self.original_text;
        // `start` is a byte offset on a char boundary, so everything is derived from the byte
        // slice in front of it. Counting `chars()` would drift after multi-byte characters.
        let preceding = &text[..start];
        let line_number = preceding.matches('\n').count();
        // Locate the line by position. Searching for its text would find an earlier,
        // identical line first and yield a wrong column.
        let line_start = preceding.rfind('\n').map_or(0, |newline| newline + 1);

        let lines: Vec<&str> = text.lines().collect();
        // A missing line (first/last line of the file) is represented by an empty string.
        let line_at = |index: usize| lines.get(index).copied().unwrap_or_default().to_owned();

        let line_above = match line_number.checked_sub(1) {
            Some(above) => line_at(above),
            None => String::new(),
        };

        SpannedLexingError::Error {
            source,
            start,
            contexted_start: start - line_start,
            line_above,
            line_below: line_at(line_number + 1),
            line_number,
            line: line_at(line_number),
        }
    }

    /// Determines the kind and the byte length of the token at the start of the remaining
    /// text, without consuming it.
    fn get_next_tokenkind(&mut self) -> Result<(TokenKind, usize), LexingError> {
        let next = match self.remaining_text.chars().next() {
            Some(c) => c,
            None => return Err(LexingError::UnexpectedEOF),
        };

        let (tok, length) = match next {
            '(' => (TokenKind::ParenthesisOpen, 1),
            ')' => (TokenKind::ParenthesisClose, 1),
            '{' => (TokenKind::BraceOpen, 1),
            '}' => (TokenKind::BraceClose, 1),
            ':' => (TokenKind::Colon, 1),
            ';' => (TokenKind::Semicolon, 1),
            ',' => (TokenKind::Comma, 1),
            '-' => tokenize_arrow(self.remaining_text)?,
            // A match guard applies to *every* alternative of an or-pattern, so the previous
            // `c @ '_' | c if c.is_alphanumeric()` could never accept a leading underscore.
            c if c == '_' || c.is_alphanumeric() => tokenize_ident(self.remaining_text)?,
            other => return Err(LexingError::UnknownCharacter(other)),
        };

        Ok((tok, length))
    }

    /// Skip past any whitespace characters or comments.
    fn skip_ignored_tokens(&mut self) {
        // Whitespace and comments can alternate arbitrarily often; stop once neither of them
        // made any progress.
        loop {
            let ws = self.skip_whitespace();
            let comments = self.skip_comments();
            if ws + comments == 0 {
                return;
            }
        }
    }

    /// Consumes leading whitespace and returns the number of bytes skipped.
    fn skip_whitespace(&mut self) -> usize {
        // `take_while` reports "nothing matched" as an error; here that simply means there
        // is no whitespace to skip.
        let skipped = match take_while(self.remaining_text, char::is_whitespace) {
            Ok((_, bytes_skipped)) => bytes_skipped,
            Err(_) => 0,
        };
        self.chomp(skipped);
        skipped
    }

    /// Consumes at most one leading comment (`// …` up to and including the newline, or
    /// `/* … */`) and returns the number of bytes skipped.
    fn skip_comments(&mut self) -> usize {
        let remaining = self.remaining_text;
        let pairs = [("//", "\n"), ("/*", "*/")];

        let mut skip = 0;
        for &(pattern, matcher) in &pairs {
            if remaining.starts_with(pattern) {
                let leftovers = skip_until(remaining, matcher);
                skip = remaining.len() - leftovers.len();
                break;
            }
        }

        self.chomp(skip);
        skip
    }

    /// Advances the cursor by `chars_to_chomp` bytes (which must end on a char boundary).
    fn chomp(&mut self, chars_to_chomp: usize) {
        self.remaining_text = &self.remaining_text[chars_to_chomp..];
        self.current_index += chars_to_chomp;
    }
}
/// Lexes the identifier or keyword at the start of `text`.
///
/// Returns the token kind together with the number of bytes it occupies; fails if `text`
/// does not start with an alphanumeric character or `_`.
fn tokenize_ident(text: &str) -> Result<(TokenKind, usize), LexingError> {
    let (word, length) = take_while(text, |c| c.is_alphanumeric() || c == '_')?;

    // Reserved words take precedence over plain identifiers.
    let kind = if word == "nasp" {
        TokenKind::Keyword(Keyword::nasp)
    } else if word == "fn" {
        TokenKind::Keyword(Keyword::r#fn)
    } else {
        TokenKind::Identifier(word.to_owned())
    };

    Ok((kind, length))
}
/// Lexes the arrow token (`->`) at the start of `text`.
///
/// Returns the token kind and its length (always 2); anything other than a leading `->`
/// is reported as [`LexingError::ExpectedArrow`].
fn tokenize_arrow(text: &str) -> Result<(TokenKind, usize), LexingError> {
    match text.strip_prefix("->") {
        Some(_) => Ok((TokenKind::Arrow, 2)),
        None => Err(LexingError::ExpectedArrow),
    }
}
/// Returns the longest prefix of `data` whose characters all satisfy `pred`, together with
/// its length in bytes.
///
/// An empty prefix (including empty input) is reported as [`LexingError::NoMatchesTaken`].
fn take_while<F>(data: &str, mut pred: F) -> Result<(&str, usize), LexingError>
where
    F: FnMut(char) -> bool,
{
    // The prefix ends at the first rejected character, or at the end of the input if every
    // character is accepted.
    let prefix_len = data
        .find(|ch: char| !pred(ch))
        .unwrap_or(data.len());

    match prefix_len {
        0 => Err(LexingError::NoMatchesTaken),
        len => Ok((&data[..len], len)),
    }
}
/// Skips everything up to and including the first occurrence of `pattern` and returns the
/// rest of `src`.
///
/// If `pattern` does not occur at all, the whole input is skipped and an empty string is
/// returned. This covers a `//` comment on the last line of a file without a trailing
/// newline, as well as an unterminated `/*` comment; both used to panic with an
/// out-of-range slice.
fn skip_until<'a>(src: &'a str, pattern: &str) -> &'a str {
    match src.find(pattern) {
        Some(position) => &src[position + pattern.len()..],
        None => &src[src.len()..],
    }
}

View File

@ -0,0 +1,58 @@
use error::TrixyError;
use crate::lexing::TokenStream;
use self::command_spec::CommandSpec;
mod command_spec;
pub mod error;
pub mod lexing;
/// Parses Trixy source code into a [`CommandSpec`].
///
/// Only the lexing stage is implemented so far: the input is tokenized and the function then
/// reaches `todo!()`.
///
/// # Errors
/// Returns a [`TrixyError`] if `input` cannot be tokenized.
///
/// # Panics
/// Panics on every input that tokenizes successfully, as the parser itself is not
/// implemented yet.
pub fn parse_trixy_lang(input: &str) -> Result<CommandSpec, TrixyError> {
let input_tokens = TokenStream::lex(input)?;
todo!()
}
#[cfg(test)]
mod test {
    use crate::{
        command_spec::{CommandSpec, Declaration, Genus, NamedType, Namespace, Type},
        parse_trixy_lang,
    };

    /// A function declared inside a namespace ends up as one declaration that carries the
    /// namespace, the function name, its parameters and its return type.
    #[test]
    fn test_function_with_namespace() {
        let source = concat!(
            "\n",
            "nasp commands {\n",
            "fn say_something(name_to_greet: String, what_to_say: String) -> String;\n",
            "}\n",
        );
        let parsed = parse_trixy_lang(source).unwrap();

        let string_parameter = |name: &str| NamedType {
            name: name.to_owned(),
            base: Type::String,
        };
        let wanted = CommandSpec {
            declarations: vec![Declaration {
                namespace: vec![Namespace {
                    name: "commands".to_owned(),
                }],
                genus: Genus::Function {
                    name: "say_something".to_owned(),
                    inputs: vec![
                        string_parameter("name_to_greet"),
                        string_parameter("what_to_say"),
                    ],
                    output: Type::String,
                },
            }],
        };

        assert_eq!(parsed, wanted);
    }
}

View File

@ -0,0 +1,45 @@
use std::{fs, process::exit};
use trixy_lang_parser::lexing::TokenStream;
use std::path::PathBuf;
use clap::{Parser, Subcommand};
// Command line interface of the helper binary. The `///` comments below are picked up by
// clap's derive macro as help text, so they are part of the program's output.
/// A helper command for the trixy-lang_parser crate
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
pub struct Args {
#[command(subcommand)]
/// The subcommand to execute
pub subcommand: Command,
}
// The subcommands of the helper binary. The `///` comments below are picked up by clap's
// derive macro as help text, so they are part of the program's output.
#[derive(Subcommand, Debug)]
pub enum Command {
// NOTE(review): `value_parser` is an argument-level setting; confirm that it is intended on
// the subcommand variant itself.
#[clap(value_parser)]
/// Only try to tokenize the file
Tokenize {
#[clap(value_parser)]
/// The file containing the trixy code to tokenize
file: PathBuf,
},
}
/// Entry point of the helper binary.
///
/// `tokenize <file>` lexes the given file and pretty-prints the resulting token stream to
/// stdout. If the file cannot be read or does not lex, the problem is reported on stderr and
/// the process exits with status 1.
pub fn main() {
    let args = Args::parse();
    match args.subcommand {
        Command::Tokenize { file } => {
            // An unreadable file is a user error, not a bug: report it instead of panicking.
            let input = match fs::read_to_string(&file) {
                Ok(content) => content,
                Err(err) => {
                    eprintln!("Failed to read `{}`: {}", file.display(), err);
                    exit(1);
                }
            };

            let input_tokens = match TokenStream::lex(&input) {
                Ok(tokens) => tokens,
                Err(err) => {
                    // Diagnostics belong on stderr, so stdout only ever carries tokens.
                    eprintln!("{}", err);
                    exit(1);
                }
            };
            println!("{:#?}", input_tokens);
        }
    }
}