feat(trixy-lang_parser): Add a lexer with error handling for trixy code

2023-12-16 11:45:23 +01:00 · 2023-12-16 11:45:23 +01:00 · 3da75f6913
parent cd2dbc516a
commit 3da75f6913
12 changed files with 798 additions and 0 deletions
--- a/trixy/trixy-lang_parser/.gitignore
+++ b/trixy/trixy-lang_parser/.gitignore
@ -0,0 +1,6 @@
 # build
 /target
 /result
 # trixy-lang_parser is a library
 Cargo.lock
--- a/trixy/trixy-lang_parser/Cargo.toml
+++ b/trixy/trixy-lang_parser/Cargo.toml
@ -0,0 +1,11 @@
 [package]
 name = "trixy-lang_parser"
 version = "0.1.0"
 edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [dependencies]
 clap = { version = "4.4.11", features = ["derive"] }
 pretty_assertions = "1.4.0"
 thiserror = "1.0.50"
--- a/trixy/trixy-lang_parser/example/example.tri
+++ b/trixy/trixy-lang_parser/example/example.tri
@ -0,0 +1,9 @@
 fn print(message: CommandTransferValue);
 nasp trinitrix {
    fn hi(name: String) -> String;
 }
 // That's a flat out lie, but it results in a rather nice syntax highlight compared to nothing:
 // vim: syntax=rust
--- a/trixy/trixy-lang_parser/example/example_simple.tri
+++ b/trixy/trixy-lang_parser/example/example_simple.tri
@ -0,0 +1,11 @@
 fn print(message: CommandTransferValue);
 nasp trinitrix {
    fn hi(name: String) -> String;
 }
 namespace commands { >-
 }
 // That's a flat out lie, but it results in a rather nice syntax highlight compared to nothing:
 // vim: syntax=rust
--- a/trixy/trixy-lang_parser/src/command_spec/mod.rs
+++ b/trixy/trixy-lang_parser/src/command_spec/mod.rs
@ -0,0 +1,36 @@
 /// The result type of `parse_trixy_lang`: every declaration found in one
 /// Trixy source file.
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub struct CommandSpec {
    /// The declarations contained in the specification.
    pub(crate) declarations: Vec<Declaration>,
 }
 /// A single declared item together with the namespaces it is declared in.
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub(crate) struct Declaration {
    /// The namespaces enclosing this declaration.
    // NOTE(review): presumably ordered outermost first -- confirm once the parser exists.
    pub(crate) namespace: Vec<Namespace>,
    /// What kind of item is being declared.
    pub(crate) genus: Genus,
 }
 /// A namespace, identified only by its name (e.g. `commands` in
 /// `nasp commands { .. }`).
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub(crate) struct Namespace {
    /// The name of the namespace.
    pub(crate) name: String,
 }
 /// The kind of a [`Declaration`]. Only functions exist so far.
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub(crate) enum Genus {
    /// A function declaration, e.g. `fn hi(name: String) -> String;`.
    Function {
        /// The name of the function.
        name: String,
        /// The named arguments of the function, in declaration order.
        inputs: Vec<NamedType>,
        /// The return type of the function.
        output: Type,
    },
 }
 /// A type with a name attached, as used for function arguments
 /// (e.g. `name: String`).
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub(crate) struct NamedType {
    /// The name given to the value.
    pub(crate) name: String,
    /// The type of the value.
    pub(crate) base: Type,
 }
 /// The types that can appear in a command specification.
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub(crate) enum Type {
    /// A string value.
    String,
    /// No value.
    // NOTE(review): presumably the output of a function declared without `-> Type` -- confirm.
    Void,
 }
--- a/trixy/trixy-lang_parser/src/error.rs
+++ b/trixy/trixy-lang_parser/src/error.rs
@ -0,0 +1,9 @@
 use thiserror::Error;
 use crate::lexing::error::SpannedLexingError;
 /// The top-level error type returned by this crate.
 #[derive(Error, Debug)]
 pub enum TrixyError {
    /// The input could not be tokenized. The wrapped error is displayed
    /// as-is (`transparent`) and is converted automatically by `?` (`from`).
    // NOTE(review): the variant is called `Parsing` but wraps a *lexing* error.
    #[error(transparent)]
    Parsing(#[from] SpannedLexingError),
 }
--- a/trixy/trixy-lang_parser/src/lexing/error.rs
+++ b/trixy/trixy-lang_parser/src/lexing/error.rs
@ -0,0 +1,100 @@
 use std::{error::Error, fmt::Display};
 use thiserror::Error;
 #[derive(Error, Debug)]
 pub enum LexingError {
    #[error("No matches were found")]
    NoMatchesTaken,
    #[error("Expected an token, but reached end of file")]
    UnexpectedEOF,
    #[error("Char ('{0}') is not a know token!")]
    UnknownCharacter(char),
    #[error("The Arrow token must be of the form: ->")]
    ExpectedArrow,
 }
 /// A [`LexingError`] enriched with the source context needed to render a
 /// helpful diagnostic (see the `Display` implementation).
 #[derive(Debug)]
 pub enum SpannedLexingError {
    /// A lexing error with its location and the surrounding source lines.
    Error {
        /// The underlying lexing error
        source: LexingError,
        /// The starting index of the error in the source file, as tracked by the tokenizer
        /// (the tokenizer advances by UTF-8 byte lengths)
        start: usize,
        /// The starting index of the error relative to the beginning of the context line
        contexted_start: usize,
        /// The line above the error (empty if there is none)
        line_above: String,
        /// The line below the error (empty if there is none)
        line_below: String,
        /// The line in which the error occurred
        line: String,
        /// The line number of the main error line; the tokenizer computes it as the number of
        /// newlines before the error, so it is zero-based
        line_number: usize,
    },
 }
 impl Error for SpannedLexingError {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        let Self::Error { source, .. } = self;
        Some(source)
    }
 }
 impl Display for SpannedLexingError {
    /// Render the error as a colored, multi-line diagnostic.
    ///
    /// The output consists of the error message, the (optional) line above,
    /// the offending line, a marker line (`^---- hint`) pointing at the
    /// offending position and the (optional) line below. ANSI escape codes are
    /// emitted unconditionally.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self::Error {
            source,
            line_above,
            line_below,
            line,
            line_number,
            contexted_start,
            ..
        } = self;

        // A short, human readable explanation that is appended to the marker line.
        let hint = match source {
            LexingError::NoMatchesTaken => "This token does not produce a possible match".to_owned(),
            LexingError::UnexpectedEOF => "This eof was completely unexpected".to_owned(),
            LexingError::UnknownCharacter(char) => format!("This char: `{char}`; is not a valid token"),
            LexingError::ExpectedArrow => "The `-` token is interpreted as a started arrow (`->`), but we could not find the arrow tip (`>`)".to_owned(),
        };

        // The source lines are prefixed with `<line_number> |     `. The fixed part of that
        // prefix is written as literal spaces below, so only the width of the line number
        // and the offset within the line have to be padded here.
        let padding = " ".repeat(*contexted_start + line_number.to_string().len());
        // Underline the rest of the line. The saturating subtractions keep an empty line
        // (or an offset past the end of the line) from underflowing.
        let underline = "-".repeat(line.len().saturating_sub(1).saturating_sub(*contexted_start));
        let error_line = format!("\x1b[92;1m{padding}^{underline} {hint}\x1b[0m");

        writeln!(f, "\x1b[31;1merror: \x1b[37;1m{}\x1b[0m", source)?;
        if !line_above.is_empty() {
            writeln!(
                f,
                "\x1b[32;1m{} |\x1b[0m     {}",
                line_number.saturating_sub(1),
                line_above
            )?;
        }
        writeln!(f, "\x1b[36;1m{} |\x1b[0m     {}", line_number, line)?;
        writeln!(f, "       {}", error_line)?;
        if !line_below.is_empty() {
            writeln!(
                f,
                "\x1b[32;1m{} |\x1b[0m     {}",
                line_number + 1,
                line_below
            )?;
        }
        Ok(())
    }
 }
--- a/trixy/trixy-lang_parser/src/lexing/mod.rs
+++ b/trixy/trixy-lang_parser/src/lexing/mod.rs
@ -0,0 +1,84 @@
 use self::{error::SpannedLexingError, tokenizer::Tokenizer};
 pub mod error;
 mod tokenizer;
 #[cfg(test)]
 mod test;
 /// The tokens lexed from a source file, together with the source they were lexed from.
 #[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
 pub struct TokenStream {
    /// An owned copy of the source text that was passed to [`TokenStream::lex`].
    original_file: String,
    /// The tokens in the order in which they appear in the source.
    tokens: Vec<Token>,
 }
 impl TokenStream {
    /// Tokenize `src` into a [`TokenStream`].
    ///
    /// Every token records where it starts and ends in the original source;
    /// the indices form the half-open interval `[start, end)`, just like
    /// `start .. end` in Rust.
    ///
    /// # Errors
    /// Returns the first [`SpannedLexingError`] the tokenizer reports.
    pub fn lex(src: &str) -> Result<Self, SpannedLexingError> {
        let mut tokenizer = Tokenizer::new(src);
        // `next_token` yields `Ok(None)` at the end of the input; transposing turns
        // that into the end of the iterator while keeping errors as items.
        let tokens = std::iter::from_fn(|| tokenizer.next_token().transpose())
            .collect::<Result<Vec<_>, _>>()?;
        Ok(Self {
            original_file: src.to_owned(),
            tokens,
        })
    }
 }
 /// The location of a token in the source file.
 ///
 /// Offsets are counted from the beginning of the file and form the half-open
 /// interval `[start, end)`. The tokenizer advances by UTF-8 byte lengths, so
 /// for non-ASCII input these are byte offsets rather than char counts.
 /// A token span like this, for example:
 /// ```text
 /// TokenSpan {
 ///     start: 20,
 ///     end: 23,
 /// }
 /// ```
 /// signals that the token starts at offset 20 and ends just before offset 23.
 #[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
 pub struct TokenSpan {
    /// The start of the token span (inclusive)
    start: usize,
    /// The end of the token span (exclusive)
    end: usize,
 }
 /// A single token: its kind plus the place in the source it was lexed from.
 #[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
 pub struct Token {
    /// The token's original location in the source file
    span: TokenSpan,
    /// What kind of token this is (including its payload, e.g. an identifier's text)
    kind: TokenKind,
 }
 /// Possible kinds of tokens
 #[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
 pub enum TokenKind {
    /// A reserved word of the language
    Keyword(Keyword),
    /// Any other word made of alphanumeric characters and `_`
    Identifier(String),
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `,`
    Comma,
    /// `->`
    Arrow,
    /// `{`
    BraceOpen,
    /// `}`
    BraceClose,
    /// `(`
    ParenthesisOpen,
    /// `)`
    ParenthesisClose,
 }
 /// Keywords used in the language
 ///
 /// The variants are spelled exactly like the keyword text the tokenizer
 /// matches (`nasp`, `fn`), hence the `non_camel_case_types` allowances.
 #[derive(Debug, PartialEq, PartialOrd, Ord, Eq)]
 pub enum Keyword {
    /// Start a namespace declaration
    #[allow(non_camel_case_types)]
    nasp,
    /// Start a function declaration
    // `fn` is a Rust keyword, so the variant has to be written as a raw identifier.
    #[allow(non_camel_case_types)]
    r#fn,
 }
--- a/trixy/trixy-lang_parser/src/lexing/test.rs
+++ b/trixy/trixy-lang_parser/src/lexing/test.rs
@ -0,0 +1,194 @@
 use crate::lexing::{Keyword, Token, TokenKind, TokenSpan};
 use super::TokenStream;
 use pretty_assertions::assert_eq;
 /// Lexes a namespace containing one function declaration and compares every
 /// produced token (kind and span) against the expected token stream.
 #[test]
 fn test_lexing_trixy() {
    let input = "
 nasp commands {
    fn expect(event: String) -> String;
 }
 ";
    let token_stream = TokenStream::lex(input).unwrap();
    // The spans are offsets from the start of `input`, which begins with a newline.
    let expected_token_stream = {
        let tokens = vec![
            Token {
                span: TokenSpan { start: 1, end: 5 },
                kind: TokenKind::Keyword(Keyword::nasp),
            },
            Token {
                span: TokenSpan { start: 6, end: 14 },
                kind: TokenKind::Identifier("commands".to_owned()),
            },
            Token {
                span: TokenSpan { start: 15, end: 16 },
                kind: TokenKind::BraceOpen,
            },
            Token {
                span: TokenSpan { start: 21, end: 23 },
                kind: TokenKind::Keyword(Keyword::r#fn),
            },
            Token {
                span: TokenSpan { start: 24, end: 30 },
                kind: TokenKind::Identifier("expect".to_owned()),
            },
            Token {
                span: TokenSpan { start: 30, end: 31 },
                kind: TokenKind::ParenthesisOpen,
            },
            Token {
                span: TokenSpan { start: 31, end: 36 },
                kind: TokenKind::Identifier("event".to_owned()),
            },
            Token {
                span: TokenSpan { start: 36, end: 37 },
                kind: TokenKind::Colon,
            },
            Token {
                span: TokenSpan { start: 38, end: 44 },
                kind: TokenKind::Identifier("String".to_owned()),
            },
            Token {
                span: TokenSpan { start: 44, end: 45 },
                kind: TokenKind::ParenthesisClose,
            },
            Token {
                span: TokenSpan { start: 46, end: 48 },
                kind: TokenKind::Arrow,
            },
            Token {
                span: TokenSpan { start: 49, end: 55 },
                kind: TokenKind::Identifier("String".to_owned()),
            },
            Token {
                span: TokenSpan { start: 55, end: 56 },
                kind: TokenKind::Semicolon,
            },
            Token {
                span: TokenSpan { start: 57, end: 58 },
                kind: TokenKind::BraceClose,
            },
        ];
        TokenStream {
            tokens,
            original_file: input.to_owned(),
        }
    };
    assert_eq!(token_stream, expected_token_stream)
 }
 /// Lexing must fail on a `-` that is not followed by `>`. The rendered error
 /// is printed to stderr so that it can be inspected by hand.
 #[test]
 fn test_failing_lexing() {
    let input = "
 nasp trinitrix {
    nasp - commands {
        fn hi(strings: String) -> String;
    }
 }
 ";
    let token_stream = TokenStream::lex(input);
    // `unwrap_err` already fails the test (by panicking) if lexing unexpectedly succeeded.
    eprintln!("{}", token_stream.as_ref().unwrap_err());
    // uncomment the next line to see the error message, without having to remove cargo's output filter
    // assert!(!token_stream.is_err());
    assert!(token_stream.is_err());
 }
 /// Repeated keywords and directly adjacent braces must each become a token
 /// of their own, with adjacent spans touching (`end` of one == `start` of the next).
 #[test]
 fn test_multiple_tokens() {
    let input = "
 nasp nasp {{
 }}
 ";
    let token_stream = TokenStream::lex(input).unwrap();
    let expected_token_stream = {
        let tokens = vec![
            Token {
                span: TokenSpan { start: 1, end: 5 },
                kind: TokenKind::Keyword(Keyword::nasp),
            },
            Token {
                span: TokenSpan { start: 6, end: 10 },
                kind: TokenKind::Keyword(Keyword::nasp),
            },
            Token {
                span: TokenSpan { start: 11, end: 12 },
                kind: TokenKind::BraceOpen,
            },
            Token {
                span: TokenSpan { start: 12, end: 13 },
                kind: TokenKind::BraceOpen,
            },
            Token {
                span: TokenSpan { start: 14, end: 15 },
                kind: TokenKind::BraceClose,
            },
            Token {
                span: TokenSpan { start: 15, end: 16 },
                kind: TokenKind::BraceClose,
            },
        ];
        TokenStream {
            tokens,
            original_file: input.to_owned(),
        }
    };
    assert_eq!(token_stream, expected_token_stream)
 }
 /// Line comments (`// ..`) and block comments (`/* .. */`) must be skipped:
 /// only the tokens between them may show up in the token stream, while the
 /// spans still count the skipped text.
 #[test]
 fn test_comments() {
    let input = "
        // Some comment
        nasp nasp {{
        }}
        // NOTE(@soispha): We do not support nested multi line comments <2023-12-16>
        /* Some
        * multi
        * line
        * comment
        */
 ";
    // Print the rendered error before panicking, so that a failure is readable.
    let token_stream = TokenStream::lex(input)
        .map_err(|e| {
            eprintln!("{}", e);
            panic!();
        })
        .unwrap();
    let expected_token_stream = {
        let tokens = vec![
            Token {
                span: TokenSpan { start: 33, end: 37 },
                kind: TokenKind::Keyword(Keyword::nasp),
            },
            Token {
                span: TokenSpan { start: 38, end: 42 },
                kind: TokenKind::Keyword(Keyword::nasp),
            },
            Token {
                span: TokenSpan { start: 43, end: 44 },
                kind: TokenKind::BraceOpen,
            },
            Token {
                span: TokenSpan { start: 44, end: 45 },
                kind: TokenKind::BraceOpen,
            },
            Token {
                span: TokenSpan { start: 55, end: 56 },
                kind: TokenKind::BraceClose,
            },
            Token {
                span: TokenSpan { start: 56, end: 57 },
                kind: TokenKind::BraceClose,
            },
        ];
        TokenStream {
            tokens,
            original_file: input.to_owned(),
        }
    };
    assert_eq!(token_stream, expected_token_stream)
 }
--- a/trixy/trixy-lang_parser/src/lexing/tokenizer.rs
+++ b/trixy/trixy-lang_parser/src/lexing/tokenizer.rs
@ -0,0 +1,235 @@
 // This code is heavily inspired by: https://michael-f-bryan.github.io/static-analyser-in-rust/book/lex.html
 use crate::lexing::{Keyword, TokenSpan};
 use super::{
    error::{LexingError, SpannedLexingError},
    Token, TokenKind,
 };
 /// A cursor over the source text that hands out one token at a time.
 pub(super) struct Tokenizer<'a> {
    /// How far (in bytes) the tokenizer has advanced into `original_text`
    current_index: usize,
    /// The part of the source that has not been consumed yet
    remaining_text: &'a str,
    /// The complete source; only used to build the context of error messages
    original_text: &'a str,
 }
 impl<'a> Tokenizer<'a> {
    /// Create a tokenizer positioned at the very beginning of `input`.
    pub(super) fn new(input: &'a str) -> Self {
        Self {
            current_index: 0,
            remaining_text: input,
            original_text: input,
        }
    }

    /// Lex the next token, skipping leading whitespace and comments.
    ///
    /// Returns `Ok(None)` once the input is exhausted.
    ///
    /// # Errors
    /// Returns a [`SpannedLexingError`] (the lexing error plus the surrounding
    /// source lines) if the text at the current position is not a valid token.
    pub(super) fn next_token(&mut self) -> Result<Option<Token>, SpannedLexingError> {
        self.skip_ignored_tokens();
        if self.remaining_text.is_empty() {
            return Ok(None);
        }

        let start = self.current_index;
        let (kind, length) = match self.get_next_tokenkind() {
            Ok(found) => found,
            Err(e) => return Err(self.spanned_error(start, e)),
        };
        self.chomp(length); // end - start
        let end = self.current_index;
        Ok(Some(Token {
            span: TokenSpan { start, end },
            kind,
        }))
    }

    /// Attach the source context (offending line and its neighbours) to `source`.
    ///
    /// `start` is the byte offset of the error in the original text. It always
    /// lies on a char boundary, because `chomp` only ever advances by the byte
    /// length of whole chars.
    fn spanned_error(&self, start: usize, source: LexingError) -> SpannedLexingError {
        let before = &self.original_text[..start];
        // Both values are derived from the *byte* prefix, so they stay correct
        // for non-ASCII input and for files that contain the same line twice.
        let line_number = before.matches('\n').count();
        let line_start = before.rfind('\n').map_or(0, |newline| newline + 1);

        let lines: Vec<&str> = self.original_text.lines().collect();
        // A missing neighbour line is represented by an empty string.
        let line_at = |index: usize| lines.get(index).copied().unwrap_or("").to_owned();
        let line_above = match line_number.checked_sub(1) {
            Some(index) => line_at(index),
            None => String::new(),
        };

        SpannedLexingError::Error {
            source,
            start,
            contexted_start: start - line_start,
            line_above,
            line_below: line_at(line_number + 1),
            line_number,
            line: line_at(line_number),
        }
    }

    /// Determine the kind and the byte length of the token at the current
    /// position, without consuming it.
    fn get_next_tokenkind(&mut self) -> Result<(TokenKind, usize), LexingError> {
        let next = match self.remaining_text.chars().next() {
            Some(c) => c,
            None => return Err(LexingError::UnexpectedEOF),
        };
        let (tok, length) = match next {
            '(' => (TokenKind::ParenthesisOpen, 1),
            ')' => (TokenKind::ParenthesisClose, 1),
            '{' => (TokenKind::BraceOpen, 1),
            '}' => (TokenKind::BraceClose, 1),
            ':' => (TokenKind::Colon, 1),
            ';' => (TokenKind::Semicolon, 1),
            ',' => (TokenKind::Comma, 1),
            '-' => tokenize_arrow(self.remaining_text)?,
            // A match guard applies to *every* alternative of an or-pattern, so the
            // underscore has to be part of the guard itself to be accepted here.
            c if c == '_' || c.is_alphanumeric() => tokenize_ident(self.remaining_text)?,
            other => return Err(LexingError::UnknownCharacter(other)),
        };
        Ok((tok, length))
    }

    /// Skip past any whitespace characters or comments.
    fn skip_ignored_tokens(&mut self) {
        // Whitespace and comments can alternate arbitrarily often, so repeat
        // until a full round consumed nothing.
        loop {
            let ws = self.skip_whitespace();
            let comments = self.skip_comments();
            if ws + comments == 0 {
                return;
            }
        }
    }

    /// Skip leading whitespace and return the number of bytes skipped.
    fn skip_whitespace(&mut self) -> usize {
        // `take_while` reports "nothing matched" as an error; here that just means
        // that there is no whitespace to skip.
        let skip = take_while(self.remaining_text, char::is_whitespace)
            .map_or(0, |(_, bytes_skipped)| bytes_skipped);
        self.chomp(skip);
        skip
    }

    /// Skip one leading comment (if any) and return the number of bytes skipped.
    ///
    /// Line comments run from `//` to the end of the line, block comments from
    /// `/*` to the next `*/`; block comments do not nest.
    fn skip_comments(&mut self) -> usize {
        let remaining = self.remaining_text;
        let pairs = [("//", "\n"), ("/*", "*/")];
        let mut skip = 0;
        for &(pattern, matcher) in &pairs {
            if remaining.starts_with(pattern) {
                // Look for the terminator only *after* the opener, otherwise `/*/`
                // would be treated as a complete block comment.
                let leftovers = skip_until(&remaining[pattern.len()..], matcher);
                skip = remaining.len() - leftovers.len();
                break;
            }
        }
        self.chomp(skip);
        skip
    }

    /// Consume `chars_to_chomp` bytes of the remaining text.
    ///
    /// Despite its name, the argument is a byte count; it must end on a char
    /// boundary, otherwise the slicing panics.
    fn chomp(&mut self, chars_to_chomp: usize) {
        self.remaining_text = &self.remaining_text[chars_to_chomp..];
        self.current_index += chars_to_chomp;
    }
 }
 /// Read the identifier (or keyword) at the start of `text`.
 ///
 /// Returns the token kind together with the number of bytes it occupies, or
 /// the error of `take_while` if `text` does not start with an identifier char.
 fn tokenize_ident(text: &str) -> Result<(TokenKind, usize), LexingError> {
    let is_ident_char = |ch: char| ch == '_' || ch.is_alphanumeric();
    let (word, bytes_read) = take_while(text, is_ident_char)?;
    // Reserved words become keywords, everything else is a plain identifier.
    let kind = match word {
        "fn" => TokenKind::Keyword(Keyword::r#fn),
        "nasp" => TokenKind::Keyword(Keyword::nasp),
        _ => TokenKind::Identifier(word.to_string()),
    };
    Ok((kind, bytes_read))
 }
 /// Read the arrow token (`->`) at the start of `text`.
 ///
 /// Returns the arrow kind and its length (2), or
 /// [`LexingError::ExpectedArrow`] if `text` does not start with `->`.
 fn tokenize_arrow(text: &str) -> Result<(TokenKind, usize), LexingError> {
    if text.starts_with("->") {
        Ok((TokenKind::Arrow, 2))
    } else {
        Err(LexingError::ExpectedArrow)
    }
 }
 /// Takes the longest prefix of `data` whose chars all satisfy `pred`.
 ///
 /// Returns the prefix and its length in bytes, or
 /// [`LexingError::NoMatchesTaken`] if not even the first char matches
 /// (which includes `data` being empty).
 fn take_while<F>(data: &str, mut pred: F) -> Result<(&str, usize), LexingError>
 where
    F: FnMut(char) -> bool,
 {
    // The byte offset of the first rejected char is the length of the prefix;
    // if no char is rejected, the whole input matches.
    let prefix_len = data
        .char_indices()
        .find(|&(_, ch)| !pred(ch))
        .map_or(data.len(), |(offset, _)| offset);
    match prefix_len {
        0 => Err(LexingError::NoMatchesTaken),
        len => Ok((&data[..len], len)),
    }
 }
 /// Skips input up to and including the first occurrence of `pattern`.
 ///
 /// Returns the text following the pattern. If `pattern` does not occur at
 /// all (e.g. a line comment at the very end of a file without a trailing
 /// newline, or an unterminated block comment), everything is skipped and the
 /// empty tail of `src` is returned instead of panicking.
 fn skip_until<'a>(src: &'a str, pattern: &str) -> &'a str {
    match src.find(pattern) {
        Some(index) => &src[index + pattern.len()..],
        // Keep the result a sub-slice of `src`, so callers can still do
        // length arithmetic with it.
        None => &src[src.len()..],
    }
 }
--- a/trixy/trixy-lang_parser/src/lib.rs
+++ b/trixy/trixy-lang_parser/src/lib.rs
@ -0,0 +1,58 @@
 use error::TrixyError;
 use crate::lexing::TokenStream;
 use self::command_spec::CommandSpec;
 mod command_spec;
 pub mod error;
 pub mod lexing;
 /// Parse Trixy source code into a [`CommandSpec`].
 ///
 /// # Errors
 /// Returns a [`TrixyError`] if `input` cannot be tokenized.
 ///
 /// # Panics
 /// Only the lexing step is implemented so far: after successful
 /// tokenization this function currently always panics via `todo!()`.
 pub fn parse_trixy_lang(input: &str) -> Result<CommandSpec, TrixyError> {
    // The token stream is not consumed yet, because the parser itself is still missing.
    let input_tokens = TokenStream::lex(input)?;
    todo!()
 }
 #[cfg(test)]
 mod test {
    use crate::{
        command_spec::{CommandSpec, Declaration, Genus, NamedType, Namespace, Type},
        parse_trixy_lang,
    };
    /// A function declared inside a namespace must be parsed into a single
    /// declaration carrying that namespace.
    // NOTE(review): `parse_trixy_lang` still ends in `todo!()`, so this test panics
    // until the parser is implemented.
    #[test]
    fn test_function_with_namespace() {
        // NOTE(review): despite its name, `expected` holds the *actual* parser output;
        // `correct` below is the expected value.
        let expected = parse_trixy_lang(
            "
                nasp commands {
                    fn say_something(name_to_greet: String, what_to_say: String) -> String;
                }
            ",
        )
        .unwrap();
        let correct: CommandSpec = {
            let declarations = vec![Declaration {
                namespace: vec![Namespace {
                    name: "commands".to_owned(),
                }],
                genus: Genus::Function {
                    name: "say_something".to_owned(),
                    inputs: vec![
                        NamedType {
                            name: "name_to_greet".to_owned(),
                            base: Type::String,
                        },
                        NamedType {
                            name: "what_to_say".to_owned(),
                            base: Type::String,
                        },
                    ],
                    output: Type::String,
                },
            }];
            CommandSpec { declarations }
        };
        assert_eq!(expected, correct);
    }
 }
--- a/trixy/trixy-lang_parser/src/main.rs
+++ b/trixy/trixy-lang_parser/src/main.rs
@ -0,0 +1,45 @@
 use std::{fs, process::exit};
 use trixy_lang_parser::lexing::TokenStream;
 use std::path::PathBuf;
 use clap::{Parser, Subcommand};
 // The command line arguments of the helper binary.
 // NOTE: clap turns `///` comments into help text, so explanatory notes in this
 // type are deliberately written as plain `//` comments.
 /// A helper command for the trixy-lang_parser crate
 #[derive(Parser, Debug)]
 #[clap(author, version, about, long_about = None)]
 pub struct Args {
    #[command(subcommand)]
    /// The subcommand to execute
    pub subcommand: Command,
 }
 // The subcommands understood by the helper binary.
 // NOTE: clap turns `///` comments into help text, so explanatory notes in this
 // type are deliberately written as plain `//` comments.
 #[derive(Subcommand, Debug)]
 pub enum Command {
    // NOTE(review): `value_parser` on a subcommand variant looks unintended -- confirm
    // that it is needed (the field below already carries it).
    #[clap(value_parser)]
    /// Only try to tokenize the file
    Tokenize {
        #[clap(value_parser)]
        /// The file containing the trixy code to tokenize
        file: PathBuf,
    },
 }
 /// Entry point of the helper binary: parse the command line and run the
 /// requested subcommand.
 ///
 /// `tokenize` prints the debug representation of the token stream to stdout.
 /// Failures (unreadable file, lexing error) are reported on stderr and the
 /// process exits with status 1.
 pub fn main() {
    let args = Args::parse();
    match args.subcommand {
        Command::Tokenize { file } => {
            // The path comes straight from the user, so a failure here is an
            // ordinary error and not a bug worth a panic.
            let input = match fs::read_to_string(&file) {
                Ok(content) => content,
                Err(err) => {
                    eprintln!("Failed to read '{}': {}", file.display(), err);
                    exit(1);
                }
            };
            let input_tokens = match TokenStream::lex(&input) {
                Ok(tokens) => tokens,
                Err(err) => {
                    eprintln!("{}", err);
                    exit(1);
                }
            };
            println!("{:#?}", input_tokens);
        }
    }
 }