feat(parser/lexing): Desugar doc comments by running a regex on the file
Previously we supported parsing doc comments (`///`) directly, but replacing them before parsing simplifies the lexer: we can add support for attributes without having to maintain a dedicated doc comment parser.
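For illustration, a minimal sketch (not part of this commit) of what the desugaring does, using the regex and replacement string introduced below; the surrounding `main` is only scaffolding:

    // Sketch: desugar `///` doc comments into `#[doc = ...]` attributes
    // before lexing, using the regex from this commit.
    use regex::Regex;

    fn main() {
        let src = "    /// The subcommand to execute\n    pub subcommand: Command,";
        let re = Regex::new(r"(?m)^(?<space>\s*)///(?<content>|[^/].*)$").unwrap();
        // Every `///` line is rewritten in place, keeping its indentation.
        let desugared = re.replace_all(src, r##"$space#[doc = r#"$content"#]"##);
        // Prints:
        //     #[doc = r#" The subcommand to execute"#]
        //     pub subcommand: Command,
        println!("{}", desugared);
    }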
This commit is contained in:
parent 918ab5df6d
commit bf3eb61110
@@ -33,8 +33,17 @@ pub struct Args {
     /// The subcommand to execute
     pub subcommand: Command,
 }
+
 #[derive(Subcommand, Debug)]
 pub enum Command {
+    #[clap(value_parser)]
+    /// Only replace the regex replacements in the file
+    Replace {
+        #[clap(value_parser)]
+        /// The file containing the trixy code to replace
+        file: PathBuf,
+    },
+
     #[clap(value_parser)]
     /// Only try to tokenize the file
     Tokenize {
@@ -125,7 +134,7 @@ pub fn main() {
             let processed = match parsed.process(input) {
                 Ok(ok) => ok,
                 Err(err) => {
-                    eprintln!("Error while doing the seconde (checked) parsing run:");
+                    eprintln!("Error while doing the second (checked) parsing run:");
                     eprintln!("{}", err);
                     exit(1)
                 }
@@ -140,5 +149,11 @@ pub fn main() {
             });
             println!("{:#?}", parsed);
         }
+
+        Command::Replace { file } => {
+            let input = fs::read_to_string(file).unwrap();
+            let parsed = TokenStream::replace(&input);
+            println!("{}", parsed);
+        }
     }
 }

@@ -19,7 +19,9 @@
  * If not, see <https://www.gnu.org/licenses/>.
  */
 
-use std::fmt::Display;
+use std::{borrow::Cow, fmt::Display};
+
+use regex::Regex;
 
 use self::{error::SpannedLexingError, tokenizer::Tokenizer};
 
@@ -36,13 +38,26 @@ pub struct TokenStream {
 }
 
 impl TokenStream {
+    /// Try to remove syntax sugar by applying regex matching to the input string
+    pub fn replace(src: &str) -> Cow<str> {
+        // vim regex
+        // :%s/\v^(\s*)\/\/\/(|[^/].*)$/\1#[doc = r#"\2"#]
+        let re = Regex::new(r"(?m)^(?<space>\s*)///(?<content>|[^/].*)$").unwrap();
+
+        // Replace all doc comments with their attribute
+        let src_new = re.replace_all(src, r##"$space#[doc = r#"$content"#]"##);
+        src_new
+    }
+
     /// Turn a string of valid Trixy code into a list of tokens, including the
     /// location of that token's start and end point in the original source code.
     ///
     /// Note the token indices represent the half-open interval `[start, end)`,
     /// equivalent to `start .. end` in Rust.
     pub fn lex(src: &str) -> Result<Self, SpannedLexingError> {
-        let mut tokenizer = Tokenizer::new(src);
+        let src = Self::replace(src);
+
+        let mut tokenizer = Tokenizer::new(&src);
         let mut tokens = Vec::new();
 
         while let Some(tok) = tokenizer.next_token()? {
@@ -57,7 +72,7 @@ impl TokenStream {
 
         Ok(Self {
             tokens,
-            original_file: src.to_owned(),
+            original_file: src.to_string(),
         })
     }
 