feat(parser/lexing): Desugar doc comments by running a regex on the file

Previously we actually supported parsing doc comments (`///`), but
replacing them before parsing allows for simplifications in the lexer.
Precisely, that means that we can add support for attributes without
having to maintain the doc comment parser.
This commit is contained in:
Benedikt Peetz 2024-03-24 18:09:57 +01:00
parent 918ab5df6d
commit bf3eb61110
Signed by: bpeetz
GPG Key ID: A5E94010C3A642AD
2 changed files with 34 additions and 4 deletions

View File

@ -33,8 +33,17 @@ pub struct Args {
/// The subcommand to execute
pub subcommand: Command,
}
#[derive(Subcommand, Debug)]
pub enum Command {
#[clap(value_parser)]
/// Only replace the regex replacements in the file
Replace {
#[clap(value_parser)]
/// The file containing the trixy code to replace
file: PathBuf,
},
#[clap(value_parser)]
/// Only try to tokenize the file
Tokenize {
@ -125,7 +134,7 @@ pub fn main() {
let processed = match parsed.process(input) {
Ok(ok) => ok,
Err(err) => {
eprintln!("Error while doing the seconde (checked) parsing run:");
eprintln!("Error while doing the second (checked) parsing run:");
eprintln!("{}", err);
exit(1)
}
@ -140,5 +149,11 @@ pub fn main() {
});
println!("{:#?}", parsed);
}
Command::Replace { file } => {
let input = fs::read_to_string(file).unwrap();
let parsed = TokenStream::replace(&input);
println!("{}", parsed);
}
}
}

View File

@ -19,7 +19,9 @@
* If not, see <https://www.gnu.org/licenses/>.
*/
use std::fmt::Display;
use std::{borrow::Cow, fmt::Display};
use regex::Regex;
use self::{error::SpannedLexingError, tokenizer::Tokenizer};
@ -36,13 +38,26 @@ pub struct TokenStream {
}
impl TokenStream {
/// Try to remove syntax sugar by applying regex matching to the input string
pub fn replace(src: &str) -> Cow<str> {
// vim regex
// :%s/\v^(\s*)\/\/\/(|[^/].*)$/\1#[doc = r#"\2"#]
let re = Regex::new(r"(?m)^(?<space>\s*)///(?<content>|[^/].*)$").unwrap();
// Replace all doc comments with their attribute
let src_new = re.replace_all(src, r##"$space#[doc = r#"$content"#]"##);
src_new
}
/// Turn a string of valid Trixy code into a list of tokens, including the
/// location of that token's start and end point in the original source code.
///
/// Note the token indices represent the half-open interval `[start, end)`,
/// equivalent to `start .. end` in Rust.
pub fn lex(src: &str) -> Result<Self, SpannedLexingError> {
let mut tokenizer = Tokenizer::new(src);
let src = Self::replace(src);
let mut tokenizer = Tokenizer::new(&src);
let mut tokens = Vec::new();
while let Some(tok) = tokenizer.next_token()? {
@ -57,7 +72,7 @@ impl TokenStream {
Ok(Self {
tokens,
original_file: src.to_owned(),
original_file: src.to_string(),
})
}