feat(parser/lexing): Desugar doc comments by running a regex on the file

Previously we actually supported parsing doc comments (`///`), but
replacing them before parsing allows for simplifications in the lexer.
Precisely, that means that we can add support for attributes without
having to maintain the doc comment parser.
This commit is contained in:
Benedikt Peetz 2024-03-24 18:09:57 +01:00
parent 918ab5df6d
commit bf3eb61110
Signed by: bpeetz
GPG Key ID: A5E94010C3A642AD
2 changed files with 34 additions and 4 deletions

View File

@ -33,8 +33,17 @@ pub struct Args {
/// The subcommand to execute
pub subcommand: Command,
}
#[derive(Subcommand, Debug)]
pub enum Command {
#[clap(value_parser)]
/// Only replace the regex replacements in the file
Replace {
#[clap(value_parser)]
/// The file containing the trixy code to replace
file: PathBuf,
},
#[clap(value_parser)]
/// Only try to tokenize the file
Tokenize {
@ -125,7 +134,7 @@ pub fn main() {
let processed = match parsed.process(input) {
Ok(ok) => ok,
Err(err) => {
eprintln!("Error while doing the seconde (checked) parsing run:");
eprintln!("Error while doing the second (checked) parsing run:");
eprintln!("{}", err);
exit(1)
}
@ -140,5 +149,11 @@ pub fn main() {
});
println!("{:#?}", parsed);
}
Command::Replace { file } => {
let input = fs::read_to_string(file).unwrap();
let parsed = TokenStream::replace(&input);
println!("{}", parsed);
}
}
}

View File

@ -19,7 +19,9 @@
* If not, see <https://www.gnu.org/licenses/>.
*/
use std::fmt::Display;
use std::{borrow::Cow, fmt::Display};
use regex::Regex;
use self::{error::SpannedLexingError, tokenizer::Tokenizer};
@ -36,13 +38,26 @@ pub struct TokenStream {
}
impl TokenStream {
/// Try to remove syntax sugar by applying regex matching to the input string
pub fn replace(src: &str) -> Cow<str> {
// vim regex
// :%s/\v^(\s*)\/\/\/(|[^/].*)$/\1#[doc = r#"\2"#]
let re = Regex::new(r"(?m)^(?<space>\s*)///(?<content>|[^/].*)$").unwrap();
// Replace all doc comments with their attribute
let src_new = re.replace_all(src, r##"$space#[doc = r#"$content"#]"##);
src_new
}
/// Turn a string of valid Trixy code into a list of tokens, including the
/// location of that token's start and end point in the original source code.
///
/// Note the token indices represent the half-open interval `[start, end)`,
/// equivalent to `start .. end` in Rust.
pub fn lex(src: &str) -> Result<Self, SpannedLexingError> {
let mut tokenizer = Tokenizer::new(src);
let src = Self::replace(src);
let mut tokenizer = Tokenizer::new(&src);
let mut tokens = Vec::new();
while let Some(tok) = tokenizer.next_token()? {
@ -57,7 +72,7 @@ impl TokenStream {
Ok(Self {
tokens,
original_file: src.to_owned(),
original_file: src.to_string(),
})
}