From ad415bcafe9693b6fb51ab902ca63db373a1b318 Mon Sep 17 00:00:00 2001 From: Eric-Paul Ickhorn Date: Tue, 28 Nov 2023 20:27:20 +0100 Subject: [PATCH] Made tokenizer store lines and columns of tokens --- code/inc/tokenizer.h | 3 +++ code/src/tokenizer.c | 53 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/code/inc/tokenizer.h b/code/inc/tokenizer.h index e5677fe..08d8ee2 100644 --- a/code/inc/tokenizer.h +++ b/code/inc/tokenizer.h @@ -38,6 +38,9 @@ struct pac_token pac_token_e type; usz_t offset; usz_t length; + + usz_t line; + usz_t column; }; struct pac_tlist diff --git a/code/src/tokenizer.c b/code/src/tokenizer.c index b275274..ca8ef3a 100644 --- a/code/src/tokenizer.c +++ b/code/src/tokenizer.c @@ -52,7 +52,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source) list.source = source; list.num_tokens = 0; list.tokens = calloc(sizeof(pac_token_s), tokens_capacity); - for(usz_t offset = 0; offset < len_source; ++offset) + + usz_t line = 1; + usz_t column = 1; + for(usz_t offset = 0; offset < len_source; ++offset, ++column) { // The subject of the current iteration of this loop; // the rune which is being looked at @@ -64,18 +67,31 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source) list.tokens = realloc(list.tokens, sizeof(pac_token_s) * tokens_capacity); } + if(subject == '\n') + { + ++line; + column = 0; // The for()-header will make it 1. 
+ continue; + } + if(pac_rune_is_blank(subject)) continue; - + if(subject == '"') { usz_t start_offset = offset; + usz_t start_line = line; + usz_t start_column = column; while(offset < len_source) { if(subject == '\\') + { ++offset; - + ++column; + } ++offset; + ++column; + subject = source[offset]; if(subject == '"') @@ -85,6 +101,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source) token.type = PAC_TOKEN_LIT_STRING; token.offset = start_offset + 1; // +1 for skipping the starting quotation mark token.length = (offset - start_offset) - 1; // -1 for leaving the ending quotation mark out + token.line = start_line; + token.column = start_column; list.tokens[list.num_tokens] = token; ++list.num_tokens; @@ -94,9 +112,12 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source) if(pac_rune_is_letter(subject) || (subject == '_')) { usz_t start_offset = offset; + usz_t start_line = line; + usz_t start_column = column; while(offset < len_source) { ++offset; + ++column; subject = source[offset]; if(!pac_rune_is_letter(subject) && !pac_rune_is_digit(subject) && (subject != '_')) @@ -108,7 +129,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source) token.offset = start_offset; token.length = offset - start_offset; token.type = pac_convert_word_to_token_type(&source[start_offset], token.length); + token.line = start_line; + token.column = start_column; --offset; // The for() - header will skip to the character after the word. 
+ --column; list.tokens[list.num_tokens] = token; ++list.num_tokens; @@ -117,8 +141,7 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source) if(subject == '#') { - usz_t offset_copy = offset; - ++offset_copy; + usz_t offset_copy = offset + 1; if(offset_copy < len_source) { @@ -136,17 +159,23 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source) } else if(second_sign == '[') { - while(offset_copy < len_source) + usz_t line_copy = line; + usz_t column_copy = column + 1; // +1, because there also is one at + while(offset_copy < len_source) // the creation of 'offset_copy'. { if(source[offset_copy] == ']') { + // TODO: There should be a check for whether there is one square bracket following this one. break; } - // If there's an escaped character here, do one jump more - if(source[offset_copy] == '\\') - ++offset_copy; + if(source[offset_copy] == '\n') + ++line_copy; + ++offset_copy; + ++column_copy; } + line = line_copy; + column = column_copy; } offset = offset_copy; } @@ -159,6 +188,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source) token.type = pac_convert_sign_to_token_type(subject); token.offset = offset; token.length = 1; + token.line = line; + token.column = column; list.tokens[list.num_tokens] = token; ++list.num_tokens; @@ -219,7 +250,9 @@ void pac_display_tlist(pac_tlist_s list) char *token_type_string = pac_stringify_token_type(token.type); char *token_type_indent = pac_create_spaces_for_indent(24 - strlen(token_type_string)); - printf("[%s]:%s %s\n", + printf("Column %-3lu @ Line %lu: [%s]%s %s\n", + token.column, + token.line, token_type_string, token_type_indent, content