Made tokenizer store lines and columns of tokens

Eric-Paul Ickhorn 2023-11-28 20:27:20 +01:00
parent 5cecbaf11d
commit ad415bcafe
2 changed files with 46 additions and 10 deletions

View File

@@ -38,6 +38,9 @@ struct pac_token
     pac_token_e type;
     usz_t offset;
     usz_t length;
+    usz_t line;
+    usz_t column;
 };
 
 struct pac_tlist
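With line and column stored on every token, a caller can point diagnostics at the exact source position. The following is a minimal sketch of such a consumer, not code from this repository: it assumes usz_t is an unsigned integer type in the size_t family, and the helper name is invented purely for illustration.

#include <stdio.h>

/* Hypothetical helper, not part of the project: prints where a token sits.
   The casts assume usz_t converts cleanly to size_t. */
static void report_unexpected_token(const char *source, pac_token_s token)
{
    fprintf(stderr, "error at line %zu, column %zu: unexpected '%.*s'\n",
            (size_t) token.line, (size_t) token.column,
            (int) token.length, &source[token.offset]);
}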

View File

@@ -52,7 +52,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
     list.source = source;
     list.num_tokens = 0;
     list.tokens = calloc(sizeof(pac_token_s), tokens_capacity);
-    for(usz_t offset = 0; offset < len_source; ++offset)
+    usz_t line = 1;
+    usz_t column = 1;
+    for(usz_t offset = 0; offset < len_source; ++offset, ++column)
     {
         // The subject of the current iteration of this loop;
         // the rune which is being looked at
@@ -64,18 +67,31 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
             list.tokens = realloc(list.tokens, sizeof(pac_token_s) * tokens_capacity);
         }
+        if(subject == '\n')
+        {
+            ++line;
+            column = 0; // The for()-header will make it 1.
+            continue;
+        }
         if(pac_rune_is_blank(subject))
             continue;
         if(subject == '"')
         {
             usz_t start_offset = offset;
+            usz_t start_line = line;
+            usz_t start_column = column;
             while(offset < len_source)
             {
                 if(subject == '\\')
+                {
                     ++offset;
+                    ++column;
+                }
                 ++offset;
+                ++column;
                 subject = source[offset];
                 if(subject == '"')
@@ -85,6 +101,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
             token.type = PAC_TOKEN_LIT_STRING;
             token.offset = start_offset + 1; // +1 for skipping the starting quotation mark
             token.length = (offset - start_offset) - 1; // -1 for leaving the ending quotation mark out
+            token.line = start_line;
+            token.column = start_column;
             list.tokens[list.num_tokens] = token;
             ++list.num_tokens;
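The counting scheme above is easiest to see in isolation: the for()-header bumps column on every rune, a newline bumps line and parks column at 0 so the next header increment yields 1, and multi-rune tokens remember the line/column of their first rune in start_line/start_column. A standalone sketch of the same idea (not project code, plain size_t instead of usz_t):

#include <stddef.h>

/* Sketch: compute the 1-based line/column of index 'target' inside 'text',
   using the same header-increment-plus-reset trick as the tokenizer. */
void locate(const char *text, size_t target, size_t *line, size_t *column)
{
    *line = 1;
    *column = 1;
    for(size_t offset = 0; offset < target; ++offset, ++*column)
    {
        if(text[offset] == '\n')
        {
            ++*line;
            *column = 0; /* the loop header makes it 1 again */
        }
    }
}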
@@ -94,9 +112,12 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
         if(pac_rune_is_letter(subject) || (subject == '_'))
         {
             usz_t start_offset = offset;
+            usz_t start_line = line;
+            usz_t start_column = column;
             while(offset < len_source)
             {
                 ++offset;
+                ++column;
                 subject = source[offset];
                 if(!pac_rune_is_letter(subject) && !pac_rune_is_digit(subject) && (subject != '_'))
@@ -108,7 +129,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
             token.offset = start_offset;
             token.length = offset - start_offset;
             token.type = pac_convert_word_to_token_type(&source[start_offset], token.length);
+            token.line = start_line;
+            token.column = start_column;
             --offset; // The for() - header will skip to the character after the word.
+            --column;
             list.tokens[list.num_tokens] = token;
             ++list.num_tokens;
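The word scanner overshoots onto the delimiter, so both offset and column step back by one and the for()-header re-advances them; the token itself keeps the position of its first rune. With offset/length plus line/column, a word token can be echoed back directly, for example (sketch only, assuming usz_t converts to size_t and source is the buffer that was tokenized):

printf("%.*s (line %zu, column %zu)\n",
       (int) token.length, &source[token.offset],
       (size_t) token.line, (size_t) token.column);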
@@ -117,8 +141,7 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
         if(subject == '#')
         {
-            usz_t offset_copy = offset;
-            ++offset_copy;
+            usz_t offset_copy = offset + 1;
             if(offset_copy < len_source)
             {
@@ -136,17 +159,23 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
                 }
                 else if(second_sign == '[')
                 {
-                    while(offset_copy < len_source)
+                    usz_t line_copy = line;
+                    usz_t column_copy = column + 1; // +1, because there also is one at
+                    while(offset_copy < len_source) // the creation of 'offset_copy'.
                     {
                         if(source[offset_copy] == ']')
                         {
+                            // TODO: There should be a check for whether there is one square bracket following this one.
                             break;
                         }
-                        // If there's an escaped character here, do one jump more
-                        if(source[offset_copy] == '\\')
-                            ++offset_copy;
+                        if(source[offset_copy] == '\n')
+                            ++line_copy;
                         ++offset_copy;
+                        ++column_copy;
                     }
+                    line = line_copy;
+                    column = column_copy;
                 }
                 offset = offset_copy;
             }
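Inside a multi-line #[ ... ] comment the scanner keeps shadow counters (line_copy, column_copy) and writes them back once the closing bracket is found. One detail worth looking at is the column after a newline; below is a sketch of the same skip loop that also resets the column the way the main loop does. This is illustration only, not code from this commit:

while(offset_copy < len_source && source[offset_copy] != ']')
{
    if(source[offset_copy] == '\n')
    {
        ++line_copy;
        column_copy = 0; /* the next increment lands on column 1 */
    }
    ++offset_copy;
    ++column_copy;
}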
@@ -159,6 +188,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
         token.type = pac_convert_sign_to_token_type(subject);
         token.offset = offset;
         token.length = 1;
+        token.line = line;
+        token.column = column;
         list.tokens[list.num_tokens] = token;
         ++list.num_tokens;
@@ -219,7 +250,9 @@ void pac_display_tlist(pac_tlist_s list)
         char *token_type_string = pac_stringify_token_type(token.type);
         char *token_type_indent = pac_create_spaces_for_indent(24 - strlen(token_type_string));
-        printf("[%s]:%s %s\n",
+        printf("Column %-3lu @ Line %lu: [%s]%s %s\n",
+               token.column,
+               token.line,
                token_type_string,
                token_type_indent,
                content
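With both the tokenizer and the printer updated, the change can be exercised end to end. A usage sketch follows; the project's header name is not shown in this diff, and usz_t is assumed to accept a strlen() result.

#include <string.h>
/* #include "..."  -- the tokenizer's own header goes here; its name is not
   part of this diff. */

int main(void)
{
    char grammar[] = "rule ::= name\nother";
    pac_tlist_s list = pac_tokenize_grammar(grammar, strlen(grammar));
    /* Expected positions: "rule" at line 1, column 1; "name" at line 1,
       column 10; "other" at line 2, column 1. */
    pac_display_tlist(list);
    return 0;
}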