Made tokenizer store lines and columns of tokens

This commit is contained in:
Eric-Paul Ickhorn 2023-11-28 20:27:20 +01:00
parent 5cecbaf11d
commit ad415bcafe
2 changed files with 46 additions and 10 deletions

View File

@ -38,6 +38,9 @@ struct pac_token
pac_token_e type;
usz_t offset;
usz_t length;
usz_t line;
usz_t column;
};
struct pac_tlist

View File

@ -52,7 +52,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
list.source = source;
list.num_tokens = 0;
list.tokens = calloc(sizeof(pac_token_s), tokens_capacity);
for(usz_t offset = 0; offset < len_source; ++offset)
usz_t line = 1;
usz_t column = 1;
for(usz_t offset = 0; offset < len_source; ++offset, ++column)
{
// The subject of the current iteration of this loop;
// the rune which is being looked at
@ -64,18 +67,31 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
list.tokens = realloc(list.tokens, sizeof(pac_token_s) * tokens_capacity);
}
if(subject == '\n')
{
++line;
column = 0; // The for()-header will make it 1.
continue;
}
if(pac_rune_is_blank(subject))
continue;
if(subject == '"')
{
usz_t start_offset = offset;
usz_t start_line = line;
usz_t start_column = column;
while(offset < len_source)
{
if(subject == '\\')
{
++offset;
++column;
}
++offset;
++column;
subject = source[offset];
if(subject == '"')
@ -85,6 +101,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
token.type = PAC_TOKEN_LIT_STRING;
token.offset = start_offset + 1; // +1 for skipping the starting quotation mark
token.length = (offset - start_offset) - 1; // -1 for leaving the ending quotation mark out
token.line = start_line;
token.column = start_column;
list.tokens[list.num_tokens] = token;
++list.num_tokens;
@ -94,9 +112,12 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
if(pac_rune_is_letter(subject) || (subject == '_'))
{
usz_t start_offset = offset;
usz_t start_line = line;
usz_t start_column = column;
while(offset < len_source)
{
++offset;
++column;
subject = source[offset];
if(!pac_rune_is_letter(subject) && !pac_rune_is_digit(subject) && (subject != '_'))
@ -108,7 +129,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
token.offset = start_offset;
token.length = offset - start_offset;
token.type = pac_convert_word_to_token_type(&source[start_offset], token.length);
token.line = start_line;
token.column = start_column;
--offset; // The for() - header will skip to the character after the word.
--column;
list.tokens[list.num_tokens] = token;
++list.num_tokens;
@ -117,8 +141,7 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
if(subject == '#')
{
usz_t offset_copy = offset;
++offset_copy;
usz_t offset_copy = offset + 1;
if(offset_copy < len_source)
{
@ -136,17 +159,23 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
}
else if(second_sign == '[')
{
while(offset_copy < len_source)
usz_t line_copy = line;
usz_t column_copy = column + 1; // +1, because there also is one at
while(offset_copy < len_source) // the creation of 'offset_copy'.
{
if(source[offset_copy] == ']')
{
// TODO: There should be a check for whether there is another square bracket following this one.
break;
}
// If there's an escaped character here, do one jump more
if(source[offset_copy] == '\\')
++offset_copy;
if(source[offset_copy] == '\n')
++line_copy;
++offset_copy;
++column_copy;
}
line = line_copy;
column = column_copy;
}
offset = offset_copy;
}
@ -159,6 +188,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
token.type = pac_convert_sign_to_token_type(subject);
token.offset = offset;
token.length = 1;
token.line = line;
token.column = column;
list.tokens[list.num_tokens] = token;
++list.num_tokens;
@ -219,7 +250,9 @@ void pac_display_tlist(pac_tlist_s list)
char *token_type_string = pac_stringify_token_type(token.type);
char *token_type_indent = pac_create_spaces_for_indent(24 - strlen(token_type_string));
printf("[%s]:%s %s\n",
printf("Column %-3lu @ Line %lu: [%s]%s %s\n",
token.column,
token.line,
token_type_string,
token_type_indent,
content