Made tokenizer store lines and columns of tokens

parent 5cecbaf11d
commit ad415bcafe
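In outline: each pac_token now records the line and column (both 1-based) of its first rune. The tokenizer keeps two extra counters next to offset: the for()-header increments column once per rune, and on a newline the loop body increments line and resets column to 0 so that the header's increment lands it back on 1. Below is a standalone sketch of just that counting rule; the sample input and the mapping of usz_t to unsigned long are illustrative assumptions, not taken from the repository.

#include <stdio.h>

typedef unsigned long usz_t; /* assumption: usz_t is an unsigned size type */

int main(void)
{
    const char *source = "name = value\nrule = \"x\"\n";
    usz_t line = 1;
    usz_t column = 1;

    for(usz_t offset = 0; source[offset] != '\0'; ++offset, ++column)
    {
        if(source[offset] == '\n')
        {
            ++line;
            column = 0; /* the for()-header bumps it back to 1 */
            continue;
        }
        printf("offset %2lu -> line %lu, column %lu ('%c')\n",
               offset, line, column, source[offset]);
    }
    return 0;
}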
@@ -38,6 +38,9 @@ struct pac_token
     pac_token_e type;
     usz_t offset;
     usz_t length;
+
+    usz_t line;
+    usz_t column;
 };
 
 struct pac_tlist
@@ -52,7 +52,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
     list.source = source;
     list.num_tokens = 0;
     list.tokens = calloc(sizeof(pac_token_s), tokens_capacity);
-    for(usz_t offset = 0; offset < len_source; ++offset)
+
+    usz_t line = 1;
+    usz_t column = 1;
+    for(usz_t offset = 0; offset < len_source; ++offset, ++column)
     {
         // The subject of the current iteration of this loop;
         // the rune which is being looked at
@@ -64,18 +67,31 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
             list.tokens = realloc(list.tokens, sizeof(pac_token_s) * tokens_capacity);
         }
 
+        if(subject == '\n')
+        {
+            ++line;
+            column = 0; // The for()-header will make it 1.
+            continue;
+        }
+
         if(pac_rune_is_blank(subject))
             continue;
 
         if(subject == '"')
         {
             usz_t start_offset = offset;
+            usz_t start_line = line;
+            usz_t start_column = column;
             while(offset < len_source)
             {
                 if(subject == '\\')
+                {
                     ++offset;
+                    ++column;
+                }
                 ++offset;
+                ++column;
 
                 subject = source[offset];
 
                 if(subject == '"')
@@ -85,6 +101,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
                     token.type = PAC_TOKEN_LIT_STRING;
                     token.offset = start_offset + 1; // +1 for skipping the starting quotation mark
                     token.length = (offset - start_offset) - 1; // -1 for leaving the ending quotation mark out
+                    token.line = start_line;
+                    token.column = start_column;
 
                     list.tokens[list.num_tokens] = token;
                     ++list.num_tokens;
@@ -94,9 +112,12 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
         if(pac_rune_is_letter(subject) || (subject == '_'))
         {
             usz_t start_offset = offset;
+            usz_t start_line = line;
+            usz_t start_column = column;
             while(offset < len_source)
             {
                 ++offset;
+                ++column;
                 subject = source[offset];
 
                 if(!pac_rune_is_letter(subject) && !pac_rune_is_digit(subject) && (subject != '_'))
@@ -108,7 +129,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
             token.offset = start_offset;
             token.length = offset - start_offset;
             token.type = pac_convert_word_to_token_type(&source[start_offset], token.length);
+            token.line = start_line;
+            token.column = start_column;
             --offset; // The for()-header will skip to the character after the word.
+            --column;
 
             list.tokens[list.num_tokens] = token;
             ++list.num_tokens;
@@ -117,8 +141,7 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
 
         if(subject == '#')
         {
-            usz_t offset_copy = offset;
-            ++offset_copy;
+            usz_t offset_copy = offset + 1;
 
             if(offset_copy < len_source)
             {
@@ -136,17 +159,23 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
                 }
                 else if(second_sign == '[')
                 {
-                    while(offset_copy < len_source)
+                    usz_t line_copy = line;
+                    usz_t column_copy = column + 1; // +1, because there also is one at
+                    while(offset_copy < len_source) // the creation of 'offset_copy'.
                     {
                         if(source[offset_copy] == ']')
                         {
+                            // TODO: There should be a check for whether there is one square bracket following this one.
                             break;
                         }
+                        if(source[offset_copy] == '\n')
+                            ++line_copy;
+
                         ++offset_copy;
+                        ++column_copy;
                     }
+                    line = line_copy;
+                    column = column_copy;
                 }
                 offset = offset_copy;
             }
@@ -159,6 +188,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
         token.type = pac_convert_sign_to_token_type(subject);
         token.offset = offset;
         token.length = 1;
+        token.line = line;
+        token.column = column;
 
         list.tokens[list.num_tokens] = token;
         ++list.num_tokens;
@@ -219,7 +250,9 @@ void pac_display_tlist(pac_tlist_s list)
 
         char *token_type_string = pac_stringify_token_type(token.type);
         char *token_type_indent = pac_create_spaces_for_indent(24 - strlen(token_type_string));
-        printf("[%s]:%s %s\n",
+        printf("Column %-3lu @ Line %lu: [%s]%s %s\n",
+               token.column,
+               token.line,
                token_type_string,
                token_type_indent,
                content
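With these fields in place, any consumer of a pac_token_s can point at the exact source position, which is the usual payoff for error reporting. A minimal sketch of such a consumer follows; the report_at_token helper, the trimmed-down struct, and the grammar.pac path are illustrative assumptions, and only the line/column fields come from this diff.

#include <stdio.h>

typedef unsigned long usz_t; /* assumption: usz_t is an unsigned size type */

/* Trimmed-down stand-in for the real struct pac_token; type, offset
   and length are omitted here. */
typedef struct
{
    usz_t line;   /* 1-based line of the token's first rune */
    usz_t column; /* 1-based column of the token's first rune */
} pac_token_s;

/* Hypothetical helper: format a diagnostic in file:line:column style. */
static void report_at_token(const char *path, pac_token_s token, const char *message)
{
    fprintf(stderr, "%s:%lu:%lu: %s\n", path, token.line, token.column, message);
}

int main(void)
{
    pac_token_s token = { .line = 3, .column = 14 };
    report_at_token("grammar.pac", token, "unexpected token");
    return 0;
}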