Made tokenizer store lines and columns of tokens
parent 5cecbaf11d
commit ad415bcafe
@@ -38,6 +38,9 @@ struct pac_token
     pac_token_e type;
     usz_t offset;
     usz_t length;
+
+    usz_t line;
+    usz_t column;
 };
 
 struct pac_tlist
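With the two new fields every token carries the 1-based source position where it starts. A minimal sketch of how a consumer could use them for diagnostics (not part of this commit; pac_report_error is a hypothetical helper):

    #include <stdio.h>

    // Hypothetical helper; only the 'line' and 'column' fields come from this commit.
    void pac_report_error(pac_token_s token, const char *message)
    {
        fprintf(stderr, "error at line %lu, column %lu: %s\n",
                (unsigned long)token.line, (unsigned long)token.column, message);
    }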
@@ -52,7 +52,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
     list.source = source;
     list.num_tokens = 0;
     list.tokens = calloc(sizeof(pac_token_s), tokens_capacity);
-    for(usz_t offset = 0; offset < len_source; ++offset)
+
+    usz_t line = 1;
+    usz_t column = 1;
+    for(usz_t offset = 0; offset < len_source; ++offset, ++column)
     {
         // The subject of the current iteration of this loop;
         // the rune which is being looked at
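Initializing both counters to 1 makes positions 1-based, and the ++column in the for()-header advances the column once per consumed rune. A standalone sketch of the same counting scheme, assuming single-byte runes:

    #include <stdio.h>

    // Standalone illustration of the 1-based line/column counting above.
    int main(void)
    {
        const char *source = "ab\ncd";
        unsigned line = 1, column = 1;
        for(unsigned offset = 0; source[offset] != '\0'; ++offset, ++column)
        {
            if(source[offset] == '\n')
            {
                ++line;
                column = 0; // the for()-header bumps it back to 1
                continue;
            }
            printf("'%c' at line %u, column %u\n", source[offset], line, column);
        }
        return 0;
    }

This prints 'c' and 'd' at line 2, columns 1 and 2.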
@@ -64,18 +67,31 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
             list.tokens = realloc(list.tokens, sizeof(pac_token_s) * tokens_capacity);
         }
 
+        if(subject == '\n')
+        {
+            ++line;
+            column = 0; // The for()-header will make it 1.
+            continue;
+        }
+
         if(pac_rune_is_blank(subject))
             continue;
 
         if(subject == '"')
         {
             usz_t start_offset = offset;
+            usz_t start_line = line;
+            usz_t start_column = column;
             while(offset < len_source)
             {
                 if(subject == '\\')
                 {
                     ++offset;
+
+                    ++column;
                 }
                 ++offset;
+                ++column;
 
                 subject = source[offset];
 
                 if(subject == '"')
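Inside the string scanner every ++offset now has a matching ++column, so an escape, which consumes two bytes, also consumes two columns. An illustrative trace for a literal at the start of a line:

    source:  "  a  \  "  b  "
    column:  1  2  3  4  5  6

The escaped quote occupies columns 3 and 4, which is why the backslash branch advances both counters once more.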
@@ -85,6 +101,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
                     token.type = PAC_TOKEN_LIT_STRING;
                     token.offset = start_offset + 1; // +1 for skipping the starting quotation mark
                     token.length = (offset - start_offset) - 1; // -1 for leaving the ending quotation mark out
+                    token.line = start_line;
+                    token.column = start_column;
 
                     list.tokens[list.num_tokens] = token;
                     ++list.num_tokens;
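The token records the position of the opening quotation mark (start_line/start_column), not of the first content byte. An illustrative expectation, derived from the arithmetic above:

    source:  rule = "ab"
    offset:  0123456789
    token.offset = 8, token.length = 2   // the bytes 'ab'
    token.line = 1, token.column = 8     // position of the opening quote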
@@ -94,9 +112,12 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
         if(pac_rune_is_letter(subject) || (subject == '_'))
         {
             usz_t start_offset = offset;
+            usz_t start_line = line;
+            usz_t start_column = column;
             while(offset < len_source)
             {
                 ++offset;
+                ++column;
                 subject = source[offset];
 
                 if(!pac_rune_is_letter(subject) && !pac_rune_is_digit(subject) && (subject != '_'))
@@ -108,7 +129,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
                     token.offset = start_offset;
                     token.length = offset - start_offset;
                     token.type = pac_convert_word_to_token_type(&source[start_offset], token.length);
+                    token.line = start_line;
+                    token.column = start_column;
                     --offset; // The for()-header will skip to the character after the word.
+                    --column;
 
                     list.tokens[list.num_tokens] = token;
                     ++list.num_tokens;
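The word scanner stops with offset/column one rune past the word, and the for()-header would advance them again; stepping both back once keeps the pair in sync. Illustrative trace:

    source:  rule =
    column:  123456
    // The scan ends on the space (column 5); --offset/--column step back to
    // column 4 so the for()-header's ++offset/++column re-land on the space,
    // which the blank check then skips.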
@@ -117,8 +141,7 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
 
         if(subject == '#')
         {
-            usz_t offset_copy = offset;
-            ++offset_copy;
+            usz_t offset_copy = offset + 1;
 
             if(offset_copy < len_source)
             {
@@ -136,17 +159,23 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
                 }
                 else if(second_sign == '[')
                 {
-                    while(offset_copy < len_source)
+                    usz_t line_copy = line;
+                    usz_t column_copy = column + 1; // +1, because there also is one at
+                    while(offset_copy < len_source) // the creation of 'offset_copy'.
                     {
                         if(source[offset_copy] == ']')
                         {
                             // TODO: There should be a check for whether there is one square bracket following this one.
                             break;
                         }
                         // If there's an escaped character here, do one jump more
                         if(source[offset_copy] == '\\')
                             ++offset_copy;
+                        if(source[offset_copy] == '\n')
+                            ++line_copy;
 
                         ++offset_copy;
+                        ++column_copy;
                     }
+                    line = line_copy;
+                    column = column_copy;
                 }
                 offset = offset_copy;
             }
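A quick way to exercise the new bookkeeping (an illustrative call only, assuming the declarations in this file are in scope):

    // A block comment that spans two lines; the token after it
    // should be reported at line 2.
    char grammar[] = "#[ first line\nsecond line ]rule";
    pac_tlist_s list = pac_tokenize_grammar(grammar, sizeof(grammar) - 1);
    pac_display_tlist(list);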
@@ -159,6 +188,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
         token.type = pac_convert_sign_to_token_type(subject);
         token.offset = offset;
         token.length = 1;
+        token.line = line;
+        token.column = column;
 
         list.tokens[list.num_tokens] = token;
         ++list.num_tokens;
@@ -219,7 +250,9 @@ void pac_display_tlist(pac_tlist_s list)
 
         char *token_type_string = pac_stringify_token_type(token.type);
         char *token_type_indent = pac_create_spaces_for_indent(24 - strlen(token_type_string));
-        printf("[%s]:%s %s\n",
+        printf("Column %-3lu @ Line %lu: [%s]%s %s\n",
+            token.column,
+            token.line,
             token_type_string,
             token_type_indent,
             content
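The %-3lu keeps the line/column prefix a fixed width, so the bracketed type column still lines up. Hypothetical output for a token starting at line 2, column 5 (the actual type strings depend on pac_stringify_token_type):

    Column 5   @ Line 2: [PAC_TOKEN_WORD]          rule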