Made tokenizer store lines and columns of tokens

This commit is contained in:
Eric-Paul Ickhorn 2023-11-28 20:27:20 +01:00
parent 5cecbaf11d
commit ad415bcafe
2 changed files with 46 additions and 10 deletions

View File

@ -38,6 +38,9 @@ struct pac_token
pac_token_e type;
usz_t offset;
usz_t length;
usz_t line;
usz_t column;
};
struct pac_tlist

View File

@ -52,7 +52,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
list.source = source;
list.num_tokens = 0;
list.tokens = calloc(sizeof(pac_token_s), tokens_capacity);
for(usz_t offset = 0; offset < len_source; ++offset)
usz_t line = 1;
usz_t column = 1;
for(usz_t offset = 0; offset < len_source; ++offset, ++column)
{
// The subject of the current iteration of this loop;
// the rune which is being looked at
@ -64,18 +67,31 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
list.tokens = realloc(list.tokens, sizeof(pac_token_s) * tokens_capacity);
}
if(subject == '\n')
{
++line;
column = 0; // The for()-header will make it 1.
continue;
}
if(pac_rune_is_blank(subject))
continue;
if(subject == '"')
{
usz_t start_offset = offset;
usz_t start_line = line;
usz_t start_column = column;
while(offset < len_source)
{
if(subject == '\\')
{
++offset;
++column;
}
++offset;
++column;
subject = source[offset];
if(subject == '"')
@ -85,6 +101,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
token.type = PAC_TOKEN_LIT_STRING;
token.offset = start_offset + 1; // +1 for skipping the starting quotation mark
token.length = (offset - start_offset) - 1; // -1 for leaving the ending quotation mark out
token.line = start_line;
token.column = start_column;
list.tokens[list.num_tokens] = token;
++list.num_tokens;
@ -94,9 +112,12 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
if(pac_rune_is_letter(subject) || (subject == '_'))
{
usz_t start_offset = offset;
usz_t start_line = line;
usz_t start_column = column;
while(offset < len_source)
{
++offset;
++column;
subject = source[offset];
if(!pac_rune_is_letter(subject) && !pac_rune_is_digit(subject) && (subject != '_'))
@ -108,7 +129,10 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
token.offset = start_offset;
token.length = offset - start_offset;
token.type = pac_convert_word_to_token_type(&source[start_offset], token.length);
token.line = start_line;
token.column = start_column;
--offset; // The for() - header will skip to the character after the word.
--column;
list.tokens[list.num_tokens] = token;
++list.num_tokens;
@ -117,8 +141,7 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
if(subject == '#')
{
usz_t offset_copy = offset;
++offset_copy;
usz_t offset_copy = offset + 1;
if(offset_copy < len_source)
{
@ -136,17 +159,23 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
}
else if(second_sign == '[')
{
while(offset_copy < len_source)
usz_t line_copy = line;
usz_t column_copy = column + 1; // +1, because there also is one at
while(offset_copy < len_source) // the creation of 'offset_copy'.
{
if(source[offset_copy] == ']')
{
// TODO: There should be a check for whether there is another square bracket following this one.
break;
}
// If there's an escaped character here, do one jump more
if(source[offset_copy] == '\\')
++offset_copy;
if(source[offset_copy] == '\n')
++line_copy;
++offset_copy;
++column_copy;
}
line = line_copy;
column = column_copy;
}
offset = offset_copy;
}
@ -159,6 +188,8 @@ pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
token.type = pac_convert_sign_to_token_type(subject);
token.offset = offset;
token.length = 1;
token.line = line;
token.column = column;
list.tokens[list.num_tokens] = token;
++list.num_tokens;
@ -219,7 +250,9 @@ void pac_display_tlist(pac_tlist_s list)
char *token_type_string = pac_stringify_token_type(token.type);
char *token_type_indent = pac_create_spaces_for_indent(24 - strlen(token_type_string));
printf("[%s]:%s %s\n",
printf("Column %-3lu @ Line %lu: [%s]%s %s\n",
token.column,
token.line,
token_type_string,
token_type_indent,
content