First code

2023-11-26 19:49:54 +01:00 · 2023-11-26 19:49:54 +01:00 · 816f613cc1
commit 816f613cc1
9 changed files with 761 additions and 0 deletions
--- a/code/inc/ast.h
+++ b/code/inc/ast.h
@ -0,0 +1,71 @@
 #ifndef PARCEL_AST_H
 #define PARCEL_AST_H
 #include <utility.h>
 // Forward declarations of all AST node types; the '_s'-names are the
 // typedef'd short forms of the structures defined further down.
 typedef struct pac_ast                  pac_ast_s;
 typedef struct pac_ast_rule             pac_ast_rule_s;
 typedef struct pac_ast_variant          pac_ast_variant_s;
 typedef struct pac_ast_literal          pac_ast_literal_s;
 typedef struct pac_ast_reference        pac_ast_reference_s;
 typedef struct pac_ast_item             pac_ast_item_s;
 // pac_ast: The root of the syntax tree; a list of all rules of a grammar.
 struct pac_ast
 {
    // num_rules: Number of entries of the 'rules'-array which are in use.
    usz_t                       num_rules;
    // rules: Heap-allocated array of rules.
    pac_ast_rule_s             *rules;
 };
 // pac_ast_rule: A named rule which consists of one or more variants.
 struct pac_ast_rule
 {
    // name: Heap-allocated, NUL-terminated copy of the name in the rule's tag.
    char                       *name;
    // num_variants: Number of entries of the 'variants'-array which are in use.
    usz_t                       num_variants;
    pac_ast_variant_s          *variants;
 };
 // pac_ast_variant: One alternative of a rule; a sequence of items.
 struct pac_ast_variant
 {
    // num_items: Number of entries of the 'items'-array which are in use.
    usz_t                       num_items;
    // items: Heap-allocated array of the items, in source order.
    pac_ast_item_s             *items;
 };
 // pac_ast_reference: Also called non-terminal, a reference is an item
 //  which represents all the contents of another rule.
 //
 //  NOTE(review): Nothing fills this structure yet and 'pac_ast_item'
 //  has no member for it; the field meanings below are assumptions.
 struct pac_ast_reference
 {
    // len_name: Presumably the length of 'name' without a terminator - TODO confirm
    usz_t                       len_name;
    // name: Presumably the name of the referenced rule - TODO confirm
    char                       *name;
 };
 // pac_ast_literal: An item which matches one exact string.
 struct pac_ast_literal
 {
    // length: Number of characters in 'string', not counting the terminator.
    usz_t                       length;
    // string: Heap-allocated, NUL-terminated copy of the text between the
    //  quotation marks, with escape sequences left as they were written.
    char                       *string;
 };
 // pac_outline_e: An enumeration of all outlines known to Parcel.
 //
 //  Outlines are tokens of which only the rough format is known, like
 //  with variable names; the format is known, but the actual name isn't.
 //
 //  PAC_OUTLINE_RUNE has the value 0, so a zeroed item carries it too.
 typedef enum
 {
    PAC_OUTLINE_RUNE,
    PAC_OUTLINE_WORD,
    PAC_OUTLINE_INTEGER,
    PAC_OUTLINE_FLOAT
 } pac_outline_e;
 // pac_ast_item: A single element of a variant; either a literal string
 //  or an outline.
 struct pac_ast_item
 {
    // is_literal: Selects the valid member of 'data':
    //  TRUE for 'literal', FALSE for 'outline'.
    bool_t                      is_literal;
    union pac_item_data
    {
        pac_ast_literal_s           literal;
        pac_outline_e               outline;
    } data;
 };
 #endif // PARCEL_AST_H
--- a/code/inc/parcel.h
+++ b/code/inc/parcel.h
@ -0,0 +1,32 @@
 #ifndef PARCEL_H
 #define PARCEL_H
 #include <utility.h>
 // Forward declarations of the public types; the '_s'-names are the
 // typedef'd short forms of the structures defined below.
 typedef struct pac_grammar              pac_grammar_s;
 typedef struct pac_log_entry            pac_log_entry_s;
 typedef struct pac_log                  pac_log_s;
 // pac_log_entry: A single message of a log.
 //
 //  NOTE(review): Not used by any code yet; presumably 'length' is the
 //  length of 'text' - TODO confirm once the log is implemented.
 struct pac_log_entry
 {
    usz_t                       length;
    char                       *text;
 };
 // pac_log: A list of log entries.
 //
 //  NOTE(review): Not used by any code yet; presumably 'length' is the
 //  number of entries in the 'entries'-array - TODO confirm.
 struct pac_log
 {
    usz_t                       length;
    pac_log_entry_s            *entries;
 };
 // pac_grammar: The result of converting a grammar's source.
 //
 //  It is a placeholder without any members for now. Note that a
 //  structure without members is a compiler extension (GNU C), not ISO C.
 struct pac_grammar
 {
 };
 // pac_convert_grammar: Tokenizes the NUL-terminated grammar in 'source',
 //  prints the resulting token list and grows a syntax tree from it.
 //  The returned grammar doesn't carry any data yet.
 pac_grammar_s               pac_convert_grammar         (char *source);
 // void                        pac_delete_grammar          (pac_grammar_s grammar);
 #endif // PARCEL_H
--- a/code/inc/tokenizer.h
+++ b/code/inc/tokenizer.h
@ -0,0 +1,58 @@
 #ifndef PARCEL_TOKENIZER_H
 #define PARCEL_TOKENIZER_H
 #include <utility.h>
 // Forward declarations of the tokenizer's structures.
 typedef struct pac_token                    pac_token_s;
 typedef struct pac_tlist                    pac_tlist_s;            // Token List
 // pac_token_e: All kinds of tokens the tokenizer knows.
 typedef enum
 {
    // Signs which have no meaning in a grammar end up as stray tokens.
    PAC_TOKEN_STRAY             = 0,
    // A run of letters and underscores which isn't a keyword.
    PAC_TOKEN_WORD,
    // Keywords: "true", "false", "alpha", "word" and "integer".
    PAC_TOKEN_KEYWORD_TRUE,
    PAC_TOKEN_KEYWORD_FALSE,
    PAC_TOKEN_KEYWORD_ALPHA,
    PAC_TOKEN_KEYWORD_WORD,
    PAC_TOKEN_KEYWORD_INTEGER,
    // Literals; only strings are recognized by the tokenizer so far.
    PAC_TOKEN_LIT_STRING,
    PAC_TOKEN_LIT_RUNE,                 // TODO
    PAC_TOKEN_LIT_INTEGER,              // TODO
    // Signs: < > = : , - _ | ;
    PAC_TOKEN_SIGN_OPEN_TAG,
    PAC_TOKEN_SIGN_CLOSE_TAG,
    PAC_TOKEN_SIGN_EQUALS,
    PAC_TOKEN_SIGN_COLON,
    PAC_TOKEN_SIGN_COMMA,
    PAC_TOKEN_SIGN_HYPHEN,
    // NOTE(review): the tokenizer treats '_' as part of a word before it
    //  looks for signs, so this type doesn't seem to be produced there.
    PAC_TOKEN_SIGN_UNDERSCORE,
    PAC_TOKEN_SIGN_VERTICAL_BAR,
    PAC_TOKEN_SIGN_SEMICOLON
 } pac_token_e;
 // pac_token: A classified slice of the source text.
 struct pac_token
 {
    pac_token_e                 type;
    // offset: Index of the token's first character in the source text.
    //  For string literals, this is the character after the opening
    //  quotation mark.
    usz_t                       offset;
    // length: Number of characters of the token. For string literals,
    //  the quotation marks aren't counted.
    usz_t                       length;
 };
 // pac_tlist: All tokens of one source text, in the order of appearance.
 struct pac_tlist
 {
    // source: The text into which the offsets of the tokens point;
    //  it is referenced, not copied.
    char                       *source;
    // num_tokens: Number of entries of the 'tokens'-array which are in use.
    usz_t                       num_tokens;
    // tokens: Heap-allocated array of the tokens.
    pac_token_s                *tokens;
    // cursor: An index into the 'tokens'-array; used in later stages.
    usz_t                       cursor;
 };
 pac_token_e             pac_word_to_token_type          (char *word, usz_t length);
 pac_tlist_s             pac_tokenize_grammar            (char *source, usz_t len_source);
 char *                  pac_stringify_token_type        (pac_token_e type);
 void                    pac_display_tlist               (pac_tlist_s list);
 #endif
--- a/code/inc/utility.h
+++ b/code/inc/utility.h
@ -0,0 +1,44 @@
 #ifndef TN_UTIL_TYPES_H
 #define TN_UTIL_TYPES_H
 // Integer and floating point types named after their intended width.
 //
 //  NOTE(review): The widths aren't enforced anywhere; they rely on the
 //  sizes the platform gives to char, short, int and long. 'long' is
 //  assumed to be 64 bits wide, which doesn't hold on every platform
 //  (e.g. 64-bit Windows) - confirm for each new target.
 typedef signed char             i8_t;
 typedef signed short            i16_t;
 typedef signed int              i32_t;
 typedef signed long             i64_t;
 typedef unsigned char           u8_t;
 typedef unsigned short          u16_t;
 typedef unsigned int            u32_t;
 typedef unsigned long           u64_t;
 typedef float                   f32_t;
 typedef double                  f64_t;
 // rune_t: A single character.
 typedef u32_t                   rune_t;
 // bool_t: Holds either TRUE or FALSE.
 typedef u8_t                    bool_t;
 // usz_t / isz_t: Unsigned and signed type for sizes, counts and offsets.
 //  They are 32 bits wide if the build defines __TN_OLD_PROCESSOR__.
 #ifdef __TN_OLD_PROCESSOR__
 typedef u32_t                   usz_t;
 typedef i32_t                   isz_t;
 #else
 typedef u64_t                   usz_t;
 typedef i64_t                   isz_t;
 #endif
 // TRUE / FALSE: The two values of a bool_t.
 //  Every constant is only defined if it doesn't exist yet, so that this
 //  header can be included before or after headers which provide it.
 #ifndef TRUE
 #define TRUE  (1)
 #endif
 #ifndef FALSE
 #define FALSE (0)
 #endif
 // NULL: Defined here for sources which don't include a standard header;
 //  an existing definition of the standard library is left untouched.
 #ifndef NULL
 #define NULL ((void *) 0)
 #endif
 // Classification of single runes. All of them only know ASCII and
 // return TRUE if the rune belongs to the class, otherwise FALSE.
 // pac_rune_is_lower_letter: 'a' to 'z'
 bool_t      pac_rune_is_lower_letter    (rune_t rune);
 // pac_rune_is_upper_letter: 'A' to 'Z'
 bool_t      pac_rune_is_upper_letter    (rune_t rune);
 // pac_rune_is_letter: Lower or upper letter
 bool_t      pac_rune_is_letter          (rune_t rune);
 // pac_rune_is_digit: '0' to '9'
 bool_t      pac_rune_is_digit           (rune_t rune);
 // pac_rune_is_blank: Space or horizontal tab; line breaks aren't blank.
 bool_t      pac_rune_is_blank           (rune_t rune);
 // pac_rune_is_sign: Any printable ASCII character which is neither a
 //  letter, a digit nor the space.
 bool_t      pac_rune_is_sign            (rune_t rune);
 #endif // Include Guard (TN_UTIL_TYPES_H)
--- a/code/src/ast.c
+++ b/code/src/ast.c
@ -0,0 +1,206 @@
 #include <parcel.h>
 #include <ast.h>
 #include <tokenizer.h>
 #include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
 // Shorthands for working with the token list of the parsing functions.
 // All of them expect a variable 'tlist' (pointer to the token list) to be
 // in scope; none of them checks the cursor against the list's bounds,
 // except for END_REACHED, which is that check.
 // CURRENT_STRING: Pointer to the first character of the current token
 #define CURRENT_STRING      (&tlist->source[tlist->tokens[tlist->cursor].offset])
 // CURRENT: The token at the cursor
 #define CURRENT             (tlist->tokens[tlist->cursor])
 // SKIP_TOKEN: Moves the cursor to the next token. It is parenthesized so
 //  that it stays one unit wherever it gets expanded.
 #define SKIP_TOKEN          (++tlist->cursor)
 // TOKEN_AT: The token at an arbitrary index
 #define TOKEN_AT(index)     (tlist->tokens[(index)])
 // END_REACHED: Whether the cursor has run past the last token
 #define END_REACHED         (tlist->cursor >= tlist->num_tokens)
 // pac_grow_reference: Parses a reference (non-terminal) item.
 //
 //  Not implemented yet: No token is consumed, 'reference' is left
 //  untouched and a negative value is returned, which is how the other
 //  parsing functions report a failure. Without the explicit return,
 //  using the result of this function would be undefined behavior.
 i32_t pac_grow_reference(pac_tlist_s *tlist, pac_ast_reference_s *reference)
 {
    (void) tlist;
    (void) reference;
    return -1;
 }
 // pac_grow_item: Parses the token at the cursor into a single item.
 //
 //  A string literal becomes a literal item which owns a NUL-terminated
 //  copy of the string's content; the keywords 'word' and 'integer'
 //  become outline items.
 //
 //  Returns 1 and moves the cursor behind the token on success.
 //  Returns -1 without consuming anything if there's no token left, if
 //  the token can't be an item or if the memory for a literal couldn't
 //  be allocated. The item is zeroed in all of these cases.
 i32_t pac_grow_item(pac_tlist_s *tlist, pac_ast_item_s *item)
 {
    memset(item, 0x00, sizeof(pac_ast_item_s));
    if(END_REACHED)
        return -1;
    switch(CURRENT.type)
    {
        case PAC_TOKEN_LIT_STRING:
        {
            usz_t                   length                  = CURRENT.length;
            char                   *string                  = malloc(length + 1);
            if(string == NULL)
                return -1;
            memcpy(string, CURRENT_STRING, length);
            string[length]              = 0x00;
            item->is_literal            = TRUE;
            item->data.literal.length   = length;
            item->data.literal.string   = string;
            SKIP_TOKEN;
        } return 1;
        case PAC_TOKEN_KEYWORD_WORD:
        {
            item->data.outline          = PAC_OUTLINE_WORD;
            SKIP_TOKEN;
        } return 1;
        case PAC_TOKEN_KEYWORD_INTEGER:
        {
            item->data.outline          = PAC_OUTLINE_INTEGER;
            SKIP_TOKEN;
        } return 1;
        default:
            break;
    }
    return -1;
 }
 // pac_grow_variant: Parses a comma-separated sequence of items.
 //
 //  The variant ends in front of a vertical bar or a semicolon; this
 //  terminating sign is NOT consumed. Only the items which were parsed
 //  successfully are counted in 'num_items'.
 //
 //  Returns the number of consumed tokens on success and a negative value
 //  if an item is invalid, an item isn't followed by ',', '|' or ';', the
 //  token list ends within the variant or the memory ran out. The variant
 //  keeps the items parsed up to that point.
 i32_t pac_grow_variant(pac_tlist_s *tlist, pac_ast_variant_s *variant)
 {
    usz_t                   start_index             = tlist->cursor;
    usz_t                   items_capacity          = 8;
    memset(variant, 0x00, sizeof(pac_ast_variant_s));
    variant->items                  = calloc(items_capacity, sizeof(pac_ast_item_s));
    if(variant->items == NULL)
        return -1;
    while(!END_REACHED)
    {
        if(variant->num_items >= items_capacity)
        {
            // The array must be grown with realloc(); allocating a fresh
            // one would drop all items parsed so far.
            usz_t                   new_capacity            = items_capacity * 2;
            pac_ast_item_s         *new_items               = realloc(variant->items, sizeof(pac_ast_item_s) * new_capacity);
            if(new_items == NULL)
                return -1;
            variant->items              = new_items;
            items_capacity              = new_capacity;
        }
        i32_t                   success                 = pac_grow_item(tlist, &variant->items[variant->num_items]);
        if(success < 0)
            return success - 1;
        ++variant->num_items;
        // The item may have been the very last token; there is no
        // current token to look at in that case.
        if(END_REACHED)
            break;
        if(CURRENT.type == PAC_TOKEN_SIGN_VERTICAL_BAR)
            return (i32_t) (tlist->cursor - start_index);
        if(CURRENT.type == PAC_TOKEN_SIGN_SEMICOLON)
            return (i32_t) (tlist->cursor - start_index);
        if(CURRENT.type != PAC_TOKEN_SIGN_COMMA)
            return -1;
        SKIP_TOKEN;
    }
    return -1;
 }
 // pac_grow_rule: Parses one rule of the form
 //      <name> = variant | variant | ... ;
 //
 //  Returns the number of consumed tokens if the whole rule was parsed.
 //  Returns 2 if a variant was invalid and the rest of the rule was skipped
 //  up to and including its semicolon. Returns -1 if the header is invalid,
 //  the token list ends within the rule or the memory ran out; the cursor
 //  stays wherever the problem was found.
 //
 //  A variant which failed to parse is still counted in 'num_variants', so
 //  that the memory it owns stays reachable through the rule.
 i32_t pac_grow_rule(pac_tlist_s *tlist, pac_ast_rule_s *rule)
 {
    memset(rule, 0x00, sizeof(pac_ast_rule_s));
    // Parse the header
    // (END_REACHED is checked before every look at the current token,
    //  because the source may stop at any point of the header.)
    usz_t                   start_index             = tlist->cursor;
    if(END_REACHED || (CURRENT.type != PAC_TOKEN_SIGN_OPEN_TAG))
        return -1;
    SKIP_TOKEN;
    if(END_REACHED || (CURRENT.type != PAC_TOKEN_WORD))
    {
        puts("A rule name must be a single word!");
        return -1;
    }
    usz_t                   len_name                = CURRENT.length;
    usz_t                   name_start              = CURRENT.offset;
    SKIP_TOKEN;
    if(END_REACHED || (CURRENT.type != PAC_TOKEN_SIGN_CLOSE_TAG))
    {
        puts("Missing Tag closing sign!");
        return -1;
    }
    SKIP_TOKEN;
    if(END_REACHED || (CURRENT.type != PAC_TOKEN_SIGN_EQUALS))
        return -1;
    SKIP_TOKEN;
    // Parse all variants
    rule->name                      = malloc(len_name + 1);
    if(rule->name == NULL)
        return -1;
    memcpy(rule->name, &tlist->source[name_start], len_name);
    rule->name[len_name]            = 0;
    usz_t                   variants_capacity       = 4;
    rule->variants                  = malloc(sizeof(pac_ast_variant_s) * variants_capacity);
    if(rule->variants == NULL)
        return -1;
    while(!END_REACHED)
    {
        if(rule->num_variants >= variants_capacity)
        {
            // The result goes into a temporary first; the old array would
            // be lost if realloc() failed.
            usz_t                   new_capacity            = variants_capacity * 2;
            pac_ast_variant_s      *new_variants            = realloc(rule->variants, sizeof(pac_ast_variant_s) * new_capacity);
            if(new_variants == NULL)
                return -1;
            rule->variants              = new_variants;
            variants_capacity           = new_capacity;
        }
        i32_t success               = pac_grow_variant(tlist, &rule->variants[rule->num_variants]);
        ++rule->num_variants;
        if(success < 0)
        {
            // The variant number is 1-based, as the counter was already
            // incremented.
            printf("Failed parsing variant %lu of rule '%s'. ", (unsigned long) rule->num_variants, rule->name);
            // Skip the remains of the broken variant
            while(!END_REACHED)
            {
                if(CURRENT.type == PAC_TOKEN_SIGN_VERTICAL_BAR)
                {
                    printf("Continuing with next variant.\n");
                    break;
                }
                if(CURRENT.type == PAC_TOKEN_SIGN_SEMICOLON)
                {
                    printf("Continuing with next rule.\n");
                    SKIP_TOKEN;
                    return 2;
                }
                SKIP_TOKEN;
            }
            if(END_REACHED)
            {
                printf("Reached the end of the grammar.\n");
                return -1;
            }
        }
        if(END_REACHED)
            break;
        if(CURRENT.type == PAC_TOKEN_SIGN_SEMICOLON)
        {
            SKIP_TOKEN;
            return (i32_t) (tlist->cursor - start_index);
        }
        if(CURRENT.type != PAC_TOKEN_SIGN_VERTICAL_BAR)
        {
            return -1;
        }
        SKIP_TOKEN;
    }
    return -1;
 }
 // pac_grow_ast: Parses all rules of a token list into a syntax tree.
 //
 //  The list is taken by value; the cursor of the caller's list isn't
 //  moved. A rule which failed to parse is reported and still counted in
 //  'num_rules', so that the memory it may own stays reachable; the tokens
 //  up to and including the next semicolon are skipped afterwards.
 //
 //  If the memory runs out, the rules gathered up to that point are
 //  returned ('rules' is NULL if not even the array could be allocated).
 pac_ast_s pac_grow_ast(pac_tlist_s tokens)
 {
    usz_t               rules_capacity              = 32;
    pac_ast_s           ast;
    ast.num_rules               = 0;
    ast.rules                   = malloc(sizeof(pac_ast_rule_s) * rules_capacity);
    if(ast.rules == NULL)
        return ast;
    while(tokens.cursor < tokens.num_tokens)
    {
        if(ast.num_rules >= rules_capacity)
        {
            usz_t               new_capacity                = rules_capacity * 2;
            pac_ast_rule_s     *new_rules                   = realloc(ast.rules, sizeof(pac_ast_rule_s) * new_capacity);
            if(new_rules == NULL)
                break;
            ast.rules                   = new_rules;
            rules_capacity              = new_capacity;
        }
        int                 success                     = pac_grow_rule(&tokens, &ast.rules[ast.num_rules]);
        ++ast.num_rules;
        if(success < 0)
        {
            printf("Failed parsing a rule at index: %lu!\n", (unsigned long) tokens.cursor);
            // A failed rule may not have consumed anything. Continue behind
            // the next semicolon; this guarantees that the loop makes
            // progress and it skips the remains of the broken rule.
            while(tokens.cursor < tokens.num_tokens)
            {
                pac_token_e         skipped_type                = tokens.tokens[tokens.cursor].type;
                ++tokens.cursor;
                if(skipped_type == PAC_TOKEN_SIGN_SEMICOLON)
                    break;
            }
        }
    }
    return ast;
 }
 pac_grammar_s pac_convert_grammar(char *source)
 {
    usz_t                   len_source              = strlen(source);
    pac_tlist_s             tokens                  = pac_tokenize_grammar(source, len_source);
    pac_display_tlist(tokens);
    pac_ast_s               ast                     = pac_grow_ast(tokens);
    pac_grammar_s           grammar;
    return grammar;
 }
--- a/code/src/main.c
+++ b/code/src/main.c
@ -0,0 +1,43 @@
 #include <parcel.h>
 #include <ast.h>
 #include <stdio.h>
 #include <stdlib.h>
 // load_file: Reads the whole file at 'path' into memory.
 //
 //  Returns a heap-allocated, NUL-terminated copy of the file's content
 //  which must be released with free() by the caller. Returns NULL and
 //  prints a message if the file can't be opened, its size can't be
 //  determined, the memory can't be allocated or reading fails.
 char * load_file(char *path)
 {
    FILE       *file            = fopen(path, "r");
    if(file == NULL)
    {
        printf("Failed opening file at '%s'\n", path);
        return NULL;
    }
    char       *content         = NULL;
    long        length          = -1;
    // ftell() reports errors with -1, which must never be used as a size.
    if(fseek(file, 0, SEEK_END) == 0)
        length                      = ftell(file);
    if((length >= 0) && (fseek(file, 0, SEEK_SET) == 0))
        content                     = malloc((size_t) length + 1);
    if(content != NULL)
    {
        // Fewer bytes than the file's size may arrive (e.g. if line
        // endings get translated), so the terminator goes behind
        // what was actually read.
        size_t      num_read        = fread(content, 1, (size_t) length, file);
        content[num_read]           = 0x00;
        if(ferror(file))
        {
            free(content);
            content                     = NULL;
        }
    }
    if(content == NULL)
        printf("Failed reading file at '%s'\n", path);
    fclose(file);
    return content;
 }
 // main: Loads the grammar file named by the only argument and converts it.
 //
 //  Returns 0 on success, -1 if the arguments are wrong and -2 if the
 //  file couldn't be loaded.
 int main(int argc, char **argv)
 {
    if(argc != 2)
    {
        // There's no program name in argv if argc is 0.
        printf("Usage: %s <filename>\n", (argc > 0) ? argv[0] : "parcel");
        return -1;
    }
    char       *source          = load_file(argv[1]);
    if(source == NULL)
    {
        puts("Stopping due to previous error!");
        return -2;
    }
    pac_convert_grammar(source);
    // Nothing refers to the source text after the conversion.
    free(source);
    return 0;
 }
--- a/code/src/runes.c
+++ b/code/src/runes.c
@ -0,0 +1,77 @@
 #include <utility.h>
 // pac_rune_is_sign_of_block_1: Tells whether the rune lies in the first
 //  block of ASCII signs, 0x21 ('!') up to and including 0x2f ('/').
 bool_t pac_rune_is_sign_of_block_1(rune_t rune)
 {
    return ((rune >= 0x21) && (rune <= 0x2f)) ? TRUE : FALSE;
 }
 // pac_rune_is_sign_of_block_2: Tells whether the rune lies in the second
 //  block of ASCII signs, 0x3a (':') up to and including 0x40 ('@').
 bool_t pac_rune_is_sign_of_block_2(rune_t rune)
 {
    return ((rune >= 0x3a) && (rune <= 0x40)) ? TRUE : FALSE;
 }
 // pac_rune_is_sign_of_block_3: Tells whether the rune lies in the third
 //  block of ASCII signs, 0x5b ('[') up to and including 0x60 ('`').
 bool_t pac_rune_is_sign_of_block_3(rune_t rune)
 {
    return ((rune >= 0x5b) && (rune <= 0x60)) ? TRUE : FALSE;
 }
 // pac_rune_is_sign_of_block_4: Tells whether the rune lies in the fourth
 //  block of ASCII signs, 0x7b ('{') up to and including 0x7e ('~').
 bool_t pac_rune_is_sign_of_block_4(rune_t rune)
 {
    return ((rune >= 0x7b) && (rune <= 0x7e)) ? TRUE : FALSE;
 }
 // pac_rune_is_lower_letter: Tells whether the rune is one of 'a' to 'z'.
 bool_t pac_rune_is_lower_letter(rune_t rune)
 {
    return ((rune >= 'a') && (rune <= 'z')) ? TRUE : FALSE;
 }
 // pac_rune_is_upper_letter: Tells whether the rune is one of 'A' to 'Z'.
 bool_t pac_rune_is_upper_letter(rune_t rune)
 {
    return ((rune >= 'A') && (rune <= 'Z')) ? TRUE : FALSE;
 }
 // pac_rune_is_letter: Tells whether the rune is a lower or upper letter.
 bool_t pac_rune_is_letter(rune_t rune)
 {
    bool_t      is_letter       = FALSE;
    if(pac_rune_is_lower_letter(rune) || pac_rune_is_upper_letter(rune))
        is_letter                   = TRUE;
    return is_letter;
 }
 // pac_rune_is_digit: Tells whether the rune is one of '0' to '9'.
 bool_t pac_rune_is_digit(rune_t rune)
 {
    return ((rune >= '0') && (rune <= '9')) ? TRUE : FALSE;
 }
 // pac_rune_is_blank: Tells whether the rune is a space or a horizontal
 //  tab. Line breaks don't count as blank.
 bool_t pac_rune_is_blank(rune_t rune)
 {
    switch(rune)
    {
        case ' ':
        case '\t':
            return TRUE;
        default:
            return FALSE;
    }
 }
 // pac_rune_is_sign: Tells whether the rune belongs to one of the four
 //  blocks of ASCII signs; the blocks are tried in ascending order.
 bool_t pac_rune_is_sign(rune_t rune)
 {
    bool_t      is_sign         = FALSE;
    if(pac_rune_is_sign_of_block_1(rune)
    || pac_rune_is_sign_of_block_2(rune)
    || pac_rune_is_sign_of_block_3(rune)
    || pac_rune_is_sign_of_block_4(rune))
    {
        is_sign                     = TRUE;
    }
    return is_sign;
 }
--- a/code/src/tokenizer.c
+++ b/code/src/tokenizer.c
@ -0,0 +1,228 @@
 #include <tokenizer.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 // pac_convert_word_to_token_type: Classifies a word.
 //
 //  If the first 'length' characters of 'word' spell a keyword, the type
 //  of that keyword is returned; every other word is a PAC_TOKEN_WORD.
 //  The word doesn't need to be NUL-terminated.
 pac_token_e pac_convert_word_to_token_type(char *word, usz_t length)
 {
    static const struct
    {
        const char             *text;
        usz_t                   length;
        pac_token_e             type;
    } keywords[] =
    {
        { "true",       4,  PAC_TOKEN_KEYWORD_TRUE      },
        { "word",       4,  PAC_TOKEN_KEYWORD_WORD      },
        { "false",      5,  PAC_TOKEN_KEYWORD_FALSE     },
        { "alpha",      5,  PAC_TOKEN_KEYWORD_ALPHA     },
        { "integer",    7,  PAC_TOKEN_KEYWORD_INTEGER   }
    };
    const usz_t             num_keywords            = sizeof(keywords) / sizeof(keywords[0]);
    for(usz_t index = 0; index < num_keywords; ++index)
    {
        // Only keywords of the word's length are compared, so that
        // never more than 'length' characters of the word are read.
        if(keywords[index].length != length)
            continue;
        if(memcmp(word, keywords[index].text, length) == 0)
            return keywords[index].type;
    }
    return PAC_TOKEN_WORD;
 }
 // pac_convert_sign_to_token_type: Classifies a sign.
 //
 //  Returns the token type of the signs which have a meaning in a
 //  grammar and PAC_TOKEN_STRAY for every other rune.
 pac_token_e pac_convert_sign_to_token_type(rune_t sign)
 {
    static const struct
    {
        rune_t                  sign;
        pac_token_e             type;
    } known_signs[] =
    {
        { '<',  PAC_TOKEN_SIGN_OPEN_TAG     },
        { '>',  PAC_TOKEN_SIGN_CLOSE_TAG    },
        { '=',  PAC_TOKEN_SIGN_EQUALS       },
        { ':',  PAC_TOKEN_SIGN_COLON        },
        { ',',  PAC_TOKEN_SIGN_COMMA        },
        { '-',  PAC_TOKEN_SIGN_HYPHEN       },
        { '_',  PAC_TOKEN_SIGN_UNDERSCORE   },
        { '|',  PAC_TOKEN_SIGN_VERTICAL_BAR },
        { ';',  PAC_TOKEN_SIGN_SEMICOLON    }
    };
    const usz_t             num_known_signs         = sizeof(known_signs) / sizeof(known_signs[0]);
    for(usz_t index = 0; index < num_known_signs; ++index)
    {
        if(known_signs[index].sign == sign)
            return known_signs[index].type;
    }
    return PAC_TOKEN_STRAY;
 }
 // pac_tokenize_grammar: Splits the source text of a grammar into tokens.
 //
 //  Only the first 'len_source' characters of 'source' are looked at; no
 //  character at or behind that length is read, so the text doesn't need
 //  a terminator. The following things are recognized:
 //   - Strings in quotation marks; a backslash escapes the next character.
 //     The token covers the content without the quotation marks.
 //   - Words made of letters and underscores; keywords get their own type.
 //   - Comments: '##' up to the end of the line and '#[' up to the next
 //     ']'. A '#' which doesn't start a comment is dropped on its own.
 //   - Signs; those without a meaning become PAC_TOKEN_STRAY.
 //  Everything else (blanks, line breaks, digits, ...) is skipped.
 //
 //  The returned list references 'source' and owns the 'tokens'-array,
 //  which must be released with free(). If the memory runs out, the list
 //  contains the tokens found up to that point ('tokens' is NULL and
 //  'num_tokens' is 0 if not even the array could be allocated).
 pac_tlist_s pac_tokenize_grammar(char *source, usz_t len_source)
 {
    usz_t                   tokens_capacity         = 1024;
    pac_tlist_s             list;
    list.cursor                     = 0;
    list.source                     = source;
    list.num_tokens                 = 0;
    list.tokens                     = calloc(tokens_capacity, sizeof(pac_token_s));
    if(list.tokens == NULL)
        return list;
    for(usz_t offset = 0; offset < len_source; ++offset)
    {
        // The subject of the current iteration of this loop;
        // the rune which is being looked at
        rune_t                  subject                 = source[offset];
        // Every iteration adds one token at most, so making room for
        // one token here is enough.
        if(list.num_tokens >= tokens_capacity)
        {
            usz_t                   new_capacity            = tokens_capacity * 2;
            pac_token_s            *new_tokens              = realloc(list.tokens, sizeof(pac_token_s) * new_capacity);
            if(new_tokens == NULL)
                return list;
            list.tokens                     = new_tokens;
            tokens_capacity                 = new_capacity;
        }
        if(pac_rune_is_blank(subject))
            continue;
        if(subject == '"')
        {
            usz_t                   start_offset            = offset;
            // Search for the closing quotation mark
            ++offset;
            while(offset < len_source)
            {
                if(source[offset] == '"')
                    break;
                // If there's an escaped character here, do one jump more
                if(source[offset] == '\\')
                    ++offset;
                ++offset;
            }
            // An escape at the very end of an unterminated string
            // jumps one character too far.
            if(offset > len_source)
                offset                          = len_source;
            pac_token_s             token;
            token.type                      = PAC_TOKEN_LIT_STRING;
            token.offset                    = start_offset + 1;             // +1 for skipping the starting quotation mark
            token.length                    = (offset - start_offset) - 1;  // -1 for leaving the ending quotation mark out
            list.tokens[list.num_tokens]    = token;
            ++list.num_tokens;
            continue;
        }
        if(pac_rune_is_letter(subject) || (subject == '_'))
        {
            usz_t                   start_offset            = offset;
            // Search for the first character which isn't part of the word
            ++offset;
            while(offset < len_source)
            {
                rune_t                  next                    = source[offset];
                if(!pac_rune_is_letter(next) && (next != '_'))
                    break;
                ++offset;
            }
            pac_token_s             token;
            token.offset                    = start_offset;
            token.length                    = offset - start_offset;
            token.type                      = pac_convert_word_to_token_type(&source[start_offset], token.length);
            --offset; // The for() - header will skip to the character after the word.
            list.tokens[list.num_tokens]    = token;
            ++list.num_tokens;
            continue;
        }
        if(subject == '#')
        {
            usz_t                   offset_copy             = offset + 1;
            if(offset_copy < len_source)
            {
                rune_t                  second_sign             = source[offset_copy];
                if(second_sign == '#')
                {
                    // Line comment; the for() - header skips the line break.
                    while((offset_copy < len_source) && (source[offset_copy] != '\n'))
                        ++offset_copy;
                    offset                          = offset_copy;
                }
                else if(second_sign == '[')
                {
                    // Block comment; the for() - header skips the ']'.
                    while(offset_copy < len_source)
                    {
                        if(source[offset_copy] == ']')
                            break;
                        // If there's an escaped character here, do one jump more
                        if(source[offset_copy] == '\\')
                            ++offset_copy;
                        ++offset_copy;
                    }
                    offset                          = offset_copy;
                }
                // Otherwise, only the '#' itself is dropped; the character
                // behind it is tokenized as usual in the next iteration.
            }
            continue;
        }
        if(pac_rune_is_sign(subject))
        {
            pac_token_s             token;
            token.type                      = pac_convert_sign_to_token_type(subject);
            token.offset                    = offset;
            token.length                    = 1;
            list.tokens[list.num_tokens]    = token;
            ++list.num_tokens;
            continue;
        }
    }
    return list;
 }
 // pac_stringify_token_type: Returns a human-readable name of a token type.
 //
 //  The returned string is a constant; it must neither be modified nor
 //  freed. Values which aren't a known token type give "Invalid".
 char * pac_stringify_token_type(pac_token_e type)
 {
    // The names, indexed by the value of their token type
    static char *const      type_names[]            =
    {
        [PAC_TOKEN_STRAY]               = " - ",
        [PAC_TOKEN_WORD]                = "Word",
        [PAC_TOKEN_KEYWORD_TRUE]        = "Keyword: \"true\"",
        [PAC_TOKEN_KEYWORD_FALSE]       = "Keyword: \"false\"",
        [PAC_TOKEN_KEYWORD_ALPHA]       = "Keyword: \"alpha\"",
        [PAC_TOKEN_KEYWORD_WORD]        = "Keyword: \"word\"",
        [PAC_TOKEN_KEYWORD_INTEGER]     = "Keyword: \"integer\"",
        [PAC_TOKEN_LIT_STRING]          = "String",
        [PAC_TOKEN_LIT_RUNE]            = "Rune",
        [PAC_TOKEN_LIT_INTEGER]         = "Integer",
        [PAC_TOKEN_SIGN_OPEN_TAG]       = "Sign: <",
        [PAC_TOKEN_SIGN_CLOSE_TAG]      = "Sign: >",
        [PAC_TOKEN_SIGN_EQUALS]         = "Sign: =",
        [PAC_TOKEN_SIGN_COLON]          = "Sign: :",
        [PAC_TOKEN_SIGN_COMMA]          = "Sign: ,",
        [PAC_TOKEN_SIGN_HYPHEN]         = "Sign: -",
        [PAC_TOKEN_SIGN_UNDERSCORE]     = "Sign: _",
        [PAC_TOKEN_SIGN_VERTICAL_BAR]   = "Sign: |",
        [PAC_TOKEN_SIGN_SEMICOLON]      = "Sign: ;"
    };
    const usz_t             num_type_names          = sizeof(type_names) / sizeof(type_names[0]);
    // Negative values become huge by the conversion and are rejected too.
    usz_t                   name_index              = (usz_t) type;
    if(name_index >= num_type_names)
        return "Invalid";
    if(type_names[name_index] == NULL)
        return "Invalid";
    return type_names[name_index];
 }
 // pac_spaces: The buffer behind pac_create_spaces_for_indent().
 char pac_spaces[256];
 // pac_create_spaces_for_indent: Returns a string of 'count' spaces.
 //
 //  The string lives in the shared 'pac_spaces'-buffer, so every call
 //  overwrites the result of the previous one.
 char * pac_create_spaces_for_indent(u8_t count)
 {
    for(usz_t index = 0; index < sizeof(pac_spaces); ++index)
        pac_spaces[index] = ' ';
    // 'count' can't exceed 255, so the terminator always fits.
    pac_spaces[count] = 0x00;
    return pac_spaces;
 }
 // pac_display_tlist: Prints all tokens of a list to the standard output.
 //
 //  A headline with the number of tokens is followed by one line per
 //  token: The name of the token's type in brackets, padded to a common
 //  column, and the text of the token.
 void pac_display_tlist(pac_tlist_s list)
 {
    printf("Displaying %lu Tokens:\n", (unsigned long) list.num_tokens);
    for(usz_t index = 0; index < list.num_tokens; ++index)
    {
        pac_token_s             token               = list.tokens[index];
        // The text is written straight from the source instead of being
        // copied to the stack first; a token can be as long as the
        // whole input.
        const char             *content             = &list.source[token.offset];
        usz_t                   len_content         = token.length;
        // Like with a copied string, the output stops at a terminator
        const char             *terminator          = memchr(content, 0x00, len_content);
        if(terminator != NULL)
            len_content                     = (usz_t) (terminator - content);
        char                   *token_type_string   = pac_stringify_token_type(token.type);
        char                   *token_type_indent   = pac_create_spaces_for_indent(24 - strlen(token_type_string));
        printf("[%s]:%s ",
            token_type_string,
            token_type_indent
        );
        fwrite(content, 1, len_content, stdout);
        putchar('\n');
    }
 }
--- a/samples/sample_token_01.parcel
+++ b/samples/sample_token_01.parcel
@ -0,0 +1,2 @@
 <root> = integer, "Something", integer | integer, word;
		`@ -0,0 +1,2 @@`

							`<root> = integer, "Something", integer \| integer, word;`