Enhanced Tokenizer to support more special signs

This commit is contained in:
Eric-Paul Ickhorn 2023-11-29 21:39:24 +01:00
parent e70e9c5cef
commit c5e21b4dc8
8 changed files with 67 additions and 54 deletions

View File

@ -1,4 +1,4 @@
#include <parcel.h> #include <grammar.h>
#include <ast.h> #include <ast.h>
#include <stdio.h> #include <stdio.h>

View File

@ -1,4 +1,4 @@
#include <parcel.h> #include <grammar.h>
#include <tokenizer.h> #include <tokenizer.h>
#include <ast.h> #include <ast.h>

View File

@ -21,15 +21,23 @@ typedef enum
PAC_TOKEN_LIT_RUNE, // TODO PAC_TOKEN_LIT_RUNE, // TODO
PAC_TOKEN_LIT_INTEGER, // TODO PAC_TOKEN_LIT_INTEGER, // TODO
PAC_TOKEN_SIGN_OPEN_TAG, PAC_TOKEN_SIGN_OPENING_TAG,
PAC_TOKEN_SIGN_CLOSE_TAG, PAC_TOKEN_SIGN_CLOSING_TAG,
PAC_TOKEN_SIGN_OPENING_BRACKET,
PAC_TOKEN_SIGN_CLOSING_BRACKET,
PAC_TOKEN_SIGN_EQUALS, PAC_TOKEN_SIGN_EQUALS,
PAC_TOKEN_SIGN_SEMICOLON,
PAC_TOKEN_SIGN_COLON, PAC_TOKEN_SIGN_COLON,
PAC_TOKEN_SIGN_COMMA, PAC_TOKEN_SIGN_COMMA,
PAC_TOKEN_SIGN_HYPHEN,
PAC_TOKEN_SIGN_UNDERSCORE, PAC_TOKEN_SIGN_UNDERSCORE,
PAC_TOKEN_SIGN_VERTICAL_BAR, PAC_TOKEN_SIGN_VERTICAL_BAR,
PAC_TOKEN_SIGN_SEMICOLON PAC_TOKEN_SIGN_AMPERSAND,
PAC_TOKEN_SIGN_DOLLAR,
PAC_TOKEN_SIGN_SLASH,
PAC_TOKEN_SIGN_HASH,
PAC_TOKEN_SIGN_AT,
PAC_TOKEN_SIGN_PLUS,
PAC_TOKEN_SIGN_MINUS,
} pac_token_e; } pac_token_e;

View File

@ -17,7 +17,7 @@ pac_ast_status_e pac_build_ast_reference(pac_ast_builder_s *builder, pac_ast_ref
{ {
usz_t start_cursor = TOKEN_CURSOR; usz_t start_cursor = TOKEN_CURSOR;
if(CURRENT_TOKEN.type != PAC_TOKEN_SIGN_OPEN_TAG) if(CURRENT_TOKEN.type != PAC_TOKEN_SIGN_OPENING_TAG)
{ {
pac_internal_error_s error; pac_internal_error_s error;
error.type = PAC_INTERNAL_ERROR_INVALID_RETURNED_STATUS; error.type = PAC_INTERNAL_ERROR_INVALID_RETURNED_STATUS;
@ -35,7 +35,7 @@ pac_ast_status_e pac_build_ast_reference(pac_ast_builder_s *builder, pac_ast_ref
} }
SKIP_TOKEN; SKIP_TOKEN;
if(CURRENT_TOKEN.type != PAC_TOKEN_SIGN_CLOSE_TAG) if(CURRENT_TOKEN.type != PAC_TOKEN_SIGN_CLOSING_TAG)
{ {
return pac_ast_handle_missing_reference_close_tag(builder); return pac_ast_handle_missing_reference_close_tag(builder);
} }
@ -78,7 +78,7 @@ pac_ast_status_e pac_build_ast_item(pac_ast_builder_s *builder, pac_ast_item_s *
return pac_build_ast_string_literal(builder, &item->data.string_literal); return pac_build_ast_string_literal(builder, &item->data.string_literal);
} }
if(CURRENT_TOKEN.type == PAC_TOKEN_SIGN_OPEN_TAG) if(CURRENT_TOKEN.type == PAC_TOKEN_SIGN_OPENING_TAG)
{ {
item->type = PAC_AST_ITEM_REFERENCE; item->type = PAC_AST_ITEM_REFERENCE;
return pac_build_ast_reference(builder, &item->data.reference); return pac_build_ast_reference(builder, &item->data.reference);
@ -137,7 +137,7 @@ pac_ast_status_e pac_build_ast_variant(pac_ast_builder_s *builder, pac_ast_varia
// Test if there is no separator and, if there is none, handle the error. // Test if there is no separator and, if there is none, handle the error.
if( if(
(CURRENT_TOKEN.type != PAC_TOKEN_SIGN_COMMA) (CURRENT_TOKEN.type != PAC_TOKEN_SIGN_COMMA)
&& (CURRENT_TOKEN.type != PAC_TOKEN_SIGN_HYPHEN) && (CURRENT_TOKEN.type != PAC_TOKEN_SIGN_MINUS)
) { ) {
status = pac_ast_handle_missing_item_separator(builder, rule_name, variant_index, variant->num_items); status = pac_ast_handle_missing_item_separator(builder, rule_name, variant_index, variant->num_items);
if(status != PAC_AST_STATUS_ERROR_HANDLED) return status; if(status != PAC_AST_STATUS_ERROR_HANDLED) return status;
@ -152,7 +152,7 @@ pac_ast_status_e pac_build_ast_variant(pac_ast_builder_s *builder, pac_ast_varia
pac_ast_status_e pac_skip_ast_rule_header(pac_ast_builder_s *builder) pac_ast_status_e pac_skip_ast_rule_header(pac_ast_builder_s *builder)
{ {
if(CURRENT_TOKEN.type != PAC_TOKEN_SIGN_OPEN_TAG) if(CURRENT_TOKEN.type != PAC_TOKEN_SIGN_OPENING_TAG)
{ {
pac_internal_error_s error; pac_internal_error_s error;
error.type = PAC_INTERNAL_ERROR_INVALID_RETURNED_STATUS; error.type = PAC_INTERNAL_ERROR_INVALID_RETURNED_STATUS;
@ -174,7 +174,7 @@ pac_ast_status_e pac_skip_ast_rule_header(pac_ast_builder_s *builder)
} }
SKIP_TOKEN; SKIP_TOKEN;
if(CURRENT_TOKEN.type != PAC_TOKEN_SIGN_CLOSE_TAG) if(CURRENT_TOKEN.type != PAC_TOKEN_SIGN_CLOSING_TAG)
{ {
if((status = pac_ast_handle_missing_rule_header_closing_sign(builder) != PAC_AST_STATUS_ERROR_HANDLED)) if((status = pac_ast_handle_missing_rule_header_closing_sign(builder) != PAC_AST_STATUS_ERROR_HANDLED))
return status; return status;

View File

@ -16,7 +16,7 @@ char * pac_identify_present_construct(pac_ast_builder_s *builder)
{ {
switch(CURRENT_TOKEN.type) switch(CURRENT_TOKEN.type)
{ {
case PAC_TOKEN_SIGN_OPEN_TAG: return "RULE_IDENTIFIER"; case PAC_TOKEN_SIGN_OPENING_TAG: return "RULE_IDENTIFIER";
default: return "UNKNOWN"; default: return "UNKNOWN";
} }
} }
@ -42,7 +42,7 @@ bool_t pac_ast_builder_is_at_item_start(pac_ast_builder_s *builder)
switch(CURRENT_TOKEN.type) switch(CURRENT_TOKEN.type)
{ {
case PAC_TOKEN_LIT_STRING: case PAC_TOKEN_LIT_STRING:
case PAC_TOKEN_SIGN_OPEN_TAG: case PAC_TOKEN_SIGN_OPENING_TAG:
case PAC_TOKEN_KEYWORD_WORD: case PAC_TOKEN_KEYWORD_WORD:
case PAC_TOKEN_KEYWORD_INTEGER: case PAC_TOKEN_KEYWORD_INTEGER:
return TRUE; return TRUE;
@ -78,7 +78,7 @@ pac_ast_status_e pac_ast_handle_invalid_reference_name_token(pac_ast_builder_s *
usz_t tried_tokens = 0; usz_t tried_tokens = 0;
while(tried_tokens < 3) while(tried_tokens < 3)
{ {
if(TOKEN_AT(TOKEN_CURSOR + tried_tokens).type == PAC_TOKEN_SIGN_CLOSE_TAG) if(TOKEN_AT(TOKEN_CURSOR + tried_tokens).type == PAC_TOKEN_SIGN_CLOSING_TAG)
{ {
len_reference_name = TOKEN_AT(TOKEN_CURSOR + tried_tokens).offset - open_tag_src_offset; len_reference_name = TOKEN_AT(TOKEN_CURSOR + tried_tokens).offset - open_tag_src_offset;
++len_reference_name; // Take the closing tag into the name ++len_reference_name; // Take the closing tag into the name
@ -152,7 +152,7 @@ pac_ast_status_e pac_ast_handle_missing_item_separator(pac_ast_builder_s *builde
error.specifics.odd_token.present_token = pac_stringify_token_type(CURRENT_TOKEN.type); error.specifics.odd_token.present_token = pac_stringify_token_type(CURRENT_TOKEN.type);
error.specifics.odd_token.num_valid_options = 2; error.specifics.odd_token.num_valid_options = 2;
error.specifics.odd_token.valid_options[0] = pac_stringify_token_type(PAC_TOKEN_SIGN_COMMA); error.specifics.odd_token.valid_options[0] = pac_stringify_token_type(PAC_TOKEN_SIGN_COMMA);
error.specifics.odd_token.valid_options[1] = pac_stringify_token_type(PAC_TOKEN_SIGN_HYPHEN); error.specifics.odd_token.valid_options[1] = pac_stringify_token_type(PAC_TOKEN_SIGN_MINUS);
pac_log_syntax_error(builder->logger, error); pac_log_syntax_error(builder->logger, error);
return PAC_AST_STATUS_ERROR_HANDLED; return PAC_AST_STATUS_ERROR_HANDLED;
@ -219,7 +219,7 @@ pac_ast_status_e pac_ast_handle_missing_rule_header_closing_sign(pac_ast_builder
error.location.column = CURRENT_TOKEN.column; error.location.column = CURRENT_TOKEN.column;
error.specifics.odd_token.present_token = pac_stringify_token_type(CURRENT_TOKEN.type); error.specifics.odd_token.present_token = pac_stringify_token_type(CURRENT_TOKEN.type);
error.specifics.odd_token.num_valid_options = 1; error.specifics.odd_token.num_valid_options = 1;
error.specifics.odd_token.valid_options[0] = pac_stringify_token_type(PAC_TOKEN_SIGN_CLOSE_TAG); error.specifics.odd_token.valid_options[0] = pac_stringify_token_type(PAC_TOKEN_SIGN_CLOSING_TAG);
// Continue at equals sign (if possible) // Continue at equals sign (if possible)
if(!pac_ast_builder_forward_seek_token_type(builder, PAC_TOKEN_SIGN_EQUALS, 3)) if(!pac_ast_builder_forward_seek_token_type(builder, PAC_TOKEN_SIGN_EQUALS, 3))
@ -244,7 +244,7 @@ pac_ast_status_e pac_ast_handle_unknown_item_type(pac_ast_builder_s *builder, ch
error.location.column = CURRENT_TOKEN.column; error.location.column = CURRENT_TOKEN.column;
error.specifics.odd_token.present_token = pac_stringify_token_type(CURRENT_TOKEN.type); error.specifics.odd_token.present_token = pac_stringify_token_type(CURRENT_TOKEN.type);
error.specifics.odd_token.num_valid_options = 1; error.specifics.odd_token.num_valid_options = 1;
error.specifics.odd_token.valid_options[0] = pac_stringify_token_type(PAC_TOKEN_SIGN_CLOSE_TAG); error.specifics.odd_token.valid_options[0] = pac_stringify_token_type(PAC_TOKEN_SIGN_CLOSING_TAG);
if(!pac_ast_builder_forward_seek_item_start(builder, 5)) if(!pac_ast_builder_forward_seek_item_start(builder, 5))
{ {

View File

@ -1,4 +1,4 @@
#include <grammar_parser.h> #include <grammar.h>
#include <ast.h> #include <ast.h>
#include <stdio.h> #include <stdio.h>
@ -32,17 +32,6 @@ pac_set_e pac_convert_ast_set_to_grammar_set(pac_ast_set_e set)
return PAC_SET_INVALID; return PAC_SET_INVALID;
} }
pac_rule_s * pac_find_rule(pac_grammar_s *grammar, char *name)
{
for(usz_t index = 0; index < grammar->num_rules; ++index)
{
if(!strcmp(grammar->rules[index].name, name))
return &grammar->rules[index];
}
printf("Couldn't find refernced rule: %s\n", name);
return NULL;
}
void pac_copy_single_variant(pac_grammar_s *grammar, pac_variant_s *variant, pac_ast_variant_s *ast_variant) void pac_copy_single_variant(pac_grammar_s *grammar, pac_variant_s *variant, pac_ast_variant_s *ast_variant)
{ {
variant->num_items = ast_variant->num_items; variant->num_items = ast_variant->num_items;

View File

@ -30,15 +30,23 @@ pac_token_e pac_convert_sign_to_token_type(rune_t sign)
{ {
switch(sign) switch(sign)
{ {
case '<': return PAC_TOKEN_SIGN_OPEN_TAG; case '<': return PAC_TOKEN_SIGN_OPENING_TAG;
case '>': return PAC_TOKEN_SIGN_CLOSE_TAG; case '>': return PAC_TOKEN_SIGN_CLOSING_TAG;
case '[': return PAC_TOKEN_SIGN_OPENING_BRACKET;
case ']': return PAC_TOKEN_SIGN_CLOSING_BRACKET;
case '=': return PAC_TOKEN_SIGN_EQUALS; case '=': return PAC_TOKEN_SIGN_EQUALS;
case ';': return PAC_TOKEN_SIGN_SEMICOLON;
case ':': return PAC_TOKEN_SIGN_COLON; case ':': return PAC_TOKEN_SIGN_COLON;
case ',': return PAC_TOKEN_SIGN_COMMA; case ',': return PAC_TOKEN_SIGN_COMMA;
case '-': return PAC_TOKEN_SIGN_HYPHEN;
case '_': return PAC_TOKEN_SIGN_UNDERSCORE; case '_': return PAC_TOKEN_SIGN_UNDERSCORE;
case '|': return PAC_TOKEN_SIGN_VERTICAL_BAR; case '|': return PAC_TOKEN_SIGN_VERTICAL_BAR;
case ';': return PAC_TOKEN_SIGN_SEMICOLON; case '&': return PAC_TOKEN_SIGN_AMPERSAND;
case '$': return PAC_TOKEN_SIGN_DOLLAR;
case '/': return PAC_TOKEN_SIGN_SLASH;
case '#': return PAC_TOKEN_SIGN_HASH;
case '@': return PAC_TOKEN_SIGN_AT;
case '+': return PAC_TOKEN_SIGN_PLUS;
case '-': return PAC_TOKEN_SIGN_MINUS;
} }
return PAC_TOKEN_STRAY; return PAC_TOKEN_STRAY;
} }
@ -204,7 +212,7 @@ char * pac_stringify_token_type(pac_token_e type)
{ {
switch(type) switch(type)
{ {
case PAC_TOKEN_STRAY: return " - "; case PAC_TOKEN_STRAY: return "STRAY";
case PAC_TOKEN_WORD: return "Word"; case PAC_TOKEN_WORD: return "Word";
case PAC_TOKEN_KEYWORD_TRUE: return "Keyword: \"true\""; case PAC_TOKEN_KEYWORD_TRUE: return "Keyword: \"true\"";
@ -213,15 +221,23 @@ char * pac_stringify_token_type(pac_token_e type)
case PAC_TOKEN_KEYWORD_WORD: return "Keyword: \"word\""; case PAC_TOKEN_KEYWORD_WORD: return "Keyword: \"word\"";
case PAC_TOKEN_KEYWORD_INTEGER: return "Keyword: \"integer\""; case PAC_TOKEN_KEYWORD_INTEGER: return "Keyword: \"integer\"";
case PAC_TOKEN_SIGN_OPEN_TAG: return "Sign: <"; case PAC_TOKEN_SIGN_OPENING_TAG: return "Sign: <";
case PAC_TOKEN_SIGN_CLOSE_TAG: return "Sign: >"; case PAC_TOKEN_SIGN_CLOSING_TAG: return "Sign: >";
case PAC_TOKEN_SIGN_OPENING_BRACKET: return "Sign: [";
case PAC_TOKEN_SIGN_CLOSING_BRACKET: return "Sign: ]";
case PAC_TOKEN_SIGN_EQUALS: return "Sign: ="; case PAC_TOKEN_SIGN_EQUALS: return "Sign: =";
case PAC_TOKEN_SIGN_SEMICOLON: return "Sign: ;";
case PAC_TOKEN_SIGN_COLON: return "Sign: :"; case PAC_TOKEN_SIGN_COLON: return "Sign: :";
case PAC_TOKEN_SIGN_COMMA: return "Sign: ,"; case PAC_TOKEN_SIGN_COMMA: return "Sign: ,";
case PAC_TOKEN_SIGN_HYPHEN: return "Sign: -";
case PAC_TOKEN_SIGN_UNDERSCORE: return "Sign: _"; case PAC_TOKEN_SIGN_UNDERSCORE: return "Sign: _";
case PAC_TOKEN_SIGN_VERTICAL_BAR: return "Sign: |"; case PAC_TOKEN_SIGN_VERTICAL_BAR: return "Sign: |";
case PAC_TOKEN_SIGN_SEMICOLON: return "Sign: ;"; case PAC_TOKEN_SIGN_AMPERSAND: return "Sign: &";
case PAC_TOKEN_SIGN_DOLLAR: return "Sign: $";
case PAC_TOKEN_SIGN_SLASH: return "Sign: /";
case PAC_TOKEN_SIGN_HASH: return "Sign: #";
case PAC_TOKEN_SIGN_AT: return "Sign: @";
case PAC_TOKEN_SIGN_PLUS: return "Sign: +";
case PAC_TOKEN_SIGN_MINUS: return "Sign: -";
case PAC_TOKEN_LIT_STRING: return "String"; case PAC_TOKEN_LIT_STRING: return "String";
case PAC_TOKEN_LIT_RUNE: return "Rune"; case PAC_TOKEN_LIT_RUNE: return "Rune";