Mach/builder/src-c/tokenizer.c

162 lines
5.3 KiB
C

#include <parser.h>
#include <stdio.h>
#include <stdlib.h>
i32_t mach_tokenize(MachTokenStream *stream)
{
usz_t tokens_capacity = 2048;
stream->num_tokens = 0;
stream->tokens = calloc(sizeof(MachToken), tokens_capacity);
usz_t offset = 0;
while(offset < stream->len_source)
{
// There must always be one more after the last one for the STREAM_END token.
if((stream->num_tokens + 1) >= tokens_capacity)
{
tokens_capacity *= 2;
stream->tokens = realloc(stream->tokens, sizeof(MachToken) * tokens_capacity);
}
usz_t token_start = offset;
usz_t len_token = 0;
rune_t rune = rr_extract_utf8(stream->source, offset, &len_token);
if(len_token == 0)
{
// TODO: A log-entry because of invalid UTF-8 should be written here.
return -1;
}
offset += len_token;
if(rr_rune_is_letter(rune))
{
while(offset < stream->len_source)
{
len_token = 0;
rune = rr_extract_utf8(stream->source, offset, &len_token);
if(!rr_rune_is_letter(rune) && (rune != '_'))
break;
offset += len_token;
}
MachToken token;
token.offset = token_start;
token.length = offset - token_start;
token.type = MACH_TOKEN_WORD;
token.data.sign_type = rr_rune_to_ascii_sign(rune);
stream->tokens[stream->num_tokens++] = token;
continue;
}
if(rr_rune_is_digit(rune))
{
while(offset < stream->len_source)
{
rune = rr_extract_utf8(stream->source, offset, &offset);
if(!rr_rune_is_digit(rune))
break;
}
MachToken token;
token.offset = token_start;
token.length = offset - token_start;
token.type = MACH_TOKEN_INTEGER;
token.data.sign_type = rr_rune_to_ascii_sign(rune);
stream->tokens[stream->num_tokens++] = token;
continue;
}
if(rune == '#')
{
usz_t old_offset = offset;
rune_t following_rune = rr_extract_utf8(stream->source, offset, &offset);
if(following_rune == '#')
{
while(offset < stream->len_source)
{
following_rune = rr_extract_utf8(stream->source, offset, &offset);
if(following_rune == '\n')
break;
}
continue;
}
else if(following_rune == '[')
{
// Count how many brackets are needed to end this comment
usz_t num_opening_brackets = 1;
while(offset < stream->len_source)
{
following_rune = rr_extract_utf8(stream->source, offset, &offset);
if(following_rune != '[')
break;
++num_opening_brackets;
}
// Find the end of the comment
while(offset < stream->len_source)
{
following_rune = rr_extract_utf8(stream->source, offset, &offset);
usz_t num_closing_brackets = 0;
while(following_rune == ']')
{
++num_closing_brackets;
if(num_closing_brackets == num_opening_brackets)
break;
following_rune = rr_extract_utf8(stream->source, offset, &offset);
}
}
continue;
}
offset = old_offset;
}
if(rune == '"')
{
bool_t faulty = FALSE;
while(offset < stream->len_source)
{
rune = rr_extract_utf8(stream->source, offset, &offset);
if(rune == '"')
break;
if(rune == '\n')
{
faulty = TRUE;
break;
}
// If this is a backslash, skip the next character
if(rune == '\\')
rr_extract_utf8(stream->source, offset, &offset);
}
if(faulty)
{
// TODO: A log-entry because of an invalid string should be written here
return -2;
}
MachToken token;
token.offset = token_start;
token.length = offset - token_start;
token.type = MACH_TOKEN_STRING;
token.data.processed_string = NULL; // !TODO!: Postprocess escape sequences
stream->tokens[stream->num_tokens++] = token;
continue;
}
if(rr_rune_is_ascii_special(rune))
{
MachToken token;
token.offset = token_start;
token.length = offset - token_start;
token.type = MACH_TOKEN_SPECIAL_SIGN;
token.data.sign_type = rr_rune_to_ascii_sign(rune);
stream->tokens[stream->num_tokens++] = token;
continue;
}
}
return 0;
}