162 lines
5.3 KiB
C
162 lines
5.3 KiB
C
#include <parser.h>
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
/**
 * Splits the UTF-8 source text of `stream` into tokens.
 *
 * Reads `stream->len_source` bytes from `stream->source`, allocates
 * `stream->tokens` and stores `stream->num_tokens` tokens in it. Every token
 * records its byte offset and byte length within the source.
 *
 * Recognised input:
 *  - words:         a letter followed by any number of letters or underscores
 *  - integers:      a run of digits
 *  - strings:       `"` ... `"`; a backslash skips the rune that follows it,
 *                   an unescaped newline inside a string is an error
 *  - special signs: one rune accepted by rr_rune_is_ascii_special()
 *  - comments:      `##` up to and including the next newline, and
 *                   `#[` ... `]` where the comment is closed by as many
 *                   consecutive `]` as there were `[` after the `#`
 *                   (e.g. `#[[ ... ]]`); comments produce no token
 * Runes that match none of the above are skipped without producing a token.
 *
 * The token array is never freed here, not even on failure; whatever has been
 * tokenized so far stays in `stream->tokens` / `stream->num_tokens`.
 *
 * NOTE(review): one array slot is always kept free for a STREAM_END token,
 * but this function does not write such a token - confirm that the caller
 * appends it.
 *
 * @param stream  Token stream whose `source` and `len_source` are set.
 * @return  0 on success,
 *         -1 if the source contains invalid UTF-8,
 *         -2 if a string contains a raw newline or is not closed before the
 *            end of the source,
 *         -3 if the token array could not be allocated or grown.
 */
i32_t mach_tokenize(MachTokenStream *stream)
{
    usz_t tokens_capacity = 2048;
    stream->num_tokens = 0;
    stream->tokens = calloc(tokens_capacity, sizeof(MachToken));
    if(stream->tokens == NULL)
        return -3;

    usz_t offset = 0;
    while(offset < stream->len_source)
    {
        // There must always be one more after the last one for the STREAM_END token.
        if((stream->num_tokens + 1) >= tokens_capacity)
        {
            // Refuse to grow if the byte size of the doubled array would overflow.
            if(tokens_capacity > (((size_t) -1) / sizeof(MachToken)) / 2)
                return -3;

            usz_t grown_capacity = tokens_capacity * 2;
            // Keep the old array reachable if realloc fails.
            void *grown_tokens = realloc(stream->tokens, sizeof(MachToken) * grown_capacity);
            if(grown_tokens == NULL)
                return -3;

            stream->tokens = grown_tokens;
            tokens_capacity = grown_capacity;
        }

        // Every decode below goes into a zeroed local length which is then
        // added to `offset` explicitly; a length of 0 means invalid UTF-8.
        usz_t token_start = offset;
        usz_t len_rune = 0;
        rune_t rune = rr_extract_utf8(stream->source, offset, &len_rune);
        if(len_rune == 0)
        {
            // TODO: A log-entry because of invalid UTF-8 should be written here.
            return -1;
        }
        offset += len_rune;

        // Word: letter, then letters or underscores.
        if(rr_rune_is_letter(rune))
        {
            while(offset < stream->len_source)
            {
                // Peek at the next rune; only consume it if it belongs to the word.
                len_rune = 0;
                rune = rr_extract_utf8(stream->source, offset, &len_rune);
                if(len_rune == 0)
                    break; // Invalid UTF-8; reported by the next outer iteration.
                if(!rr_rune_is_letter(rune) && (rune != '_'))
                    break;
                offset += len_rune;
            }

            MachToken token;
            token.offset = token_start;
            token.length = offset - token_start;
            token.type = MACH_TOKEN_WORD;
            // NOTE(review): `rune` is the rune following the word here (or the
            // word's last rune at the end of the source). Kept as it was; it
            // looks like a leftover from the special-sign branch.
            token.data.sign_type = rr_rune_to_ascii_sign(rune);
            stream->tokens[stream->num_tokens++] = token;
            continue;
        }

        // Integer: run of digits.
        if(rr_rune_is_digit(rune))
        {
            while(offset < stream->len_source)
            {
                // Peek first, so that the rune ending the number is not
                // swallowed into the token.
                len_rune = 0;
                rune = rr_extract_utf8(stream->source, offset, &len_rune);
                if(len_rune == 0)
                    break; // Invalid UTF-8; reported by the next outer iteration.
                if(!rr_rune_is_digit(rune))
                    break;
                offset += len_rune;
            }

            MachToken token;
            token.offset = token_start;
            token.length = offset - token_start;
            token.type = MACH_TOKEN_INTEGER;
            // NOTE(review): same leftover as in the word branch; value kept.
            token.data.sign_type = rr_rune_to_ascii_sign(rune);
            stream->tokens[stream->num_tokens++] = token;
            continue;
        }

        // Comments start with '#'; a '#' that opens no comment falls through
        // to the special-sign handling below with `offset` untouched.
        if((rune == '#') && (offset < stream->len_source))
        {
            len_rune = 0;
            rune_t following_rune = rr_extract_utf8(stream->source, offset, &len_rune);

            if((len_rune != 0) && (following_rune == '#'))
            {
                // Line comment: skip up to and including the newline.
                offset += len_rune;
                while(offset < stream->len_source)
                {
                    len_rune = 0;
                    following_rune = rr_extract_utf8(stream->source, offset, &len_rune);
                    if(len_rune == 0)
                        return -1;
                    offset += len_rune;
                    if(following_rune == '\n')
                        break;
                }
                continue;
            }

            if((len_rune != 0) && (following_rune == '['))
            {
                offset += len_rune;

                // Count how many brackets are needed to end this comment
                usz_t num_opening_brackets = 1;
                while(offset < stream->len_source)
                {
                    len_rune = 0;
                    following_rune = rr_extract_utf8(stream->source, offset, &len_rune);
                    if((len_rune == 0) || (following_rune != '['))
                        break; // Not consumed: it may already be a closing ']'.
                    offset += len_rune;
                    ++num_opening_brackets;
                }

                // Find the end of the comment: as many consecutive ']' as
                // there were '['. An unclosed comment runs to the end of the
                // source without raising an error.
                usz_t num_closing_brackets = 0;
                while((offset < stream->len_source)
                    && (num_closing_brackets < num_opening_brackets))
                {
                    len_rune = 0;
                    following_rune = rr_extract_utf8(stream->source, offset, &len_rune);
                    if(len_rune == 0)
                        return -1;
                    offset += len_rune;

                    if(following_rune == ']')
                        ++num_closing_brackets;
                    else
                        num_closing_brackets = 0;
                }
                continue;
            }
        }

        // String literal.
        if(rune == '"')
        {
            bool_t terminated = FALSE;
            while(offset < stream->len_source)
            {
                len_rune = 0;
                rune = rr_extract_utf8(stream->source, offset, &len_rune);
                if(len_rune == 0)
                    return -1;
                offset += len_rune;

                if(rune == '"')
                {
                    terminated = TRUE;
                    break;
                }

                // A raw newline ends the string prematurely.
                if(rune == '\n')
                    break;

                // If this is a backslash, skip the next character
                if((rune == '\\') && (offset < stream->len_source))
                {
                    len_rune = 0;
                    rr_extract_utf8(stream->source, offset, &len_rune);
                    if(len_rune == 0)
                        return -1;
                    offset += len_rune;
                }
            }

            if(!terminated)
            {
                // TODO: A log-entry because of an invalid string should be written here
                return -2;
            }

            MachToken token;
            token.offset = token_start;
            token.length = offset - token_start;
            token.type = MACH_TOKEN_STRING;
            token.data.processed_string = NULL; // !TODO!: Postprocess escape sequences
            stream->tokens[stream->num_tokens++] = token;
            continue;
        }

        // Single special sign.
        if(rr_rune_is_ascii_special(rune))
        {
            MachToken token;
            token.offset = token_start;
            token.length = offset - token_start;
            token.type = MACH_TOKEN_SPECIAL_SIGN;
            token.data.sign_type = rr_rune_to_ascii_sign(rune);
            stream->tokens[stream->num_tokens++] = token;
            continue;
        }

        // Any other rune is skipped without producing a token.
    }
    return 0;
}
|
|
|