#include #include #include i32_t mach_tokenize(MachTokenStream *stream) { usz_t tokens_capacity = 2048; stream->num_tokens = 0; stream->tokens = calloc(sizeof(MachToken), tokens_capacity); usz_t offset = 0; while(offset < stream->len_source) { // There must always be one more after the last one for the STREAM_END token. if((stream->num_tokens + 1) >= tokens_capacity) { tokens_capacity *= 2; stream->tokens = realloc(stream->tokens, sizeof(MachToken) * tokens_capacity); } usz_t token_start = offset; usz_t len_token = 0; rune_t rune = rr_extract_utf8(stream->source, offset, &len_token); if(len_token == 0) { // TODO: A log-entry because of invalid UTF-8 should be written here. return -1; } offset += len_token; if(rr_rune_is_letter(rune)) { while(offset < stream->len_source) { len_token = 0; rune = rr_extract_utf8(stream->source, offset, &len_token); if(!rr_rune_is_letter(rune) && (rune != '_')) break; offset += len_token; } MachToken token; token.offset = token_start; token.length = offset - token_start; token.type = MACH_TOKEN_WORD; token.data.sign_type = rr_rune_to_ascii_sign(rune); stream->tokens[stream->num_tokens++] = token; continue; } if(rr_rune_is_digit(rune)) { while(offset < stream->len_source) { rune = rr_extract_utf8(stream->source, offset, &offset); if(!rr_rune_is_digit(rune)) break; } MachToken token; token.offset = token_start; token.length = offset - token_start; token.type = MACH_TOKEN_INTEGER; token.data.sign_type = rr_rune_to_ascii_sign(rune); stream->tokens[stream->num_tokens++] = token; continue; } if(rune == '#') { usz_t old_offset = offset; rune_t following_rune = rr_extract_utf8(stream->source, offset, &offset); if(following_rune == '#') { while(offset < stream->len_source) { following_rune = rr_extract_utf8(stream->source, offset, &offset); if(following_rune == '\n') break; } continue; } else if(following_rune == '[') { // Count how many brackets are needed to end this comment usz_t num_opening_brackets = 1; while(offset < stream->len_source) { following_rune = rr_extract_utf8(stream->source, offset, &offset); if(following_rune != '[') break; ++num_opening_brackets; } // Find the end of the comment while(offset < stream->len_source) { following_rune = rr_extract_utf8(stream->source, offset, &offset); usz_t num_closing_brackets = 0; while(following_rune == ']') { ++num_closing_brackets; if(num_closing_brackets == num_opening_brackets) break; following_rune = rr_extract_utf8(stream->source, offset, &offset); } } continue; } offset = old_offset; } if(rune == '"') { bool_t faulty = FALSE; while(offset < stream->len_source) { rune = rr_extract_utf8(stream->source, offset, &offset); if(rune == '"') break; if(rune == '\n') { faulty = TRUE; break; } // If this is a backslash, skip the next character if(rune == '\\') rr_extract_utf8(stream->source, offset, &offset); } if(faulty) { // TODO: A log-entry because of an invalid string should be written here return -2; } MachToken token; token.offset = token_start; token.length = offset - token_start; token.type = MACH_TOKEN_STRING; token.data.processed_string = NULL; // !TODO!: Postprocess escape sequences stream->tokens[stream->num_tokens++] = token; continue; } if(rr_rune_is_ascii_special(rune)) { MachToken token; token.offset = token_start; token.length = offset - token_start; token.type = MACH_TOKEN_SPECIAL_SIGN; token.data.sign_type = rr_rune_to_ascii_sign(rune); stream->tokens[stream->num_tokens++] = token; continue; } } return 0; }