#include isz_t rr_distance_to_last_utf8_rune_start(const char *string, isz_t offset) { usz_t bytes_walked = 0; while((offset - bytes_walked) >= 0) { if(bytes_walked > 4) return -1; if((string[offset - bytes_walked] >> 6) != 0b10) { return bytes_walked; } ++bytes_walked; } return -2; } isz_t rr_identify_utf8_rune_length(const char *string, usz_t offset) { char head_byte = string[offset]; // If this is ASCII if((head_byte & (1 << 7)) == 0) return 1; // UTF-8 - only usz_t length = 0; while(length < 5) { head_byte <<= 1; if((head_byte & (1 << 7)) == 0) break; ++length; } if(length > 4) return -1; if(length < 2) return -2; return length; } rune_t rr_postprocess_utf8_head_byte(char byte, usz_t num_bytes) { switch(num_bytes) { case 1: return byte; case 2: return byte & 0b11100000; case 3: return byte & 0b11110000; case 4: return byte & 0b11111000; } return 0; } rune_t rr_postprocess_utf8_bytes(const char *bytes, usz_t num_bytes) { rune_t result = rr_postprocess_utf8_head_byte(bytes[0], num_bytes); usz_t byte_index = 1; while(byte_index < num_bytes) { result <<= 6; result |= bytes[byte_index] & 0b00111111; ++byte_index; } return result; } rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase) { if(string[offset] == 0x00) return ZERO; isz_t offset_into_rune = rr_distance_to_last_utf8_rune_start(string, offset); if(offset_into_rune < 0) return ZERO; offset -= offset_into_rune; isz_t rune_length = rr_identify_utf8_rune_length(string, offset); if(rune_length < 0) return ZERO; *increase += rune_length - offset_into_rune; return rr_postprocess_utf8_bytes(&string[offset], rune_length); } bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next) { rune_t subject = rr_extract_utf8(string, offset, &offset); if(subject == '\r') { usz_t offset_backup = offset; if(rr_extract_utf8(string, offset, &offset) != '\n') (*next) = offset_backup; return TRUE; } if(subject == '\n') { (*next) = offset; return TRUE; } return FALSE; } bool_t rr_rune_is_lower(rune_t rune) { if(rune < 'a') return FALSE; if(rune > 'z') return FALSE; return TRUE; } bool_t rr_rune_is_upper(rune_t rune) { if(rune < 'A') return FALSE; if(rune > 'Z') return FALSE; return TRUE; } bool_t rr_rune_is_letter(rune_t rune) { if(rr_rune_is_lower(rune)) return TRUE; if(rr_rune_is_upper(rune)) return TRUE; return FALSE; } bool_t rr_rune_is_digit(rune_t rune) { if(rune < '0') return FALSE; if(rune > '9') return FALSE; return TRUE; } bool_t rr_rune_is_in_ascii_special_block_1(rune_t rune) { if(rune < '!') return FALSE; if(rune > '/') return FALSE; return TRUE; } bool_t rr_rune_is_in_ascii_special_block_2(rune_t rune) { if(rune < ':') return FALSE; if(rune > '@') return FALSE; return TRUE; } bool_t rr_rune_is_in_ascii_special_block_3(rune_t rune) { if(rune < '[') return FALSE; if(rune > '`') return FALSE; return TRUE; } bool_t rr_rune_is_in_ascii_special_block_4(rune_t rune) { if(rune < '{') return FALSE; if(rune > '~') return FALSE; return TRUE; } bool_t rr_rune_is_ascii_special(rune_t rune) { if(rr_rune_is_in_ascii_special_block_1(rune)) return TRUE; if(rr_rune_is_in_ascii_special_block_2(rune)) return TRUE; if(rr_rune_is_in_ascii_special_block_3(rune)) return TRUE; if(rr_rune_is_in_ascii_special_block_4(rune)) return TRUE; return FALSE; }