#include isz_t rr_distance_to_last_utf8_rune_start(const char *string, isz_t offset) { usz_t bytes_walked = 0; while((offset - bytes_walked) >= 0) { if(bytes_walked > 4) return -1; if((string[offset - bytes_walked] >> 6) != 0b10) { return bytes_walked; } ++bytes_walked; } return -2; } isz_t rr_identify_utf8_rune_length(const char *string, usz_t offset) { char head_byte = string[offset]; // If this is ASCII if((head_byte & (1 << 7)) == 0) return 1; // UTF-8 - only usz_t length = 0; while(length < 5) { head_byte <<= 1; if((head_byte & (1 << 7)) == 0) break; ++length; } if(length > 4) return -1; if(length < 2) return -2; return length; } rune_t rr_postprocess_utf8_head_byte(char byte, usz_t num_bytes) { switch(num_bytes) { case 1: return byte; case 2: return byte & 0b11100000; case 3: return byte & 0b11110000; case 4: return byte & 0b11111000; } return 0; } rune_t rr_postprocess_utf8_bytes(const char *bytes, usz_t num_bytes) { rune_t result = rr_postprocess_utf8_head_byte(bytes[0], num_bytes); usz_t byte_index = 1; while(byte_index < num_bytes) { result <<= 6; result |= bytes[byte_index] & 0b00111111; ++byte_index; } return result; } rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase) { if(string[offset] == 0x00) return ZERO; isz_t offset_into_rune = rr_distance_to_last_utf8_rune_start(string, offset); if(offset_into_rune < 0) return ZERO; offset -= offset_into_rune; isz_t rune_length = rr_identify_utf8_rune_length(string, offset); if(rune_length < 0) return ZERO; *increase += rune_length - offset_into_rune; return rr_postprocess_utf8_bytes(&string[offset], rune_length); } rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *increase) { usz_t increase_backup = *increase; rune_t subject = rr_extract_utf8(string, offset, &increase_backup); if(subject < 'a') return ZERO; if(subject > 'z') return ZERO; (*increase) = increase_backup; return subject; } rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *increase) { usz_t increase_backup = *increase; rune_t subject = rr_extract_utf8(string, offset, &increase_backup); if(subject < 'A') return ZERO; if(subject > 'Z') return ZERO; (*increase) = increase_backup; return subject; } rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *increase) { rune_t subject; if((subject = rr_extract_lower(string, offset, increase)) != ZERO) return subject; if((subject = rr_extract_upper(string, offset, increase)) != ZERO) return subject; return ZERO; } rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *increase) { usz_t increase_backup = *increase; rune_t subject = rr_extract_utf8(string, offset, &increase_backup); if(subject < '0') return ZERO; if(subject > '9') return ZERO; (*increase) = increase_backup; return subject; } rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *increase) { rune_t subject; if((subject = rr_extract_lower(string, offset, increase)) != ZERO) return subject; if((subject = rr_extract_upper(string, offset, increase)) != ZERO) return subject; if((subject = rr_extract_digit(string, offset, increase)) != ZERO) return subject; return ZERO; } bool_t rr_is_rune_of_sign_block_1(rune_t rune) { if(rune < 0x20) return FALSE; if(rune > 0x2f) return FALSE; return TRUE; } bool_t rr_is_rune_of_sign_block_2(rune_t rune) { if(rune < 0x3a) return FALSE; if(rune > 0x40) return FALSE; return TRUE; } bool_t rr_is_rune_of_sign_block_3(rune_t rune) { if(rune < 0x5b) return FALSE; if(rune > 0x60) return FALSE; return TRUE; } bool_t rr_is_rune_of_sign_block_4(rune_t rune) { if(rune < 0x7b) return FALSE; if(rune > 0x7e) return FALSE; return TRUE; } rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *increase) { usz_t increase_backup = *increase; rune_t subject = rr_extract_utf8(string, offset, &increase_backup); if(rr_is_rune_of_sign_block_1(subject)) return subject; if(rr_is_rune_of_sign_block_2(subject)) return subject; if(rr_is_rune_of_sign_block_3(subject)) return subject; if(rr_is_rune_of_sign_block_4(subject)) return subject; return ZERO; } bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next) { rune_t subject = rr_extract_utf8(string, offset, &offset); if(subject == '\r') { usz_t offset_backup = offset; if(rr_extract_utf8(string, offset, &offset) != '\n') (*next) = offset_backup; return TRUE; } if(subject == '\n') { (*next) = offset; return TRUE; } return FALSE; }