Base/core/src-c/runes.c

202 lines
5.9 KiB
C

#include <librr/runes.h>
// Walks backwards from `offset` over UTF-8 continuation bytes (bit pattern
// 10xxxxxx) to find the first byte of the rune that `offset` points into.
//
// Parameters:
//   string  byte buffer that is indexed from 0 up to `offset`
//   offset  index of the byte to start walking back from
//
// Returns:
//   >= 0  number of bytes between `offset` and the rune's first byte
//         (0 if the byte at `offset` is not a continuation byte)
//   -1    four consecutive continuation bytes end at `offset`; a UTF-8 rune
//         has at most three, so the input is malformed
//   -2    the beginning of the string was reached (or `offset` is negative)
//         without finding a non-continuation byte
isz_t rr_distance_to_last_utf8_rune_start(const char *string, isz_t offset)
{
    // The counter is signed on purpose: comparing a signed offset against an
    // unsigned counter converts the subtraction to unsigned, which can never
    // be negative and would let the walk run past the start of the string.
    isz_t bytes_walked = 0;
    while(bytes_walked <= offset)
    {
        if(bytes_walked > 3)
            return -1;

        // Look at the byte as unsigned; where plain `char` is signed, bytes
        // >= 0x80 are negative and shifting them does not yield 0b10.
        unsigned char byte = (unsigned char) string[offset - bytes_walked];
        if((byte & 0xC0) != 0x80)
            return bytes_walked;

        ++bytes_walked;
    }
    return -2;
}
// Determines how many bytes the UTF-8 rune starting at `string[offset]`
// occupies, judging only by its first (head) byte.
//
// Returns:
//   1..4  length of the rune in bytes
//   -1    the head byte announces more than four bytes (five or more
//         leading one-bits), which UTF-8 does not allow
//   -2    the byte is a continuation byte (10xxxxxx) and therefore cannot
//         start a rune
isz_t rr_identify_utf8_rune_length(const char *string, usz_t offset)
{
    // Work on an unsigned copy so that bit tests and shifts are well defined
    // regardless of whether plain `char` is signed.
    unsigned char head_byte = (unsigned char) string[offset];

    // If this is ASCII
    if((head_byte & 0x80) == 0)
        return 1;

    // UTF-8 - only: the number of leading one-bits of the head byte equals
    // the total length of the rune. Counting stops at 5, which is already
    // enough to know the byte is invalid.
    usz_t length = 0;
    while(length < 5 && (head_byte & (0x80 >> length)) != 0)
        ++length;

    if(length > 4)
        return -1;
    if(length < 2)
        return -2;
    return length;
}
// Strips the length marker from the head byte of a UTF-8 rune and returns
// the payload bits it carries.
//
// Parameters:
//   byte       first byte of the rune
//   num_bytes  total length of the rune in bytes (1 to 4)
//
// Returns the payload bits of `byte`, or 0 if `num_bytes` is not 1..4.
rune_t rr_postprocess_utf8_head_byte(char byte, usz_t num_bytes)
{
    // The head byte layouts are 0xxxxxxx, 110xxxxx, 1110xxxx and 11110xxx;
    // each mask keeps only the x bits.
    switch(num_bytes)
    {
        case 1: return byte & 0x7F;
        case 2: return byte & 0x1F;
        case 3: return byte & 0x0F;
        case 4: return byte & 0x07;
        default: return 0;
    }
}
// Assembles a rune value from `num_bytes` bytes starting at `bytes`.
//
// The first byte is handed to rr_postprocess_utf8_head_byte; every following
// byte contributes its low six bits, appended below the bits gathered so far.
rune_t rr_postprocess_utf8_bytes(const char *bytes, usz_t num_bytes)
{
    rune_t rune = rr_postprocess_utf8_head_byte(bytes[0], num_bytes);

    for(usz_t index = 1; index < num_bytes; ++index)
    {
        // Make room for the next six payload bits, then merge them in.
        rune <<= 6;
        rune |= bytes[index] & 0x3F;
    }

    return rune;
}
// Decodes the UTF-8 rune that the byte at `string[offset]` belongs to.
//
// If `offset` points into the middle of a rune, decoding starts at that
// rune's first byte. On success, `*increase` is advanced by the number of
// bytes from `offset` to the end of the rune, so that `offset` plus that
// amount addresses the byte following the rune.
//
// Returns the decoded rune, or ZERO - leaving `*increase` untouched - if:
//   - the byte at `offset` is the NUL terminator,
//   - no valid rune start can be found at or before `offset`,
//   - the head byte does not describe a rune of 1 to 4 bytes,
//   - `offset` lies beyond the bytes that the head byte accounts for, or
//   - one of the bytes following the head byte is not a continuation byte
//     (this also covers a rune cut off by the end of the string).
rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase)
{
    if(string[offset] == 0x00)
        return ZERO;

    isz_t offset_into_rune = rr_distance_to_last_utf8_rune_start(string, offset);
    if(offset_into_rune < 0)
        return ZERO;
    offset -= offset_into_rune;

    isz_t rune_length = rr_identify_utf8_rune_length(string, offset);
    if(rune_length < 0)
        return ZERO;

    // The byte we started from must be part of the rune found; otherwise the
    // advance computed below would be zero or negative and wrap around.
    if(offset_into_rune >= rune_length)
        return ZERO;

    // Verify every trailing byte before touching the next one. A NUL
    // terminator is not a continuation byte, so this stops the read at the
    // end of the string when a rune is truncated.
    for(usz_t index = 1; index < (usz_t) rune_length; ++index)
    {
        unsigned char byte = (unsigned char) string[offset + index];
        if((byte & 0xC0) != 0x80)
            return ZERO;
    }

    *increase += rune_length - offset_into_rune;
    return rr_postprocess_utf8_bytes(&string[offset], rune_length);
}
// Extracts the rune at `string[offset]` if it is a lowercase ASCII letter
// ('a' through 'z').
//
// On a match the rune is returned and `*increase` receives the value that
// rr_extract_utf8 advanced it to. Otherwise ZERO is returned and `*increase`
// is left as it was.
rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *increase)
{
    // Decode against a scratch copy so a rejected rune does not move the
    // caller's counter.
    usz_t advanced = *increase;
    rune_t candidate = rr_extract_utf8(string, offset, &advanced);

    if(candidate >= 'a' && candidate <= 'z')
    {
        *increase = advanced;
        return candidate;
    }
    return ZERO;
}
// Extracts the rune at `string[offset]` if it is an uppercase ASCII letter
// ('A' through 'Z').
//
// On a match the rune is returned and `*increase` receives the value that
// rr_extract_utf8 advanced it to. Otherwise ZERO is returned and `*increase`
// is left as it was.
rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *increase)
{
    // Decode against a scratch copy so a rejected rune does not move the
    // caller's counter.
    usz_t advanced = *increase;
    rune_t candidate = rr_extract_utf8(string, offset, &advanced);

    if(candidate >= 'A' && candidate <= 'Z')
    {
        *increase = advanced;
        return candidate;
    }
    return ZERO;
}
// Extracts the rune at `string[offset]` if it is an ASCII letter.
//
// Tries rr_extract_lower first and rr_extract_upper second; the first result
// that is not ZERO is returned. Returns ZERO if neither matches.
rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *increase)
{
    rune_t lower = rr_extract_lower(string, offset, increase);
    if(lower != ZERO)
        return lower;

    rune_t upper = rr_extract_upper(string, offset, increase);
    if(upper != ZERO)
        return upper;

    return ZERO;
}
// Extracts the rune at `string[offset]` if it is an ASCII digit
// ('0' through '9').
//
// On a match the rune is returned and `*increase` receives the value that
// rr_extract_utf8 advanced it to. Otherwise ZERO is returned and `*increase`
// is left as it was.
rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *increase)
{
    // Decode against a scratch copy so a rejected rune does not move the
    // caller's counter.
    usz_t advanced = *increase;
    rune_t candidate = rr_extract_utf8(string, offset, &advanced);

    if(candidate >= '0' && candidate <= '9')
    {
        *increase = advanced;
        return candidate;
    }
    return ZERO;
}
// Extracts the rune at `string[offset]` if it is an ASCII letter or digit.
//
// The extractors are tried in the order lowercase, uppercase, digit; the
// first result that is not ZERO is returned. Returns ZERO if none matches.
rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *increase)
{
    rune_t (*const extractors[])(const char *, usz_t, usz_t *) =
    {
        rr_extract_lower,
        rr_extract_upper,
        rr_extract_digit,
    };
    const usz_t num_extractors = sizeof(extractors) / sizeof(extractors[0]);

    for(usz_t index = 0; index < num_extractors; ++index)
    {
        rune_t subject = extractors[index](string, offset, increase);
        if(subject != ZERO)
            return subject;
    }
    return ZERO;
}
// Reports whether `rune` lies in the range 0x20..0x2f (inclusive), the
// ASCII characters from space through '/'.
bool_t rr_is_rune_of_sign_block_1(rune_t rune)
{
    return (rune >= 0x20 && rune <= 0x2f) ? TRUE : FALSE;
}
// Reports whether `rune` lies in the range 0x3a..0x40 (inclusive), the
// ASCII characters from ':' through '@'.
bool_t rr_is_rune_of_sign_block_2(rune_t rune)
{
    return (rune >= 0x3a && rune <= 0x40) ? TRUE : FALSE;
}
// Reports whether `rune` lies in the range 0x5b..0x60 (inclusive), the
// ASCII characters from '[' through '`'.
bool_t rr_is_rune_of_sign_block_3(rune_t rune)
{
    return (rune >= 0x5b && rune <= 0x60) ? TRUE : FALSE;
}
// Reports whether `rune` lies in the range 0x7b..0x7e (inclusive), the
// ASCII characters from '{' through '~'.
bool_t rr_is_rune_of_sign_block_4(rune_t rune)
{
    return (rune >= 0x7b && rune <= 0x7e) ? TRUE : FALSE;
}
// Extracts the rune at `string[offset]` if it falls into one of the four
// sign blocks checked by rr_is_rune_of_sign_block_1 through _4.
//
// On a match the rune is returned and `*increase` receives the value that
// rr_extract_utf8 advanced it to, in the same way as rr_extract_lower,
// rr_extract_upper and rr_extract_digit do. Otherwise ZERO is returned and
// `*increase` is left as it was.
rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *increase)
{
    // Decode against a scratch copy so a rejected rune does not move the
    // caller's counter.
    usz_t increase_backup = *increase;
    rune_t subject = rr_extract_utf8(string, offset, &increase_backup);

    if(rr_is_rune_of_sign_block_1(subject)
        || rr_is_rune_of_sign_block_2(subject)
        || rr_is_rune_of_sign_block_3(subject)
        || rr_is_rune_of_sign_block_4(subject))
    {
        // Commit the advance; without this the caller would never move past
        // the sign it was just handed.
        (*increase) = increase_backup;
        return subject;
    }
    return ZERO;
}
// Checks whether a line break starts at `string[offset]`.
//
// A lone '\r', a lone '\n', and the pair "\r\n" each count as a single line
// break.
//
// Returns TRUE and stores in `*next` the offset of the first byte after the
// line break, or returns FALSE and leaves `*next` untouched when there is no
// line break at `offset`.
bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next)
{
    // `offset` is passed by value as the position and by address as the
    // counter, so after the call it addresses the byte following the rune.
    rune_t subject = rr_extract_utf8(string, offset, &offset);

    if(subject == '\r')
    {
        // Consume a directly following '\n' as part of the same line break;
        // if something else follows, fall back to the position right after
        // the '\r'.
        usz_t offset_after_cr = offset;
        if(rr_extract_utf8(string, offset, &offset) != '\n')
            offset = offset_after_cr;

        (*next) = offset;
        return TRUE;
    }

    if(subject == '\n')
    {
        (*next) = offset;
        return TRUE;
    }

    return FALSE;
}