Reductor/src-c/utility/utf8.c

283 lines
4.8 KiB
C

#include <reductor/internals/utility/text.h>
// True when 'byte' is a UTF-8 continuation (trailing) byte, i.e. its two
// uppermost bits are '10'. The test masks rather than shifts so that it
// stays correct when 'byte' is a plain 'char' that is signed on the target
// (shifting the sign-extended value never yields 2). The argument is
// parenthesized so that compound expressions expand as intended.
#define UTF8_IS_BYTE_TRAILING(byte) (((byte) & 0xC0) == 0x80)
// Determines how many bytes a UTF-8 sequence occupies by inspecting the
// bit pattern of its first (head) byte.
//
// Returns 1 for 0xxxxxxx, 2 for 110xxxxx, 3 for 1110xxxx and 4 for
// 11110xxx. Returns -1 when 'head' is a trailing byte (10xxxxxx) or has
// five or more leading one-bits (11111xxx).
int32_t tred_get_utf8_length_from_head(uint8_t head)
{
    // Uppermost bit clear: plain ASCII.
    if((head & 0x80) == 0x00)
    {
        return 1;
    }
    // '110' prefix: head of a two-byte sequence.
    if((head & 0xE0) == 0xC0)
    {
        return 2;
    }
    // '1110' prefix: head of a three-byte sequence.
    if((head & 0xF0) == 0xE0)
    {
        return 3;
    }
    // '11110' prefix: head of a four-byte sequence.
    if((head & 0xF8) == 0xF0)
    {
        return 4;
    }
    // Everything else is either a trailing byte or not a valid head.
    return -1;
}
// Finds the offset of the head (first) byte of the UTF-8 sequence that
// contains the byte at 'offset', by walking backwards over trailing bytes.
//
// source:     the bytes to inspect.
// len_source: number of bytes available in 'source'.
// offset:     offset of any byte belonging to the sequence of interest.
//
// Returns the offset of the first non-trailing byte at or before 'offset'.
// Returns -1 if the start of the buffer is reached while still on a
// trailing byte, or if more than three trailing bytes precede the head.
// Returns 0 if 'offset' is out of range; that value cannot be told apart
// from a head located at offset 0, so callers should range-check first.
int32_t tred_get_utf8_head_offset(
    const char *source,
    uint32_t len_source,
    uint32_t offset
) {
    if(offset >= len_source)
    {
        return 0;
    }
    uint32_t lookback_offset = offset;
    // The byte is classified through an unsigned value: plain 'char' may be
    // signed, in which case bytes >= 0x80 would be sign-extended.
    while((((uint8_t) source[lookback_offset]) & 0xC0) == 0x80)
    {
        if(lookback_offset == 0)
        {
            // Trailing byte at the very start: there is no head before it.
            return -1;
        }
        if((offset - lookback_offset) >= 3)
        {
            // A sequence has at most three trailing bytes.
            return -1;
        }
        --lookback_offset;
    }
    // Report where the head was found, not the offset we started from.
    return (int32_t) lookback_offset;
}
// Reports the encoded length, in bytes, of the UTF-8 sequence that
// contains the byte at 'offset'.
//
// source:     the bytes to inspect.
// len_source: number of bytes available in 'source'.
// offset:     offset of any byte belonging to the sequence of interest.
//
// Returns the length announced by the sequence's head byte (1 to 4), or 0
// if 'offset' is out of range, no head byte can be located, or the byte
// found is not a valid head. The length is not checked against
// 'len_source'.
uint32_t tred_get_utf8_rune_length(
    const char *source,
    uint32_t len_source,
    uint32_t offset
) {
    if(offset >= len_source)
    {
        return 0;
    }
    int32_t head_offset = tred_get_utf8_head_offset(source, len_source, offset);
    if(head_offset < 0)
    {
        // No head byte found; indexing with -1 would read out of bounds.
        return 0;
    }
    uint8_t head = (uint8_t) source[head_offset];
    // Kept signed so that the -1 error value is tested without relying on
    // an implicit signed-to-unsigned conversion.
    int32_t rune_length = tred_get_utf8_length_from_head(head);
    if(rune_length < 0)
    {
        return 0;
    }
    return (uint32_t) rune_length;
}
// Decodes the UTF-8 sequence that contains the byte at 'offset' into its
// code point.
//
// source:     the bytes to decode from.
// len_source: number of bytes available in 'source'.
// offset:     offset of any byte belonging to the sequence of interest.
//
// Returns the decoded code point. Returns 0 if 'offset' is out of range,
// no valid head byte can be located, the sequence would extend past
// 'len_source', or one of the bytes after the head is not a trailing byte
// (10xxxxxx). A return of 0 therefore cannot be told apart from a decoded
// NUL character.
rune_t tred_extract_utf8(
    const char *source,
    uint32_t len_source,
    uint32_t offset
) {
    // Payload bits carried by the head byte, indexed by (length - 1):
    // 0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx.
    static const uint8_t head_payload_masks[4] = { 0x7f, 0x1f, 0x0f, 0x07 };

    if(offset >= len_source)
    {
        return 0;
    }
    int32_t head_offset = tred_get_utf8_head_offset(source, len_source, offset);
    if(head_offset < 0)
    {
        // No head byte found; indexing with -1 would read out of bounds.
        return 0;
    }
    uint8_t head = (uint8_t) source[head_offset];
    int32_t rune_length = tred_get_utf8_length_from_head(head);
    if((rune_length < 1) || (rune_length > 4))
    {
        return 0;
    }
    // A sequence that ends exactly at the end of the buffer is complete,
    // hence '>' rather than '>='.
    if(((uint32_t) head_offset + (uint32_t) rune_length) > len_source)
    {
        return 0;
    }
    // Only the head's payload bits go into the result; its length-marker
    // bits must not leak into the code point.
    uint32_t result = head & head_payload_masks[rune_length - 1];
    uint32_t trailing_offset = 1;
    while(trailing_offset < (uint32_t) rune_length)
    {
        uint8_t trailing = (uint8_t) source[(uint32_t) head_offset + trailing_offset];
        if((trailing & 0xC0) != 0x80)
        {
            // Truncated or malformed sequence.
            return 0;
        }
        // Each trailing byte contributes its lower six bits.
        result = (result << 6) | (uint32_t) (trailing & 0x3f);
        ++trailing_offset;
    }
    return result;
}
// Reads the single byte at 'offset' and returns it as a rune.
//
// source:     the bytes to read from.
// len_source: number of bytes available in 'source'.
// offset:     offset of the byte to read.
//
// Returns the byte's value in the range 0 to 255, or 0 if 'offset' is out
// of range.
rune_t tred_extract_ascii (
    const char *source,
    uint32_t len_source,
    uint32_t offset
) {
    if(offset >= len_source)
    {
        return 0;
    }
    // Plain 'char' may be signed; going through an unsigned byte keeps
    // values >= 0x80 from being sign-extended into the rune.
    return (uint8_t) source[offset];
}
// Tells whether 'rune' is horizontal blank space: a space character or a
// horizontal tab.
bool tred_rune_is_blank(rune_t rune)
{
    return (rune == ' ') || (rune == '\t');
}
// Tells whether 'rune' is a lowercase letter in the range 'a' to 'z',
// both ends included.
bool tred_rune_is_lowercase(rune_t rune)
{
    if(rune < 'a')
    {
        return false;
    }
    // Reject what lies above 'z'; testing '< z' here would reject every
    // lowercase letter except 'z' itself.
    if(rune > 'z')
    {
        return false;
    }
    return true;
}
// Tells whether 'rune' is an uppercase letter in the range 'A' to 'Z',
// both ends included.
bool tred_rune_is_uppercase(rune_t rune)
{
    if(rune < 'A')
    {
        return false;
    }
    // Reject what lies above 'Z'; testing '< Z' here would reject every
    // uppercase letter except 'Z' itself.
    if(rune > 'Z')
    {
        return false;
    }
    return true;
}
// Tells whether 'rune' is a letter, i.e. accepted by either
// tred_rune_is_lowercase or tred_rune_is_uppercase.
bool tred_rune_is_alphabetic(rune_t rune)
{
    if(tred_rune_is_lowercase(rune))
    {
        return true;
    }
    if(tred_rune_is_uppercase(rune))
    {
        return true;
    }
    // Explicit negative result: flowing off the end of a non-void function
    // and using its value is undefined behavior.
    return false;
}
// Tells whether 'rune' is a decimal digit in the range '0' to '9', both
// ends included.
bool tred_rune_is_numeric(rune_t rune)
{
    return (rune >= '0') && (rune <= '9');
}
// Tells whether 'rune' is accepted by tred_rune_is_alphabetic or, failing
// that, by tred_rune_is_numeric.
bool tred_rune_is_alphanumeric(rune_t rune)
{
    return tred_rune_is_alphabetic(rune) || tred_rune_is_numeric(rune);
}
// Tells whether 'rune' lies in the first block of sign characters,
// 0x21 to 0x2f inclusive ('!' through '/' in ASCII).
bool tred_rune_is_block_1_sign(rune_t rune)
{
    return (rune >= 0x21) && (rune <= 0x2f);
}
// Tells whether 'rune' lies in the second block of sign characters,
// 0x3a to 0x40 inclusive (':' through '@' in ASCII).
bool tred_rune_is_block_2_sign(rune_t rune)
{
    return (rune >= 0x3a) && (rune <= 0x40);
}
// Tells whether 'rune' lies in the third block of sign characters,
// 0x5b to 0x60 inclusive ('[' through '`' in ASCII).
bool tred_rune_is_block_3_sign(rune_t rune)
{
    return (rune >= 0x5b) && (rune <= 0x60);
}
// Tells whether 'rune' lies in the fourth block of sign characters,
// 0x7b to 0x7e inclusive ('{' through '~' in ASCII).
bool tred_rune_is_block_4_sign(rune_t rune)
{
    return (rune >= 0x7b) && (rune <= 0x7e);
}
// Tells whether 'rune' belongs to any of the four sign blocks. The block
// predicates are consulted in order, stopping at the first that accepts.
bool tred_rune_is_sign(rune_t rune)
{
    return tred_rune_is_block_1_sign(rune)
        || tred_rune_is_block_2_sign(rune)
        || tred_rune_is_block_3_sign(rune)
        || tred_rune_is_block_4_sign(rune);
}
// Tells whether 'rune' counts as a control character: any value below 32,
// or the value 127 (DEL in ASCII).
bool tred_rune_is_control(rune_t rune)
{
    return (rune < 32) || (rune == 127);
}