// UTF-8 decoding helpers and ASCII rune classification predicates.
|
#include <reductor/internals/utility/text.h>
|
||
|
|
||
|
// True when 'byte' is a UTF-8 continuation byte, i.e. its two uppermost
// bits are '10'. The argument is parenthesised and narrowed to unsigned char
// first: plain 'char' may be signed, and right-shifting a negative value
// would never yield 2.
#define UTF8_IS_BYTE_TRAILING(byte) ((((unsigned char) (byte)) >> 6) == 2)
|
||
|
|
||
|
// Determines how many bytes a UTF-8 sequence occupies, judging only by the
// bit pattern of its first byte.
//
// Returns 1 for 0xxxxxxx, 2 for 110xxxxx, 3 for 1110xxxx and 4 for 11110xxx.
// Returns -1 if 'head' is a trailing byte (10xxxxxx) or starts with five or
// more set bits (11111xxx).
int32_t tred_get_utf8_length_from_head(uint8_t head)
{
	// 0xxxxxxx: plain ASCII.
	if((head & 0x80) == 0x00)
	{
		return 1;
	}
	// 10xxxxxx: a trailing byte, which can never start a sequence.
	if((head & 0xc0) == 0x80)
	{
		return -1;
	}
	// 110xxxxx
	if((head & 0xe0) == 0xc0)
	{
		return 2;
	}
	// 1110xxxx
	if((head & 0xf0) == 0xe0)
	{
		return 3;
	}
	// 11110xxx
	if((head & 0xf8) == 0xf0)
	{
		return 4;
	}
	// 11111xxx: no such head byte.
	return -1;
}
|
||
|
|
||
|
// Finds the offset of the first byte of the UTF-8 sequence that contains
// the byte at 'offset', by stepping backwards over trailing bytes
// (10xxxxxx) until a byte that is not a trailing byte is reached.
//
// source:     buffer holding the encoded text
// len_source: number of bytes in 'source'
// offset:     offset of any byte belonging to the sequence of interest
//
// Returns the offset of the byte the backwards walk stops at.
// Returns 0 if 'offset' lies outside the buffer.
// Returns -1 if the start of the buffer is a trailing byte, or if more
// than three consecutive trailing bytes are found (no valid sequence has
// more than three).
int32_t tred_get_utf8_head_offset(
	const char *source,
	uint32_t len_source,
	uint32_t offset
) {
	if(offset >= len_source)
	{
		return 0;
	}
	uint32_t lookback_offset = offset;
	// The byte is inspected as unsigned char: plain 'char' may be signed,
	// in which case a right shift of a byte >= 0x80 would not produce 2.
	while((((unsigned char) source[lookback_offset]) >> 6) == 2)
	{
		if(lookback_offset == 0)
		{
			return -1;
		}
		if((offset - lookback_offset) >= 3)
		{
			return -1;
		}
		--lookback_offset;
	}
	// Report where the walk ended, not where it started.
	return (int32_t) lookback_offset;
}
|
||
|
|
||
|
// Reports the encoded length, in bytes, of the UTF-8 sequence related to
// the byte at 'offset'.
//
// source:     buffer holding the encoded text
// len_source: number of bytes in 'source'
// offset:     offset of a byte belonging to the sequence of interest
//
// Returns the length given by the sequence's head byte (1 to 4).
// Returns 0 if 'offset' lies outside the buffer, if no head byte could be
// located, or if the byte found is not a valid head byte.
uint32_t tred_get_utf8_rune_length(
	const char *source,
	uint32_t len_source,
	uint32_t offset
) {
	if(offset >= len_source)
	{
		return 0;
	}
	int32_t head_offset = tred_get_utf8_head_offset(source, len_source, offset);
	// A negative offset signals a failed lookup and must not be used as
	// an index into 'source'.
	if(head_offset < 0)
	{
		return 0;
	}
	uint8_t head = (uint8_t) source[head_offset];
	// Kept signed so that the -1 error value is tested as what it is.
	int32_t rune_length = tred_get_utf8_length_from_head(head);
	if(rune_length < 0)
	{
		return 0;
	}
	return (uint32_t) rune_length;
}
|
||
|
|
||
|
// Decodes the UTF-8 sequence related to the byte at 'offset' into its
// code point.
//
// source:     buffer holding the encoded text
// len_source: number of bytes in 'source'
// offset:     offset of a byte belonging to the sequence of interest
//
// Returns the decoded code point.
// Returns 0 if 'offset' lies outside the buffer, if no valid head byte
// could be located, if the sequence would extend past the end of the
// buffer, or if one of its trailing bytes is not of the form 10xxxxxx.
// Note that 0 is therefore also what an encoded NUL decodes to.
rune_t tred_extract_utf8(
	const char *source,
	uint32_t len_source,
	uint32_t offset
) {
	if(offset >= len_source)
	{
		return 0;
	}
	int32_t head_offset = tred_get_utf8_head_offset(source, len_source, offset);
	// A negative offset signals a failed lookup and must not be used as
	// an index into 'source'.
	if(head_offset < 0)
	{
		return 0;
	}
	uint8_t head = (uint8_t) source[head_offset];
	int32_t rune_length = tred_get_utf8_length_from_head(head);
	if(rune_length < 0)
	{
		return 0;
	}
	// The whole sequence must lie inside the buffer. A sequence whose
	// last byte is the last byte of the buffer is fine, hence '>'.
	if(((uint32_t) head_offset + (uint32_t) rune_length) > len_source)
	{
		return 0;
	}
	// Single-byte sequences carry all seven payload bits in the head.
	if(rune_length == 1)
	{
		return head;
	}
	// A head byte of an n-byte sequence starts with n set bits and one
	// cleared bit, leaving (7 - n) payload bits: 110xxxxx, 1110xxxx, 11110xxx.
	uint32_t result = head & (0xffu >> (rune_length + 1));
	int32_t trailing_offset = 1;
	while(trailing_offset < rune_length)
	{
		uint8_t trailing = (uint8_t) source[head_offset + trailing_offset];
		// Every byte after the head must be 10xxxxxx.
		if((trailing >> 6) != 2)
		{
			return 0;
		}
		// Each trailing byte contributes its lower six bits.
		result = (result << 6) | (trailing & 0x3fu); // 0b00111111
		++trailing_offset;
	}
	return result;
}
|
||
|
|
||
|
// Reads the single byte at 'offset' and returns it as a rune.
//
// source:     buffer holding the text
// len_source: number of bytes in 'source'
// offset:     offset of the byte to read
//
// Returns the byte's value, or 0 if 'offset' lies outside the buffer.
// The byte is not checked for actually being ASCII (below 0x80).
rune_t tred_extract_ascii (
	const char *source,
	uint32_t len_source,
	uint32_t offset
) {
	if(offset >= len_source)
	{
		return 0;
	}
	// Read as unsigned char: plain 'char' may be signed, and a byte
	// >= 0x80 would otherwise be sign-extended when widened to a rune.
	return (unsigned char) source[offset];
}
|
||
|
|
||
|
// Tells whether 'rune' is horizontal blank space: a space or a tab.
bool tred_rune_is_blank(rune_t rune)
{
	return (rune == ' ') || (rune == '\t');
}
|
||
|
|
||
|
// Tells whether 'rune' is an ASCII lowercase letter, 'a' through 'z'
// inclusive.
bool tred_rune_is_lowercase(rune_t rune)
{
	if(rune < 'a')
	{
		return false;
	}
	// Reject everything above 'z' (not below it, which would reject the
	// whole alphabet).
	if(rune > 'z')
	{
		return false;
	}
	return true;
}
|
||
|
|
||
|
// Tells whether 'rune' is an ASCII uppercase letter, 'A' through 'Z'
// inclusive.
bool tred_rune_is_uppercase(rune_t rune)
{
	if(rune < 'A')
	{
		return false;
	}
	// Reject everything above 'Z' (not below it, which would reject the
	// whole alphabet).
	if(rune > 'Z')
	{
		return false;
	}
	return true;
}
|
||
|
|
||
|
// Tells whether 'rune' is a letter according to tred_rune_is_lowercase
// or tred_rune_is_uppercase.
bool tred_rune_is_alphabetic(rune_t rune)
{
	if(tred_rune_is_lowercase(rune))
	{
		return true;
	}
	if(tred_rune_is_uppercase(rune))
	{
		return true;
	}
	// Every path must return a value; falling off the end of a non-void
	// function and using the result is undefined behaviour.
	return false;
}
|
||
|
|
||
|
// Tells whether 'rune' is an ASCII decimal digit, '0' through '9'
// inclusive.
bool tred_rune_is_numeric(rune_t rune)
{
	return (rune >= '0') && (rune <= '9');
}
|
||
|
|
||
|
// Tells whether 'rune' is accepted by tred_rune_is_alphabetic or, failing
// that, by tred_rune_is_numeric.
bool tred_rune_is_alphanumeric(rune_t rune)
{
	return tred_rune_is_alphabetic(rune) || tred_rune_is_numeric(rune);
}
|
||
|
|
||
|
// Tells whether 'rune' lies in the first sign block, 0x21 through 0x2f
// inclusive ('!' through '/' in ASCII).
bool tred_rune_is_block_1_sign(rune_t rune)
{
	return (rune >= 0x21) && (rune <= 0x2f);
}
|
||
|
|
||
|
// Tells whether 'rune' lies in the second sign block, 0x3a through 0x40
// inclusive (':' through '@' in ASCII).
bool tred_rune_is_block_2_sign(rune_t rune)
{
	return (rune >= 0x3a) && (rune <= 0x40);
}
|
||
|
|
||
|
// Tells whether 'rune' lies in the third sign block, 0x5b through 0x60
// inclusive ('[' through '`' in ASCII).
bool tred_rune_is_block_3_sign(rune_t rune)
{
	return (rune >= 0x5b) && (rune <= 0x60);
}
|
||
|
|
||
|
// Tells whether 'rune' lies in the fourth sign block, 0x7b through 0x7e
// inclusive ('{' through '~' in ASCII).
bool tred_rune_is_block_4_sign(rune_t rune)
{
	return (rune >= 0x7b) && (rune <= 0x7e);
}
|
||
|
|
||
|
// Tells whether 'rune' belongs to any of the four sign blocks. The blocks
// are probed in ascending order and the first match wins.
bool tred_rune_is_sign(rune_t rune)
{
	return tred_rune_is_block_1_sign(rune)
		|| tred_rune_is_block_2_sign(rune)
		|| tred_rune_is_block_3_sign(rune)
		|| tred_rune_is_block_4_sign(rune);
}
|
||
|
|
||
|
// Tells whether 'rune' is a control character: any value below 32, or
// the value 127.
bool tred_rune_is_control(rune_t rune)
{
	return (rune < 32) || (rune == 127);
}
|