Add UTF-8 decoding utility and ASCII range check

This commit adds two main utilities:

1. It adds a function for extracting the Unicode rune encoded at a
   given offset in a UTF-8 formatted string.

2. It adds functions for checking if a rune is a Latin letter,
   whether it's lowercase or uppercase, if it's a digit, and so on.

This is particularly useful for writing the tokenizer for the scripts
which have to be parsed (the rule-parser will only accept tokens).
This commit is contained in:
Eric-Paul Ickhorn 2024-07-01 18:18:50 +02:00
parent 1bbe4fd1ca
commit 70c8fc6f72
2 changed files with 370 additions and 0 deletions

View File

@ -0,0 +1,88 @@
#ifndef REDUCTOR_TEXT_H
#define REDUCTOR_TEXT_H
#include <stdbool.h>
#include <stdint.h>
/// @brief Type used to hold a single rune (character value) as a 32-bit unsigned integer.
typedef uint32_t rune_t;
/// @brief Gets the length of the UTF-8 rune at a given offset.
/// @param source Source to get the bytes from.
/// @param len_source Length of 'source' in bytes.
/// @param offset Offset to read from; doesn't have to be the start of the UTF-8 character.
/// @return Length of the rune or zero on error.
uint32_t tred_get_utf8_rune_length(
const char *source,
uint32_t len_source,
uint32_t offset);
/// @brief Extracts a UTF-8 character at some offset.
/// @warning The length isn't provided by this function; it has to be obtained using
/// 'tred_get_utf8_rune_length'.
/// @param source Source string to get the bytes from.
/// @param len_source Length of 'source' in bytes.
/// @param offset Offset to read from; doesn't have to be the start of the UTF-8 character.
/// @return Rune read from 'source' at 'offset'.
rune_t tred_extract_utf8 (
const char *source,
uint32_t len_source,
uint32_t offset);
/// @brief Gets an ASCII character out of a source string, performing boundary checks.
/// @param source Source to read from
/// @param len_source Length of source to read from
/// @param offset Offset to read at
/// @return Rune read from 'source' at 'offset'.
rune_t tred_extract_ascii (
const char *source,
uint32_t len_source,
uint32_t offset);
/// @brief Gets the total length of a UTF-8 rune from its head (first) byte.
/// @param head First byte of the UTF-8 sequence.
/// @return 1 for an ASCII byte, 2 to 4 for the head of a multi-byte sequence,
/// or -1 if 'head' is a trailing byte or not a valid head byte.
int32_t tred_get_utf8_length_from_head(uint8_t head);
/// @brief Looks backwards from an offset for the head byte of the UTF-8 rune there.
/// @param source Source string to get the bytes from.
/// @param len_source Length of 'source' in bytes.
/// @param offset Offset to start at; may point at a trailing byte of the rune.
/// @return -1 if no head byte is found (the start of 'source' is reached or more
/// than three trailing bytes follow each other), 0 if 'offset' is out of bounds.
/// NOTE(review): on success this is meant to be the offset of the head byte;
/// verify against the implementation.
int32_t tred_get_utf8_head_offset(
const char *source,
uint32_t len_source,
uint32_t offset);
/// @brief Tests whether a rune is a blank (space or tab) rune.
/// @param rune Rune to test for being blank.
/// @return 'true' if 'rune' is a blank character, 'false' otherwise.
bool tred_rune_is_blank(rune_t rune);
/// @brief Tests a rune_t for being an uppercase letter.
/// @param rune Rune to test for being an uppercase Latin letter.
/// @return 'true' if 'rune' is an uppercase Latin letter, 'false' otherwise.
bool tred_rune_is_uppercase(rune_t rune);
/// @brief Tests a rune_t for being a lowercase letter.
/// @param rune Rune to test for being a lowercase Latin letter.
/// @return 'true' if 'rune' is a lowercase Latin letter, 'false' otherwise.
bool tred_rune_is_lowercase(rune_t rune);
/// @brief Tests a rune_t for being either lowercase or uppercase.
/// @param rune Rune to test for being a Latin letter.
/// @return 'true' if 'rune' is either lowercase or uppercase, 'false' otherwise.
bool tred_rune_is_alphabetic(rune_t rune);
/// @brief Tests whether a rune_t is a digit from 0 to 9.
/// @param rune Rune to test for being a digit.
/// @return 'true' if 'rune' is a digit from 0 to 9, 'false' otherwise.
bool tred_rune_is_numeric(rune_t rune);
/// @brief Tests whether a rune_t is either a digit from 0 to 9
/// or a letter (either lowercase or uppercase).
/// @param rune Rune to test for being alphanumeric
/// @return 'true' if 'rune' is alphanumeric, 'false' otherwise.
bool tred_rune_is_alphanumeric(rune_t rune);
/// @brief Tests whether a rune_t is a sign of one of the four ASCII sign ranges.
/// @param rune Rune to test for being a sign.
/// @return 'true' if 'rune' is from either of the four ASCII sign ranges, 'false' otherwise.
bool tred_rune_is_sign(rune_t rune);
/// @brief Tests whether a rune_t is an ASCII control character.
/// @param rune Rune to test for being a control character.
/// @return 'true' if 'rune' is below 32 or equal to 127, 'false' otherwise.
bool tred_rune_is_control(rune_t rune);
#endif // REDUCTOR_TEXT_H

282
src-c/utility/utf8.c Normal file
View File

@ -0,0 +1,282 @@
#include <reductor/internals/utility/text.h>
// Tests whether a byte is a UTF-8 trailing (continuation) byte, i.e. whether
// its two uppermost bits are '10'.
//
// The argument is converted to 'uint8_t' before shifting: bytes are read
// through 'char', which may be signed, and a negative value shifted right
// can never compare equal to 2. The argument is also parenthesized so that
// arbitrary expressions can be passed safely.
#define UTF8_IS_BYTE_TRAILING(byte) ((((uint8_t) (byte)) >> 6) == 2)
/// @brief Derives the total byte length of a UTF-8 rune from its head byte.
/// @param head First byte of the UTF-8 sequence.
/// @return 1 to 4 for a valid head byte, -1 if 'head' is a trailing byte
/// or starts with five or more one-bits.
int32_t tred_get_utf8_length_from_head(uint8_t head)
{
    // 0xxxxxxx: plain ASCII, a single byte.
    if((head & 0x80) == 0x00)
    {
        return 1;
    }
    // 110xxxxx: head of a two-byte sequence.
    if((head & 0xe0) == 0xc0)
    {
        return 2;
    }
    // 1110xxxx: head of a three-byte sequence.
    if((head & 0xf0) == 0xe0)
    {
        return 3;
    }
    // 11110xxx: head of a four-byte sequence.
    if((head & 0xf8) == 0xf0)
    {
        return 4;
    }
    // What remains is either a trailing byte (10xxxxxx) or a byte with
    // five or more leading one-bits (11111xxx); neither can start a rune.
    return -1;
}
/// @brief Finds the offset of the head byte of the UTF-8 rune that the
/// byte at 'offset' belongs to.
/// @param source Source string to get the bytes from.
/// @param len_source Length of 'source' in bytes.
/// @param offset Offset to start at; may point at a trailing byte.
/// @return Offset of the head byte; 0 if 'offset' is out of bounds;
/// -1 if the start of 'source' is reached or more than three trailing
/// bytes follow each other without a head byte being found.
int32_t tred_get_utf8_head_offset(
    const char *source,
    uint32_t len_source,
    uint32_t offset
) {
    if(offset >= len_source)
    {
        return 0;
    }
    uint32_t lookback_offset = offset;
    // A trailing byte has the bit pattern 10xxxxxx. The byte is converted
    // to 'uint8_t' first because 'char' may be signed, in which case bytes
    // >= 0x80 would be negative and never be recognized as trailing.
    while((((uint8_t) source[lookback_offset]) & 0xc0) == 0x80)
    {
        // There is nothing before the first byte, so no head can exist.
        if(lookback_offset == 0)
        {
            return -1;
        }
        // A rune has at most three trailing bytes; more is malformed.
        if((offset - lookback_offset) >= 3)
        {
            return -1;
        }
        --lookback_offset;
    }
    // Return the position that was found, not the position started from.
    return (int32_t) lookback_offset;
}
/// @brief Gets the length of the UTF-8 rune at a given offset.
/// @param source Source to get the bytes from.
/// @param len_source Length of 'source' in bytes.
/// @param offset Offset to read from; doesn't have to be the start of the rune.
/// @return Length of the rune in bytes, or 0 if 'offset' is out of bounds,
/// no head byte could be found or the head byte is invalid.
uint32_t tred_get_utf8_rune_length(
    const char *source,
    uint32_t len_source,
    uint32_t offset
) {
    if(offset >= len_source)
    {
        return 0;
    }
    int32_t head_offset = tred_get_utf8_head_offset(source, len_source, offset);
    // A negative head offset signals failure and must not be used as an index.
    if(head_offset < 0)
    {
        return 0;
    }
    uint8_t head = (uint8_t) source[head_offset];
    // Kept signed so that the -1 error value can be tested for directly.
    int32_t rune_length = tred_get_utf8_length_from_head(head);
    if(rune_length < 0)
    {
        return 0;
    }
    return (uint32_t) rune_length;
}
/// @brief Decodes the UTF-8 rune that the byte at 'offset' belongs to.
/// @param source Source string to get the bytes from.
/// @param len_source Length of 'source' in bytes.
/// @param offset Offset to read from; doesn't have to be the start of the rune.
/// @return The decoded rune, or 0 if 'offset' is out of bounds, no valid head
/// byte is found, the rune extends past the end of 'source' or one of
/// its trailing bytes is malformed.
rune_t tred_extract_utf8(
    const char *source,
    uint32_t len_source,
    uint32_t offset
) {
    if(offset >= len_source)
    {
        return 0;
    }
    int32_t head_offset = tred_get_utf8_head_offset(source, len_source, offset);
    // A negative head offset signals failure and must not be used as an index.
    if(head_offset < 0)
    {
        return 0;
    }
    uint8_t head = (uint8_t) source[head_offset];
    int32_t rune_length = tred_get_utf8_length_from_head(head);
    if(rune_length < 1)
    {
        return 0;
    }
    // The rune may end exactly at the end of the source, but not beyond it.
    if(((uint32_t) rune_length) > (len_source - (uint32_t) head_offset))
    {
        return 0;
    }
    // ASCII bytes carry their value in all seven low bits.
    if(rune_length == 1)
    {
        return head;
    }
    // A head byte of an n-byte rune has n one-bits followed by a zero-bit;
    // only the remaining (7 - n) low bits are payload.
    rune_t result = head & (0xffu >> (rune_length + 1));
    int32_t trailing_index = 1;
    while(trailing_index < rune_length)
    {
        uint8_t trailing = (uint8_t) source[head_offset + trailing_index];
        // Every byte after the head must have the form 10xxxxxx.
        if((trailing & 0xc0) != 0x80)
        {
            return 0;
        }
        // Each trailing byte contributes six payload bits.
        result = (result << 6) | (trailing & 0x3f);
        ++trailing_index;
    }
    return result;
}
/// @brief Gets a single byte out of a source string, performing boundary checks.
/// @param source Source to read from.
/// @param len_source Length of 'source' in bytes.
/// @param offset Offset to read at.
/// @return Value of the byte at 'offset' (0 to 255), or 0 if 'offset' is
/// out of bounds.
rune_t tred_extract_ascii (
    const char *source,
    uint32_t len_source,
    uint32_t offset
) {
    if(offset >= len_source)
    {
        return 0;
    }
    // 'char' may be signed; without the conversion a byte >= 0x80 would be
    // sign-extended into a huge rune value instead of its byte value.
    return (uint8_t) source[offset];
}
/// @brief Tests whether a rune is a space or a horizontal tab.
/// @param rune Rune to test.
/// @return 'true' if 'rune' is ' ' or '\t', 'false' otherwise.
bool tred_rune_is_blank(rune_t rune)
{
    return (rune == ' ') || (rune == '\t');
}
/// @brief Tests a rune for being a lowercase Latin letter.
/// @param rune Rune to test.
/// @return 'true' if 'rune' lies in the range 'a' to 'z' (inclusive),
/// 'false' otherwise.
bool tred_rune_is_lowercase(rune_t rune)
{
    if(rune < 'a')
    {
        return false;
    }
    // Upper bound: everything *above* 'z' is rejected.
    if(rune > 'z')
    {
        return false;
    }
    return true;
}
/// @brief Tests a rune for being an uppercase Latin letter.
/// @param rune Rune to test.
/// @return 'true' if 'rune' lies in the range 'A' to 'Z' (inclusive),
/// 'false' otherwise.
bool tred_rune_is_uppercase(rune_t rune)
{
    if(rune < 'A')
    {
        return false;
    }
    // Upper bound: everything *above* 'Z' is rejected.
    if(rune > 'Z')
    {
        return false;
    }
    return true;
}
/// @brief Tests a rune for being a Latin letter of either case.
/// @param rune Rune to test.
/// @return 'true' if 'tred_rune_is_lowercase' or 'tred_rune_is_uppercase'
/// accepts 'rune', 'false' otherwise.
bool tred_rune_is_alphabetic(rune_t rune)
{
    if(tred_rune_is_lowercase(rune))
    {
        return true;
    }
    if(tred_rune_is_uppercase(rune))
    {
        return true;
    }
    // Every path must return a value; flowing off the end of a non-void
    // function and using the result is undefined behaviour.
    return false;
}
/// @brief Tests whether a rune is a decimal digit.
/// @param rune Rune to test.
/// @return 'true' if 'rune' lies in the range '0' to '9' (inclusive),
/// 'false' otherwise.
bool tred_rune_is_numeric(rune_t rune)
{
    return ('0' <= rune) && (rune <= '9');
}
/// @brief Tests whether a rune is a letter or a digit.
/// @param rune Rune to test.
/// @return 'true' if 'tred_rune_is_alphabetic' or 'tred_rune_is_numeric'
/// accepts 'rune', 'false' otherwise.
bool tred_rune_is_alphanumeric(rune_t rune)
{
    return tred_rune_is_alphabetic(rune) || tred_rune_is_numeric(rune);
}
/// @brief Tests whether a rune lies in the first sign range, 0x21 to 0x2f.
/// @param rune Rune to test.
/// @return 'true' if 0x21 <= 'rune' <= 0x2f, 'false' otherwise.
bool tred_rune_is_block_1_sign(rune_t rune)
{
    return (0x21 <= rune) && (rune <= 0x2f);
}
/// @brief Tests whether a rune lies in the second sign range, 0x3a to 0x40.
/// @param rune Rune to test.
/// @return 'true' if 0x3a <= 'rune' <= 0x40, 'false' otherwise.
bool tred_rune_is_block_2_sign(rune_t rune)
{
    return (0x3a <= rune) && (rune <= 0x40);
}
/// @brief Tests whether a rune lies in the third sign range, 0x5b to 0x60.
/// @param rune Rune to test.
/// @return 'true' if 0x5b <= 'rune' <= 0x60, 'false' otherwise.
bool tred_rune_is_block_3_sign(rune_t rune)
{
    return (0x5b <= rune) && (rune <= 0x60);
}
/// @brief Tests whether a rune lies in the fourth sign range, 0x7b to 0x7e.
/// @param rune Rune to test.
/// @return 'true' if 0x7b <= 'rune' <= 0x7e, 'false' otherwise.
bool tred_rune_is_block_4_sign(rune_t rune)
{
    return (0x7b <= rune) && (rune <= 0x7e);
}
/// @brief Tests whether a rune belongs to any of the four sign ranges.
/// @param rune Rune to test.
/// @return 'true' if any of the four 'tred_rune_is_block_N_sign' checks
/// accepts 'rune', 'false' otherwise.
bool tred_rune_is_sign(rune_t rune)
{
    // The ranges are probed in ascending order; '||' stops at the first hit.
    return tred_rune_is_block_1_sign(rune)
        || tred_rune_is_block_2_sign(rune)
        || tred_rune_is_block_3_sign(rune)
        || tred_rune_is_block_4_sign(rune);
}
/// @brief Tests whether a rune is a control character.
/// @param rune Rune to test.
/// @return 'true' if 'rune' is below 32 or equal to 127, 'false' otherwise.
bool tred_rune_is_control(rune_t rune)
{
    const bool is_below_space = (rune < 32);
    const bool is_delete = (rune == 127);
    return is_below_space || is_delete;
}