Add UTF-8 decoding utility and ASCII range check

This commit adds two main utilities: 1. It adds a function for extracting the Unicode rune encoded at a given offset in a UTF-8 encoded string. 2. It adds functions for checking whether a rune is a Latin letter, whether it's lowercase or uppercase, whether it's a digit, and so on. This is particularly useful for writing the tokenizer for the scripts which have to be parsed (the rule-parser will only accept tokens).
2024-07-01 18:18:50 +02:00 · 2024-07-01 18:18:50 +02:00 · 70c8fc6f72
parent 1bbe4fd1ca
commit 70c8fc6f72
2 changed files with 370 additions and 0 deletions
--- a/inc-c/reductor/internals/utility/text.h
+++ b/inc-c/reductor/internals/utility/text.h
@ -0,0 +1,88 @@
+
+#ifndef REDUCTOR_TEXT_H
+#define REDUCTOR_TEXT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+typedef uint32_t rune_t;
+
+/// @brief Gets the length of the UTF-8 rune at a given offset.
+/// @param source Source to get the bytes from.
+/// @param len_source Length of 'source' in bytes.
+/// @param offset Offset to read from; doesn't have to be the start of the UTF-8 character.
+/// @return Length of the rune or zero on error.
+uint32_t tred_get_utf8_rune_length(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset);
+
+/// @brief Extracts a UTF-8 character at some offset.
+/// @warning The length isn't provided by this function; it has to be gotten using
+///          'tred_get_utf8_rune_length'.
+/// @param source Source string to get the bytes from.
+/// @param len_source Length of 'source' in bytes.
+/// @param offset Offset to read from; doesn't have to be the start of the UTF-8 character.
+/// @return Rune read from 'source' at 'offset'.
+rune_t tred_extract_utf8 (
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset);
+
+/// @brief Gets an ASCII character out of a source string, performing boundary checks.
+/// @param source Source to read from
+/// @param len_source Length of source to read from
+/// @param offset Offset to read at
+/// @return Rune read from 'source' at 'offset'.
+rune_t tred_extract_ascii (
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset);
+
+/// @brief Gets the total length in bytes of a UTF-8 rune from its head (first) byte.
+/// @param head Head byte of the rune.
+/// @return Length of the rune (1 to 4), or -1 if 'head' is a trailing byte
+///         or its five uppermost bits are all set.
+int32_t tred_get_utf8_length_from_head(uint8_t head);
+
+/// @brief Searches backwards from 'offset', over at most three trailing bytes,
+///        for the head byte of the UTF-8 rune covering 'offset'.
+/// @param source Source to get the bytes from.
+/// @param len_source Length of 'source' in bytes.
+/// @param offset Offset to start searching at.
+/// @return A non-negative offset on success, or -1 if the start of 'source' or
+///         the three-byte lookback limit is reached without finding a head byte.
+int32_t tred_get_utf8_head_offset(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset);
+
+
+
+/// @brief Tests whether a rune is a blank (space or tab) rune.
+/// @param rune Rune to test for being blank.
+/// @return 'true' if 'rune' is a blank character, 'false' otherwise.
+bool tred_rune_is_blank(rune_t rune);
+
+/// @brief Tests a rune_t for being an uppercase letter.
+/// @param rune Rune to test for being an uppercase Latin letter.
+/// @return 'true' if 'rune' is an uppercase Latin letter, 'false' otherwise.
+bool tred_rune_is_uppercase(rune_t rune);
+
+/// @brief Tests a rune_t for being a lowercase letter.
+/// @param rune Rune to test for being a lowercase Latin letter.
+/// @return 'true' if 'rune' is a lowercase Latin letter, 'false' otherwise.
+bool tred_rune_is_lowercase(rune_t rune);
+
+/// @brief Tests a rune_t for being either lowercase or uppercase.
+/// @param rune Rune to test for being a Latin letter.
+/// @return 'true' if 'rune' is either lowercase or uppercase, 'false' otherwise.
+bool tred_rune_is_alphabetic(rune_t rune);
+
+/// @brief Tests whether a rune_t is a digit from 0 to 9.
+/// @param rune Rune to test for being a digit.
+/// @return 'true' if 'rune' is a digit from 0 to 9, 'false' otherwise.
+bool tred_rune_is_numeric(rune_t rune);
+
+/// @brief Tests whether a rune_t is either a digit from 0 to 9
+///        or a letter (either lowercase or uppercase).
+/// @param rune Rune to test for being alphanumeric
+/// @return 'true' if 'rune' is alphanumeric, 'false' otherwise.
+bool tred_rune_is_alphanumeric(rune_t rune);
+
+/// @brief Tests whether a rune_t is a sign of one of the four ASCII sign ranges.
+/// @param rune Rune to test for being a sign.
+/// @return 'true' if 'rune' is from either of the four ASCII sign ranges, 'false' otherwise.
+bool tred_rune_is_sign(rune_t rune);
+/// @brief Tests whether a rune_t is an ASCII control character.
+/// @param rune Rune to test for being a control character.
+/// @return 'true' if 'rune' is below 32 or equal to 127, 'false' otherwise.
+bool tred_rune_is_control(rune_t rune);
+
+#endif // REDUCTOR_TEXT_H
--- a/src-c/utility/utf8.c
+++ b/src-c/utility/utf8.c
@ -0,0 +1,282 @@
+#include <reductor/internals/utility/text.h>
+
+// Tests whether 'byte' is a UTF-8 trailing (continuation) byte, i.e. whether
+// its two uppermost bits are '10'.
+// The argument is converted to uint8_t before shifting: plain 'char' may be
+// signed, and right-shifting a negative value does not produce 2, so without
+// the conversion no byte >= 0x80 read from a 'char' buffer would ever match.
+#define UTF8_IS_BYTE_TRAILING(byte) ((((uint8_t) (byte)) >> 6) == 2)
+
+/// Classifies a UTF-8 head byte by its leading bits and reports how many
+/// bytes the rune it introduces occupies.
+/// Returns 1 to 4 for a head byte, or -1 if 'head' is a trailing byte
+/// (10xxxxxx) or has its five uppermost bits set (11111xxx).
+int32_t tred_get_utf8_length_from_head(uint8_t head)
+{
+    // 0xxxxxxx: plain ASCII, a single byte.
+    if((head & 0x80) == 0x00)
+    {
+        return 1;
+    }
+    // 110xxxxx: head of a two-byte rune.
+    if((head & 0xe0) == 0xc0)
+    {
+        return 2;
+    }
+    // 1110xxxx: head of a three-byte rune.
+    if((head & 0xf0) == 0xe0)
+    {
+        return 3;
+    }
+    // 11110xxx: head of a four-byte rune.
+    if((head & 0xf8) == 0xf0)
+    {
+        return 4;
+    }
+    // Everything left is either a trailing byte (10xxxxxx) or 11111xxx;
+    // neither can start a rune.
+    return -1;
+}
+
+/// Finds the offset of the head byte of the UTF-8 rune that covers 'offset'.
+///
+/// Starting at 'offset', the search walks backwards over trailing bytes
+/// (10xxxxxx) until it reaches a byte that is not a trailing byte. A rune has
+/// at most three trailing bytes, so the search gives up after stepping back
+/// three times.
+///
+/// The byte found is not checked any further: it is not verified that it is a
+/// valid head byte, nor that the length it declares actually reaches 'offset'.
+///
+/// Returns the offset of the byte found, or -1 if 'offset' lies outside
+/// 'source', if the start of 'source' is a trailing byte, or if more than
+/// three trailing bytes precede 'offset'.
+/// Offsets above INT32_MAX cannot be represented in the return type.
+int32_t tred_get_utf8_head_offset(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset
+) {
+    // Report -1 rather than 0 here: zero is a valid head offset and must not
+    // double as an error value.
+    if(offset >= len_source)
+    {
+        return -1;
+    }
+    uint32_t lookback_offset = offset;
+    // The byte is inspected as uint8_t because plain 'char' may be signed;
+    // the bit pattern 10xxxxxx marks a trailing byte.
+    while((((uint8_t) source[lookback_offset]) & 0xc0) == 0x80)
+    {
+        // A trailing byte at the very start of the source has no head.
+        if(lookback_offset == 0)
+        {
+            return -1;
+        }
+        // Already three bytes behind 'offset' and still on a trailing byte:
+        // no rune is that long.
+        if((offset - lookback_offset) >= 3)
+        {
+            return -1;
+        }
+        --lookback_offset;
+    }
+    // Return the position the search ended on, not the position it started at.
+    return (int32_t) lookback_offset;
+}
+
+/// Gets the length in bytes of the UTF-8 rune at 'offset', as declared by the
+/// rune's head byte.
+///
+/// The head byte is located with 'tred_get_utf8_head_offset' and classified
+/// with 'tred_get_utf8_length_from_head'. The declared length is returned
+/// as-is; it is not checked against 'len_source'.
+///
+/// Returns the length of the rune, or zero if 'offset' lies outside 'source',
+/// no head byte could be located, or the head byte is invalid.
+uint32_t tred_get_utf8_rune_length(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset
+) {
+    if(offset >= len_source)
+    {
+        return 0;
+    }
+    int32_t head_offset = tred_get_utf8_head_offset(source, len_source, offset);
+    // A negative head offset signals failure and must never be used as an index.
+    if(head_offset < 0)
+    {
+        return 0;
+    }
+    uint8_t head = (uint8_t) source[head_offset];
+    // Kept signed so that the -1 error value is tested directly instead of
+    // through an unsigned wrap-around comparison.
+    int32_t rune_length = tred_get_utf8_length_from_head(head);
+    if(rune_length < 0)
+    {
+        return 0;
+    }
+    return (uint32_t) rune_length;
+}
+
+/// Decodes the UTF-8 rune that covers 'offset' into its Unicode code point.
+///
+/// The head byte is located with 'tred_get_utf8_head_offset', its length is
+/// taken from 'tred_get_utf8_length_from_head', and the payload bits of the
+/// head byte and of every trailing byte are concatenated, most significant
+/// first.
+///
+/// Returns the decoded rune, or zero if 'offset' lies outside 'source', no
+/// valid head byte could be located, the rune would extend past the end of
+/// 'source', or one of the bytes following the head is not a trailing byte.
+/// Overlong encodings and surrogate code points are not rejected.
+rune_t tred_extract_utf8(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset
+) {
+    if(offset >= len_source)
+    {
+        return 0;
+    }
+    int32_t head_offset = tred_get_utf8_head_offset(source, len_source, offset);
+    // A negative head offset signals failure and must never be used as an index.
+    if(head_offset < 0)
+    {
+        return 0;
+    }
+    uint8_t head = (uint8_t) source[head_offset];
+    int32_t rune_length = tred_get_utf8_length_from_head(head);
+    if(rune_length < 0)
+    {
+        return 0;
+    }
+    // The rune occupies [head_offset, head_offset + rune_length); it fits as
+    // long as that end position does not exceed the source length. A rune
+    // that ends exactly at the end of the source is valid.
+    if(((uint32_t) head_offset + (uint32_t) rune_length) > len_source)
+    {
+        return 0;
+    }
+    // Number of payload bits in the head byte: 7 for ASCII (0xxxxxxx),
+    // otherwise 5, 4 or 3 for two-, three- and four-byte runes.
+    uint32_t head_payload_bits = 7;
+    if(rune_length > 1)
+    {
+        head_payload_bits = (uint32_t) (7 - rune_length);
+    }
+    rune_t result = head & ((1u << head_payload_bits) - 1u);
+    int32_t trailing_index = 1;
+    while(trailing_index < rune_length)
+    {
+        uint8_t trailing = (uint8_t) source[head_offset + trailing_index];
+        // Every byte after the head must have the form 10xxxxxx.
+        if((trailing & 0xc0) != 0x80)
+        {
+            return 0;
+        }
+        // Each trailing byte contributes its six lowest bits.
+        result = (result << 6) | (trailing & 0x3f);
+        ++trailing_index;
+    }
+    return result;
+}
+
+/// Reads the single byte at 'offset' out of 'source' after checking that
+/// 'offset' lies within 'len_source'.
+///
+/// Returns the byte at 'offset' as a rune in the range 0 to 255, or zero if
+/// 'offset' lies outside 'source'.
+rune_t tred_extract_ascii (
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset
+) {
+    if(offset >= len_source)
+    {
+        return 0;
+    }
+    // Converted through uint8_t first: plain 'char' may be signed, and a
+    // negative value would otherwise be widened into a huge rune.
+    return (rune_t) (uint8_t) source[offset];
+}
+
+/// Reports whether 'rune' is one of the two blank characters:
+/// the space or the horizontal tab.
+bool tred_rune_is_blank(rune_t rune)
+{
+    return (rune == ' ') || (rune == '\t');
+}
+
+/// Reports whether 'rune' is a lowercase Latin letter, i.e. lies in the
+/// inclusive range 'a' to 'z'.
+bool tred_rune_is_lowercase(rune_t rune)
+{
+    if(rune < 'a')
+    {
+        return false;
+    }
+    // Upper bound: anything beyond 'z' is not a lowercase letter.
+    if(rune > 'z')
+    {
+        return false;
+    }
+    return true;
+}
+
+/// Reports whether 'rune' is an uppercase Latin letter, i.e. lies in the
+/// inclusive range 'A' to 'Z'.
+bool tred_rune_is_uppercase(rune_t rune)
+{
+    if(rune < 'A')
+    {
+        return false;
+    }
+    // Upper bound: anything beyond 'Z' is not an uppercase letter.
+    if(rune > 'Z')
+    {
+        return false;
+    }
+    return true;
+}
+
+/// Reports whether 'rune' is a Latin letter, as decided by
+/// 'tred_rune_is_lowercase' and 'tred_rune_is_uppercase'.
+bool tred_rune_is_alphabetic(rune_t rune)
+{
+    if(tred_rune_is_lowercase(rune))
+    {
+        return true;
+    }
+    if(tred_rune_is_uppercase(rune))
+    {
+        return true;
+    }
+    // Every path must return a value; falling off the end of a non-void
+    // function whose result is used is undefined behaviour.
+    return false;
+}
+
+/// Reports whether 'rune' is a decimal digit, i.e. lies in the
+/// inclusive range '0' to '9'.
+bool tred_rune_is_numeric(rune_t rune)
+{
+    return ('0' <= rune) && (rune <= '9');
+}
+
+/// Reports whether 'rune' is accepted by 'tred_rune_is_alphabetic'
+/// or by 'tred_rune_is_numeric'.
+bool tred_rune_is_alphanumeric(rune_t rune)
+{
+    // The alphabetic test runs first; the numeric test only runs if it fails.
+    return tred_rune_is_alphabetic(rune) || tred_rune_is_numeric(rune);
+}
+
+/// Reports whether 'rune' lies in the first ASCII sign range,
+/// 0x21 ('!') to 0x2f ('/') inclusive.
+bool tred_rune_is_block_1_sign(rune_t rune)
+{
+    return (0x21 <= rune) && (rune <= 0x2f);
+}
+
+/// Reports whether 'rune' lies in the second ASCII sign range,
+/// 0x3a (':') to 0x40 ('@') inclusive.
+bool tred_rune_is_block_2_sign(rune_t rune)
+{
+    return (0x3a <= rune) && (rune <= 0x40);
+}
+
+/// Reports whether 'rune' lies in the third ASCII sign range,
+/// 0x5b ('[') to 0x60 ('`') inclusive.
+bool tred_rune_is_block_3_sign(rune_t rune)
+{
+    return (0x5b <= rune) && (rune <= 0x60);
+}
+
+/// Reports whether 'rune' lies in the fourth ASCII sign range,
+/// 0x7b ('{') to 0x7e ('~') inclusive.
+bool tred_rune_is_block_4_sign(rune_t rune)
+{
+    return (0x7b <= rune) && (rune <= 0x7e);
+}
+
+/// Reports whether 'rune' falls into any of the four ASCII sign ranges
+/// tested by the 'tred_rune_is_block_N_sign' helpers.
+bool tred_rune_is_sign(rune_t rune)
+{
+    // The blocks are tried in ascending order; the first match decides.
+    return tred_rune_is_block_1_sign(rune)
+        || tred_rune_is_block_2_sign(rune)
+        || tred_rune_is_block_3_sign(rune)
+        || tred_rune_is_block_4_sign(rune);
+}
+
+/// Reports whether 'rune' is an ASCII control character:
+/// any value below 32, or the value 127.
+bool tred_rune_is_control(rune_t rune)
+{
+    return (rune < 32) || (rune == 127);
+}