From 70c8fc6f72ce2920cc59c777befc12acff8feba7 Mon Sep 17 00:00:00 2001
From: Eric-Paul Ickhorn
Date: Mon, 1 Jul 2024 18:18:50 +0200
Subject: [PATCH] Add UTF-8 decoding utility and ASCII range check

This commit adds two main utilities:
1. It adds a function for extracting the Unicode rune encoded at a
   given offset in a UTF-8 formatted string.
2. It adds functions for checking if a rune is a Latin letter, whether
   it's lowercase or uppercase, if it's a digit, and so on.

This is particularly useful for writing the tokenizer for the scripts
which have to be parsed (the rule-parser will only accept tokens).
---
Review notes (this section is dropped by 'git am'):
 - The '#include' targets were empty; they are assumed to be <stdbool.h>,
   <stdint.h> and <reductor/internals/utility/text.h>.
 - tred_get_utf8_head_offset() returned 'offset' instead of the head
   offset and reported errors as 0, which is a valid offset.
 - The decoder dropped bit 6 of ASCII runes, added stray bits to three-
   and four-byte runes, rejected runes ending at the end of the buffer
   and could read source[-1].
 - Raw bytes are no longer shifted or returned as (possibly signed) char.
 - tred_rune_is_lowercase()/_uppercase() rejected every letter and
   tred_rune_is_alphabetic() had a path without a return statement.
 - The blob ids in the 'index' lines still belong to the previous
   revision of this patch.

 inc-c/reductor/internals/utility/text.h | 102 +++++++++
 src-c/utility/utf8.c                    | 236 ++++++++++++++++++++
 2 files changed, 338 insertions(+)
 create mode 100644 inc-c/reductor/internals/utility/text.h
 create mode 100644 src-c/utility/utf8.c

diff --git a/inc-c/reductor/internals/utility/text.h b/inc-c/reductor/internals/utility/text.h
new file mode 100644
index 0000000..7bdc810
--- /dev/null
+++ b/inc-c/reductor/internals/utility/text.h
@@ -0,0 +1,102 @@
+#ifndef REDUCTOR_TEXT_H
+#define REDUCTOR_TEXT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/// @brief A single Unicode code point (rune).
+typedef uint32_t rune_t;
+
+/// @brief Gets the length of the UTF-8 rune at a given offset.
+/// @param source Source to get the bytes from.
+/// @param len_source Length of 'source' in bytes.
+/// @param offset Offset to read from; doesn't have to be the start of the UTF-8 character.
+/// @return Length of the rune in bytes, or zero if 'offset' is out of range or
+///         the bytes around it are not a complete, well-formed UTF-8 sequence.
+uint32_t tred_get_utf8_rune_length(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset);
+
+/// @brief Extracts a UTF-8 character at some offset.
+/// @warning The length isn't provided by this function; it has to be gotten using
+///          'tred_get_utf8_rune_length'.
+/// @param source Source string to get the bytes from.
+/// @param len_source Length of 'source' in bytes.
+/// @param offset Offset to read from; doesn't have to be the start of the UTF-8 character.
+/// @return Rune read from 'source' at 'offset', or zero on error.
+rune_t tred_extract_utf8(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset);
+
+/// @brief Gets an ASCII character out of a source string, performing boundary checks.
+/// @param source Source to read from.
+/// @param len_source Length of 'source' in bytes.
+/// @param offset Offset to read at.
+/// @return Byte read from 'source' at 'offset' (as a value from 0 to 255), or
+///         zero if 'offset' is out of range.
+rune_t tred_extract_ascii(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset);
+
+/// @brief Gets the total length of a UTF-8 sequence from its first byte.
+/// @param head First byte of the sequence.
+/// @return Length in bytes (1 to 4), or -1 if 'head' is a trailing byte or
+///         is not a valid head byte.
+int32_t tred_get_utf8_length_from_head(uint8_t head);
+
+/// @brief Finds the offset of the head byte of the rune covering an offset.
+/// @param source Source to get the bytes from.
+/// @param len_source Length of 'source' in bytes.
+/// @param offset Offset inside the rune; may point at the head or at a trailing byte.
+/// @return Offset of the nearest non-trailing byte at or before 'offset', or -1
+///         if 'offset' is out of range or no such byte lies within three bytes.
+int32_t tred_get_utf8_head_offset(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset);
+
+/// @brief Tests whether a rune is a blank (space or horizontal tab) rune.
+/// @param rune Rune to test for being blank.
+/// @return 'true' if 'rune' is a blank character, 'false' otherwise.
+bool tred_rune_is_blank(rune_t rune);
+
+/// @brief Tests a rune_t for being an uppercase letter.
+/// @param rune Rune to test for being an uppercase Latin letter.
+/// @return 'true' if 'rune' is an uppercase Latin letter, 'false' otherwise.
+bool tred_rune_is_uppercase(rune_t rune);
+
+/// @brief Tests a rune_t for being a lowercase letter.
+/// @param rune Rune to test for being a lowercase Latin letter.
+/// @return 'true' if 'rune' is a lowercase Latin letter, 'false' otherwise.
+bool tred_rune_is_lowercase(rune_t rune);
+
+/// @brief Tests a rune_t for being either lowercase or uppercase.
+/// @param rune Rune to test for being a Latin letter.
+/// @return 'true' if 'rune' is either lowercase or uppercase, 'false' otherwise.
+bool tred_rune_is_alphabetic(rune_t rune);
+
+/// @brief Tests whether a rune_t is a digit from 0 to 9.
+/// @param rune Rune to test for being a digit.
+/// @return 'true' if 'rune' is a digit from 0 to 9, 'false' otherwise.
+bool tred_rune_is_numeric(rune_t rune);
+
+/// @brief Tests whether a rune_t is either a digit from 0 to 9
+///        or a letter (either lowercase or uppercase).
+/// @param rune Rune to test for being alphanumeric.
+/// @return 'true' if 'rune' is alphanumeric, 'false' otherwise.
+bool tred_rune_is_alphanumeric(rune_t rune);
+
+/// @brief Tests whether a rune_t is a sign of one of the four ASCII sign ranges.
+/// @param rune Rune to test for being a sign.
+/// @return 'true' if 'rune' is from any of the four ASCII sign ranges, 'false' otherwise.
+bool tred_rune_is_sign(rune_t rune);
+
+/// @brief Tests whether a rune_t is an ASCII control character.
+/// @param rune Rune to test for being a control character.
+/// @return 'true' if 'rune' is below 32 or equal to 127, 'false' otherwise.
+bool tred_rune_is_control(rune_t rune);
+
+#endif // REDUCTOR_TEXT_H
diff --git a/src-c/utility/utf8.c b/src-c/utility/utf8.c
new file mode 100644
index 0000000..de5ecee
--- /dev/null
+++ b/src-c/utility/utf8.c
@@ -0,0 +1,236 @@
+#include <reductor/internals/utility/text.h>
+
+/// Tells whether a byte has the form 10xxxxxx, i.e. is a UTF-8 trailing byte.
+/// The argument is narrowed to uint8_t first so that a negative (signed)
+/// 'char' isn't sign-extended by the shift.
+#define UTF8_IS_BYTE_TRAILING(byte) ((((uint8_t) (byte)) >> 6) == 2)
+
+int32_t tred_get_utf8_length_from_head(uint8_t head)
+{
+    // 0xxxxxxx: plain ASCII, a single byte.
+    if((head & 0x80) == 0x00)
+    {
+        return 1;
+    }
+    // 110xxxxx: head of a two-byte sequence.
+    if((head & 0xe0) == 0xc0)
+    {
+        return 2;
+    }
+    // 1110xxxx: head of a three-byte sequence.
+    if((head & 0xf0) == 0xe0)
+    {
+        return 3;
+    }
+    // 11110xxx: head of a four-byte sequence.
+    if((head & 0xf8) == 0xf0)
+    {
+        return 4;
+    }
+    // 10xxxxxx is a trailing byte and 11111xxx is never valid;
+    // neither of them can start a rune.
+    return -1;
+}
+
+int32_t tred_get_utf8_head_offset(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset
+) {
+    // Zero is a valid head offset, so an out-of-range request has to be
+    // reported as -1. Offsets that don't fit the return type are rejected too.
+    if((offset >= len_source) || (offset > (uint32_t) INT32_MAX))
+    {
+        return -1;
+    }
+    uint32_t lookback_offset = offset;
+    while(UTF8_IS_BYTE_TRAILING(source[lookback_offset]))
+    {
+        if(lookback_offset == 0)
+        {
+            return -1;
+        }
+        // A rune has at most three trailing bytes; having to look back
+        // any further means that the input is malformed.
+        if((offset - lookback_offset) >= 3)
+        {
+            return -1;
+        }
+        --lookback_offset;
+    }
+    return (int32_t) lookback_offset;
+}
+
+/// Locates the complete, well-formed UTF-8 sequence which covers 'offset'.
+/// On success, the offset of the head byte and the length of the sequence
+/// are stored and 'true' is returned. 'false' is returned if 'offset' is
+/// out of range, if no valid head is found, if the sequence ends before
+/// 'offset', if it is cut off by the end of 'source' or if one of its
+/// trailing positions doesn't hold a trailing byte.
+/// Overlong encodings and surrogate code points are NOT rejected.
+static bool tred_locate_utf8_rune(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset,
+    uint32_t *head_offset,
+    uint32_t *rune_length
+) {
+    int32_t found_offset = tred_get_utf8_head_offset(source, len_source, offset);
+    if(found_offset < 0)
+    {
+        return false;
+    }
+    uint32_t start = (uint32_t) found_offset;
+    int32_t found_length = tred_get_utf8_length_from_head((uint8_t) source[start]);
+    if(found_length < 0)
+    {
+        return false;
+    }
+    uint32_t length = (uint32_t) found_length;
+    // The byte at 'offset' must belong to the sequence that was found.
+    if((offset - start) >= length)
+    {
+        return false;
+    }
+    // The whole sequence has to lie inside of 'source'. The subtraction
+    // can't wrap around because 'start' is less than 'len_source'.
+    if(length > (len_source - start))
+    {
+        return false;
+    }
+    for(uint32_t index = 1; index < length; ++index)
+    {
+        if(! UTF8_IS_BYTE_TRAILING(source[start + index]))
+        {
+            return false;
+        }
+    }
+    *head_offset = start;
+    *rune_length = length;
+    return true;
+}
+
+uint32_t tred_get_utf8_rune_length(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset
+) {
+    uint32_t head_offset = 0;
+    uint32_t rune_length = 0;
+    if(! tred_locate_utf8_rune(source, len_source, offset, &head_offset, &rune_length))
+    {
+        return 0;
+    }
+    return rune_length;
+}
+
+rune_t tred_extract_utf8(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset
+) {
+    // Payload bits of the head byte, indexed by the length of the sequence.
+    static const uint8_t head_payload_masks[5] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
+
+    uint32_t head_offset = 0;
+    uint32_t rune_length = 0;
+    if(! tred_locate_utf8_rune(source, len_source, offset, &head_offset, &rune_length))
+    {
+        return 0;
+    }
+    rune_t result = ((uint8_t) source[head_offset]) & head_payload_masks[rune_length];
+    // Every trailing byte contributes its lower six bits.
+    for(uint32_t index = 1; index < rune_length; ++index)
+    {
+        result <<= 6;
+        result |= ((uint8_t) source[head_offset + index]) & 0x3f; // 0b00111111
+    }
+    return result;
+}
+
+rune_t tred_extract_ascii(
+    const char *source,
+    uint32_t len_source,
+    uint32_t offset
+) {
+    if(offset >= len_source)
+    {
+        return 0;
+    }
+    // Go through uint8_t so that bytes above 0x7f aren't sign-extended
+    // on platforms where 'char' is signed.
+    return (uint8_t) source[offset];
+}
+
+bool tred_rune_is_blank(rune_t rune)
+{
+    return (rune == ' ') || (rune == '\t');
+}
+
+bool tred_rune_is_lowercase(rune_t rune)
+{
+    return (rune >= 'a') && (rune <= 'z');
+}
+
+bool tred_rune_is_uppercase(rune_t rune)
+{
+    return (rune >= 'A') && (rune <= 'Z');
+}
+
+bool tred_rune_is_alphabetic(rune_t rune)
+{
+    return tred_rune_is_lowercase(rune) || tred_rune_is_uppercase(rune);
+}
+
+bool tred_rune_is_numeric(rune_t rune)
+{
+    return (rune >= '0') && (rune <= '9');
+}
+
+bool tred_rune_is_alphanumeric(rune_t rune)
+{
+    return tred_rune_is_alphabetic(rune) || tred_rune_is_numeric(rune);
+}
+
+// The four functions below cover the printable ASCII characters which
+// are neither alphanumeric nor the space.
+// NOTE(review): they aren't declared in 'text.h'; consider making them
+// 'static' if no other file is meant to call them.
+
+bool tred_rune_is_block_1_sign(rune_t rune)
+{
+    // '!' to '/'
+    return (rune >= 0x21) && (rune <= 0x2f);
+}
+
+bool tred_rune_is_block_2_sign(rune_t rune)
+{
+    // ':' to '@'
+    return (rune >= 0x3a) && (rune <= 0x40);
+}
+
+bool tred_rune_is_block_3_sign(rune_t rune)
+{
+    // '[' to '`'
+    return (rune >= 0x5b) && (rune <= 0x60);
+}
+
+bool tred_rune_is_block_4_sign(rune_t rune)
+{
+    // '{' to '~'
+    return (rune >= 0x7b) && (rune <= 0x7e);
+}
+
+bool tred_rune_is_sign(rune_t rune)
+{
+    return tred_rune_is_block_1_sign(rune)
+        || tred_rune_is_block_2_sign(rune)
+        || tred_rune_is_block_3_sign(rune)
+        || tred_rune_is_block_4_sign(rune);
+}
+
+bool tred_rune_is_control(rune_t rune)
+{
+    // The C0 control characters and DEL.
+    return (rune < 0x20) || (rune == 0x7f);
+}