Added rune extraction and detection functions (not tested)

2023-12-01 16:55:40 +01:00 · 2023-12-01 16:55:40 +01:00 · ae6e4ffe33
parent 22462bf2e7
commit ae6e4ffe33
2 changed files with 329 additions and 0 deletions
--- a/code/exports/librr/runes.h
+++ b/code/exports/librr/runes.h
@ -0,0 +1,128 @@
+
+#ifndef RR_RUNES_H
+#define RR_RUNES_H
+
+#include <librr/types.h>
+
+/// @brief Extracts the UTF-8 rune located at some offset in a string.
+///  The offset may point into the middle of a rune; the implementation
+///  walks back to the first byte of the rune before decoding it.
+/// @attention 'increase' will be ADDED TO, it won't be set to a completely new value.
+/// @param string Null-terminated UTF-8 (or ASCII) string to read from.
+/// @param offset Offset from the start of the string at which the rune will be read.
+/// @param increase Pointer to the integer onto which the rest length of the
+///  rune (from the reading offset to the end of the rune) will be added.
+///  This is left untouched if the function fails.
+/// @return The UTF-8 character which was extracted OR 0 if the function failed
+///  (reading the terminating null byte counts as a failure).
+rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase);
+
+/// @brief Extracts a lowercase letter at some offset in an UTF-8 - string.
+///  The function adds the rest length of the found lowercase letter to an
+///  integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+///  the length starting from the reading offset going to the end of the rune
+///  will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+///  rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+///  be read. This doesn't have to point to the start of the rune; the start
+///  will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+///  byte of the next rune. This will NOT be set if the rune is not a lowercase letter.
+/// @return The rune which was extracted or ZERO if the found rune is not a lowercase letter.
+rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Extracts an uppercase letter at some offset in an UTF-8 - string.
+///  The function adds the rest length of the found uppercase letter to an
+///  integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+///  the length starting from the reading offset going to the end of the rune
+///  will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+///  rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+///  be read. This doesn't have to point to the start of the rune; the start
+///  will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+///  byte of the next rune. This will NOT be set if the rune is not an uppercase letter.
+/// @return The rune which was extracted or ZERO if the found rune is not an uppercase letter.
+rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Extracts a letter at some offset in an UTF-8 - string.
+///  The function adds the rest length of the found letter to an
+///  integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+///  the length starting from the reading offset going to the end of the rune
+///  will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+///  rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+///  be read. This doesn't have to point to the start of the rune; the start
+///  will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+///  byte of the next rune. This will NOT be set if the rune is not a letter.
+/// @return The rune which was extracted or ZERO if the found rune is not a letter.
+rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Extracts a digit at some offset in an UTF-8 - string.
+///  The function adds the rest length of the found digit to an
+///  integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+///  the length starting from the reading offset going to the end of the rune
+///  will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+///  rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+///  be read. This doesn't have to point to the start of the rune; the start
+///  will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+///  byte of the next rune. This will NOT be set if the rune is not a digit.
+/// @return The rune which was extracted or ZERO if the found rune is not a digit.
+rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Extracts an alphanumeric rune at some offset in an UTF-8 - string.
+///  The function adds the rest length of the alphanumeric rune found to an
+///  integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+///  the length starting from the reading offset going to the end of the rune
+///  will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+///  rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+///  be read. This doesn't have to point to the start of the rune; the start
+///  will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+///  byte of the next rune. This will NOT be set if the rune is not an alphanumeric rune.
+/// @return The rune which was extracted or ZERO if the found rune is not alphanumeric.
+rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Extracts a special sign (such as slash, at, the hash sign, etc.)
+///  at some offset in an UTF-8 - string. The function adds the rest length
+///  of the found sign to an integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+///  the length starting from the reading offset going to the end of the rune
+///  will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+///  rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+///  be read. This doesn't have to point to the start of the rune; the start
+///  will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+///  byte of the next rune. This will NOT be set if the rune is not a sign.
+/// @return The rune which was extracted or ZERO if the found rune is not a sign.
+rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Checks if there is a newline delimiter at a specific offset in a string
+///  and writes the offset right after it to a given pointer's destination.
+/// @attention The next rune's offset will replace the previous content of the given pointer,
+///  the offset will not be added to.
+/// @param string The string in question
+/// @param offset The offset in the string at which the test should be done.
+/// @param next The pointer to the integer to which the offset will be written.
+///  in contrast to the other rune functions, this does NOT add, it SETS the value
+///  relative to 'offset'. This will NOT be set if no newline was found.
+/// @return TRUE if there is a newline at that point and FALSE if not.
+bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next);
+
+#endif // RR_RUNES_H
--- a/code/src-c/runes.c
+++ b/code/src-c/runes.c
@ -0,0 +1,201 @@
+#include <librr/runes.h>
+
+/// @brief Counts how many bytes lie between 'offset' and the first byte
+///  of the UTF-8 rune that 'offset' points into.
+/// @param string String to inspect. The byte at 'offset' and up to three
+///  bytes in front of it are read.
+/// @param offset Offset of the byte from which the search walks backwards.
+/// @return The distance (0 to 3) to the first byte of the rune, -1 if more
+///  than three continuation bytes were found in a row (no valid rune is
+///  that long) or -2 if the start of the string was reached without
+///  finding the first byte of a rune.
+isz_t rr_distance_to_last_utf8_rune_start(const char *string, usz_t offset)
+{
+    usz_t                       bytes_walked                    = 0;
+
+    // '<=' so that the byte at index 0 is inspected as well; with '<' an
+    // offset of zero would always be reported as a failure.
+    while(bytes_walked <= offset)
+    {
+        // A rune consists of one head byte and at most three continuation bytes
+        if(bytes_walked > 3)
+            return -1;
+
+        // Read the byte as unsigned: a plain 'char' may be signed, in which
+        // case bytes >= 0x80 are negative and bit tests on them misbehave.
+        unsigned char               byte                            = (unsigned char) string[offset - bytes_walked];
+
+        // Everything that is not a continuation byte (10xxxxxx) starts a rune
+        if((byte & 0xC0) != 0x80)
+            return (isz_t) bytes_walked;
+
+        ++bytes_walked;
+    }
+    return -2;
+}
+
+/// @brief Determines the total length in bytes of the UTF-8 rune whose
+///  head byte is located at 'offset'.
+/// @param string String to inspect; only the byte at 'offset' is read.
+/// @param offset Offset of the head byte of the rune.
+/// @return The length of the rune (1 to 4), -1 if the head byte announces
+///  more than four bytes or -2 if the byte is a continuation byte and
+///  thus no head byte at all.
+isz_t rr_identify_utf8_rune_length(const char *string, usz_t offset)
+{
+    // Read the byte as unsigned so that the bit tests below don't depend
+    // on whether a plain 'char' is signed on the target platform.
+    unsigned char               head_byte                       = (unsigned char) string[offset];
+
+    // The number of leading one-bits of the head byte is the rune's length;
+    // ASCII (no leading one-bit) is one byte long.
+    if((head_byte & 0x80) == 0x00) return  1;   // 0xxxxxxx
+    if((head_byte & 0xE0) == 0xC0) return  2;   // 110xxxxx
+    if((head_byte & 0xF0) == 0xE0) return  3;   // 1110xxxx
+    if((head_byte & 0xF8) == 0xF0) return  4;   // 11110xxx
+
+    // A single leading one-bit marks a continuation byte
+    if((head_byte & 0xC0) == 0x80) return -2;   // 10xxxxxx
+
+    // Five or more leading one-bits; longer than any valid rune
+    return -1;
+}
+
+/// @brief Strips the length prefix off the head byte of an UTF-8 rune,
+///  leaving only the payload bits of that byte.
+/// @param byte The first byte of the rune.
+/// @param num_bytes Total length of the rune in bytes (1 to 4).
+/// @return The payload bits of the head byte or 0 if 'num_bytes' is
+///  not a length between 1 and 4.
+rune_t rr_postprocess_utf8_head_byte(char byte, usz_t num_bytes)
+{
+    // Work on the unsigned value; a signed 'char' >= 0x80 would otherwise
+    // be sign-extended when it is widened.
+    unsigned char               raw                             = (unsigned char) byte;
+
+    // The masks keep the bits BEHIND the length prefix of the head byte
+    switch(num_bytes)
+    {
+        case 1: return raw;             // 0xxxxxxx
+        case 2: return raw & 0x1F;      // 110xxxxx
+        case 3: return raw & 0x0F;      // 1110xxxx
+        case 4: return raw & 0x07;      // 11110xxx
+        default: break;
+    }
+    return 0;
+}
+
+/// @brief Assembles a rune from the bytes of its UTF-8 encoding.
+///  The head byte is handed to rr_postprocess_utf8_head_byte, every
+///  following byte contributes its lower six bits.
+/// @param bytes Pointer to the first byte of the rune.
+/// @param num_bytes Number of bytes which are read from 'bytes'.
+/// @return The assembled rune.
+rune_t rr_postprocess_utf8_bytes(const char *bytes, usz_t num_bytes)
+{
+    rune_t                      rune                            = rr_postprocess_utf8_head_byte(bytes[0], num_bytes);
+
+    for(usz_t index = 1; index < num_bytes; ++index)
+    {
+        // Make room for the next six bits and append them
+        rune                        = (rune << 6) | (bytes[index] & 0x3F);
+    }
+    return rune;
+}
+
+/// @brief Extracts the UTF-8 rune which 'offset' points at or into.
+/// @attention 'increase' will be ADDED TO, it won't be set to a completely new value.
+/// @param string Null-terminated UTF-8 (or ASCII) string to read from.
+/// @param offset Offset at which the rune is read. It may point into the
+///  middle of a rune; the first byte of the rune will be searched for.
+/// @param increase Pointer to the integer onto which the number of bytes
+///  from 'offset' to the end of the rune is added. Untouched on failure.
+/// @return The extracted rune or ZERO if the terminating null byte was
+///  hit or the bytes at that position are no well-formed rune.
+rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase)
+{
+    if(string[offset] == 0x00)
+        return ZERO;
+
+    // The helpers report errors as negative values, so their results must
+    // be kept in a SIGNED integer for the checks below to work.
+    isz_t                       offset_into_rune                = rr_distance_to_last_utf8_rune_start(string, offset);
+    if(offset_into_rune < 0)
+        return ZERO;
+    offset                         -= (usz_t) offset_into_rune;
+
+    isz_t                       rune_length                     = rr_identify_utf8_rune_length(string, offset);
+    if(rune_length < 0)
+        return ZERO;
+
+    // The original offset lies behind the end of the rune that was found
+    // (stray continuation bytes); the advance would be zero or negative.
+    if(offset_into_rune >= rune_length)
+        return ZERO;
+
+    // Every byte after the head byte must be a continuation byte (10xxxxxx).
+    // The null terminator is none, so a truncated rune at the end of the
+    // string is rejected before anything behind the terminator is read.
+    for(isz_t byte_index = 1; byte_index < rune_length; ++byte_index)
+    {
+        unsigned char               byte                            = (unsigned char) string[offset + (usz_t) byte_index];
+        if((byte & 0xC0) != 0x80)
+            return ZERO;
+    }
+
+    *increase                      += (usz_t) (rune_length - offset_into_rune);
+    return rr_postprocess_utf8_bytes(&string[offset], (usz_t) rune_length);
+}
+
+/// @brief Extracts the rune at 'offset' using rr_extract_utf8 and accepts
+///  it only if it lies in the range 'a' to 'z'.
+/// @param string String which is handed on to rr_extract_utf8.
+/// @param offset Offset which is handed on to rr_extract_utf8.
+/// @param increase Receives the value advanced by rr_extract_utf8, but
+///  only if the rune was accepted.
+/// @return The rune if it is within 'a' to 'z', ZERO otherwise.
+rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *increase)
+{
+    // Advance a copy first; the caller's value is only touched on success
+    usz_t                       pending                         = *increase;
+    rune_t                      rune                            = rr_extract_utf8(string, offset, &pending);
+
+    if((rune >= 'a') && (rune <= 'z'))
+    {
+        *increase                       = pending;
+        return rune;
+    }
+    return ZERO;
+}
+
+/// @brief Extracts the rune at 'offset' using rr_extract_utf8 and accepts
+///  it only if it lies in the range 'A' to 'Z'.
+/// @param string String which is handed on to rr_extract_utf8.
+/// @param offset Offset which is handed on to rr_extract_utf8.
+/// @param increase Receives the value advanced by rr_extract_utf8, but
+///  only if the rune was accepted.
+/// @return The rune if it is within 'A' to 'Z', ZERO otherwise.
+rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *increase)
+{
+    // Advance a copy first; the caller's value is only touched on success
+    usz_t                       pending                         = *increase;
+    rune_t                      rune                            = rr_extract_utf8(string, offset, &pending);
+
+    if((rune >= 'A') && (rune <= 'Z'))
+    {
+        *increase                       = pending;
+        return rune;
+    }
+    return ZERO;
+}
+
+/// @brief Tries rr_extract_lower first and rr_extract_upper second,
+///  returning the first result which is not ZERO.
+/// @param string String which is handed on to both functions.
+/// @param offset Offset which is handed on to both functions.
+/// @param increase Pointer which is handed on to both functions.
+/// @return The rune found by one of the two functions or ZERO if both
+///  returned ZERO.
+rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *increase)
+{
+    rune_t                      found                           = rr_extract_lower(string, offset, increase);
+    if(found == ZERO)
+    {
+        // No lowercase letter; the result of the uppercase attempt is final
+        found                           = rr_extract_upper(string, offset, increase);
+    }
+    return found;
+}
+
+/// @brief Extracts the rune at 'offset' using rr_extract_utf8 and accepts
+///  it only if it lies in the range '0' to '9'.
+/// @param string String which is handed on to rr_extract_utf8.
+/// @param offset Offset which is handed on to rr_extract_utf8.
+/// @param increase Receives the value advanced by rr_extract_utf8, but
+///  only if the rune was accepted.
+/// @return The rune if it is within '0' to '9', ZERO otherwise.
+rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *increase)
+{
+    // Advance a copy first; the caller's value is only touched on success
+    usz_t                       pending                         = *increase;
+    rune_t                      rune                            = rr_extract_utf8(string, offset, &pending);
+
+    if((rune >= '0') && (rune <= '9'))
+    {
+        *increase                       = pending;
+        return rune;
+    }
+    return ZERO;
+}
+
+/// @brief Tries to extract a lowercase letter, then an uppercase letter
+///  and finally a digit, returning the first result which is not ZERO.
+/// @param string String which is handed on to the extraction functions.
+/// @param offset Offset which is handed on to the extraction functions.
+/// @param increase Pointer which is handed on to the extraction functions.
+/// @return The rune found by one of the attempts or ZERO if all of them
+///  returned ZERO.
+rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *increase)
+{
+    // rr_extract_letter performs the lowercase and the uppercase attempt
+    rune_t                      found                           = rr_extract_letter(string, offset, increase);
+    if(found == ZERO)
+    {
+        found                           = rr_extract_digit(string, offset, increase);
+    }
+    return found;
+}
+
+/// @brief Tells whether a rune lies within the range 0x20 to 0x2f (inclusive).
+/// @param rune The rune to test.
+/// @return TRUE if the rune is within that range, FALSE if not.
+bool_t rr_is_rune_of_sign_block_1(rune_t rune)
+{
+    if((rune >= 0x20) && (rune <= 0x2f))
+        return TRUE;
+    return FALSE;
+}
+
+/// @brief Tells whether a rune lies within the range 0x3a to 0x40 (inclusive).
+/// @param rune The rune to test.
+/// @return TRUE if the rune is within that range, FALSE if not.
+bool_t rr_is_rune_of_sign_block_2(rune_t rune)
+{
+    if((rune >= 0x3a) && (rune <= 0x40))
+        return TRUE;
+    return FALSE;
+}
+
+/// @brief Tells whether a rune lies within the range 0x5b to 0x60 (inclusive).
+/// @param rune The rune to test.
+/// @return TRUE if the rune is within that range, FALSE if not.
+bool_t rr_is_rune_of_sign_block_3(rune_t rune)
+{
+    if((rune >= 0x5b) && (rune <= 0x60))
+        return TRUE;
+    return FALSE;
+}
+
+/// @brief Tells whether a rune lies within the range 0x7b to 0x7e (inclusive).
+/// @param rune The rune to test.
+/// @return TRUE if the rune is within that range, FALSE if not.
+bool_t rr_is_rune_of_sign_block_4(rune_t rune)
+{
+    if((rune >= 0x7b) && (rune <= 0x7e))
+        return TRUE;
+    return FALSE;
+}
+
+/// @brief Extracts the rune at 'offset' using rr_extract_utf8 and accepts
+///  it only if it belongs to one of the four sign blocks
+///  (0x20-0x2f, 0x3a-0x40, 0x5b-0x60 or 0x7b-0x7e).
+/// @attention 'increase' will be ADDED TO, it won't be set to a completely new value.
+/// @param string String which is handed on to rr_extract_utf8.
+/// @param offset Offset which is handed on to rr_extract_utf8.
+/// @param increase Receives the value advanced by rr_extract_utf8, but
+///  only if the rune was accepted.
+/// @return The rune if it is a sign, ZERO otherwise.
+rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *increase)
+{
+    // Advance a copy first; the caller's value is only touched on success
+    usz_t                       increase_backup                 = *increase;
+    rune_t                      subject                         = rr_extract_utf8(string, offset, &increase_backup);
+
+    if(!rr_is_rune_of_sign_block_1(subject)
+    && !rr_is_rune_of_sign_block_2(subject)
+    && !rr_is_rune_of_sign_block_3(subject)
+    && !rr_is_rune_of_sign_block_4(subject))
+        return ZERO;
+
+    // Hand the advanced value to the caller, just like the other
+    // extraction functions do when they accept a rune.
+    (*increase)                     = increase_backup;
+    return subject;
+}
+
+/// @brief Checks whether a newline delimiter ("\n", "\r" or "\r\n") is
+///  located at 'offset' and reports the offset right behind it.
+/// @param string The string in question.
+/// @param offset The offset in the string at which the test is done.
+/// @param next Is SET (not added to) to the offset of the first byte after
+///  the newline delimiter. Untouched if no newline was found.
+/// @return TRUE if there is a newline at that point and FALSE if not.
+bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next)
+{
+    // rr_extract_utf8 ADDS the rune's length, so starting at 'offset'
+    // yields the offset right behind the rune.
+    usz_t                       after_first                     = offset;
+    rune_t                      subject                         = rr_extract_utf8(string, offset, &after_first);
+
+    if(subject == '\n')
+    {
+        (*next)                         = after_first;
+        return TRUE;
+    }
+    if(subject != '\r')
+        return FALSE;
+
+    // A carriage return may be followed by a line feed which then belongs
+    // to the same delimiter. 'next' has to be written in BOTH cases.
+    usz_t                       after_second                    = after_first;
+    if(rr_extract_utf8(string, after_first, &after_second) == '\n')
+        (*next)                         = after_second;
+    else
+        (*next)                         = after_first;
+    return TRUE;
+}