Added rune extraction and detection functions (not tested)

2023-12-01 16:55:40 +01:00 · 2023-12-01 16:55:40 +01:00 · ae6e4ffe33
parent 22462bf2e7
commit ae6e4ffe33
2 changed files with 329 additions and 0 deletions
--- a/code/exports/librr/runes.h
+++ b/code/exports/librr/runes.h
@ -0,0 +1,128 @@
 #ifndef RR_RUNES_H
 #define RR_RUNES_H
 #include <librr/types.h>
 /// @brief Extracts the UTF-8 rune which the byte at some offset in a string
 ///  belongs to. The function adds the rest length of that rune to an
 ///  integer to which a pointer was given.
 /// @attention 'increase' will be ADDED TO, it won't be set to a completely new value.
 /// @param string Null-terminated UTF-8 (or ASCII) string from which the
 ///  rune should be extracted. This will not be modified.
 /// @param offset Offset from the start of the string at which the rune will
 ///  be read. The implementation walks backwards from here to look for the
 ///  first byte of the rune.
 /// @param increase Pointer to the integer onto which the number of bytes
 ///  from 'offset' to the end of the rune will be added.
 /// @return The UTF-8 character which was extracted OR 0 if the function failed
 ///  (for example because the byte at 'offset' is the terminating zero byte).
 rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase);
 /// @brief Extracts a lowercase letter ('a' to 'z') at some offset in a UTF-8 string.
 ///  The function adds the rest length of the found lowercase letter to an
 ///  integer to which a pointer was given.
 /// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
 /// @attention Not the full length of the rune but rather the rest length
 ///  (the length starting from the reading offset and going to the end of
 ///  the rune) will be added onto 'advance'.
 /// @param string Null-terminated UTF-8 (or ASCII) string from which the
 ///  rune should be extracted. This will not be modified.
 /// @param offset Offset from the start of the string at which the rune will
 ///  be read. This doesn't have to point to the start of the rune; the start
 ///  will be found.
 /// @param advance Receives how many bytes 'offset' must be advanced to get to
 ///  the first byte of the next rune. This will NOT be changed if the rune is
 ///  not a lowercase letter.
 /// @return The rune which was extracted or ZERO if the found rune is not a lowercase letter.
 rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *advance);
 /// @brief Extracts an uppercase letter ('A' to 'Z') at some offset in a UTF-8 string.
 ///  The function adds the rest length of the found uppercase letter to an
 ///  integer to which a pointer was given.
 /// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
 /// @attention Not the full length of the rune but rather the rest length
 ///  (the length starting from the reading offset and going to the end of
 ///  the rune) will be added onto 'advance'.
 /// @param string Null-terminated UTF-8 (or ASCII) string from which the
 ///  rune should be extracted. This will not be modified.
 /// @param offset Offset from the start of the string at which the rune will
 ///  be read. This doesn't have to point to the start of the rune; the start
 ///  will be found.
 /// @param advance Receives how many bytes 'offset' must be advanced to get to
 ///  the first byte of the next rune. This will NOT be changed if the rune is
 ///  not an uppercase letter.
 /// @return The rune which was extracted or ZERO if the found rune is not an uppercase letter.
 rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *advance);
 /// @brief Extracts a letter (lowercase or uppercase) at some offset in a UTF-8 string.
 ///  The function adds the rest length of the found letter to an
 ///  integer to which a pointer was given.
 /// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
 /// @attention Not the full length of the rune but rather the rest length
 ///  (the length starting from the reading offset and going to the end of
 ///  the rune) will be added onto 'advance'.
 /// @param string Null-terminated UTF-8 (or ASCII) string from which the
 ///  rune should be extracted. This will not be modified.
 /// @param offset Offset from the start of the string at which the rune will
 ///  be read. This doesn't have to point to the start of the rune; the start
 ///  will be found.
 /// @param advance Receives how many bytes 'offset' must be advanced to get to
 ///  the first byte of the next rune. This will NOT be changed if the rune is
 ///  not a letter.
 /// @return The rune which was extracted or ZERO if the found rune is not a letter.
 rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *advance);
 /// @brief Extracts a digit ('0' to '9') at some offset in a UTF-8 string.
 ///  The function adds the rest length of the found digit to an
 ///  integer to which a pointer was given.
 /// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
 /// @attention Not the full length of the rune but rather the rest length
 ///  (the length starting from the reading offset and going to the end of
 ///  the rune) will be added onto 'advance'.
 /// @param string Null-terminated UTF-8 (or ASCII) string from which the
 ///  rune should be extracted. This will not be modified.
 /// @param offset Offset from the start of the string at which the rune will
 ///  be read. This doesn't have to point to the start of the rune; the start
 ///  will be found.
 /// @param advance Receives how many bytes 'offset' must be advanced to get to
 ///  the first byte of the next rune. This will NOT be changed if the rune is
 ///  not a digit.
 /// @return The rune which was extracted or ZERO if the found rune is not a digit.
 rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *advance);
 /// @brief Extracts an alphanumeric rune (a letter or a digit) at some offset
 ///  in a UTF-8 string. The function adds the rest length of the alphanumeric
 ///  rune found to an integer to which a pointer was given.
 /// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
 /// @attention Not the full length of the rune but rather the rest length
 ///  (the length starting from the reading offset and going to the end of
 ///  the rune) will be added onto 'advance'.
 /// @param string Null-terminated UTF-8 (or ASCII) string from which the
 ///  rune should be extracted. This will not be modified.
 /// @param offset Offset from the start of the string at which the rune will
 ///  be read. This doesn't have to point to the start of the rune; the start
 ///  will be found.
 /// @param advance Receives how many bytes 'offset' must be advanced to get to
 ///  the first byte of the next rune. This will NOT be changed if the rune is
 ///  not an alphanumeric rune.
 /// @return The rune which was extracted or ZERO if the found rune is not alphanumeric.
 rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *advance);
 /// @brief Extracts a special sign (such as slash, at, the hash sign, etc.)
 ///  at some offset in a UTF-8 string. The function adds the rest length
 ///  of the found sign to an integer to which a pointer was given.
 ///  The accepted signs are the runes in the ranges 0x20-0x2f, 0x3a-0x40,
 ///  0x5b-0x60 and 0x7b-0x7e.
 /// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
 /// @attention Not the full length of the rune but rather the rest length
 ///  (the length starting from the reading offset and going to the end of
 ///  the rune) will be added onto 'advance'.
 /// @param string Null-terminated UTF-8 (or ASCII) string from which the
 ///  rune should be extracted. This will not be modified.
 /// @param offset Offset from the start of the string at which the rune will
 ///  be read. This doesn't have to point to the start of the rune; the start
 ///  will be found.
 /// @param advance Receives how many bytes 'offset' must be advanced to get to
 ///  the first byte of the next rune. This will NOT be changed if the rune is
 ///  not a sign.
 /// @return The rune which was extracted or ZERO if the found rune is not a sign.
 rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *advance);
 /// @brief Checks if there is a newline delimiter (a carriage return or a
 ///  line feed) at a specific offset in a string and writes the offset right
 ///  after it to a given pointer's destination.
 /// @attention The next rune's offset will replace the previous content of the
 ///  given pointer's destination; it will not be added to it.
 /// @param string The string in question.
 /// @param offset The offset in the string at which the test should be done.
 /// @param next The pointer to the integer to which the offset will be written.
 ///  In contrast to the other rune functions, this does NOT add, it SETS the
 ///  value: an offset counted from the start of 'string', just like 'offset'.
 ///  This will NOT be set if no newline was found.
 /// @return TRUE if there is a newline at that point and FALSE if not.
 bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next);
 #endif // RR_RUNES_H
--- a/code/src-c/runes.c
+++ b/code/src-c/runes.c
@ -0,0 +1,201 @@
 #include <librr/runes.h>
 /// @brief Finds how far the byte at 'offset' is away from the first byte
 ///  (head byte) of the UTF-8 rune it belongs to.
 /// @param string String to inspect. This will not be modified.
 /// @param offset Offset of the byte from which the search walks backwards.
 /// @return The number of bytes between the rune's head byte and 'offset'
 ///  (0 if 'offset' already is a head byte or an ASCII byte), -1 if more
 ///  than three continuation bytes precede 'offset' without a head byte,
 ///  or -2 if the start of the string was reached on a continuation byte.
 isz_t rr_distance_to_last_utf8_rune_start(const char *string, usz_t offset)
 {
    usz_t                       bytes_walked                    = 0;
    // '<=' so that the byte at index 0 gets inspected too; otherwise every
    // lookup at offset 0 would be reported as a failure.
    while(bytes_walked <= offset)
    {
        // A head byte is followed by at most three continuation bytes.
        if(bytes_walked > 3)
            return -1;
        // Continuation bytes look like 0b10xxxxxx. The bits are masked on an
        // unsigned byte because shifting a (possibly signed) 'char' to the
        // right would smear the sign bit and never compare equal to 0b10.
        if(((unsigned char) string[offset - bytes_walked] & 0b11000000) != 0b10000000)
        {
            return bytes_walked;
        }
        ++bytes_walked;
    }
    return -2;
 }
 /// @brief Determines the total length in bytes of the UTF-8 rune whose head
 ///  byte is located at 'offset'.
 /// @param string String to inspect. This will not be modified.
 /// @param offset Offset of the rune's head byte.
 /// @return 1 for an ASCII byte, 2 to 4 for a multi-byte head byte, -1 if the
 ///  head byte announces more than four bytes, or -2 if the byte is a
 ///  continuation byte (0b10xxxxxx) and therefore no head byte at all.
 isz_t rr_identify_utf8_rune_length(const char *string, usz_t offset)
 {
    // Unsigned on purpose: left-shifting a negative 'char' is undefined.
    unsigned char               head_byte                       = (unsigned char) string[offset];
    // If this is ASCII
    if((head_byte & (1 << 7)) == 0)
        return  1;
    // UTF-8 - only
    // The number of leading one-bits is the rune's length. The topmost bit
    // is already known to be set, hence the count starts at one.
    usz_t                       length                          = 1;
    while(length < 5)
    {
        head_byte                     <<= 1;
        if((head_byte & (1 << 7)) == 0)
            break;
        ++length;
    }
    if(length > 4)
        return -1;
    if(length < 2)
        return -2;
    return length;
 }
 /// @brief Strips the length marker bits from the head byte of a UTF-8 rune,
 ///  leaving only the payload bits the head byte contributes to the rune.
 /// @param byte The head byte of the rune.
 /// @param num_bytes Total length of the rune in bytes (1 to 4).
 /// @return The payload bits of the head byte, or 0 if 'num_bytes' is not
 ///  within 1 to 4.
 rune_t rr_postprocess_utf8_head_byte(char byte, usz_t num_bytes)
 {
    switch(num_bytes)
    {
        // Head byte layouts: 0xxxxxxx, 110xxxxx, 1110xxxx and 11110xxx.
        // The masks keep the 'x' bits and drop the marker bits.
        case 1: return byte & 0b01111111;
        case 2: return byte & 0b00011111;
        case 3: return byte & 0b00001111;
        case 4: return byte & 0b00000111;
        default: break;
    }
    return 0;
 }
 /// @brief Combines the bytes of one UTF-8 rune into the rune's value.
 /// @param bytes Pointer to the head byte of the rune. This will not be modified.
 /// @param num_bytes Total length of the rune in bytes, as announced by the head byte.
 /// @return The assembled rune, or 0 if one of the bytes following the head
 ///  byte is not a continuation byte (malformed or truncated sequence).
 rune_t rr_postprocess_utf8_bytes(const char *bytes, usz_t num_bytes)
 {
    rune_t                      result                          = rr_postprocess_utf8_head_byte(bytes[0], num_bytes);
    usz_t                       byte_index                      = 1;
    while(byte_index < num_bytes)
    {
        // Every following byte must look like 0b10xxxxxx. Bailing out at the
        // first byte that doesn't (which includes the terminating zero byte)
        // keeps a truncated rune from making this loop read past the end
        // of the string.
        if(((unsigned char) bytes[byte_index] & 0b11000000) != 0b10000000)
            return 0;
        // Each continuation byte contributes its lower six bits.
        result                    <<= 6;
        result                     |= bytes[byte_index] & 0b00111111;
        ++byte_index;
    }
    return result;
 }
 /// @brief Extracts the UTF-8 rune which the byte at 'offset' belongs to and
 ///  adds the rest length of that rune (from 'offset' to the rune's end)
 ///  onto '*increase'.
 /// @param string Null-terminated UTF-8 (or ASCII) string. This will not be modified.
 /// @param offset Offset of a byte of the wanted rune; it doesn't have to be
 ///  the rune's first byte.
 /// @param increase Integer that is ADDED TO, not overwritten. It stays
 ///  untouched if the start or the length of the rune can't be determined.
 /// @return The extracted rune, or ZERO if the byte at 'offset' is the
 ///  terminating zero byte or if the rune could not be located.
 rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase)
 {
    if(string[offset] == 0x00)
        return ZERO;
    // Both helpers report errors as negative numbers; the results must be
    // kept in a signed type or the checks below could never trigger.
    isz_t                       offset_into_rune                = rr_distance_to_last_utf8_rune_start(string, offset);
    if(offset_into_rune < 0)
        return ZERO;
    offset                         -= (usz_t) offset_into_rune;
    isz_t                       rune_length                     = rr_identify_utf8_rune_length(string, offset);
    if(rune_length < 0)
        return ZERO;
    // More continuation bytes were walked over than the head byte announces:
    // the byte at the original offset isn't part of this rune.
    if(rune_length <= offset_into_rune)
        return ZERO;
    *increase                      += (usz_t) (rune_length - offset_into_rune);
    return rr_postprocess_utf8_bytes(&string[offset], (usz_t) rune_length);
 }
 /// @brief Reads the rune at 'offset' and accepts it only if it lies within 'a' to 'z'.
 /// @param string String to read from. This will not be modified.
 /// @param offset Offset at which the rune is read.
 /// @param increase Takes over the advanced value only if the rune is accepted.
 /// @return The accepted rune or ZERO.
 rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *increase)
 {
    // Decode against a scratch copy so '*increase' stays as it is on rejection.
    usz_t advanced = *increase;
    rune_t rune = rr_extract_utf8(string, offset, &advanced);

    if((rune < 'a') || (rune > 'z'))
    {
        return ZERO;
    }

    *increase = advanced;
    return rune;
 }
 /// @brief Reads the rune at 'offset' and accepts it only if it lies within 'A' to 'Z'.
 /// @param string String to read from. This will not be modified.
 /// @param offset Offset at which the rune is read.
 /// @param increase Takes over the advanced value only if the rune is accepted.
 /// @return The accepted rune or ZERO.
 rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *increase)
 {
    // The scratch copy is only committed once the rune has been accepted.
    usz_t scratch = *increase;
    rune_t candidate = rr_extract_utf8(string, offset, &scratch);

    if((candidate >= 'A') && (candidate <= 'Z'))
    {
        *increase = scratch;
        return candidate;
    }
    return ZERO;
 }
 /// @brief Tries to read a lowercase letter at 'offset' and, if there is
 ///  none, an uppercase letter.
 /// @param string String to read from. This will not be modified.
 /// @param offset Offset at which the rune is read.
 /// @param increase Handed on to the lowercase and uppercase extraction.
 /// @return The letter found or ZERO if neither attempt succeeded.
 rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *increase)
 {
    rune_t lower = rr_extract_lower(string, offset, increase);
    if(lower != ZERO)
    {
        return lower;
    }
    // The uppercase attempt yields ZERO by itself if it fails as well.
    return rr_extract_upper(string, offset, increase);
 }
 /// @brief Reads the rune at 'offset' and accepts it only if it lies within '0' to '9'.
 /// @param string String to read from. This will not be modified.
 /// @param offset Offset at which the rune is read.
 /// @param increase Takes over the advanced value only if the rune is accepted.
 /// @return The accepted rune or ZERO.
 rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *increase)
 {
    // Work on a copy; it replaces '*increase' only for an accepted digit.
    usz_t pending = *increase;
    rune_t digit = rr_extract_utf8(string, offset, &pending);

    if(digit < '0' || digit > '9')
        return ZERO;

    *increase = pending;
    return digit;
 }
 /// @brief Tries to read a lowercase letter, then an uppercase letter and
 ///  finally a digit at 'offset'; the first successful attempt wins.
 /// @param string String to read from. This will not be modified.
 /// @param offset Offset at which the rune is read.
 /// @param increase Handed on to the individual extraction functions.
 /// @return The rune found or ZERO if none of the attempts succeeded.
 rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *increase)
 {
    rune_t found = rr_extract_lower(string, offset, increase);
    if(found == ZERO)
    {
        found = rr_extract_upper(string, offset, increase);
    }
    if(found == ZERO)
    {
        found = rr_extract_digit(string, offset, increase);
    }
    return found;
 }
 /// @brief Tells whether a rune lies within the range 0x20 to 0x2f (both inclusive).
 /// @param rune The rune to test.
 /// @return TRUE if the rune is inside the range, FALSE otherwise.
 bool_t rr_is_rune_of_sign_block_1(rune_t rune)
 {
    if((rune >= 0x20) && (rune <= 0x2f))
    {
        return TRUE;
    }
    return FALSE;
 }
 /// @brief Tells whether a rune lies within the range 0x3a to 0x40 (both inclusive).
 /// @param rune The rune to test.
 /// @return TRUE if the rune is inside the range, FALSE otherwise.
 bool_t rr_is_rune_of_sign_block_2(rune_t rune)
 {
    if((rune >= 0x3a) && (rune <= 0x40))
    {
        return TRUE;
    }
    return FALSE;
 }
 /// @brief Tells whether a rune lies within the range 0x5b to 0x60 (both inclusive).
 /// @param rune The rune to test.
 /// @return TRUE if the rune is inside the range, FALSE otherwise.
 bool_t rr_is_rune_of_sign_block_3(rune_t rune)
 {
    if((rune >= 0x5b) && (rune <= 0x60))
    {
        return TRUE;
    }
    return FALSE;
 }
 /// @brief Tells whether a rune lies within the range 0x7b to 0x7e (both inclusive).
 /// @param rune The rune to test.
 /// @return TRUE if the rune is inside the range, FALSE otherwise.
 bool_t rr_is_rune_of_sign_block_4(rune_t rune)
 {
    if((rune >= 0x7b) && (rune <= 0x7e))
    {
        return TRUE;
    }
    return FALSE;
 }
 /// @brief Reads the rune at 'offset' and accepts it only if it belongs to
 ///  one of the four sign blocks (0x20-0x2f, 0x3a-0x40, 0x5b-0x60, 0x7b-0x7e).
 /// @param string String to read from. This will not be modified.
 /// @param offset Offset at which the rune is read.
 /// @param increase Takes over the advanced value only if the rune is a sign;
 ///  it is left untouched otherwise.
 /// @return The accepted sign or ZERO if the rune is not a sign.
 rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *increase)
 {
    // Decode against a copy so that '*increase' is only changed on success.
    usz_t                       increase_backup                 = *increase;
    rune_t                      subject                         = rr_extract_utf8(string, offset, &increase_backup);
    if(!rr_is_rune_of_sign_block_1(subject)
    && !rr_is_rune_of_sign_block_2(subject)
    && !rr_is_rune_of_sign_block_3(subject)
    && !rr_is_rune_of_sign_block_4(subject))
        return ZERO;
    // Commit the advance, just like the other extraction functions do;
    // without this the caller could never step past an accepted sign.
    (*increase)                     = increase_backup;
    return subject;
 }
 /// @brief Checks whether a newline delimiter (LF, CR or CR LF) is located
 ///  at 'offset' and reports the offset of the byte right after it.
 /// @param string String to read from. This will not be modified.
 /// @param offset Offset at which the newline is expected.
 /// @param next Is SET (not added to) to the offset right after the newline
 ///  delimiter. It is left untouched if there is no newline at 'offset'.
 /// @return TRUE if a newline delimiter was found, FALSE otherwise.
 bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next)
 {
    // 'offset' is a local copy and doubles as the accumulator: after the
    // call it has been advanced past the rune which was read.
    rune_t                      subject                         = rr_extract_utf8(string, offset, &offset);
    if(subject == '\r')
    {
        usz_t                       offset_after_cr                 = offset;
        // A line feed directly after the carriage return belongs to the same
        // delimiter (CR LF); skip it as well. Otherwise stop right after CR.
        if(rr_extract_utf8(string, offset, &offset) == '\n')
            (*next)                         = offset;
        else
            (*next)                         = offset_after_cr;
        return TRUE;
    }
    if(subject == '\n')
    {
        (*next)                         = offset;
        return TRUE;
    }
    return FALSE;
 }