Added rune extraction and detection functions (not tested)

This commit is contained in:
Eric-Paul Ickhorn 2023-12-01 16:55:40 +01:00
parent 22462bf2e7
commit ae6e4ffe33
Signed by: epickh
GPG Key ID: F5EBBE013924D95F
2 changed files with 329 additions and 0 deletions

128
code/exports/librr/runes.h Normal file
View File

@ -0,0 +1,128 @@
#ifndef RR_RUNES_H
#define RR_RUNES_H
#include <librr/types.h>
/// @brief Extracts the UTF-8 rune which covers some byte offset in a string.
/// The function adds the rest length of the rune (the number of bytes from
/// 'offset' to the end of the rune) to an integer to which a pointer was given.
/// @attention 'increase' will be ADDED TO, it won't be set to a completely new value.
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
/// rune should be extracted. This will not be modified.
/// @param offset Offset from the start of the string at which the rune will
/// be read. This doesn't have to point to the start of the rune; the start
/// will be searched for by walking backwards.
/// @param increase Integer onto which the rest length of the rune is added.
/// This is left untouched if the function fails.
/// @return The UTF-8 character which was extracted or ZERO if the function failed
/// (this includes 'offset' pointing at the null terminator).
rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase);
/// @brief Extracts a lowercase letter at some offset in an UTF-8 - string.
/// The function adds the rest length of the found lowercase letter to an
/// integer to which a pointer was given.
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
/// @attention Not the full length of the rune but rather the rest length,
/// the length starting from the reading offset going to the end of the rune
/// will be added onto 'advance'.
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
/// rune should be extracted. This will not be modified.
/// @param offset Offset from the start of the string at which the rune will
/// be read. This doesn't have to point to the start of the rune; the start
/// will be found.
/// @param advance How many bytes 'offset' must be advanced to get to the first
/// byte of the next rune. This will NOT be set if the rune is not a lowercase letter.
/// @return The rune which was extracted or ZERO if the found rune is not a lowercase letter.
rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *advance);
/// @brief Extracts an uppercase letter at some offset in an UTF-8 - string.
/// The function adds the rest length of the found uppercase letter to an
/// integer to which a pointer was given.
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
/// @attention Not the full length of the rune but rather the rest length,
/// the length starting from the reading offset going to the end of the rune
/// will be added onto 'advance'.
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
/// rune should be extracted. This will not be modified.
/// @param offset Offset from the start of the string at which the rune will
/// be read. This doesn't have to point to the start of the rune; the start
/// will be found.
/// @param advance How many bytes 'offset' must be advanced to get to the first
/// byte of the next rune. This will NOT be set if the rune is not an uppercase letter.
/// @return The rune which was extracted or ZERO if the found rune is not an uppercase letter.
rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *advance);
/// @brief Extracts a letter at some offset in an UTF-8 - string.
/// The function adds the rest length of the found letter to an
/// integer to which a pointer was given.
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
/// @attention Not the full length of the rune but rather the rest length,
/// the length starting from the reading offset going to the end of the rune
/// will be added onto 'advance'.
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
/// rune should be extracted. This will not be modified.
/// @param offset Offset from the start of the string at which the rune will
/// be read. This doesn't have to point to the start of the rune; the start
/// will be found.
/// @param advance How many bytes 'offset' must be advanced to get to the first
/// byte of the next rune. This will NOT be set if the rune is not a letter.
/// @return The rune which was extracted or ZERO if the found rune is not a letter.
rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *advance);
/// @brief Extracts a digit at some offset in an UTF-8 - string.
/// The function adds the rest length of the found digit to an
/// integer to which a pointer was given.
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
/// @attention Not the full length of the rune but rather the rest length,
/// the length starting from the reading offset going to the end of the rune
/// will be added onto 'advance'.
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
/// rune should be extracted. This will not be modified.
/// @param offset Offset from the start of the string at which the rune will
/// be read. This doesn't have to point to the start of the rune; the start
/// will be found.
/// @param advance How many bytes 'offset' must be advanced to get to the first
/// byte of the next rune. This will NOT be set if the rune is not a digit.
/// @return The rune which was extracted or ZERO if the found rune is not a digit.
rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *advance);
/// @brief Extracts an alphanumeric rune at some offset in an UTF-8 - string.
/// The function adds the rest length of the alphanumeric rune found to an
/// integer to which a pointer was given.
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
/// @attention Not the full length of the rune but rather the rest length,
/// the length starting from the reading offset going to the end of the rune
/// will be added onto 'advance'.
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
/// rune should be extracted. This will not be modified.
/// @param offset Offset from the start of the string at which the rune will
/// be read. This doesn't have to point to the start of the rune; the start
/// will be found.
/// @param advance How many bytes 'offset' must be advanced to get to the first
/// byte of the next rune. This will NOT be set if the rune is not an alphanumeric rune.
/// @return The rune which was extracted or ZERO if the found rune is not alphanumeric.
rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *advance);
/// @brief Extracts a special sign (such as slash, at, the hash sign, etc.)
/// at some offset in an UTF-8 - string. The function adds the rest length
/// of the found sign to an integer to which a pointer was given.
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
/// @attention Not the full length of the rune but rather the rest length,
/// the length starting from the reading offset going to the end of the rune
/// will be added onto 'advance'.
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
/// rune should be extracted. This will not be modified.
/// @param offset Offset from the start of the string at which the rune will
/// be read. This doesn't have to point to the start of the rune; the start
/// will be found.
/// @param advance How many bytes 'offset' must be advanced to get to the first
/// byte of the next rune. This will NOT be set if the rune is not a sign.
/// @return The rune which was extracted or ZERO if the found rune is not a sign.
rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *advance);
/// @brief Checks if there is a newline delimiter at a specific offset in a string
/// and writes the offset right after it to a given pointer's destination.
/// @attention The next rune's offset will replace the previous content of the given pointer,
/// the offset will not be added to.
/// @param string The string in question
/// @param offset The offset in the string at which the test should be done.
/// @param next The pointer to the integer to which the offset will be written.
/// in contrast to the other rune functions, this does NOT add, it SETS the value
/// relative to 'offset'. This will NOT be set if no newline was found.
/// @return TRUE if there is a newline at that point and FALSE if not.
bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next);
#endif // RR_RUNES_H

201
code/src-c/runes.c Normal file
View File

@ -0,0 +1,201 @@
#include <librr/runes.h>
/// Walks backwards from 'offset' until a byte is found which is not an
/// UTF-8 continuation byte (10xxxxxx), i.e. until the head byte of the rune
/// covering 'offset' (or an ASCII byte) is reached.
/// @param string String to search in; it is only read.
/// @param offset Byte offset at which the backwards search starts.
/// @return The number of bytes between the found head byte and 'offset'
/// (0 if 'offset' already is a head byte), -1 if more continuation bytes
/// were found than a rune can have, or -2 if the start of the string was
/// reached without finding a head byte.
isz_t rr_distance_to_last_utf8_rune_start(const char *string, usz_t offset)
{
    usz_t bytes_walked = 0;
    // '<=' so that the byte at index 0 is inspected as well; with '<',
    // a search starting at offset 0 would never look at any byte.
    while(bytes_walked <= offset)
    {
        // A rune has at most 4 bytes, thus its head byte is
        // at most 3 bytes in front of any of its other bytes.
        if(bytes_walked > 3)
            return -1;

        // Inspect the byte as unsigned; a plain 'char' may be signed, in
        // which case shifting/comparing bytes >= 0x80 gives wrong results.
        unsigned char byte = (unsigned char) string[offset - bytes_walked];
        if((byte & 0xC0) != 0x80)
            return (isz_t) bytes_walked;

        ++bytes_walked;
    }
    return -2;
}
/// Determines the total length in bytes of the UTF-8 rune whose head byte
/// is located at 'offset', by counting the leading 1 bits of that byte.
/// @param string String containing the rune; it is only read.
/// @param offset Byte offset of the rune's head byte.
/// @return 1 for an ASCII byte, 2 to 4 for a multi-byte head byte,
/// -1 if the byte announces more than 4 bytes, or -2 if the byte is a
/// continuation byte (10xxxxxx) and therefore no head byte at all.
isz_t rr_identify_utf8_rune_length(const char *string, usz_t offset)
{
    // Unsigned, so that the left shifts below are well-defined
    unsigned char head_byte = (unsigned char) string[offset];

    // If this is ASCII
    if((head_byte & 0x80) == 0)
        return 1;

    // UTF-8 - only: the number of leading 1 bits (including the very
    // first one) is the length of the rune in bytes.
    isz_t length = 0;
    while((length < 5) && ((head_byte & 0x80) != 0))
    {
        head_byte = (unsigned char) (head_byte << 1);
        ++length;
    }

    if(length > 4)
        return -1;
    if(length < 2)
        return -2;
    return length;
}
/// Strips the length prefix off the head byte of an UTF-8 rune, leaving
/// only the payload bits which belong to the codepoint.
/// @param byte Head byte of the rune.
/// @param num_bytes Total length of the rune in bytes (1 to 4).
/// @return The payload bits of the head byte or 0 if 'num_bytes' is
/// not a valid rune length.
rune_t rr_postprocess_utf8_head_byte(char byte, usz_t num_bytes)
{
    // Work on the raw bit pattern; a plain 'char' may be signed
    unsigned char bits = (unsigned char) byte;
    switch(num_bytes)
    {
        case 1: return bits & 0x7F; // 0xxxxxxx: 7 payload bits
        case 2: return bits & 0x1F; // 110xxxxx: 5 payload bits
        case 3: return bits & 0x0F; // 1110xxxx: 4 payload bits
        case 4: return bits & 0x07; // 11110xxx: 3 payload bits
        default: break;
    }
    return 0;
}
/// Assembles a rune from the bytes of one UTF-8 sequence.
/// The payload of the head byte forms the most significant bits; every
/// following byte contributes its lower 6 bits.
/// @param bytes Pointer to the head byte of the sequence.
/// @param num_bytes Number of bytes of which the sequence consists.
/// @return The assembled rune.
rune_t rr_postprocess_utf8_bytes(const char *bytes, usz_t num_bytes)
{
    rune_t rune = rr_postprocess_utf8_head_byte(bytes[0], num_bytes);
    for(usz_t index = 1; index < num_bytes; ++index)
    {
        // Make space for the next 6 payload bits and append them
        rune = (rune << 6) | (bytes[index] & 0x3F);
    }
    return rune;
}
/// Extracts the UTF-8 rune which covers the byte at 'offset'.
/// @param string Null-terminated UTF-8 (or ASCII) string; it is only read.
/// @param offset Byte offset of any byte of the wanted rune.
/// @param increase Integer onto which the number of bytes from 'offset' to
/// the end of the rune is ADDED. It is left untouched on failure.
/// @return The extracted rune or ZERO if 'offset' points at the null
/// terminator or if the bytes around 'offset' are no well-formed rune.
rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase)
{
    if(string[offset] == 0x00)
        return ZERO;

    // The helpers report errors as negative values, thus their results
    // must be kept in a signed type for the checks to have any effect.
    isz_t offset_into_rune = rr_distance_to_last_utf8_rune_start(string, offset);
    if(offset_into_rune < 0)
        return ZERO;
    usz_t rune_start = offset - (usz_t) offset_into_rune;

    isz_t rune_length = rr_identify_utf8_rune_length(string, rune_start);
    if(rune_length < 0)
        return ZERO;

    // A stray continuation byte behind a shorter rune: 'offset' is not
    // part of the rune that was found, and the subtraction below would
    // give zero or a negative value.
    if(offset_into_rune >= rune_length)
        return ZERO;

    // All bytes after the head byte must be continuation bytes (10xxxxxx).
    // This also stops a truncated rune from being read past the terminator.
    usz_t byte_index = 1;
    while(byte_index < (usz_t) rune_length)
    {
        unsigned char byte = (unsigned char) string[rune_start + byte_index];
        if((byte & 0xC0) != 0x80)
            return ZERO;
        ++byte_index;
    }

    *increase += (usz_t) (rune_length - offset_into_rune);
    return rr_postprocess_utf8_bytes(&string[rune_start], (usz_t) rune_length);
}
/// Extracts the rune at 'offset' if it is a lowercase letter ('a' to 'z').
/// On success, the rest length of the rune is added onto '*increase';
/// otherwise '*increase' keeps its value and ZERO is returned.
rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *increase)
{
    // Count on a copy, so that a rejected rune leaves the caller's value alone
    usz_t advanced = *increase;
    rune_t rune = rr_extract_utf8(string, offset, &advanced);

    if((rune < 'a') || (rune > 'z'))
        return ZERO;

    *increase = advanced;
    return rune;
}
/// Extracts the rune at 'offset' if it is an uppercase letter ('A' to 'Z').
/// On success, the rest length of the rune is added onto '*increase';
/// otherwise '*increase' keeps its value and ZERO is returned.
rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *increase)
{
    // Count on a copy, so that a rejected rune leaves the caller's value alone
    usz_t advanced = *increase;
    rune_t rune = rr_extract_utf8(string, offset, &advanced);

    if((rune < 'A') || (rune > 'Z'))
        return ZERO;

    *increase = advanced;
    return rune;
}
/// Extracts the rune at 'offset' if it is a letter, trying the lowercase
/// extractor first and the uppercase extractor second.
/// Returns the letter or ZERO if neither of them accepted the rune.
rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *increase)
{
    rune_t letter = rr_extract_lower(string, offset, increase);
    if(letter != ZERO)
        return letter;

    letter = rr_extract_upper(string, offset, increase);
    if(letter != ZERO)
        return letter;

    return ZERO;
}
/// Extracts the rune at 'offset' if it is a decimal digit ('0' to '9').
/// On success, the rest length of the rune is added onto '*increase';
/// otherwise '*increase' keeps its value and ZERO is returned.
rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *increase)
{
    // Count on a copy, so that a rejected rune leaves the caller's value alone
    usz_t advanced = *increase;
    rune_t rune = rr_extract_utf8(string, offset, &advanced);

    if((rune < '0') || (rune > '9'))
        return ZERO;

    *increase = advanced;
    return rune;
}
/// Extracts the rune at 'offset' if it is alphanumeric, trying the
/// lowercase, uppercase and digit extractors in that order.
/// Returns the rune or ZERO if none of them accepted it.
rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *increase)
{
    rune_t found = rr_extract_lower(string, offset, increase);
    if(found != ZERO)
        return found;

    found = rr_extract_upper(string, offset, increase);
    if(found != ZERO)
        return found;

    found = rr_extract_digit(string, offset, increase);
    if(found != ZERO)
        return found;

    return ZERO;
}
/// Checks whether a rune lies in the range 0x20 to 0x2f (inclusive),
/// which in ASCII spans from the space to the slash.
bool_t rr_is_rune_of_sign_block_1(rune_t rune)
{
    return ((rune < 0x20) || (rune > 0x2f)) ? FALSE : TRUE;
}
/// Checks whether a rune lies in the range 0x3a to 0x40 (inclusive),
/// which in ASCII spans from the colon to the at sign.
bool_t rr_is_rune_of_sign_block_2(rune_t rune)
{
    return ((rune < 0x3a) || (rune > 0x40)) ? FALSE : TRUE;
}
/// Checks whether a rune lies in the range 0x5b to 0x60 (inclusive),
/// which in ASCII spans from the opening square bracket to the backtick.
bool_t rr_is_rune_of_sign_block_3(rune_t rune)
{
    return ((rune < 0x5b) || (rune > 0x60)) ? FALSE : TRUE;
}
/// Checks whether a rune lies in the range 0x7b to 0x7e (inclusive),
/// which in ASCII spans from the opening curly brace to the tilde.
bool_t rr_is_rune_of_sign_block_4(rune_t rune)
{
    return ((rune < 0x7b) || (rune > 0x7e)) ? FALSE : TRUE;
}
/// Extracts the rune at 'offset' if it is a special sign, i.e. if it lies
/// in one of the four sign blocks checked by rr_is_rune_of_sign_block_1 to 4.
/// @param string Null-terminated UTF-8 (or ASCII) string; it is only read.
/// @param offset Byte offset at which the rune is read.
/// @param increase Integer onto which the rest length of the sign is ADDED.
/// It is left untouched if the rune is not a sign.
/// @return The sign which was extracted or ZERO if the rune is not a sign.
rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *increase)
{
    // Count on a copy, so that a rejected rune leaves the caller's value alone
    usz_t increase_backup = *increase;
    rune_t subject = rr_extract_utf8(string, offset, &increase_backup);

    if(!rr_is_rune_of_sign_block_1(subject)
        && !rr_is_rune_of_sign_block_2(subject)
        && !rr_is_rune_of_sign_block_3(subject)
        && !rr_is_rune_of_sign_block_4(subject))
    {
        return ZERO;
    }

    // Hand the advanced value to the caller, as the other extractors do
    (*increase) = increase_backup;
    return subject;
}
/// Checks if there is a newline delimiter ("\n", "\r\n" or a lone "\r")
/// at 'offset' and writes the offset right after it to '*next'.
/// @param string Null-terminated string in question; it is only read.
/// @param offset Byte offset at which the check is done.
/// @param next Receives the offset of the first byte after the newline
/// delimiter. This is SET, not added to, and is left untouched if there
/// is no newline at 'offset'.
/// @return TRUE if there is a newline at that point and FALSE if not.
bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next)
{
    usz_t after_first = offset;
    rune_t subject = rr_extract_utf8(string, offset, &after_first);

    if(subject == '\n')
    {
        (*next) = after_first;
        return TRUE;
    }
    if(subject != '\r')
        return FALSE;

    // A carriage return; a line feed directly behind it belongs to the
    // same delimiter and is skipped as well.
    usz_t after_second = after_first;
    if(rr_extract_utf8(string, after_first, &after_second) == '\n')
        (*next) = after_second;
    else
        (*next) = after_first;
    return TRUE;
}