diff --git a/code/exports/librr/runes.h b/code/exports/librr/runes.h
new file mode 100644
index 0000000..7f7f91a
--- /dev/null
+++ b/code/exports/librr/runes.h
@@ -0,0 +1,136 @@
+
+#ifndef RR_RUNES_H
+#define RR_RUNES_H
+
+// NOTE(review): the #include target below is missing (apparently lost in extraction) - restore it.
+#include
+
+/// @brief Decodes the UTF-8 rune that contains the byte at some offset in a string.
+/// @attention 'increase' will be ADDED TO, it won't be set to a completely new value.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+/// rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+/// be read. This doesn't have to point to the start of the rune; the start
+/// will be found.
+/// @param increase How many bytes 'offset' must be advanced to get to the first
+/// byte of the next rune. This will NOT be set if the function failed.
+/// @return The UTF-8 character which was extracted OR 0 if the function failed
+/// (end of the string reached or malformed UTF-8).
+rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase);
+
+/// @brief Extracts a lowercase letter ('a' to 'z') at some offset in a UTF-8 string.
+/// The function adds the rest length of the found lowercase letter to an
+/// integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+/// the length starting from the reading offset going to the end of the rune
+/// will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+/// rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+/// be read. This doesn't have to point to the start of the rune; the start
+/// will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+/// byte of the next rune. This will NOT be set if the rune is not a lowercase letter.
+/// @return The rune which was extracted or ZERO if the found rune is not a lowercase letter.
+rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Extracts an uppercase letter ('A' to 'Z') at some offset in a UTF-8 string.
+/// The function adds the rest length of the found uppercase letter to an
+/// integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+/// the length starting from the reading offset going to the end of the rune
+/// will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+/// rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+/// be read. This doesn't have to point to the start of the rune; the start
+/// will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+/// byte of the next rune. This will NOT be set if the rune is not an uppercase letter.
+/// @return The rune which was extracted or ZERO if the found rune is not an uppercase letter.
+rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Extracts a letter ('a' to 'z' or 'A' to 'Z') at some offset in a UTF-8 string.
+/// The function adds the rest length of the found letter to an
+/// integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+/// the length starting from the reading offset going to the end of the rune
+/// will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+/// rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+/// be read. This doesn't have to point to the start of the rune; the start
+/// will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+/// byte of the next rune. This will NOT be set if the rune is not a letter.
+/// @return The rune which was extracted or ZERO if the found rune is not a letter.
+rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Extracts a digit ('0' to '9') at some offset in a UTF-8 string.
+/// The function adds the rest length of the found digit to an
+/// integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+/// the length starting from the reading offset going to the end of the rune
+/// will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+/// rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+/// be read. This doesn't have to point to the start of the rune; the start
+/// will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+/// byte of the next rune. This will NOT be set if the rune is not a digit.
+/// @return The rune which was extracted or ZERO if the found rune is not a digit.
+rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Extracts an alphanumeric rune (letter or digit) at some offset in a UTF-8 string.
+/// The function adds the rest length of the alphanumeric rune found to an
+/// integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+/// the length starting from the reading offset going to the end of the rune
+/// will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+/// rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+/// be read. This doesn't have to point to the start of the rune; the start
+/// will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+/// byte of the next rune. This will NOT be set if the rune is not an alphanumeric rune.
+/// @return The rune which was extracted or ZERO if the found rune is not alphanumeric.
+rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Extracts a special sign (such as slash, at, the hash sign, etc.; the space counts as well)
+/// at some offset in a UTF-8 string. The function adds the rest length
+/// of the found sign to an integer to which a pointer was given.
+/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
+/// @attention Not the full length of the rune but rather the rest length,
+/// the length starting from the reading offset going to the end of the rune
+/// will be added onto 'advance'.
+/// @param string Null-terminated UTF-8 (or ASCII) string from which the
+/// rune should be extracted. This will not be modified.
+/// @param offset Offset from the start of the string at which the rune will
+/// be read. This doesn't have to point to the start of the rune; the start
+/// will be found.
+/// @param advance How many bytes 'offset' must be advanced to get to the first
+/// byte of the next rune. This will NOT be set if the rune is not a sign.
+/// @return The rune which was extracted or ZERO if the found rune is not a sign.
+rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *advance);
+
+/// @brief Checks if there is a newline delimiter ("\n", "\r" or "\r\n") at a specific
+/// offset in a string and writes the offset right after it to a given pointer's destination.
+/// @attention The next rune's offset will replace the previous content of the given pointer,
+/// the offset will not be added to.
+/// @param string The string in question
+/// @param offset The offset in the string at which the test should be done.
+/// @param next The pointer to the integer to which the offset will be written.
+/// In contrast to the other rune functions, this does NOT add, it SETS the value
+/// to the offset (counted from the start of the string) right after the newline.
+/// This will NOT be set if no newline was found.
+/// @return TRUE if there is a newline at that point and FALSE if not.
+bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next);
+
+#endif // RR_RUNES_H
diff --git a/code/src-c/runes.c b/code/src-c/runes.c
new file mode 100644
index 0000000..7a7062d
--- /dev/null
+++ b/code/src-c/runes.c
@@ -0,0 +1,254 @@
+// NOTE(review): the #include target below is missing (apparently lost in extraction) - restore it.
+#include
+
+/// @brief Measures how far 'offset' lies behind the first byte of the rune containing it.
+/// @param string UTF-8 (or ASCII) string to inspect. This will not be modified.
+/// @param offset Offset of the byte from which the search walks backwards.
+/// @return The distance in bytes (0 to 3) to the first byte of the rune, -1 if no
+/// rune start was found within four bytes or -2 if the start of the string was passed.
+isz_t rr_distance_to_last_utf8_rune_start(const char *string, usz_t offset)
+{
+    usz_t bytes_walked = 0;
+    // '<=' so that the byte at index 0 of the string is examined as well.
+    while(bytes_walked <= offset)
+    {
+        // A rune is at most 4 bytes long, so its first byte is at most 3 bytes back.
+        if(bytes_walked > 3)
+            return -1;
+
+        // Inspect the byte as unsigned; plain 'char' may be a signed type.
+        unsigned char byte = (unsigned char) string[offset - bytes_walked];
+
+        // Continuation bytes have the form 10xxxxxx; any other byte starts a rune.
+        if((byte & 0xc0) != 0x80)
+            return (isz_t) bytes_walked;
+
+        ++bytes_walked;
+    }
+    return -2;
+}
+
+/// @brief Determines the length of a rune from its first byte.
+/// @param string UTF-8 (or ASCII) string to inspect. This will not be modified.
+/// @param offset Offset of the first byte of the rune.
+/// @return The length of the rune in bytes (1 to 4), -1 if the byte announces more
+/// than four bytes or -2 if the byte is a continuation byte.
+isz_t rr_identify_utf8_rune_length(const char *string, usz_t offset)
+{
+    unsigned char head_byte = (unsigned char) string[offset];
+
+    if((head_byte & 0x80) == 0x00) return 1;  // 0xxxxxxx (ASCII)
+    if((head_byte & 0xe0) == 0xc0) return 2;  // 110xxxxx
+    if((head_byte & 0xf0) == 0xe0) return 3;  // 1110xxxx
+    if((head_byte & 0xf8) == 0xf0) return 4;  // 11110xxx
+    if((head_byte & 0xc0) == 0x80) return -2; // 10xxxxxx (continuation byte)
+
+    // 11111xxx: no UTF-8 rune of at most four bytes starts like this.
+    return -1;
+}
+
+/// @brief Strips the length marker bits from the first byte of a rune.
+/// @param byte The first byte of the rune.
+/// @param num_bytes The total length of the rune in bytes (1 to 4).
+/// @return The payload bits of the byte or 0 if 'num_bytes' is out of range.
+rune_t rr_postprocess_utf8_head_byte(char byte, usz_t num_bytes)
+{
+    unsigned char bits = (unsigned char) byte;
+    switch(num_bytes)
+    {
+        case 1: return bits;
+        case 2: return bits & 0x1f; // 110xxxxx: 5 payload bits
+        case 3: return bits & 0x0f; // 1110xxxx: 4 payload bits
+        case 4: return bits & 0x07; // 11110xxx: 3 payload bits
+        default: break;
+    }
+    return 0;
+}
+
+/// @brief Assembles the code point of a rune from its bytes.
+/// @param bytes Pointer to the first byte of the rune. This will not be modified.
+/// @param num_bytes The total length of the rune in bytes (1 to 4).
+/// @return The assembled code point.
+rune_t rr_postprocess_utf8_bytes(const char *bytes, usz_t num_bytes)
+{
+    rune_t result = rr_postprocess_utf8_head_byte(bytes[0], num_bytes);
+
+    for(usz_t byte_index = 1; byte_index < num_bytes; ++byte_index)
+    {
+        // Every continuation byte (10xxxxxx) contributes its lower 6 bits.
+        result <<= 6;
+        result |= (unsigned char) bytes[byte_index] & 0x3f;
+    }
+    return result;
+}
+
+/// @brief Decodes the UTF-8 rune that contains the byte at 'offset'.
+/// @attention 'increase' is ADDED TO and only if a rune was decoded.
+/// @param string Null-terminated UTF-8 (or ASCII) string. This will not be modified.
+/// @param offset Offset of any byte of the rune that should be decoded.
+/// @param increase Grows by the number of bytes from 'offset' to the next rune.
+/// @return The decoded rune or ZERO at the end of the string or on malformed UTF-8.
+rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase)
+{
+    if(string[offset] == 0x00)
+        return ZERO;
+
+    // Signed on purpose: the helpers report errors as negative values, which
+    // would be lost if they were stored in an unsigned usz_t.
+    isz_t offset_into_rune = rr_distance_to_last_utf8_rune_start(string, offset);
+    if(offset_into_rune < 0)
+        return ZERO;
+    offset -= (usz_t) offset_into_rune;
+
+    isz_t rune_length = rr_identify_utf8_rune_length(string, offset);
+    if(rune_length < 0)
+        return ZERO;
+
+    // A continuation byte behind an already complete rune belongs to no rune.
+    if(offset_into_rune >= rune_length)
+        return ZERO;
+
+    // Reject truncated runes. Stopping at the first byte that is no continuation
+    // byte also keeps the decoder from reading past the terminating zero.
+    for(isz_t byte_index = 1; byte_index < rune_length; ++byte_index)
+    {
+        unsigned char byte = (unsigned char) string[offset + (usz_t) byte_index];
+        if((byte & 0xc0) != 0x80)
+            return ZERO;
+    }
+
+    *increase += (usz_t) (rune_length - offset_into_rune);
+    return rr_postprocess_utf8_bytes(&string[offset], (usz_t) rune_length);
+}
+
+/// @brief Extracts a rune and accepts it only if it lies within [first, last].
+/// @param string Null-terminated UTF-8 (or ASCII) string. This will not be modified.
+/// @param offset Offset of any byte of the rune that should be extracted.
+/// @param increase Grows by the rest length of the rune if the rune was accepted.
+/// @param first The lowest rune that is accepted.
+/// @param last The highest rune that is accepted.
+/// @return The extracted rune or ZERO if it was rejected.
+static rune_t rr_extract_in_range(const char *string, usz_t offset, usz_t *increase,
+                                  rune_t first, rune_t last)
+{
+    // Work on a copy so that 'increase' stays untouched if the rune is rejected.
+    usz_t increase_backup = *increase;
+    rune_t subject = rr_extract_utf8(string, offset, &increase_backup);
+
+    if(subject < first) return ZERO;
+    if(subject > last) return ZERO;
+
+    (*increase) = increase_backup;
+    return subject;
+}
+
+/// @brief Extracts a lowercase ASCII letter ('a' to 'z'); see runes.h for the contract.
+rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *increase)
+{
+    return rr_extract_in_range(string, offset, increase, 'a', 'z');
+}
+
+/// @brief Extracts an uppercase ASCII letter ('A' to 'Z'); see runes.h for the contract.
+rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *increase)
+{
+    return rr_extract_in_range(string, offset, increase, 'A', 'Z');
+}
+
+/// @brief Extracts a lowercase or uppercase ASCII letter; see runes.h for the contract.
+rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *increase)
+{
+    rune_t subject;
+    if((subject = rr_extract_lower(string, offset, increase)) != ZERO) return subject;
+    if((subject = rr_extract_upper(string, offset, increase)) != ZERO) return subject;
+    return ZERO;
+}
+
+/// @brief Extracts an ASCII digit ('0' to '9'); see runes.h for the contract.
+rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *increase)
+{
+    return rr_extract_in_range(string, offset, increase, '0', '9');
+}
+
+/// @brief Extracts an ASCII letter or digit; see runes.h for the contract.
+rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *increase)
+{
+    rune_t subject;
+    if((subject = rr_extract_lower(string, offset, increase)) != ZERO) return subject;
+    if((subject = rr_extract_upper(string, offset, increase)) != ZERO) return subject;
+    if((subject = rr_extract_digit(string, offset, increase)) != ZERO) return subject;
+    return ZERO;
+}
+
+/// @brief Checks whether a rune lies in the range 0x20 (space) to 0x2f ('/').
+bool_t rr_is_rune_of_sign_block_1(rune_t rune)
+{
+    if(rune < 0x20) return FALSE;
+    if(rune > 0x2f) return FALSE;
+    return TRUE;
+}
+
+/// @brief Checks whether a rune lies in the range 0x3a (':') to 0x40 ('@').
+bool_t rr_is_rune_of_sign_block_2(rune_t rune)
+{
+    if(rune < 0x3a) return FALSE;
+    if(rune > 0x40) return FALSE;
+    return TRUE;
+}
+
+/// @brief Checks whether a rune lies in the range 0x5b ('[') to 0x60 ('`').
+bool_t rr_is_rune_of_sign_block_3(rune_t rune)
+{
+    if(rune < 0x5b) return FALSE;
+    if(rune > 0x60) return FALSE;
+    return TRUE;
+}
+
+/// @brief Checks whether a rune lies in the range 0x7b ('{') to 0x7e ('~').
+bool_t rr_is_rune_of_sign_block_4(rune_t rune)
+{
+    if(rune < 0x7b) return FALSE;
+    if(rune > 0x7e) return FALSE;
+    return TRUE;
+}
+
+/// @brief Extracts an ASCII punctuation rune or a space; see runes.h for the contract.
+rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *increase)
+{
+    // Work on a copy so that 'increase' stays untouched if the rune is rejected.
+    usz_t increase_backup = *increase;
+    rune_t subject = rr_extract_utf8(string, offset, &increase_backup);
+
+    if(!rr_is_rune_of_sign_block_1(subject)
+    && !rr_is_rune_of_sign_block_2(subject)
+    && !rr_is_rune_of_sign_block_3(subject)
+    && !rr_is_rune_of_sign_block_4(subject))
+        return ZERO;
+
+    // Commit the advance, exactly like the other rr_extract_* functions do.
+    (*increase) = increase_backup;
+    return subject;
+}
+
+/// @brief Checks for a newline ("\n", "\r" or "\r\n") at 'offset'; see runes.h for the contract.
+bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next)
+{
+    usz_t after_first = offset;
+    rune_t subject = rr_extract_utf8(string, offset, &after_first);
+
+    if(subject == '\n')
+    {
+        (*next) = after_first;
+        return TRUE;
+    }
+    if(subject == '\r')
+    {
+        // A line feed right after the carriage return belongs to the same newline.
+        usz_t after_second = after_first;
+        if(rr_extract_utf8(string, after_first, &after_second) == '\n')
+            (*next) = after_second;
+        else
+            (*next) = after_first;
+        return TRUE;
+    }
+    return FALSE;
+}