Added rune extraction and detection functions (not tested)
This commit is contained in:
parent
22462bf2e7
commit
ae6e4ffe33
|
@ -0,0 +1,128 @@
|
||||||
|
|
||||||
|
#ifndef RR_RUNES_H
|
||||||
|
#define RR_RUNES_H
|
||||||
|
|
||||||
|
#include <librr/types.h>
|
||||||
|
|
||||||
|
/// @brief Decodes the UTF-8 rune which contains the byte at a given offset of a string.
|
||||||
|
/// @param string Null-terminated UTF-8 (or ASCII) string from which the rune is read. This will not be modified.
|
||||||
|
/// @param offset Byte offset into the string at which the rune is read. This may point into the middle of a rune.
|
||||||
|
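/// @param increase Pointer to an integer onto which the number of bytes from 'offset' to the end of the rune is ADDED.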
|
||||||
|
/// @return The UTF-8 character which was extracted OR 0 if the function failed.
|
||||||
|
rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase);
|
||||||
|
|
||||||
|
/// @brief Extracts a lowercase letter at some offset in a UTF-8 string.
|
||||||
|
/// The function adds the rest length of the found lowercase letter to an
|
||||||
|
/// integer to which a pointer was given.
|
||||||
|
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
|
||||||
|
/// @attention Not the full length of the rune but rather the rest length,
|
||||||
|
/// the length starting from the reading offset going to the end of the rune
|
||||||
|
/// will be added onto 'advance'.
|
||||||
|
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
|
||||||
|
/// rune should be extracted. This will not be modified.
|
||||||
|
/// @param offset Offset from the start of the string at which the rune will
|
||||||
|
/// be read. This doesn't have to point to the start of the rune; the start
|
||||||
|
/// will be found.
|
||||||
|
/// @param advance How many bytes 'offset' must be advanced to get to the first
|
||||||
|
/// byte of the next rune. This will NOT be set if the rune is not a lowercase letter.
|
||||||
|
/// @return The rune which was extracted or ZERO if the found rune is not a lowercase letter.
|
||||||
|
rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *advance);
|
||||||
|
|
||||||
|
/// @brief Extracts an uppercase letter at some offset in a UTF-8 string.
|
||||||
|
/// The function adds the rest length of the found uppercase letter to an
|
||||||
|
/// integer to which a pointer was given.
|
||||||
|
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
|
||||||
|
/// @attention Not the full length of the rune but rather the rest length,
|
||||||
|
/// the length starting from the reading offset going to the end of the rune
|
||||||
|
/// will be added onto 'advance'.
|
||||||
|
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
|
||||||
|
/// rune should be extracted. This will not be modified.
|
||||||
|
/// @param offset Offset from the start of the string at which the rune will
|
||||||
|
/// be read. This doesn't have to point to the start of the rune; the start
|
||||||
|
/// will be found.
|
||||||
|
/// @param advance How many bytes 'offset' must be advanced to get to the first
|
||||||
|
/// byte of the next rune. This will NOT be set if the rune is not an uppercase letter.
|
||||||
|
/// @return The rune which was extracted or ZERO if the found rune is not an uppercase letter.
|
||||||
|
rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *advance);
|
||||||
|
|
||||||
|
/// @brief Extracts a letter at some offset in a UTF-8 string.
|
||||||
|
/// The function adds the rest length of the found letter to an
|
||||||
|
/// integer to which a pointer was given.
|
||||||
|
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
|
||||||
|
/// @attention Not the full length of the rune but rather the rest length,
|
||||||
|
/// the length starting from the reading offset going to the end of the rune
|
||||||
|
/// will be added onto 'advance'.
|
||||||
|
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
|
||||||
|
/// rune should be extracted. This will not be modified.
|
||||||
|
/// @param offset Offset from the start of the string at which the rune will
|
||||||
|
/// be read. This doesn't have to point to the start of the rune; the start
|
||||||
|
/// will be found.
|
||||||
|
/// @param advance How many bytes 'offset' must be advanced to get to the first
|
||||||
|
/// byte of the next rune. This will NOT be set if the rune is not a letter.
|
||||||
|
/// @return The rune which was extracted or ZERO if the found rune is not a letter.
|
||||||
|
rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *advance);
|
||||||
|
|
||||||
|
/// @brief Extracts a digit at some offset in a UTF-8 string.
|
||||||
|
/// The function adds the rest length of the found digit to an
|
||||||
|
/// integer to which a pointer was given.
|
||||||
|
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
|
||||||
|
/// @attention Not the full length of the rune but rather the rest length,
|
||||||
|
/// the length starting from the reading offset going to the end of the rune
|
||||||
|
/// will be added onto 'advance'.
|
||||||
|
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
|
||||||
|
/// rune should be extracted. This will not be modified.
|
||||||
|
/// @param offset Offset from the start of the string at which the rune will
|
||||||
|
/// be read. This doesn't have to point to the start of the rune; the start
|
||||||
|
/// will be found.
|
||||||
|
/// @param advance How many bytes 'offset' must be advanced to get to the first
|
||||||
|
/// byte of the next rune. This will NOT be set if the rune is not a digit.
|
||||||
|
/// @return The rune which was extracted or ZERO if the found rune is not a digit.
|
||||||
|
rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *advance);
|
||||||
|
|
||||||
|
/// @brief Extracts an alphanumeric rune at some offset in a UTF-8 string.
|
||||||
|
/// The function adds the rest length of the alphanumeric rune found to an
|
||||||
|
/// integer to which a pointer was given.
|
||||||
|
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
|
||||||
|
/// @attention Not the full length of the rune but rather the rest length,
|
||||||
|
/// the length starting from the reading offset going to the end of the rune
|
||||||
|
/// will be added onto 'advance'.
|
||||||
|
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
|
||||||
|
/// rune should be extracted. This will not be modified.
|
||||||
|
/// @param offset Offset from the start of the string at which the rune will
|
||||||
|
/// be read. This doesn't have to point to the start of the rune; the start
|
||||||
|
/// will be found.
|
||||||
|
/// @param advance How many bytes 'offset' must be advanced to get to the first
|
||||||
|
/// byte of the next rune. This will NOT be set if the rune is not an alphanumeric rune.
|
||||||
|
/// @return The rune which was extracted or ZERO if the found rune is not alphanumeric.
|
||||||
|
rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *advance);
|
||||||
|
|
||||||
|
/// @brief Extracts a special sign (such as slash, at, the hash sign, etc.)
|
||||||
|
/// at some offset in a UTF-8 string. The function adds the rest length
|
||||||
|
/// of the found sign to an integer to which a pointer was given.
|
||||||
|
/// @attention 'advance' will be ADDED TO, it won't be set to a completely new value.
|
||||||
|
/// @attention Not the full length of the rune but rather the rest length,
|
||||||
|
/// the length starting from the reading offset going to the end of the rune
|
||||||
|
/// will be added onto 'advance'.
|
||||||
|
/// @param string Null-terminated UTF-8 (or ASCII) string from which the
|
||||||
|
/// rune should be extracted. This will not be modified.
|
||||||
|
/// @param offset Offset from the start of the string at which the rune will
|
||||||
|
/// be read. This doesn't have to point to the start of the rune; the start
|
||||||
|
/// will be found.
|
||||||
|
/// @param advance How many bytes 'offset' must be advanced to get to the first
|
||||||
|
/// byte of the next rune. This will NOT be set if the rune is not a sign.
|
||||||
|
/// @return The rune which was extracted or ZERO if the found rune is not a sign.
|
||||||
|
rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *advance);
|
||||||
|
|
||||||
|
/// @brief Checks if there is a newline delimiter at a specific offset in a string
|
||||||
|
/// and writes the offset right after it to a given pointer's destination.
|
||||||
|
/// @attention The next rune's offset will replace the previous content of the given pointer,
|
||||||
|
/// the offset will not be added to.
|
||||||
|
/// @param string The string in question
|
||||||
|
/// @param offset The offset in the string at which the test should be done.
|
||||||
|
/// @param next The pointer to the integer to which the offset will be written.
|
||||||
|
/// in contrast to the other rune functions, this does NOT add, it SETS the value
|
||||||
|
/// relative to 'offset'. This will NOT be set if no newline was found.
|
||||||
|
/// @return TRUE if there is a newline at that point and FALSE if not.
|
||||||
|
bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next);
|
||||||
|
|
||||||
|
#endif // RR_RUNES_H
|
|
@ -0,0 +1,201 @@
|
||||||
|
#include <librr/runes.h>
|
||||||
|
|
||||||
|
/// @brief Counts how many bytes lie between 'offset' and the first byte of
/// the rune 'offset' points into, by walking backwards over UTF-8
/// continuation bytes (bit pattern 10xxxxxx).
/// @param string The string to inspect. Only bytes at indices <= offset are read.
/// @param offset Byte index from which the backwards search starts.
/// @return The distance (0 to 3) to the nearest byte at or before 'offset'
/// which is not a continuation byte,
/// -1 if no such byte was found within the last 4 bytes or
/// -2 if the start of the string was reached while only seeing continuation bytes.
isz_t rr_distance_to_last_utf8_rune_start(const char *string, usz_t offset)
{
    usz_t bytes_walked = 0;

    // '<=' so that the byte at index 0 is inspected as well; otherwise a
    // rune starting at the very beginning of the string could never be found.
    while(bytes_walked <= offset)
    {
        // A rune is at most 4 bytes long, thus its first byte
        // can be at most 3 bytes behind the reading offset.
        if(bytes_walked > 3)
            return -1;

        // Go through 'unsigned char': with a signed 'char', shifting or
        // comparing bytes >= 0x80 would operate on negative values.
        unsigned char byte = (unsigned char) string[offset - bytes_walked];
        if((byte & 0xC0) != 0x80)
        {
            return (isz_t) bytes_walked;
        }
        ++bytes_walked;
    }
    return -2;
}
|
||||||
|
|
||||||
|
/// @brief Determines the total length in bytes of a UTF-8 rune from its first byte.
/// @param string The string containing the rune.
/// @param offset Byte index of the rune's first byte.
/// @return 1 for an ASCII byte (0xxxxxxx),
/// 2, 3 or 4 for a multi-byte head (110xxxxx, 1110xxxx, 11110xxx),
/// -1 if the byte announces more than 4 bytes (11111xxx) or
/// -2 if the byte is a continuation byte (10xxxxxx) and thus not a rune start.
isz_t rr_identify_utf8_rune_length(const char *string, usz_t offset)
{
    // Inspect the byte as 'unsigned char' so that the bit tests do not
    // depend on whether plain 'char' is signed on this platform.
    unsigned char head_byte = (unsigned char) string[offset];

    // If this is ASCII
    if((head_byte & 0x80) == 0x00)
        return 1;

    // UTF-8 - only: the number of leading one-bits is the rune's length.
    if((head_byte & 0xE0) == 0xC0)
        return 2;

    if((head_byte & 0xF0) == 0xE0)
        return 3;

    if((head_byte & 0xF8) == 0xF0)
        return 4;

    // A single leading one-bit marks a continuation byte, not a head byte.
    if((head_byte & 0xC0) == 0x80)
        return -2;

    // Five or more leading one-bits: longer than any valid UTF-8 rune.
    return -1;
}
|
||||||
|
|
||||||
|
/// @brief Strips the length marker from the first byte of a UTF-8 rune
/// and returns the payload bits it carries.
/// @param byte The first byte of the rune.
/// @param num_bytes Total length of the rune in bytes (1 to 4).
/// @return The payload bits of the head byte or 0 if 'num_bytes' is not within 1 to 4.
rune_t rr_postprocess_utf8_head_byte(char byte, usz_t num_bytes)
{
    // Mask as 'unsigned char' so that a negative 'char' is not sign-extended.
    unsigned char bits = (unsigned char) byte;

    switch(num_bytes)
    {
        case 1: return (rune_t) bits;           // 0xxxxxxx: all bits are payload
        case 2: return (rune_t) (bits & 0x1F);  // 110xxxxx: low 5 bits
        case 3: return (rune_t) (bits & 0x0F);  // 1110xxxx: low 4 bits
        case 4: return (rune_t) (bits & 0x07);  // 11110xxx: low 3 bits
        default: break;
    }
    return 0;
}
|
||||||
|
|
||||||
|
/// @brief Assembles a rune from the bytes of one UTF-8 sequence.
/// The head byte is handled by rr_postprocess_utf8_head_byte; every
/// following byte contributes its low 6 bits.
/// @param bytes Pointer to the first byte of the sequence.
/// @param num_bytes Number of bytes which make up the sequence.
/// @return The assembled rune.
rune_t rr_postprocess_utf8_bytes(const char *bytes, usz_t num_bytes)
{
    rune_t rune = rr_postprocess_utf8_head_byte(bytes[0], num_bytes);

    for(usz_t index = 1; index < num_bytes; ++index)
    {
        // Make room for the next 6 payload bits and append them.
        rune = (rune << 6) | (bytes[index] & 0x3F);
    }
    return rune;
}
|
||||||
|
|
||||||
|
/// @brief Decodes the UTF-8 rune which contains the byte at 'offset'.
/// @param string Null-terminated string to read from.
/// @param offset Byte index to read at; it may point into the middle of a rune.
/// @param increase The number of bytes from 'offset' to the end of the rune
/// is ADDED to the pointed-to integer. It is left untouched on failure.
/// @return The decoded rune or ZERO if 'offset' is at the terminator or
/// no well-formed rune contains the byte at 'offset'.
rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase)
{
    if(string[offset] == 0x00)
        return ZERO;

    // Both helpers signal errors with negative values in the range -2 to 4,
    // so the results must be held in a signed type; storing them in 'usz_t'
    // would make the error checks below always false.
    long offset_into_rune = (long) rr_distance_to_last_utf8_rune_start(string, offset);
    if(offset_into_rune < 0)
        return ZERO;
    usz_t rune_start = offset - (usz_t) offset_into_rune;

    long rune_length = (long) rr_identify_utf8_rune_length(string, rune_start);
    if(rune_length < 0)
        return ZERO;

    // The byte at 'offset' is a stray continuation byte which lies
    // behind the end of the rune that was found before it.
    if(offset_into_rune >= rune_length)
        return ZERO;

    // Every byte after the head must be a continuation byte (10xxxxxx).
    // Stopping at the first mismatch also stops at the terminator, so a
    // truncated rune is never read past the end of the string.
    for(long index = 1; index < rune_length; ++index)
    {
        unsigned char byte = (unsigned char) string[rune_start + (usz_t) index];
        if((byte & 0xC0) != 0x80)
            return ZERO;
    }

    *increase += (usz_t) (rune_length - offset_into_rune);
    return rr_postprocess_utf8_bytes(&string[rune_start], (usz_t) rune_length);
}
|
||||||
|
|
||||||
|
/// @brief Extracts the rune at 'offset' and accepts it only if it is
/// within 'a' to 'z'.
/// @param string String to read from.
/// @param offset Byte offset at which the rune is read.
/// @param increase Updated with the value produced by rr_extract_utf8 if
/// the rune is accepted; left untouched otherwise.
/// @return The rune or ZERO if it is not within 'a' to 'z'.
rune_t rr_extract_lower(const char *string, usz_t offset, usz_t *increase)
{
    // Work on a copy so that the caller's counter only changes on success.
    usz_t advanced = *increase;
    rune_t rune = rr_extract_utf8(string, offset, &advanced);

    if(rune >= 'a' && rune <= 'z')
    {
        *increase = advanced;
        return rune;
    }
    return ZERO;
}
|
||||||
|
|
||||||
|
/// @brief Extracts the rune at 'offset' and accepts it only if it is
/// within 'A' to 'Z'.
/// @param string String to read from.
/// @param offset Byte offset at which the rune is read.
/// @param increase Updated with the value produced by rr_extract_utf8 if
/// the rune is accepted; left untouched otherwise.
/// @return The rune or ZERO if it is not within 'A' to 'Z'.
rune_t rr_extract_upper(const char *string, usz_t offset, usz_t *increase)
{
    // Work on a copy so that the caller's counter only changes on success.
    usz_t advanced = *increase;
    rune_t rune = rr_extract_utf8(string, offset, &advanced);

    if(rune >= 'A' && rune <= 'Z')
    {
        *increase = advanced;
        return rune;
    }
    return ZERO;
}
|
||||||
|
|
||||||
|
/// @brief Extracts the rune at 'offset' if rr_extract_lower or
/// rr_extract_upper accepts it.
/// @param string String to read from.
/// @param offset Byte offset at which the rune is read.
/// @param increase Passed on to the extractor functions.
/// @return The rune or ZERO if neither extractor accepted it.
rune_t rr_extract_letter(const char *string, usz_t offset, usz_t *increase)
{
    rune_t letter = rr_extract_lower(string, offset, increase);

    // Only try the uppercase range if the lowercase one did not match.
    if(letter == ZERO)
        letter = rr_extract_upper(string, offset, increase);

    return letter;
}
|
||||||
|
|
||||||
|
/// @brief Extracts the rune at 'offset' and accepts it only if it is
/// within '0' to '9'.
/// @param string String to read from.
/// @param offset Byte offset at which the rune is read.
/// @param increase Updated with the value produced by rr_extract_utf8 if
/// the rune is accepted; left untouched otherwise.
/// @return The rune or ZERO if it is not within '0' to '9'.
rune_t rr_extract_digit(const char *string, usz_t offset, usz_t *increase)
{
    // Work on a copy so that the caller's counter only changes on success.
    usz_t advanced = *increase;
    rune_t rune = rr_extract_utf8(string, offset, &advanced);

    if(rune >= '0' && rune <= '9')
    {
        *increase = advanced;
        return rune;
    }
    return ZERO;
}
|
||||||
|
|
||||||
|
/// @brief Extracts the rune at 'offset' if rr_extract_lower,
/// rr_extract_upper or rr_extract_digit accepts it.
/// @param string String to read from.
/// @param offset Byte offset at which the rune is read.
/// @param increase Passed on to the extractor functions.
/// @return The rune or ZERO if none of the extractors accepted it.
rune_t rr_extract_alphanumeric(const char *string, usz_t offset, usz_t *increase)
{
    // The extractors are tried in order; the first match wins.
    rune_t rune = rr_extract_lower(string, offset, increase);

    if(rune == ZERO)
        rune = rr_extract_upper(string, offset, increase);

    if(rune == ZERO)
        rune = rr_extract_digit(string, offset, increase);

    return rune;
}
|
||||||
|
|
||||||
|
/// @brief Tests whether a rune lies within 0x20 to 0x2f (inclusive),
/// which in ASCII is the range from the space up to the slash.
/// @param rune The rune to test.
/// @return TRUE if the rune is within the range and FALSE if not.
bool_t rr_is_rune_of_sign_block_1(rune_t rune)
{
    if(rune >= 0x20 && rune <= 0x2f)
        return TRUE;

    return FALSE;
}
|
||||||
|
|
||||||
|
/// @brief Tests whether a rune lies within 0x3a to 0x40 (inclusive),
/// which in ASCII is the range from the colon up to the at sign.
/// @param rune The rune to test.
/// @return TRUE if the rune is within the range and FALSE if not.
bool_t rr_is_rune_of_sign_block_2(rune_t rune)
{
    if(rune >= 0x3a && rune <= 0x40)
        return TRUE;

    return FALSE;
}
|
||||||
|
|
||||||
|
/// @brief Tests whether a rune lies within 0x5b to 0x60 (inclusive),
/// which in ASCII is the range from the opening square bracket up to the backtick.
/// @param rune The rune to test.
/// @return TRUE if the rune is within the range and FALSE if not.
bool_t rr_is_rune_of_sign_block_3(rune_t rune)
{
    if(rune >= 0x5b && rune <= 0x60)
        return TRUE;

    return FALSE;
}
|
||||||
|
|
||||||
|
/// @brief Tests whether a rune lies within 0x7b to 0x7e (inclusive),
/// which in ASCII is the range from the opening curly brace up to the tilde.
/// @param rune The rune to test.
/// @return TRUE if the rune is within the range and FALSE if not.
bool_t rr_is_rune_of_sign_block_4(rune_t rune)
{
    if(rune >= 0x7b && rune <= 0x7e)
        return TRUE;

    return FALSE;
}
|
||||||
|
|
||||||
|
/// @brief Extracts the rune at 'offset' and accepts it only if one of the
/// rr_is_rune_of_sign_block_* tests matches it.
/// @param string String to read from.
/// @param offset Byte offset at which the rune is read.
/// @param increase Updated with the value produced by rr_extract_utf8 if
/// the rune is accepted; left untouched otherwise.
/// @return The rune or ZERO if it is not part of any sign block.
rune_t rr_extract_sign(const char *string, usz_t offset, usz_t *increase)
{
    // Work on a copy so that the caller's counter only changes on success.
    usz_t advanced = *increase;
    rune_t subject = rr_extract_utf8(string, offset, &advanced);

    if(!rr_is_rune_of_sign_block_1(subject)
        && !rr_is_rune_of_sign_block_2(subject)
        && !rr_is_rune_of_sign_block_3(subject)
        && !rr_is_rune_of_sign_block_4(subject))
    {
        return ZERO;
    }

    // Publish the advanced counter, like the other extractor functions do.
    // Without this, the caller would never move past an accepted sign.
    (*increase) = advanced;
    return subject;
}
|
||||||
|
|
||||||
|
/// @brief Checks whether a newline delimiter ("\n", "\r\n" or a lone "\r")
/// starts at 'offset' and, if so, writes the offset right after it to '*next'.
/// @param string The string in question.
/// @param offset The offset in the string at which the test is done.
/// @param next Receives the offset of the first byte after the delimiter.
/// This is SET, not added to, and it is left untouched if no newline was found.
/// @return TRUE if there is a newline at that point and FALSE if not.
bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next)
{
    // rr_extract_utf8 ADDS the consumed length, so starting
    // from 'offset' yields the offset right after the rune.
    usz_t after_first = offset;
    rune_t subject = rr_extract_utf8(string, offset, &after_first);

    if(subject == '\n')
    {
        (*next) = after_first;
        return TRUE;
    }

    if(subject == '\r')
    {
        // A line feed directly after the carriage return belongs to the
        // same delimiter; any other rune must not be consumed.
        usz_t after_second = after_first;
        if(rr_extract_utf8(string, after_first, &after_second) == '\n')
            (*next) = after_second;
        else
            (*next) = after_first;
        return TRUE;
    }
    return FALSE;
}
|
Loading…
Reference in New Issue