214 lines
5.9 KiB
C
214 lines
5.9 KiB
C
#include <librr/runes.h>
|
|
|
|
// Walks backwards from `offset` until a byte is found that is not a UTF-8
// continuation byte (bit pattern 10xxxxxx) and reports how far it walked.
//
// string: the byte sequence to inspect
// offset: index of the byte at which the backwards walk starts
//
// Returns:
//   >= 0  number of bytes between `offset` and the closest preceding byte
//         that is not a continuation byte (0 if `offset` is such a byte)
//   -1    more than three continuation bytes in a row were walked over;
//         no rune of at most four bytes can contain `offset`
//   -2    the start of the string was reached (or `offset` was negative)
//         without finding a non-continuation byte
isz_t rr_distance_to_last_utf8_rune_start(const char *string, isz_t offset)
{
    // A rune has at most four bytes, hence at most three continuation bytes
    // can lie between any of its bytes and its head byte.
    const isz_t max_distance = 3;

    // The counter is signed on purpose: mixing it with an unsigned type
    // would turn the bounds check below into an unsigned comparison, which
    // can never detect that the walk ran past the start of the string.
    isz_t bytes_walked = 0;
    while(bytes_walked <= offset)
    {
        if(bytes_walked > max_distance)
            return -1;

        // Inspect the byte as unsigned; a plain `char` may be signed, in
        // which case shifting or comparing bytes >= 0x80 gives wrong results.
        unsigned char current = (unsigned char) string[offset - bytes_walked];
        if((current & 0xC0) != 0x80)
        {
            return bytes_walked;
        }
        ++bytes_walked;
    }
    return -2;
}
|
|
|
|
// Determines the total byte length of a UTF-8 rune from its head byte.
//
// string: the byte sequence to inspect
// offset: index of the head byte within `string`
//
// Returns:
//   1     the byte is plain ASCII (top bit clear)
//   2-4   the number of leading 1-bits of the head byte, i.e. the length
//         of the multi-byte rune it introduces
//   -1    the byte has five or more leading 1-bits (too long for UTF-8)
//   -2    the byte has exactly one leading 1-bit, i.e. it is a continuation
//         byte and not a head byte
isz_t rr_identify_utf8_rune_length(const char *string, usz_t offset)
{
    // Work on an unsigned copy so that masking never involves a negative
    // value, regardless of whether plain `char` is signed.
    unsigned char head_byte = (unsigned char) string[offset];

    // If this is ASCII
    if((head_byte & 0x80) == 0)
        return 1;

    // UTF-8 - only
    //
    // Count the leading 1-bits, including the top bit itself. The count is
    // capped at five because everything above four is rejected anyway.
    usz_t length = 0;
    while(length < 5)
    {
        if((head_byte & (0x80 >> length)) == 0)
            break;
        ++length;
    }

    if(length > 4)
        return -1;

    if(length < 2)
        return -2;

    return (isz_t) length;
}
|
|
|
|
// Extracts the payload bits from the head byte of a UTF-8 rune by removing
// the length prefix.
//
// byte:      the head byte of the rune
// num_bytes: total length of the rune in bytes (1 to 4)
//
// Returns the payload bits of the head byte, or 0 if `num_bytes` is not
// in the range 1 to 4.
rune_t rr_postprocess_utf8_head_byte(char byte, usz_t num_bytes)
{
    // Mask an unsigned copy so that a signed `char` cannot sign-extend.
    unsigned char head = (unsigned char) byte;

    switch(num_bytes)
    {
        case 1: return head & 0x7F; // 0xxxxxxx: seven payload bits
        case 2: return head & 0x1F; // 110xxxxx: five payload bits
        case 3: return head & 0x0F; // 1110xxxx: four payload bits
        case 4: return head & 0x07; // 11110xxx: three payload bits
        default: break;
    }
    return 0;
}
|
|
|
|
// Assembles a rune from `num_bytes` bytes starting at `bytes`.
//
// The first byte is passed through rr_postprocess_utf8_head_byte(); each
// following byte contributes its low six bits, appended below the bits
// gathered so far.
//
// bytes:     pointer to the first byte of the rune
// num_bytes: number of bytes that make up the rune
//
// Returns the assembled rune value.
rune_t rr_postprocess_utf8_bytes(const char *bytes, usz_t num_bytes)
{
    rune_t rune = rr_postprocess_utf8_head_byte(bytes[0], num_bytes);

    for(usz_t index = 1; index < num_bytes; ++index)
    {
        // Make room for six more bits, then fill them in.
        rune = (rune << 6) | (bytes[index] & 0x3F);
    }
    return rune;
}
|
|
|
|
// Decodes the UTF-8 rune that contains the byte at `offset`.
//
// If `offset` points into the middle of a rune, the start of that rune is
// located first, so the whole rune is decoded.
//
// string:   NUL-terminated byte sequence to read from
// offset:   index of any byte of the rune to decode
// increase: on success, the number of bytes from `offset` to the end of the
//           decoded rune is ADDED to the pointed-to value; it is left
//           untouched whenever ZERO is returned
//
// Returns the decoded rune, or ZERO if `offset` is at the NUL terminator or
// the bytes around `offset` do not form a decodable sequence.
rune_t rr_extract_utf8(const char *string, usz_t offset, usz_t *increase)
{
    if(string[offset] == 0x00)
        return ZERO;

    isz_t offset_into_rune = rr_distance_to_last_utf8_rune_start(string, offset);
    if(offset_into_rune < 0)
        return ZERO;
    offset -= offset_into_rune;

    isz_t rune_length = rr_identify_utf8_rune_length(string, offset);
    if(rune_length < 0)
        return ZERO;

    // The head byte announces fewer bytes than were walked back over, so
    // the original position is not part of this rune. Without this check
    // the increment below would be zero or wrap around as an unsigned value.
    if(offset_into_rune >= rune_length)
        return ZERO;

    // Every byte after the head byte must be a continuation byte
    // (10xxxxxx). This also stops at the NUL terminator of a truncated
    // sequence instead of decoding bytes beyond the end of the string.
    usz_t trailing_index = 1;
    while(trailing_index < (usz_t) rune_length)
    {
        unsigned char trailing = (unsigned char) string[offset + trailing_index];
        if((trailing & 0xC0) != 0x80)
            return ZERO;
        ++trailing_index;
    }

    *increase += rune_length - offset_into_rune;
    return rr_postprocess_utf8_bytes(&string[offset], rune_length);
}
|
|
|
|
// Checks whether a line break starts at `offset`.
//
// Recognised line breaks are "\n", "\r" and the two-rune sequence "\r\n".
//
// string: byte sequence to read from
// offset: index at which the potential line break starts
// next:   when TRUE is returned, receives the index of the first byte after
//         the line break; left untouched when FALSE is returned
//
// Returns TRUE if a line break was found at `offset`, FALSE otherwise.
bool_t rr_check_newline(const char *string, usz_t offset, usz_t *next)
{
    // `offset` is passed as the increment target too, so after each call it
    // points behind the rune that was just read.
    rune_t subject = rr_extract_utf8(string, offset, &offset);
    if(subject == '\r')
    {
        usz_t after_carriage_return = offset;
        if(rr_extract_utf8(string, offset, &offset) == '\n')
        {
            // "\r\n": the line break covers both runes.
            (*next) = offset;
        }
        else
        {
            // Lone "\r": whatever follows belongs to the next line.
            (*next) = after_carriage_return;
        }
        return TRUE;
    }
    if(subject == '\n')
    {
        (*next) = offset;
        return TRUE;
    }
    return FALSE;
}
|
|
|
|
// Returns TRUE if `rune` lies in the range 'a' to 'z' (inclusive),
// FALSE otherwise.
bool_t rr_rune_is_lower(rune_t rune)
{
    return (rune >= 'a' && rune <= 'z') ? TRUE : FALSE;
}
|
|
|
|
// Returns TRUE if `rune` lies in the range 'A' to 'Z' (inclusive),
// FALSE otherwise.
bool_t rr_rune_is_upper(rune_t rune)
{
    return (rune >= 'A' && rune <= 'Z') ? TRUE : FALSE;
}
|
|
|
|
// Returns TRUE if `rune` is accepted by rr_rune_is_lower() or by
// rr_rune_is_upper(), FALSE otherwise.
bool_t rr_rune_is_letter(rune_t rune)
{
    return (rr_rune_is_lower(rune) || rr_rune_is_upper(rune)) ? TRUE : FALSE;
}
|
|
|
|
// Returns TRUE if `rune` lies in the range '0' to '9' (inclusive),
// FALSE otherwise.
bool_t rr_rune_is_digit(rune_t rune)
{
    return (rune >= '0' && rune <= '9') ? TRUE : FALSE;
}
|
|
|
|
// Returns TRUE if `rune` lies in the first block of ASCII special
// characters, '!' to '/' (inclusive), FALSE otherwise.
bool_t rr_rune_is_in_ascii_special_block_1(rune_t rune)
{
    return (rune >= '!' && rune <= '/') ? TRUE : FALSE;
}
|
|
|
|
// Returns TRUE if `rune` lies in the second block of ASCII special
// characters, ':' to '@' (inclusive), FALSE otherwise.
bool_t rr_rune_is_in_ascii_special_block_2(rune_t rune)
{
    return (rune >= ':' && rune <= '@') ? TRUE : FALSE;
}
|
|
|
|
// Returns TRUE if `rune` lies in the third block of ASCII special
// characters, '[' to '`' (inclusive), FALSE otherwise.
bool_t rr_rune_is_in_ascii_special_block_3(rune_t rune)
{
    return (rune >= '[' && rune <= '`') ? TRUE : FALSE;
}
|
|
|
|
// Returns TRUE if `rune` lies in the fourth block of ASCII special
// characters, '{' to '~' (inclusive), FALSE otherwise.
bool_t rr_rune_is_in_ascii_special_block_4(rune_t rune)
{
    return (rune >= '{' && rune <= '~') ? TRUE : FALSE;
}
|
|
|
|
// Returns TRUE if `rune` falls into any of the four ASCII special character
// blocks checked by rr_rune_is_in_ascii_special_block_1() through
// rr_rune_is_in_ascii_special_block_4(), FALSE otherwise.
bool_t rr_rune_is_ascii_special(rune_t rune)
{
    // The blocks are tested in ascending order; evaluation stops at the
    // first block that contains the rune.
    if(rr_rune_is_in_ascii_special_block_1(rune)
        || rr_rune_is_in_ascii_special_block_2(rune)
        || rr_rune_is_in_ascii_special_block_3(rune)
        || rr_rune_is_in_ascii_special_block_4(rune))
    {
        return TRUE;
    }
    return FALSE;
}
|
|
|
|
// Maps an ASCII special character to its rr_ascii_sign_e constant.
//
// rune: the rune to translate
//
// Returns the constant that belongs to `rune`, or RR_ASCII_NOT_A_SIGN if
// `rune` is none of the characters listed below.
rr_ascii_sign_e rr_rune_to_ascii_sign(rune_t rune)
{
    rr_ascii_sign_e sign = RR_ASCII_NOT_A_SIGN;

    switch(rune)
    {
        // '!' to '/'
        case '!':  sign = RR_ASCII_EXCLAMATION_MARK;       break;
        case '"':  sign = RR_ASCII_DOUBLE_QUOTATION_MARK;  break;
        case '#':  sign = RR_ASCII_HASH_SIGN;              break;
        case '$':  sign = RR_ASCII_DOLLAR_SIGN;            break;
        case '%':  sign = RR_ASCII_PERCENT_SIGN;           break;
        case '&':  sign = RR_ASCII_AMPERSAND;              break;
        case '\'': sign = RR_ASCII_SINGLE_QUOTATION_MARK;  break;
        case '(':  sign = RR_ASCII_OPENING_PARENTHESIS;    break;
        case ')':  sign = RR_ASCII_CLOSING_PARENTHESIS;    break;
        case '*':  sign = RR_ASCII_ASTERISK;               break;
        case '+':  sign = RR_ASCII_PLUS;                   break;
        case ',':  sign = RR_ASCII_COMMA;                  break;
        case '-':  sign = RR_ASCII_MINUS;                  break;
        case '.':  sign = RR_ASCII_POINT;                  break;
        case '/':  sign = RR_ASCII_SLASH;                  break;

        // ':' to '@'
        case ':':  sign = RR_ASCII_COLON;                  break;
        case ';':  sign = RR_ASCII_SEMICOLON;              break;
        case '<':  sign = RR_ASCII_SMALLER_THAN;           break;
        case '=':  sign = RR_ASCII_EQUALS_SIGN;            break;
        case '>':  sign = RR_ASCII_BIGGER_THAN;            break;
        case '?':  sign = RR_ASCII_QUESTION_MARK;          break;
        case '@':  sign = RR_ASCII_AT_SIGN;                break;

        // '[' to '`'
        case '[':  sign = RR_ASCII_OPENING_SQUARE_BRACKET; break;
        case '\\': sign = RR_ASCII_BACKSLASH;              break;
        case ']':  sign = RR_ASCII_CLOSING_SQUARE_BRACKET; break;
        case '^':  sign = RR_ASCII_CIRCUMFLEX;             break;
        case '_':  sign = RR_ASCII_UNDERSCORE;             break;
        case '`':  sign = RR_ASCII_TICK;                   break;

        // '{' to '~'
        case '{':  sign = RR_ASCII_OPENING_CURLY_BRACE;    break;
        case '|':  sign = RR_ASCII_VERTICAL_BAR;           break;
        case '}':  sign = RR_ASCII_CLOSING_CURLY_BRACE;    break;
        case '~':  sign = RR_ASCII_TILDE;                  break;

        default:
            break;
    }
    return sign;
}
|