From 1193de206f47f0d24216b348c5e1d90738263493 Mon Sep 17 00:00:00 2001 From: Davide Bettio Date: Wed, 22 Jan 2025 18:24:29 +0100 Subject: [PATCH] Add new optimized function `unicode_utf8_decode` Use it as a drop-in replacement instead of bitstring_utf8_decode. This function is based on highly optimized UTF-8 decode found here: http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ Signed-off-by: Davide Bettio --- src/libAtomVM/bitstring.c | 63 --------------------------------------- src/libAtomVM/bitstring.h | 24 ++------------- src/libAtomVM/interop.c | 2 +- src/libAtomVM/nifs.c | 4 +-- src/libAtomVM/unicode.c | 24 +++++++++++++++ src/libAtomVM/unicode.h | 22 ++++++++++++++ 6 files changed, 51 insertions(+), 88 deletions(-) diff --git a/src/libAtomVM/bitstring.c b/src/libAtomVM/bitstring.c index 76a9c014c..b74c36531 100644 --- a/src/libAtomVM/bitstring.c +++ b/src/libAtomVM/bitstring.c @@ -141,69 +141,6 @@ bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size) return true; } -enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size) -{ - if (len == 0) { - return UnicodeTransformDecodeFail; - } else if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) { - uint32_t v = 0; - v |= (buf[0] & 0x07) << 18; - v |= (buf[1] & 0x3F) << 12; - v |= (buf[2] & 0x3F) << 6; - v |= (buf[3] & 0x3F); - // overlong encoding or invalid codepoint - if (v <= 0x10000 || v > 0x10FFFF) { - return UnicodeTransformDecodeFail; - } - *c = v; - *out_size = 4; - return UnicodeTransformDecodeSuccess; - } else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) { - uint32_t v = 0; - v |= (buf[0] & 0x0F) << 12; - v |= (buf[1] & 0x3F) << 6; - v |= (buf[2] & 0x3F); - // overlong encoding or surrogate - if (v < 0x800 || (v >= 0xD800 && v <= 0xDFFF)) { - return UnicodeTransformDecodeFail; - } - *c = v; - *out_size = 
3; - return UnicodeTransformDecodeSuccess; - } else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) { - uint32_t v = 0; - v |= (buf[0] & 0x1F) << 6; - v |= (buf[1] & 0x3F); - // overlong encoding - if (v < 0x80) { - return UnicodeTransformDecodeFail; - } - *c = v; - *out_size = 2; - return UnicodeTransformDecodeSuccess; - } else if ((*buf & 0x80) == 0) { - uint32_t v = 0; - v |= (buf[0] & 0x7F); - *c = v; - *out_size = 1; - return UnicodeTransformDecodeSuccess; - } else if (len == 3 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) { - return UnicodeTransformDecodeIncomplete; - } else if (len == 2 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80)) { - return UnicodeTransformDecodeIncomplete; - } else if (len == 1 && (buf[0] & 0xF8) == 0xF0) { - return UnicodeTransformDecodeIncomplete; - } else if (len == 2 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80)) { - return UnicodeTransformDecodeIncomplete; - } else if (len == 1 && (buf[0] & 0xF0) == 0xE0) { - return UnicodeTransformDecodeIncomplete; - } else if (len == 1 && (buf[0] & 0xE0) == 0xC0) { - return UnicodeTransformDecodeIncomplete; - } - - return UnicodeTransformDecodeFail; -} - // UTF-16 encoding, when U in U+010000 to U+10FFFF: // // U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000 diff --git a/src/libAtomVM/bitstring.h b/src/libAtomVM/bitstring.h index 141c3ce97..33381e0d8 100644 --- a/src/libAtomVM/bitstring.h +++ b/src/libAtomVM/bitstring.h @@ -23,6 +23,7 @@ #define _BITSTRING_H_ #include "term.h" +#include "unicode.h" #include #include @@ -99,13 +100,6 @@ enum BitstringFlags #endif }; -enum UnicodeTransformDecodeResult -{ - UnicodeTransformDecodeSuccess, - UnicodeTransformDecodeFail, - UnicodeTransformDecodeIncomplete -}; - union maybe_unsigned_int8 { uint8_t u; @@ -320,20 +314,6 @@ static inline bool bitstring_insert_integer(term dst_bin, size_t offset, avm_int */ bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size); -/** - 
* @brief Decode a character from UTF-8. - * - * @param buf the buffer from which to decode the string - * @param len the length (in bytes) of the bytes in buf - * @param c int value to decode to or NULL to only compute the size. - * @param out_size the size in bytes, on output (if not NULL) - * @return \c UnicodeTransformDecodeSuccess if decoding was successful, - * \c UnicodeTransformDecodeFail if character starting at buf is not a valid - * unicode character or \c UnicodeTransformDecodeIncomplete if character - * starting at buf is a valid but incomplete transformation - */ -enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size); - /** * @brief Encode a character to UTF-16. * @@ -441,7 +421,7 @@ static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c { size_t byte_offset = offset >> 3; // divide by 8 const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset; - return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess; + return unicode_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess; } /** diff --git a/src/libAtomVM/interop.c b/src/libAtomVM/interop.c index 744429618..cf139b5a5 100644 --- a/src/libAtomVM/interop.c +++ b/src/libAtomVM/interop.c @@ -385,7 +385,7 @@ static enum UnicodeConversionResult interop_binary_conversion(term t, uint8_t *o while (input_index < len) { size_t char_size; uint32_t c; - enum UnicodeTransformDecodeResult decode_result = bitstring_utf8_decode(input + input_index, len - input_index, &c, &char_size); + enum UnicodeTransformDecodeResult decode_result = unicode_utf8_decode(input + input_index, len - input_index, &c, &char_size); if (UNLIKELY(decode_result != UnicodeTransformDecodeSuccess)) { *rest_crsr = input_index; *output_len = result; diff --git a/src/libAtomVM/nifs.c b/src/libAtomVM/nifs.c index 
f020e247e..05746d4a7 100644 --- a/src/libAtomVM/nifs.c +++ b/src/libAtomVM/nifs.c @@ -2197,7 +2197,7 @@ static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[]) for (size_t i = 0; i < encoded_len; i++) { size_t codepoint_size; uint32_t codepoint; - if (UNLIKELY(bitstring_utf8_decode( + if (UNLIKELY(unicode_utf8_decode( &utf8_tmp_buf[in_pos], 2, &codepoint, &codepoint_size) != UnicodeTransformDecodeSuccess || (codepoint > 255))) { @@ -2238,7 +2238,7 @@ static term make_list_from_utf8_buf(const uint8_t *buf, size_t buf_len, Context for (size_t i = 0; i < u8len; i++) { size_t codepoint_size; enum UnicodeTransformDecodeResult result - = bitstring_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size); + = unicode_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size); if (UNLIKELY((result != UnicodeTransformDecodeSuccess) || !unicode_is_valid_codepoint(codepoints[i]))) { AVM_ABORT(); diff --git a/src/libAtomVM/unicode.c b/src/libAtomVM/unicode.c index 6f29b12dd..af43ccde3 100644 --- a/src/libAtomVM/unicode.c +++ b/src/libAtomVM/unicode.c @@ -21,6 +21,8 @@ #include #include +#include "utils.h" + #include "unicode.h" // Following utf8d table and decode function are covered by MIT license @@ -63,6 +65,28 @@ static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) return *state; } +enum UnicodeTransformDecodeResult unicode_utf8_decode( + const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size) +{ + uint32_t codepoint = 0; + uint32_t state = 0; + size_t i = 0; + while (i < len) { + state = decode(&state, &codepoint, buf[i]); + i++; + + if (state == UTF8_ACCEPT) { + *c = codepoint; + *out_size = i; + return UnicodeTransformDecodeSuccess; + } else if (UNLIKELY(state == UTF8_REJECT)) { + return UnicodeTransformDecodeFail; + } + } + + return UnicodeTransformDecodeIncomplete; +} + bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len) { uint32_t codepoint = 0; diff --git a/src/libAtomVM/unicode.h 
b/src/libAtomVM/unicode.h index b23df803f..fa9b3b6e1 100644 --- a/src/libAtomVM/unicode.h +++ b/src/libAtomVM/unicode.h @@ -29,6 +29,13 @@ extern "C" { #endif +enum UnicodeTransformDecodeResult +{ + UnicodeTransformDecodeSuccess, + UnicodeTransformDecodeFail, + UnicodeTransformDecodeIncomplete +}; + size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len); bool unicode_buf_is_ascii(const uint8_t *buf, size_t buf_len); size_t unicode_latin1_buf_size_as_utf8(const uint8_t *buf, size_t len); @@ -40,6 +47,21 @@ static inline bool unicode_is_valid_codepoint(uint32_t codepoint) return (codepoint < 0x110000) && !((codepoint > 0xD800) && (codepoint < 0xDFFF)); } +/** + * @brief Decode a character from UTF-8. + * + * @param buf the buffer from which to decode the string + * @param len the length (in bytes) of the bytes in buf + * @param c decoded code point, written on success (must not be NULL) + * @param out_size bytes consumed, written on success (must not be NULL) + * @return \c UnicodeTransformDecodeSuccess if decoding was successful, + * \c UnicodeTransformDecodeFail if character starting at buf is not a valid + * unicode character or \c UnicodeTransformDecodeIncomplete if character + * starting at buf is a valid but incomplete transformation + */ +enum UnicodeTransformDecodeResult unicode_utf8_decode( + const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size); + bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len); #ifdef __cplusplus