From 1193de206f47f0d24216b348c5e1d90738263493 Mon Sep 17 00:00:00 2001
From: Davide Bettio <davide@uninstall.it>
Date: Wed, 22 Jan 2025 18:24:29 +0100
Subject: [PATCH] Add new optimized function `unicode_utf8_decode`

Use it as a drop-in replacement for bitstring_utf8_decode.

This function is based on the highly optimized UTF-8 decoder found here:
http://bjoern.hoehrmann.de/utf-8/decoder/dfa/

Signed-off-by: Davide Bettio <davide@uninstall.it>
---
 src/libAtomVM/bitstring.c | 63 ---------------------------------------
 src/libAtomVM/bitstring.h | 24 ++-------------
 src/libAtomVM/interop.c   |  2 +-
 src/libAtomVM/nifs.c      |  4 +--
 src/libAtomVM/unicode.c   | 24 +++++++++++++++
 src/libAtomVM/unicode.h   | 22 ++++++++++++++
 6 files changed, 51 insertions(+), 88 deletions(-)

diff --git a/src/libAtomVM/bitstring.c b/src/libAtomVM/bitstring.c
index 76a9c014c..b74c36531 100644
--- a/src/libAtomVM/bitstring.c
+++ b/src/libAtomVM/bitstring.c
@@ -141,69 +141,6 @@ bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size)
     return true;
 }
 
-enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
-{
-    if (len == 0) {
-        return UnicodeTransformDecodeFail;
-    } else if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) {
-        uint32_t v = 0;
-        v |= (buf[0] & 0x07) << 18;
-        v |= (buf[1] & 0x3F) << 12;
-        v |= (buf[2] & 0x3F) << 6;
-        v |= (buf[3] & 0x3F);
-        // overlong encoding or invalid codepoint
-        if (v <= 0x10000 || v > 0x10FFFF) {
-            return UnicodeTransformDecodeFail;
-        }
-        *c = v;
-        *out_size = 4;
-        return UnicodeTransformDecodeSuccess;
-    } else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
-        uint32_t v = 0;
-        v |= (buf[0] & 0x0F) << 12;
-        v |= (buf[1] & 0x3F) << 6;
-        v |= (buf[2] & 0x3F);
-        // overlong encoding or surrogate
-        if (v < 0x800 || (v >= 0xD800 && v <= 0xDFFF)) {
-            return UnicodeTransformDecodeFail;
-        }
-        *c = v;
-        *out_size = 3;
-        return UnicodeTransformDecodeSuccess;
-    } else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) {
-        uint32_t v = 0;
-        v |= (buf[0] & 0x1F) << 6;
-        v |= (buf[1] & 0x3F);
-        // overlong encoding
-        if (v < 0x80) {
-            return UnicodeTransformDecodeFail;
-        }
-        *c = v;
-        *out_size = 2;
-        return UnicodeTransformDecodeSuccess;
-    } else if ((*buf & 0x80) == 0) {
-        uint32_t v = 0;
-        v |= (buf[0] & 0x7F);
-        *c = v;
-        *out_size = 1;
-        return UnicodeTransformDecodeSuccess;
-    } else if (len == 3 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
-        return UnicodeTransformDecodeIncomplete;
-    } else if (len == 2 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80)) {
-        return UnicodeTransformDecodeIncomplete;
-    } else if (len == 1 && (buf[0] & 0xF8) == 0xF0) {
-        return UnicodeTransformDecodeIncomplete;
-    } else if (len == 2 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80)) {
-        return UnicodeTransformDecodeIncomplete;
-    } else if (len == 1 && (buf[0] & 0xF0) == 0xE0) {
-        return UnicodeTransformDecodeIncomplete;
-    } else if (len == 1 && (buf[0] & 0xE0) == 0xC0) {
-        return UnicodeTransformDecodeIncomplete;
-    }
-
-    return UnicodeTransformDecodeFail;
-}
-
 // UTF-16 encoding, when U in U+010000 to U+10FFFF:
 //
 //  U' = yyyyyyyyyyxxxxxxxxxx  // U - 0x10000
diff --git a/src/libAtomVM/bitstring.h b/src/libAtomVM/bitstring.h
index 141c3ce97..33381e0d8 100644
--- a/src/libAtomVM/bitstring.h
+++ b/src/libAtomVM/bitstring.h
@@ -23,6 +23,7 @@
 #define _BITSTRING_H_
 
 #include "term.h"
+#include "unicode.h"
 
 #include <stdbool.h>
 #include <stdint.h>
@@ -99,13 +100,6 @@ enum BitstringFlags
 #endif
 };
 
-enum UnicodeTransformDecodeResult
-{
-    UnicodeTransformDecodeSuccess,
-    UnicodeTransformDecodeFail,
-    UnicodeTransformDecodeIncomplete
-};
-
 union maybe_unsigned_int8
 {
     uint8_t u;
@@ -320,20 +314,6 @@ static inline bool bitstring_insert_integer(term dst_bin, size_t offset, avm_int
  */
 bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size);
 
-/**
- * @brief Decode a character from UTF-8.
- *
- * @param buf the buffer from which to decode the string
- * @param len the length (in bytes) of the bytes in buf
- * @param c int value to decode to or NULL to only compute the size.
- * @param out_size the size in bytes, on output (if not NULL)
- * @return \c UnicodeTransformDecodeSuccess if decoding was successful,
- * \c UnicodeTransformDecodeFail if character starting at buf is not a valid
- * unicode character or \c UnicodeTransformDecodeIncomplete if character
- * starting at buf is a valid but incomplete transformation
- */
-enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);
-
 /**
  * @brief Encode a character to UTF-16.
  *
@@ -441,7 +421,7 @@ static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c
 {
     size_t byte_offset = offset >> 3; // divide by 8
     const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
-    return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
+    return unicode_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
 }
 
 /**
diff --git a/src/libAtomVM/interop.c b/src/libAtomVM/interop.c
index 744429618..cf139b5a5 100644
--- a/src/libAtomVM/interop.c
+++ b/src/libAtomVM/interop.c
@@ -385,7 +385,7 @@ static enum UnicodeConversionResult interop_binary_conversion(term t, uint8_t *o
     while (input_index < len) {
         size_t char_size;
         uint32_t c;
-        enum UnicodeTransformDecodeResult decode_result = bitstring_utf8_decode(input + input_index, len - input_index, &c, &char_size);
+        enum UnicodeTransformDecodeResult decode_result = unicode_utf8_decode(input + input_index, len - input_index, &c, &char_size);
         if (UNLIKELY(decode_result != UnicodeTransformDecodeSuccess)) {
             *rest_crsr = input_index;
             *output_len = result;
diff --git a/src/libAtomVM/nifs.c b/src/libAtomVM/nifs.c
index f020e247e..05746d4a7 100644
--- a/src/libAtomVM/nifs.c
+++ b/src/libAtomVM/nifs.c
@@ -2197,7 +2197,7 @@ static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[])
         for (size_t i = 0; i < encoded_len; i++) {
             size_t codepoint_size;
             uint32_t codepoint;
-            if (UNLIKELY(bitstring_utf8_decode(
+            if (UNLIKELY(unicode_utf8_decode(
                              &utf8_tmp_buf[in_pos], 2, &codepoint, &codepoint_size)
                         != UnicodeTransformDecodeSuccess
                     || (codepoint > 255))) {
@@ -2238,7 +2238,7 @@ static term make_list_from_utf8_buf(const uint8_t *buf, size_t buf_len, Context
         for (size_t i = 0; i < u8len; i++) {
             size_t codepoint_size;
             enum UnicodeTransformDecodeResult result
-                = bitstring_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
+                = unicode_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
             if (UNLIKELY((result != UnicodeTransformDecodeSuccess)
                     || !unicode_is_valid_codepoint(codepoints[i]))) {
                 AVM_ABORT();
diff --git a/src/libAtomVM/unicode.c b/src/libAtomVM/unicode.c
index 6f29b12dd..af43ccde3 100644
--- a/src/libAtomVM/unicode.c
+++ b/src/libAtomVM/unicode.c
@@ -21,6 +21,8 @@
 #include <stdbool.h>
 #include <stddef.h>
 
+#include "utils.h"
+
 #include "unicode.h"
 
 // Following utf8d table and decode function are covered by MIT license
@@ -63,6 +65,28 @@ static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte)
   return *state;
 }
 
+enum UnicodeTransformDecodeResult unicode_utf8_decode(
+    const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
+{
+  // Feed buf to the DFA one byte at a time, stopping at the first byte that
+  // either completes a code point or makes the sequence invalid.
+  uint32_t dfa_state = 0;
+  uint32_t decoded = 0;
+  for (size_t pos = 0; pos < len; pos++) {
+    dfa_state = decode(&dfa_state, &decoded, buf[pos]);
+    if (UNLIKELY(dfa_state == UTF8_REJECT)) {
+      return UnicodeTransformDecodeFail;
+    }
+    if (dfa_state == UTF8_ACCEPT) {
+      *c = decoded;
+      *out_size = pos + 1;
+      return UnicodeTransformDecodeSuccess;
+    }
+  }
+  // Input exhausted (or len == 0) before a full code point was seen.
+  return UnicodeTransformDecodeIncomplete;
+}
+
 bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len)
 {
   uint32_t codepoint = 0;
diff --git a/src/libAtomVM/unicode.h b/src/libAtomVM/unicode.h
index b23df803f..fa9b3b6e1 100644
--- a/src/libAtomVM/unicode.h
+++ b/src/libAtomVM/unicode.h
@@ -29,6 +29,13 @@
 extern "C" {
 #endif
 
+enum UnicodeTransformDecodeResult
+{
+    UnicodeTransformDecodeSuccess, // a complete, valid character was decoded
+    UnicodeTransformDecodeFail, // the input is not a valid encoded character
+    UnicodeTransformDecodeIncomplete // the input is valid so far but ends too early
+};
+
 size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len);
 bool unicode_buf_is_ascii(const uint8_t *buf, size_t buf_len);
 size_t unicode_latin1_buf_size_as_utf8(const uint8_t *buf, size_t len);
@@ -40,6 +47,21 @@ static inline bool unicode_is_valid_codepoint(uint32_t codepoint)
     return (codepoint < 0x110000) && !((codepoint > 0xD800) && (codepoint < 0xDFFF));
 }
 
+/**
+ * @brief Decode the first character of a UTF-8 encoded buffer.
+ *
+ * @param buf the buffer from which to decode the character
+ * @param len the number of bytes available in buf
+ * @param c receives the decoded code point on success; must not be NULL
+ * @param out_size receives the encoded size in bytes on success; must not be NULL
+ * @return \c UnicodeTransformDecodeSuccess if decoding was successful,
+ * \c UnicodeTransformDecodeFail if the bytes starting at buf are not a valid
+ * UTF-8 sequence or \c UnicodeTransformDecodeIncomplete if buf ends (or is
+ * empty) before the sequence starting at buf is complete
+ */
+enum UnicodeTransformDecodeResult unicode_utf8_decode(
+    const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);
+
 bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len);
 
 #ifdef __cplusplus