From c336d2f818f10fe9a8f5016483a2e20898da4d7a Mon Sep 17 00:00:00 2001 From: Davide Bettio Date: Sun, 11 Feb 2024 17:29:43 +0100 Subject: [PATCH 1/3] Improve utf8 atom support Allow also utf8 and unicode encodings in addition to latin1. Signed-off-by: Davide Bettio --- CHANGELOG.md | 9 +++ libs/estdlib/src/erlang.erl | 11 +-- src/libAtomVM/CMakeLists.txt | 2 + src/libAtomVM/atom_table.c | 15 ++++ src/libAtomVM/atom_table.h | 3 + src/libAtomVM/defaultatoms.c | 4 + src/libAtomVM/defaultatoms.h | 6 +- src/libAtomVM/interop.c | 56 ++++++++++++++ src/libAtomVM/interop.h | 1 + src/libAtomVM/nifs.c | 146 ++++++++++++++++++++++++++++++----- src/libAtomVM/unicode.c | 49 ++++++++++++ src/libAtomVM/unicode.h | 46 +++++++++++ 12 files changed, 322 insertions(+), 26 deletions(-) create mode 100644 src/libAtomVM/unicode.c create mode 100644 src/libAtomVM/unicode.h diff --git a/CHANGELOG.md b/CHANGELOG.md index f812c11d3..572d7fc2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,12 +6,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.6.0-beta.1] - Unreleased +### Added + +- Support for utf8 encoding to `*_to_atom` and `atom_to_*` functions + ### Fixed - ESP32: fix i2c_driver_acquire and i2c_driver_release functions, that were working only once. - Sending messages to registered processes using the `!` operator now works. - Fixed bug in `OP_SEND` that would accept sending a message to any integer or term without raising an error. +### Changed + +- `binary_to_atom/2` validates utf8 strings +- `*_to_atom` and `atom_to_*` properly convert latin1 (not just ASCII) to utf8 and viceversa + ## [0.6.0-beta.0] - 2024-02-08 ### Added diff --git a/libs/estdlib/src/erlang.erl b/libs/estdlib/src/erlang.erl index 689325124..799091f1b 100644 --- a/libs/estdlib/src/erlang.erl +++ b/libs/estdlib/src/erlang.erl @@ -117,6 +117,8 @@ %% * review API documentation for timer functions in this module %% +-type atom_encoding() :: latin1 | utf8 | unicode. 
+ -type mem_type() :: binary. -type time_unit() :: second | millisecond | microsecond. -type timestamp() :: { @@ -582,13 +584,12 @@ iolist_to_binary(_IOList) -> %%----------------------------------------------------------------------------- %% @param Binary Binary to convert to atom -%% @param Encoding encoding for conversion +%% @param Encoding encoding for conversion (any of latin1, utf8 or unicode) %% @returns an atom from passed binary %% @doc Convert a binary to atom. -%% Only latin1 encoded is supported. %% @end %%----------------------------------------------------------------------------- --spec binary_to_atom(Binary :: binary(), Encoding :: latin1) -> atom(). +-spec binary_to_atom(Binary :: binary(), Encoding :: atom_encoding()) -> atom(). binary_to_atom(_Binary, _Encoding) -> erlang:nif_error(undefined). @@ -614,13 +615,13 @@ binary_to_list(_Binary) -> %%----------------------------------------------------------------------------- %% @param Atom Atom to convert -%% @param Encoding Encoding for conversion +%% @param Encoding Encoding for conversion (any of latin1, utf8 or unicode) %% @returns a binary with the atom's name %% @doc Convert an atom to a binary. %% Only latin1 encoding is supported. %% @end %%----------------------------------------------------------------------------- --spec atom_to_binary(Atom :: atom(), Encoding :: latin1) -> binary(). +-spec atom_to_binary(Atom :: atom(), Encoding :: atom_encoding()) -> binary(). atom_to_binary(_Atom, _Encoding) -> erlang:nif_error(undefined). 
diff --git a/src/libAtomVM/CMakeLists.txt b/src/libAtomVM/CMakeLists.txt index 8b5a011a2..05c79303b 100644 --- a/src/libAtomVM/CMakeLists.txt +++ b/src/libAtomVM/CMakeLists.txt @@ -62,6 +62,7 @@ set(HEADER_FILES term.h timer_list.h trace.h + unicode.h utils.h valueshashtable.h ${CMAKE_CURRENT_BINARY_DIR}/avm_version.h @@ -94,6 +95,7 @@ set(SOURCE_FILES stacktrace.c term.c timer_list.c + unicode.c valueshashtable.c ) diff --git a/src/libAtomVM/atom_table.c b/src/libAtomVM/atom_table.c index 1c335664d..ed281dcb3 100644 --- a/src/libAtomVM/atom_table.c +++ b/src/libAtomVM/atom_table.c @@ -27,6 +27,7 @@ #include "atom.h" #include "smp.h" +#include "unicode.h" #include "utils.h" #ifndef AVM_NO_SMP @@ -279,6 +280,20 @@ atom_ref_t atom_table_get_atom_ptr_and_len(struct AtomTable *table, long index, return node; } +bool atom_table_is_atom_ref_ascii(struct AtomTable *table, atom_ref_t atom) +{ + SMP_RDLOCK(table); + + struct HNode *node = (struct HNode *) atom; + const uint8_t *data = atom_string_data(node->key); + size_t len = atom_string_len(node->key); + + bool result = unicode_buf_is_ascii(data, len); + + SMP_UNLOCK(table); + return result; +} + void atom_table_write_bytes(struct AtomTable *table, atom_ref_t atom, size_t buf_len, void *outbuf) { SMP_RDLOCK(table); diff --git a/src/libAtomVM/atom_table.h b/src/libAtomVM/atom_table.h index c9f108bd6..f919905d7 100644 --- a/src/libAtomVM/atom_table.h +++ b/src/libAtomVM/atom_table.h @@ -21,6 +21,8 @@ #ifndef _ATOM_TABLE_ #define _ATOM_TABLE_ +#include + #include "atom.h" #define ATOM_TABLE_NOT_FOUND -1 @@ -56,6 +58,7 @@ int atom_table_ensure_atoms( int atom_table_cmp_using_atom_index( struct AtomTable *table, int t_atom_index, int other_atom_index); atom_ref_t atom_table_get_atom_ptr_and_len(struct AtomTable *table, long index, size_t *out_len); +bool atom_table_is_atom_ref_ascii(struct AtomTable *table, atom_ref_t atom); void atom_table_write_bytes(struct AtomTable *table, atom_ref_t atom, size_t buf_len, void *outbuf); 
void atom_table_write_cstring( struct AtomTable *table, atom_ref_t atom, size_t buf_len, char *outbuf); diff --git a/src/libAtomVM/defaultatoms.c b/src/libAtomVM/defaultatoms.c index fff72c04f..a8645adfd 100644 --- a/src/libAtomVM/defaultatoms.c +++ b/src/libAtomVM/defaultatoms.c @@ -158,6 +158,8 @@ static const char *const fibonacci_atom = "\x9" "fibonacci"; static const char *const call_atom = "\x5" "$call"; static const char *const cast_atom = "\x5" "$cast"; +static const char *const unicode_atom = "\x7" "unicode"; + void defaultatoms_init(GlobalContext *glb) { int ok = 1; @@ -300,6 +302,8 @@ void defaultatoms_init(GlobalContext *glb) ok &= globalcontext_insert_atom(glb, call_atom) == CALL_ATOM_INDEX; ok &= globalcontext_insert_atom(glb, cast_atom) == CAST_ATOM_INDEX; + ok &= globalcontext_insert_atom(glb, unicode_atom) == UNICODE_ATOM_INDEX; + if (!ok) { AVM_ABORT(); } diff --git a/src/libAtomVM/defaultatoms.h b/src/libAtomVM/defaultatoms.h index c8a7d41ac..df61dee84 100644 --- a/src/libAtomVM/defaultatoms.h +++ b/src/libAtomVM/defaultatoms.h @@ -167,7 +167,9 @@ extern "C" { #define CALL_ATOM_INDEX 108 #define CAST_ATOM_INDEX 109 -#define PLATFORM_ATOMS_BASE_INDEX 110 +#define UNICODE_ATOM_INDEX 110 + +#define PLATFORM_ATOMS_BASE_INDEX 111 #define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX) #define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX) @@ -309,6 +311,8 @@ extern "C" { #define CALL_ATOM TERM_FROM_ATOM_INDEX(CALL_ATOM_INDEX) #define CAST_ATOM TERM_FROM_ATOM_INDEX(CAST_ATOM_INDEX) +#define UNICODE_ATOM TERM_FROM_ATOM_INDEX(UNICODE_ATOM_INDEX) + void defaultatoms_init(GlobalContext *glb); void platform_defaultatoms_init(GlobalContext *glb); diff --git a/src/libAtomVM/interop.c b/src/libAtomVM/interop.c index 89e7011a9..744429618 100644 --- a/src/libAtomVM/interop.c +++ b/src/libAtomVM/interop.c @@ -138,6 +138,62 @@ char *interop_list_to_string(term list, int *ok) return str; } +char *interop_list_to_utf8_string(term list, int *ok) +{ + size_t 
byte_len = 0; + + term t = list; + while (term_is_nonempty_list(t)) { + term head = term_get_list_head(t); + if (UNLIKELY(!term_is_integer(head))) { + *ok = 0; + return NULL; + } + avm_int_t codepoint = term_to_int(head); + if (UNLIKELY(codepoint < 0)) { + *ok = 0; + return NULL; + } else if (codepoint <= 127) { + byte_len++; + } else { + size_t codepoint_size; + bool is_encodable = bitstring_utf8_encode(codepoint, NULL, &codepoint_size); + if (UNLIKELY(!is_encodable)) { + *ok = 0; + return NULL; + } + byte_len += codepoint_size; + } + t = term_get_list_tail(t); + } + + if (!term_is_nil(t)) { + *ok = 0; + return NULL; + } + + uint8_t *str = malloc(byte_len + 1); + if (IS_NULL_PTR(str)) { + *ok = 0; + return NULL; + } + + t = list; + size_t i = 0; + while (i < byte_len) { + term codepoint_term = term_get_list_head(t); + size_t codepoint_size; + // list has been previously checked, no need to check again + bitstring_utf8_encode(term_to_int(codepoint_term), &str[i], &codepoint_size); + t = term_get_list_tail(t); + i += codepoint_size; + } + str[byte_len] = 0; + + *ok = 1; + return (char *) str; +} + char *interop_atom_to_string(Context *ctx, term atom) { GlobalContext *glb = ctx->global; diff --git a/src/libAtomVM/interop.h b/src/libAtomVM/interop.h index 3a69fc1f5..6cfc7df4b 100644 --- a/src/libAtomVM/interop.h +++ b/src/libAtomVM/interop.h @@ -67,6 +67,7 @@ typedef void (*interop_chardata_rest_fun)(term t, void *accum); char *interop_term_to_string(term t, int *ok); char *interop_binary_to_string(term binary); +char *interop_list_to_utf8_string(term list, int *ok); char *interop_list_to_string(term list, int *ok); char *interop_iolist_to_string(term list, int *ok); char *interop_atom_to_string(Context *ctx, term atom); diff --git a/src/libAtomVM/nifs.c b/src/libAtomVM/nifs.c index 16a03202a..c0ff15311 100644 --- a/src/libAtomVM/nifs.c +++ b/src/libAtomVM/nifs.c @@ -35,6 +35,7 @@ #include "avm_version.h" #include "avmpack.h" #include "bif.h" +#include "bitstring.h" 
#include "context.h" #include "defaultatoms.h" #include "dictionary.h" @@ -52,6 +53,7 @@ #include "synclist.h" #include "sys.h" #include "term.h" +#include "unicode.h" #include "utils.h" #define MAX_NIF_NAME_LEN 260 @@ -1945,9 +1947,7 @@ static term binary_to_atom(Context *ctx, int argc, term argv[], int create_new) term a_binary = argv[0]; VALIDATE_VALUE(a_binary, term_is_binary); - if (UNLIKELY(argv[1] != LATIN1_ATOM)) { - RAISE_ERROR(BADARG_ATOM); - } + term encoding = argv[1]; const char *atom_string = term_binary_data(a_binary); size_t atom_string_len = term_binary_size(a_binary); @@ -1955,9 +1955,49 @@ static term binary_to_atom(Context *ctx, int argc, term argv[], int create_new) RAISE_ERROR(SYSTEM_LIMIT_ATOM); } - AtomString atom = malloc(atom_string_len + 1); - ((uint8_t *) atom)[0] = atom_string_len; - memcpy(((char *) atom) + 1, atom_string, atom_string_len); + bool encode_latin1_to_utf8 = false; + if (UNLIKELY((encoding == LATIN1_ATOM) + && !unicode_buf_is_ascii((const uint8_t *) atom_string, atom_string_len))) { + encode_latin1_to_utf8 = true; + } else if (UNLIKELY((encoding != LATIN1_ATOM) && (encoding != UNICODE_ATOM) + && (encoding != UTF8_ATOM))) { + RAISE_ERROR(BADARG_ATOM); + } + + AtomString atom; + if (LIKELY(!encode_latin1_to_utf8)) { + size_t i = 0; + while (i < atom_string_len) { + uint32_t codepoint; + size_t codepoint_size; + if (UNLIKELY(bitstring_utf8_decode( + (uint8_t *) atom_string + i, atom_string_len - i, &codepoint, &codepoint_size) + != UnicodeTransformDecodeSuccess)) { + RAISE_ERROR(BADARG_ATOM); + } + i += codepoint_size; + } + + atom = malloc(atom_string_len + 1); + ((uint8_t *) atom)[0] = atom_string_len; + memcpy(((char *) atom) + 1, atom_string, atom_string_len); + } else { + // * 2 is the worst case size + size_t buf_len = atom_string_len * 2; + atom = malloc(buf_len + 1); + uint8_t *atom_data = ((uint8_t *) atom) + 1; + size_t out_pos = 0; + for (size_t i = 0; i < atom_string_len; i++) { + size_t out_size; +
bitstring_utf8_encode(((uint8_t) atom_string[i]), &atom_data[out_pos], &out_size); + out_pos += out_size; + } + if (out_pos > 255) { + free((void *) atom); + RAISE_ERROR(SYSTEM_LIMIT_ATOM); + } + ((uint8_t *) atom)[0] = out_pos; + } enum AtomTableCopyOpt atom_opts = AtomTableCopyAtom; if (!create_new) { @@ -1991,7 +2031,7 @@ term list_to_atom(Context *ctx, int argc, term argv[], int create_new) VALIDATE_VALUE(a_list, term_is_list); int ok; - char *atom_string = interop_list_to_string(a_list, &ok); + char *atom_string = interop_list_to_utf8_string(a_list, &ok); if (UNLIKELY(!ok)) { RAISE_ERROR(OUT_OF_MEMORY_ATOM); } @@ -2031,9 +2071,7 @@ static term nif_erlang_atom_to_binary_2(Context *ctx, int argc, term argv[]) term atom_term = argv[0]; VALIDATE_VALUE(atom_term, term_is_atom); - if (UNLIKELY(argv[1] != LATIN1_ATOM)) { - RAISE_ERROR(BADARG_ATOM); - } + term encoding = argv[1]; GlobalContext *glb = ctx->global; @@ -2041,13 +2079,50 @@ static term nif_erlang_atom_to_binary_2(Context *ctx, int argc, term argv[]) size_t atom_len; atom_ref_t atom_ref = atom_table_get_atom_ptr_and_len(glb->atom_table, atom_index, &atom_len); + bool encode_to_latin1 = false; + if (encoding == LATIN1_ATOM) { + if (UNLIKELY(!atom_table_is_atom_ref_ascii(glb->atom_table, atom_ref))) { + encode_to_latin1 = true; + } + } else if (UNLIKELY(encoding != UTF8_ATOM) && (encoding != UNICODE_ATOM)) { + RAISE_ERROR(BADARG_ATOM); + } + if (UNLIKELY(memory_ensure_free_opt(ctx, term_binary_heap_size(atom_len), MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) { RAISE_ERROR(OUT_OF_MEMORY_ATOM); } - term binary = term_create_uninitialized_binary(atom_len, &ctx->heap, glb); - atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, (char *) term_binary_data(binary)); - return binary; + if (LIKELY(!encode_to_latin1)) { + term binary = term_create_uninitialized_binary(atom_len, &ctx->heap, glb); + atom_table_write_bytes( + glb->atom_table, atom_ref, atom_len, (char *) term_binary_data(binary)); + return binary; + } 
else { + uint8_t *utf8_tmp_buf = malloc(atom_len); + if (IS_NULL_PTR(utf8_tmp_buf)) { + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } + atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, (char *) utf8_tmp_buf); + size_t encoded_len = unicode_buf_utf8_len(utf8_tmp_buf, atom_len); + term binary = term_create_uninitialized_binary(encoded_len, &ctx->heap, glb); + char *binary_data = (char *) term_binary_data(binary); + size_t in_pos = 0; + for (size_t i = 0; i < encoded_len; i++) { + size_t codepoint_size; + uint32_t codepoint; + if (UNLIKELY(bitstring_utf8_decode( + &utf8_tmp_buf[in_pos], 2, &codepoint, &codepoint_size) + != UnicodeTransformDecodeSuccess + || (codepoint > 255))) { + free(utf8_tmp_buf); + RAISE_ERROR(BADARG_ATOM); + } + binary_data[i] = codepoint; + in_pos += codepoint_size; + } + free(utf8_tmp_buf); + return binary; + } } static term nif_erlang_atom_to_list_1(Context *ctx, int argc, term argv[]) @@ -2069,18 +2144,49 @@ static term nif_erlang_atom_to_list_1(Context *ctx, int argc, term argv[]) RAISE_ERROR(OUT_OF_MEMORY_ATOM); } - if (UNLIKELY(memory_ensure_free_opt(ctx, atom_len * 2, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) { + atom_table_write_bytes(ctx->global->atom_table, atom_ref, atom_len, atom_buf); + + size_t u8len = unicode_buf_utf8_len((uint8_t *) atom_buf, atom_len); + bool is_latin1 = atom_len == u8len; + + size_t list_len = is_latin1 ? 
atom_len : u8len; + + if (UNLIKELY( + memory_ensure_free_opt(ctx, list_len * CONS_SIZE, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) { + free(atom_buf); RAISE_ERROR(OUT_OF_MEMORY_ATOM); } - atom_table_write_bytes(ctx->global->atom_table, atom_ref, atom_len, atom_buf); - term prev = term_nil(); - for (int i = atom_len - 1; i >= 0; i--) { - char c = atom_buf[i]; - prev = term_list_prepend(term_from_int11(c), prev, &ctx->heap); - } + if (is_latin1) { + for (int i = atom_len - 1; i >= 0; i--) { + char c = atom_buf[i]; + prev = term_list_prepend(term_from_int11(c), prev, &ctx->heap); + } + } else { + uint32_t *codepoints = malloc(u8len * sizeof(uint32_t)); + if (IS_NULL_PTR(codepoints)) { + free(atom_buf); + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } + uint8_t *u_in = (uint8_t *) atom_buf; + for (size_t i = 0; i < u8len; i++) { + size_t codepoint_size; + enum UnicodeTransformDecodeResult result + = bitstring_utf8_decode(u_in, atom_len, &codepoints[i], &codepoint_size); + if (UNLIKELY((result != UnicodeTransformDecodeSuccess) + || !unicode_is_valid_codepoint(codepoints[i]))) { + AVM_ABORT(); + } + u_in += codepoint_size; + } + + for (int i = u8len - 1; i >= 0; i--) { + prev = term_list_prepend(term_from_int(codepoints[i]), prev, &ctx->heap); + } + free(codepoints); + } free(atom_buf); return prev; diff --git a/src/libAtomVM/unicode.c b/src/libAtomVM/unicode.c new file mode 100644 index 000000000..38140937d --- /dev/null +++ b/src/libAtomVM/unicode.c @@ -0,0 +1,49 @@ +/* + * This file is part of AtomVM. + * + * Copyright 2024 Davide Bettio + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later + */ + +#include +#include + +#include "unicode.h" + +size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len) +{ + size_t count = 0; + + for (size_t i = 0; i < buf_len; i++) { + // we count either ASCII characters or the first byte of a unicode sequence + if ((buf[i] & 0xC0) != 0x80) { + count++; + } + } + + return count; +} + +bool unicode_buf_is_ascii(const uint8_t *buf, size_t len) +{ + for (size_t i = 0; i < len; i++) { + if (buf[i] > 0x7F) { + return false; + } + } + + return true; +} diff --git a/src/libAtomVM/unicode.h b/src/libAtomVM/unicode.h new file mode 100644 index 000000000..087f13126 --- /dev/null +++ b/src/libAtomVM/unicode.h @@ -0,0 +1,46 @@ +/* + * This file is part of AtomVM. + * + * Copyright 2024 Davide Bettio + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later + */ + +#ifndef _UNICODE_H_ +#define _UNICODE_H_ + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len); +bool unicode_buf_is_ascii(const uint8_t *buf, size_t buf_len); + +static inline bool unicode_is_valid_codepoint(uint32_t codepoint) +{ + // 0x110000 and above are not valid codepoints + // 0xD800 - 0xDFFF (both ends included) are surrogates + return (codepoint < 0x110000) && !((codepoint >= 0xD800) && (codepoint <= 0xDFFF)); +} + +#ifdef __cplusplus +} +#endif + +#endif From 7ba257fef01e9a7ecaeb6c1039c7f08a75155507 Mon Sep 17 00:00:00 2001 From: Davide Bettio Date: Fri, 16 Feb 2024 22:52:19 +0100 Subject: [PATCH 2/3] Add binary_to_atom/1 and binary_to_existing_atom/1 Both functions have been introduced with OTP23 and they default to utf8. Signed-off-by: Davide Bettio --- CHANGELOG.md | 1 + libs/estdlib/src/erlang.erl | 23 +++++++++++++++++++++++ src/libAtomVM/nifs.c | 4 +--- src/libAtomVM/nifs.gperf | 2 ++ 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 572d7fc2c..b04e54c46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Support for utf8 encoding to `*_to_atom` and `atom_to_*` functions +- `binary_to_atom/1` and `atom_to_binary/1` that default to utf8 (they were introduced with OTP23) ### Fixed diff --git a/libs/estdlib/src/erlang.erl b/libs/estdlib/src/erlang.erl index 799091f1b..b7901d386 100644 --- a/libs/estdlib/src/erlang.erl +++ b/libs/estdlib/src/erlang.erl @@ -53,9 +53,11 @@ list_to_integer/1, list_to_tuple/1, iolist_to_binary/1, + binary_to_atom/1, binary_to_atom/2, binary_to_integer/1, binary_to_list/1, + atom_to_binary/1, atom_to_binary/2, atom_to_list/1, float_to_binary/1, @@ -582,6 +584,16 @@ list_to_tuple(_List) -> iolist_to_binary(_IOList) ->
erlang:nif_error(undefined). +%%----------------------------------------------------------------------------- +%% @param Binary Binary to convert to atom +%% @returns an atom from passed binary +%% @doc Convert a binary to atom, defaults to utf8. +%% @end +%%----------------------------------------------------------------------------- +-spec binary_to_atom(Binary :: binary()) -> atom(). +binary_to_atom(_Binary) -> + erlang:nif_error(undefined). + %%----------------------------------------------------------------------------- %% @param Binary Binary to convert to atom %% @param Encoding encoding for conversion (any of latin1, utf8 or unicode) @@ -613,6 +625,17 @@ binary_to_integer(_Binary) -> binary_to_list(_Binary) -> erlang:nif_error(undefined). +%%----------------------------------------------------------------------------- +%% @param Atom Atom to convert +%% @returns a binary with the atom's name +%% @doc Convert an atom to a binary, defaults to utf8. +%% The returned binary is utf8 encoded. +%% @end +%%----------------------------------------------------------------------------- +-spec atom_to_binary(Atom :: atom()) -> binary(). +atom_to_binary(_Atom) -> + erlang:nif_error(undefined). + %%----------------------------------------------------------------------------- %% @param Atom Atom to convert %% @param Encoding Encoding for conversion (any of latin1, utf8 or unicode) diff --git a/src/libAtomVM/nifs.c b/src/libAtomVM/nifs.c index c0ff15311..89717cfaa 100644 --- a/src/libAtomVM/nifs.c +++ b/src/libAtomVM/nifs.c @@ -1942,12 +1942,10 @@ static term nif_erlang_binary_to_existing_atom_2(Context *ctx, int argc, term ar static term binary_to_atom(Context *ctx, int argc, term argv[], int create_new) { - UNUSED(argc); - term a_binary = argv[0]; VALIDATE_VALUE(a_binary, term_is_binary); - term encoding = argv[1]; + term encoding = (argc == 2) ?
argv[1] : UTF8_ATOM; const char *atom_string = term_binary_data(a_binary); size_t atom_string_len = term_binary_size(a_binary); diff --git a/src/libAtomVM/nifs.gperf b/src/libAtomVM/nifs.gperf index 249ab1714..3101a3099 100644 --- a/src/libAtomVM/nifs.gperf +++ b/src/libAtomVM/nifs.gperf @@ -39,10 +39,12 @@ binary:split/2, &binary_split_nif calendar:system_time_to_universal_time/2, &system_time_to_universal_time_nif erlang:atom_to_binary/2, &atom_to_binary_nif erlang:atom_to_list/1, &atom_to_list_nif +erlang:binary_to_atom/1, &binary_to_atom_nif erlang:binary_to_atom/2, &binary_to_atom_nif erlang:binary_to_float/1, &binary_to_float_nif erlang:binary_to_integer/1, &binary_to_integer_nif erlang:binary_to_list/1, &binary_to_list_nif +erlang:binary_to_existing_atom/1, &binary_to_existing_atom_nif erlang:binary_to_existing_atom/2, &binary_to_existing_atom_nif erlang:delete_element/2, &delete_element_nif erlang:erase/1, &erase_nif From 708f994e81e86bdf8d8e14bd74ef65e73b9ecc74 Mon Sep 17 00:00:00 2001 From: Davide Bettio Date: Sat, 17 Feb 2024 00:47:13 +0100 Subject: [PATCH 3/3] tests: add test_utf8_atoms Test conversion from binary/list to unicode atoms and viceversa, and a number of other corner cases. 
Signed-off-by: Davide Bettio --- tests/erlang_tests/CMakeLists.txt | 2 + tests/erlang_tests/test_utf8_atoms.erl | 236 +++++++++++++++++++++++++ tests/test.c | 1 + 3 files changed, 239 insertions(+) create mode 100644 tests/erlang_tests/test_utf8_atoms.erl diff --git a/tests/erlang_tests/CMakeLists.txt b/tests/erlang_tests/CMakeLists.txt index 0e1b9ea6f..61e9336ff 100644 --- a/tests/erlang_tests/CMakeLists.txt +++ b/tests/erlang_tests/CMakeLists.txt @@ -490,6 +490,7 @@ compile_erlang(test_crypto_strong_rand_bytes) compile_erlang(test_atomvm_random) compile_erlang(float_decode) +compile_erlang(test_utf8_atoms) add_custom_target(erlang_test_modules DEPENDS code_load_files @@ -945,4 +946,5 @@ add_custom_target(erlang_test_modules DEPENDS test_atomvm_random.beam float_decode.beam + test_utf8_atoms.beam ) diff --git a/tests/erlang_tests/test_utf8_atoms.erl b/tests/erlang_tests/test_utf8_atoms.erl new file mode 100644 index 000000000..90835d26f --- /dev/null +++ b/tests/erlang_tests/test_utf8_atoms.erl @@ -0,0 +1,236 @@ +% +% This file is part of AtomVM. +% +% Copyright 2024 Davide Bettio +% +% Licensed under the Apache License, Version 2.0 (the "License"); +% you may not use this file except in compliance with the License. +% You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +% See the License for the specific language governing permissions and +% limitations under the License. +% +% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later +% + +-module(test_utf8_atoms). +-export([start/0, conv/3, get_atom/1, get_list/1, get_binary/1, conv2/3]). + +start() -> + 32767 - test_from_atom() + + 4095 - test_to_atom() + + 63 - test_missing_atom() + + 7 - test_latin1_convs() + + 7 - test_invalid_bins(). 
+ +test_latin1_convs() -> + comp(?MODULE:conv2(binary, l1, ?MODULE:get_binary(l1s)), 'µÃ\230Ã¥') + + comp(?MODULE:conv2(binary, l1, ?MODULE:get_binary(l1_mixed)), get_atom(l1_mixed)) * 2 + + comp(?MODULE:conv(binary, l1, ?MODULE:get_atom(l1_mixed)), get_binary(l1_mixed)) * 4. + +test_from_atom() -> + test_to_list(l1) + + test_to_list(l1s) * 2 + + test_to_list(gr) * 4 + + test_to_list(jp) * 8 + + test_to_list(jp_mixed) * 16 + + test_to_l1bin(l1) * 32 + + test_to_l1bin(l1s_plain) * 64 + + test_to_l1bincatch(gr) * 128 + + test_to_l1bincatch(jp) * 256 + + test_to_l1bincatch(jp_mixed) * 512 + + test_to_u8bin(l1) * 1024 + + test_to_u8bin(l1s) * 2048 + + test_to_u8bin(gr) * 4096 + + test_to_u8bin(jp) * 8192 + + test_to_u8bin(jp_mixed) * 16384. + +test_to_atom() -> + test_from_list(l1) + + test_from_list(l1s) * 2 + + test_from_list(gr) * 4 + + test_from_list(jp) * 8 + + test_from_list(jp_mixed) * 16 + + test_from_l1bin(l1) * 32 + + test_from_l1bin(l1s_plain) * 64 + + test_from_u8bin(l1) * 128 + + test_from_u8bin(l1s) * 256 + + test_from_u8bin(gr) * 512 + + test_from_u8bin(jp) * 1024 + + test_from_u8bin(jp_mixed) * 2048. + +test_missing_atom() -> + comp( + erlang:list_to_atom(get_list(l1s_missing)), + erlang:binary_to_atom(get_binary(l1s_missing), utf8) + ) + + comp( + erlang:list_to_atom(get_list(jp_mixed_missing)), + erlang:binary_to_atom(get_binary(jp_mixed_missing), utf8) + ) * 2 + + comp( + erlang:list_to_atom(get_list(l1s_missing)), + erlang:binary_to_atom(get_binary(l1s_missing), unicode) + ) * 4 + + comp( + erlang:list_to_atom(get_list(jp_mixed_missing)), + erlang:binary_to_atom(get_binary(jp_mixed_missing), unicode) + ) * 8 + + comp_opt( + fun() -> erlang:list_to_atom(get_list(l1s_missing)) end, + fun() -> erlang:binary_to_atom(get_binary(l1s_missing)) end + ) * 16 + + comp_opt( + fun() -> erlang:list_to_atom(get_list(jp_mixed_missing)) end, + fun() -> erlang:binary_to_atom(get_binary(jp_mixed_missing)) end + ) * 32. 
+ +test_invalid_bins() -> + test_from_u8bincatch(invalid1) + + test_from_u8bincatch(invalid2) * 2 + + test_from_u8bincatch(invalid3) * 4. + +test_to_list(Id) -> + case ?MODULE:conv(list, x, ?MODULE:get_atom(Id)) == ?MODULE:get_list(Id) of + true -> + 1; + false -> + erlang:display({list, Id}), + 0 + end. + +test_to_l1bin(Id) -> + case ?MODULE:conv(binary, l1, ?MODULE:get_atom(Id)) == ?MODULE:get_binary(Id) of + true -> + 1; + false -> + erlang:display({l1bin, Id}), + 0 + end. + +test_to_l1bincatch(Id) -> + try ?MODULE:conv(binary, l1, ?MODULE:get_atom(Id)) of + _X -> + erlang:display({err, Id}), + 0 + catch + error:badarg -> + 1 + end. + +test_to_u8bin(Id) -> + case ?MODULE:conv(binary, u8, ?MODULE:get_atom(Id)) == ?MODULE:get_binary(Id) of + true -> + 1; + false -> + erlang:display({u8bin, Id}), + 0 + end. + +test_from_list(Id) -> + case ?MODULE:conv2(list, x, ?MODULE:get_list(Id)) == ?MODULE:get_atom(Id) of + true -> + 1; + false -> + erlang:display({flist, Id}), + 0 + end. + +test_from_l1bin(Id) -> + case ?MODULE:conv2(binary, l1, ?MODULE:get_binary(Id)) == ?MODULE:get_atom(Id) of + true -> + 1; + false -> + erlang:display({fl1bin, Id}), + 0 + end. + +test_from_u8bin(Id) -> + case ?MODULE:conv2(binary, u8, ?MODULE:get_binary(Id)) == ?MODULE:get_atom(Id) of + true -> + 1; + false -> + erlang:display({fu8bin, Id}), + 0 + end. + +test_from_u8bincatch(Id) -> + try ?MODULE:conv2(binary, u8, ?MODULE:get_binary(Id)) of + _X -> + erlang:display({u8err, Id}), + 0 + catch + error:badarg -> + 1 + end. + +conv(list, _Fmt, Atom) -> + erlang:atom_to_list(Atom); +conv(binary, l1, Atom) -> + erlang:atom_to_binary(Atom, latin1); +conv(binary, u8, Atom) -> + erlang:atom_to_binary(Atom, utf8). + +conv2(list, _Fmt, S) -> + erlang:list_to_atom(S); +conv2(binary, l1, S) -> + erlang:binary_to_atom(S, latin1); +conv2(binary, u8, S) -> + erlang:binary_to_atom(S, utf8). + +comp(A, A) -> 1; +comp(_A, _B) -> 0.
+ +comp_opt(Fun1, Fun2) -> + case erlang:system_info(machine) of + "BEAM" -> + case erlang:system_info(otp_release) of + Version when Version >= "23" -> comp(Fun1(), Fun2()); + _OldVersion -> 1 + end; + _ -> + comp(Fun1(), Fun2()) + end. + +get_atom(Id) -> + case Id of + l1 -> 'abcd'; + l1_mixed -> 'testé'; + l1s -> 'µØå'; + l1s_plain -> 'µØå'; + gr -> 'ΓΔ'; + jp -> 'アーラン'; + jp_mixed -> 'latin1じゃない' + end. + +get_list(Id) -> + case Id of + l1 -> "abcd"; + l1s -> "µØå"; + l1s_missing -> "µ_å"; + gr -> "ΓΔ"; + jp -> "アーラン"; + jp_mixed -> "latin1じゃない"; + jp_mixed_missing -> "latin1_じゃない" + end. + +get_binary(Id) -> + case Id of + l1 -> <<"abcd"/utf8>>; + l1_mixed -> <<"testé">>; + l1s_plain -> <<"µØå">>; + l1s -> <<"µØå"/utf8>>; + l1s_missing -> <<"µ_å"/utf8>>; + gr -> <<"ΓΔ"/utf8>>; + jp -> <<"アーラン"/utf8>>; + jp_mixed -> <<"latin1じゃない"/utf8>>; + jp_mixed_missing -> <<"latin1_じゃない"/utf8>>; + invalid1 -> <<230>>; + invalid2 -> <<16#f0, 16#90, 16#28, 16#bc>>; + invalid3 -> <<16#fc, 16#a1, 16#a1, 16#a1, 16#a1, 16#a1>> + end. diff --git a/tests/test.c b/tests/test.c index 71ca55c38..aecf2ac76 100644 --- a/tests/test.c +++ b/tests/test.c @@ -520,6 +520,7 @@ struct Test tests[] = { #endif TEST_CASE(float_decode), + TEST_CASE(test_utf8_atoms), // TEST CRASHES HERE: TEST_CASE(memlimit), { NULL, 0, false, false }