Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve unicode atom support #1038

Merged
merged 3 commits into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [0.6.0-beta.1] - Unreleased

### Added

- Support for utf8 encoding to `*_to_atom` and `atom_to_*` functions
- `binary_to_atom/1` and `atom_to_binary/1` that default to utf8 (they were introduced with OTP23)

### Fixed

- ESP32: fix i2c_driver_acquire and i2c_driver_release functions, which were working only once.
- Sending messages to registered processes using the `!` operator now works.
- Fixed bug in `OP_SEND` that would accept sending a message to any integer or term without raising an error.

### Changed

- `binary_to_atom/2` validates utf8 strings
- `*_to_atom` and `atom_to_*` properly convert latin1 (not just ASCII) to utf8 and vice versa

## [0.6.0-beta.0] - 2024-02-08

### Added
Expand Down
34 changes: 29 additions & 5 deletions libs/estdlib/src/erlang.erl
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,11 @@
list_to_integer/1,
list_to_tuple/1,
iolist_to_binary/1,
binary_to_atom/1,
binary_to_atom/2,
binary_to_integer/1,
binary_to_list/1,
atom_to_binary/1,
atom_to_binary/2,
atom_to_list/1,
float_to_binary/1,
Expand Down Expand Up @@ -117,6 +119,8 @@
%% * review API documentation for timer functions in this module
%%

%% Encoding names accepted by the two-argument atom/binary conversion
%% functions (binary_to_atom/2 and atom_to_binary/2) in this module.
-type atom_encoding() :: latin1 | utf8 | unicode.

-type mem_type() :: binary.
-type time_unit() :: second | millisecond | microsecond.
-type timestamp() :: {
Expand Down Expand Up @@ -582,13 +586,22 @@ iolist_to_binary(_IOList) ->

%%-----------------------------------------------------------------------------
%% @param Binary Binary to convert to atom
%% @returns an atom from passed binary
%% @doc Convert a binary to atom, defaults to utf8.
%% The Erlang body below is only a stub that raises `undefined'; the
%% conversion itself is presumably supplied natively by the VM.
%% @end
%%-----------------------------------------------------------------------------
-spec binary_to_atom(Binary :: binary()) -> atom().
binary_to_atom(_Binary) ->
erlang:nif_error(undefined).

%%-----------------------------------------------------------------------------
%% @param   Binary Binary to convert to atom
%% @param   Encoding encoding for conversion (any of latin1, utf8 or unicode)
%% @returns an atom from passed binary
%% @doc     Convert a binary to atom.
%%          The Erlang body below is only a stub that raises `undefined';
%%          the conversion itself is presumably supplied natively by the VM.
%% @end
%%-----------------------------------------------------------------------------
-spec binary_to_atom(Binary :: binary(), Encoding :: atom_encoding()) -> atom().
binary_to_atom(_Binary, _Encoding) ->
    erlang:nif_error(undefined).

Expand All @@ -614,13 +627,24 @@ binary_to_list(_Binary) ->

%%-----------------------------------------------------------------------------
%% @param Atom Atom to convert
%% @returns a binary with the atom's name
%% @doc Convert an atom to a binary, defaults to utf8.
%% The Erlang body below is only a stub that raises `undefined'; the
%% conversion itself is presumably supplied natively by the VM.
%% @end
%%-----------------------------------------------------------------------------
-spec atom_to_binary(Atom :: atom()) -> binary().
atom_to_binary(_Atom) ->
erlang:nif_error(undefined).

%%-----------------------------------------------------------------------------
%% @param   Atom Atom to convert
%% @param   Encoding Encoding for conversion (any of latin1, utf8 or unicode)
%% @returns a binary with the atom's name
%% @doc     Convert an atom to a binary.
%%          The Erlang body below is only a stub that raises `undefined';
%%          the conversion itself is presumably supplied natively by the VM.
%% @end
%%-----------------------------------------------------------------------------
-spec atom_to_binary(Atom :: atom(), Encoding :: atom_encoding()) -> binary().
atom_to_binary(_Atom, _Encoding) ->
    erlang:nif_error(undefined).

Expand Down
2 changes: 2 additions & 0 deletions src/libAtomVM/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ set(HEADER_FILES
term.h
timer_list.h
trace.h
unicode.h
utils.h
valueshashtable.h
${CMAKE_CURRENT_BINARY_DIR}/avm_version.h
Expand Down Expand Up @@ -94,6 +95,7 @@ set(SOURCE_FILES
stacktrace.c
term.c
timer_list.c
unicode.c
valueshashtable.c
)

Expand Down
15 changes: 15 additions & 0 deletions src/libAtomVM/atom_table.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

#include "atom.h"
#include "smp.h"
#include "unicode.h"
#include "utils.h"

#ifndef AVM_NO_SMP
Expand Down Expand Up @@ -279,6 +280,20 @@ atom_ref_t atom_table_get_atom_ptr_and_len(struct AtomTable *table, long index,
return node;
}

/**
 * Report whether the name of the atom referenced by `atom` passes
 * unicode_buf_is_ascii().
 *
 * The table is held via SMP_RDLOCK for the duration of the check and
 * released with SMP_UNLOCK before returning.
 *
 * @param table the atom table the reference belongs to
 * @param atom  atom reference; it is treated as a pointer to the table's
 *              struct HNode entry
 * @return the result of unicode_buf_is_ascii() over the atom's name bytes
 */
bool atom_table_is_atom_ref_ascii(struct AtomTable *table, atom_ref_t atom)
{
    SMP_RDLOCK(table);

    struct HNode *entry = (struct HNode *) atom;
    const uint8_t *name_bytes = atom_string_data(entry->key);
    size_t name_len = atom_string_len(entry->key);
    bool is_ascii = unicode_buf_is_ascii(name_bytes, name_len);

    SMP_UNLOCK(table);

    return is_ascii;
}

void atom_table_write_bytes(struct AtomTable *table, atom_ref_t atom, size_t buf_len, void *outbuf)
{
SMP_RDLOCK(table);
Expand Down
3 changes: 3 additions & 0 deletions src/libAtomVM/atom_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#ifndef _ATOM_TABLE_
#define _ATOM_TABLE_

#include <stdbool.h>

#include "atom.h"

#define ATOM_TABLE_NOT_FOUND -1
Expand Down Expand Up @@ -56,6 +58,7 @@ int atom_table_ensure_atoms(
int atom_table_cmp_using_atom_index(
struct AtomTable *table, int t_atom_index, int other_atom_index);
atom_ref_t atom_table_get_atom_ptr_and_len(struct AtomTable *table, long index, size_t *out_len);
bool atom_table_is_atom_ref_ascii(struct AtomTable *table, atom_ref_t atom);
void atom_table_write_bytes(struct AtomTable *table, atom_ref_t atom, size_t buf_len, void *outbuf);
void atom_table_write_cstring(
struct AtomTable *table, atom_ref_t atom, size_t buf_len, char *outbuf);
Expand Down
4 changes: 4 additions & 0 deletions src/libAtomVM/defaultatoms.c
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ static const char *const fibonacci_atom = "\x9" "fibonacci";
static const char *const call_atom = "\x5" "$call";
static const char *const cast_atom = "\x5" "$cast";

static const char *const unicode_atom = "\x7" "unicode";

void defaultatoms_init(GlobalContext *glb)
{
int ok = 1;
Expand Down Expand Up @@ -300,6 +302,8 @@ void defaultatoms_init(GlobalContext *glb)
ok &= globalcontext_insert_atom(glb, call_atom) == CALL_ATOM_INDEX;
ok &= globalcontext_insert_atom(glb, cast_atom) == CAST_ATOM_INDEX;

ok &= globalcontext_insert_atom(glb, unicode_atom) == UNICODE_ATOM_INDEX;

if (!ok) {
AVM_ABORT();
}
Expand Down
6 changes: 5 additions & 1 deletion src/libAtomVM/defaultatoms.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,9 @@ extern "C" {
#define CALL_ATOM_INDEX 108
#define CAST_ATOM_INDEX 109

#define UNICODE_ATOM_INDEX 110

// Index 110 is taken by UNICODE_ATOM_INDEX above, so the platform atom base
// has to be the next free index. It is defined exactly once.
#define PLATFORM_ATOMS_BASE_INDEX 111

#define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX)
#define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX)
Expand Down Expand Up @@ -309,6 +311,8 @@ extern "C" {
#define CALL_ATOM TERM_FROM_ATOM_INDEX(CALL_ATOM_INDEX)
#define CAST_ATOM TERM_FROM_ATOM_INDEX(CAST_ATOM_INDEX)

#define UNICODE_ATOM TERM_FROM_ATOM_INDEX(UNICODE_ATOM_INDEX)

void defaultatoms_init(GlobalContext *glb);

void platform_defaultatoms_init(GlobalContext *glb);
Expand Down
56 changes: 56 additions & 0 deletions src/libAtomVM/interop.c
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,62 @@ char *interop_list_to_string(term list, int *ok)
return str;
}

/**
 * Encode a proper list of integer code points as a NUL-terminated UTF-8
 * string.
 *
 * The list is walked twice: the first pass validates every element and adds
 * up the encoded size, the second pass writes the bytes into a buffer
 * obtained from malloc(). This function never frees that buffer, so the
 * caller is presumably expected to free() it.
 *
 * Conversion fails (returns NULL with *ok set to 0) when:
 *  - an element is not an integer,
 *  - an element is negative or above 0x10FFFF (the highest Unicode code
 *    point),
 *  - bitstring_utf8_encode() reports the code point as not encodable,
 *  - the list is improper (its final tail is not nil),
 *  - the buffer cannot be allocated.
 *
 * @param list term holding the list of code points
 * @param ok   out parameter, set to 1 on success and 0 on failure
 * @return the encoded string on success, NULL on failure
 */
char *interop_list_to_utf8_string(term list, int *ok)
{
    size_t byte_len = 0;

    // First pass: validate each element and compute the encoded length.
    term t = list;
    while (term_is_nonempty_list(t)) {
        term head = term_get_list_head(t);
        if (UNLIKELY(!term_is_integer(head))) {
            *ok = 0;
            return NULL;
        }
        avm_int_t codepoint = term_to_int(head);
        // Range-check on the full-width integer before it reaches the
        // encoder: if the encoder takes a narrower (32-bit) argument, an
        // oversized value must not be truncated into a valid code point.
        if (UNLIKELY(codepoint < 0 || codepoint > 0x10FFFF)) {
            *ok = 0;
            return NULL;
        } else if (codepoint <= 127) {
            // ASCII is always a single byte, no need to ask the encoder.
            byte_len++;
        } else {
            size_t codepoint_size;
            // A NULL buffer is passed here: only the size is needed.
            bool is_encodable = bitstring_utf8_encode(codepoint, NULL, &codepoint_size);
            if (UNLIKELY(!is_encodable)) {
                *ok = 0;
                return NULL;
            }
            byte_len += codepoint_size;
        }
        t = term_get_list_tail(t);
    }

    // The walk must end on nil, otherwise the list is improper.
    if (!term_is_nil(t)) {
        *ok = 0;
        return NULL;
    }

    // One extra byte for the terminating NUL.
    uint8_t *str = malloc(byte_len + 1);
    if (IS_NULL_PTR(str)) {
        *ok = 0;
        return NULL;
    }

    // Second pass: write the encoded bytes.
    t = list;
    size_t i = 0;
    while (i < byte_len) {
        term codepoint_term = term_get_list_head(t);
        size_t codepoint_size;
        // list has been previously checked, no need to check again
        bitstring_utf8_encode(term_to_int(codepoint_term), &str[i], &codepoint_size);
        t = term_get_list_tail(t);
        i += codepoint_size;
    }
    str[byte_len] = 0;

    *ok = 1;
    return (char *) str;
}

char *interop_atom_to_string(Context *ctx, term atom)
{
GlobalContext *glb = ctx->global;
Expand Down
1 change: 1 addition & 0 deletions src/libAtomVM/interop.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ typedef void (*interop_chardata_rest_fun)(term t, void *accum);

char *interop_term_to_string(term t, int *ok);
char *interop_binary_to_string(term binary);
char *interop_list_to_utf8_string(term list, int *ok);
char *interop_list_to_string(term list, int *ok);
char *interop_iolist_to_string(term list, int *ok);
char *interop_atom_to_string(Context *ctx, term atom);
Expand Down
Loading
Loading