Skip to content

Commit

Permalink
Encode atoms using UTF-8 in all cases, in line with OTP-26
Browse files Browse the repository at this point in the history
Signed-off-by: Fred Dushin <fred@dushin.net>
  • Loading branch information
fadushin committed Feb 25, 2024
1 parent 521618b commit cdd10da
Show file tree
Hide file tree
Showing 10 changed files with 253 additions and 41 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ used)
- New atom table, which uses less memory, has improved performances and better code.
- SPI: when gpio number is not provided for `miso` or `mosi` default to disabled
- Change port call tuple format to the same format as gen_server, so casts can be supported too
- Use UTF-8 encoding for atoms when using `erlang:term_to_binary/1,2`, in conformance with OTP-26

### Fixed

Expand Down
24 changes: 24 additions & 0 deletions doc/src/programmers-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,30 @@ The currently supported keys are enumerated in the following table:

See the `word_size` key in the [System APIs](#system-apis) section for information about how to find the number of bytes used in a machine word on the current platform.

### External Term Format

The `erlang:term_to_binary/1,2` and `erlang:binary_to_term/1,2` functions can be used to serialize arbitrary term data into and out of binary data. These operations can be useful for applications that wish to share term data over some network protocol, such as HTTP or MQTT, or wish to store serialized term data in some permanent storage (e.g., non-volatile storage on ESP32 devices).

For example, to convert a term to a binary, use `erlang:term_to_binary/1,2`, e.g.,

%% erlang
Term = ...
Binary = erlang:term_to_binary(Term),

And to convert the binary back to a term, use `erlang:binary_to_term/1,2`, e.g.,

%% erlang
Binary = ...
{Term, _Used} = erlang:binary_to_term(Binary, [used]),

By default, AtomVM will encode all atoms using UTF-8 encoding. This encoding is the default encoding for OTP-26 and later releases. If you would like to use the legacy Latin1 encoding for atoms that do not contain UTF-8 extended characters, provide the `{minor_version, 1}` option to the `erlang:term_to_binary/2` function. For example:

%% erlang
Term = ...
Binary = erlang:term_to_binary(Term, [{minor_version, 1}]),

For more information about the Erlang external term format, consult the [Erlang Documentation](https://www.erlang.org/doc/apps/erts/erl_ext_dist.html).

### System APIs

You can obtain system information about the AtomVM virtual machine via the [`erlang:system_info/1`](./apidocs/erlang/estdlib/erlang.md#system_info1) function, which takes an atom parameter designating the desired datum. Allowable parameters include
Expand Down
20 changes: 19 additions & 1 deletion libs/estdlib/src/erlang.erl
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@
garbage_collect/1,
binary_to_term/1,
term_to_binary/1,
term_to_binary/2,
timestamp/0,
universaltime/0,
localtime/0
Expand Down Expand Up @@ -1149,7 +1150,7 @@ binary_to_term(_Binary) ->
erlang:nif_error(undefined).

%%-----------------------------------------------------------------------------
%% @returns A binary encoding passed term.
%% @returns the binary encoding of a term
%% @param Term term to encode
%% @doc Encode a term to a binary that can later be decoded with `binary_to_term/1'.
%% This function should be mostly compatible with its Erlang/OTP counterpart.
Expand All @@ -1161,6 +1162,23 @@ binary_to_term(_Binary) ->
term_to_binary(_Term) ->
erlang:nif_error(undefined).

%%-----------------------------------------------------------------------------
%% @returns the binary encoding of a term
%% @param Term term to encode
%% @param Options encoding options. Currently, the only supported encoding
%% option is `{minor_version, 1}', which will encode atoms using
%% latin1 encoding, if the atom does not contain any extended UTF-8
%% characters. Without this option, atoms are encoded using UTF-8.
%% A `minor_version' value other than `1' raises `badarg'.
%% @doc Encode a term to a binary that can later be decoded with `binary_to_term/1'.
%% This function should be mostly compatible with its Erlang/OTP counterpart.
%% Unlike modern Erlang/OTP, resources are currently serialized as empty
%% binaries.
%% @end
%%-----------------------------------------------------------------------------
-spec term_to_binary(Term :: any(), Options :: [{minor_version, 1}]) -> binary().
term_to_binary(_Term, _Options) ->
erlang:nif_error(undefined).

%%-----------------------------------------------------------------------------
%% @returns A tuple representing the current timestamp.
%% @see monotonic_time/1
Expand Down
6 changes: 3 additions & 3 deletions src/libAtomVM/bitstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size
v |= (buf[3] & 0x3F);
// overlong encoding or invalid codepoint
if (v <= 0x10000 || v > 0x10FFFF) {
return false;
return UnicodeTransformDecodeFail;
}
*c = v;
*out_size = 4;
Expand All @@ -165,7 +165,7 @@ enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size
v |= (buf[2] & 0x3F);
// overlong encoding or surrogate
if (v < 0x800 || (v >= 0xD800 && v <= 0xDFFF)) {
return false;
return UnicodeTransformDecodeFail;
}
*c = v;
*out_size = 3;
Expand All @@ -176,7 +176,7 @@ enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size
v |= (buf[1] & 0x3F);
// overlong encoding
if (v < 0x80) {
return false;
return UnicodeTransformDecodeFail;
}
*c = v;
*out_size = 2;
Expand Down
99 changes: 77 additions & 22 deletions src/libAtomVM/externalterm.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "externalterm.h"

#include "bitstring.h"
#include "context.h"
#include "list.h"

Expand Down Expand Up @@ -71,9 +72,9 @@

static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm_size, bool copy, Heap *heap, GlobalContext *glb);
static int calculate_heap_usage(const uint8_t *external_term_buf, size_t remaining, size_t *eterm_size, bool copy);
static size_t compute_external_size(term t, GlobalContext *glb);
static int externalterm_from_term(uint8_t **buf, size_t *len, term t, GlobalContext *glb);
static int serialize_term(uint8_t *buf, term t, GlobalContext *glb);
static size_t compute_external_size(term t, ExternalTermOpts opts, GlobalContext *glb);
static int externalterm_from_term(uint8_t **buf, size_t *len, term t, ExternalTermOpts opts, GlobalContext *glb);
static int serialize_term(uint8_t *buf, term t, ExternalTermOpts opts, GlobalContext *glb);

/**
* @brief
Expand Down Expand Up @@ -162,27 +163,27 @@ enum ExternalTermResult externalterm_from_binary(Context *ctx, term *dst, term b
}
}

static int externalterm_from_term(uint8_t **buf, size_t *len, term t, GlobalContext *glb)
static int externalterm_from_term(uint8_t **buf, size_t *len, term t, ExternalTermOpts opts, GlobalContext *glb)
{
*len = compute_external_size(t, glb) + 1;
*len = compute_external_size(t, opts, glb) + 1;
*buf = malloc(*len);
if (IS_NULL_PTR(*buf)) {
fprintf(stderr, "Unable to allocate %zu bytes for externalized term.\n", *len);
AVM_ABORT();
}
size_t k = serialize_term(*buf + 1, t, glb);
size_t k = serialize_term(*buf + 1, t, opts, glb);
*buf[0] = EXTERNAL_TERM_TAG;
return k + 1;
}

term externalterm_to_binary(Context *ctx, term t)
term externalterm_to_binary(Context *ctx, term t, ExternalTermOpts opts)
{
//
// convert
//
uint8_t *buf;
size_t len;
externalterm_from_term(&buf, &len, t, ctx->global);
externalterm_from_term(&buf, &len, t, opts, ctx->global);
//
// Ensure enough free space in heap for binary
//
Expand All @@ -199,9 +200,9 @@ term externalterm_to_binary(Context *ctx, term t)
return binary;
}

static size_t compute_external_size(term t, GlobalContext *glb)
static size_t compute_external_size(term t, ExternalTermOpts opts, GlobalContext *glb)
{
return serialize_term(NULL, t, glb);
return serialize_term(NULL, t, opts, glb);
}

static uint8_t get_num_bytes(avm_uint64_t val)
Expand All @@ -225,7 +226,47 @@ static void write_bytes(uint8_t *buf, avm_uint64_t val)
}
}

static int serialize_term(uint8_t *buf, term t, GlobalContext *glb)
static bool has_extended_utf8_encoding(const uint8_t *atom_data, size_t atom_len)
{
for (size_t i = 0; i < atom_len; ) {
size_t out_len = 0;
uint32_t c;
enum UnicodeTransformDecodeResult res = bitstring_utf8_decode(
atom_data + i,
atom_len - i,
&c,
&out_len
);
if (res == UnicodeTransformDecodeSuccess && out_len != 1) {
return true;
} else {
++i;
}
}
return false;
}

/**
 * @brief Emit an atom using the ATOM_EXT tag.
 *
 * @details The header is three bytes: the ATOM_EXT tag followed by a 16-bit
 * length field. The atom bytes are copied right after the header. When buf is
 * NULL nothing is written and only the header size is reported, so that the
 * caller can compute the required buffer size.
 *
 * @param buf destination buffer, or NULL to only compute the header size.
 * @param atom_ref reference to the atom in the atom table.
 * @param atom_len length of the atom in bytes.
 * @param offset set to the size of the header (always 3).
 * @param glb the global context holding the atom table.
 */
static inline void encode_atom_latin1(uint8_t *buf, atom_ref_t atom_ref, size_t atom_len, int *offset, GlobalContext *glb)
{
    // 1 tag byte + 2 length bytes
    *offset = 3;

    if (IS_NULL_PTR(buf)) {
        return;
    }

    uint8_t *length_field = buf + 1;
    uint8_t *payload = buf + 3;

    buf[0] = ATOM_EXT;
    WRITE_16_UNALIGNED(length_field, atom_len);
    atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, payload);
}

/**
 * @brief Emit an atom using the SMALL_ATOM_UTF8_EXT tag.
 *
 * @details The header is two bytes: the SMALL_ATOM_UTF8_EXT tag followed by a
 * single length byte. The atom bytes are copied right after the header. When
 * buf is NULL nothing is written and only the header size is reported, so
 * that the caller can compute the required buffer size.
 *
 * @param buf destination buffer, or NULL to only compute the header size.
 * @param atom_ref reference to the atom in the atom table.
 * @param atom_len length of the atom in bytes.
 * @param offset set to the size of the header (always 2).
 * @param glb the global context holding the atom table.
 */
static inline void encode_atom_utf8(uint8_t *buf, atom_ref_t atom_ref, size_t atom_len, int *offset, GlobalContext *glb)
{
    // 1 tag byte + 1 length byte
    *offset = 2;

    if (IS_NULL_PTR(buf)) {
        return;
    }

    uint8_t *payload = buf + 2;

    buf[0] = SMALL_ATOM_UTF8_EXT;
    // NOTE(review): the length is stored in a single byte, so atom_len is
    // narrowed here; this presumably relies on atoms never exceeding 255
    // bytes -- confirm against the atom table limits.
    buf[1] = (uint8_t) atom_len;
    atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, payload);
}

static int serialize_term(uint8_t *buf, term t, ExternalTermOpts opts, GlobalContext *glb)
{
if (term_is_uint8(t)) {
if (!IS_NULL_PTR(buf)) {
Expand Down Expand Up @@ -272,12 +313,26 @@ static int serialize_term(uint8_t *buf, term t, GlobalContext *glb)
int atom_index = term_to_atom_index(t);
size_t atom_len;
atom_ref_t atom_ref = atom_table_get_atom_ptr_and_len(glb->atom_table, atom_index, &atom_len);
if (!IS_NULL_PTR(buf)) {
buf[0] = ATOM_EXT;
WRITE_16_UNALIGNED(buf + 1, atom_len);
atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, buf + 3);

uint8_t *atom_data = malloc(atom_len);
if (IS_NULL_PTR(atom_data)) {
// Not much else we can do here...
AVM_ABORT();
}
atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, atom_data);

int offset = 0;
if (opts & ExternalTermAllowLatin1Encoding) {
if (has_extended_utf8_encoding(atom_data, atom_len)) {
encode_atom_utf8(buf, atom_ref, atom_len, &offset, glb);
} else {
encode_atom_latin1(buf, atom_ref, atom_len, &offset, glb);
}
} else {
encode_atom_utf8(buf, atom_ref, atom_len, &offset, glb);
}
return 3 + atom_len;
free(atom_data);
return offset + atom_len;

} else if (term_is_tuple(t)) {
size_t arity = term_get_tuple_arity(t);
Expand All @@ -292,7 +347,7 @@ static int serialize_term(uint8_t *buf, term t, GlobalContext *glb)
size_t k = 2;
for (size_t i = 0; i < arity; ++i) {
term e = term_get_tuple_element(t, i);
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, e, glb);
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, e, opts, glb);
}
return k;

Expand Down Expand Up @@ -332,11 +387,11 @@ static int serialize_term(uint8_t *buf, term t, GlobalContext *glb)
term i = t;
while (term_is_nonempty_list(i)) {
term e = term_get_list_head(i);
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, e, glb);
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, e, opts, glb);
i = term_get_list_tail(i);
++len;
}
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, i, glb);
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, i, opts, glb);
if (!IS_NULL_PTR(buf)) {
WRITE_32_UNALIGNED(buf + 1, len);
}
Expand All @@ -363,9 +418,9 @@ static int serialize_term(uint8_t *buf, term t, GlobalContext *glb)
size_t k = 5;
for (size_t i = 0; i < size; ++i) {
term key = term_get_map_key(t, i);
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, key, glb);
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, key, opts, glb);
term value = term_get_map_value(t, i);
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, value, glb);
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, value, opts, glb);
}
return k;
} else if (term_is_function(t)) {
Expand All @@ -376,7 +431,7 @@ static int serialize_term(uint8_t *buf, term t, GlobalContext *glb)
const term *boxed_value = term_to_const_term_ptr(t);
for (size_t i = 1; i <= 3; ++i) {
term mfa = boxed_value[i];
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, mfa, glb);
k += serialize_term(IS_NULL_PTR(buf) ? NULL : buf + k, mfa, opts, glb);
}
return k;
} else {
Expand Down
9 changes: 7 additions & 2 deletions src/libAtomVM/externalterm.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ enum ExternalTermResult
typedef enum
{
ExternalTermNoOpts = 0,
ExternalTermToHeapFragment = 1
ExternalTermToHeapFragment = 1,
ExternalTermAllowLatin1Encoding = 2
} ExternalTermOpts;

/**
Expand Down Expand Up @@ -89,10 +90,14 @@ enum ExternalTermResult externalterm_from_binary(Context *ctx, term *dst, term b
* WARNING: This function may call the GC, which may render the input binary invalid.
* @param ctx the context that owns the memory that will be allocated.
* @param t the term to return as binary.
* @param opts encoding options. If the ExternalTermAllowLatin1Encoding bit is
* set in opts, then atoms that do not contain extended UTF-8 characters will be
* encoded using latin1 (ATOM_EXT) encoding; otherwise, atoms are encoded in UTF-8
* (SMALL_ATOM_UTF8_EXT or ATOM_UTF8_EXT) encoding.
* @returns a binary containing the serialized term, or an invalid term, if
* serialization fails.
*/
term externalterm_to_binary(Context *ctx, term t);
term externalterm_to_binary(Context *ctx, term t, ExternalTermOpts opts);

#ifdef __cplusplus
}
Expand Down
19 changes: 16 additions & 3 deletions src/libAtomVM/nifs.c
Original file line number Diff line number Diff line change
Expand Up @@ -2914,11 +2914,24 @@ static term nif_erlang_binary_to_term(Context *ctx, int argc, term argv[])

static term nif_erlang_term_to_binary(Context *ctx, int argc, term argv[])
{
if (argc != 1) {
RAISE_ERROR(BADARG_ATOM);
ExternalTermOpts opts = ExternalTermNoOpts;
if (argc == 2) {
term options = argv[1];
VALIDATE_VALUE(options, term_is_list);

term minor_version = interop_kv_get_value(options, ATOM_STR("\xD", "minor_version"), ctx->global);
if (!term_is_invalid_term(minor_version)) {
VALIDATE_VALUE(minor_version, term_is_integer);
if (term_to_int(minor_version) != 1) {
RAISE_ERROR(BADARG_ATOM);
} else {
opts |= ExternalTermAllowLatin1Encoding;
}
}
}

term t = argv[0];
term ret = externalterm_to_binary(ctx, t);
term ret = externalterm_to_binary(ctx, t, opts);
if (term_is_invalid_term(ret)) {
RAISE_ERROR(BADARG_ATOM);
}
Expand Down
1 change: 1 addition & 0 deletions src/libAtomVM/nifs.gperf
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ erlang:put/2, &put_nif
erlang:binary_to_term/1, &binary_to_term_nif
erlang:binary_to_term/2, &binary_to_term_nif
erlang:term_to_binary/1, &term_to_binary_nif
erlang:term_to_binary/2, &term_to_binary_nif
erlang:throw/1, &throw_nif
erlang:raise/3, &raise_nif
erlang:unlink/1, &unlink_nif
Expand Down
Loading

0 comments on commit cdd10da

Please sign in to comment.