Skip to content

Commit

Permalink
Merge pull request #1006 from fadushin/utf8-atoms
Browse files Browse the repository at this point in the history
Encode atoms using UTF-8

As of OTP-26, atoms are encoded using UTF-8 encoding tags, when using the
`term_to_binary/1,2` Nif.

This PR adopts the same behavior for AtomVM.
Closes #1004.

These changes are made under both the "Apache 2.0" and the "GNU Lesser General
Public License 2.1 or later" license terms (dual license).

SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
  • Loading branch information
bettio committed Feb 27, 2024
2 parents fdfba62 + 6b001a2 commit dc4d7c4
Show file tree
Hide file tree
Showing 6 changed files with 126 additions and 44 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Support for utf8 encoding to `*_to_atom` and `atom_to_*` functions
- `binary_to_atom/1` and `atom_to_binary/1` that default to utf8 (they were introduced with OTP23)
- Added Pico cmake option `AVM_WAIT_BOOTSEL_ON_EXIT` (default `ON`) to allow tools to use automated `BOOTSEL` mode after main application exits
- Use UTF-8 encoding for atoms when using `erlang:term_to_binary/1`, in conformance with OTP-26

### Fixed

Expand Down
22 changes: 21 additions & 1 deletion doc/src/programmers-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,26 @@ The currently supported keys are enumerated in the following table:

See the `word_size` key in the [System APIs](#system-apis) section for information about how to find the number of bytes used in a machine word on the current platform.

### External Term Format

The `erlang:term_to_binary/1` and `erlang:binary_to_term/1,2` functions can be used to serialize arbitrary term data into and out of binary data. These operations can be useful for applications that wish to share term data over some network protocol, such as HTTP or MQTT, or wish to store serialized term data in some permanent storage (e.g., non-volatile storage on ESP32 devices).

To convert a term to a binary, use `erlang:term_to_binary/1`, e.g.,

%% erlang
Term = ...
Binary = erlang:term_to_binary(Term),

And to convert the binary back to a term, use `erlang:binary_to_term/1,2`, e.g.,

%% erlang
Binary = ...
{Term, _Used} = erlang:binary_to_term(Binary, [used]),

By default, AtomVM will encode all atoms using UTF-8 encoding. This encoding is the default encoding for OTP-26 and later releases.

For more information about the Erlang external term format, consult the [Erlang Documentation](https://www.erlang.org/doc/apps/erts/erl_ext_dist.html).

### System APIs

You can obtain system information about the AtomVM virtual machine via the [`erlang:system_info/1`](./apidocs/erlang/estdlib/erlang.md#system_info1) function, which takes an atom parameter designating the desired datum. Allowable parameters include
Expand Down Expand Up @@ -1693,7 +1713,7 @@ The station mode configuration supports the following options:
| `dhcp_hostname` | `string() \| binary()` | no | `atomvm-<MAC>` where `<MAC>` is the factory-assigned MAC-address of the device | DHCP hostname for the connecting device |
```{important}
The WiFi network to which you are connecting must support DHCP and IPv4.
The WiFi network to which you are connecting must support DHCP and IPv4.
IPv6 addressing is not yet supported on AtomVM.
```
Expand Down
6 changes: 3 additions & 3 deletions src/libAtomVM/bitstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size
v |= (buf[3] & 0x3F);
// overlong encoding or invalid codepoint
if (v <= 0x10000 || v > 0x10FFFF) {
return false;
return UnicodeTransformDecodeFail;
}
*c = v;
*out_size = 4;
Expand All @@ -165,7 +165,7 @@ enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size
v |= (buf[2] & 0x3F);
// overlong encoding or surrogate
if (v < 0x800 || (v >= 0xD800 && v <= 0xDFFF)) {
return false;
return UnicodeTransformDecodeFail;
}
*c = v;
*out_size = 3;
Expand All @@ -176,7 +176,7 @@ enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size
v |= (buf[1] & 0x3F);
// overlong encoding
if (v < 0x80) {
return false;
return UnicodeTransformDecodeFail;
}
*c = v;
*out_size = 2;
Expand Down
8 changes: 4 additions & 4 deletions src/libAtomVM/externalterm.c
Original file line number Diff line number Diff line change
Expand Up @@ -273,11 +273,11 @@ static int serialize_term(uint8_t *buf, term t, GlobalContext *glb)
size_t atom_len;
atom_ref_t atom_ref = atom_table_get_atom_ptr_and_len(glb->atom_table, atom_index, &atom_len);
if (!IS_NULL_PTR(buf)) {
buf[0] = ATOM_EXT;
WRITE_16_UNALIGNED(buf + 1, atom_len);
atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, buf + 3);
buf[0] = SMALL_ATOM_UTF8_EXT;
buf[1] = atom_len;
atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, buf + 2);
}
return 3 + atom_len;
return 2 + atom_len;

} else if (term_is_tuple(t)) {
size_t arity = term_get_tuple_arity(t);
Expand Down
129 changes: 95 additions & 34 deletions tests/erlang_tests/test_binary_to_term.erl
Original file line number Diff line number Diff line change
Expand Up @@ -32,42 +32,77 @@

start() ->
% Starting from OTP-26, atoms are encoded as UTF-8 by default.
TermToBinaryOptions =
case erlang:system_info(machine) of
"BEAM" ->
case erlang:system_info(version) >= "13.2" of
true -> [{minor_version, 1}];
false -> []
end;
"ATOM" ->
[]
end,
test_reverse(foo, <<131, 100, 0, 3, 102, 111, 111>>, TermToBinaryOptions),
test_reverse(bar, <<131, 100, 0, 3, 98, 97, 114>>, TermToBinaryOptions),
test_reverse(foo, {<<131, 119, 3, 102, 111, 111>>, <<131, 100, 0, 3, 102, 111, 111>>}),
test_reverse(bar, {<<131, 119, 3, 98, 97, 114>>, <<131, 100, 0, 3, 98, 97, 114>>}),
test_reverse(
'∀x∃y.f(x,y)',
<<131, 119, 15, 226, 136, 128, 120, 226, 136, 131, 121, 46, 102, 40, 120, 44, 121, 41>>,
[]
),
test_reverse(
':アトムVM',
<<131, 119, 16, 58, 227, 130, 162, 227, 131, 136, 227, 131, 160, 239, 188, 182, 239, 188,
173>>,
[]
),
test_reverse(128, <<131, 97, 128>>),
test_reverse(257, <<131, 98, 0, 0, 1, 1>>),
test_reverse(0, <<131, 97, 0>>),
test_reverse(-1, <<131, 98, 255, 255, 255, 255>>),
test_reverse(32768, <<131, 98, 0, 0, 128, 0>>),
test_reverse(-32768, <<131, 98, 255, 255, 128, 0>>),
test_reverse(
{foo, bar},
<<131, 104, 2, 100, 0, 3, 102, 111, 111, 100, 0, 3, 98, 97, 114>>,
TermToBinaryOptions
{foo, bar}, {
<<131, 104, 2, 119, 3, 102, 111, 111, 119, 3, 98, 97, 114>>,
<<131, 104, 2, 100, 0, 3, 102, 111, 111, 100, 0, 3, 98, 97, 114>>
}
),
test_reverse({foo, 0}, <<131, 104, 2, 100, 0, 3, 102, 111, 111, 97, 0>>, TermToBinaryOptions),
test_reverse({foo, 0}, {
<<131, 104, 2, 119, 3, 102, 111, 111, 97, 0>>,
<<131, 104, 2, 100, 0, 3, 102, 111, 111, 97, 0>>
}),
test_reverse([], <<131, 106>>),
test_reverse(
[{foo, 0}, {bar, 1}],
<<131, 108, 0, 0, 0, 2, 104, 2, 100, 0, 3, 102, 111, 111, 97, 0, 104, 2, 100, 0, 3, 98, 97,
114, 97, 1, 106>>,
TermToBinaryOptions
[{foo, 0}, {bar, 1}], {
<<131, 108, 0, 0, 0, 2, 104, 2, 119, 3, 102, 111, 111, 97, 0, 104, 2, 119, 3, 98, 97,
114, 97, 1, 106>>,
<<131, 108, 0, 0, 0, 2, 104, 2, 100, 0, 3, 102, 111, 111, 97, 0, 104, 2, 100, 0, 3, 98,
97, 114, 97, 1, 106>>
}
),
test_reverse(
[improper | list],
<<131, 108, 0, 0, 0, 1, 100, 0, 8, 105, 109, 112, 114, 111, 112, 101, 114, 100, 0, 4, 108,
105, 115, 116>>,
TermToBinaryOptions
{
<<131, 108, 0, 0, 0, 1, 119, 8, 105, 109, 112, 114, 111, 112, 101, 114, 119, 4, 108,
105, 115, 116>>,
<<131, 108, 0, 0, 0, 1, 100, 0, 8, 105, 109, 112, 114, 111, 112, 101, 114, 100, 0, 4,
108, 105, 115, 116>>
}
),
test_reverse({foo, bar}, {
<<131, 104, 2, 119, 3, 102, 111, 111, 119, 3, 98, 97, 114>>,
<<131, 104, 2, 100, 0, 3, 102, 111, 111, 100, 0, 3, 98, 97, 114>>
}),
test_reverse({foo, 0}, {
<<131, 104, 2, 119, 3, 102, 111, 111, 97, 0>>,
<<131, 104, 2, 100, 0, 3, 102, 111, 111, 97, 0>>
}),
test_reverse([], <<131, 106>>),
test_reverse(
[{foo, 0}, {bar, 1}], {
<<131, 108, 0, 0, 0, 2, 104, 2, 119, 3, 102, 111, 111, 97, 0, 104, 2, 119, 3, 98, 97,
114, 97, 1, 106>>,
<<131, 108, 0, 0, 0, 2, 104, 2, 100, 0, 3, 102, 111, 111, 97, 0, 104, 2, 100, 0, 3, 98,
97, 114, 97, 1, 106>>
}
),
test_reverse(
[improper | list], {
<<131, 108, 0, 0, 0, 1, 119, 8, 105, 109, 112, 114, 111, 112, 101, 114, 119, 4, 108,
105, 115, 116>>,
<<131, 108, 0, 0, 0, 1, 100, 0, 8, 105, 109, 112, 114, 111, 112, 101, 114, 100, 0, 4,
108, 105, 115, 116>>
}
),
test_reverse(<<"foobar">>, <<131, 109, 0, 0, 0, 6, 102, 111, 111, 98, 97, 114>>),
test_reverse(<<":アトムVM">>, <<131, 109, 0, 0, 0, 6, 58, 162, 200, 224, 54, 45>>),
Expand All @@ -86,7 +121,7 @@ start() ->
57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48,
49, 50, 51, 52, 53>>
),
ok = test_external_function(TermToBinaryOptions),
ok = test_external_function(),

{32768, 6} = erlang:binary_to_term(<<131, 98, 0, 0, 128, 0, 127>>, [used]),
test_catenate_and_split([foo, bar, 128, {foo, bar}, [a, b, c, {d}]]),
Expand All @@ -99,7 +134,16 @@ start() ->
%% Convenience arity: runs test_reverse/3 with an empty option list.
%% Callers in this module pass either a single binary or a
%% {Utf8Binary, Latin1Binary} pair as the second argument.
test_reverse(Term, Expected) ->
    NoOptions = [],
    test_reverse(Term, Expected, NoOptions).

test_reverse(T, Interop, Options) ->
test_reverse(T, {Utf8Interop, Latin1Interop}, Options) ->
case get_otp_version() of
X when is_integer(X) andalso X >= 26 ->
test_reverse(T, Utf8Interop, Options);
atomvm ->
test_reverse(T, Utf8Interop, Options);
_ ->
test_reverse(T, Latin1Interop, Options)
end;
test_reverse(T, Interop, Options) when is_binary(Interop) andalso is_list(Options) ->
Bin =
case Options of
[] -> erlang:term_to_binary(T);
Expand Down Expand Up @@ -173,18 +217,27 @@ mutate_bin(Bin, I) ->
I2 = Ith bxor 16#FF,
<<Prefix/binary, I2:8/integer-unsigned, Rest/binary>>.

test_external_function(Options) ->
test_external_function() ->
T = [fun ?MODULE:apply/2, fun ?MODULE:apply/3],
Bin =
case Options of
[] -> erlang:term_to_binary(T);
_ -> erlang:term_to_binary(T, Options)
case get_otp_version() of
X when is_integer(X) andalso X >= 26 orelse X =:= atomvm ->
%% expect SMALL_ATOM_UTF8_EXT encoding
<<131, 108, 0, 0, 0, 2, 113, 119, 19, 116, 101, 115, 116, 95, 98, 105, 110, 97, 114,
121, 95, 116, 111, 95, 116, 101, 114, 109, 119, 5, 97, 112, 112, 108, 121, 97,
2, 113, 119, 19, 116, 101, 115, 116, 95, 98, 105, 110, 97, 114, 121, 95, 116,
111, 95, 116, 101, 114, 109, 119, 5, 97, 112, 112, 108, 121, 97, 3, 106>>;
_ ->
%% expect ATOM_EXT encoding
<<131, 108, 0, 0, 0, 2, 113, 100, 0, 19, 116, 101, 115, 116, 95, 98, 105, 110, 97,
114, 121, 95, 116, 111, 95, 116, 101, 114, 109, 100, 0, 5, 97, 112, 112, 108,
121, 97, 2, 113, 100, 0, 19, 116, 101, 115, 116, 95, 98, 105, 110, 97, 114, 121,
95, 116, 111, 95, 116, 101, 114, 109, 100, 0, 5, 97, 112, 112, 108, 121, 97, 3,
106>>
end,
Bin =
<<131, 108, 0, 0, 0, 2, 113, 100, 0, 19, 116, 101, 115, 116, 95, 98, 105, 110, 97, 114, 121,
95, 116, 111, 95, 116, 101, 114, 109, 100, 0, 5, 97, 112, 112, 108, 121, 97, 2, 113,
100, 0, 19, 116, 101, 115, 116, 95, 98, 105, 110, 97, 114, 121, 95, 116, 111, 95, 116,
101, 114, 109, 100, 0, 5, 97, 112, 112, 108, 121, 97, 3, 106>>,

Bin = erlang:term_to_binary(T),

[Fun2, Fun3] = binary_to_term(Bin),
true = is_function(Fun2),
true = is_function(Fun3),
Expand Down Expand Up @@ -337,3 +390,11 @@ expect_badarg(Fun) ->
_:badarg ->
ok
end.

%% Identify the runtime executing the tests.
%%
%% Returns the OTP release as an integer when the machine name reported by
%% erlang:system_info(machine) is "BEAM", and the atom `atomvm' for any
%% other machine name.
get_otp_version() ->
    otp_version_for(erlang:system_info(machine)).

%% Map a machine name onto the value returned by get_otp_version/0.
%% NOTE(review): list_to_integer/1 assumes the release string is purely
%% numeric (e.g. "26") -- confirm no legacy "R16B"-style releases are targeted.
otp_version_for("BEAM") ->
    list_to_integer(erlang:system_info(otp_release));
otp_version_for(_OtherMachine) ->
    atomvm.
4 changes: 2 additions & 2 deletions tests/erlang_tests/test_gc.erl
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ start() ->
{HeapSize, _} = make_a_big_heap(),
MemorySize = erlang:process_info(self(), memory),
true = erlang:garbage_collect(),
NewHeapSize = erlang:process_info(self(), heap_size),
{heap_size, NewHeapSize} = erlang:process_info(self(), heap_size),
ok =
case NewHeapSize < HeapSize of
true -> ok;
Expand All @@ -42,7 +42,7 @@ start() ->

make_a_big_heap() ->
LargeBlob = create_string(1024, []),
HeapSize = erlang:process_info(self(), heap_size),
{heap_size, HeapSize} = erlang:process_info(self(), heap_size),
{HeapSize, length(LargeBlob)}.

create_string(0, Accum) ->
Expand Down

0 comments on commit dc4d7c4

Please sign in to comment.