Skip to content

Commit

Permalink
next_char uses unsigned chars
Browse files Browse the repository at this point in the history
  • Loading branch information
anarthal committed Feb 12, 2024
1 parent f7531ef commit 9486e6f
Show file tree
Hide file tree
Showing 13 changed files with 130 additions and 61 deletions.
16 changes: 10 additions & 6 deletions include/boost/mysql/character_set.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#include <boost/mysql/detail/config.hpp>
#include <boost/mysql/detail/make_string_view.hpp>

#include <boost/core/span.hpp>

#include <cstddef>

namespace boost {
Expand All @@ -38,11 +40,12 @@ struct character_set
string_view name;

/**
* \brief Obtains the given string's first character size.
* \brief Obtains the size of the first character of a string.
* \details
* Given an input string `s`, this function must return the number of
* bytes that the first character in `s` spans, or 0 in case of error.
* `s` is guaranteed to be a non-empty string (`s.size() > 0`).
* Given a range of bytes, `r`, this function must interpret `r` as a
* string encoded using this character set, and return the number of
* bytes that the first character in the string spans, or 0 in case of error.
* `r` is guaranteed to be non-empty (`r.size() > 0`).
* \n
* In some character sets (like UTF-8), not all byte sequences represent
* valid characters. If this function finds an invalid byte sequence while
Expand All @@ -51,9 +54,10 @@ struct character_set
* This function must not throw exceptions or have side effects.
* \n
* \par Function signature
* The function signature should be: `std::size_t (*next_char)(string_view) noexcept`
* The function signature should be:
* `std::size_t (*next_char)(boost::span<const unsigned char> r) noexcept`
*/
std::size_t (*next_char)(string_view) noexcept;
std::size_t (*next_char)(span<const unsigned char>) noexcept;
};

/// (EXPERIMENTAL) The utf8mb4 character set (the one you should use by default).
Expand Down
9 changes: 5 additions & 4 deletions include/boost/mysql/detail/character_set.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,19 @@

#include <boost/mysql/detail/config.hpp>

#include <boost/core/span.hpp>

#include <cstddef>

namespace boost {
namespace mysql {
namespace detail {

inline std::size_t next_char_latin1(string_view) noexcept { return 1; }
inline std::size_t next_char_ascii(string_view input) noexcept
inline std::size_t next_char_ascii(span<const unsigned char> input) noexcept
{
return static_cast<unsigned char>(input[0]) <= 0x7f ? 1 : 0;
return input[0] <= 0x7f ? 1 : 0;
}
BOOST_MYSQL_DECL std::size_t next_char_utf8mb4(string_view input) noexcept;
BOOST_MYSQL_DECL std::size_t next_char_utf8mb4(span<const unsigned char> input) noexcept;

} // namespace detail
} // namespace mysql
Expand Down
9 changes: 4 additions & 5 deletions include/boost/mysql/impl/character_set.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,16 @@ namespace boost {
namespace mysql {
namespace detail {

inline bool in_range(char byte, unsigned char lower, unsigned char upper) noexcept
inline bool in_range(unsigned char byte, unsigned char lower, unsigned char upper) noexcept
{
auto b = static_cast<unsigned char>(byte);
return b >= lower && b <= upper;
return byte >= lower && byte <= upper;
}

} // namespace detail
} // namespace mysql
} // namespace boost

std::size_t boost::mysql::detail::next_char_utf8mb4(string_view input) noexcept
std::size_t boost::mysql::detail::next_char_utf8mb4(span<const unsigned char> input) noexcept
{
// s[0] s[1] s[2] s[3] comment
// 00-7F ascii
Expand All @@ -42,7 +41,7 @@ std::size_t boost::mysql::detail::next_char_utf8mb4(string_view input) noexcept

BOOST_ASSERT(!input.empty());

auto first_char = static_cast<unsigned char>(input.front());
auto first_char = input.front();
if (first_char < 0x80)
{
return 1;
Expand Down
5 changes: 3 additions & 2 deletions include/boost/mysql/impl/escape_string.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

#include <boost/mysql/detail/output_string.hpp>

#include <boost/mysql/impl/internal/call_next_char.hpp>

namespace boost {
namespace mysql {
namespace detail {
Expand Down Expand Up @@ -67,8 +69,7 @@ escape_impl(string_view input, character_set charset, Escaper escaper, output_st
else
{
// Advance with the charset function
std::size_t char_size = charset.next_char({it, end});
BOOST_ASSERT(char_size <= static_cast<std::size_t>(end - it));
std::size_t char_size = detail::call_next_char(charset, it, end);
if (char_size == 0u)
return client_errc::invalid_encoding;
it += char_size;
Expand Down
2 changes: 1 addition & 1 deletion include/boost/mysql/impl/format_sql.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ class format_state
BOOST_ATTRIBUTE_NODISCARD
bool advance(const char*& it, const char* end)
{
std::size_t size = ctx_.impl_.opts.charset.next_char({it, end});
std::size_t size = detail::call_next_char(ctx_.impl_.opts.charset, it, end);
if (size == 0)
{
ctx_.add_error(client_errc::format_string_invalid_encoding);
Expand Down
39 changes: 39 additions & 0 deletions include/boost/mysql/impl/internal/call_next_char.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
//
// Copyright (c) 2019-2023 Ruben Perez Hidalgo (rubenperez038 at gmail dot com)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//

#ifndef BOOST_MYSQL_IMPL_INTERNAL_CALL_NEXT_CHAR_HPP
#define BOOST_MYSQL_IMPL_INTERNAL_CALL_NEXT_CHAR_HPP

#include <boost/mysql/character_set.hpp>

#include <boost/assert.hpp>

#include <cstddef>

namespace boost {
namespace mysql {
namespace detail {

/**
 * Obtains the size, in bytes, of the first character in the range [first, last),
 * interpreting the range as a string encoded using charset.
 *
 * A leading byte below 0x80 is reported as a 1-byte character without
 * consulting charset.next_char. Any other leading byte is resolved by passing
 * the whole range to charset.next_char as a span of unsigned char.
 *
 * \param charset Character set whose next_char function handles leading bytes >= 0x80.
 * \param first   Pointer to the first byte. The range must be non-empty (asserted).
 * \param last    Pointer one past the last byte of the range.
 * \return 1 for a leading byte below 0x80; otherwise, the value returned by
 *         charset.next_char (0 signals an error, per the character_set contract).
 */
inline std::size_t call_next_char(const character_set& charset, const char* first, const char* last) noexcept
{
    // Range must be non-empty
    BOOST_ASSERT(last > first);
    const auto remaining = static_cast<std::size_t>(last - first);

    // ASCII characters are always 1 byte (UTF-16 and friends are not supported)
    const auto* data = reinterpret_cast<const unsigned char*>(first);
    if (*data < 0x80)
        return 1u;

    // May be a multi-byte character. Call the relevant function
    const std::size_t char_size = charset.next_char({data, remaining});

    // Callers advance their iterator by the returned size. A next_char function
    // reporting more bytes than the range holds would move them past last,
    // so catch such a contract violation here.
    BOOST_ASSERT(char_size <= remaining);
    return char_size;
}

} // namespace detail
} // namespace mysql
} // namespace boost

#endif
33 changes: 33 additions & 0 deletions test/unit/include/test_unit/ff_charset.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//
// Copyright (c) 2019-2023 Ruben Perez Hidalgo (rubenperez038 at gmail dot com)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//

#ifndef BOOST_MYSQL_TEST_UNIT_INCLUDE_TEST_UNIT_FF_CHARSET_HPP
#define BOOST_MYSQL_TEST_UNIT_INCLUDE_TEST_UNIT_FF_CHARSET_HPP

#include <boost/mysql/character_set.hpp>

#include <boost/mysql/detail/make_string_view.hpp>

namespace boost {
namespace mysql {
namespace test {

// A hypothetical character set with rules that may confuse formatting algorithms.
// Some MySQL charsets (e.g. gbk) contain ASCII-compatible continuation characters, like this one.
// Returns the size, in bytes, of the first character in r:
//   - A leading 0xff byte starts a two-byte character. If r holds no second
//     byte, 0 is returned.
//   - Any other leading byte is a one-byte character.
// r must be non-empty, since r[0] is read unconditionally.
inline std::size_t ff_charset_next_char(boost::span<const unsigned char> r) noexcept
{
    if (r[0] == 0xff)  // 0xff marks a two-byte character
        return r.size() > 1u ? 2u : 0u;
    return 1u;
}
// Character set named "ff_charset" that uses ff_charset_next_char as its next_char function.
constexpr character_set ff_charset{detail::make_string_view("ff_charset"), ff_charset_next_char};

} // namespace test
} // namespace mysql
} // namespace boost

#endif
26 changes: 17 additions & 9 deletions test/unit/test/character_set.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <boost/mysql/character_set.hpp>
#include <boost/mysql/string_view.hpp>

#include <boost/mysql/impl/internal/call_next_char.hpp>

#include <boost/test/tools/context.hpp>
#include <boost/test/unit_test.hpp>

Expand All @@ -21,6 +23,12 @@ using namespace boost::mysql::test;

BOOST_AUTO_TEST_SUITE(test_character_set)

// Helper: adapts detail::call_next_char, which takes a pointer pair, to a string_view
static std::size_t call_next_char(const character_set& charset, string_view s) noexcept
{
    const char* const first = s.data();
    const char* const last = first + s.size();
    return detail::call_next_char(charset, first, last);
}

BOOST_AUTO_TEST_CASE(utf8mb4_single_byte_valid)
{
for (int i = 0; i < 0x80; ++i)
Expand All @@ -29,11 +37,11 @@ BOOST_AUTO_TEST_CASE(utf8mb4_single_byte_valid)
{
// Exactly the required space
char str[2]{static_cast<char>(i), '\0'};
auto actual_len = utf8mb4_charset.next_char(string_view(str, 1));
auto actual_len = detail::call_next_char(utf8mb4_charset, str, str + 1);
BOOST_TEST(actual_len == 1u);

// Extra space
actual_len = utf8mb4_charset.next_char(string_view(str, 2));
actual_len = detail::call_next_char(utf8mb4_charset, str, str + 2);
BOOST_TEST(actual_len == 1u);
}
}
Expand Down Expand Up @@ -193,17 +201,17 @@ BOOST_AUTO_TEST_CASE(utf8mb4_multibyte_valid)
BOOST_TEST_CONTEXT(tc.name)
{
// Exactly the required space
auto actual_len = utf8mb4_charset.next_char(tc.input);
auto actual_len = call_next_char(utf8mb4_charset, tc.input);
BOOST_TEST(actual_len == tc.expected);

// Extra space
auto extra_space_input = std::string(tc.input) + "abc";
actual_len = utf8mb4_charset.next_char(extra_space_input);
actual_len = call_next_char(utf8mb4_charset, extra_space_input);
BOOST_TEST(actual_len == tc.expected);

// Not enough space (end of data before the end of the byte sequence)
auto not_enough_input = tc.input.substr(1);
actual_len = utf8mb4_charset.next_char(not_enough_input);
actual_len = call_next_char(utf8mb4_charset, not_enough_input);
BOOST_TEST(actual_len == 0u);
}
}
Expand All @@ -224,7 +232,7 @@ BOOST_AUTO_TEST_CASE(utf8mb4_invalid_start_byte)
BOOST_TEST_CONTEXT(+b)
{
auto input = static_cast<char>(b);
auto size = utf8mb4_charset.next_char(string_view(&input, 1));
auto size = detail::call_next_char(utf8mb4_charset, &input, &input + 1);
BOOST_TEST(size == 0u);
}
}
Expand Down Expand Up @@ -442,7 +450,7 @@ BOOST_AUTO_TEST_CASE(utf8mb4_invalid_continuation)
{
// add some extra continuation bytes, so we never fail because of lack of space
auto input = std::string(tc.input) + "\x91\x91";
auto size = utf8mb4_charset.next_char(input);
auto size = call_next_char(utf8mb4_charset, input);
BOOST_TEST(size == 0u);
}
}
Expand All @@ -456,7 +464,7 @@ BOOST_AUTO_TEST_CASE(ascii)
BOOST_TEST_CONTEXT(i)
{
char str[2]{static_cast<char>(i), '\0'};
auto size = ascii_charset.next_char(string_view(str, 2));
auto size = detail::call_next_char(ascii_charset, str, str + 2);
BOOST_TEST(size == 1u);
}
}
Expand All @@ -467,7 +475,7 @@ BOOST_AUTO_TEST_CASE(ascii)
BOOST_TEST_CONTEXT(i)
{
char str[2]{static_cast<char>(i), '\0'};
auto size = ascii_charset.next_char(string_view(str, 2));
auto size = detail::call_next_char(ascii_charset, str, str + 2);
BOOST_TEST(size == 0u);
}
}
Expand Down
22 changes: 3 additions & 19 deletions test/unit/test/escape_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,12 @@

#include "test_common/create_basic.hpp"
#include "test_common/printing.hpp"
#include "test_unit/ff_charset.hpp"

using namespace boost::mysql;

BOOST_AUTO_TEST_SUITE(test_escape_string)

// A hypothetical character set with rules that may confuse the algorithm.
// Some MySQL charsets (e.g. gbk) contain ASCII-compatible continuation characters
std::size_t next_char_test_encoding(string_view input) noexcept
{
// This is a hypothetical encoding used for testing
BOOST_ASSERT(!input.empty());

// Multibyte characters start with 0xff, and continuation bytes can include ascii-compatible characters
if (input.size() >= 2u && static_cast<unsigned char>(input[0]) == 0xff)
return 2;

// Otherwise, it's a plain character
return 1;
}

constexpr character_set test_charset{"test", &next_char_test_encoding};

//
// Escaping using backslashes
//
Expand Down Expand Up @@ -135,7 +119,7 @@ BOOST_AUTO_TEST_CASE(backslashes_multibyte_ascii_compatible_chars)
string_view s = "This is \\ a string \xff\\ with a weird \xff\" encoding \"";
std::string output = "abc";

auto ec = escape_string(s, {test_charset, true}, quoting_context::double_quote, output);
auto ec = escape_string(s, {test::ff_charset, true}, quoting_context::double_quote, output);

BOOST_TEST(ec == error_code());
BOOST_TEST(output == "This is \\\\ a string \xff\\ with a weird \xff\" encoding \\\"");
Expand Down Expand Up @@ -245,7 +229,7 @@ BOOST_AUTO_TEST_CASE(quotes_multibyte_ascii_compatible_chars)
string_view s = "This is \" a string \xfe\" with a weird \xff\" encoding \"";
std::string output = "abc";

auto ec = escape_string(s, {test_charset, false}, quoting_context::double_quote, output);
auto ec = escape_string(s, {test::ff_charset, false}, quoting_context::double_quote, output);

BOOST_TEST(ec == error_code());
BOOST_TEST(output == "This is \"\" a string \xfe\"\" with a weird \xff\" encoding \"\"");
Expand Down
1 change: 1 addition & 0 deletions test/unit/test/format_sql/api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include "format_common.hpp"
#include "test_common/printing.hpp"
#include "test_unit/ff_charset.hpp"

//
// Contains spotchecks verifying that the main success and error cases
Expand Down
12 changes: 2 additions & 10 deletions test/unit/test/format_sql/format_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <boost/mysql/format_sql.hpp>
#include <boost/mysql/string_view.hpp>

#include <boost/core/span.hpp>

#include <string>
#include <vector>

Expand Down Expand Up @@ -50,16 +52,6 @@ struct formatter<custom::condition>
}
};

// Custom charset function
inline std::size_t ff_charset_next_char(string_view s) noexcept
{
auto c = static_cast<unsigned char>(s[0]);
if (c == 0xff) // 0xff marks a two-byte character
return s.size() > 1u ? 2u : 0u;
return 1u;
};
constexpr character_set ff_charset{"ff_charset", ff_charset_next_char};

} // namespace mysql
} // namespace boost

Expand Down
7 changes: 4 additions & 3 deletions test/unit/test/format_sql/format_strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "format_common.hpp"
#include "test_common/printing.hpp"
#include "test_unit/ff_charset.hpp"

using namespace boost::mysql;

Expand Down Expand Up @@ -113,8 +114,8 @@ BOOST_AUTO_TEST_CASE(success)
// backslash_slashes and character set are propagated
BOOST_AUTO_TEST_CASE(options_propagated)
{
format_options opts_charset{ff_charset, true};
format_options opts_backslashes{ff_charset, false};
format_options opts_charset{test::ff_charset, true};
format_options opts_backslashes{test::ff_charset, false};

// Charset affects format strings
BOOST_TEST(format_sql("SELECT \xffh + {};", opts_charset, 42) == "SELECT \xffh + 42;");
Expand All @@ -132,7 +133,7 @@ BOOST_AUTO_TEST_CASE(options_propagated)
// interpret {} characters as continuations, rather than trying to expand them
BOOST_AUTO_TEST_CASE(format_strings_brace_continuation)
{
format_options custom_opts{ff_charset, true};
format_options custom_opts{test::ff_charset, true};

BOOST_TEST(format_sql("SELECT \xff{ + {};", custom_opts, 42) == "SELECT \xff{ + 42;");
BOOST_TEST(format_sql("SELECT \xff} + {};", custom_opts, 42) == "SELECT \xff} + 42;");
Expand Down
Loading

0 comments on commit 9486e6f

Please sign in to comment.