next_char uses unsigned chars

boostorg · Feb 12, 2024 · 9486e6f · 9486e6f
1 parent f7531ef
commit 9486e6f
Show file tree

Hide file tree

Showing 13 changed files with 130 additions and 61 deletions.
diff --git a/include/boost/mysql/character_set.hpp b/include/boost/mysql/character_set.hpp
@@ -14,6 +14,8 @@
 #include <boost/mysql/detail/config.hpp>
 #include <boost/mysql/detail/make_string_view.hpp>
 
+#include <boost/core/span.hpp>
+
 #include <cstddef>
 
 namespace boost {
@@ -38,11 +40,12 @@ struct character_set
     string_view name;
 
     /**
-     * \brief Obtains the given string's first character size.
+     * \brief Obtains the size of the first character of a string.
      * \details
-     * Given an input string `s`, this function must return the number of
-     * bytes that the first character in `s` spans, or 0 in case of error.
-     * `s` is guaranteed to be a non-empty string (`s.size() > 0`).
+     * Given a range of bytes, `r`, this function must interpret `r` as a
+     * string encoded using this character set, and return the number of
+     * bytes that the first character in the string spans, or 0 in case of error.
+     * `r` is guaranteed to be non-empty (`r.size() > 0`).
      * \n
      * In some character sets (like UTF-8), not all byte sequences represent
      * valid characters. If this function finds an invalid byte sequence while
@@ -51,9 +54,10 @@ struct character_set
      * This function must not throw exceptions or have side effects.
      * \n
      * \par Function signature
-     * The function signature should be: `std::size_t (*next_char)(string_view) noexcept`
+     * The function signature should be:
+     * `std::size_t (*next_char)(boost::span<const unsigned char> r) noexcept`
      */
-    std::size_t (*next_char)(string_view) noexcept;
+    std::size_t (*next_char)(span<const unsigned char>) noexcept;
 };
 
 /// (EXPERIMENTAL) The utf8mb4 character set (the one you should use by default).

diff --git a/include/boost/mysql/detail/character_set.hpp b/include/boost/mysql/detail/character_set.hpp
@@ -12,18 +12,19 @@
 
 #include <boost/mysql/detail/config.hpp>
 
+#include <boost/core/span.hpp>
+
 #include <cstddef>
 
 namespace boost {
 namespace mysql {
 namespace detail {
 
-inline std::size_t next_char_latin1(string_view) noexcept { return 1; }
-inline std::size_t next_char_ascii(string_view input) noexcept
+inline std::size_t next_char_ascii(span<const unsigned char> input) noexcept
 {
-    return static_cast<unsigned char>(input[0]) <= 0x7f ? 1 : 0;
+    return input[0] <= 0x7f ? 1 : 0;  // 1 byte for a 7-bit value; 0 (error) for any byte above 0x7f
 }
-BOOST_MYSQL_DECL std::size_t next_char_utf8mb4(string_view input) noexcept;
+BOOST_MYSQL_DECL std::size_t next_char_utf8mb4(span<const unsigned char> input) noexcept;
 
 }  // namespace detail
 }  // namespace mysql

diff --git a/include/boost/mysql/impl/character_set.ipp b/include/boost/mysql/impl/character_set.ipp
@@ -16,17 +16,16 @@ namespace boost {
 namespace mysql {
 namespace detail {
 
-inline bool in_range(char byte, unsigned char lower, unsigned char upper) noexcept
+inline bool in_range(unsigned char byte, unsigned char lower, unsigned char upper) noexcept
 {
-    auto b = static_cast<unsigned char>(byte);
-    return b >= lower && b <= upper;
+    return byte >= lower && byte <= upper;  // bounds are inclusive on both ends
 }
 
 }  // namespace detail
 }  // namespace mysql
 }  // namespace boost
 
-std::size_t boost::mysql::detail::next_char_utf8mb4(string_view input) noexcept
+std::size_t boost::mysql::detail::next_char_utf8mb4(span<const unsigned char> input) noexcept
 {
     // s[0]    s[1]    s[2]    s[3]    comment
     // 00-7F                           ascii
@@ -42,7 +41,7 @@ std::size_t boost::mysql::detail::next_char_utf8mb4(string_view input) noexcept
 
     BOOST_ASSERT(!input.empty());
 
-    auto first_char = static_cast<unsigned char>(input.front());
+    auto first_char = input.front();
     if (first_char < 0x80)
     {
         return 1;

diff --git a/include/boost/mysql/impl/escape_string.ipp b/include/boost/mysql/impl/escape_string.ipp
@@ -18,6 +18,8 @@
 
 #include <boost/mysql/detail/output_string.hpp>
 
+#include <boost/mysql/impl/internal/call_next_char.hpp>
+
 namespace boost {
 namespace mysql {
 namespace detail {
@@ -67,8 +69,7 @@ escape_impl(string_view input, character_set charset, Escaper escaper, output_st
         else
         {
             // Advance with the charset function
-            std::size_t char_size = charset.next_char({it, end});
-            BOOST_ASSERT(char_size <= static_cast<std::size_t>(end - it));
+            std::size_t char_size = detail::call_next_char(charset, it, end);
             if (char_size == 0u)
                 return client_errc::invalid_encoding;
             it += char_size;

diff --git a/include/boost/mysql/impl/format_sql.ipp b/include/boost/mysql/impl/format_sql.ipp
@@ -231,7 +231,7 @@ class format_state
     BOOST_ATTRIBUTE_NODISCARD
     bool advance(const char*& it, const char* end)
     {
-        std::size_t size = ctx_.impl_.opts.charset.next_char({it, end});
+        std::size_t size = detail::call_next_char(ctx_.impl_.opts.charset, it, end);
         if (size == 0)
         {
             ctx_.add_error(client_errc::format_string_invalid_encoding);

diff --git a/include/boost/mysql/impl/internal/call_next_char.hpp b/include/boost/mysql/impl/internal/call_next_char.hpp
@@ -0,0 +1,39 @@
+//
+// Copyright (c) 2019-2023 Ruben Perez Hidalgo (rubenperez038 at gmail dot com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+
+#ifndef BOOST_MYSQL_IMPL_INTERNAL_CALL_NEXT_CHAR_HPP
+#define BOOST_MYSQL_IMPL_INTERNAL_CALL_NEXT_CHAR_HPP
+
+#include <boost/mysql/character_set.hpp>
+
+#include <boost/assert.hpp>
+
+#include <cstddef>
+
+namespace boost {
+namespace mysql {
+namespace detail {
+
+inline std::size_t call_next_char(const character_set& charset, const char* first, const char* last) noexcept
+{
+    // Precondition: the range is non-empty (its first byte is read unconditionally below)
+    BOOST_ASSERT(last > first);
+
+    // ASCII bytes are always 1 byte long (UTF-16 and friends are not supported): skip charset.next_char
+    auto* data = reinterpret_cast<const unsigned char*>(first);
+    if (*data < 0x80)
+        return 1u;
+
+    // May be a multi-byte character: ask the charset. Its result (0 on error) is returned unchecked
+    return charset.next_char({data, static_cast<std::size_t>(last - first)});
+}
+
+}  // namespace detail
+}  // namespace mysql
+}  // namespace boost
+
+#endif
diff --git a/test/unit/include/test_unit/ff_charset.hpp b/test/unit/include/test_unit/ff_charset.hpp
@@ -0,0 +1,33 @@
+//
+// Copyright (c) 2019-2023 Ruben Perez Hidalgo (rubenperez038 at gmail dot com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+
+#ifndef BOOST_MYSQL_TEST_UNIT_INCLUDE_TEST_UNIT_FF_CHARSET_HPP
+#define BOOST_MYSQL_TEST_UNIT_INCLUDE_TEST_UNIT_FF_CHARSET_HPP
+
+#include <boost/mysql/character_set.hpp>
+
+#include <boost/mysql/detail/make_string_view.hpp>
+
+namespace boost {
+namespace mysql {
+namespace test {
+
+// A hypothetical character set with rules that may confuse formatting algorithms.
+// Some MySQL charsets (e.g. gbk) contain ASCII-compatible continuation characters, like this one.
+inline std::size_t ff_charset_next_char(boost::span<const unsigned char> r) noexcept
+{
+    if (r[0] == 0xff)  // 0xff starts a two-byte character; a lone trailing 0xff is an error (0)
+        return r.size() > 1u ? 2u : 0u;
+    return 1u;
+}
+constexpr character_set ff_charset{detail::make_string_view("ff_charset"), ff_charset_next_char};
+
+}  // namespace test
+}  // namespace mysql
+}  // namespace boost
+
+#endif
diff --git a/test/unit/test/character_set.cpp b/test/unit/test/character_set.cpp
@@ -8,6 +8,8 @@
 #include <boost/mysql/character_set.hpp>
 #include <boost/mysql/string_view.hpp>
 
+#include <boost/mysql/impl/internal/call_next_char.hpp>
+
 #include <boost/test/tools/context.hpp>
 #include <boost/test/unit_test.hpp>
 
@@ -21,6 +23,12 @@ using namespace boost::mysql::test;
 
 BOOST_AUTO_TEST_SUITE(test_character_set)
 
+// Helper: adapts a string_view to the pointer-pair interface of detail::call_next_char
+static std::size_t call_next_char(const character_set& charset, string_view s) noexcept
+{
+    return detail::call_next_char(charset, s.data(), s.data() + s.size());
+}
+
 BOOST_AUTO_TEST_CASE(utf8mb4_single_byte_valid)
 {
     for (int i = 0; i < 0x80; ++i)
@@ -29,11 +37,11 @@ BOOST_AUTO_TEST_CASE(utf8mb4_single_byte_valid)
         {
             // Exactly the required space
             char str[2]{static_cast<char>(i), '\0'};
-            auto actual_len = utf8mb4_charset.next_char(string_view(str, 1));
+            auto actual_len = detail::call_next_char(utf8mb4_charset, str, str + 1);
             BOOST_TEST(actual_len == 1u);
 
             // Extra space
-            actual_len = utf8mb4_charset.next_char(string_view(str, 2));
+            actual_len = detail::call_next_char(utf8mb4_charset, str, str + 2);
             BOOST_TEST(actual_len == 1u);
         }
     }
@@ -193,17 +201,17 @@ BOOST_AUTO_TEST_CASE(utf8mb4_multibyte_valid)
         BOOST_TEST_CONTEXT(tc.name)
         {
             // Exactly the required space
-            auto actual_len = utf8mb4_charset.next_char(tc.input);
+            auto actual_len = call_next_char(utf8mb4_charset, tc.input);
             BOOST_TEST(actual_len == tc.expected);
 
             // Extra space
             auto extra_space_input = std::string(tc.input) + "abc";
-            actual_len = utf8mb4_charset.next_char(extra_space_input);
+            actual_len = call_next_char(utf8mb4_charset, extra_space_input);
             BOOST_TEST(actual_len == tc.expected);
 
             // Not enough space (end of data before the end of the byte sequence)
             auto not_enough_input = tc.input.substr(1);
-            actual_len = utf8mb4_charset.next_char(not_enough_input);
+            actual_len = call_next_char(utf8mb4_charset, not_enough_input);
             BOOST_TEST(actual_len == 0u);
         }
     }
@@ -224,7 +232,7 @@ BOOST_AUTO_TEST_CASE(utf8mb4_invalid_start_byte)
         BOOST_TEST_CONTEXT(+b)
         {
             auto input = static_cast<char>(b);
-            auto size = utf8mb4_charset.next_char(string_view(&input, 1));
+            auto size = detail::call_next_char(utf8mb4_charset, &input, &input + 1);
             BOOST_TEST(size == 0u);
         }
     }
@@ -442,7 +450,7 @@ BOOST_AUTO_TEST_CASE(utf8mb4_invalid_continuation)
         {
             // add some extra continuation bytes, so we never fail because of lack of space
             auto input = std::string(tc.input) + "\x91\x91";
-            auto size = utf8mb4_charset.next_char(input);
+            auto size = call_next_char(utf8mb4_charset, input);
             BOOST_TEST(size == 0u);
         }
     }
@@ -456,7 +464,7 @@ BOOST_AUTO_TEST_CASE(ascii)
         BOOST_TEST_CONTEXT(i)
         {
             char str[2]{static_cast<char>(i), '\0'};
-            auto size = ascii_charset.next_char(string_view(str, 2));
+            auto size = detail::call_next_char(ascii_charset, str, str + 2);
             BOOST_TEST(size == 1u);
         }
     }
@@ -467,7 +475,7 @@ BOOST_AUTO_TEST_CASE(ascii)
         BOOST_TEST_CONTEXT(i)
         {
             char str[2]{static_cast<char>(i), '\0'};
-            auto size = ascii_charset.next_char(string_view(str, 2));
+            auto size = detail::call_next_char(ascii_charset, str, str + 2);
             BOOST_TEST(size == 0u);
         }
     }

diff --git a/test/unit/test/escape_string.cpp b/test/unit/test/escape_string.cpp
@@ -21,28 +21,12 @@
 
 #include "test_common/create_basic.hpp"
 #include "test_common/printing.hpp"
+#include "test_unit/ff_charset.hpp"
 
 using namespace boost::mysql;
 
 BOOST_AUTO_TEST_SUITE(test_escape_string)
 
-// A hypothetical character set with rules that may confuse the algorithm.
-// Some MySQL charsets (e.g. gbk) contain ASCII-compatible continuation characters
-std::size_t next_char_test_encoding(string_view input) noexcept
-{
-    // This is a hypothetical encoding used for testing
-    BOOST_ASSERT(!input.empty());
-
-    // Multibyte characters start with 0xff, and continuation bytes can include ascii-compatible characters
-    if (input.size() >= 2u && static_cast<unsigned char>(input[0]) == 0xff)
-        return 2;
-
-    // Otherwise, it's a plain character
-    return 1;
-}
-
-constexpr character_set test_charset{"test", &next_char_test_encoding};
-
 //
 // Escaping using backslashes
 //
@@ -135,7 +119,7 @@ BOOST_AUTO_TEST_CASE(backslashes_multibyte_ascii_compatible_chars)
     string_view s = "This is \\ a string \xff\\ with a weird \xff\" encoding \"";
     std::string output = "abc";
 
-    auto ec = escape_string(s, {test_charset, true}, quoting_context::double_quote, output);
+    auto ec = escape_string(s, {test::ff_charset, true}, quoting_context::double_quote, output);
 
     BOOST_TEST(ec == error_code());
     BOOST_TEST(output == "This is \\\\ a string \xff\\ with a weird \xff\" encoding \\\"");
@@ -245,7 +229,7 @@ BOOST_AUTO_TEST_CASE(quotes_multibyte_ascii_compatible_chars)
     string_view s = "This is \" a string \xfe\" with a weird \xff\" encoding \"";
     std::string output = "abc";
 
-    auto ec = escape_string(s, {test_charset, false}, quoting_context::double_quote, output);
+    auto ec = escape_string(s, {test::ff_charset, false}, quoting_context::double_quote, output);
 
     BOOST_TEST(ec == error_code());
     BOOST_TEST(output == "This is \"\" a string \xfe\"\" with a weird \xff\" encoding \"\"");

diff --git a/test/unit/test/format_sql/api.cpp b/test/unit/test/format_sql/api.cpp
@@ -18,6 +18,7 @@
 
 #include "format_common.hpp"
 #include "test_common/printing.hpp"
+#include "test_unit/ff_charset.hpp"
 
 //
 // Contains spotchecks verifying that the main success and error cases

diff --git a/test/unit/test/format_sql/format_common.hpp b/test/unit/test/format_sql/format_common.hpp
@@ -11,6 +11,8 @@
 #include <boost/mysql/format_sql.hpp>
 #include <boost/mysql/string_view.hpp>
 
+#include <boost/core/span.hpp>
+
 #include <string>
 #include <vector>
 
@@ -50,16 +52,6 @@ struct formatter<custom::condition>
     }
 };
 
-// Custom charset function
-inline std::size_t ff_charset_next_char(string_view s) noexcept
-{
-    auto c = static_cast<unsigned char>(s[0]);
-    if (c == 0xff)  // 0xff marks a two-byte character
-        return s.size() > 1u ? 2u : 0u;
-    return 1u;
-};
-constexpr character_set ff_charset{"ff_charset", ff_charset_next_char};
-
 }  // namespace mysql
 }  // namespace boost
 

diff --git a/test/unit/test/format_sql/format_strings.cpp b/test/unit/test/format_sql/format_strings.cpp
@@ -17,6 +17,7 @@
 
 #include "format_common.hpp"
 #include "test_common/printing.hpp"
+#include "test_unit/ff_charset.hpp"
 
 using namespace boost::mysql;
 
@@ -113,8 +114,8 @@ BOOST_AUTO_TEST_CASE(success)
 // backslash_escapes and character set are propagated
 BOOST_AUTO_TEST_CASE(options_propagated)
 {
-    format_options opts_charset{ff_charset, true};
-    format_options opts_backslashes{ff_charset, false};
+    format_options opts_charset{test::ff_charset, true};
+    format_options opts_backslashes{test::ff_charset, false};
 
     // Charset affects format strings
     BOOST_TEST(format_sql("SELECT \xffh + {};", opts_charset, 42) == "SELECT \xffh + 42;");
@@ -132,7 +133,7 @@ BOOST_AUTO_TEST_CASE(options_propagated)
 // interpret {} characters as continuations, rather than trying to expand them
 BOOST_AUTO_TEST_CASE(format_strings_brace_continuation)
 {
-    format_options custom_opts{ff_charset, true};
+    format_options custom_opts{test::ff_charset, true};
 
     BOOST_TEST(format_sql("SELECT \xff{ + {};", custom_opts, 42) == "SELECT \xff{ + 42;");
     BOOST_TEST(format_sql("SELECT \xff} + {};", custom_opts, 42) == "SELECT \xff} + 42;");