From d6e10d2dabaf689ef36e628ee6d54cd460664615 Mon Sep 17 00:00:00 2001 From: Raiki Tamura Date: Sat, 2 Dec 2023 21:01:59 +0900 Subject: [PATCH] gccrs: Implement quick-check for Unicode gcc/rust/ChangeLog: * rust-lang.cc (run_rust_tests): Add test. * rust-system.h: Add . * util/make-rust-unicode.py: Output NFC_Quick_Check table. * util/rust-codepoint.h (struct Codepoint): Add is_supplementary method. * util/rust-unicode-data.h: Generated. * util/rust-unicode.cc (binary_search_sorted_array): Removed. (lookup_cc): Remove namespace. (is_alphabetic): Use std::binary_search (nfc_quick_check): New function. (nfc_normalize): Use nfc_quick_check. (is_nfc_qc_maybe): New function. (is_nfc_qc_no): New function. (rust_nfc_qc_test): New test. * util/rust-unicode.h (is_nfc_qc_no): New function. (is_nfc_qc_maybe): New function. (enum class): New enum class. (nfc_quick_check): New function. (rust_nfc_qc_test): New test. Signed-off-by: Raiki Tamura --- gcc/rust/rust-lang.cc | 1 + gcc/rust/rust-system.h | 1 + gcc/rust/util/make-rust-unicode.py | 39 +++++-- gcc/rust/util/rust-codepoint.h | 1 + gcc/rust/util/rust-unicode-data.h | 158 ++++++++++++++++++++++++++++- gcc/rust/util/rust-unicode.cc | 143 +++++++++++++------------- gcc/rust/util/rust-unicode.h | 19 ++++ 7 files changed, 280 insertions(+), 82 deletions(-) diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc index 839025633e42..a167d50ac7e4 100644 --- a/gcc/rust/rust-lang.cc +++ b/gcc/rust/rust-lang.cc @@ -441,6 +441,7 @@ run_rust_tests () { // Call tests for the rust frontend here rust_input_source_test (); + rust_nfc_qc_test (); rust_utf8_normalize_test (); rust_punycode_encode_test (); rust_cfg_parser_test (); diff --git a/gcc/rust/rust-system.h b/gcc/rust/rust-system.h index 6ee4531ebfa6..0135b1b67228 100644 --- a/gcc/rust/rust-system.h +++ b/gcc/rust/rust-system.h @@ -44,6 +44,7 @@ #include #include #include +#include // Rust frontend requires C++11 minimum, so will have unordered_map and set #include diff --git a/gcc/rust/util/make-rust-unicode.py b/gcc/rust/util/make-rust-unicode.py index 5303440fd251..f6f04ebdf5b3 100644 --- a/gcc/rust/util/make-rust-unicode.py +++ b/gcc/rust/util/make-rust-unicode.py @@ -250,6 +250,30 @@ def write_numeric() -> None: print("}};") +def write_nfc_qc(): + print( + "const std::array, {}> NFC_QC_NO_RANGES = {{{{".format( + len(nfc_qc_no_ranges) + ) + ) + print(" // clang-format off") + for r in nfc_qc_no_ranges: + print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1])) + print(" // clang-format on") + print("}};") + + print( + "const std::array, {}> NFC_QC_MAYBE_RANGES = {{{{".format( + len(nfc_qc_maybe_ranges) + ) + ) + print(" // clang-format off") + for r in nfc_qc_maybe_ranges: + print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1])) + print(" // clang-format on") + print("}};") + + def main() -> None: if len(sys.argv) != 4: print("too few arguments", file=sys.stderr) @@ -265,13 +289,12 @@ def main() -> None: print(COPYRIGHT) print() - print('#include "rust-system.h"') - print() - print("namespace Rust {") - print() + print('#include "rust-system.h"\n') + print("namespace Rust {\n") print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges))) - print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints))) - print() + print( + "const uint32_t NUM_NUMERIC_CODEPOINTS = {};\n".format(len(numeric_codepoints)) + ) write_decomposition() print() @@ -283,8 +306,8 @@ def main() -> None: print() write_numeric() print() - - # TODO: write NFC_QC table + write_nfc_qc() + print() print("} // namespace Rust") diff --git a/gcc/rust/util/rust-codepoint.h b/gcc/rust/util/rust-codepoint.h index 425d364b1e14..eb005eb3e2fc 100644 --- a/gcc/rust/util/rust-codepoint.h +++ b/gcc/rust/util/rust-codepoint.h @@ -40,6 +40,7 @@ struct Codepoint static Codepoint eof () { return Codepoint (UINT32_MAX); } bool is_eof () const { return value == UINT32_MAX; } bool is_ascii () const { return value <= MAX_ASCII_CODEPOINT; } + bool is_supplementary_character () const { return value > 0xFFFF; } // Returns a C++ string containing string value of codepoint. std::string as_string (); diff --git a/gcc/rust/util/rust-unicode-data.h b/gcc/rust/util/rust-unicode-data.h index c42460f4541b..e07752a4f63b 100644 --- a/gcc/rust/util/rust-unicode-data.h +++ b/gcc/rust/util/rust-unicode-data.h @@ -20,7 +20,7 @@ namespace Rust { -const uint32_t NUM_ALPHABETIC_RANGES = 1117; +const uint32_t NUM_ALPHABETIC_RANGES = 1141; const uint32_t NUM_NUMERIC_CODEPOINTS = 1831; const std::map> DECOMPOSITION_MAP = { @@ -4167,6 +4167,7 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x0bd7, 0x0bd8}, {0x0c00, 0x0c01}, {0x0c01, 0x0c04}, + {0x0c04, 0x0c05}, {0x0c05, 0x0c0d}, {0x0c0e, 0x0c11}, {0x0c12, 0x0c29}, @@ -4202,6 +4203,7 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x0ce0, 0x0ce2}, {0x0ce2, 0x0ce4}, {0x0cf1, 0x0cf3}, + {0x0cf3, 0x0cf4}, {0x0d00, 0x0d02}, {0x0d02, 0x0d04}, {0x0d04, 0x0d0d}, @@ -4257,7 +4259,7 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x0f49, 0x0f6d}, {0x0f71, 0x0f7f}, {0x0f7f, 0x0f80}, - {0x0f80, 0x0f82}, + {0x0f80, 0x0f84}, {0x0f88, 0x0f8d}, {0x0f8d, 0x0f98}, {0x0f99, 0x0fbd}, @@ -4758,6 +4760,7 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x11071, 0x11073}, {0x11073, 0x11075}, {0x11075, 0x11076}, + {0x11080, 0x11082}, {0x11082, 0x11083}, {0x11083, 0x110b0}, {0x110b0, 0x110b3}, @@ -4794,6 +4797,8 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x11234, 0x11235}, {0x11237, 0x11238}, {0x1123e, 0x1123f}, + {0x1123f, 0x11241}, + {0x11241, 0x11242}, {0x11280, 0x11287}, {0x11288, 0x11289}, {0x1128a, 0x1128e}, @@ -4948,12 +4953,22 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x11ee0, 0x11ef3}, {0x11ef3, 0x11ef5}, {0x11ef5, 0x11ef7}, + {0x11f00, 0x11f02}, + {0x11f02, 0x11f03}, + {0x11f03, 0x11f04}, + {0x11f04, 0x11f11}, + {0x11f12, 0x11f34}, + {0x11f34, 0x11f36}, + {0x11f36, 0x11f3b}, + {0x11f3e, 0x11f40}, + {0x11f40, 0x11f41}, {0x11fb0, 0x11fb1}, {0x12000, 0x1239a}, {0x12400, 0x1246f}, {0x12480, 0x12544}, {0x12f90, 0x12ff1}, - {0x13000, 0x1342f}, + {0x13000, 0x13430}, + {0x13441, 0x13447}, {0x14400, 0x14647}, {0x16800, 0x16a39}, {0x16a40, 0x16a5f}, @@ -4980,7 +4995,9 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x1aff5, 0x1affc}, {0x1affd, 0x1afff}, {0x1b000, 0x1b123}, + {0x1b132, 0x1b133}, {0x1b150, 0x1b153}, + {0x1b155, 0x1b156}, {0x1b164, 0x1b168}, {0x1b170, 0x1b2fc}, {0x1bc00, 0x1bc6b}, @@ -5021,16 +5038,21 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x1df00, 0x1df0a}, {0x1df0a, 0x1df0b}, {0x1df0b, 0x1df1f}, + {0x1df25, 0x1df2b}, {0x1e000, 0x1e007}, {0x1e008, 0x1e019}, {0x1e01b, 0x1e022}, {0x1e023, 0x1e025}, {0x1e026, 0x1e02b}, + {0x1e030, 0x1e06e}, + {0x1e08f, 0x1e090}, {0x1e100, 0x1e12d}, {0x1e137, 0x1e13e}, {0x1e14e, 0x1e14f}, {0x1e290, 0x1e2ae}, {0x1e2c0, 0x1e2ec}, + {0x1e4d0, 0x1e4eb}, + {0x1e4eb, 0x1e4ec}, {0x1e7e0, 0x1e7e7}, {0x1e7e8, 0x1e7ec}, {0x1e7ed, 0x1e7ef}, @@ -5076,12 +5098,14 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x1f150, 0x1f16a}, {0x1f170, 0x1f18a}, {0x20000, 0x2a6e0}, - {0x2a700, 0x2b739}, + {0x2a700, 0x2b73a}, {0x2b740, 0x2b81e}, {0x2b820, 0x2cea2}, {0x2ceb0, 0x2ebe1}, + {0x2ebf0, 0x2ee5e}, {0x2f800, 0x2fa1e}, {0x30000, 0x3134b}, + {0x31350, 0x323b0}, // clang-format on }}; @@ -5205,4 +5229,130 @@ const std::array NUMERIC_CODEPOINTS = {{ // clang-format on }}; +const std::array, 74> NFC_QC_NO_RANGES = {{ + // clang-format off + {0x0340, 0x0342}, + {0x0343, 0x0345}, + {0x0374, 0x0375}, + {0x037e, 0x037f}, + {0x0387, 0x0388}, + {0x0958, 0x0960}, + {0x09dc, 0x09de}, + {0x09df, 0x09e0}, + {0x0a33, 0x0a34}, + {0x0a36, 0x0a37}, + {0x0a59, 0x0a5c}, + {0x0a5e, 0x0a5f}, + {0x0b5c, 0x0b5e}, + {0x0f43, 0x0f44}, + {0x0f4d, 0x0f4e}, + {0x0f52, 0x0f53}, + {0x0f57, 0x0f58}, + {0x0f5c, 0x0f5d}, + {0x0f69, 0x0f6a}, + {0x0f73, 0x0f74}, + {0x0f75, 0x0f77}, + {0x0f78, 0x0f79}, + {0x0f81, 0x0f82}, + {0x0f93, 0x0f94}, + {0x0f9d, 0x0f9e}, + {0x0fa2, 0x0fa3}, + {0x0fa7, 0x0fa8}, + {0x0fac, 0x0fad}, + {0x0fb9, 0x0fba}, + {0x1f71, 0x1f72}, + {0x1f73, 0x1f74}, + {0x1f75, 0x1f76}, + {0x1f77, 0x1f78}, + {0x1f79, 0x1f7a}, + {0x1f7b, 0x1f7c}, + {0x1f7d, 0x1f7e}, + {0x1fbb, 0x1fbc}, + {0x1fbe, 0x1fbf}, + {0x1fc9, 0x1fca}, + {0x1fcb, 0x1fcc}, + {0x1fd3, 0x1fd4}, + {0x1fdb, 0x1fdc}, + {0x1fe3, 0x1fe4}, + {0x1feb, 0x1fec}, + {0x1fee, 0x1ff0}, + {0x1ff9, 0x1ffa}, + {0x1ffb, 0x1ffc}, + {0x1ffd, 0x1ffe}, + {0x2000, 0x2002}, + {0x2126, 0x2127}, + {0x212a, 0x212c}, + {0x2329, 0x232a}, + {0x232a, 0x232b}, + {0x2adc, 0x2add}, + {0xf900, 0xfa0e}, + {0xfa10, 0xfa11}, + {0xfa12, 0xfa13}, + {0xfa15, 0xfa1f}, + {0xfa20, 0xfa21}, + {0xfa22, 0xfa23}, + {0xfa25, 0xfa27}, + {0xfa2a, 0xfa6e}, + {0xfa70, 0xfada}, + {0xfb1d, 0xfb1e}, + {0xfb1f, 0xfb20}, + {0xfb2a, 0xfb37}, + {0xfb38, 0xfb3d}, + {0xfb3e, 0xfb3f}, + {0xfb40, 0xfb42}, + {0xfb43, 0xfb45}, + {0xfb46, 0xfb4f}, + {0x1d15e, 0x1d165}, + {0x1d1bb, 0x1d1c1}, + {0x2f800, 0x2fa1e}, + // clang-format on +}}; +const std::array, 43> NFC_QC_MAYBE_RANGES = {{ + // clang-format off + {0x0300, 0x0305}, + {0x0306, 0x030d}, + {0x030f, 0x0310}, + {0x0311, 0x0312}, + {0x0313, 0x0315}, + {0x031b, 0x031c}, + {0x0323, 0x0329}, + {0x032d, 0x032f}, + {0x0330, 0x0332}, + {0x0338, 0x0339}, + {0x0342, 0x0343}, + {0x0345, 0x0346}, + {0x0653, 0x0656}, + {0x093c, 0x093d}, + {0x09be, 0x09bf}, + {0x09d7, 0x09d8}, + {0x0b3e, 0x0b3f}, + {0x0b56, 0x0b57}, + {0x0b57, 0x0b58}, + {0x0bbe, 0x0bbf}, + {0x0bd7, 0x0bd8}, + {0x0c56, 0x0c57}, + {0x0cc2, 0x0cc3}, + {0x0cd5, 0x0cd7}, + {0x0d3e, 0x0d3f}, + {0x0d57, 0x0d58}, + {0x0dca, 0x0dcb}, + {0x0dcf, 0x0dd0}, + {0x0ddf, 0x0de0}, + {0x102e, 0x102f}, + {0x1161, 0x1176}, + {0x11a8, 0x11c3}, + {0x1b35, 0x1b36}, + {0x3099, 0x309b}, + {0x110ba, 0x110bb}, + {0x11127, 0x11128}, + {0x1133e, 0x1133f}, + {0x11357, 0x11358}, + {0x114b0, 0x114b1}, + {0x114ba, 0x114bb}, + {0x114bd, 0x114be}, + {0x115af, 0x115b0}, + {0x11930, 0x11931}, + // clang-format on +}}; + } // namespace Rust diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc index 999ecb042ca3..6bd2db550a18 100644 --- a/gcc/rust/util/rust-unicode.cc +++ b/gcc/rust/util/rust-unicode.cc @@ -39,75 +39,27 @@ const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28; const uint32_t N_COUNT = V_COUNT * T_COUNT; const uint32_t S_COUNT = L_COUNT * N_COUNT; +// Check if the codepoint is in any of the ranges (half-open intervals [a,b)). template -int64_t +bool binary_search_ranges ( - // FIXME: use binray search function from const std::array, SIZE> &ranges, uint32_t target_cp) { - if (SIZE == 0) - return -1; - - uint32_t low, high, mid; - uint32_t start, end; - low = 0; - high = SIZE - 1; - mid = (low + high) / 2; - while (high - low > 1) - { - start = ranges[mid].first; - end = ranges[mid].second; - if (start <= target_cp && target_cp < end) - { - return mid; - } - else if (end <= target_cp) - low = mid + 1; - else - high = mid - 1; - mid = (low + high) / 2; - } - - if (ranges[mid].first <= target_cp && target_cp < ranges[mid].second) - return mid; - else - return -1; -} - -template -int64_t -binary_search_sorted_array (const std::array &array, - uint32_t target) -{ - // FIXME: use binray search function from - if (SIZE == 0) - return -1; - - uint32_t low, high, mid; - low = 0; - high = SIZE; - mid = (low + high) / 2; - while (high - low > 1) - { - if (array[mid] <= target) - low = mid; - else - high = mid; - mid = (low + high) / 2; - } - - if (array[mid] == target) - return mid; + auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp, + [] (const std::pair &a, + uint32_t b) { return a.second <= b; }); + if (it == ranges.end ()) + return false; else - return -1; + return it->first <= target_cp && target_cp < it->second; } int lookup_cc (codepoint_t c) { - auto it = Rust::CCC_TABLE.find (c.value); - if (it != Rust::CCC_TABLE.end ()) + auto it = CCC_TABLE.find (c.value); + if (it != CCC_TABLE.end ()) return it->second; else // Starter. Returns zero. @@ -289,10 +241,44 @@ recomp (string_t s) return buf; } +// see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms +QuickCheckResult +nfc_quick_check (const string_t &s) +{ + int last_canonical_class = 0; + QuickCheckResult res = QuickCheckResult::YES; + + for (unsigned long i = 0; i < s.size (); i++) + { + codepoint_t c = s[i]; + + if (c.is_supplementary_character ()) + i++; + + int canonical_class = lookup_cc (c); + if (last_canonical_class > canonical_class && canonical_class != 0) + return QuickCheckResult::NO; + + if (is_nfc_qc_no (c.value)) + return QuickCheckResult::NO; + + if (is_nfc_qc_maybe (c.value)) + res = QuickCheckResult::MAYBE; + + last_canonical_class = canonical_class; + } + return res; +} + string_t -nfc_normalize (string_t s) +nfc_normalize (const string_t &s) { - // TODO: Quick Check + if (nfc_quick_check (s) == QuickCheckResult::YES) + return s; + + // TODO: optimize normalization. + // i.e. only normalize a limited area around MAYBE character, instead of + // performing complete normlization of the entire string // decompose string_t d = decomp_cano (s); @@ -312,21 +298,26 @@ Utf8String::nfc_normalize () const bool is_alphabetic (uint32_t codepoint) { - int64_t res = binary_search_ranges (ALPHABETIC_RANGES, codepoint); - if (res < 0) - return false; - else - return true; + return binary_search_ranges (ALPHABETIC_RANGES, codepoint); } bool is_numeric (uint32_t codepoint) { - int64_t res = binary_search_sorted_array (NUMERIC_CODEPOINTS, codepoint); - if (res < 0) - return false; - else - return true; + return std::binary_search (NUMERIC_CODEPOINTS.begin (), + NUMERIC_CODEPOINTS.end (), codepoint); +} + +bool +is_nfc_qc_maybe (uint32_t codepoint) +{ + return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint); +} + +bool +is_nfc_qc_no (uint32_t codepoint) +{ + return binary_search_ranges (NFC_QC_NO_RANGES, codepoint); } bool @@ -344,6 +335,18 @@ is_ascii_only (const std::string &str) namespace selftest { +void +rust_nfc_qc_test () +{ + ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}), + Rust::QuickCheckResult::YES); + ASSERT_EQ (Rust::nfc_quick_check ( + {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}), + Rust::QuickCheckResult::MAYBE); + ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}), + Rust::QuickCheckResult::NO); +} + void assert_normalize (const std::vector origin, const std::vector expected) diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h index 2538436797fb..aa7bd8a56321 100644 --- a/gcc/rust/util/rust-unicode.h +++ b/gcc/rust/util/rust-unicode.h @@ -68,12 +68,31 @@ is_ascii_only (const std::string &str); bool is_numeric (uint32_t codepoint); +bool +is_nfc_qc_no (uint32_t codepoint); + +bool +is_nfc_qc_maybe (uint32_t codepoint); + +enum class QuickCheckResult +{ + YES, + NO, + MAYBE +}; + +QuickCheckResult +nfc_quick_check (const std::vector &s); + } // namespace Rust #if CHECKING_P namespace selftest { +void +rust_nfc_qc_test (); + void rust_utf8_normalize_test ();