Added crc64 implementations for arm and intel, added some avx512 code for crc32 on intel as well.
1 parent 5138407, commit 2a084ac. Showing 10 changed files with 1,387 additions and 3 deletions.
aws/checksums/private/crc64_priv.h (new file, +43 lines):
#ifndef AWS_CHECKSUMS_PRIVATE_CRC64_PRIV_H
#define AWS_CHECKSUMS_PRIVATE_CRC64_PRIV_H

/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#include <stdint.h>
#include <aws/common/macros.h>
#include <aws/checksums/exports.h>

AWS_EXTERN_C_BEGIN

uint64_t aws_checksums_crc64xz_sw(const uint8_t *input, int length, uint64_t previousCrc64);

#if defined(__x86_64__)
uint64_t aws_checksums_crc64xz_intel_clmul(const uint8_t *input, int length, uint64_t previousCrc64);
#endif /* defined(__x86_64__) */

#if defined(__aarch64__)
uint64_t aws_checksums_crc64xz_arm_pmull(const uint8_t *input, int length, uint64_t previousCrc64);
#endif /* defined(__aarch64__) */

/* Pre-computed constants for CRC64 */
typedef struct {
    uint64_t x2048[8];        // x^2112 mod P(x) / x^2048 mod P(x)
    uint64_t x1536[8];        // x^1600 mod P(x) / x^1536 mod P(x)
    uint64_t x1024[8];        // x^1088 mod P(x) / x^1024 mod P(x)
    uint64_t x512[8];         // x^576 mod P(x) / x^512 mod P(x)
    uint64_t x384[2];         // x^448 mod P(x) / x^384 mod P(x)
    uint64_t x256[2];         // x^320 mod P(x) / x^256 mod P(x)
    uint64_t x128[2];         // x^192 mod P(x) / x^128 mod P(x)
    uint64_t mu_poly[2];      // Barrett mu / polynomial P(x)
    uint64_t trailing[15][2]; // Folding constants for the 15 possible trailing input data lengths
} aws_checksums_crc64_constants_t;
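
/* Note (explanatory addition, not original commit text): each pair of
 * constants holds the two 64-bit multipliers used to fold the upper and lower
 * halves of a 128-bit crc state forward across one distance - e.g. the x128
 * pair folds the state across 16 bytes of input - so large inputs can be
 * reduced in wide strides before the final Barrett reduction. */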

extern uint8_t aws_checksums_masks_shifts[6][16];
extern aws_checksums_crc64_constants_t aws_checksums_crc64xz_constants;

AWS_EXTERN_C_END

#endif /* AWS_CHECKSUMS_PRIVATE_CRC64_PRIV_H */
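
Callers would typically pick the fastest variant available for the build target. Below is a minimal dispatch sketch assuming only what the header above declares; the dispatcher name is hypothetical, and the real library additionally performs runtime CPU feature detection:

#include <aws/checksums/private/crc64_priv.h>

/* Hypothetical compile-time dispatcher, for illustration only. */
static uint64_t crc64xz_dispatch(const uint8_t *input, int length, uint64_t previous_crc) {
#if defined(__aarch64__)
    return aws_checksums_crc64xz_arm_pmull(input, length, previous_crc);
#elif defined(__x86_64__)
    return aws_checksums_crc64xz_intel_clmul(input, length, previous_crc);
#else
    return aws_checksums_crc64xz_sw(input, length, previous_crc);
#endif
}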
A new source file (+232 lines) implementing the aarch64 PMULL variant:
/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#include <aws/checksums/private/crc64_priv.h>

#if defined(__aarch64__)

#include <arm_neon.h>

// Load a uint8x16_t neon register from a uint8_t pointer
#define load_u8(uint8_t_ptr) vld1q_u8((uint8_t_ptr))
// Load a poly64x2_t neon register from a uint8_t pointer
#define load_p64_u8(uint8_t_ptr) vreinterpretq_p64_u8(load_u8(uint8_t_ptr))
// Load a poly64x2_t neon register from a uint64_t pointer
#define load_p64(uint64_t_ptr) vreinterpretq_p64_u64(vld1q_u64((uint64_t_ptr)))
// Mask the bytes in a neon uint8x16_t register, preserving the 0 to 15 least significant bytes
#define mask_low_u8(u8, count) vandq_u8(u8, load_u8(aws_checksums_masks_shifts[5] - (intptr_t)(count)))
// Mask the bytes in a neon uint8x16_t register, preserving the 0 to 15 most significant bytes
#define mask_high_u8(u8, count) vandq_u8(u8, load_u8(aws_checksums_masks_shifts[3] + (intptr_t)(count)))
// Mask the bytes in a neon poly64x2_t register, preserving the 0 to 15 most significant bytes
#define mask_high_p64(poly, count) vreinterpretq_p64_u8(mask_high_u8(vreinterpretq_u8_p64(poly), count))
// Left shift the bytes in a neon uint8x16_t register - shift count from 0 to 15
#define left_shift_u8(u8, count) vqtbl1q_u8(u8, load_u8(aws_checksums_masks_shifts[1] - (intptr_t)(count)))
// Right shift the bytes in a neon uint8x16_t register - shift count from 0 to 15
#define right_shift_u8(u8, count) vqtbl1q_u8(u8, load_u8(aws_checksums_masks_shifts[1] + (intptr_t)(count)))
// Left shift the bytes in a neon poly64x2_t register - shift count from 0 to 15
#define left_shift_p64(poly, count) vreinterpretq_p64_u8(left_shift_u8(vreinterpretq_u8_p64(poly), count))
// Right shift a neon poly64x2_t register by 0 to 15 bytes - imm must be an immediate constant
#define right_shift_imm_p64(poly, imm) vreinterpretq_p64_u8(vextq_u8(vreinterpretq_u8_p64(poly), vdupq_n_u8(0), imm))
// Carryless multiply the lower 64-bit halves of two poly64x2_t neon registers
#define pmull_lo(a, b) \
    (vreinterpretq_p64_p128(vmull_p64((poly64_t)vreinterpretq_p128_p64(a), (poly64_t)vreinterpretq_p128_p64(b))))
// Carryless multiply the upper 64-bit halves of two poly64x2_t neon registers
#define pmull_hi(a, b) (vreinterpretq_p64_p128(vmull_high_p64((a), (b))))
// XOR two neon poly64x2_t registers
#define xor_p64(a, b) vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(a), vreinterpretq_u8_p64(b)))

#if defined(__ARM_FEATURE_SHA3)
// The presence of the ARM SHA3 feature also implies the three-way xor instruction
#define xor3_p64(a, b, c) \
    vreinterpretq_p64_u64(veor3q_u64(vreinterpretq_u64_p64(a), vreinterpretq_u64_p64(b), vreinterpretq_u64_p64(c)))
#else
// Without SHA3, implement three-way xor with two normal xors
#define xor3_p64(a, b, c) xor_p64(xor_p64(a, b), c)
#endif // defined(__ARM_FEATURE_SHA3)
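
/*
 * For exposition (this helper is an illustrative addition, not part of the
 * original commit): carryless multiplication is long multiplication with XOR
 * in place of addition, i.e. polynomial multiplication over GF(2). A scalar
 * reference for what each pmull_* macro computes on one pair of 64-bit lanes:
 */
static inline void pmull_scalar_reference(uint64_t a, uint64_t b, uint64_t product[2]) {
    uint64_t lo = 0;
    uint64_t hi = 0;
    for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            lo ^= a << i; // low 64 bits of the 128-bit carryless product
            if (i > 0) {
                hi ^= a >> (64 - i); // high 64 bits of the 128-bit carryless product
            }
        }
    }
    product[0] = lo;
    product[1] = hi;
}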

/** Compute CRC64XZ using ARMv8 NEON +crypto/pmull64 instructions. */
uint64_t aws_checksums_crc64xz_arm_pmull(const uint8_t *input, int length, const uint64_t previousCrc64) {
    if (!input || length <= 0) {
        return previousCrc64;
    }

    // Invert the previous crc bits and load into the lower half of a neon register
    poly64x2_t a1 = vreinterpretq_p64_u64(vcombine_u64(vcreate_u64(~previousCrc64), vcreate_u64(0)));

    // Load the x^128 and x^192 constants - they'll (very likely) be needed
    const poly64x2_t x128 = load_p64(aws_checksums_crc64xz_constants.x128);

    if (length < 16) {
        // Neon register loads are 16 bytes at once, so for lengths less than 16 we need to
        // load carefully from memory to prevent reading beyond the end of the input buffer
        int alignment = (intptr_t)input & 15;
        if (alignment + length <= 16) {
            // The input falls within a single 16-byte segment, so load from a 16-byte aligned address:
            // the input data lands "in the middle" of the neon register.
            // Right shift the input data register to eliminate any leading bytes and move the data to the
            // least significant bytes, then mask out the most significant bytes that may contain garbage
            uint8x16_t load = mask_low_u8(right_shift_u8(load_u8(input - alignment), alignment), length);
            // XOR the masked input data with the previous crc
            a1 = xor_p64(a1, vreinterpretq_p64_u8(load));
        } else {
            // The input spans two 16-byte segments, so it's safe to load from its actual starting address:
            // the input data lands in the least significant bytes of the neon register.
            // Mask out the most significant bytes that may contain garbage
            uint8x16_t load = mask_low_u8(load_u8(input), length);
            // XOR the masked input data with the previous crc
            a1 = xor_p64(a1, vreinterpretq_p64_u8(load));
        }

        if (length <= 8) {
            // For 8 or fewer bytes of input, just left shift to effectively multiply by x^64
            a1 = left_shift_p64(a1, 8 - length);
        } else {
            // For 9-15 bytes of input we can't left shift without losing the most significant bytes,
            // so fold the lower half of the crc register into the upper half instead
            a1 = left_shift_p64(a1, 16 - length);
            // Multiply the lower half of the crc register by x^128 (swapping upper and lower halves)
            // and XOR the result with the right shifted upper half of the crc
            a1 = xor_p64(right_shift_imm_p64(a1, 8), pmull_lo(a1, vextq_p64(x128, x128, 1)));
        }

        // Fall through to Barrett modular reduction

    } else { // There are 16 or more bytes of input

        // Load the next 16 bytes of input and XOR with the previous crc
        a1 = xor_p64(a1, load_p64_u8(input));
        input += 16;
        length -= 16;

        if (length < 112) {

            const poly64x2_t x256 = load_p64(aws_checksums_crc64xz_constants.x256);

            if (length & 64) {
                // Fold the current crc register with 64 bytes of input by multiplying 64-bit chunks
                // by the x^576 through x^128 constants
                const poly64x2_t x512 = load_p64(aws_checksums_crc64xz_constants.x512);
                const poly64x2_t x384 = load_p64(aws_checksums_crc64xz_constants.x384);
                poly64x2_t b1 = load_p64_u8(input + 0);
                poly64x2_t c1 = load_p64_u8(input + 16);
                poly64x2_t d1 = load_p64_u8(input + 32);
                poly64x2_t e1 = load_p64_u8(input + 48);
                a1 = xor3_p64(pmull_lo(x512, a1), pmull_hi(x512, a1), pmull_lo(x384, b1));
                b1 = xor3_p64(pmull_hi(x384, b1), pmull_lo(x256, c1), pmull_hi(x256, c1));
                c1 = xor3_p64(pmull_lo(x128, d1), pmull_hi(x128, d1), e1);
                a1 = xor3_p64(a1, b1, c1);
                input += 64;
            }

            if (length & 32) {
                // Fold the current running value with 32 bytes of input by multiplying 64-bit chunks
                // by the x^320 through x^128 constants
                poly64x2_t b1 = load_p64_u8(input + 0);
                poly64x2_t c1 = load_p64_u8(input + 16);
                a1 = xor3_p64(c1, pmull_lo(x256, a1), pmull_hi(x256, a1));
                a1 = xor3_p64(a1, pmull_lo(x128, b1), pmull_hi(x128, b1));
                input += 32;
            }
        } else { // There are 112 or more bytes of input

            const poly64x2_t x1024 = load_p64(aws_checksums_crc64xz_constants.x1024);

            // Load another 112 bytes of input
            poly64x2_t b1 = load_p64_u8(input + 0);
            poly64x2_t c1 = load_p64_u8(input + 16);
            poly64x2_t d1 = load_p64_u8(input + 32);
            poly64x2_t e1 = load_p64_u8(input + 48);
            poly64x2_t f1 = load_p64_u8(input + 64);
            poly64x2_t g1 = load_p64_u8(input + 80);
            poly64x2_t h1 = load_p64_u8(input + 96);
            input += 112;
            length -= 112;

            // Spin through additional chunks of 128 bytes, if any
            int loops = length / 128;
            while (loops--) {
                // Fold the eight running values in parallel by multiplying by the x^1088 and x^1024 constants
                a1 = xor3_p64(pmull_lo(x1024, a1), pmull_hi(x1024, a1), load_p64_u8(input + 0));
                b1 = xor3_p64(pmull_lo(x1024, b1), pmull_hi(x1024, b1), load_p64_u8(input + 16));
                c1 = xor3_p64(pmull_lo(x1024, c1), pmull_hi(x1024, c1), load_p64_u8(input + 32));
                d1 = xor3_p64(pmull_lo(x1024, d1), pmull_hi(x1024, d1), load_p64_u8(input + 48));
                e1 = xor3_p64(pmull_lo(x1024, e1), pmull_hi(x1024, e1), load_p64_u8(input + 64));
                f1 = xor3_p64(pmull_lo(x1024, f1), pmull_hi(x1024, f1), load_p64_u8(input + 80));
                g1 = xor3_p64(pmull_lo(x1024, g1), pmull_hi(x1024, g1), load_p64_u8(input + 96));
                h1 = xor3_p64(pmull_lo(x1024, h1), pmull_hi(x1024, h1), load_p64_u8(input + 112));
                input += 128;
            }

            // Fold 128 bytes down to 64 bytes by multiplying by the x^576 and x^512 constants
            const poly64x2_t x512 = load_p64(aws_checksums_crc64xz_constants.x512);
            a1 = xor3_p64(e1, pmull_lo(x512, a1), pmull_hi(x512, a1));
            b1 = xor3_p64(f1, pmull_lo(x512, b1), pmull_hi(x512, b1));
            c1 = xor3_p64(g1, pmull_lo(x512, c1), pmull_hi(x512, c1));
            d1 = xor3_p64(h1, pmull_lo(x512, d1), pmull_hi(x512, d1));

            if (length & 64) {
                // Fold the current 64 bytes with 64 bytes of input by multiplying by the x^576 and x^512 constants
                a1 = xor3_p64(pmull_lo(x512, a1), pmull_hi(x512, a1), load_p64_u8(input + 0));
                b1 = xor3_p64(pmull_lo(x512, b1), pmull_hi(x512, b1), load_p64_u8(input + 16));
                c1 = xor3_p64(pmull_lo(x512, c1), pmull_hi(x512, c1), load_p64_u8(input + 32));
                d1 = xor3_p64(pmull_lo(x512, d1), pmull_hi(x512, d1), load_p64_u8(input + 48));
                input += 64;
            }

            // Fold 64 bytes down to 32 bytes by multiplying by the x^320 and x^256 constants
            const poly64x2_t x256 = load_p64(aws_checksums_crc64xz_constants.x256);
            a1 = xor3_p64(c1, pmull_lo(x256, a1), pmull_hi(x256, a1));
            b1 = xor3_p64(d1, pmull_lo(x256, b1), pmull_hi(x256, b1));

            if (length & 32) {
                // Fold the current running value with 32 bytes of input by multiplying by the x^320 and x^256 constants
                a1 = xor3_p64(pmull_lo(x256, a1), pmull_hi(x256, a1), load_p64_u8(input + 0));
                b1 = xor3_p64(pmull_lo(x256, b1), pmull_hi(x256, b1), load_p64_u8(input + 16));
                input += 32;
            }

            // Fold 32 bytes down to 16 bytes by multiplying by the x^192 and x^128 constants
            a1 = xor3_p64(b1, pmull_lo(x128, a1), pmull_hi(x128, a1));
        }

        if (length & 16) {
            // Fold the current 16 bytes with 16 bytes of input by multiplying by the x^192 and x^128 constants
            a1 = xor3_p64(pmull_lo(x128, a1), pmull_hi(x128, a1), load_p64_u8(input + 0));
            input += 16;
        }

        // There can only be 0-15 bytes of input left
        length &= 15;

        if (length == 0) {
            // Multiply the lower half of the crc register by x^128 (swapping upper and lower halves)
            poly64x2_t mul_by_x128 = pmull_lo(a1, vextq_p64(x128, x128, 1));
            // XOR the result with the right shifted upper half of the crc
            a1 = xor_p64(right_shift_imm_p64(a1, 8), mul_by_x128);
        } else {
            // Handle any trailing input of 1-15 bytes
            const poly64x2_t trailing_constants = load_p64(aws_checksums_crc64xz_constants.trailing[length - 1]);
            // Multiply the crc by a pair of trailing length constants in order to fold it into the trailing input
            a1 = xor_p64(pmull_lo(a1, trailing_constants), pmull_hi(a1, trailing_constants));
            // Safely load ending at the last byte of trailing input and mask out any leading garbage
            poly64x2_t trailing_input = mask_high_p64(load_p64_u8(input + length - 16), length);
            // Multiply the lower half of the trailing input register by x^128 (swapping x^192 and x^128 halves)
            poly64x2_t mul_by_x128 = pmull_lo(trailing_input, vextq_p64(x128, x128, 1));
            // XOR the results with the right shifted upper half of the trailing input
            a1 = xor3_p64(a1, right_shift_imm_p64(trailing_input, 8), mul_by_x128);
        }
    }
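
    // Note (explanatory addition, not original commit text): Barrett reduction
    // computes the remainder of the folded state modulo P(x) without division.
    // With mu ~ floor(x^128 / P(x)) precomputed, an estimated quotient is formed
    // from the high bits of state * mu, and XORing state with quotient * P(x)
    // leaves the 64-bit remainder - just two carryless multiplies plus XORs.
    // The lane swaps below account for the bit-reflected representation.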

    // Barrett modular reduction

    // Load the Barrett mu and (bit-reflected) polynomial
    const poly64x2_t mu_poly = load_p64(aws_checksums_crc64xz_constants.mu_poly);
    // Multiply the lower half of the crc register by mu (mu is in the lower half of mu_poly)
    poly64x2_t mul_by_mu = pmull_lo(a1, mu_poly);
    // Multiply the lower half of the mul_by_mu result by the polynomial (swapped into the lower half)
    poly64x2_t mul_by_poly = pmull_lo(mul_by_mu, vextq_p64(mu_poly, mu_poly, 1));
    // Swap the halves of mul_by_mu and XOR the upper halves of everything
    poly64x2_t result = xor3_p64(a1, vextq_p64(mul_by_mu, mul_by_mu, 1), mul_by_poly);

    // The reduction result is the upper half - invert the bits before returning the crc
    return ~vgetq_lane_u64(vreinterpretq_u64_p64(result), 1);
}

#endif // defined(__aarch64__)
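
A quick smoke test of the new function (an illustrative sketch: the harness is hypothetical and only builds on aarch64 targets with pmull support, and the expected value is the published CRC-64/XZ catalogue check value for the ASCII string "123456789", not something taken from this commit):

#include <stdio.h>
#include <aws/checksums/private/crc64_priv.h>

int main(void) {
    const uint8_t data[] = "123456789";
    // Seed with 0; to continue a running crc, pass the previous return value
    uint64_t crc = aws_checksums_crc64xz_arm_pmull(data, 9, 0);
    printf("crc64xz(\"123456789\") = 0x%016llX\n", (unsigned long long)crc);
    // Expected CRC-64/XZ check value: 0x995DC9BBDF1939FA
    return crc == 0x995DC9BBDF1939FAULL ? 0 : 1;
}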