
Commit

Added crc64 implementations for arm and intel, added some avx512 code for crc32 on intel as well.
JonathanHenson committed Jan 18, 2024
1 parent 5138407 commit 2a084ac
Showing 10 changed files with 1,387 additions and 3 deletions.
4 changes: 3 additions & 1 deletion CMakeLists.txt
@@ -115,8 +115,9 @@ if (USE_CPU_EXTENSIONS)
"source/arm/*.c"
)
SET_SOURCE_FILES_PROPERTIES(source/arm/crc32c_arm.c PROPERTIES COMPILE_FLAGS -march=armv8-a+crc )
SET_SOURCE_FILES_PROPERTIES(source/arm/crc64_arm.c PROPERTIES COMPILE_FLAGS "-march=armv8.1-a+crypto -mtune=neoverse-v1")
elseif ((NOT MSVC) AND AWS_ARCH_ARM32)
set(CMAKE_REQUIRED_FLAGS "-march=armv8-a+crc -Werror")
set(CMAKE_REQUIRED_FLAGS "-march=armv8-a+crc+crypto -Werror")
check_c_source_compiles("
#include <arm_acle.h>
int main() {
@@ -130,6 +131,7 @@
"source/arm/asm/*.c"
)
SET_SOURCE_FILES_PROPERTIES(source/arm/crc32c_arm.c PROPERTIES COMPILE_FLAGS -march=armv8-a+crc )
SET_SOURCE_FILES_PROPERTIES(source/arm/crc64_arm.c PROPERTIES COMPILE_FLAGS "-march=armv8.1-a+crypto -mtune=neoverse-v1")
endif()
endif()
endif()
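
The CMAKE_REQUIRED_FLAGS change above adds +crypto to the architecture flags; a probe that actually exercises the pmull path could compile a snippet like the following C source (a hedged sketch, not taken from this commit):

#include <arm_neon.h>
int main() {
    poly64x2_t a = vdupq_n_p64((poly64_t)1);
    /* vmull_high_p64 is the 64-bit carryless multiply the crc64 kernel relies on */
    poly128_t r = vmull_high_p64(a, a);
    (void)r;
    return 0;
}
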
14 changes: 12 additions & 2 deletions include/aws/checksums/crc.h
@@ -18,15 +18,25 @@ AWS_EXTERN_C_BEGIN
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing
* to update a running crc in a subsequent call.
*/
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previousCrc32);
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previous_crc32);

/**
* The entry point function to perform a Castagnoli CRC32c (iSCSI) computation.
* Selects a suitable implementation based on hardware capabilities.
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing
* to update a running crc in a subsequent call.
*/
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previousCrc32);
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previous_crc32c);

/**
* The entry point function to perform a CRC64-XZ (a.k.a. CRC64/GO-ECMA) computation.
* Selects a suitable implementation based on hardware capabilities.
* Pass 0 in the previousCrc64 parameter as an initial value unless continuing
* to update a running crc in a subsequent call.
* There are many variants of CRC64 algorithms. This CRC64 variant is bit-reflected (based on
* the non bit-reflected polynomial 0x142F0E1EBA9EA3693) and inverts the CRC input and output bits.
*/
AWS_CHECKSUMS_API uint64_t aws_checksums_crc64xz(const uint8_t *input, int length, uint64_t previous_crc64);

AWS_EXTERN_C_END
AWS_POP_SANE_WARNING_LEVEL
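
For context, a minimal usage sketch of the new entry point (the function and buffer names here are illustrative, not part of the header):

#include <stdint.h>
#include <aws/checksums/crc.h>

uint64_t checksum_in_two_parts(const uint8_t *buf, int len) {
    /* start a running crc from 0, then continue it over the second half */
    uint64_t crc = aws_checksums_crc64xz(buf, len / 2, 0);
    return aws_checksums_crc64xz(buf + len / 2, len - len / 2, crc);
}
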
43 changes: 43 additions & 0 deletions include/aws/checksums/private/crc64_priv.h
@@ -0,0 +1,43 @@
#ifndef AWS_CHECKSUMS_PRIVATE_CRC64_PRIV_H
#define AWS_CHECKSUMS_PRIVATE_CRC64_PRIV_H

/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/

#include <stdint.h>
#include <aws/common/macros.h>
#include <aws/checksums/exports.h>

AWS_EXTERN_C_BEGIN

uint64_t aws_checksums_crc64xz_sw(const uint8_t *input, int length, uint64_t previousCrc64);

#if defined(__x86_64__)
uint64_t aws_checksums_crc64xz_intel_clmul(const uint8_t *input, int length, uint64_t previousCrc64);
#endif /* defined(__x86_64__) */

#if defined(__aarch64__)
uint64_t aws_checksums_crc64xz_arm_pmull(const uint8_t *input, int length, uint64_t previousCrc64);
#endif /* defined(__aarch64__) */

/* Pre-computed constants for CRC64 */
typedef struct {
uint64_t x2048[8]; // x^2112 mod P(x) / x^2048 mod P(x)
uint64_t x1536[8]; // x^1600 mod P(x) / x^1536 mod P(x)
uint64_t x1024[8]; // x^1088 mod P(x) / x^1024 mod P(x)
uint64_t x512[8]; // x^576 mod P(x) / x^512 mod P(x)
uint64_t x384[2]; // x^448 mod P(x) / x^384 mod P(x)
uint64_t x256[2]; // x^320 mod P(x) / x^256 mod P(x)
uint64_t x128[2]; // x^192 mod P(x) / x^128 mod P(x)
uint64_t mu_poly[2]; // Barrett mu / polynomial P(x)
uint64_t trailing[15][2]; // Folding constants for 15 possible trailing input data lengths
} aws_checksums_crc64_constants_t;

extern uint8_t aws_checksums_masks_shifts[6][16];
extern aws_checksums_crc64_constants_t aws_checksums_crc64xz_constants;

AWS_EXTERN_C_END

#endif /* AWS_CHECKSUMS_PRIVATE_CRC64_PRIV_H */
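
All of the folding constants above are derived from the same bit-reflected CRC64-XZ polynomial. As a point of reference (not part of this commit), the checksum these kernels accelerate can be sketched as a plain bitwise loop, assuming the standard reflected polynomial 0xC96C5795D7870F42:

#include <stddef.h>
#include <stdint.h>

/* hypothetical reference implementation: reflected CRC64-XZ with inverted input and output */
static uint64_t crc64xz_bitwise(const uint8_t *input, size_t length, uint64_t previous_crc64) {
    uint64_t crc = ~previous_crc64;
    for (size_t i = 0; i < length; ++i) {
        crc ^= input[i];
        for (int bit = 0; bit < 8; ++bit) {
            /* shift out one bit; xor in the reflected polynomial when the dropped bit was set */
            crc = (crc >> 1) ^ ((crc & 1) ? 0xC96C5795D7870F42ULL : 0);
        }
    }
    return ~crc;
}

The SIMD paths fold 16 to 128 bytes at a time with carryless multiplies by the x^k mod P(x) constants in the struct, but they must produce the same result as this byte-at-a-time loop.
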
232 changes: 232 additions & 0 deletions source/arm/crc64_arm.c
@@ -0,0 +1,232 @@
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/

#include <aws/checksums/private/crc64_priv.h>

#if defined(__aarch64__)

#include <arm_neon.h>

// Load a uint8x16_t neon register from uint8_t pointer
#define load_u8(uint8_t_ptr) vld1q_u8((uint8_t_ptr))
// Load a poly64x2_t neon register from a uint8_t pointer
#define load_p64_u8(uint8_t_ptr) vreinterpretq_p64_u8(load_u8(uint8_t_ptr))
// Load a poly64x2_t neon register from a uint64_t pointer
#define load_p64(uint64_t_ptr) vreinterpretq_p64_u64(vld1q_u64((uint64_t_ptr)))
// Mask the bytes in a neon uint8x16_t register and preserve 0 to 15 least significant bytes.
#define mask_low_u8(u8, count) vandq_u8(u8, load_u8(aws_checksums_masks_shifts[5] - (intptr_t) (count)))
// Mask the bytes in a neon uint8x16_t register and preserve 0 to 15 most significant bytes.
#define mask_high_u8(u8, count) vandq_u8(u8, load_u8(aws_checksums_masks_shifts[3] + (intptr_t) (count)))
// Mask the bytes in a neon poly64x2_t register and preserve 0 to 15 most significant bytes.
#define mask_high_p64(poly, count) vreinterpretq_p64_u8(mask_high_u8(vreinterpretq_u8_p64(poly), count))
// Left shift bytes in a neon uint8x16_t register - shift count from 0 to 15.
#define left_shift_u8(u8, count) vqtbl1q_u8(u8, load_u8(aws_checksums_masks_shifts[1] - (intptr_t) (count)))
// Right shift bytes in a neon uint8x16_t register - shift count from 0 to 15.
#define right_shift_u8(u8, count) vqtbl1q_u8(u8, load_u8(aws_checksums_masks_shifts[1] + (intptr_t) (count)))
// Left shift bytes in a neon poly64x2_t register - shift count from 0 to 15.
#define left_shift_p64(poly, count) vreinterpretq_p64_u8(left_shift_u8(vreinterpretq_u8_p64(poly), count))
// Right shift a neon poly64x2_t register 0 to 15 bytes - imm must be an immediate constant
#define right_shift_imm_p64(poly, imm) vreinterpretq_p64_u8(vextq_u8(vreinterpretq_u8_p64(poly), vdupq_n_u8(0), imm))
// Carryless multiply the lower 64-bit halves of two poly64x2_t neon registers
#define pmull_lo(a, b) (vreinterpretq_p64_p128(vmull_p64((poly64_t) vreinterpretq_p128_p64(a), (poly64_t) vreinterpretq_p128_p64(b))))
// Carryless multiply the upper 64-bit halves of two poly64x2_t neon registers
#define pmull_hi(a, b) (vreinterpretq_p64_p128(vmull_high_p64((a), (b))))
// XOR two neon poly64x2_t registers
#define xor_p64(a, b) vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(a), vreinterpretq_u8_p64(b)))
#if defined(__ARM_FEATURE_SHA3)
// The presence of the ARM SHA3 feature also implies the three-way xor instruction
#define xor3_p64(a, b, c) vreinterpretq_p64_u64(veor3q_u64(vreinterpretq_u64_p64(a), vreinterpretq_u64_p64(b), vreinterpretq_u64_p64(c)))
#else
// Without SHA3, implement three-way xor with two normal xors
#define xor3_p64(a, b, c) xor_p64(xor_p64(a, b), c)
#endif // defined(__ARM_FEATURE_SHA3)

/** Compute CRC64XZ using ARMv8 NEON +crypto/pmull64 instructions. */
uint64_t aws_checksums_crc64xz_arm_pmull(const uint8_t *input, int length, const uint64_t previousCrc64) {
if (!input || length <= 0) {
return previousCrc64;
}

// Invert the previous crc bits and load into the lower half of a neon register
poly64x2_t a1 = vreinterpretq_p64_u64(vcombine_u64(vcreate_u64(~previousCrc64), vcreate_u64(0)));

// Load the x^128 and x^192 constants - they'll (very likely) be needed
const poly64x2_t x128 = load_p64(aws_checksums_crc64xz_constants.x128);

if (length < 16) {
// Neon register loads are 16 bytes at once, so for lengths less than 16 we need to
// carefully load from memory to prevent reading beyond the end of the input buffer
int alignment = (intptr_t) input & 15;
if (alignment + length <= 16) {
// The input falls in a single 16 byte segment so we load from a 16 byte aligned address
// The input data will be loaded "into the middle" of the neon register
// Right shift the input data register to eliminate any leading bytes and move the data to the least
// significant bytes, then mask out the most significant bytes that may contain garbage
uint8x16_t load = mask_low_u8(right_shift_u8(load_u8(input - alignment), alignment), length);
// XOR the masked input data with the previous crc
a1 = xor_p64(a1, vreinterpretq_p64_u8(load));
} else {
// The input spans two 16 byte segments so it's safe to load the input from its actual starting address
// The input data will be in the least significant bytes of the neon register
// Mask out the most significant bytes that may contain garbage
uint8x16_t load = mask_low_u8(load_u8(input), length);
// XOR the masked input data with the previous crc
a1 = xor_p64(a1, vreinterpretq_p64_u8(load));
}

if (length <= 8) {
// For 8 or less bytes of input just left shift to effectively multiply by x^64
a1 = left_shift_p64(a1, 8 - length);
} else {
// For 8-15 bytes of input we can't left shift without losing the most significant bytes
// We need to fold the lower half of the crc register into the upper half
a1 = left_shift_p64(a1, 16 - length);
// Multiply the lower half of the crc register by x^128 (swapping upper and lower halves)
// XOR the result with the right shifted upper half of the crc
a1 = xor_p64(right_shift_imm_p64(a1, 8), pmull_lo(a1, vextq_p64(x128, x128, 1)));
}

// Fall through to Barrett modular reduction

} else { // There are 16 or more bytes of input

// Load the next 16 bytes of input and XOR with the previous crc
a1 = xor_p64(a1, load_p64_u8(input));
input += 16;
length -= 16;

if (length < 112) {

const poly64x2_t x256 = load_p64(aws_checksums_crc64xz_constants.x256);

if (length & 64) {
// Fold the current crc register with 64 bytes of input by multiplying 64-bit chunks by x^576 through x^128
const poly64x2_t x512 = load_p64(aws_checksums_crc64xz_constants.x512);
const poly64x2_t x384 = load_p64(aws_checksums_crc64xz_constants.x384);
poly64x2_t b1 = load_p64_u8(input + 0);
poly64x2_t c1 = load_p64_u8(input + 16);
poly64x2_t d1 = load_p64_u8(input + 32);
poly64x2_t e1 = load_p64_u8(input + 48);
a1 = xor3_p64(pmull_lo(x512, a1), pmull_hi(x512, a1), pmull_lo(x384, b1));
b1 = xor3_p64(pmull_hi(x384, b1), pmull_lo(x256, c1), pmull_hi(x256, c1));
c1 = xor3_p64(pmull_lo(x128, d1), pmull_hi(x128, d1), e1);
a1 = xor3_p64(a1, b1, c1);
input += 64;
}

if (length & 32) {
// Fold the current running value with 32 bytes of input by multiplying 64-bit chunks by x^320 through x^128
poly64x2_t b1 = load_p64_u8(input + 0);
poly64x2_t c1 = load_p64_u8(input + 16);
a1 = xor3_p64(c1, pmull_lo(x256, a1), pmull_hi(x256, a1));
a1 = xor3_p64(a1, pmull_lo(x128, b1), pmull_hi(x128, b1));
input += 32;
}
} else { // There are 112 or more bytes of input

const poly64x2_t x1024 = load_p64(aws_checksums_crc64xz_constants.x1024);

// Load another 112 bytes of input
poly64x2_t b1 = load_p64_u8(input + 0);
poly64x2_t c1 = load_p64_u8(input + 16);
poly64x2_t d1 = load_p64_u8(input + 32);
poly64x2_t e1 = load_p64_u8(input + 48);
poly64x2_t f1 = load_p64_u8(input + 64);
poly64x2_t g1 = load_p64_u8(input + 80);
poly64x2_t h1 = load_p64_u8(input + 96);
input += 112;
length -= 112;

// Spin through additional chunks of 128 bytes, if any
int loops = length / 128;
while (loops--) {
// Fold input values in parallel by multiplying by x^1088 and x^1024 constants
a1 = xor3_p64(pmull_lo(x1024, a1), pmull_hi(x1024, a1), load_p64_u8(input + 0));
b1 = xor3_p64(pmull_lo(x1024, b1), pmull_hi(x1024, b1), load_p64_u8(input + 16));
c1 = xor3_p64(pmull_lo(x1024, c1), pmull_hi(x1024, c1), load_p64_u8(input + 32));
d1 = xor3_p64(pmull_lo(x1024, d1), pmull_hi(x1024, d1), load_p64_u8(input + 48));
e1 = xor3_p64(pmull_lo(x1024, e1), pmull_hi(x1024, e1), load_p64_u8(input + 64));
f1 = xor3_p64(pmull_lo(x1024, f1), pmull_hi(x1024, f1), load_p64_u8(input + 80));
g1 = xor3_p64(pmull_lo(x1024, g1), pmull_hi(x1024, g1), load_p64_u8(input + 96));
h1 = xor3_p64(pmull_lo(x1024, h1), pmull_hi(x1024, h1), load_p64_u8(input + 112));
input += 128;
}

// Fold 128 bytes down to 64 bytes by multiplying by the x^576 and x^512 constants
const poly64x2_t x512 = load_p64(aws_checksums_crc64xz_constants.x512);
a1 = xor3_p64(e1, pmull_lo(x512, a1), pmull_hi(x512, a1));
b1 = xor3_p64(f1, pmull_lo(x512, b1), pmull_hi(x512, b1));
c1 = xor3_p64(g1, pmull_lo(x512, c1), pmull_hi(x512, c1));
d1 = xor3_p64(h1, pmull_lo(x512, d1), pmull_hi(x512, d1));

if (length & 64) {
// Fold the current 64 bytes with 64 bytes of input by multiplying by x^576 and x^512 constants
a1 = xor3_p64(pmull_lo(x512, a1), pmull_hi(x512, a1), load_p64_u8(input + 0));
b1 = xor3_p64(pmull_lo(x512, b1), pmull_hi(x512, b1), load_p64_u8(input + 16));
c1 = xor3_p64(pmull_lo(x512, c1), pmull_hi(x512, c1), load_p64_u8(input + 32));
d1 = xor3_p64(pmull_lo(x512, d1), pmull_hi(x512, d1), load_p64_u8(input + 48));
input += 64;
}

// Fold 64 bytes down to 32 bytes by multiplying by the x^320 and x^256 constants
const poly64x2_t x256 = load_p64(aws_checksums_crc64xz_constants.x256);
a1 = xor3_p64(c1, pmull_lo(x256, a1), pmull_hi(x256, a1));
b1 = xor3_p64(d1, pmull_lo(x256, b1), pmull_hi(x256, b1));

if (length & 32) {
// Fold the current running value with 32 bytes of input by multiplying by x^320 and x^256 constants
a1 = xor3_p64(pmull_lo(x256, a1), pmull_hi(x256, a1), load_p64_u8(input + 0));
b1 = xor3_p64(pmull_lo(x256, b1), pmull_hi(x256, b1), load_p64_u8(input + 16));
input += 32;
}

// Fold 32 bytes down to 16 bytes by multiplying by x^192 and x^128 constants
a1 = xor3_p64(b1, pmull_lo(x128, a1), pmull_hi(x128, a1));
}

if (length & 16) {
// Fold the current 16 bytes with 16 bytes of input by multiplying by x^192 and x^128 constants
a1 = xor3_p64(pmull_lo(x128, a1), pmull_hi(x128, a1), load_p64_u8(input + 0));
input += 16;
}

// There must only be 0-15 bytes of input left
length &= 15;

if (length == 0) {
// Multiply the lower half of the crc register by x^128 (swapping upper and lower halves)
poly64x2_t mul_by_x128 = pmull_lo(a1, vextq_p64(x128, x128, 1));
// XOR the result with the right shifted upper half of the crc
a1 = xor_p64(right_shift_imm_p64(a1, 8), mul_by_x128);
} else {
// Handle any trailing input from 1-15 bytes
const poly64x2_t trailing_constants = load_p64(aws_checksums_crc64xz_constants.trailing[length - 1]);
// Multiply the crc by a pair of trailing length constants in order to fold it into the trailing input
a1 = xor_p64(pmull_lo(a1, trailing_constants), pmull_hi(a1, trailing_constants));
// Safely load ending at the last byte of trailing input and mask out any leading garbage
poly64x2_t trailing_input = mask_high_p64(load_p64_u8(input + length - 16), length);
// Multiply the lower half of the trailing input register by x^128 (swapping x^192 and x^128 halves)
poly64x2_t mul_by_x128 = pmull_lo(trailing_input, vextq_p64(x128, x128, 1));
// XOR the results with the right shifted upper half of the trailing input
a1 = xor3_p64(a1, right_shift_imm_p64(trailing_input, 8), mul_by_x128);
}
}

// Barrett modular reduction
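//
// Barrett reduction avoids a final long division by P(x): the low half of the remaining value is
// multiplied by the precomputed mu constant (approximately x^128 div P(x)) to estimate the quotient,
// the estimate is multiplied by P(x), and xor-ing that product back in leaves the reduced 64-bit crc
// in the upper lane of the register.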

// Load the Barrett mu and (bit-reflected) polynomial
const poly64x2_t mu_poly = load_p64(aws_checksums_crc64xz_constants.mu_poly);
// Multiply the lower half of the crc register by mu (mu is in the lower half of mu_poly)
poly64x2_t mul_by_mu = pmull_lo(a1, mu_poly);
// Multiply lower half of mul_by_mu result by poly (which is swapped into the lower half)
poly64x2_t mul_by_poly = pmull_lo(mul_by_mu, vextq_p64(mu_poly, mu_poly, 1));
// Swap halves of mul_by_mu and add the upper halves of everything
poly64x2_t result = xor3_p64(a1, vextq_p64(mul_by_mu, mul_by_mu, 1), mul_by_poly);

// Reduction result is the upper half - invert the bits before returning the crc
return ~vgetq_lane_u64(vreinterpretq_u64_p64(result), 1);
}

#endif // defined(__aarch64__)
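
For reference, a hypothetical spot check (the function name is illustrative) could compare the kernels declared in crc64_priv.h against the published CRC-64/XZ check value for "123456789", which is 0x995DC9BBDF1939FA:

#include <assert.h>
#include <aws/checksums/private/crc64_priv.h>

void spot_check_crc64xz(void) {
    const uint8_t *msg = (const uint8_t *)"123456789";
    /* portable software path */
    uint64_t sw = aws_checksums_crc64xz_sw(msg, 9, 0);
    assert(sw == 0x995DC9BBDF1939FAULL);
#if defined(__aarch64__)
    /* the pmull kernel must agree with the portable path */
    assert(aws_checksums_crc64xz_arm_pmull(msg, 9, 0) == sw);
#endif
}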