From 0a4f1c261d0e29d4278be818b7bdae9dc59e69af Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Thu, 8 Dec 2022 10:35:10 -0800 Subject: [PATCH] delete the internal kangarootwelve bindings My kangarootwelve_xkcp crate was originally derived from these bindings, but today it's better maintained and easier to build. This also improves the performance of the very long K12 benchmark. --- Cargo.toml | 4 +- benches/bench.rs | 6 +- benches/kangarootwelve/.gitignore | 4 - benches/kangarootwelve/Cargo.toml | 12 - benches/kangarootwelve/K12/Makefile | 15 - benches/kangarootwelve/K12/Makefile.build | 78 -- benches/kangarootwelve/K12/README.markdown | 58 - .../K12/lib/Inplace32BI/KeccakP-1600-SnP.h | 30 - .../Inplace32BI/KeccakP-1600-inplace32BI.c | 1067 ---------------- .../kangarootwelve/K12/lib/KangarooTwelve.c | 572 --------- .../kangarootwelve/K12/lib/KangarooTwelve.h | 110 -- .../K12/lib/Optimized64/KeccakP-1600-AVX2.s | 607 --------- .../K12/lib/Optimized64/KeccakP-1600-AVX512.s | 503 -------- .../K12/lib/Optimized64/KeccakP-1600-SnP.h | 69 - .../K12/lib/Optimized64/KeccakP-1600-opt64.c | 1105 ----------------- .../Optimized64/KeccakP-1600-timesN-AVX2.c | 427 ------- .../Optimized64/KeccakP-1600-timesN-AVX512.c | 426 ------- .../Optimized64/KeccakP-1600-timesN-SSSE3.c | 446 ------- benches/kangarootwelve/K12/lib/brg_endian.h | 143 --- .../K12/support/Build/ExpandProducts.xsl | 76 -- .../K12/support/Build/ToGlobalMakefile.xsl | 182 --- .../K12/support/Build/ToOneTarget.xsl | 86 -- .../K12/support/Build/ToTargetMakefile.xsl | 208 ---- .../K12/support/Build/ToVCXProj.xsl | 131 -- benches/kangarootwelve/K12/tests/main.c | 70 -- .../K12/tests/testKangarooTwelve.c | 318 ----- .../K12/tests/testKangarooTwelve.h | 17 - .../K12/tests/testPerformance.c | 219 ---- .../K12/tests/testPerformance.h | 19 - benches/kangarootwelve/K12/tests/timing.h | 110 -- benches/kangarootwelve/README.md | 10 - benches/kangarootwelve/build.rs | 19 - benches/kangarootwelve/src/lib.rs | 55 - 33 files changed, 6 insertions(+), 7196 deletions(-) delete mode 100644 benches/kangarootwelve/.gitignore delete mode 100644 benches/kangarootwelve/Cargo.toml delete mode 100644 benches/kangarootwelve/K12/Makefile delete mode 100644 benches/kangarootwelve/K12/Makefile.build delete mode 100644 benches/kangarootwelve/K12/README.markdown delete mode 100644 benches/kangarootwelve/K12/lib/Inplace32BI/KeccakP-1600-SnP.h delete mode 100644 benches/kangarootwelve/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c delete mode 100644 benches/kangarootwelve/K12/lib/KangarooTwelve.c delete mode 100644 benches/kangarootwelve/K12/lib/KangarooTwelve.h delete mode 100644 benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-AVX2.s delete mode 100644 benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-AVX512.s delete mode 100644 benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-SnP.h delete mode 100644 benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-opt64.c delete mode 100644 benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c delete mode 100644 benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c delete mode 100644 benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c delete mode 100644 benches/kangarootwelve/K12/lib/brg_endian.h delete mode 100644 benches/kangarootwelve/K12/support/Build/ExpandProducts.xsl delete mode 100644 benches/kangarootwelve/K12/support/Build/ToGlobalMakefile.xsl delete mode 100644 benches/kangarootwelve/K12/support/Build/ToOneTarget.xsl delete mode 100644 benches/kangarootwelve/K12/support/Build/ToTargetMakefile.xsl delete mode 100644 benches/kangarootwelve/K12/support/Build/ToVCXProj.xsl delete mode 100644 benches/kangarootwelve/K12/tests/main.c delete mode 100644 benches/kangarootwelve/K12/tests/testKangarooTwelve.c delete mode 100644 benches/kangarootwelve/K12/tests/testKangarooTwelve.h delete mode 100644 benches/kangarootwelve/K12/tests/testPerformance.c delete mode 100644 benches/kangarootwelve/K12/tests/testPerformance.h delete mode 100644 benches/kangarootwelve/K12/tests/timing.h delete mode 100644 benches/kangarootwelve/README.md delete mode 100644 benches/kangarootwelve/build.rs delete mode 100644 benches/kangarootwelve/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index 3f00939..af1d56d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,8 @@ edition = "2018" # only controls whether blake2b_simd and blake2s_simd build with no_std. default = ["std"] std = ["blake2b_simd/std", "blake2s_simd/std"] +# only for benchmarks +kangarootwelve = ["kangarootwelve_xkcp"] [dependencies] arrayvec = "0.7.0" @@ -26,7 +28,7 @@ blake2b_simd = { path = "./blake2b", default-features = false } blake2s_simd = { path = "./blake2s", default-features = false } blake2-avx2-sneves = { path = "benches/blake2-avx2-sneves", optional = true } hex = "0.3.2" -kangarootwelve = { path = "benches/kangarootwelve", optional = true } +kangarootwelve_xkcp = { version = "0.1.5", optional = true } lazy_static = "1.3.0" libsodium-ffi = { version = "0.1.17", optional = true } openssl = { version = "0.10.23", optional = true } diff --git a/benches/bench.rs b/benches/bench.rs index d81862b..49659a0 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -361,20 +361,20 @@ fn bench_long_sneves_blake2sp(b: &mut Bencher) { #[bench] fn bench_verylong_kangarootwelve(b: &mut Bencher) { let mut input = RandomInput::new(b, VERYLONG); - b.iter(|| kangarootwelve::kangarootwelve(input.get())); + b.iter(|| kangarootwelve_xkcp::hash(input.get())); } #[cfg(feature = "kangarootwelve")] #[bench] fn bench_long_kangarootwelve(b: &mut Bencher) { let mut input = RandomInput::new(b, LONG); - b.iter(|| kangarootwelve::kangarootwelve(input.get())); + b.iter(|| kangarootwelve_xkcp::hash(input.get())); } #[cfg(feature = "kangarootwelve")] #[bench] fn bench_onebyte_kangarootwelve(b: &mut Bencher) { - b.iter(|| kangarootwelve::kangarootwelve(b"x")); + b.iter(|| kangarootwelve_xkcp::hash(b"x")); } #[cfg(feature = "libsodium-ffi")] diff --git a/benches/kangarootwelve/.gitignore b/benches/kangarootwelve/.gitignore deleted file mode 100644 index e609942..0000000 --- a/benches/kangarootwelve/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -/target -**/*.rs.bk -Cargo.lock -K12/bin diff --git a/benches/kangarootwelve/Cargo.toml b/benches/kangarootwelve/Cargo.toml deleted file mode 100644 index b5101f6..0000000 --- a/benches/kangarootwelve/Cargo.toml +++ /dev/null @@ -1,12 +0,0 @@ -[package] -name = "kangarootwelve" -version = "0.0.0" -authors = ["Jack O'Connor "] -edition = "2018" - -[dependencies] - -[build-dependencies] - -[dev-dependencies] -hex = "0.3.2" diff --git a/benches/kangarootwelve/K12/Makefile b/benches/kangarootwelve/K12/Makefile deleted file mode 100644 index dbf11de..0000000 --- a/benches/kangarootwelve/K12/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -_list: Makefile.build support/Build/ToGlobalMakefile.xsl - -bin/.build/Makefile: bin/.build/Makefile.expanded - mkdir -p $(dir $@) - xsltproc --xinclude -o $@ support/Build/ToGlobalMakefile.xsl $< - -bin/.build/Makefile.expanded: Makefile.build - mkdir -p $(dir $@) - xsltproc --xinclude -o $@ support/Build/ExpandProducts.xsl $< - --include bin/.build/Makefile - -.PHONY: clean -clean: - rm -rf bin/ diff --git a/benches/kangarootwelve/K12/Makefile.build b/benches/kangarootwelve/K12/Makefile.build deleted file mode 100644 index 5576e0a..0000000 --- a/benches/kangarootwelve/K12/Makefile.build +++ /dev/null @@ -1,78 +0,0 @@ - - - - - - -fomit-frame-pointer - -O2 - -g0 - - - - - - lib/Inplace32BI/KeccakP-1600-inplace32BI.c - lib/Inplace32BI/KeccakP-1600-SnP.h - - - - lib/Optimized64/KeccakP-1600-opt64.c - lib/Optimized64/KeccakP-1600-SnP.h - lib/Optimized64/KeccakP-1600-AVX2.s - lib/Optimized64/KeccakP-1600-AVX512.s - lib/Optimized64/KeccakP-1600-timesN-SSSE3.c - lib/Optimized64/KeccakP-1600-timesN-AVX2.c - lib/Optimized64/KeccakP-1600-timesN-AVX512.c - - - - - - lib/KangarooTwelve.c - lib/KangarooTwelve.h - - - - - - lib/brg_endian.h - - - - - tests/main.c - tests/testPerformance.c - tests/timing.h - tests/testPerformance.h - tests/testKangarooTwelve.c - tests/testKangarooTwelve.h - -lm - - - - - - - - - - - - - - - - - - - - diff --git a/benches/kangarootwelve/K12/README.markdown b/benches/kangarootwelve/K12/README.markdown deleted file mode 100644 index 6f77de3..0000000 --- a/benches/kangarootwelve/K12/README.markdown +++ /dev/null @@ -1,58 +0,0 @@ -# What is KangarooTwelve ? - -[**KangarooTwelve**][k12] (or **K12**) is a fast and secure extendable-output function (XOF), the generalization of hash functions to arbitrary output lengths. -Derived from Keccak, it aims at higher speeds than FIPS 202's SHA-3 and SHAKE functions, while retaining their flexibility and basis of security. - -On high-end platforms, it can exploit a high degree of parallelism, whether using multiple cores or the single-instruction multiple-data (SIMD) instruction set of modern processors. -On Intel's® Haswell and Skylake architectures, KangarooTwelve tops at less than 1.5 cycles/byte for long messages on a single core, and at 0.55 cycles/byte on the SkylakeX architecture. -On low-end platforms, as well as for short messages, it also benefits from about a factor two speed-up compared to the fastest FIPS 202 instance SHAKE128. - -More details can be found in our [ACNS Paper][eprint]. - -# What can I find here? - -This repository contains source code that implements the extandable output (or hash) function [**KangarooTwelve**][k12] (or **K12**). -Its purpose is to offer optimized implementations of K12 and nothing else. - -The code comes from the [**eXtended Keccak Code Package**][xkcp] (or **XKCP**), after much trimming to keep only what is needed for K12. -It is still structured like the XKCP in two layers. The lower layer implements the permutation Keccak-_p_[1600, 12] and possibly parallel versions thereof, whereas the higher layer implements the sponge construction and the K12 tree hash mode. -Also, some sources have been merged to reduce the file count. - -* For the higher layer, we kept only the code needed for K12. -* For the lower layer, we removed all the functions that are not needed for K12. The lower layer therefore implements a subset of the SnP and PlSnP interfaces. - -For Keccak or Xoodoo-based functions other than K12 only, it is recommended to use the XKCP itself instead and not to mix both this repository and the XKCP. - - -# How can I build this K12 code? - -This repository uses the same build system as that of the XKCP. -To build, the following tools are needed: - -* *GCC* -* *GNU make* -* *xsltproc* - -The different targets are defined in [`Makefile.build`](Makefile.build). This file is expanded into a regular makefile using *xsltproc*. To use it, simply type, e.g., - -``` -make generic64/K12Tests -``` - -to build K12Tests generically optimized for 64-bit platforms. The name before the slash indicates the platform, while the part after the slash is the executable to build. As another example, the static (resp. dynamic) library is built by typing `make generic64/libK12.a` (resp. `.so`) or similarly with `generic64` replaced with the appropriate platform name. An alternate C compiler can be specified via the `CC` environment variable. - -Instead of building an executable with *GCC*, one can choose to select the files needed and make a package. For this, simply append `.pack` to the target name, e.g., - -``` -make generic64/K12Tests.pack -``` - -This creates a `.tar.gz` archive with all the necessary files to build the given target. - -The list of targets can be found at the end of [`Makefile.build`](Makefile.build) or by running `make` without parameters. - -For Microsoft Visual Studio support and other details, please refer to the [XKCP][xkcp]. - -[k12]: https://keccak.team/kangarootwelve.html -[xkcp]: https://github.com/XKCP/XKCP -[eprint]: https://eprint.iacr.org/2016/770.pdf diff --git a/benches/kangarootwelve/K12/lib/Inplace32BI/KeccakP-1600-SnP.h b/benches/kangarootwelve/K12/lib/Inplace32BI/KeccakP-1600-SnP.h deleted file mode 100644 index d10c598..0000000 --- a/benches/kangarootwelve/K12/lib/Inplace32BI/KeccakP-1600-SnP.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -Implementation by Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to our website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#ifndef _KeccakP_1600_SnP_h_ -#define _KeccakP_1600_SnP_h_ - -#define KeccakP1600_stateSizeInBytes 200 -#define KeccakP1600_stateAlignment 8 -#define KeccakP1600_disableParallelism - -const char * KeccakP1600_GetImplementation(); -void KeccakP1600_Initialize(void *state); -void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); -void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600_Permute_12rounds(void *state); -void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); - -#endif diff --git a/benches/kangarootwelve/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c b/benches/kangarootwelve/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c deleted file mode 100644 index 7ced8c7..0000000 --- a/benches/kangarootwelve/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c +++ /dev/null @@ -1,1067 +0,0 @@ -/* -Implementation by Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to our website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#include -#include "brg_endian.h" -#include "KeccakP-1600-SnP.h" - -const char * KeccakP1600_GetImplementation() -{ - return "in-place 32-bit implementation"; -} - - -typedef unsigned char UINT8; -typedef unsigned int UINT32; -/* WARNING: on 8-bit and 16-bit platforms, this should be replaced by: */ -/* typedef unsigned long UINT32; */ - -#define ROL32(a, offset) ((((UINT32)a) << (offset)) ^ (((UINT32)a) >> (32-(offset)))) - -/* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */ -#define prepareToBitInterleaving(low, high, temp, temp0, temp1) \ - temp0 = (low); \ - temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \ - temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \ - temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \ - temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \ - temp1 = (high); \ - temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1); \ - temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \ - temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \ - temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8); - -#define toBitInterleavingAndXOR(low, high, even, odd, temp, temp0, temp1) \ - prepareToBitInterleaving(low, high, temp, temp0, temp1) \ - even ^= (temp0 & 0x0000FFFF) | (temp1 << 16); \ - odd ^= (temp0 >> 16) | (temp1 & 0xFFFF0000); - -#define toBitInterleavingAndAND(low, high, even, odd, temp, temp0, temp1) \ - prepareToBitInterleaving(low, high, temp, temp0, temp1) \ - even &= (temp0 & 0x0000FFFF) | (temp1 << 16); \ - odd &= (temp0 >> 16) | (temp1 & 0xFFFF0000); - -#define toBitInterleavingAndSet(low, high, even, odd, temp, temp0, temp1) \ - prepareToBitInterleaving(low, high, temp, temp0, temp1) \ - even = (temp0 & 0x0000FFFF) | (temp1 << 16); \ - odd = (temp0 >> 16) | (temp1 & 0xFFFF0000); - -/* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */ -#define prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ - temp0 = (even); \ - temp1 = (odd); \ - temp = (temp0 & 0x0000FFFF) | (temp1 << 16); \ - temp1 = (temp0 >> 16) | (temp1 & 0xFFFF0000); \ - temp0 = temp; \ - temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \ - temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \ - temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \ - temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \ - temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8); \ - temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \ - temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \ - temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1); - -#define fromBitInterleaving(even, odd, low, high, temp, temp0, temp1) \ - prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ - low = temp0; \ - high = temp1; - -#define fromBitInterleavingAndXOR(even, odd, lowIn, highIn, lowOut, highOut, temp, temp0, temp1) \ - prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ - lowOut = lowIn ^ temp0; \ - highOut = highIn ^ temp1; - -void KeccakP1600_SetBytesInLaneToZero(void *state, unsigned int lanePosition, unsigned int offset, unsigned int length) -{ - UINT8 laneAsBytes[8]; - UINT32 low, high; - UINT32 temp, temp0, temp1; - UINT32 *stateAsHalfLanes = (UINT32*)state; - - memset(laneAsBytes, 0xFF, offset); - memset(laneAsBytes+offset, 0x00, length); - memset(laneAsBytes+offset+length, 0xFF, 8-offset-length); -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - low = *((UINT32*)(laneAsBytes+0)); - high = *((UINT32*)(laneAsBytes+4)); -#else - low = laneAsBytes[0] - | ((UINT32)(laneAsBytes[1]) << 8) - | ((UINT32)(laneAsBytes[2]) << 16) - | ((UINT32)(laneAsBytes[3]) << 24); - high = laneAsBytes[4] - | ((UINT32)(laneAsBytes[5]) << 8) - | ((UINT32)(laneAsBytes[6]) << 16) - | ((UINT32)(laneAsBytes[7]) << 24); -#endif - toBitInterleavingAndAND(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_Initialize(void *state) -{ - memset(state, 0, 200); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset) -{ - unsigned int lanePosition = offset/8; - unsigned int offsetInLane = offset%8; - UINT32 low, high; - UINT32 temp, temp0, temp1; - UINT32 *stateAsHalfLanes = (UINT32*)state; - - if (offsetInLane < 4) { - low = (UINT32)byte << (offsetInLane*8); - high = 0; - } - else { - low = 0; - high = (UINT32)byte << ((offsetInLane-4)*8); - } - toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) -{ - UINT8 laneAsBytes[8]; - UINT32 low, high; - UINT32 temp, temp0, temp1; - UINT32 *stateAsHalfLanes = (UINT32*)state; - - memset(laneAsBytes, 0, 8); - memcpy(laneAsBytes+offset, data, length); -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - low = *((UINT32*)(laneAsBytes+0)); - high = *((UINT32*)(laneAsBytes+4)); -#else - low = laneAsBytes[0] - | ((UINT32)(laneAsBytes[1]) << 8) - | ((UINT32)(laneAsBytes[2]) << 16) - | ((UINT32)(laneAsBytes[3]) << 24); - high = laneAsBytes[4] - | ((UINT32)(laneAsBytes[5]) << 8) - | ((UINT32)(laneAsBytes[6]) << 16) - | ((UINT32)(laneAsBytes[7]) << 24); -#endif - toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount) -{ -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - const UINT32 * pI = (const UINT32 *)data; - UINT32 * pS = (UINT32*)state; - UINT32 t, x0, x1; - int i; - for (i = laneCount-1; i >= 0; --i) { -#ifdef NO_MISALIGNED_ACCESSES - UINT32 low; - UINT32 high; - memcpy(&low, pI++, 4); - memcpy(&high, pI++, 4); - toBitInterleavingAndXOR(low, high, *(pS++), *(pS++), t, x0, x1); -#else - toBitInterleavingAndXOR(*(pI++), *(pI++), *(pS++), *(pS++), t, x0, x1) -#endif - } -#else - unsigned int lanePosition; - for(lanePosition=0; lanePosition 0) { \ - unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ - if (_bytesInLane > _sizeLeft) \ - _bytesInLane = _sizeLeft; \ - SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ - _sizeLeft -= _bytesInLane; \ - _lanePosition++; \ - _offsetInLane = 0; \ - _curData += _bytesInLane; \ - } \ - } \ - } - -void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) -{ - SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) -{ - UINT32 *stateAsHalfLanes = (UINT32*)state; - UINT32 low, high, temp, temp0, temp1; - UINT8 laneAsBytes[8]; - - fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1); -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - *((UINT32*)(laneAsBytes+0)) = low; - *((UINT32*)(laneAsBytes+4)) = high; -#else - laneAsBytes[0] = low & 0xFF; - laneAsBytes[1] = (low >> 8) & 0xFF; - laneAsBytes[2] = (low >> 16) & 0xFF; - laneAsBytes[3] = (low >> 24) & 0xFF; - laneAsBytes[4] = high & 0xFF; - laneAsBytes[5] = (high >> 8) & 0xFF; - laneAsBytes[6] = (high >> 16) & 0xFF; - laneAsBytes[7] = (high >> 24) & 0xFF; -#endif - memcpy(data, laneAsBytes+offset, length); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) -{ -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - UINT32 * pI = (UINT32 *)data; - const UINT32 * pS = ( const UINT32 *)state; - UINT32 t, x0, x1; - int i; - for (i = laneCount-1; i >= 0; --i) { -#ifdef NO_MISALIGNED_ACCESSES - UINT32 low; - UINT32 high; - fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1); - memcpy(pI++, &low, 4); - memcpy(pI++, &high, 4); -#else - fromBitInterleaving(*(pS++), *(pS++), *(pI++), *(pI++), t, x0, x1) -#endif - } -#else - unsigned int lanePosition; - for(lanePosition=0; lanePosition> 8) & 0xFF; - laneAsBytes[2] = (low >> 16) & 0xFF; - laneAsBytes[3] = (low >> 24) & 0xFF; - laneAsBytes[4] = high & 0xFF; - laneAsBytes[5] = (high >> 8) & 0xFF; - laneAsBytes[6] = (high >> 16) & 0xFF; - laneAsBytes[7] = (high >> 24) & 0xFF; - memcpy(data+lanePosition*8, laneAsBytes, 8); - } -#endif -} - -/* ---------------------------------------------------------------- */ - -#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ - { \ - if ((offset) == 0) { \ - SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ - SnP_ExtractBytesInLane(state, \ - (length)/SnP_laneLengthInBytes, \ - (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ - 0, \ - (length)%SnP_laneLengthInBytes); \ - } \ - else { \ - unsigned int _sizeLeft = (length); \ - unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ - unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ - unsigned char *_curData = (data); \ - while(_sizeLeft > 0) { \ - unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ - if (_bytesInLane > _sizeLeft) \ - _bytesInLane = _sizeLeft; \ - SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ - _sizeLeft -= _bytesInLane; \ - _lanePosition++; \ - _offsetInLane = 0; \ - _curData += _bytesInLane; \ - } \ - } \ - } - -void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) -{ - SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8); -} - -/* ---------------------------------------------------------------- */ - -static const UINT32 KeccakF1600RoundConstants_int2[2*24+1] = -{ - 0x00000001UL, 0x00000000UL, - 0x00000000UL, 0x00000089UL, - 0x00000000UL, 0x8000008bUL, - 0x00000000UL, 0x80008080UL, - 0x00000001UL, 0x0000008bUL, - 0x00000001UL, 0x00008000UL, - 0x00000001UL, 0x80008088UL, - 0x00000001UL, 0x80000082UL, - 0x00000000UL, 0x0000000bUL, - 0x00000000UL, 0x0000000aUL, - 0x00000001UL, 0x00008082UL, - 0x00000000UL, 0x00008003UL, - 0x00000001UL, 0x0000808bUL, - 0x00000001UL, 0x8000000bUL, - 0x00000001UL, 0x8000008aUL, - 0x00000001UL, 0x80000081UL, - 0x00000000UL, 0x80000081UL, - 0x00000000UL, 0x80000008UL, - 0x00000000UL, 0x00000083UL, - 0x00000000UL, 0x80008003UL, - 0x00000001UL, 0x80008088UL, - 0x00000000UL, 0x80000088UL, - 0x00000001UL, 0x00008000UL, - 0x00000000UL, 0x80008082UL, - 0x000000FFUL -}; - -#define KeccakRound0() \ - Cx = Abu0^Agu0^Aku0^Amu0^Asu0; \ - Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \ - Da0 = Cx^ROL32(Du1, 1); \ - Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \ - Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \ - Da1 = Cz^Du0; \ - Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \ - Do0 = Cw^ROL32(Cz, 1); \ - Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \ - Do1 = Cy^Cx; \ - Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \ - De0 = Cx^ROL32(Cy, 1); \ - Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \ - De1 = Cz^Cw; \ - Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \ - Di0 = Du0^ROL32(Cy, 1); \ - Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \ - Di1 = Du1^Cw; \ - Du0 = Cw^ROL32(Cz, 1); \ - Du1 = Cy^Cx; \ -\ - Ba = (Aba0^Da0); \ - Be = ROL32((Age0^De0), 22); \ - Bi = ROL32((Aki1^Di1), 22); \ - Bo = ROL32((Amo1^Do1), 11); \ - Bu = ROL32((Asu0^Du0), 7); \ - Aba0 = Ba ^((~Be)& Bi ); \ - Aba0 ^= *(pRoundConstants++); \ - Age0 = Be ^((~Bi)& Bo ); \ - Aki1 = Bi ^((~Bo)& Bu ); \ - Amo1 = Bo ^((~Bu)& Ba ); \ - Asu0 = Bu ^((~Ba)& Be ); \ - Ba = (Aba1^Da1); \ - Be = ROL32((Age1^De1), 22); \ - Bi = ROL32((Aki0^Di0), 21); \ - Bo = ROL32((Amo0^Do0), 10); \ - Bu = ROL32((Asu1^Du1), 7); \ - Aba1 = Ba ^((~Be)& Bi ); \ - Aba1 ^= *(pRoundConstants++); \ - Age1 = Be ^((~Bi)& Bo ); \ - Aki0 = Bi ^((~Bo)& Bu ); \ - Amo0 = Bo ^((~Bu)& Ba ); \ - Asu1 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Aka1^Da1), 2); \ - Bo = ROL32((Ame1^De1), 23); \ - Bu = ROL32((Asi1^Di1), 31); \ - Ba = ROL32((Abo0^Do0), 14); \ - Be = ROL32((Agu0^Du0), 10); \ - Aka1 = Ba ^((~Be)& Bi ); \ - Ame1 = Be ^((~Bi)& Bo ); \ - Asi1 = Bi ^((~Bo)& Bu ); \ - Abo0 = Bo ^((~Bu)& Ba ); \ - Agu0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Aka0^Da0), 1); \ - Bo = ROL32((Ame0^De0), 22); \ - Bu = ROL32((Asi0^Di0), 30); \ - Ba = ROL32((Abo1^Do1), 14); \ - Be = ROL32((Agu1^Du1), 10); \ - Aka0 = Ba ^((~Be)& Bi ); \ - Ame0 = Be ^((~Bi)& Bo ); \ - Asi0 = Bi ^((~Bo)& Bu ); \ - Abo1 = Bo ^((~Bu)& Ba ); \ - Agu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Asa0^Da0), 9); \ - Ba = ROL32((Abe1^De1), 1); \ - Be = ROL32((Agi0^Di0), 3); \ - Bi = ROL32((Ako1^Do1), 13); \ - Bo = ROL32((Amu0^Du0), 4); \ - Asa0 = Ba ^((~Be)& Bi ); \ - Abe1 = Be ^((~Bi)& Bo ); \ - Agi0 = Bi ^((~Bo)& Bu ); \ - Ako1 = Bo ^((~Bu)& Ba ); \ - Amu0 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Asa1^Da1), 9); \ - Ba = (Abe0^De0); \ - Be = ROL32((Agi1^Di1), 3); \ - Bi = ROL32((Ako0^Do0), 12); \ - Bo = ROL32((Amu1^Du1), 4); \ - Asa1 = Ba ^((~Be)& Bi ); \ - Abe0 = Be ^((~Bi)& Bo ); \ - Agi1 = Bi ^((~Bo)& Bu ); \ - Ako0 = Bo ^((~Bu)& Ba ); \ - Amu1 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Aga0^Da0), 18); \ - Bi = ROL32((Ake0^De0), 5); \ - Bo = ROL32((Ami1^Di1), 8); \ - Bu = ROL32((Aso0^Do0), 28); \ - Ba = ROL32((Abu1^Du1), 14); \ - Aga0 = Ba ^((~Be)& Bi ); \ - Ake0 = Be ^((~Bi)& Bo ); \ - Ami1 = Bi ^((~Bo)& Bu ); \ - Aso0 = Bo ^((~Bu)& Ba ); \ - Abu1 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Aga1^Da1), 18); \ - Bi = ROL32((Ake1^De1), 5); \ - Bo = ROL32((Ami0^Di0), 7); \ - Bu = ROL32((Aso1^Do1), 28); \ - Ba = ROL32((Abu0^Du0), 13); \ - Aga1 = Ba ^((~Be)& Bi ); \ - Ake1 = Be ^((~Bi)& Bo ); \ - Ami0 = Bi ^((~Bo)& Bu ); \ - Aso1 = Bo ^((~Bu)& Ba ); \ - Abu0 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Ama1^Da1), 21); \ - Bu = ROL32((Ase0^De0), 1); \ - Ba = ROL32((Abi0^Di0), 31); \ - Be = ROL32((Ago1^Do1), 28); \ - Bi = ROL32((Aku1^Du1), 20); \ - Ama1 = Ba ^((~Be)& Bi ); \ - Ase0 = Be ^((~Bi)& Bo ); \ - Abi0 = Bi ^((~Bo)& Bu ); \ - Ago1 = Bo ^((~Bu)& Ba ); \ - Aku1 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Ama0^Da0), 20); \ - Bu = ROL32((Ase1^De1), 1); \ - Ba = ROL32((Abi1^Di1), 31); \ - Be = ROL32((Ago0^Do0), 27); \ - Bi = ROL32((Aku0^Du0), 19); \ - Ama0 = Ba ^((~Be)& Bi ); \ - Ase1 = Be ^((~Bi)& Bo ); \ - Abi1 = Bi ^((~Bo)& Bu ); \ - Ago0 = Bo ^((~Bu)& Ba ); \ - Aku0 = Bu ^((~Ba)& Be ) - -#define KeccakRound1() \ - Cx = Asu0^Agu0^Amu0^Abu1^Aku1; \ - Du1 = Age1^Ame0^Abe0^Ake1^Ase1; \ - Da0 = Cx^ROL32(Du1, 1); \ - Cz = Asu1^Agu1^Amu1^Abu0^Aku0; \ - Du0 = Age0^Ame1^Abe1^Ake0^Ase0; \ - Da1 = Cz^Du0; \ - Cw = Aki1^Asi1^Agi0^Ami1^Abi0; \ - Do0 = Cw^ROL32(Cz, 1); \ - Cy = Aki0^Asi0^Agi1^Ami0^Abi1; \ - Do1 = Cy^Cx; \ - Cx = Aba0^Aka1^Asa0^Aga0^Ama1; \ - De0 = Cx^ROL32(Cy, 1); \ - Cz = Aba1^Aka0^Asa1^Aga1^Ama0; \ - De1 = Cz^Cw; \ - Cy = Amo0^Abo1^Ako0^Aso1^Ago0; \ - Di0 = Du0^ROL32(Cy, 1); \ - Cw = Amo1^Abo0^Ako1^Aso0^Ago1; \ - Di1 = Du1^Cw; \ - Du0 = Cw^ROL32(Cz, 1); \ - Du1 = Cy^Cx; \ -\ - Ba = (Aba0^Da0); \ - Be = ROL32((Ame1^De0), 22); \ - Bi = ROL32((Agi1^Di1), 22); \ - Bo = ROL32((Aso1^Do1), 11); \ - Bu = ROL32((Aku1^Du0), 7); \ - Aba0 = Ba ^((~Be)& Bi ); \ - Aba0 ^= *(pRoundConstants++); \ - Ame1 = Be ^((~Bi)& Bo ); \ - Agi1 = Bi ^((~Bo)& Bu ); \ - Aso1 = Bo ^((~Bu)& Ba ); \ - Aku1 = Bu ^((~Ba)& Be ); \ - Ba = (Aba1^Da1); \ - Be = ROL32((Ame0^De1), 22); \ - Bi = ROL32((Agi0^Di0), 21); \ - Bo = ROL32((Aso0^Do0), 10); \ - Bu = ROL32((Aku0^Du1), 7); \ - Aba1 = Ba ^((~Be)& Bi ); \ - Aba1 ^= *(pRoundConstants++); \ - Ame0 = Be ^((~Bi)& Bo ); \ - Agi0 = Bi ^((~Bo)& Bu ); \ - Aso0 = Bo ^((~Bu)& Ba ); \ - Aku0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Asa1^Da1), 2); \ - Bo = ROL32((Ake1^De1), 23); \ - Bu = ROL32((Abi1^Di1), 31); \ - Ba = ROL32((Amo1^Do0), 14); \ - Be = ROL32((Agu0^Du0), 10); \ - Asa1 = Ba ^((~Be)& Bi ); \ - Ake1 = Be ^((~Bi)& Bo ); \ - Abi1 = Bi ^((~Bo)& Bu ); \ - Amo1 = Bo ^((~Bu)& Ba ); \ - Agu0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Asa0^Da0), 1); \ - Bo = ROL32((Ake0^De0), 22); \ - Bu = ROL32((Abi0^Di0), 30); \ - Ba = ROL32((Amo0^Do1), 14); \ - Be = ROL32((Agu1^Du1), 10); \ - Asa0 = Ba ^((~Be)& Bi ); \ - Ake0 = Be ^((~Bi)& Bo ); \ - Abi0 = Bi ^((~Bo)& Bu ); \ - Amo0 = Bo ^((~Bu)& Ba ); \ - Agu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Ama1^Da0), 9); \ - Ba = ROL32((Age1^De1), 1); \ - Be = ROL32((Asi1^Di0), 3); \ - Bi = ROL32((Ako0^Do1), 13); \ - Bo = ROL32((Abu1^Du0), 4); \ - Ama1 = Ba ^((~Be)& Bi ); \ - Age1 = Be ^((~Bi)& Bo ); \ - Asi1 = Bi ^((~Bo)& Bu ); \ - Ako0 = Bo ^((~Bu)& Ba ); \ - Abu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Ama0^Da1), 9); \ - Ba = (Age0^De0); \ - Be = ROL32((Asi0^Di1), 3); \ - Bi = ROL32((Ako1^Do0), 12); \ - Bo = ROL32((Abu0^Du1), 4); \ - Ama0 = Ba ^((~Be)& Bi ); \ - Age0 = Be ^((~Bi)& Bo ); \ - Asi0 = Bi ^((~Bo)& Bu ); \ - Ako1 = Bo ^((~Bu)& Ba ); \ - Abu0 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Aka1^Da0), 18); \ - Bi = ROL32((Abe1^De0), 5); \ - Bo = ROL32((Ami0^Di1), 8); \ - Bu = ROL32((Ago1^Do0), 28); \ - Ba = ROL32((Asu1^Du1), 14); \ - Aka1 = Ba ^((~Be)& Bi ); \ - Abe1 = Be ^((~Bi)& Bo ); \ - Ami0 = Bi ^((~Bo)& Bu ); \ - Ago1 = Bo ^((~Bu)& Ba ); \ - Asu1 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Aka0^Da1), 18); \ - Bi = ROL32((Abe0^De1), 5); \ - Bo = ROL32((Ami1^Di0), 7); \ - Bu = ROL32((Ago0^Do1), 28); \ - Ba = ROL32((Asu0^Du0), 13); \ - Aka0 = Ba ^((~Be)& Bi ); \ - Abe0 = Be ^((~Bi)& Bo ); \ - Ami1 = Bi ^((~Bo)& Bu ); \ - Ago0 = Bo ^((~Bu)& Ba ); \ - Asu0 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Aga1^Da1), 21); \ - Bu = ROL32((Ase0^De0), 1); \ - Ba = ROL32((Aki1^Di0), 31); \ - Be = ROL32((Abo1^Do1), 28); \ - Bi = ROL32((Amu1^Du1), 20); \ - Aga1 = Ba ^((~Be)& Bi ); \ - Ase0 = Be ^((~Bi)& Bo ); \ - Aki1 = Bi ^((~Bo)& Bu ); \ - Abo1 = Bo ^((~Bu)& Ba ); \ - Amu1 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Aga0^Da0), 20); \ - Bu = ROL32((Ase1^De1), 1); \ - Ba = ROL32((Aki0^Di1), 31); \ - Be = ROL32((Abo0^Do0), 27); \ - Bi = ROL32((Amu0^Du0), 19); \ - Aga0 = Ba ^((~Be)& Bi ); \ - Ase1 = Be ^((~Bi)& Bo ); \ - Aki0 = Bi ^((~Bo)& Bu ); \ - Abo0 = Bo ^((~Bu)& Ba ); \ - Amu0 = Bu ^((~Ba)& Be ); - -#define KeccakRound2() \ - Cx = Aku1^Agu0^Abu1^Asu1^Amu1; \ - Du1 = Ame0^Ake0^Age0^Abe0^Ase1; \ - Da0 = Cx^ROL32(Du1, 1); \ - Cz = Aku0^Agu1^Abu0^Asu0^Amu0; \ - Du0 = Ame1^Ake1^Age1^Abe1^Ase0; \ - Da1 = Cz^Du0; \ - Cw = Agi1^Abi1^Asi1^Ami0^Aki1; \ - Do0 = Cw^ROL32(Cz, 1); \ - Cy = Agi0^Abi0^Asi0^Ami1^Aki0; \ - Do1 = Cy^Cx; \ - Cx = Aba0^Asa1^Ama1^Aka1^Aga1; \ - De0 = Cx^ROL32(Cy, 1); \ - Cz = Aba1^Asa0^Ama0^Aka0^Aga0; \ - De1 = Cz^Cw; \ - Cy = Aso0^Amo0^Ako1^Ago0^Abo0; \ - Di0 = Du0^ROL32(Cy, 1); \ - Cw = Aso1^Amo1^Ako0^Ago1^Abo1; \ - Di1 = Du1^Cw; \ - Du0 = Cw^ROL32(Cz, 1); \ - Du1 = Cy^Cx; \ -\ - Ba = (Aba0^Da0); \ - Be = ROL32((Ake1^De0), 22); \ - Bi = ROL32((Asi0^Di1), 22); \ - Bo = ROL32((Ago0^Do1), 11); \ - Bu = ROL32((Amu1^Du0), 7); \ - Aba0 = Ba ^((~Be)& Bi ); \ - Aba0 ^= *(pRoundConstants++); \ - Ake1 = Be ^((~Bi)& Bo ); \ - Asi0 = Bi ^((~Bo)& Bu ); \ - Ago0 = Bo ^((~Bu)& Ba ); \ - Amu1 = Bu ^((~Ba)& Be ); \ - Ba = (Aba1^Da1); \ - Be = ROL32((Ake0^De1), 22); \ - Bi = ROL32((Asi1^Di0), 21); \ - Bo = ROL32((Ago1^Do0), 10); \ - Bu = ROL32((Amu0^Du1), 7); \ - Aba1 = Ba ^((~Be)& Bi ); \ - Aba1 ^= *(pRoundConstants++); \ - Ake0 = Be ^((~Bi)& Bo ); \ - Asi1 = Bi ^((~Bo)& Bu ); \ - Ago1 = Bo ^((~Bu)& Ba ); \ - Amu0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Ama0^Da1), 2); \ - Bo = ROL32((Abe0^De1), 23); \ - Bu = ROL32((Aki0^Di1), 31); \ - Ba = ROL32((Aso1^Do0), 14); \ - Be = ROL32((Agu0^Du0), 10); \ - Ama0 = Ba ^((~Be)& Bi ); \ - Abe0 = Be ^((~Bi)& Bo ); \ - Aki0 = Bi ^((~Bo)& Bu ); \ - Aso1 = Bo ^((~Bu)& Ba ); \ - Agu0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Ama1^Da0), 1); \ - Bo = ROL32((Abe1^De0), 22); \ - Bu = ROL32((Aki1^Di0), 30); \ - Ba = ROL32((Aso0^Do1), 14); \ - Be = ROL32((Agu1^Du1), 10); \ - Ama1 = Ba ^((~Be)& Bi ); \ - Abe1 = Be ^((~Bi)& Bo ); \ - Aki1 = Bi ^((~Bo)& Bu ); \ - Aso0 = Bo ^((~Bu)& Ba ); \ - Agu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Aga1^Da0), 9); \ - Ba = ROL32((Ame0^De1), 1); \ - Be = ROL32((Abi1^Di0), 3); \ - Bi = ROL32((Ako1^Do1), 13); \ - Bo = ROL32((Asu1^Du0), 4); \ - Aga1 = Ba ^((~Be)& Bi ); \ - Ame0 = Be ^((~Bi)& Bo ); \ - Abi1 = Bi ^((~Bo)& Bu ); \ - Ako1 = Bo ^((~Bu)& Ba ); \ - Asu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Aga0^Da1), 9); \ - Ba = (Ame1^De0); \ - Be = ROL32((Abi0^Di1), 3); \ - Bi = ROL32((Ako0^Do0), 12); \ - Bo = ROL32((Asu0^Du1), 4); \ - Aga0 = Ba ^((~Be)& Bi ); \ - Ame1 = Be ^((~Bi)& Bo ); \ - Abi0 = Bi ^((~Bo)& Bu ); \ - Ako0 = Bo ^((~Bu)& Ba ); \ - Asu0 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Asa1^Da0), 18); \ - Bi = ROL32((Age1^De0), 5); \ - Bo = ROL32((Ami1^Di1), 8); \ - Bu = ROL32((Abo1^Do0), 28); \ - Ba = ROL32((Aku0^Du1), 14); \ - Asa1 = Ba ^((~Be)& Bi ); \ - Age1 = Be ^((~Bi)& Bo ); \ - Ami1 = Bi ^((~Bo)& Bu ); \ - Abo1 = Bo ^((~Bu)& Ba ); \ - Aku0 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Asa0^Da1), 18); \ - Bi = ROL32((Age0^De1), 5); \ - Bo = ROL32((Ami0^Di0), 7); \ - Bu = ROL32((Abo0^Do1), 28); \ - Ba = ROL32((Aku1^Du0), 13); \ - Asa0 = Ba ^((~Be)& Bi ); \ - Age0 = Be ^((~Bi)& Bo ); \ - Ami0 = Bi ^((~Bo)& Bu ); \ - Abo0 = Bo ^((~Bu)& Ba ); \ - Aku1 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Aka0^Da1), 21); \ - Bu = ROL32((Ase0^De0), 1); \ - Ba = ROL32((Agi1^Di0), 31); \ - Be = ROL32((Amo0^Do1), 28); \ - Bi = ROL32((Abu0^Du1), 20); \ - Aka0 = Ba ^((~Be)& Bi ); \ - Ase0 = Be ^((~Bi)& Bo ); \ - Agi1 = Bi ^((~Bo)& Bu ); \ - Amo0 = Bo ^((~Bu)& Ba ); \ - Abu0 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Aka1^Da0), 20); \ - Bu = ROL32((Ase1^De1), 1); \ - Ba = ROL32((Agi0^Di1), 31); \ - Be = ROL32((Amo1^Do0), 27); \ - Bi = ROL32((Abu1^Du0), 19); \ - Aka1 = Ba ^((~Be)& Bi ); \ - Ase1 = Be ^((~Bi)& Bo ); \ - Agi0 = Bi ^((~Bo)& Bu ); \ - Amo1 = Bo ^((~Bu)& Ba ); \ - Abu1 = Bu ^((~Ba)& Be ); - -#define KeccakRound3() \ - Cx = Amu1^Agu0^Asu1^Aku0^Abu0; \ - Du1 = Ake0^Abe1^Ame1^Age0^Ase1; \ - Da0 = Cx^ROL32(Du1, 1); \ - Cz = Amu0^Agu1^Asu0^Aku1^Abu1; \ - Du0 = Ake1^Abe0^Ame0^Age1^Ase0; \ - Da1 = Cz^Du0; \ - Cw = Asi0^Aki0^Abi1^Ami1^Agi1; \ - Do0 = Cw^ROL32(Cz, 1); \ - Cy = Asi1^Aki1^Abi0^Ami0^Agi0; \ - Do1 = Cy^Cx; \ - Cx = Aba0^Ama0^Aga1^Asa1^Aka0; \ - De0 = Cx^ROL32(Cy, 1); \ - Cz = Aba1^Ama1^Aga0^Asa0^Aka1; \ - De1 = Cz^Cw; \ - Cy = Ago1^Aso0^Ako0^Abo0^Amo1; \ - Di0 = Du0^ROL32(Cy, 1); \ - Cw = Ago0^Aso1^Ako1^Abo1^Amo0; \ - Di1 = Du1^Cw; \ - Du0 = Cw^ROL32(Cz, 1); \ - Du1 = Cy^Cx; \ -\ - Ba = (Aba0^Da0); \ - Be = ROL32((Abe0^De0), 22); \ - Bi = ROL32((Abi0^Di1), 22); \ - Bo = ROL32((Abo0^Do1), 11); \ - Bu = ROL32((Abu0^Du0), 7); \ - Aba0 = Ba ^((~Be)& Bi ); \ - Aba0 ^= *(pRoundConstants++); \ - Abe0 = Be ^((~Bi)& Bo ); \ - Abi0 = Bi ^((~Bo)& Bu ); \ - Abo0 = Bo ^((~Bu)& Ba ); \ - Abu0 = Bu ^((~Ba)& Be ); \ - Ba = (Aba1^Da1); \ - Be = ROL32((Abe1^De1), 22); \ - Bi = ROL32((Abi1^Di0), 21); \ - Bo = ROL32((Abo1^Do0), 10); \ - Bu = ROL32((Abu1^Du1), 7); \ - Aba1 = Ba ^((~Be)& Bi ); \ - Aba1 ^= *(pRoundConstants++); \ - Abe1 = Be ^((~Bi)& Bo ); \ - Abi1 = Bi ^((~Bo)& Bu ); \ - Abo1 = Bo ^((~Bu)& Ba ); \ - Abu1 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Aga0^Da1), 2); \ - Bo = ROL32((Age0^De1), 23); \ - Bu = ROL32((Agi0^Di1), 31); \ - Ba = ROL32((Ago0^Do0), 14); \ - Be = ROL32((Agu0^Du0), 10); \ - Aga0 = Ba ^((~Be)& Bi ); \ - Age0 = Be ^((~Bi)& Bo ); \ - Agi0 = Bi ^((~Bo)& Bu ); \ - Ago0 = Bo ^((~Bu)& Ba ); \ - Agu0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Aga1^Da0), 1); \ - Bo = ROL32((Age1^De0), 22); \ - Bu = ROL32((Agi1^Di0), 30); \ - Ba = ROL32((Ago1^Do1), 14); \ - Be = ROL32((Agu1^Du1), 10); \ - Aga1 = Ba ^((~Be)& Bi ); \ - Age1 = Be ^((~Bi)& Bo ); \ - Agi1 = Bi ^((~Bo)& Bu ); \ - Ago1 = Bo ^((~Bu)& Ba ); \ - Agu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Aka0^Da0), 9); \ - Ba = ROL32((Ake0^De1), 1); \ - Be = ROL32((Aki0^Di0), 3); \ - Bi = ROL32((Ako0^Do1), 13); \ - Bo = ROL32((Aku0^Du0), 4); \ - Aka0 = Ba ^((~Be)& Bi ); \ - Ake0 = Be ^((~Bi)& Bo ); \ - Aki0 = Bi ^((~Bo)& Bu ); \ - Ako0 = Bo ^((~Bu)& Ba ); \ - Aku0 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Aka1^Da1), 9); \ - Ba = (Ake1^De0); \ - Be = ROL32((Aki1^Di1), 3); \ - Bi = ROL32((Ako1^Do0), 12); \ - Bo = ROL32((Aku1^Du1), 4); \ - Aka1 = Ba ^((~Be)& Bi ); \ - Ake1 = Be ^((~Bi)& Bo ); \ - Aki1 = Bi ^((~Bo)& Bu ); \ - Ako1 = Bo ^((~Bu)& Ba ); \ - Aku1 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Ama0^Da0), 18); \ - Bi = ROL32((Ame0^De0), 5); \ - Bo = ROL32((Ami0^Di1), 8); \ - Bu = ROL32((Amo0^Do0), 28); \ - Ba = ROL32((Amu0^Du1), 14); \ - Ama0 = Ba ^((~Be)& Bi ); \ - Ame0 = Be ^((~Bi)& Bo ); \ - Ami0 = Bi ^((~Bo)& Bu ); \ - Amo0 = Bo ^((~Bu)& Ba ); \ - Amu0 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Ama1^Da1), 18); \ - Bi = ROL32((Ame1^De1), 5); \ - Bo = ROL32((Ami1^Di0), 7); \ - Bu = ROL32((Amo1^Do1), 28); \ - Ba = ROL32((Amu1^Du0), 13); \ - Ama1 = Ba ^((~Be)& Bi ); \ - Ame1 = Be ^((~Bi)& Bo ); \ - Ami1 = Bi ^((~Bo)& Bu ); \ - Amo1 = Bo ^((~Bu)& Ba ); \ - Amu1 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Asa0^Da1), 21); \ - Bu = ROL32((Ase0^De0), 1); \ - Ba = ROL32((Asi0^Di0), 31); \ - Be = ROL32((Aso0^Do1), 28); \ - Bi = ROL32((Asu0^Du1), 20); \ - Asa0 = Ba ^((~Be)& Bi ); \ - Ase0 = Be ^((~Bi)& Bo ); \ - Asi0 = Bi ^((~Bo)& Bu ); \ - Aso0 = Bo ^((~Bu)& Ba ); \ - Asu0 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Asa1^Da0), 20); \ - Bu = ROL32((Ase1^De1), 1); \ - Ba = ROL32((Asi1^Di1), 31); \ - Be = ROL32((Aso1^Do0), 27); \ - Bi = ROL32((Asu1^Du0), 19); \ - Asa1 = Ba ^((~Be)& Bi ); \ - Ase1 = Be ^((~Bi)& Bo ); \ - Asi1 = Bi ^((~Bo)& Bu ); \ - Aso1 = Bo ^((~Bu)& Ba ); \ - Asu1 = Bu ^((~Ba)& Be ); - -void KeccakP1600_Permute_Nrounds(void *state, unsigned int nRounds) -{ - UINT32 Da0, De0, Di0, Do0, Du0; - UINT32 Da1, De1, Di1, Do1, Du1; - UINT32 Ba, Be, Bi, Bo, Bu; - UINT32 Cx, Cy, Cz, Cw; - const UINT32 *pRoundConstants = KeccakF1600RoundConstants_int2+(24-nRounds)*2; - UINT32 *stateAsHalfLanes = (UINT32*)state; - #define Aba0 stateAsHalfLanes[ 0] - #define Aba1 stateAsHalfLanes[ 1] - #define Abe0 stateAsHalfLanes[ 2] - #define Abe1 stateAsHalfLanes[ 3] - #define Abi0 stateAsHalfLanes[ 4] - #define Abi1 stateAsHalfLanes[ 5] - #define Abo0 stateAsHalfLanes[ 6] - #define Abo1 stateAsHalfLanes[ 7] - #define Abu0 stateAsHalfLanes[ 8] - #define Abu1 stateAsHalfLanes[ 9] - #define Aga0 stateAsHalfLanes[10] - #define Aga1 stateAsHalfLanes[11] - #define Age0 stateAsHalfLanes[12] - #define Age1 stateAsHalfLanes[13] - #define Agi0 stateAsHalfLanes[14] - #define Agi1 stateAsHalfLanes[15] - #define Ago0 stateAsHalfLanes[16] - #define Ago1 stateAsHalfLanes[17] - #define Agu0 stateAsHalfLanes[18] - #define Agu1 stateAsHalfLanes[19] - #define Aka0 stateAsHalfLanes[20] - #define Aka1 stateAsHalfLanes[21] - #define Ake0 stateAsHalfLanes[22] - #define Ake1 stateAsHalfLanes[23] - #define Aki0 stateAsHalfLanes[24] - #define Aki1 stateAsHalfLanes[25] - #define Ako0 stateAsHalfLanes[26] - #define Ako1 stateAsHalfLanes[27] - #define Aku0 stateAsHalfLanes[28] - #define Aku1 stateAsHalfLanes[29] - #define Ama0 stateAsHalfLanes[30] - #define Ama1 stateAsHalfLanes[31] - #define Ame0 stateAsHalfLanes[32] - #define Ame1 stateAsHalfLanes[33] - #define Ami0 stateAsHalfLanes[34] - #define Ami1 stateAsHalfLanes[35] - #define Amo0 stateAsHalfLanes[36] - #define Amo1 stateAsHalfLanes[37] - #define Amu0 stateAsHalfLanes[38] - #define Amu1 stateAsHalfLanes[39] - #define Asa0 stateAsHalfLanes[40] - #define Asa1 stateAsHalfLanes[41] - #define Ase0 stateAsHalfLanes[42] - #define Ase1 stateAsHalfLanes[43] - #define Asi0 stateAsHalfLanes[44] - #define Asi1 stateAsHalfLanes[45] - #define Aso0 stateAsHalfLanes[46] - #define Aso1 stateAsHalfLanes[47] - #define Asu0 stateAsHalfLanes[48] - #define Asu1 stateAsHalfLanes[49] - - nRounds &= 3; - switch ( nRounds ) - { - #define I0 Ba - #define I1 Be - #define T0 Bi - #define T1 Bo - #define SwapPI13( in0,in1,in2,in3,eo0,eo1,eo2,eo3 ) \ - I0 = (in0)[0]; I1 = (in0)[1]; \ - T0 = (in1)[0]; T1 = (in1)[1]; \ - (in0)[eo0] = T0; (in0)[eo0^1] = T1; \ - T0 = (in2)[0]; T1 = (in2)[1]; \ - (in1)[eo1] = T0; (in1)[eo1^1] = T1; \ - T0 = (in3)[0]; T1 = (in3)[1]; \ - (in2)[eo2] = T0; (in2)[eo2^1] = T1; \ - (in3)[eo3] = I0; (in3)[eo3^1] = I1 - #define SwapPI2( in0,in1,in2,in3 ) \ - I0 = (in0)[0]; I1 = (in0)[1]; \ - T0 = (in1)[0]; T1 = (in1)[1]; \ - (in0)[1] = T0; (in0)[0] = T1; \ - (in1)[1] = I0; (in1)[0] = I1; \ - I0 = (in2)[0]; I1 = (in2)[1]; \ - T0 = (in3)[0]; T1 = (in3)[1]; \ - (in2)[1] = T0; (in2)[0] = T1; \ - (in3)[1] = I0; (in3)[0] = I1 - #define SwapEO( even,odd ) T0 = even; even = odd; odd = T0 - - case 1: - SwapPI13( &Aga0, &Aka0, &Asa0, &Ama0, 1, 0, 1, 0 ); - SwapPI13( &Abe0, &Age0, &Ame0, &Ake0, 0, 1, 0, 1 ); - SwapPI13( &Abi0, &Aki0, &Agi0, &Asi0, 1, 0, 1, 0 ); - SwapEO( Ami0, Ami1 ); - SwapPI13( &Abo0, &Amo0, &Aso0, &Ago0, 1, 0, 1, 0 ); - SwapEO( Ako0, Ako1 ); - SwapPI13( &Abu0, &Asu0, &Aku0, &Amu0, 0, 1, 0, 1 ); - break; - - case 2: - SwapPI2( &Aga0, &Asa0, &Aka0, &Ama0 ); - SwapPI2( &Abe0, &Ame0, &Age0, &Ake0 ); - SwapPI2( &Abi0, &Agi0, &Aki0, &Asi0 ); - SwapPI2( &Abo0, &Aso0, &Ago0, &Amo0 ); - SwapPI2( &Abu0, &Aku0, &Amu0, &Asu0 ); - break; - - case 3: - SwapPI13( &Aga0, &Ama0, &Asa0, &Aka0, 0, 1, 0, 1 ); - SwapPI13( &Abe0, &Ake0, &Ame0, &Age0, 1, 0, 1, 0 ); - SwapPI13( &Abi0, &Asi0, &Agi0, &Aki0, 0, 1, 0, 1 ); - SwapEO( Ami0, Ami1 ); - SwapPI13( &Abo0, &Ago0, &Aso0, &Amo0, 0, 1, 0, 1 ); - SwapEO( Ako0, Ako1 ); - SwapPI13( &Abu0, &Amu0, &Aku0, &Asu0, 1, 0, 1, 0 ); - break; - #undef I0 - #undef I1 - #undef T0 - #undef T1 - #undef SwapPI13 - #undef SwapPI2 - #undef SwapEO - } - - do - { - /* Code for 4 rounds, using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */ - switch ( nRounds ) - { - case 0: KeccakRound0(); /* fall through */ - case 3: KeccakRound1(); - case 2: KeccakRound2(); - case 1: KeccakRound3(); - } - nRounds = 0; - } - while ( *pRoundConstants != 0xFF ); - - #undef Aba0 - #undef Aba1 - #undef Abe0 - #undef Abe1 - #undef Abi0 - #undef Abi1 - #undef Abo0 - #undef Abo1 - #undef Abu0 - #undef Abu1 - #undef Aga0 - #undef Aga1 - #undef Age0 - #undef Age1 - #undef Agi0 - #undef Agi1 - #undef Ago0 - #undef Ago1 - #undef Agu0 - #undef Agu1 - #undef Aka0 - #undef Aka1 - #undef Ake0 - #undef Ake1 - #undef Aki0 - #undef Aki1 - #undef Ako0 - #undef Ako1 - #undef Aku0 - #undef Aku1 - #undef Ama0 - #undef Ama1 - #undef Ame0 - #undef Ame1 - #undef Ami0 - #undef Ami1 - #undef Amo0 - #undef Amo1 - #undef Amu0 - #undef Amu1 - #undef Asa0 - #undef Asa1 - #undef Ase0 - #undef Ase1 - #undef Asi0 - #undef Asi1 - #undef Aso0 - #undef Aso1 - #undef Asu0 - #undef Asu1 -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_Permute_12rounds(void *state) -{ - KeccakP1600_Permute_Nrounds(state, 12); -} diff --git a/benches/kangarootwelve/K12/lib/KangarooTwelve.c b/benches/kangarootwelve/K12/lib/KangarooTwelve.c deleted file mode 100644 index c097a27..0000000 --- a/benches/kangarootwelve/K12/lib/KangarooTwelve.c +++ /dev/null @@ -1,572 +0,0 @@ -/* -Implementation by Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to our website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#include -#include -#include "KangarooTwelve.h" - -void KangarooTwelve_SetProcessorCapabilities(); -int K12_enableSSSE3 = 0; -int K12_enableAVX2 = 0; -int K12_enableAVX512 = 0; - -int KeccakWidth1600_12rounds_SpongeInitialize(KeccakWidth1600_12rounds_SpongeInstance *instance, unsigned int rate, unsigned int capacity) -{ - if (rate+capacity != 1600) - return 1; - if ((rate <= 0) || (rate > 1600) || ((rate % 8) != 0)) - return 1; - KeccakP1600_Initialize(instance->state); - instance->rate = rate; - instance->byteIOIndex = 0; - instance->squeezing = 0; - - return 0; -} - -/* ---------------------------------------------------------------- */ - -int KeccakWidth1600_12rounds_SpongeAbsorb(KeccakWidth1600_12rounds_SpongeInstance *instance, const unsigned char *data, size_t dataByteLen) -{ - size_t i, j; - unsigned int partialBlock; - const unsigned char *curData; - unsigned int rateInBytes = instance->rate/8; - - if (instance->squeezing) - return 1; /* Too late for additional input */ - - i = 0; - curData = data; - while(i < dataByteLen) { - if ((instance->byteIOIndex == 0) && (dataByteLen >= (i + rateInBytes))) { -#ifdef KeccakP1600_12rounds_FastLoop_supported - /* processing full blocks first */ - if ((rateInBytes % (1600/200)) == 0) { - /* fast lane: whole lane rate */ - j = KeccakP1600_12rounds_FastLoop_Absorb(instance->state, rateInBytes/(1600/200), curData, dataByteLen - i); - i += j; - curData += j; - } - else { -#endif - for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { - KeccakP1600_AddBytes(instance->state, curData, 0, rateInBytes); - KeccakP1600_Permute_12rounds(instance->state); - curData+=rateInBytes; - } - i = dataByteLen - j; -#ifdef KeccakP1600_12rounds_FastLoop_supported - } -#endif - } - else { - /* normal lane: using the message queue */ - partialBlock = (unsigned int)(dataByteLen - i); - if (partialBlock+instance->byteIOIndex > rateInBytes) - partialBlock = rateInBytes-instance->byteIOIndex; - i += partialBlock; - - KeccakP1600_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock); - curData += partialBlock; - instance->byteIOIndex += partialBlock; - if (instance->byteIOIndex == rateInBytes) { - KeccakP1600_Permute_12rounds(instance->state); - instance->byteIOIndex = 0; - } - } - } - return 0; -} - -/* ---------------------------------------------------------------- */ - -int KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(KeccakWidth1600_12rounds_SpongeInstance *instance, unsigned char delimitedData) -{ - unsigned int rateInBytes = instance->rate/8; - - if (delimitedData == 0) - return 1; - if (instance->squeezing) - return 1; /* Too late for additional input */ - - /* Last few bits, whose delimiter coincides with first bit of padding */ - KeccakP1600_AddByte(instance->state, delimitedData, instance->byteIOIndex); - /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */ - if ((delimitedData >= 0x80) && (instance->byteIOIndex == (rateInBytes-1))) - KeccakP1600_Permute_12rounds(instance->state); - /* Second bit of padding */ - KeccakP1600_AddByte(instance->state, 0x80, rateInBytes-1); - KeccakP1600_Permute_12rounds(instance->state); - instance->byteIOIndex = 0; - instance->squeezing = 1; - return 0; -} - -/* ---------------------------------------------------------------- */ - -int KeccakWidth1600_12rounds_SpongeSqueeze(KeccakWidth1600_12rounds_SpongeInstance *instance, unsigned char *data, size_t dataByteLen) -{ - size_t i, j; - unsigned int partialBlock; - unsigned int rateInBytes = instance->rate/8; - unsigned char *curData; - - if (!instance->squeezing) - KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(instance, 0x01); - - i = 0; - curData = data; - while(i < dataByteLen) { - if ((instance->byteIOIndex == rateInBytes) && (dataByteLen >= (i + rateInBytes))) { - for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { - KeccakP1600_Permute_12rounds(instance->state); - KeccakP1600_ExtractBytes(instance->state, curData, 0, rateInBytes); - curData+=rateInBytes; - } - i = dataByteLen - j; - } - else { - /* normal lane: using the message queue */ - if (instance->byteIOIndex == rateInBytes) { - KeccakP1600_Permute_12rounds(instance->state); - instance->byteIOIndex = 0; - } - partialBlock = (unsigned int)(dataByteLen - i); - if (partialBlock+instance->byteIOIndex > rateInBytes) - partialBlock = rateInBytes-instance->byteIOIndex; - i += partialBlock; - - KeccakP1600_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock); - curData += partialBlock; - instance->byteIOIndex += partialBlock; - } - } - return 0; -} - -/* ---------------------------------------------------------------- */ - -#define chunkSize 8192 -#define laneSize 8 -#define suffixLeaf 0x0B /* '110': message hop, simple padding, inner node */ - -#define security 128 -#define capacity (2*security) -#define capacityInBytes (capacity/8) -#define rate (1600-capacity) - -#ifndef KeccakP1600_disableParallelism - -int KeccakP1600times2_IsAvailable() -{ - int result = 0; - result |= K12_enableAVX512; - result |= K12_enableSSSE3; - return result; -} - -const char * KeccakP1600times2_GetImplementation() -{ - if (K12_enableAVX512) - return "AVX-512 implementation"; - else - if (K12_enableSSSE3) - return "SSSE3 implementation"; - else - return ""; -} - -void KangarooTwelve_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output); -void KangarooTwelve_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output); - -void KangarooTwelve_Process2Leaves(const unsigned char *input, unsigned char *output) -{ - if (K12_enableAVX512) - KangarooTwelve_AVX512_Process2Leaves(input, output); - else - if (K12_enableSSSE3) - KangarooTwelve_SSSE3_Process2Leaves(input, output); -} - -int KeccakP1600times4_IsAvailable() -{ - int result = 0; - result |= K12_enableAVX512; - result |= K12_enableAVX2; - return result; -} - -const char * KeccakP1600times4_GetImplementation() -{ - if (K12_enableAVX512) - return "AVX-512 implementation"; - else - if (K12_enableAVX2) - return "AVX2 implementation"; - else - return ""; -} - -void KangarooTwelve_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output); -void KangarooTwelve_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output); - -void KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output) -{ - if (K12_enableAVX512) - KangarooTwelve_AVX512_Process4Leaves(input, output); - else - if (K12_enableAVX2) - KangarooTwelve_AVX2_Process4Leaves(input, output); -} - -int KeccakP1600times8_IsAvailable() -{ - int result = 0; - result |= K12_enableAVX512; - return result; -} - -const char * KeccakP1600times8_GetImplementation() -{ - if (K12_enableAVX512) - return "AVX-512 implementation"; - else - return ""; -} - -void KangarooTwelve_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output); - -void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output) -{ - if (K12_enableAVX512) - KangarooTwelve_AVX512_Process8Leaves(input, output); -} - -#define ProcessLeaves( Parallellism ) \ - while ( inLen >= Parallellism * chunkSize ) { \ - unsigned char intermediate[Parallellism*capacityInBytes]; \ - \ - KangarooTwelve_Process##Parallellism##Leaves(input, intermediate); \ - input += Parallellism * chunkSize; \ - inLen -= Parallellism * chunkSize; \ - ktInstance->blockNumber += Parallellism; \ - if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, Parallellism * capacityInBytes) != 0) return 1; \ - } - -#endif - -static unsigned int right_encode( unsigned char * encbuf, size_t value ) -{ - unsigned int n, i; - size_t v; - - for ( v = value, n = 0; v && (n < sizeof(size_t)); ++n, v >>= 8 ) - ; /* empty */ - for ( i = 1; i <= n; ++i ) - encbuf[i-1] = (unsigned char)(value >> (8 * (n-i))); - encbuf[n] = (unsigned char)n; - return n + 1; -} - -int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputLen) -{ - KangarooTwelve_SetProcessorCapabilities(); - ktInstance->fixedOutputLength = outputLen; - ktInstance->queueAbsorbedLen = 0; - ktInstance->blockNumber = 0; - ktInstance->phase = ABSORBING; - return KeccakWidth1600_12rounds_SpongeInitialize(&ktInstance->finalNode, rate, capacity); -} - -int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inLen) -{ - if (ktInstance->phase != ABSORBING) - return 1; - - if ( ktInstance->blockNumber == 0 ) { - /* First block, absorb in final node */ - unsigned int len = (inLen < (chunkSize - ktInstance->queueAbsorbedLen)) ? inLen : (chunkSize - ktInstance->queueAbsorbedLen); - if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, input, len) != 0) - return 1; - input += len; - inLen -= len; - ktInstance->queueAbsorbedLen += len; - if ( (ktInstance->queueAbsorbedLen == chunkSize) && (inLen != 0) ) { - /* First block complete and more input data available, finalize it */ - const unsigned char padding = 0x03; /* '110^6': message hop, simple padding */ - ktInstance->queueAbsorbedLen = 0; - ktInstance->blockNumber = 1; - if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, &padding, 1) != 0) - return 1; - ktInstance->finalNode.byteIOIndex = (ktInstance->finalNode.byteIOIndex + 7) & ~7; /* Zero padding up to 64 bits */ - } - } - else if ( ktInstance->queueAbsorbedLen != 0 ) { - /* There is data in the queue, absorb further in queue until block complete */ - unsigned int len = (inLen < (chunkSize - ktInstance->queueAbsorbedLen)) ? inLen : (chunkSize - ktInstance->queueAbsorbedLen); - if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->queueNode, input, len) != 0) - return 1; - input += len; - inLen -= len; - ktInstance->queueAbsorbedLen += len; - if ( ktInstance->queueAbsorbedLen == chunkSize ) { - unsigned char intermediate[capacityInBytes]; - ktInstance->queueAbsorbedLen = 0; - ++ktInstance->blockNumber; - if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->queueNode, suffixLeaf) != 0) - return 1; - if (KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->queueNode, intermediate, capacityInBytes) != 0) - return 1; - if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, capacityInBytes) != 0) - return 1; - } - } - -#ifndef KeccakP1600_disableParallelism - if (KeccakP1600times8_IsAvailable()) { - ProcessLeaves(8); - } - - if (KeccakP1600times4_IsAvailable()) { - ProcessLeaves(4); - } - - if (KeccakP1600times2_IsAvailable()) { - ProcessLeaves(2); - } -#endif - - while ( inLen > 0 ) { - unsigned int len = (inLen < chunkSize) ? inLen : chunkSize; - if (KeccakWidth1600_12rounds_SpongeInitialize(&ktInstance->queueNode, rate, capacity) != 0) - return 1; - if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->queueNode, input, len) != 0) - return 1; - input += len; - inLen -= len; - if ( len == chunkSize ) { - unsigned char intermediate[capacityInBytes]; - ++ktInstance->blockNumber; - if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->queueNode, suffixLeaf) != 0) - return 1; - if (KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->queueNode, intermediate, capacityInBytes) != 0) - return 1; - if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, capacityInBytes) != 0) - return 1; - } - else - ktInstance->queueAbsorbedLen = len; - } - - return 0; -} - -int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char * output, const unsigned char * customization, size_t customLen) -{ - unsigned char encbuf[sizeof(size_t)+1+2]; - unsigned char padding; - - if (ktInstance->phase != ABSORBING) - return 1; - - /* Absorb customization | right_encode(customLen) */ - if ((customLen != 0) && (KangarooTwelve_Update(ktInstance, customization, customLen) != 0)) - return 1; - if (KangarooTwelve_Update(ktInstance, encbuf, right_encode(encbuf, customLen)) != 0) - return 1; - - if ( ktInstance->blockNumber == 0 ) { - /* Non complete first block in final node, pad it */ - padding = 0x07; /* '11': message hop, final node */ - } - else { - unsigned int n; - - if ( ktInstance->queueAbsorbedLen != 0 ) { - /* There is data in the queue node */ - unsigned char intermediate[capacityInBytes]; - ++ktInstance->blockNumber; - if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->queueNode, suffixLeaf) != 0) - return 1; - if (KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->queueNode, intermediate, capacityInBytes) != 0) - return 1; - if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, capacityInBytes) != 0) - return 1; - } - --ktInstance->blockNumber; /* Absorb right_encode(number of Chaining Values) || 0xFF || 0xFF */ - n = right_encode(encbuf, ktInstance->blockNumber); - encbuf[n++] = 0xFF; - encbuf[n++] = 0xFF; - if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, encbuf, n) != 0) - return 1; - padding = 0x06; /* '01': chaining hop, final node */ - } - if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->finalNode, padding) != 0) - return 1; - if ( ktInstance->fixedOutputLength != 0 ) { - ktInstance->phase = FINAL; - return KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->finalNode, output, ktInstance->fixedOutputLength); - } - ktInstance->phase = SQUEEZING; - return 0; -} - -int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char * output, size_t outputLen) -{ - if (ktInstance->phase != SQUEEZING) - return 1; - return KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->finalNode, output, outputLen); -} - -int KangarooTwelve( const unsigned char * input, size_t inLen, unsigned char * output, size_t outLen, const unsigned char * customization, size_t customLen ) -{ - KangarooTwelve_Instance ktInstance; - - if (outLen == 0) - return 1; - if (KangarooTwelve_Initialize(&ktInstance, outLen) != 0) - return 1; - if (KangarooTwelve_Update(&ktInstance, input, inLen) != 0) - return 1; - return KangarooTwelve_Final(&ktInstance, output, customization, customLen); -} - -/* Processor capability detection code by Samuel Neves and Jack O'Connor, see - * https://github.com/BLAKE3-team/BLAKE3/blob/master/c/blake3_dispatch.c - */ - -#if defined(__x86_64__) || defined(_M_X64) -#define IS_X86 -#define IS_X86_64 -#endif - -#if defined(__i386__) || defined(_M_IX86) -#define IS_X86 -#define IS_X86_32 -#endif - -#if defined(IS_X86) -static uint64_t xgetbv() { -#if defined(_MSC_VER) - return _xgetbv(0); -#else - uint32_t eax = 0, edx = 0; - __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); - return ((uint64_t)edx << 32) | eax; -#endif -} - -static void cpuid(uint32_t out[4], uint32_t id) { -#if defined(_MSC_VER) - __cpuid((int *)out, id); -#else -#if defined(__i386__) || defined(_M_IX86) - __asm__ __volatile__("movl %%ebx, %1\n" - "cpuid\n" - "xchgl %1, %%ebx\n" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id)); -#else - __asm__ __volatile__("cpuid\n" - : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id)); -#endif -#endif -} - -static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { -#if defined(_MSC_VER) - __cpuidex((int *)out, id, sid); -#else - __asm__ __volatile__("movl %%ebx, %1\n" - "cpuid\n" - "xchgl %1, %%ebx\n" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id), "c"(sid)); -#endif -} - -#endif - -enum cpu_feature { - SSE2 = 1 << 0, - SSSE3 = 1 << 1, - SSE41 = 1 << 2, - AVX = 1 << 3, - AVX2 = 1 << 4, - AVX512F = 1 << 5, - AVX512VL = 1 << 6, - /* ... */ - UNDEFINED = 1 << 30 -}; - -static enum cpu_feature g_cpu_features = UNDEFINED; - -static enum cpu_feature - get_cpu_features() { - - if (g_cpu_features != UNDEFINED) { - return g_cpu_features; - } else { -#if defined(IS_X86) - uint32_t regs[4] = {0}; - uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; - (void)edx; - enum cpu_feature features = 0; - cpuid(regs, 0); - const int max_id = *eax; - cpuid(regs, 1); -#if defined(__amd64__) || defined(_M_X64) - features |= SSE2; -#else - if (*edx & (1UL << 26)) - features |= SSE2; -#endif - if (*ecx & (1UL << 0)) - features |= SSSE3; - if (*ecx & (1UL << 19)) - features |= SSE41; - - if (*ecx & (1UL << 27)) { // OSXSAVE - const uint64_t mask = xgetbv(); - if ((mask & 6) == 6) { // SSE and AVX states - if (*ecx & (1UL << 28)) - features |= AVX; - if (max_id >= 7) { - cpuidex(regs, 7, 0); - if (*ebx & (1UL << 5)) - features |= AVX2; - if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm - if (*ebx & (1UL << 31)) - features |= AVX512VL; - if (*ebx & (1UL << 16)) - features |= AVX512F; - } - } - } - } - g_cpu_features = features; - return features; -#else - /* How to detect NEON? */ - return 0; -#endif - } -} - -void KangarooTwelve_SetProcessorCapabilities() -{ - enum cpu_feature features = get_cpu_features(); - K12_enableSSSE3 = (features & SSSE3); - K12_enableAVX2 = (features & AVX2); - K12_enableAVX512 = (features & AVX512F) && (features & AVX512VL); -} diff --git a/benches/kangarootwelve/K12/lib/KangarooTwelve.h b/benches/kangarootwelve/K12/lib/KangarooTwelve.h deleted file mode 100644 index 6b9edfa..0000000 --- a/benches/kangarootwelve/K12/lib/KangarooTwelve.h +++ /dev/null @@ -1,110 +0,0 @@ -/* -Implementation by Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to our website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#ifndef _KangarooTwelve_h_ -#define _KangarooTwelve_h_ - -#include -#include "KeccakP-1600-SnP.h" - -#ifdef ALIGN -#undef ALIGN -#endif - -#if defined(__GNUC__) -#define ALIGN(x) __attribute__ ((aligned(x))) -#elif defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#elif defined(__ARMCC_VERSION) -#define ALIGN(x) __align(x) -#else -#define ALIGN(x) -#endif - -ALIGN(KeccakP1600_stateAlignment) typedef struct KeccakWidth1600_12rounds_SpongeInstanceStruct { - unsigned char state[KeccakP1600_stateSizeInBytes]; - unsigned int rate; - unsigned int byteIOIndex; - int squeezing; -} KeccakWidth1600_12rounds_SpongeInstance; - -typedef enum { - NOT_INITIALIZED, - ABSORBING, - FINAL, - SQUEEZING -} KCP_Phases; -typedef KCP_Phases KangarooTwelve_Phases; - -typedef struct { - KeccakWidth1600_12rounds_SpongeInstance queueNode; - KeccakWidth1600_12rounds_SpongeInstance finalNode; - size_t fixedOutputLength; - size_t blockNumber; - unsigned int queueAbsorbedLen; - KangarooTwelve_Phases phase; -} KangarooTwelve_Instance; - -/** Extendable ouput function KangarooTwelve. - * @param input Pointer to the input message (M). - * @param inputByteLen The length of the input message in bytes. - * @param output Pointer to the output buffer. - * @param outputByteLen The desired number of output bytes. - * @param customization Pointer to the customization string (C). - * @param customByteLen The length of the customization string in bytes. - * @return 0 if successful, 1 otherwise. - */ -int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen ); - -/** - * Function to initialize a KangarooTwelve instance. - * @param ktInstance Pointer to the instance to be initialized. - * @param outputByteLen The desired number of output bytes, - * or 0 for an arbitrarily-long output. - * @return 0 if successful, 1 otherwise. - */ -int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputByteLen); - -/** - * Function to give input data to be absorbed. - * @param ktInstance Pointer to the instance initialized by KangarooTwelve_Initialize(). - * @param input Pointer to the input message data (M). - * @param inputByteLen The number of bytes provided in the input message data. - * @return 0 if successful, 1 otherwise. - */ -int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inputByteLen); - -/** - * Function to call after all the input message has been input, and to get - * output bytes if the length was specified when calling KangarooTwelve_Initialize(). - * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). - * If @a outputByteLen was not 0 in the call to KangarooTwelve_Initialize(), the number of - * output bytes is equal to @a outputByteLen. - * If @a outputByteLen was 0 in the call to KangarooTwelve_Initialize(), the output bytes - * must be extracted using the KangarooTwelve_Squeeze() function. - * @param output Pointer to the buffer where to store the output data. - * @param customization Pointer to the customization string (C). - * @param customByteLen The length of the customization string in bytes. - * @return 0 if successful, 1 otherwise. - */ -int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char *output, const unsigned char *customization, size_t customByteLen); - -/** - * Function to squeeze output data. - * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). - * @param data Pointer to the buffer where to store the output data. - * @param outputByteLen The number of output bytes desired. - * @pre KangarooTwelve_Final() must have been already called. - * @return 0 if successful, 1 otherwise. - */ -int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char *output, size_t outputByteLen); - -#endif diff --git a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-AVX2.s b/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-AVX2.s deleted file mode 100644 index e394ea0..0000000 --- a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-AVX2.s +++ /dev/null @@ -1,607 +0,0 @@ -# Copyright (c) 2006-2017, CRYPTOGAMS by -# Copyright (c) 2017 Ronny Van Keer -# All rights reserved. -# -# The source code in this file is licensed under the CRYPTOGAMS license. -# For further details see http://www.openssl.org/~appro/cryptogams/. -# -# Notes: -# The code for the permutation (__KeccakF1600) was generated with -# Andy Polyakov's keccak1600-avx2.pl from the CRYPTOGAMS project -# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx2.pl). -# The rest of the code was written by Ronny Van Keer. - -.arch .avx2 - -.text - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX2_Initialize(void *state); -# -.globl KeccakP1600_AVX2_Initialize -.type KeccakP1600_AVX2_Initialize,@function -.align 32 -KeccakP1600_AVX2_Initialize: - vpxor %ymm0,%ymm0,%ymm0 - vmovdqa %ymm0,0*32(%rdi) - vmovdqa %ymm0,1*32(%rdi) - vmovdqa %ymm0,2*32(%rdi) - vmovdqa %ymm0,3*32(%rdi) - vmovdqa %ymm0,4*32(%rdi) - vmovdqa %ymm0,5*32(%rdi) - movq $0,6*32(%rdi) - ret -.size KeccakP1600_AVX2_Initialize,.-KeccakP1600_AVX2_Initialize - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset); -# %rdi %rsi %rdx -# -.globl KeccakP1600_AVX2_AddByte -.type KeccakP1600_AVX2_AddByte,@function -.align 32 -KeccakP1600_AVX2_AddByte: - mov %rdx, %rax - and $7, %rax - and $0xFFFFFFF8, %edx - lea mapState(%rip), %r9 - mov (%r9, %rdx), %rdx - add %rdx, %rdi - add %rax, %rdi - xorb %sil, (%rdi) - ret -.size KeccakP1600_AVX2_AddByte,.-KeccakP1600_AVX2_AddByte - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -# %rdi %rsi %rdx %rcx -# -.globl KeccakP1600_AVX2_AddBytes -.type KeccakP1600_AVX2_AddBytes,@function -.align 32 -KeccakP1600_AVX2_AddBytes: - cmp $0, %rcx - jz KeccakP1600_AVX2_AddBytes_Exit - mov %rdx, %rax # rax offset in lane - and $0xFFFFFFF8, %edx # rdx pointer into state index mapper - lea mapState(%rip), %r9 - add %r9, %rdx - and $7, %rax - jz KeccakP1600_AVX2_AddBytes_LaneAlignedCheck - mov $8, %r9 # r9 is (max) length of incomplete lane - sub %rax, %r9 - cmp %rcx, %r9 - cmovae %rcx, %r9 - sub %r9, %rcx # length -= length of incomplete lane - add (%rdx), %rax # rax = pointer to state lane - add $8, %rdx - add %rdi, %rax -KeccakP1600_AVX2_AddBytes_NotAlignedLoop: - mov (%rsi), %r8b - inc %rsi - xorb %r8b, (%rax) - inc %rax - dec %r9 - jnz KeccakP1600_AVX2_AddBytes_NotAlignedLoop - jmp KeccakP1600_AVX2_AddBytes_LaneAlignedCheck -KeccakP1600_AVX2_AddBytes_LaneAlignedLoop: - mov (%rsi), %r8 - add $8, %rsi - mov (%rdx), %rax - add $8, %rdx - add %rdi, %rax - xor %r8, (%rax) -KeccakP1600_AVX2_AddBytes_LaneAlignedCheck: - sub $8, %rcx - jnc KeccakP1600_AVX2_AddBytes_LaneAlignedLoop -KeccakP1600_AVX2_AddBytes_LastIncompleteLane: - add $8, %rcx - jz KeccakP1600_AVX2_AddBytes_Exit - mov (%rdx), %rax - add %rdi, %rax -KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop: - mov (%rsi), %r8b - inc %rsi - xor %r8b, (%rax) - inc %rax - dec %rcx - jnz KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop -KeccakP1600_AVX2_AddBytes_Exit: - ret -.size KeccakP1600_AVX2_AddBytes,.-KeccakP1600_AVX2_AddBytes - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -# %rdi %rsi %rdx %rcx -# -.globl KeccakP1600_AVX2_ExtractBytes -.type KeccakP1600_AVX2_ExtractBytes,@function -.align 32 -KeccakP1600_AVX2_ExtractBytes: - push %rbx - cmp $0, %rcx - jz KeccakP1600_AVX2_ExtractBytes_Exit - mov %rdx, %rax # rax offset in lane - and $0xFFFFFFF8, %edx # rdx pointer into state index mapper - lea mapState(%rip), %r9 - add %r9, %rdx - and $7, %rax - jz KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck - mov $8, %rbx # rbx is (max) length of incomplete lane - sub %rax, %rbx - cmp %rcx, %rbx - cmovae %rcx, %rbx - sub %rbx, %rcx # length -= length of incomplete lane - mov (%rdx), %r9 - add $8, %rdx - add %rdi, %r9 - add %rax, %r9 -KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop: - mov (%r9), %r8b - inc %r9 - mov %r8b, (%rsi) - inc %rsi - dec %rbx - jnz KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop - jmp KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck -KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop: - mov (%rdx), %rax - add $8, %rdx - add %rdi, %rax - mov (%rax), %r8 - mov %r8, (%rsi) - add $8, %rsi -KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck: - sub $8, %rcx - jnc KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop -KeccakP1600_AVX2_ExtractBytes_LastIncompleteLane: - add $8, %rcx - jz KeccakP1600_AVX2_ExtractBytes_Exit - mov (%rdx), %rax - add %rdi, %rax - mov (%rax), %r8 -KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop: - mov %r8b, (%rsi) - shr $8, %r8 - inc %rsi - dec %rcx - jnz KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop -KeccakP1600_AVX2_ExtractBytes_Exit: - pop %rbx - ret -.size KeccakP1600_AVX2_ExtractBytes,.-KeccakP1600_AVX2_ExtractBytes - -# ----------------------------------------------------------------------------- -# -# internal -# -.type __KeccakF1600,@function -.align 32 -__KeccakF1600: -.Loop_avx2: - ######################################### Theta - vpshufd $0b01001110,%ymm2,%ymm13 - vpxor %ymm3,%ymm5,%ymm12 - vpxor %ymm6,%ymm4,%ymm9 - vpxor %ymm1,%ymm12,%ymm12 - vpxor %ymm9,%ymm12,%ymm12 # C[1..4] - - vpermq $0b10010011,%ymm12,%ymm11 - vpxor %ymm2,%ymm13,%ymm13 - vpermq $0b01001110,%ymm13,%ymm7 - - vpsrlq $63,%ymm12,%ymm8 - vpaddq %ymm12,%ymm12,%ymm9 - vpor %ymm9,%ymm8,%ymm8 # ROL64(C[1..4],1) - - vpermq $0b00111001,%ymm8,%ymm15 - vpxor %ymm11,%ymm8,%ymm14 - vpermq $0b00000000,%ymm14,%ymm14 # D[0..0] = ROL64(C[1],1) ^ C[4] - - vpxor %ymm0,%ymm13,%ymm13 - vpxor %ymm7,%ymm13,%ymm13 # C[0..0] - - vpsrlq $63,%ymm13,%ymm7 - vpaddq %ymm13,%ymm13,%ymm8 - vpor %ymm7,%ymm8,%ymm8 # ROL64(C[0..0],1) - - vpxor %ymm14,%ymm2,%ymm2 # ^= D[0..0] - vpxor %ymm14,%ymm0,%ymm0 # ^= D[0..0] - - vpblendd $0b11000000,%ymm8,%ymm15,%ymm15 - vpblendd $0b00000011,%ymm13,%ymm11,%ymm11 - vpxor %ymm11,%ymm15,%ymm15 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3] - - ######################################### Rho + Pi + pre-Chi shuffle - vpsllvq 0*32-96(%r8),%ymm2,%ymm10 - vpsrlvq 0*32-96(%r9),%ymm2,%ymm2 - vpor %ymm10,%ymm2,%ymm2 - - vpxor %ymm15,%ymm3,%ymm3 # ^= D[1..4] from Theta - vpsllvq 2*32-96(%r8),%ymm3,%ymm11 - vpsrlvq 2*32-96(%r9),%ymm3,%ymm3 - vpor %ymm11,%ymm3,%ymm3 - - vpxor %ymm15,%ymm4,%ymm4 # ^= D[1..4] from Theta - vpsllvq 3*32-96(%r8),%ymm4,%ymm12 - vpsrlvq 3*32-96(%r9),%ymm4,%ymm4 - vpor %ymm12,%ymm4,%ymm4 - - vpxor %ymm15,%ymm5,%ymm5 # ^= D[1..4] from Theta - vpsllvq 4*32-96(%r8),%ymm5,%ymm13 - vpsrlvq 4*32-96(%r9),%ymm5,%ymm5 - vpor %ymm13,%ymm5,%ymm5 - - vpxor %ymm15,%ymm6,%ymm6 # ^= D[1..4] from Theta - vpermq $0b10001101,%ymm2,%ymm10 # %ymm2 -> future %ymm3 - vpermq $0b10001101,%ymm3,%ymm11 # %ymm3 -> future %ymm4 - vpsllvq 5*32-96(%r8),%ymm6,%ymm14 - vpsrlvq 5*32-96(%r9),%ymm6,%ymm8 - vpor %ymm14,%ymm8,%ymm8 # %ymm6 -> future %ymm1 - - vpxor %ymm15,%ymm1,%ymm1 # ^= D[1..4] from Theta - vpermq $0b00011011,%ymm4,%ymm12 # %ymm4 -> future %ymm5 - vpermq $0b01110010,%ymm5,%ymm13 # %ymm5 -> future %ymm6 - vpsllvq 1*32-96(%r8),%ymm1,%ymm15 - vpsrlvq 1*32-96(%r9),%ymm1,%ymm9 - vpor %ymm15,%ymm9,%ymm9 # %ymm1 -> future %ymm2 - - ######################################### Chi - vpsrldq $8,%ymm8,%ymm14 - vpandn %ymm14,%ymm8,%ymm7 # tgting [0][0] [0][0] [0][0] [0][0] - - vpblendd $0b00001100,%ymm13,%ymm9,%ymm3 # [4][4] [2][0] - vpblendd $0b00001100,%ymm9,%ymm11,%ymm15 # [4][0] [2][1] - vpblendd $0b00001100,%ymm11,%ymm10,%ymm5 # [4][2] [2][4] - vpblendd $0b00001100,%ymm10,%ymm9,%ymm14 # [4][3] [2][0] - vpblendd $0b00110000,%ymm11,%ymm3,%ymm3 # [1][3] [4][4] [2][0] - vpblendd $0b00110000,%ymm12,%ymm15,%ymm15 # [1][4] [4][0] [2][1] - vpblendd $0b00110000,%ymm9,%ymm5,%ymm5 # [1][0] [4][2] [2][4] - vpblendd $0b00110000,%ymm13,%ymm14,%ymm14 # [1][1] [4][3] [2][0] - vpblendd $0b11000000,%ymm12,%ymm3,%ymm3 # [3][2] [1][3] [4][4] [2][0] - vpblendd $0b11000000,%ymm13,%ymm15,%ymm15 # [3][3] [1][4] [4][0] [2][1] - vpblendd $0b11000000,%ymm13,%ymm5,%ymm5 # [3][3] [1][0] [4][2] [2][4] - vpblendd $0b11000000,%ymm11,%ymm14,%ymm14 # [3][4] [1][1] [4][3] [2][0] - vpandn %ymm15,%ymm3,%ymm3 # tgting [3][1] [1][2] [4][3] [2][4] - vpandn %ymm14,%ymm5,%ymm5 # tgting [3][2] [1][4] [4][1] [2][3] - - vpblendd $0b00001100,%ymm9,%ymm12,%ymm6 # [4][0] [2][3] - vpblendd $0b00001100,%ymm12,%ymm10,%ymm15 # [4][1] [2][4] - vpxor %ymm10,%ymm3,%ymm3 - vpblendd $0b00110000,%ymm10,%ymm6,%ymm6 # [1][2] [4][0] [2][3] - vpblendd $0b00110000,%ymm11,%ymm15,%ymm15 # [1][3] [4][1] [2][4] - vpxor %ymm12,%ymm5,%ymm5 - vpblendd $0b11000000,%ymm11,%ymm6,%ymm6 # [3][4] [1][2] [4][0] [2][3] - vpblendd $0b11000000,%ymm9,%ymm15,%ymm15 # [3][0] [1][3] [4][1] [2][4] - vpandn %ymm15,%ymm6,%ymm6 # tgting [3][3] [1][1] [4][4] [2][2] - vpxor %ymm13,%ymm6,%ymm6 - - vpermq $0b00011110,%ymm8,%ymm4 # [0][1] [0][2] [0][4] [0][3] - vpblendd $0b00110000,%ymm0,%ymm4,%ymm15 # [0][1] [0][0] [0][4] [0][3] - vpermq $0b00111001,%ymm8,%ymm1 # [0][1] [0][4] [0][3] [0][2] - vpblendd $0b11000000,%ymm0,%ymm1,%ymm1 # [0][0] [0][4] [0][3] [0][2] - vpandn %ymm15,%ymm1,%ymm1 # tgting [0][4] [0][3] [0][2] [0][1] - - vpblendd $0b00001100,%ymm12,%ymm11,%ymm2 # [4][1] [2][1] - vpblendd $0b00001100,%ymm11,%ymm13,%ymm14 # [4][2] [2][2] - vpblendd $0b00110000,%ymm13,%ymm2,%ymm2 # [1][1] [4][1] [2][1] - vpblendd $0b00110000,%ymm10,%ymm14,%ymm14 # [1][2] [4][2] [2][2] - vpblendd $0b11000000,%ymm10,%ymm2,%ymm2 # [3][1] [1][1] [4][1] [2][1] - vpblendd $0b11000000,%ymm12,%ymm14,%ymm14 # [3][2] [1][2] [4][2] [2][2] - vpandn %ymm14,%ymm2,%ymm2 # tgting [3][0] [1][0] [4][0] [2][0] - vpxor %ymm9,%ymm2,%ymm2 - - vpermq $0b00000000,%ymm7,%ymm7 # [0][0] [0][0] [0][0] [0][0] - vpermq $0b00011011,%ymm3,%ymm3 # post-Chi shuffle - vpermq $0b10001101,%ymm5,%ymm5 - vpermq $0b01110010,%ymm6,%ymm6 - - vpblendd $0b00001100,%ymm10,%ymm13,%ymm4 # [4][3] [2][2] - vpblendd $0b00001100,%ymm13,%ymm12,%ymm14 # [4][4] [2][3] - vpblendd $0b00110000,%ymm12,%ymm4,%ymm4 # [1][4] [4][3] [2][2] - vpblendd $0b00110000,%ymm9,%ymm14,%ymm14 # [1][0] [4][4] [2][3] - vpblendd $0b11000000,%ymm9,%ymm4,%ymm4 # [3][0] [1][4] [4][3] [2][2] - vpblendd $0b11000000,%ymm10,%ymm14,%ymm14 # [3][1] [1][0] [4][4] [2][3] - vpandn %ymm14,%ymm4,%ymm4 # tgting [3][4] [1][3] [4][2] [2][1] - - vpxor %ymm7,%ymm0,%ymm0 - vpxor %ymm8,%ymm1,%ymm1 - vpxor %ymm11,%ymm4,%ymm4 - - ######################################### Iota - vpxor (%r10),%ymm0,%ymm0 - lea 32(%r10),%r10 - - dec %eax - jnz .Loop_avx2 - ret -.size __KeccakF1600,.-__KeccakF1600 - - - -.globl KeccakP1600_AVX2_Permute_12rounds -.type KeccakP1600_AVX2_Permute_12rounds,@function -.align 32 -KeccakP1600_AVX2_Permute_12rounds: - lea rhotates_left+96(%rip),%r8 - lea rhotates_right+96(%rip),%r9 - lea iotas+12*4*8(%rip),%r10 - mov $12,%eax - lea 96(%rdi),%rdi - vzeroupper - vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] - vmovdqu 8+32*0-96(%rdi),%ymm1 - vmovdqu 8+32*1-96(%rdi),%ymm2 - vmovdqu 8+32*2-96(%rdi),%ymm3 - vmovdqu 8+32*3-96(%rdi),%ymm4 - vmovdqu 8+32*4-96(%rdi),%ymm5 - vmovdqu 8+32*5-96(%rdi),%ymm6 - call __KeccakF1600 - vmovq %xmm0,-96(%rdi) - vmovdqu %ymm1,8+32*0-96(%rdi) - vmovdqu %ymm2,8+32*1-96(%rdi) - vmovdqu %ymm3,8+32*2-96(%rdi) - vmovdqu %ymm4,8+32*3-96(%rdi) - vmovdqu %ymm5,8+32*4-96(%rdi) - vmovdqu %ymm6,8+32*5-96(%rdi) - vzeroupper - ret -.size KeccakP1600_AVX2_Permute_12rounds,.-KeccakP1600_AVX2_Permute_12rounds - -# ----------------------------------------------------------------------------- -# -# size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); -# %rdi %rsi %rdx %rcx -# -.globl KeccakP1600_AVX2_12rounds_FastLoop_Absorb -.type KeccakP1600_AVX2_12rounds_FastLoop_Absorb,@function -.align 32 -KeccakP1600_AVX2_12rounds_FastLoop_Absorb: - push %rbx - push %r10 - shr $3, %rcx # rcx = data length in lanes - mov %rdx, %rbx # rbx = initial data pointer - cmp %rsi, %rcx - jb KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit - vzeroupper - cmp $21, %rsi - jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes - sub $21, %rcx - lea rhotates_left+96(%rip),%r8 - lea rhotates_right+96(%rip),%r9 - lea 96(%rdi),%rdi - vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] - vmovdqu 8+32*0-96(%rdi),%ymm1 - vmovdqu 8+32*1-96(%rdi),%ymm2 - vmovdqu 8+32*2-96(%rdi),%ymm3 - vmovdqu 8+32*3-96(%rdi),%ymm4 - vmovdqu 8+32*4-96(%rdi),%ymm5 - vmovdqu 8+32*5-96(%rdi),%ymm6 -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes: - vpbroadcastq (%rdx),%ymm7 - vmovdqu 8(%rdx),%ymm8 - - vmovdqa map2(%rip), %xmm15 - vpcmpeqq %ymm14, %ymm14, %ymm14 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 - - vmovdqa mask3_21(%rip), %ymm14 - vpxor %ymm10, %ymm10, %ymm10 - vmovdqa map3(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 - - vmovdqa mask4_21(%rip), %ymm14 - vpxor %ymm11, %ymm11, %ymm11 - vmovdqa map4(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 - - vmovdqa mask5_21(%rip), %ymm14 - vpxor %ymm12, %ymm12, %ymm12 - vmovdqa map5(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 - - vmovdqa mask6_21(%rip), %ymm14 - vpxor %ymm13, %ymm13, %ymm13 - vmovdqa map6(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 - - vpxor %ymm7,%ymm0,%ymm0 - vpxor %ymm8,%ymm1,%ymm1 - vpxor %ymm9,%ymm2,%ymm2 - vpxor %ymm10,%ymm3,%ymm3 - vpxor %ymm11,%ymm4,%ymm4 - vpxor %ymm12,%ymm5,%ymm5 - vpxor %ymm13,%ymm6,%ymm6 - add $21*8, %rdx - lea iotas+12*4*8(%rip),%r10 - mov $12,%eax - call __KeccakF1600 - sub $21, %rcx - jnc KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit: - vmovq %xmm0,-96(%rdi) - vmovdqu %ymm1,8+32*0-96(%rdi) - vmovdqu %ymm2,8+32*1-96(%rdi) - vmovdqu %ymm3,8+32*2-96(%rdi) - vmovdqu %ymm4,8+32*3-96(%rdi) - vmovdqu %ymm5,8+32*4-96(%rdi) - vmovdqu %ymm6,8+32*5-96(%rdi) -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit: - vzeroupper - mov %rdx, %rax # return number of bytes processed - sub %rbx, %rax - pop %r10 - pop %rbx - ret -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes: - cmp $17, %rsi - jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes - sub $17, %rcx - lea rhotates_left+96(%rip),%r8 - lea rhotates_right+96(%rip),%r9 - lea 96(%rdi),%rdi - vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] - vmovdqu 8+32*0-96(%rdi),%ymm1 - vmovdqu 8+32*1-96(%rdi),%ymm2 - vmovdqu 8+32*2-96(%rdi),%ymm3 - vmovdqu 8+32*3-96(%rdi),%ymm4 - vmovdqu 8+32*4-96(%rdi),%ymm5 - vmovdqu 8+32*5-96(%rdi),%ymm6 -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes: - vpbroadcastq (%rdx),%ymm7 - vmovdqu 8(%rdx),%ymm8 - - vmovdqa mask2_17(%rip), %ymm14 - vpxor %ymm9, %ymm9, %ymm9 - vmovdqa map2(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 - - vmovdqa mask3_17(%rip), %ymm14 - vpxor %ymm10, %ymm10, %ymm10 - vmovdqa map3(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 - - vmovdqa mask4_17(%rip), %ymm14 - vpxor %ymm11, %ymm11, %ymm11 - vmovdqa map4(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 - - vmovdqa mask5_17(%rip), %ymm14 - vpxor %ymm12, %ymm12, %ymm12 - vmovdqa map5(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 - - vmovdqa mask6_17(%rip), %ymm14 - vpxor %ymm13, %ymm13, %ymm13 - vmovdqa map6(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 - - vpxor %ymm7,%ymm0,%ymm0 - vpxor %ymm8,%ymm1,%ymm1 - vpxor %ymm9,%ymm2,%ymm2 - vpxor %ymm10,%ymm3,%ymm3 - vpxor %ymm11,%ymm4,%ymm4 - vpxor %ymm12,%ymm5,%ymm5 - vpxor %ymm13,%ymm6,%ymm6 - add $17*8, %rdx - lea iotas+12*4*8(%rip),%r10 - mov $12,%eax - call __KeccakF1600 - sub $17, %rcx - jnc KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes - jmp KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes: - lea mapState(%rip), %r9 - mov %rsi, %rax -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop: - mov (%rdx), %r8 - add $8, %rdx - mov (%r9), %r10 - add $8, %r9 - add %rdi, %r10 - xor %r8, (%r10) - sub $1, %rax - jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop - sub %rsi, %rcx - push %rdi - push %rsi - push %rdx - push %rcx - call KeccakP1600_AVX2_Permute_12rounds@PLT - pop %rcx - pop %rdx - pop %rsi - pop %rdi - cmp %rsi, %rcx - jae KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes - jmp KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit -.size KeccakP1600_AVX2_12rounds_FastLoop_Absorb,.-KeccakP1600_AVX2_12rounds_FastLoop_Absorb - -.equ ALLON, 0xFFFFFFFFFFFFFFFF - -.align 64 -rhotates_left: - .quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0] - .quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4] - .quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4] - .quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4] - .quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4] - .quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4] -rhotates_right: - .quad 64-3, 64-18, 64-36, 64-41 - .quad 64-1, 64-62, 64-28, 64-27 - .quad 64-45, 64-6, 64-56, 64-39 - .quad 64-10, 64-61, 64-55, 64-8 - .quad 64-2, 64-15, 64-25, 64-20 - .quad 64-44, 64-43, 64-21, 64-14 -iotas: - .quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001 - .quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082 - .quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a - .quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000 - .quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b - .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 - .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 - .quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009 - .quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a - .quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088 - .quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009 - .quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a - .quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b - .quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b - .quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089 - .quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003 - .quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002 - .quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080 - .quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a - .quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a - .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 - .quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080 - .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 - .quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008 - -mapState: - .quad 0*8, 1*8, 2*8, 3*8, 4*8 - .quad 7*8, 21*8, 10*8, 15*8, 20*8 - .quad 5*8, 13*8, 22*8, 19*8, 12*8 - .quad 8*8, 9*8, 18*8, 23*8, 16*8 - .quad 6*8, 17*8, 14*8, 11*8, 24*8 - - .align 16 -map2: - .long 10*8, 20*8, 5*8, 15*8 -map3: - .long 16*8, 7*8, 23*8, 14*8 -map4: - .long 11*8, 22*8, 8*8, 19*8 -map5: - .long 21*8, 17*8, 13*8, 9*8 -map6: - .long 6*8, 12*8, 18*8, 24*8 - - .align 32 -mask3_21: - .quad ALLON, ALLON, 0, ALLON -mask4_21: - .quad ALLON, 0, ALLON, ALLON -mask5_21: - .quad 0, ALLON, ALLON, ALLON -mask6_21: - .quad ALLON, ALLON, ALLON, 0 - -mask2_17: - .quad ALLON, 0, ALLON, ALLON -mask3_17: - .quad ALLON, ALLON, 0, ALLON -mask4_17: - .quad ALLON, 0, ALLON, 0 -mask5_17: - .quad 0, 0, ALLON, ALLON -mask6_17: - .quad ALLON, ALLON, 0, 0 - -.asciz "Keccak-1600 for AVX2, CRYPTOGAMS by " diff --git a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-AVX512.s b/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-AVX512.s deleted file mode 100644 index d6fad5c..0000000 --- a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-AVX512.s +++ /dev/null @@ -1,503 +0,0 @@ -# Copyright (c) 2006-2017, CRYPTOGAMS by -# Copyright (c) 2018 Ronny Van Keer -# All rights reserved. -# -# The source code in this file is licensed under the CRYPTOGAMS license. -# For further details see http://www.openssl.org/~appro/cryptogams/. -# -# Notes: -# The code for the permutation (__KeccakF1600) was generated with -# Andy Polyakov's keccak1600-avx512.pl from the CRYPTOGAMS project -# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx512.pl). -# The rest of the code was written by Ronny Van Keer. - -.arch .avx512f - -.text - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX512_Initialize(void *state); -# -.globl KeccakP1600_AVX512_Initialize -.type KeccakP1600_AVX512_Initialize,@function -.align 32 -KeccakP1600_AVX512_Initialize: - vpxorq %zmm0,%zmm0,%zmm0 - vmovdqa64 %zmm0,0*64(%rdi) - vmovdqa64 %zmm0,1*64(%rdi) - vmovdqa64 %zmm0,2*64(%rdi) - movq $0,3*64(%rdi) - ret -.size KeccakP1600_AVX512_Initialize,.-KeccakP1600_AVX512_Initialize - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset); -# %rdi %rsi %rdx -#!! -#.globl KeccakP1600_AVX512_AddByte -#.type KeccakP1600_AVX512_AddByte,@function -#.align 32 -#KeccakP1600_AVX512_AddByte: -# mov %rdx, %rax -# and $7, %rax -# and $0xFFFFFFF8, %edx -# mov mapState(%rdx), %rdx -# add %rdx, %rdi -# add %rax, %rdi -# xorb %sil, (%rdi) -# ret -#.size KeccakP1600_AVX512_AddByte,.-KeccakP1600_AVX512_AddByte - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -# %rdi %rsi %rdx %rcx -# -.globl KeccakP1600_AVX512_AddBytes -.type KeccakP1600_AVX512_AddBytes,@function -.align 32 -KeccakP1600_AVX512_AddBytes: - cmp $0, %rcx - jz KeccakP1600_AVX512_AddBytes_Exit - add %rdx, %rdi # state += offset - and $7, %rdx - jz KeccakP1600_AVX512_AddBytes_LaneAlignedCheck - mov $8, %r9 # r9 is (max) length of incomplete lane - sub %rdx, %r9 - cmp %rcx, %r9 - cmovae %rcx, %r9 - sub %r9, %rcx # length -= length of incomplete lane -KeccakP1600_AVX512_AddBytes_NotAlignedLoop: - mov (%rsi), %r8b - inc %rsi - xorb %r8b, (%rdi) - inc %rdi - dec %r9 - jnz KeccakP1600_AVX512_AddBytes_NotAlignedLoop - jmp KeccakP1600_AVX512_AddBytes_LaneAlignedCheck -KeccakP1600_AVX512_AddBytes_LaneAlignedLoop: - mov (%rsi), %r8 - add $8, %rsi - xor %r8, (%rdi) - add $8, %rdi -KeccakP1600_AVX512_AddBytes_LaneAlignedCheck: - sub $8, %rcx - jnc KeccakP1600_AVX512_AddBytes_LaneAlignedLoop -KeccakP1600_AVX512_AddBytes_LastIncompleteLane: - add $8, %rcx - jz KeccakP1600_AVX512_AddBytes_Exit -KeccakP1600_AVX512_AddBytes_LastIncompleteLaneLoop: - mov (%rsi), %r8b - inc %rsi - xor %r8b, (%rdi) - inc %rdi - dec %rcx - jnz KeccakP1600_AVX512_AddBytes_LastIncompleteLaneLoop -KeccakP1600_AVX512_AddBytes_Exit: - ret -.size KeccakP1600_AVX512_AddBytes,.-KeccakP1600_AVX512_AddBytes - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -# %rdi %rsi %rdx %rcx -# -.globl KeccakP1600_AVX512_ExtractBytes -.type KeccakP1600_AVX512_ExtractBytes,@function -.align 32 -KeccakP1600_AVX512_ExtractBytes: - cmp $0, %rcx - jz KeccakP1600_AVX512_ExtractBytes_Exit - add %rdx, %rdi # state += offset - and $7, %rdx - jz KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck - mov $8, %rax # rax is (max) length of incomplete lane - sub %rdx, %rax - cmp %rcx, %rax - cmovae %rcx, %rax - sub %rax, %rcx # length -= length of incomplete lane -KeccakP1600_AVX512_ExtractBytes_NotAlignedLoop: - mov (%rdi), %r8b - inc %rdi - mov %r8b, (%rsi) - inc %rsi - dec %rax - jnz KeccakP1600_AVX512_ExtractBytes_NotAlignedLoop - jmp KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck -KeccakP1600_AVX512_ExtractBytes_LaneAlignedLoop: - mov (%rdi), %r8 - add $8, %rdi - mov %r8, (%rsi) - add $8, %rsi -KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck: - sub $8, %rcx - jnc KeccakP1600_AVX512_ExtractBytes_LaneAlignedLoop -KeccakP1600_AVX512_ExtractBytes_LastIncompleteLane: - add $8, %rcx - jz KeccakP1600_AVX512_ExtractBytes_Exit - mov (%rdi), %r8 -KeccakP1600_AVX512_ExtractBytes_LastIncompleteLaneLoop: - mov %r8b, (%rsi) - shr $8, %r8 - inc %rsi - dec %rcx - jnz KeccakP1600_AVX512_ExtractBytes_LastIncompleteLaneLoop -KeccakP1600_AVX512_ExtractBytes_Exit: - ret -.size KeccakP1600_AVX512_ExtractBytes,.-KeccakP1600_AVX512_ExtractBytes - -# ----------------------------------------------------------------------------- -# -# internal -# -.text -.type __KeccakF1600,@function -.align 32 -__KeccakF1600: -.Loop_avx512: - ######################################### Theta, even round - vmovdqa64 %zmm0,%zmm5 # put aside original A00 - vpternlogq $0x96,%zmm2,%zmm1,%zmm0 # and use it as "C00" - vpternlogq $0x96,%zmm4,%zmm3,%zmm0 - vprolq $1,%zmm0,%zmm6 - vpermq %zmm0,%zmm13,%zmm0 - vpermq %zmm6,%zmm16,%zmm6 - vpternlogq $0x96,%zmm0,%zmm6,%zmm5 # T[0] is original A00 - vpternlogq $0x96,%zmm0,%zmm6,%zmm1 - vpternlogq $0x96,%zmm0,%zmm6,%zmm2 - vpternlogq $0x96,%zmm0,%zmm6,%zmm3 - vpternlogq $0x96,%zmm0,%zmm6,%zmm4 - ######################################### Rho - vprolvq %zmm22,%zmm5,%zmm0 # T[0] is original A00 - vprolvq %zmm23,%zmm1,%zmm1 - vprolvq %zmm24,%zmm2,%zmm2 - vprolvq %zmm25,%zmm3,%zmm3 - vprolvq %zmm26,%zmm4,%zmm4 - ######################################### Pi - vpermq %zmm0,%zmm17,%zmm0 - vpermq %zmm1,%zmm18,%zmm1 - vpermq %zmm2,%zmm19,%zmm2 - vpermq %zmm3,%zmm20,%zmm3 - vpermq %zmm4,%zmm21,%zmm4 - ######################################### Chi - vmovdqa64 %zmm0,%zmm5 - vmovdqa64 %zmm1,%zmm6 - vpternlogq $0xD2,%zmm2,%zmm1,%zmm0 - vpternlogq $0xD2,%zmm3,%zmm2,%zmm1 - vpternlogq $0xD2,%zmm4,%zmm3,%zmm2 - vpternlogq $0xD2,%zmm5,%zmm4,%zmm3 - vpternlogq $0xD2,%zmm6,%zmm5,%zmm4 - ######################################### Iota - vpxorq (%r10),%zmm0,%zmm0{%k1} - lea 16(%r10),%r10 - ######################################### Harmonize rounds - vpblendmq %zmm2,%zmm1,%zmm6{%k2} - vpblendmq %zmm3,%zmm2,%zmm7{%k2} - vpblendmq %zmm4,%zmm3,%zmm8{%k2} - vpblendmq %zmm1,%zmm0,%zmm5{%k2} - vpblendmq %zmm0,%zmm4,%zmm9{%k2} - vpblendmq %zmm3,%zmm6,%zmm6{%k3} - vpblendmq %zmm4,%zmm7,%zmm7{%k3} - vpblendmq %zmm2,%zmm5,%zmm5{%k3} - vpblendmq %zmm0,%zmm8,%zmm8{%k3} - vpblendmq %zmm1,%zmm9,%zmm9{%k3} - vpblendmq %zmm4,%zmm6,%zmm6{%k4} - vpblendmq %zmm3,%zmm5,%zmm5{%k4} - vpblendmq %zmm0,%zmm7,%zmm7{%k4} - vpblendmq %zmm1,%zmm8,%zmm8{%k4} - vpblendmq %zmm2,%zmm9,%zmm9{%k4} - vpblendmq %zmm4,%zmm5,%zmm5{%k5} - vpblendmq %zmm0,%zmm6,%zmm6{%k5} - vpblendmq %zmm1,%zmm7,%zmm7{%k5} - vpblendmq %zmm2,%zmm8,%zmm8{%k5} - vpblendmq %zmm3,%zmm9,%zmm9{%k5} - #vpermq %zmm5,%zmm33,%zmm0 # doesn't actually change order - vpermq %zmm6,%zmm13,%zmm1 - vpermq %zmm7,%zmm14,%zmm2 - vpermq %zmm8,%zmm15,%zmm3 - vpermq %zmm9,%zmm16,%zmm4 - ######################################### Theta, odd round - vmovdqa64 %zmm5,%zmm0 # real A00 - vpternlogq $0x96,%zmm2,%zmm1,%zmm5 # C00 is %zmm5's alias - vpternlogq $0x96,%zmm4,%zmm3,%zmm5 - vprolq $1,%zmm5,%zmm6 - vpermq %zmm5,%zmm13,%zmm5 - vpermq %zmm6,%zmm16,%zmm6 - vpternlogq $0x96,%zmm5,%zmm6,%zmm0 - vpternlogq $0x96,%zmm5,%zmm6,%zmm3 - vpternlogq $0x96,%zmm5,%zmm6,%zmm1 - vpternlogq $0x96,%zmm5,%zmm6,%zmm4 - vpternlogq $0x96,%zmm5,%zmm6,%zmm2 - ######################################### Rho - vprolvq %zmm27,%zmm0,%zmm0 - vprolvq %zmm30,%zmm3,%zmm6 - vprolvq %zmm28,%zmm1,%zmm7 - vprolvq %zmm31,%zmm4,%zmm8 - vprolvq %zmm29,%zmm2,%zmm9 - vpermq %zmm0,%zmm16,%zmm10 - vpermq %zmm0,%zmm15,%zmm11 - ######################################### Iota - vpxorq -8(%r10),%zmm0,%zmm0{%k1} - ######################################### Pi - vpermq %zmm6,%zmm14,%zmm1 - vpermq %zmm7,%zmm16,%zmm2 - vpermq %zmm8,%zmm13,%zmm3 - vpermq %zmm9,%zmm15,%zmm4 - ######################################### Chi - vpternlogq $0xD2,%zmm11,%zmm10,%zmm0 - vpermq %zmm6,%zmm13,%zmm12 - #vpermq %zmm6,%zmm33,%zmm6 - vpternlogq $0xD2,%zmm6,%zmm12,%zmm1 - vpermq %zmm7,%zmm15,%zmm5 - vpermq %zmm7,%zmm14,%zmm7 - vpternlogq $0xD2,%zmm7,%zmm5,%zmm2 - #vpermq %zmm8,%zmm33,%zmm8 - vpermq %zmm8,%zmm16,%zmm6 - vpternlogq $0xD2,%zmm6,%zmm8,%zmm3 - vpermq %zmm9,%zmm14,%zmm5 - vpermq %zmm9,%zmm13,%zmm9 - vpternlogq $0xD2,%zmm9,%zmm5,%zmm4 - dec %eax - jnz .Loop_avx512 - ret -.size __KeccakF1600,.-__KeccakF1600 - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX512_Permute_12rounds(void *state); -# %rdi -# -.globl KeccakP1600_AVX512_Permute_12rounds -.type KeccakP1600_AVX512_Permute_12rounds,@function -.align 32 -KeccakP1600_AVX512_Permute_12rounds: - lea 96(%rdi),%rdi - lea theta_perm(%rip),%r8 - kxnorw %k6,%k6,%k6 - kshiftrw $15,%k6,%k1 - kshiftrw $11,%k6,%k6 - kshiftlw $1,%k1,%k2 - kshiftlw $2,%k1,%k3 - kshiftlw $3,%k1,%k4 - kshiftlw $4,%k1,%k5 - #vmovdqa64 64*0(%r8),%zmm33 - vmovdqa64 64*1(%r8),%zmm13 - vmovdqa64 64*2(%r8),%zmm14 - vmovdqa64 64*3(%r8),%zmm15 - vmovdqa64 64*4(%r8),%zmm16 - vmovdqa64 64*5(%r8),%zmm27 - vmovdqa64 64*6(%r8),%zmm28 - vmovdqa64 64*7(%r8),%zmm29 - vmovdqa64 64*8(%r8),%zmm30 - vmovdqa64 64*9(%r8),%zmm31 - vmovdqa64 64*10(%r8),%zmm22 - vmovdqa64 64*11(%r8),%zmm23 - vmovdqa64 64*12(%r8),%zmm24 - vmovdqa64 64*13(%r8),%zmm25 - vmovdqa64 64*14(%r8),%zmm26 - vmovdqa64 64*15(%r8),%zmm17 - vmovdqa64 64*16(%r8),%zmm18 - vmovdqa64 64*17(%r8),%zmm19 - vmovdqa64 64*18(%r8),%zmm20 - vmovdqa64 64*19(%r8),%zmm21 - vmovdqu64 40*0-96(%rdi),%zmm0{%k6}{z} -# vpxorq %zmm5,%zmm5,%zmm5 - vmovdqu64 40*1-96(%rdi),%zmm1{%k6}{z} - vmovdqu64 40*2-96(%rdi),%zmm2{%k6}{z} - vmovdqu64 40*3-96(%rdi),%zmm3{%k6}{z} - vmovdqu64 40*4-96(%rdi),%zmm4{%k6}{z} - lea iotas+12*8(%rip), %r10 - mov $12/2, %eax - call __KeccakF1600 - vmovdqu64 %zmm0,40*0-96(%rdi){%k6} - vmovdqu64 %zmm1,40*1-96(%rdi){%k6} - vmovdqu64 %zmm2,40*2-96(%rdi){%k6} - vmovdqu64 %zmm3,40*3-96(%rdi){%k6} - vmovdqu64 %zmm4,40*4-96(%rdi){%k6} - vzeroupper - ret -.size KeccakP1600_AVX512_Permute_12rounds,.-KeccakP1600_AVX512_Permute_12rounds - -# ----------------------------------------------------------------------------- -# -# size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); -# %rdi %rsi %rdx %rcx -# -.globl KeccakP1600_AVX512_12rounds_FastLoop_Absorb -.type KeccakP1600_AVX512_12rounds_FastLoop_Absorb,@function -.align 32 -KeccakP1600_AVX512_12rounds_FastLoop_Absorb: - push %rbx - push %r10 - shr $3, %rcx # rcx = data length in lanes - mov %rdx, %rbx # rbx = initial data pointer - cmp %rsi, %rcx - jb KeccakP1600_AVX512_FastLoop_Absorb_Exit - lea 96(%rdi),%rdi - lea theta_perm(%rip),%r8 - kxnorw %k6,%k6,%k6 - kshiftrw $15,%k6,%k1 - kshiftrw $11,%k6,%k6 - kshiftlw $1,%k1,%k2 - kshiftlw $2,%k1,%k3 - kshiftlw $3,%k1,%k4 - kshiftlw $4,%k1,%k5 - vmovdqa64 64*1(%r8),%zmm13 - vmovdqa64 64*2(%r8),%zmm14 - vmovdqa64 64*3(%r8),%zmm15 - vmovdqa64 64*4(%r8),%zmm16 - vmovdqa64 64*5(%r8),%zmm27 - vmovdqa64 64*6(%r8),%zmm28 - vmovdqa64 64*7(%r8),%zmm29 - vmovdqa64 64*8(%r8),%zmm30 - vmovdqa64 64*9(%r8),%zmm31 - vmovdqa64 64*10(%r8),%zmm22 - vmovdqa64 64*11(%r8),%zmm23 - vmovdqa64 64*12(%r8),%zmm24 - vmovdqa64 64*13(%r8),%zmm25 - vmovdqa64 64*14(%r8),%zmm26 - vmovdqa64 64*15(%r8),%zmm17 - vmovdqa64 64*16(%r8),%zmm18 - vmovdqa64 64*17(%r8),%zmm19 - vmovdqa64 64*18(%r8),%zmm20 - vmovdqa64 64*19(%r8),%zmm21 - vmovdqu64 40*0-96(%rdi),%zmm0{%k6}{z} - vmovdqu64 40*1-96(%rdi),%zmm1{%k6}{z} - vmovdqu64 40*2-96(%rdi),%zmm2{%k6}{z} - vmovdqu64 40*3-96(%rdi),%zmm3{%k6}{z} - vmovdqu64 40*4-96(%rdi),%zmm4{%k6}{z} - cmp $21, %rsi - jnz KeccakP1600_AVX512_FastLoop_Absorb_Not21Lanes - sub $21, %rcx -KeccakP1600_AVX512_FastLoop_Absorb_Loop21Lanes: - vmovdqu64 8*0(%rdx),%zmm5{%k6}{z} - vmovdqu64 8*5(%rdx),%zmm6{%k6}{z} - vmovdqu64 8*10(%rdx),%zmm7{%k6}{z} - vmovdqu64 8*15(%rdx),%zmm8{%k6}{z} - vmovdqu64 8*20(%rdx),%zmm9{%k1}{z} - vpxorq %zmm5,%zmm0,%zmm0 - vpxorq %zmm6,%zmm1,%zmm1 - vpxorq %zmm7,%zmm2,%zmm2 - vpxorq %zmm8,%zmm3,%zmm3 - vpxorq %zmm9,%zmm4,%zmm4 - add $21*8, %rdx - lea iotas+12*8(%rip), %r10 - mov $12/2, %eax - call __KeccakF1600 - sub $21, %rcx - jnc KeccakP1600_AVX512_FastLoop_Absorb_Loop21Lanes -KeccakP1600_AVX512_FastLoop_Absorb_SaveAndExit: - vmovdqu64 %zmm0,40*0-96(%rdi){%k6} - vmovdqu64 %zmm1,40*1-96(%rdi){%k6} - vmovdqu64 %zmm2,40*2-96(%rdi){%k6} - vmovdqu64 %zmm3,40*3-96(%rdi){%k6} - vmovdqu64 %zmm4,40*4-96(%rdi){%k6} -KeccakP1600_AVX512_FastLoop_Absorb_Exit: - vzeroupper - mov %rdx, %rax # return number of bytes processed - sub %rbx, %rax - pop %r10 - pop %rbx - ret -KeccakP1600_AVX512_FastLoop_Absorb_Not21Lanes: - cmp $17, %rsi - jnz KeccakP1600_AVX512_FastLoop_Absorb_Not17Lanes - sub $17, %rcx -KeccakP1600_AVX512_FastLoop_Absorb_Loop17Lanes: - vmovdqu64 8*0(%rdx),%zmm5{%k6}{z} - vmovdqu64 8*5(%rdx),%zmm6{%k6}{z} - vmovdqu64 8*10(%rdx),%zmm7{%k6}{z} - vmovdqu64 8*15(%rdx),%zmm8{%k1}{z} - vmovdqu64 8*15(%rdx),%zmm8{%k2} - vpxorq %zmm5,%zmm0,%zmm0 - vpxorq %zmm6,%zmm1,%zmm1 - vpxorq %zmm7,%zmm2,%zmm2 - vpxorq %zmm8,%zmm3,%zmm3 - add $17*8, %rdx - lea iotas+12*8(%rip), %r10 - mov $12/2, %eax - call __KeccakF1600 - sub $17, %rcx - jnc KeccakP1600_AVX512_FastLoop_Absorb_Loop17Lanes - jmp KeccakP1600_AVX512_FastLoop_Absorb_SaveAndExit -KeccakP1600_AVX512_FastLoop_Absorb_Not17Lanes: - lea -96(%rdi), %rdi -KeccakP1600_AVX512_FastLoop_Absorb_LanesLoop: - mov %rsi, %rax - mov %rdi, %r10 -KeccakP1600_AVX512_FastLoop_Absorb_LanesAddLoop: - mov (%rdx), %r8 - add $8, %rdx - xor %r8, (%r10) - add $8, %r10 - sub $1, %rax - jnz KeccakP1600_AVX512_FastLoop_Absorb_LanesAddLoop - sub %rsi, %rcx - push %rdi - push %rsi - push %rdx - push %rcx - call KeccakP1600_AVX512_Permute_12rounds@PLT - pop %rcx - pop %rdx - pop %rsi - pop %rdi - cmp %rsi, %rcx - jae KeccakP1600_AVX512_FastLoop_Absorb_LanesLoop - jmp KeccakP1600_AVX512_FastLoop_Absorb_Exit -.size KeccakP1600_AVX512_12rounds_FastLoop_Absorb,.-KeccakP1600_AVX512_12rounds_FastLoop_Absorb - -.align 64 -theta_perm: - .quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used] - .quad 4, 0, 1, 2, 3, 5, 6, 7 - .quad 3, 4, 0, 1, 2, 5, 6, 7 - .quad 2, 3, 4, 0, 1, 5, 6, 7 - .quad 1, 2, 3, 4, 0, 5, 6, 7 -rhotates1: - .quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4] - .quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4] - .quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4] - .quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4] - .quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4] -rhotates0: - .quad 0, 1, 62, 28, 27, 0, 0, 0 - .quad 36, 44, 6, 55, 20, 0, 0, 0 - .quad 3, 10, 43, 25, 39, 0, 0, 0 - .quad 41, 45, 15, 21, 8, 0, 0, 0 - .quad 18, 2, 61, 56, 14, 0, 0, 0 -pi0_perm: - .quad 0, 3, 1, 4, 2, 5, 6, 7 - .quad 1, 4, 2, 0, 3, 5, 6, 7 - .quad 2, 0, 3, 1, 4, 5, 6, 7 - .quad 3, 1, 4, 2, 0, 5, 6, 7 - .quad 4, 2, 0, 3, 1, 5, 6, 7 -iotas: - .quad 0x0000000000000001 - .quad 0x0000000000008082 - .quad 0x800000000000808a - .quad 0x8000000080008000 - .quad 0x000000000000808b - .quad 0x0000000080000001 - .quad 0x8000000080008081 - .quad 0x8000000000008009 - .quad 0x000000000000008a - .quad 0x0000000000000088 - .quad 0x0000000080008009 - .quad 0x000000008000000a - .quad 0x000000008000808b - .quad 0x800000000000008b - .quad 0x8000000000008089 - .quad 0x8000000000008003 - .quad 0x8000000000008002 - .quad 0x8000000000000080 - .quad 0x000000000000800a - .quad 0x800000008000000a - .quad 0x8000000080008081 - .quad 0x8000000000008080 - .quad 0x0000000080000001 - .quad 0x8000000080008008 -iotas_end: -.asciz "Keccak-1600 for AVX-512F, CRYPTOGAMS by " diff --git a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-SnP.h b/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-SnP.h deleted file mode 100644 index 8900796..0000000 --- a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-SnP.h +++ /dev/null @@ -1,69 +0,0 @@ -/* -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to our website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#ifndef _KeccakP_1600_SnP_h_ -#define _KeccakP_1600_SnP_h_ - -/* Keccak-p[1600] */ - -#define KeccakP1600_stateSizeInBytes 200 -#define KeccakP1600_stateAlignment 64 -#define KeccakP1600_12rounds_FastLoop_supported - -const char * KeccakP1600_GetImplementation(); -void KeccakP1600_Initialize(void *state); -void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); -void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600_Permute_12rounds(void *state); -void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); - -void KeccakP1600_AVX512_Initialize(void *state); -void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset); -void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600_AVX512_Permute_12rounds(void *state); -void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); - -void KeccakP1600_AVX2_Initialize(void *state); -void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset); -void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600_AVX2_Permute_12rounds(void *state); -void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); - -void KeccakP1600_opt64_Initialize(void *state); -void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset); -void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600_opt64_Permute_12rounds(void *state); -void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); - -/* Keccak-p[1600]×2 */ - -int KeccakP1600times2_IsAvailable(); -const char * KeccakP1600times2_GetImplementation(); - -/* Keccak-p[1600]×4 */ - -int KeccakP1600times4_IsAvailable(); -const char * KeccakP1600times4_GetImplementation(); - -/* Keccak-p[1600]×8 */ - -int KeccakP1600times8_IsAvailable(); -const char * KeccakP1600times8_GetImplementation(); - -#endif diff --git a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-opt64.c b/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-opt64.c deleted file mode 100644 index 26e7fa5..0000000 --- a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-opt64.c +++ /dev/null @@ -1,1105 +0,0 @@ -/* -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to our website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#include -#include -#include "brg_endian.h" -#include "KeccakP-1600-SnP.h" - -extern int K12_enableAVX2; -extern int K12_enableAVX512; - -const char * KeccakP1600_GetImplementation() -{ - if (K12_enableAVX512) - return "AVX-512 implementation"; - else - if (K12_enableAVX2) - return "AVX2 implementation"; - else - return "generic 64-bit implementation"; -} - -#include - -void KeccakP1600_Initialize(void *state) -{ - if (K12_enableAVX512) - KeccakP1600_AVX512_Initialize(state); - else - if (K12_enableAVX2) - KeccakP1600_AVX2_Initialize(state); - else - KeccakP1600_opt64_Initialize(state); -} - -void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset) -{ - if (K12_enableAVX512) - ((unsigned char*)(state))[offset] ^= data; - else - if (K12_enableAVX2) - KeccakP1600_AVX2_AddByte(state, data, offset); - else - KeccakP1600_opt64_AddByte(state, data, offset); -} - -void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) -{ - if (K12_enableAVX512) - KeccakP1600_AVX512_AddBytes(state, data, offset, length); - else - if (K12_enableAVX2) - KeccakP1600_AVX2_AddBytes(state, data, offset, length); - else - KeccakP1600_opt64_AddBytes(state, data, offset, length); -} - -void KeccakP1600_Permute_12rounds(void *state) -{ - if (K12_enableAVX512) - KeccakP1600_AVX512_Permute_12rounds(state); - else - if (K12_enableAVX2) - KeccakP1600_AVX2_Permute_12rounds(state); - else - KeccakP1600_opt64_Permute_12rounds(state); -} - -void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) -{ - if (K12_enableAVX512) - KeccakP1600_AVX512_ExtractBytes(state, data, offset, length); - else - if (K12_enableAVX2) - KeccakP1600_AVX2_ExtractBytes(state, data, offset, length); - else - KeccakP1600_opt64_ExtractBytes(state, data, offset, length); -} - -size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) -{ - if (K12_enableAVX512) - return KeccakP1600_AVX512_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); - else - if (K12_enableAVX2) - return KeccakP1600_AVX2_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); - else - return KeccakP1600_opt64_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); -} - -#define KeccakP1600_opt64_implementation_config "all rounds unrolled" -#define KeccakP1600_opt64_fullUnrolling -/* Or */ -/* -#define KeccakP1600_opt64_implementation_config "6 rounds unrolled" -#define KeccakP1600_opt64_unrolling 6 -*/ -/* Or */ -/* -#define KeccakP1600_opt64_implementation_config "lane complementing, 6 rounds unrolled" -#define KeccakP1600_opt64_unrolling 6 -#define KeccakP1600_opt64_useLaneComplementing -*/ -/* Or */ -/* -#define KeccakP1600_opt64_implementation_config "lane complementing, all rounds unrolled" -#define KeccakP1600_opt64_fullUnrolling -#define KeccakP1600_opt64_useLaneComplementing -*/ -/* Or */ -/* -#define KeccakP1600_opt64_implementation_config "lane complementing, all rounds unrolled, using SHLD for rotations" -#define KeccakP1600_opt64_fullUnrolling -#define KeccakP1600_opt64_useLaneComplementing -#define KeccakP1600_opt64_useSHLD -*/ - -typedef unsigned char UINT8; -typedef unsigned long long int UINT64; - -#if defined(KeccakP1600_opt64_useLaneComplementing) -#define UseBebigokimisa -#endif - -#if defined(_MSC_VER) -#define ROL64(a, offset) _rotl64(a, offset) -#elif defined(KeccakP1600_opt64_useSHLD) - #define ROL64(x,N) ({ \ - register UINT64 __out; \ - register UINT64 __in = x; \ - __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \ - __out; \ - }) -#else -#define ROL64(a, offset) ((((UINT64)a) << offset) ^ (((UINT64)a) >> (64-offset))) -#endif - -#ifdef KeccakP1600_opt64_fullUnrolling -#define FullUnrolling -#else -#define Unrolling KeccakP1600_opt64_unrolling -#endif - -static const UINT64 KeccakF1600RoundConstants[24] = { - 0x0000000000000001ULL, - 0x0000000000008082ULL, - 0x800000000000808aULL, - 0x8000000080008000ULL, - 0x000000000000808bULL, - 0x0000000080000001ULL, - 0x8000000080008081ULL, - 0x8000000000008009ULL, - 0x000000000000008aULL, - 0x0000000000000088ULL, - 0x0000000080008009ULL, - 0x000000008000000aULL, - 0x000000008000808bULL, - 0x800000000000008bULL, - 0x8000000000008089ULL, - 0x8000000000008003ULL, - 0x8000000000008002ULL, - 0x8000000000000080ULL, - 0x000000000000800aULL, - 0x800000008000000aULL, - 0x8000000080008081ULL, - 0x8000000000008080ULL, - 0x0000000080000001ULL, - 0x8000000080008008ULL }; - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_opt64_Initialize(void *state) -{ - memset(state, 0, 200); -#ifdef KeccakP1600_opt64_useLaneComplementing - ((UINT64*)state)[ 1] = ~(UINT64)0; - ((UINT64*)state)[ 2] = ~(UINT64)0; - ((UINT64*)state)[ 8] = ~(UINT64)0; - ((UINT64*)state)[12] = ~(UINT64)0; - ((UINT64*)state)[17] = ~(UINT64)0; - ((UINT64*)state)[20] = ~(UINT64)0; -#endif -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_opt64_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) -{ -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - UINT64 lane; - if (length == 0) - return; - if (length == 1) - lane = data[0]; - else { - lane = 0; - memcpy(&lane, data, length); - } - lane <<= offset*8; -#else - UINT64 lane = 0; - unsigned int i; - for(i=0; i 0) { \ - unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ - if (_bytesInLane > _sizeLeft) \ - _bytesInLane = _sizeLeft; \ - SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ - _sizeLeft -= _bytesInLane; \ - _lanePosition++; \ - _offsetInLane = 0; \ - _curData += _bytesInLane; \ - } \ - } \ - } - -void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) -{ - SnP_AddBytes(state, data, offset, length, KeccakP1600_opt64_AddLanes, KeccakP1600_opt64_AddBytesInLane, 8); -} - -/* ---------------------------------------------------------------- */ - -#define declareABCDE \ - UINT64 Aba, Abe, Abi, Abo, Abu; \ - UINT64 Aga, Age, Agi, Ago, Agu; \ - UINT64 Aka, Ake, Aki, Ako, Aku; \ - UINT64 Ama, Ame, Ami, Amo, Amu; \ - UINT64 Asa, Ase, Asi, Aso, Asu; \ - UINT64 Bba, Bbe, Bbi, Bbo, Bbu; \ - UINT64 Bga, Bge, Bgi, Bgo, Bgu; \ - UINT64 Bka, Bke, Bki, Bko, Bku; \ - UINT64 Bma, Bme, Bmi, Bmo, Bmu; \ - UINT64 Bsa, Bse, Bsi, Bso, Bsu; \ - UINT64 Ca, Ce, Ci, Co, Cu; \ - UINT64 Da, De, Di, Do, Du; \ - UINT64 Eba, Ebe, Ebi, Ebo, Ebu; \ - UINT64 Ega, Ege, Egi, Ego, Egu; \ - UINT64 Eka, Eke, Eki, Eko, Eku; \ - UINT64 Ema, Eme, Emi, Emo, Emu; \ - UINT64 Esa, Ese, Esi, Eso, Esu; \ - -#define prepareTheta \ - Ca = Aba^Aga^Aka^Ama^Asa; \ - Ce = Abe^Age^Ake^Ame^Ase; \ - Ci = Abi^Agi^Aki^Ami^Asi; \ - Co = Abo^Ago^Ako^Amo^Aso; \ - Cu = Abu^Agu^Aku^Amu^Asu; \ - -#ifdef UseBebigokimisa -/* --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ -\ - A##ba ^= Da; \ - Bba = A##ba; \ - A##ge ^= De; \ - Bbe = ROL64(A##ge, 44); \ - A##ki ^= Di; \ - Bbi = ROL64(A##ki, 43); \ - A##mo ^= Do; \ - Bbo = ROL64(A##mo, 21); \ - A##su ^= Du; \ - Bbu = ROL64(A##su, 14); \ - E##ba = Bba ^( Bbe | Bbi ); \ - E##ba ^= KeccakF1600RoundConstants[i]; \ - Ca = E##ba; \ - E##be = Bbe ^((~Bbi)| Bbo ); \ - Ce = E##be; \ - E##bi = Bbi ^( Bbo & Bbu ); \ - Ci = E##bi; \ - E##bo = Bbo ^( Bbu | Bba ); \ - Co = E##bo; \ - E##bu = Bbu ^( Bba & Bbe ); \ - Cu = E##bu; \ -\ - A##bo ^= Do; \ - Bga = ROL64(A##bo, 28); \ - A##gu ^= Du; \ - Bge = ROL64(A##gu, 20); \ - A##ka ^= Da; \ - Bgi = ROL64(A##ka, 3); \ - A##me ^= De; \ - Bgo = ROL64(A##me, 45); \ - A##si ^= Di; \ - Bgu = ROL64(A##si, 61); \ - E##ga = Bga ^( Bge | Bgi ); \ - Ca ^= E##ga; \ - E##ge = Bge ^( Bgi & Bgo ); \ - Ce ^= E##ge; \ - E##gi = Bgi ^( Bgo |(~Bgu)); \ - Ci ^= E##gi; \ - E##go = Bgo ^( Bgu | Bga ); \ - Co ^= E##go; \ - E##gu = Bgu ^( Bga & Bge ); \ - Cu ^= E##gu; \ -\ - A##be ^= De; \ - Bka = ROL64(A##be, 1); \ - A##gi ^= Di; \ - Bke = ROL64(A##gi, 6); \ - A##ko ^= Do; \ - Bki = ROL64(A##ko, 25); \ - A##mu ^= Du; \ - Bko = ROL64(A##mu, 8); \ - A##sa ^= Da; \ - Bku = ROL64(A##sa, 18); \ - E##ka = Bka ^( Bke | Bki ); \ - Ca ^= E##ka; \ - E##ke = Bke ^( Bki & Bko ); \ - Ce ^= E##ke; \ - E##ki = Bki ^((~Bko)& Bku ); \ - Ci ^= E##ki; \ - E##ko = (~Bko)^( Bku | Bka ); \ - Co ^= E##ko; \ - E##ku = Bku ^( Bka & Bke ); \ - Cu ^= E##ku; \ -\ - A##bu ^= Du; \ - Bma = ROL64(A##bu, 27); \ - A##ga ^= Da; \ - Bme = ROL64(A##ga, 36); \ - A##ke ^= De; \ - Bmi = ROL64(A##ke, 10); \ - A##mi ^= Di; \ - Bmo = ROL64(A##mi, 15); \ - A##so ^= Do; \ - Bmu = ROL64(A##so, 56); \ - E##ma = Bma ^( Bme & Bmi ); \ - Ca ^= E##ma; \ - E##me = Bme ^( Bmi | Bmo ); \ - Ce ^= E##me; \ - E##mi = Bmi ^((~Bmo)| Bmu ); \ - Ci ^= E##mi; \ - E##mo = (~Bmo)^( Bmu & Bma ); \ - Co ^= E##mo; \ - E##mu = Bmu ^( Bma | Bme ); \ - Cu ^= E##mu; \ -\ - A##bi ^= Di; \ - Bsa = ROL64(A##bi, 62); \ - A##go ^= Do; \ - Bse = ROL64(A##go, 55); \ - A##ku ^= Du; \ - Bsi = ROL64(A##ku, 39); \ - A##ma ^= Da; \ - Bso = ROL64(A##ma, 41); \ - A##se ^= De; \ - Bsu = ROL64(A##se, 2); \ - E##sa = Bsa ^((~Bse)& Bsi ); \ - Ca ^= E##sa; \ - E##se = (~Bse)^( Bsi | Bso ); \ - Ce ^= E##se; \ - E##si = Bsi ^( Bso & Bsu ); \ - Ci ^= E##si; \ - E##so = Bso ^( Bsu | Bsa ); \ - Co ^= E##so; \ - E##su = Bsu ^( Bsa & Bse ); \ - Cu ^= E##su; \ -\ - -/* --- Code for round (lane complementing pattern 'bebigokimisa') */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIota(i, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ -\ - A##ba ^= Da; \ - Bba = A##ba; \ - A##ge ^= De; \ - Bbe = ROL64(A##ge, 44); \ - A##ki ^= Di; \ - Bbi = ROL64(A##ki, 43); \ - A##mo ^= Do; \ - Bbo = ROL64(A##mo, 21); \ - A##su ^= Du; \ - Bbu = ROL64(A##su, 14); \ - E##ba = Bba ^( Bbe | Bbi ); \ - E##ba ^= KeccakF1600RoundConstants[i]; \ - E##be = Bbe ^((~Bbi)| Bbo ); \ - E##bi = Bbi ^( Bbo & Bbu ); \ - E##bo = Bbo ^( Bbu | Bba ); \ - E##bu = Bbu ^( Bba & Bbe ); \ -\ - A##bo ^= Do; \ - Bga = ROL64(A##bo, 28); \ - A##gu ^= Du; \ - Bge = ROL64(A##gu, 20); \ - A##ka ^= Da; \ - Bgi = ROL64(A##ka, 3); \ - A##me ^= De; \ - Bgo = ROL64(A##me, 45); \ - A##si ^= Di; \ - Bgu = ROL64(A##si, 61); \ - E##ga = Bga ^( Bge | Bgi ); \ - E##ge = Bge ^( Bgi & Bgo ); \ - E##gi = Bgi ^( Bgo |(~Bgu)); \ - E##go = Bgo ^( Bgu | Bga ); \ - E##gu = Bgu ^( Bga & Bge ); \ -\ - A##be ^= De; \ - Bka = ROL64(A##be, 1); \ - A##gi ^= Di; \ - Bke = ROL64(A##gi, 6); \ - A##ko ^= Do; \ - Bki = ROL64(A##ko, 25); \ - A##mu ^= Du; \ - Bko = ROL64(A##mu, 8); \ - A##sa ^= Da; \ - Bku = ROL64(A##sa, 18); \ - E##ka = Bka ^( Bke | Bki ); \ - E##ke = Bke ^( Bki & Bko ); \ - E##ki = Bki ^((~Bko)& Bku ); \ - E##ko = (~Bko)^( Bku | Bka ); \ - E##ku = Bku ^( Bka & Bke ); \ -\ - A##bu ^= Du; \ - Bma = ROL64(A##bu, 27); \ - A##ga ^= Da; \ - Bme = ROL64(A##ga, 36); \ - A##ke ^= De; \ - Bmi = ROL64(A##ke, 10); \ - A##mi ^= Di; \ - Bmo = ROL64(A##mi, 15); \ - A##so ^= Do; \ - Bmu = ROL64(A##so, 56); \ - E##ma = Bma ^( Bme & Bmi ); \ - E##me = Bme ^( Bmi | Bmo ); \ - E##mi = Bmi ^((~Bmo)| Bmu ); \ - E##mo = (~Bmo)^( Bmu & Bma ); \ - E##mu = Bmu ^( Bma | Bme ); \ -\ - A##bi ^= Di; \ - Bsa = ROL64(A##bi, 62); \ - A##go ^= Do; \ - Bse = ROL64(A##go, 55); \ - A##ku ^= Du; \ - Bsi = ROL64(A##ku, 39); \ - A##ma ^= Da; \ - Bso = ROL64(A##ma, 41); \ - A##se ^= De; \ - Bsu = ROL64(A##se, 2); \ - E##sa = Bsa ^((~Bse)& Bsi ); \ - E##se = (~Bse)^( Bsi | Bso ); \ - E##si = Bsi ^( Bso & Bsu ); \ - E##so = Bso ^( Bsu | Bsa ); \ - E##su = Bsu ^( Bsa & Bse ); \ -\ - -#else /* UseBebigokimisa */ -/* --- Code for round, with prepare-theta */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ -\ - A##ba ^= Da; \ - Bba = A##ba; \ - A##ge ^= De; \ - Bbe = ROL64(A##ge, 44); \ - A##ki ^= Di; \ - Bbi = ROL64(A##ki, 43); \ - A##mo ^= Do; \ - Bbo = ROL64(A##mo, 21); \ - A##su ^= Du; \ - Bbu = ROL64(A##su, 14); \ - E##ba = Bba ^((~Bbe)& Bbi ); \ - E##ba ^= KeccakF1600RoundConstants[i]; \ - Ca = E##ba; \ - E##be = Bbe ^((~Bbi)& Bbo ); \ - Ce = E##be; \ - E##bi = Bbi ^((~Bbo)& Bbu ); \ - Ci = E##bi; \ - E##bo = Bbo ^((~Bbu)& Bba ); \ - Co = E##bo; \ - E##bu = Bbu ^((~Bba)& Bbe ); \ - Cu = E##bu; \ -\ - A##bo ^= Do; \ - Bga = ROL64(A##bo, 28); \ - A##gu ^= Du; \ - Bge = ROL64(A##gu, 20); \ - A##ka ^= Da; \ - Bgi = ROL64(A##ka, 3); \ - A##me ^= De; \ - Bgo = ROL64(A##me, 45); \ - A##si ^= Di; \ - Bgu = ROL64(A##si, 61); \ - E##ga = Bga ^((~Bge)& Bgi ); \ - Ca ^= E##ga; \ - E##ge = Bge ^((~Bgi)& Bgo ); \ - Ce ^= E##ge; \ - E##gi = Bgi ^((~Bgo)& Bgu ); \ - Ci ^= E##gi; \ - E##go = Bgo ^((~Bgu)& Bga ); \ - Co ^= E##go; \ - E##gu = Bgu ^((~Bga)& Bge ); \ - Cu ^= E##gu; \ -\ - A##be ^= De; \ - Bka = ROL64(A##be, 1); \ - A##gi ^= Di; \ - Bke = ROL64(A##gi, 6); \ - A##ko ^= Do; \ - Bki = ROL64(A##ko, 25); \ - A##mu ^= Du; \ - Bko = ROL64(A##mu, 8); \ - A##sa ^= Da; \ - Bku = ROL64(A##sa, 18); \ - E##ka = Bka ^((~Bke)& Bki ); \ - Ca ^= E##ka; \ - E##ke = Bke ^((~Bki)& Bko ); \ - Ce ^= E##ke; \ - E##ki = Bki ^((~Bko)& Bku ); \ - Ci ^= E##ki; \ - E##ko = Bko ^((~Bku)& Bka ); \ - Co ^= E##ko; \ - E##ku = Bku ^((~Bka)& Bke ); \ - Cu ^= E##ku; \ -\ - A##bu ^= Du; \ - Bma = ROL64(A##bu, 27); \ - A##ga ^= Da; \ - Bme = ROL64(A##ga, 36); \ - A##ke ^= De; \ - Bmi = ROL64(A##ke, 10); \ - A##mi ^= Di; \ - Bmo = ROL64(A##mi, 15); \ - A##so ^= Do; \ - Bmu = ROL64(A##so, 56); \ - E##ma = Bma ^((~Bme)& Bmi ); \ - Ca ^= E##ma; \ - E##me = Bme ^((~Bmi)& Bmo ); \ - Ce ^= E##me; \ - E##mi = Bmi ^((~Bmo)& Bmu ); \ - Ci ^= E##mi; \ - E##mo = Bmo ^((~Bmu)& Bma ); \ - Co ^= E##mo; \ - E##mu = Bmu ^((~Bma)& Bme ); \ - Cu ^= E##mu; \ -\ - A##bi ^= Di; \ - Bsa = ROL64(A##bi, 62); \ - A##go ^= Do; \ - Bse = ROL64(A##go, 55); \ - A##ku ^= Du; \ - Bsi = ROL64(A##ku, 39); \ - A##ma ^= Da; \ - Bso = ROL64(A##ma, 41); \ - A##se ^= De; \ - Bsu = ROL64(A##se, 2); \ - E##sa = Bsa ^((~Bse)& Bsi ); \ - Ca ^= E##sa; \ - E##se = Bse ^((~Bsi)& Bso ); \ - Ce ^= E##se; \ - E##si = Bsi ^((~Bso)& Bsu ); \ - Ci ^= E##si; \ - E##so = Bso ^((~Bsu)& Bsa ); \ - Co ^= E##so; \ - E##su = Bsu ^((~Bsa)& Bse ); \ - Cu ^= E##su; \ -\ - -/* --- Code for round */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIota(i, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ -\ - A##ba ^= Da; \ - Bba = A##ba; \ - A##ge ^= De; \ - Bbe = ROL64(A##ge, 44); \ - A##ki ^= Di; \ - Bbi = ROL64(A##ki, 43); \ - A##mo ^= Do; \ - Bbo = ROL64(A##mo, 21); \ - A##su ^= Du; \ - Bbu = ROL64(A##su, 14); \ - E##ba = Bba ^((~Bbe)& Bbi ); \ - E##ba ^= KeccakF1600RoundConstants[i]; \ - E##be = Bbe ^((~Bbi)& Bbo ); \ - E##bi = Bbi ^((~Bbo)& Bbu ); \ - E##bo = Bbo ^((~Bbu)& Bba ); \ - E##bu = Bbu ^((~Bba)& Bbe ); \ -\ - A##bo ^= Do; \ - Bga = ROL64(A##bo, 28); \ - A##gu ^= Du; \ - Bge = ROL64(A##gu, 20); \ - A##ka ^= Da; \ - Bgi = ROL64(A##ka, 3); \ - A##me ^= De; \ - Bgo = ROL64(A##me, 45); \ - A##si ^= Di; \ - Bgu = ROL64(A##si, 61); \ - E##ga = Bga ^((~Bge)& Bgi ); \ - E##ge = Bge ^((~Bgi)& Bgo ); \ - E##gi = Bgi ^((~Bgo)& Bgu ); \ - E##go = Bgo ^((~Bgu)& Bga ); \ - E##gu = Bgu ^((~Bga)& Bge ); \ -\ - A##be ^= De; \ - Bka = ROL64(A##be, 1); \ - A##gi ^= Di; \ - Bke = ROL64(A##gi, 6); \ - A##ko ^= Do; \ - Bki = ROL64(A##ko, 25); \ - A##mu ^= Du; \ - Bko = ROL64(A##mu, 8); \ - A##sa ^= Da; \ - Bku = ROL64(A##sa, 18); \ - E##ka = Bka ^((~Bke)& Bki ); \ - E##ke = Bke ^((~Bki)& Bko ); \ - E##ki = Bki ^((~Bko)& Bku ); \ - E##ko = Bko ^((~Bku)& Bka ); \ - E##ku = Bku ^((~Bka)& Bke ); \ -\ - A##bu ^= Du; \ - Bma = ROL64(A##bu, 27); \ - A##ga ^= Da; \ - Bme = ROL64(A##ga, 36); \ - A##ke ^= De; \ - Bmi = ROL64(A##ke, 10); \ - A##mi ^= Di; \ - Bmo = ROL64(A##mi, 15); \ - A##so ^= Do; \ - Bmu = ROL64(A##so, 56); \ - E##ma = Bma ^((~Bme)& Bmi ); \ - E##me = Bme ^((~Bmi)& Bmo ); \ - E##mi = Bmi ^((~Bmo)& Bmu ); \ - E##mo = Bmo ^((~Bmu)& Bma ); \ - E##mu = Bmu ^((~Bma)& Bme ); \ -\ - A##bi ^= Di; \ - Bsa = ROL64(A##bi, 62); \ - A##go ^= Do; \ - Bse = ROL64(A##go, 55); \ - A##ku ^= Du; \ - Bsi = ROL64(A##ku, 39); \ - A##ma ^= Da; \ - Bso = ROL64(A##ma, 41); \ - A##se ^= De; \ - Bsu = ROL64(A##se, 2); \ - E##sa = Bsa ^((~Bse)& Bsi ); \ - E##se = Bse ^((~Bsi)& Bso ); \ - E##si = Bsi ^((~Bso)& Bsu ); \ - E##so = Bso ^((~Bsu)& Bsa ); \ - E##su = Bsu ^((~Bsa)& Bse ); \ -\ - -#endif /* UseBebigokimisa */ - -#define copyFromState(X, state) \ - X##ba = state[ 0]; \ - X##be = state[ 1]; \ - X##bi = state[ 2]; \ - X##bo = state[ 3]; \ - X##bu = state[ 4]; \ - X##ga = state[ 5]; \ - X##ge = state[ 6]; \ - X##gi = state[ 7]; \ - X##go = state[ 8]; \ - X##gu = state[ 9]; \ - X##ka = state[10]; \ - X##ke = state[11]; \ - X##ki = state[12]; \ - X##ko = state[13]; \ - X##ku = state[14]; \ - X##ma = state[15]; \ - X##me = state[16]; \ - X##mi = state[17]; \ - X##mo = state[18]; \ - X##mu = state[19]; \ - X##sa = state[20]; \ - X##se = state[21]; \ - X##si = state[22]; \ - X##so = state[23]; \ - X##su = state[24]; \ - -#define copyToState(state, X) \ - state[ 0] = X##ba; \ - state[ 1] = X##be; \ - state[ 2] = X##bi; \ - state[ 3] = X##bo; \ - state[ 4] = X##bu; \ - state[ 5] = X##ga; \ - state[ 6] = X##ge; \ - state[ 7] = X##gi; \ - state[ 8] = X##go; \ - state[ 9] = X##gu; \ - state[10] = X##ka; \ - state[11] = X##ke; \ - state[12] = X##ki; \ - state[13] = X##ko; \ - state[14] = X##ku; \ - state[15] = X##ma; \ - state[16] = X##me; \ - state[17] = X##mi; \ - state[18] = X##mo; \ - state[19] = X##mu; \ - state[20] = X##sa; \ - state[21] = X##se; \ - state[22] = X##si; \ - state[23] = X##so; \ - state[24] = X##su; \ - -#define copyStateVariables(X, Y) \ - X##ba = Y##ba; \ - X##be = Y##be; \ - X##bi = Y##bi; \ - X##bo = Y##bo; \ - X##bu = Y##bu; \ - X##ga = Y##ga; \ - X##ge = Y##ge; \ - X##gi = Y##gi; \ - X##go = Y##go; \ - X##gu = Y##gu; \ - X##ka = Y##ka; \ - X##ke = Y##ke; \ - X##ki = Y##ki; \ - X##ko = Y##ko; \ - X##ku = Y##ku; \ - X##ma = Y##ma; \ - X##me = Y##me; \ - X##mi = Y##mi; \ - X##mo = Y##mo; \ - X##mu = Y##mu; \ - X##sa = Y##sa; \ - X##se = Y##se; \ - X##si = Y##si; \ - X##so = Y##so; \ - X##su = Y##su; \ - -#if ((defined(FullUnrolling)) || (Unrolling == 12)) -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (Unrolling == 6) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#elif (Unrolling == 4) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#elif (Unrolling == 3) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=3) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - copyStateVariables(A, E) \ - } \ - -#elif (Unrolling == 2) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#elif (Unrolling == 1) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i++) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - copyStateVariables(A, E) \ - } \ - -#else -#error "Unrolling is not correctly specified!" -#endif - -void KeccakP1600_opt64_Permute_12rounds(void *state) -{ - declareABCDE - #ifndef KeccakP1600_opt64_fullUnrolling - unsigned int i; - #endif - UINT64 *stateAsLanes = (UINT64*)state; - - copyFromState(A, stateAsLanes) - rounds12 - copyToState(stateAsLanes, A) -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_opt64_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) -{ - UINT64 lane = ((UINT64*)state)[lanePosition]; -#ifdef KeccakP1600_opt64_useLaneComplementing - if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20)) - lane = ~lane; -#endif -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - { - UINT64 lane1[1]; - lane1[0] = lane; - memcpy(data, (UINT8*)lane1+offset, length); - } -#else - unsigned int i; - lane >>= offset*8; - for(i=0; i>= 8; - } -#endif -} - -/* ---------------------------------------------------------------- */ - -#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) -static void fromWordToBytes(UINT8 *bytes, const UINT64 word) -{ - unsigned int i; - - for(i=0; i<(64/8); i++) - bytes[i] = (word >> (8*i)) & 0xFF; -} -#endif - -void KeccakP1600_opt64_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) -{ -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - memcpy(data, state, laneCount*8); -#else - unsigned int i; - - for(i=0; i 1) { - ((UINT64*)data)[ 1] = ~((UINT64*)data)[ 1]; - if (laneCount > 2) { - ((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2]; - if (laneCount > 8) { - ((UINT64*)data)[ 8] = ~((UINT64*)data)[ 8]; - if (laneCount > 12) { - ((UINT64*)data)[12] = ~((UINT64*)data)[12]; - if (laneCount > 17) { - ((UINT64*)data)[17] = ~((UINT64*)data)[17]; - if (laneCount > 20) { - ((UINT64*)data)[20] = ~((UINT64*)data)[20]; - } - } - } - } - } - } -#endif -} - -/* ---------------------------------------------------------------- */ - -#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ - { \ - if ((offset) == 0) { \ - SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ - SnP_ExtractBytesInLane(state, \ - (length)/SnP_laneLengthInBytes, \ - (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ - 0, \ - (length)%SnP_laneLengthInBytes); \ - } \ - else { \ - unsigned int _sizeLeft = (length); \ - unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ - unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ - unsigned char *_curData = (data); \ - while(_sizeLeft > 0) { \ - unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ - if (_bytesInLane > _sizeLeft) \ - _bytesInLane = _sizeLeft; \ - SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ - _sizeLeft -= _bytesInLane; \ - _lanePosition++; \ - _offsetInLane = 0; \ - _curData += _bytesInLane; \ - } \ - } \ - } - -void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) -{ - SnP_ExtractBytes(state, data, offset, length, KeccakP1600_opt64_ExtractLanes, KeccakP1600_opt64_ExtractBytesInLane, 8); -} - -/* ---------------------------------------------------------------- */ - -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) -#define HTOLE64(x) (x) -#else -#define HTOLE64(x) (\ - ((x & 0xff00000000000000ull) >> 56) | \ - ((x & 0x00ff000000000000ull) >> 40) | \ - ((x & 0x0000ff0000000000ull) >> 24) | \ - ((x & 0x000000ff00000000ull) >> 8) | \ - ((x & 0x00000000ff000000ull) << 8) | \ - ((x & 0x0000000000ff0000ull) << 24) | \ - ((x & 0x000000000000ff00ull) << 40) | \ - ((x & 0x00000000000000ffull) << 56)) -#endif - -#define addInput(X, input, laneCount) \ - if (laneCount == 21) { \ - X##ba ^= HTOLE64(input[ 0]); \ - X##be ^= HTOLE64(input[ 1]); \ - X##bi ^= HTOLE64(input[ 2]); \ - X##bo ^= HTOLE64(input[ 3]); \ - X##bu ^= HTOLE64(input[ 4]); \ - X##ga ^= HTOLE64(input[ 5]); \ - X##ge ^= HTOLE64(input[ 6]); \ - X##gi ^= HTOLE64(input[ 7]); \ - X##go ^= HTOLE64(input[ 8]); \ - X##gu ^= HTOLE64(input[ 9]); \ - X##ka ^= HTOLE64(input[10]); \ - X##ke ^= HTOLE64(input[11]); \ - X##ki ^= HTOLE64(input[12]); \ - X##ko ^= HTOLE64(input[13]); \ - X##ku ^= HTOLE64(input[14]); \ - X##ma ^= HTOLE64(input[15]); \ - X##me ^= HTOLE64(input[16]); \ - X##mi ^= HTOLE64(input[17]); \ - X##mo ^= HTOLE64(input[18]); \ - X##mu ^= HTOLE64(input[19]); \ - X##sa ^= HTOLE64(input[20]); \ - } \ - -#include - -size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) -{ - size_t originalDataByteLen = dataByteLen; - declareABCDE - #ifndef KeccakP1600_opt64_fullUnrolling - unsigned int i; - #endif - UINT64 *stateAsLanes = (UINT64*)state; - UINT64 *inDataAsLanes = (UINT64*)data; - - assert(laneCount == 21); - - #define laneCount 21 - copyFromState(A, stateAsLanes) - while(dataByteLen >= laneCount*8) { - addInput(A, inDataAsLanes, laneCount) - rounds12 - inDataAsLanes += laneCount; - dataByteLen -= laneCount*8; - } - #undef laneCount - copyToState(stateAsLanes, A) - return originalDataByteLen - dataByteLen; -} diff --git a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c b/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c deleted file mode 100644 index 3c169f8..0000000 --- a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c +++ /dev/null @@ -1,427 +0,0 @@ -/* -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to our website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#include -#include -#include "KeccakP-1600-SnP.h" - -#ifdef ALIGN -#undef ALIGN -#endif - -#if defined(__GNUC__) -#define ALIGN(x) __attribute__ ((aligned(x))) -#elif defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#elif defined(__ARMCC_VERSION) -#define ALIGN(x) __align(x) -#else -#define ALIGN(x) -#endif - -#define AVX2alignment 32 - -#define ANDnu256(a, b) _mm256_andnot_si256(a, b) -#define CONST256(a) _mm256_load_si256((const __m256i *)&(a)) -#define CONST256_64(a) _mm256_set1_epi64x(a) -#define LOAD256(a) _mm256_load_si256((const __m256i *)&(a)) -#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d)) -#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o))) -#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) -#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56)) -static const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F}; -static const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; -#define STORE256(a, b) _mm256_store_si256((__m256i *)&(a), b) -#define STORE256u(a, b) _mm256_storeu_si256((__m256i *)&(a), b) -#define XOR256(a, b) _mm256_xor_si256(a, b) -#define XOReq256(a, b) a = _mm256_xor_si256(a, b) -#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) -#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) -#define PERM128( a, b, c ) (__m256i)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c) -#define SHUFFLE64( a, b, c ) (__m256i)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c) -#define ZERO() _mm256_setzero_si256() - -static ALIGN(AVX2alignment) const uint64_t KeccakP1600RoundConstants[24] = { - 0x0000000000000001ULL, - 0x0000000000008082ULL, - 0x800000000000808aULL, - 0x8000000080008000ULL, - 0x000000000000808bULL, - 0x0000000080000001ULL, - 0x8000000080008081ULL, - 0x8000000000008009ULL, - 0x000000000000008aULL, - 0x0000000000000088ULL, - 0x0000000080008009ULL, - 0x000000008000000aULL, - 0x000000008000808bULL, - 0x800000000000008bULL, - 0x8000000000008089ULL, - 0x8000000000008003ULL, - 0x8000000000008002ULL, - 0x8000000000000080ULL, - 0x000000000000800aULL, - 0x800000008000000aULL, - 0x8000000080008081ULL, - 0x8000000000008080ULL, - 0x0000000080000001ULL, - 0x8000000080008008ULL}; - -#define declareABCDE \ - __m256i Aba, Abe, Abi, Abo, Abu; \ - __m256i Aga, Age, Agi, Ago, Agu; \ - __m256i Aka, Ake, Aki, Ako, Aku; \ - __m256i Ama, Ame, Ami, Amo, Amu; \ - __m256i Asa, Ase, Asi, Aso, Asu; \ - __m256i Bba, Bbe, Bbi, Bbo, Bbu; \ - __m256i Bga, Bge, Bgi, Bgo, Bgu; \ - __m256i Bka, Bke, Bki, Bko, Bku; \ - __m256i Bma, Bme, Bmi, Bmo, Bmu; \ - __m256i Bsa, Bse, Bsi, Bso, Bsu; \ - __m256i Ca, Ce, Ci, Co, Cu; \ - __m256i Ca1, Ce1, Ci1, Co1, Cu1; \ - __m256i Da, De, Di, Do, Du; \ - __m256i Eba, Ebe, Ebi, Ebo, Ebu; \ - __m256i Ega, Ege, Egi, Ego, Egu; \ - __m256i Eka, Eke, Eki, Eko, Eku; \ - __m256i Ema, Eme, Emi, Emo, Emu; \ - __m256i Esa, Ese, Esi, Eso, Esu; \ - -#define prepareTheta \ - Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \ - Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \ - Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \ - Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \ - Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \ - -/* --- Theta Rho Pi Chi Iota Prepare-theta */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - ROL64in256(Ce1, Ce, 1); \ - Da = XOR256(Cu, Ce1); \ - ROL64in256(Ci1, Ci, 1); \ - De = XOR256(Ca, Ci1); \ - ROL64in256(Co1, Co, 1); \ - Di = XOR256(Ce, Co1); \ - ROL64in256(Cu1, Cu, 1); \ - Do = XOR256(Ci, Cu1); \ - ROL64in256(Ca1, Ca, 1); \ - Du = XOR256(Co, Ca1); \ -\ - XOReq256(A##ba, Da); \ - Bba = A##ba; \ - XOReq256(A##ge, De); \ - ROL64in256(Bbe, A##ge, 44); \ - XOReq256(A##ki, Di); \ - ROL64in256(Bbi, A##ki, 43); \ - E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ - XOReq256(E##ba, CONST256_64(KeccakP1600RoundConstants[i])); \ - Ca = E##ba; \ - XOReq256(A##mo, Do); \ - ROL64in256(Bbo, A##mo, 21); \ - E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ - Ce = E##be; \ - XOReq256(A##su, Du); \ - ROL64in256(Bbu, A##su, 14); \ - E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ - Ci = E##bi; \ - E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ - Co = E##bo; \ - E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ - Cu = E##bu; \ -\ - XOReq256(A##bo, Do); \ - ROL64in256(Bga, A##bo, 28); \ - XOReq256(A##gu, Du); \ - ROL64in256(Bge, A##gu, 20); \ - XOReq256(A##ka, Da); \ - ROL64in256(Bgi, A##ka, 3); \ - E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ - XOReq256(Ca, E##ga); \ - XOReq256(A##me, De); \ - ROL64in256(Bgo, A##me, 45); \ - E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ - XOReq256(Ce, E##ge); \ - XOReq256(A##si, Di); \ - ROL64in256(Bgu, A##si, 61); \ - E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ - XOReq256(Ci, E##gi); \ - E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ - XOReq256(Co, E##go); \ - E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ - XOReq256(Cu, E##gu); \ -\ - XOReq256(A##be, De); \ - ROL64in256(Bka, A##be, 1); \ - XOReq256(A##gi, Di); \ - ROL64in256(Bke, A##gi, 6); \ - XOReq256(A##ko, Do); \ - ROL64in256(Bki, A##ko, 25); \ - E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ - XOReq256(Ca, E##ka); \ - XOReq256(A##mu, Du); \ - ROL64in256_8(Bko, A##mu); \ - E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ - XOReq256(Ce, E##ke); \ - XOReq256(A##sa, Da); \ - ROL64in256(Bku, A##sa, 18); \ - E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ - XOReq256(Ci, E##ki); \ - E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ - XOReq256(Co, E##ko); \ - E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ - XOReq256(Cu, E##ku); \ -\ - XOReq256(A##bu, Du); \ - ROL64in256(Bma, A##bu, 27); \ - XOReq256(A##ga, Da); \ - ROL64in256(Bme, A##ga, 36); \ - XOReq256(A##ke, De); \ - ROL64in256(Bmi, A##ke, 10); \ - E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ - XOReq256(Ca, E##ma); \ - XOReq256(A##mi, Di); \ - ROL64in256(Bmo, A##mi, 15); \ - E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ - XOReq256(Ce, E##me); \ - XOReq256(A##so, Do); \ - ROL64in256_56(Bmu, A##so); \ - E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ - XOReq256(Ci, E##mi); \ - E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ - XOReq256(Co, E##mo); \ - E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ - XOReq256(Cu, E##mu); \ -\ - XOReq256(A##bi, Di); \ - ROL64in256(Bsa, A##bi, 62); \ - XOReq256(A##go, Do); \ - ROL64in256(Bse, A##go, 55); \ - XOReq256(A##ku, Du); \ - ROL64in256(Bsi, A##ku, 39); \ - E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ - XOReq256(Ca, E##sa); \ - XOReq256(A##ma, Da); \ - ROL64in256(Bso, A##ma, 41); \ - E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ - XOReq256(Ce, E##se); \ - XOReq256(A##se, De); \ - ROL64in256(Bsu, A##se, 2); \ - E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ - XOReq256(Ci, E##si); \ - E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ - XOReq256(Co, E##so); \ - E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ - XOReq256(Cu, E##su); \ -\ - -/* --- Theta Rho Pi Chi Iota */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIota(i, A, E) \ - ROL64in256(Ce1, Ce, 1); \ - Da = XOR256(Cu, Ce1); \ - ROL64in256(Ci1, Ci, 1); \ - De = XOR256(Ca, Ci1); \ - ROL64in256(Co1, Co, 1); \ - Di = XOR256(Ce, Co1); \ - ROL64in256(Cu1, Cu, 1); \ - Do = XOR256(Ci, Cu1); \ - ROL64in256(Ca1, Ca, 1); \ - Du = XOR256(Co, Ca1); \ -\ - XOReq256(A##ba, Da); \ - Bba = A##ba; \ - XOReq256(A##ge, De); \ - ROL64in256(Bbe, A##ge, 44); \ - XOReq256(A##ki, Di); \ - ROL64in256(Bbi, A##ki, 43); \ - E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ - XOReq256(E##ba, CONST256_64(KeccakP1600RoundConstants[i])); \ - XOReq256(A##mo, Do); \ - ROL64in256(Bbo, A##mo, 21); \ - E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ - XOReq256(A##su, Du); \ - ROL64in256(Bbu, A##su, 14); \ - E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ - E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ - E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ -\ - XOReq256(A##bo, Do); \ - ROL64in256(Bga, A##bo, 28); \ - XOReq256(A##gu, Du); \ - ROL64in256(Bge, A##gu, 20); \ - XOReq256(A##ka, Da); \ - ROL64in256(Bgi, A##ka, 3); \ - E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ - XOReq256(A##me, De); \ - ROL64in256(Bgo, A##me, 45); \ - E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ - XOReq256(A##si, Di); \ - ROL64in256(Bgu, A##si, 61); \ - E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ - E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ - E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ -\ - XOReq256(A##be, De); \ - ROL64in256(Bka, A##be, 1); \ - XOReq256(A##gi, Di); \ - ROL64in256(Bke, A##gi, 6); \ - XOReq256(A##ko, Do); \ - ROL64in256(Bki, A##ko, 25); \ - E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ - XOReq256(A##mu, Du); \ - ROL64in256_8(Bko, A##mu); \ - E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ - XOReq256(A##sa, Da); \ - ROL64in256(Bku, A##sa, 18); \ - E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ - E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ - E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ -\ - XOReq256(A##bu, Du); \ - ROL64in256(Bma, A##bu, 27); \ - XOReq256(A##ga, Da); \ - ROL64in256(Bme, A##ga, 36); \ - XOReq256(A##ke, De); \ - ROL64in256(Bmi, A##ke, 10); \ - E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ - XOReq256(A##mi, Di); \ - ROL64in256(Bmo, A##mi, 15); \ - E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ - XOReq256(A##so, Do); \ - ROL64in256_56(Bmu, A##so); \ - E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ - E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ - E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ -\ - XOReq256(A##bi, Di); \ - ROL64in256(Bsa, A##bi, 62); \ - XOReq256(A##go, Do); \ - ROL64in256(Bse, A##go, 55); \ - XOReq256(A##ku, Du); \ - ROL64in256(Bsi, A##ku, 39); \ - E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ - XOReq256(A##ma, Da); \ - ROL64in256(Bso, A##ma, 41); \ - E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ - XOReq256(A##se, De); \ - ROL64in256(Bsu, A##se, 2); \ - E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ - E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ - E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ -\ - -#define initializeState(X) \ - X##ba = ZERO(); \ - X##be = ZERO(); \ - X##bi = ZERO(); \ - X##bo = ZERO(); \ - X##bu = ZERO(); \ - X##ga = ZERO(); \ - X##ge = ZERO(); \ - X##gi = ZERO(); \ - X##go = ZERO(); \ - X##gu = ZERO(); \ - X##ka = ZERO(); \ - X##ke = ZERO(); \ - X##ki = ZERO(); \ - X##ko = ZERO(); \ - X##ku = ZERO(); \ - X##ma = ZERO(); \ - X##me = ZERO(); \ - X##mi = ZERO(); \ - X##mo = ZERO(); \ - X##mu = ZERO(); \ - X##sa = ZERO(); \ - X##se = ZERO(); \ - X##si = ZERO(); \ - X##so = ZERO(); \ - X##su = ZERO(); \ - -#define XORdata16(X, data0, data1, data2, data3) \ - XOReq256(X##ba, LOAD4_64((data3)[ 0], (data2)[ 0], (data1)[ 0], (data0)[ 0])); \ - XOReq256(X##be, LOAD4_64((data3)[ 1], (data2)[ 1], (data1)[ 1], (data0)[ 1])); \ - XOReq256(X##bi, LOAD4_64((data3)[ 2], (data2)[ 2], (data1)[ 2], (data0)[ 2])); \ - XOReq256(X##bo, LOAD4_64((data3)[ 3], (data2)[ 3], (data1)[ 3], (data0)[ 3])); \ - XOReq256(X##bu, LOAD4_64((data3)[ 4], (data2)[ 4], (data1)[ 4], (data0)[ 4])); \ - XOReq256(X##ga, LOAD4_64((data3)[ 5], (data2)[ 5], (data1)[ 5], (data0)[ 5])); \ - XOReq256(X##ge, LOAD4_64((data3)[ 6], (data2)[ 6], (data1)[ 6], (data0)[ 6])); \ - XOReq256(X##gi, LOAD4_64((data3)[ 7], (data2)[ 7], (data1)[ 7], (data0)[ 7])); \ - XOReq256(X##go, LOAD4_64((data3)[ 8], (data2)[ 8], (data1)[ 8], (data0)[ 8])); \ - XOReq256(X##gu, LOAD4_64((data3)[ 9], (data2)[ 9], (data1)[ 9], (data0)[ 9])); \ - XOReq256(X##ka, LOAD4_64((data3)[10], (data2)[10], (data1)[10], (data0)[10])); \ - XOReq256(X##ke, LOAD4_64((data3)[11], (data2)[11], (data1)[11], (data0)[11])); \ - XOReq256(X##ki, LOAD4_64((data3)[12], (data2)[12], (data1)[12], (data0)[12])); \ - XOReq256(X##ko, LOAD4_64((data3)[13], (data2)[13], (data1)[13], (data0)[13])); \ - XOReq256(X##ku, LOAD4_64((data3)[14], (data2)[14], (data1)[14], (data0)[14])); \ - XOReq256(X##ma, LOAD4_64((data3)[15], (data2)[15], (data1)[15], (data0)[15])); \ - -#define XORdata21(X, data0, data1, data2, data3) \ - XORdata16(X, data0, data1, data2, data3) \ - XOReq256(X##me, LOAD4_64((data3)[16], (data2)[16], (data1)[16], (data0)[16])); \ - XOReq256(X##mi, LOAD4_64((data3)[17], (data2)[17], (data1)[17], (data0)[17])); \ - XOReq256(X##mo, LOAD4_64((data3)[18], (data2)[18], (data1)[18], (data0)[18])); \ - XOReq256(X##mu, LOAD4_64((data3)[19], (data2)[19], (data1)[19], (data0)[19])); \ - XOReq256(X##sa, LOAD4_64((data3)[20], (data2)[20], (data1)[20], (data0)[20])); \ - -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) - -#define chunkSize 8192 -#define rateInBytes (21*8) - -void KangarooTwelve_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output) -{ - declareABCDE - unsigned int j; - - initializeState(A); - - for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { - XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); - rounds12 - input += rateInBytes; - } - - XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); - XOReq256(Ame, CONST256_64(0x0BULL)); - XOReq256(Asa, CONST256_64(0x8000000000000000ULL)); - rounds12 - - { - __m256i lanesL01, lanesL23, lanesH01, lanesH23; - - lanesL01 = UNPACKL( Aba, Abe ); - lanesH01 = UNPACKH( Aba, Abe ); - lanesL23 = UNPACKL( Abi, Abo ); - lanesH23 = UNPACKH( Abi, Abo ); - STORE256u( output[ 0], PERM128( lanesL01, lanesL23, 0x20 ) ); - STORE256u( output[32], PERM128( lanesH01, lanesH23, 0x20 ) ); - STORE256u( output[64], PERM128( lanesL01, lanesL23, 0x31 ) ); - STORE256u( output[96], PERM128( lanesH01, lanesH23, 0x31 ) ); - } -} diff --git a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c b/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c deleted file mode 100644 index d290771..0000000 --- a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c +++ /dev/null @@ -1,426 +0,0 @@ -/* -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to our website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#include -#include -#include "KeccakP-1600-SnP.h" - -#ifdef ALIGN -#undef ALIGN -#endif - -#if defined(__GNUC__) -#define ALIGN(x) __attribute__ ((aligned(x))) -#elif defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#elif defined(__ARMCC_VERSION) -#define ALIGN(x) __align(x) -#else -#define ALIGN(x) -#endif - -#define AVX512alignment 64 - -#define LOAD4_32(a,b,c,d) _mm_set_epi32((uint64_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d)) -#define LOAD8_32(a,b,c,d,e,f,g,h) _mm256_set_epi32((uint64_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d), (uint32_t)(e), (uint32_t)(f), (uint32_t)(g), (uint32_t)(h)) -#define LOAD_GATHER2_64(idx,p) _mm_i32gather_epi64( (const void*)(p), idx, 8) -#define LOAD_GATHER4_64(idx,p) _mm256_i32gather_epi64( (const void*)(p), idx, 8) -#define LOAD_GATHER8_64(idx,p) _mm512_i32gather_epi64( idx, (const void*)(p), 8) -#define STORE_SCATTER8_64(p,idx, v) _mm512_i32scatter_epi64( (void*)(p), idx, v, 8) - - -/* Keccak-p[1600]×2 */ - -#define XOR(a,b) _mm_xor_si128(a,b) -#define XOReq(a, b) a = _mm_xor_si128(a, b) -#define XOR3(a,b,c) _mm_ternarylogic_epi64(a,b,c,0x96) -#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) -#define ROL(a,offset) _mm_rol_epi64(a,offset) -#define Chi(a,b,c) _mm_ternarylogic_epi64(a,b,c,0xD2) -#define CONST_64(a) _mm_set1_epi64((__m64)(a)) -#define LOAD6464(a, b) _mm_set_epi64((__m64)(a), (__m64)(b)) -#define STORE128u(a, b) _mm_storeu_si128((__m128i *)&(a), b) -#define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) -#define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) -#define ZERO() _mm_setzero_si128() - -static ALIGN(AVX512alignment) const uint64_t KeccakP1600RoundConstants[24] = { - 0x0000000000000001ULL, - 0x0000000000008082ULL, - 0x800000000000808aULL, - 0x8000000080008000ULL, - 0x000000000000808bULL, - 0x0000000080000001ULL, - 0x8000000080008081ULL, - 0x8000000000008009ULL, - 0x000000000000008aULL, - 0x0000000000000088ULL, - 0x0000000080008009ULL, - 0x000000008000000aULL, - 0x000000008000808bULL, - 0x800000000000008bULL, - 0x8000000000008089ULL, - 0x8000000000008003ULL, - 0x8000000000008002ULL, - 0x8000000000000080ULL, - 0x000000000000800aULL, - 0x800000008000000aULL, - 0x8000000080008081ULL, - 0x8000000000008080ULL, - 0x0000000080000001ULL, - 0x8000000080008008ULL}; - -#define KeccakP_DeclareVars(type) \ - type _Ba, _Be, _Bi, _Bo, _Bu; \ - type _Da, _De, _Di, _Do, _Du; \ - type _ba, _be, _bi, _bo, _bu; \ - type _ga, _ge, _gi, _go, _gu; \ - type _ka, _ke, _ki, _ko, _ku; \ - type _ma, _me, _mi, _mo, _mu; \ - type _sa, _se, _si, _so, _su - -#define KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bb1, _Bb2, _Bb3, _Bb4, _Bb5, _Rr1, _Rr2, _Rr3, _Rr4, _Rr5 ) \ - _Bb1 = XOR(_L1, _Da); \ - _Bb2 = XOR(_L2, _De); \ - _Bb3 = XOR(_L3, _Di); \ - _Bb4 = XOR(_L4, _Do); \ - _Bb5 = XOR(_L5, _Du); \ - if (_Rr1 != 0) _Bb1 = ROL(_Bb1, _Rr1); \ - _Bb2 = ROL(_Bb2, _Rr2); \ - _Bb3 = ROL(_Bb3, _Rr3); \ - _Bb4 = ROL(_Bb4, _Rr4); \ - _Bb5 = ROL(_Bb5, _Rr5); \ - _L1 = Chi( _Ba, _Be, _Bi); \ - _L2 = Chi( _Be, _Bi, _Bo); \ - _L3 = Chi( _Bi, _Bo, _Bu); \ - _L4 = Chi( _Bo, _Bu, _Ba); \ - _L5 = Chi( _Bu, _Ba, _Be); - -#define KeccakP_ThetaRhoPiChiIota0( _L1, _L2, _L3, _L4, _L5, _rc ) \ - _Ba = XOR5( _ba, _ga, _ka, _ma, _sa ); /* Theta effect */ \ - _Be = XOR5( _be, _ge, _ke, _me, _se ); \ - _Bi = XOR5( _bi, _gi, _ki, _mi, _si ); \ - _Bo = XOR5( _bo, _go, _ko, _mo, _so ); \ - _Bu = XOR5( _bu, _gu, _ku, _mu, _su ); \ - _Da = ROL( _Be, 1 ); \ - _De = ROL( _Bi, 1 ); \ - _Di = ROL( _Bo, 1 ); \ - _Do = ROL( _Bu, 1 ); \ - _Du = ROL( _Ba, 1 ); \ - _Da = XOR( _Da, _Bu ); \ - _De = XOR( _De, _Ba ); \ - _Di = XOR( _Di, _Be ); \ - _Do = XOR( _Do, _Bi ); \ - _Du = XOR( _Du, _Bo ); \ - KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Ba, _Be, _Bi, _Bo, _Bu, 0, 44, 43, 21, 14 ); \ - _L1 = XOR(_L1, _rc) /* Iota */ - -#define KeccakP_ThetaRhoPiChi1( _L1, _L2, _L3, _L4, _L5 ) \ - KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bi, _Bo, _Bu, _Ba, _Be, 3, 45, 61, 28, 20 ) - -#define KeccakP_ThetaRhoPiChi2( _L1, _L2, _L3, _L4, _L5 ) \ - KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bu, _Ba, _Be, _Bi, _Bo, 18, 1, 6, 25, 8 ) - -#define KeccakP_ThetaRhoPiChi3( _L1, _L2, _L3, _L4, _L5 ) \ - KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Be, _Bi, _Bo, _Bu, _Ba, 36, 10, 15, 56, 27 ) - -#define KeccakP_ThetaRhoPiChi4( _L1, _L2, _L3, _L4, _L5 ) \ - KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bo, _Bu, _Ba, _Be, _Bi, 41, 2, 62, 55, 39 ) - -#define KeccakP_4rounds( i ) \ - KeccakP_ThetaRhoPiChiIota0(_ba, _ge, _ki, _mo, _su, CONST_64(KeccakP1600RoundConstants[i]) ); \ - KeccakP_ThetaRhoPiChi1( _ka, _me, _si, _bo, _gu ); \ - KeccakP_ThetaRhoPiChi2( _sa, _be, _gi, _ko, _mu ); \ - KeccakP_ThetaRhoPiChi3( _ga, _ke, _mi, _so, _bu ); \ - KeccakP_ThetaRhoPiChi4( _ma, _se, _bi, _go, _ku ); \ -\ - KeccakP_ThetaRhoPiChiIota0(_ba, _me, _gi, _so, _ku, CONST_64(KeccakP1600RoundConstants[i+1]) ); \ - KeccakP_ThetaRhoPiChi1( _sa, _ke, _bi, _mo, _gu ); \ - KeccakP_ThetaRhoPiChi2( _ma, _ge, _si, _ko, _bu ); \ - KeccakP_ThetaRhoPiChi3( _ka, _be, _mi, _go, _su ); \ - KeccakP_ThetaRhoPiChi4( _ga, _se, _ki, _bo, _mu ); \ -\ - KeccakP_ThetaRhoPiChiIota0(_ba, _ke, _si, _go, _mu, CONST_64(KeccakP1600RoundConstants[i+2]) ); \ - KeccakP_ThetaRhoPiChi1( _ma, _be, _ki, _so, _gu ); \ - KeccakP_ThetaRhoPiChi2( _ga, _me, _bi, _ko, _su ); \ - KeccakP_ThetaRhoPiChi3( _sa, _ge, _mi, _bo, _ku ); \ - KeccakP_ThetaRhoPiChi4( _ka, _se, _gi, _mo, _bu ); \ -\ - KeccakP_ThetaRhoPiChiIota0(_ba, _be, _bi, _bo, _bu, CONST_64(KeccakP1600RoundConstants[i+3]) ); \ - KeccakP_ThetaRhoPiChi1( _ga, _ge, _gi, _go, _gu ); \ - KeccakP_ThetaRhoPiChi2( _ka, _ke, _ki, _ko, _ku ); \ - KeccakP_ThetaRhoPiChi3( _ma, _me, _mi, _mo, _mu ); \ - KeccakP_ThetaRhoPiChi4( _sa, _se, _si, _so, _su ) - -#define rounds12 \ - KeccakP_4rounds( 12 ); \ - KeccakP_4rounds( 16 ); \ - KeccakP_4rounds( 20 ) - -#define initializeState(X) \ - X##ba = ZERO(); \ - X##be = ZERO(); \ - X##bi = ZERO(); \ - X##bo = ZERO(); \ - X##bu = ZERO(); \ - X##ga = ZERO(); \ - X##ge = ZERO(); \ - X##gi = ZERO(); \ - X##go = ZERO(); \ - X##gu = ZERO(); \ - X##ka = ZERO(); \ - X##ke = ZERO(); \ - X##ki = ZERO(); \ - X##ko = ZERO(); \ - X##ku = ZERO(); \ - X##ma = ZERO(); \ - X##me = ZERO(); \ - X##mi = ZERO(); \ - X##mo = ZERO(); \ - X##mu = ZERO(); \ - X##sa = ZERO(); \ - X##se = ZERO(); \ - X##si = ZERO(); \ - X##so = ZERO(); \ - X##su = ZERO(); \ - -#define XORdata16(X, data0, data1) \ - XOReq(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ - XOReq(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ - XOReq(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ - XOReq(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ - XOReq(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \ - XOReq(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \ - XOReq(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \ - XOReq(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \ - XOReq(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \ - XOReq(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \ - XOReq(X##ka, LOAD6464((data1)[10], (data0)[10])); \ - XOReq(X##ke, LOAD6464((data1)[11], (data0)[11])); \ - XOReq(X##ki, LOAD6464((data1)[12], (data0)[12])); \ - XOReq(X##ko, LOAD6464((data1)[13], (data0)[13])); \ - XOReq(X##ku, LOAD6464((data1)[14], (data0)[14])); \ - XOReq(X##ma, LOAD6464((data1)[15], (data0)[15])); \ - -#define XORdata21(X, data0, data1) \ - XORdata16(X, data0, data1) \ - XOReq(X##me, LOAD6464((data1)[16], (data0)[16])); \ - XOReq(X##mi, LOAD6464((data1)[17], (data0)[17])); \ - XOReq(X##mo, LOAD6464((data1)[18], (data0)[18])); \ - XOReq(X##mu, LOAD6464((data1)[19], (data0)[19])); \ - XOReq(X##sa, LOAD6464((data1)[20], (data0)[20])); \ - -#define chunkSize 8192 -#define rateInBytes (21*8) - -void KangarooTwelve_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output) -{ - KeccakP_DeclareVars(__m128i); - unsigned int j; - - initializeState(_); - - for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { - XORdata21(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); - rounds12 - input += rateInBytes; - } - - XORdata16(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); - XOReq(_me, CONST_64(0x0BULL)); - XOReq(_sa, CONST_64(0x8000000000000000ULL)); - rounds12 - - STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( _ba, _be ) ); - STORE128u( *(__m128i*)&(output[16]), UNPACKL( _bi, _bo ) ); - STORE128u( *(__m128i*)&(output[32]), UNPACKH( _ba, _be ) ); - STORE128u( *(__m128i*)&(output[48]), UNPACKH( _bi, _bo ) ); -} - -#undef XOR -#undef XOReq -#undef XOR3 -#undef XOR5 -#undef ROL -#undef Chi -#undef CONST_64 -#undef LOAD6464 -#undef STORE128u -#undef UNPACKL -#undef UNPACKH -#undef ZERO -#undef XORdata16 -#undef XORdata21 - - -/* Keccak-p[1600]×4 */ - -#define XOR(a,b) _mm256_xor_si256(a,b) -#define XOReq(a,b) a = _mm256_xor_si256(a,b) -#define XOR3(a,b,c) _mm256_ternarylogic_epi64(a,b,c,0x96) -#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) -#define XOR512(a,b) _mm512_xor_si512(a,b) -#define ROL(a,offset) _mm256_rol_epi64(a,offset) -#define Chi(a,b,c) _mm256_ternarylogic_epi64(a,b,c,0xD2) -#define CONST_64(a) _mm256_set1_epi64x(a) -#define ZERO() _mm256_setzero_si256() -#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d)) - -#define XORdata16(X, data0, data1, data2, data3) \ - XOReq(X##ba, LOAD4_64((data3)[ 0], (data2)[ 0], (data1)[ 0], (data0)[ 0])); \ - XOReq(X##be, LOAD4_64((data3)[ 1], (data2)[ 1], (data1)[ 1], (data0)[ 1])); \ - XOReq(X##bi, LOAD4_64((data3)[ 2], (data2)[ 2], (data1)[ 2], (data0)[ 2])); \ - XOReq(X##bo, LOAD4_64((data3)[ 3], (data2)[ 3], (data1)[ 3], (data0)[ 3])); \ - XOReq(X##bu, LOAD4_64((data3)[ 4], (data2)[ 4], (data1)[ 4], (data0)[ 4])); \ - XOReq(X##ga, LOAD4_64((data3)[ 5], (data2)[ 5], (data1)[ 5], (data0)[ 5])); \ - XOReq(X##ge, LOAD4_64((data3)[ 6], (data2)[ 6], (data1)[ 6], (data0)[ 6])); \ - XOReq(X##gi, LOAD4_64((data3)[ 7], (data2)[ 7], (data1)[ 7], (data0)[ 7])); \ - XOReq(X##go, LOAD4_64((data3)[ 8], (data2)[ 8], (data1)[ 8], (data0)[ 8])); \ - XOReq(X##gu, LOAD4_64((data3)[ 9], (data2)[ 9], (data1)[ 9], (data0)[ 9])); \ - XOReq(X##ka, LOAD4_64((data3)[10], (data2)[10], (data1)[10], (data0)[10])); \ - XOReq(X##ke, LOAD4_64((data3)[11], (data2)[11], (data1)[11], (data0)[11])); \ - XOReq(X##ki, LOAD4_64((data3)[12], (data2)[12], (data1)[12], (data0)[12])); \ - XOReq(X##ko, LOAD4_64((data3)[13], (data2)[13], (data1)[13], (data0)[13])); \ - XOReq(X##ku, LOAD4_64((data3)[14], (data2)[14], (data1)[14], (data0)[14])); \ - XOReq(X##ma, LOAD4_64((data3)[15], (data2)[15], (data1)[15], (data0)[15])); \ - -#define XORdata21(X, data0, data1, data2, data3) \ - XORdata16(X, data0, data1, data2, data3) \ - XOReq(X##me, LOAD4_64((data3)[16], (data2)[16], (data1)[16], (data0)[16])); \ - XOReq(X##mi, LOAD4_64((data3)[17], (data2)[17], (data1)[17], (data0)[17])); \ - XOReq(X##mo, LOAD4_64((data3)[18], (data2)[18], (data1)[18], (data0)[18])); \ - XOReq(X##mu, LOAD4_64((data3)[19], (data2)[19], (data1)[19], (data0)[19])); \ - XOReq(X##sa, LOAD4_64((data3)[20], (data2)[20], (data1)[20], (data0)[20])); \ - -void KangarooTwelve_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output) -{ - KeccakP_DeclareVars(__m256i); - unsigned int j; - - initializeState(_); - - for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { - XORdata21(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); - rounds12 - input += rateInBytes; - } - - XORdata16(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); - XOReq(_me, CONST_64(0x0BULL)); - XOReq(_sa, CONST_64(0x8000000000000000ULL)); - rounds12 - -#define STORE256u(a, b) _mm256_storeu_si256((__m256i *)&(a), b) -#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) -#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) -#define PERM128( a, b, c ) (__m256i)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c) - { - __m256i lanesL01, lanesL23, lanesH01, lanesH23; - - lanesL01 = UNPACKL( _ba, _be ); - lanesH01 = UNPACKH( _ba, _be ); - lanesL23 = UNPACKL( _bi, _bo ); - lanesH23 = UNPACKH( _bi, _bo ); - STORE256u( output[ 0], PERM128( lanesL01, lanesL23, 0x20 ) ); - STORE256u( output[32], PERM128( lanesH01, lanesH23, 0x20 ) ); - STORE256u( output[64], PERM128( lanesL01, lanesL23, 0x31 ) ); - STORE256u( output[96], PERM128( lanesH01, lanesH23, 0x31 ) ); - } -/* TODO: check if something like this would be better: - index512 = LOAD8_32(3*laneOffset+1, 2*laneOffset+1, 1*laneOffset+1, 0*laneOffset+1, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset); - STORE_SCATTER8_64(dataAsLanes+0, index512, stateAsLanes512[0/2]); - STORE_SCATTER8_64(dataAsLanes+2, index512, stateAsLanes512[2/2]); -*/ -} - -#undef XOR -#undef XOReq -#undef XOR3 -#undef XOR5 -#undef XOR512 -#undef ROL -#undef Chi -#undef CONST_64 -#undef ZERO -#undef LOAD4_64 -#undef XORdata16 -#undef XORdata21 - - -/* Keccak-p[1600]×8 */ - -#define XOR(a,b) _mm512_xor_si512(a,b) -#define XOReq(a,b) a = _mm512_xor_si512(a,b) -#define XOR3(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0x96) -#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) -#define XOReq512(a, b) a = XOR(a,b) -#define ROL(a,offset) _mm512_rol_epi64(a,offset) -#define Chi(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0xD2) -#define CONST_64(a) _mm512_set1_epi64(a) -#define ZERO() _mm512_setzero_si512() - -#define XORdata16(X, index, dataAsLanes) \ - XOReq(X##ba, LOAD_GATHER8_64(index, (dataAsLanes) + 0)); \ - XOReq(X##be, LOAD_GATHER8_64(index, (dataAsLanes) + 1)); \ - XOReq(X##bi, LOAD_GATHER8_64(index, (dataAsLanes) + 2)); \ - XOReq(X##bo, LOAD_GATHER8_64(index, (dataAsLanes) + 3)); \ - XOReq(X##bu, LOAD_GATHER8_64(index, (dataAsLanes) + 4)); \ - XOReq(X##ga, LOAD_GATHER8_64(index, (dataAsLanes) + 5)); \ - XOReq(X##ge, LOAD_GATHER8_64(index, (dataAsLanes) + 6)); \ - XOReq(X##gi, LOAD_GATHER8_64(index, (dataAsLanes) + 7)); \ - XOReq(X##go, LOAD_GATHER8_64(index, (dataAsLanes) + 8)); \ - XOReq(X##gu, LOAD_GATHER8_64(index, (dataAsLanes) + 9)); \ - XOReq(X##ka, LOAD_GATHER8_64(index, (dataAsLanes) + 10)); \ - XOReq(X##ke, LOAD_GATHER8_64(index, (dataAsLanes) + 11)); \ - XOReq(X##ki, LOAD_GATHER8_64(index, (dataAsLanes) + 12)); \ - XOReq(X##ko, LOAD_GATHER8_64(index, (dataAsLanes) + 13)); \ - XOReq(X##ku, LOAD_GATHER8_64(index, (dataAsLanes) + 14)); \ - XOReq(X##ma, LOAD_GATHER8_64(index, (dataAsLanes) + 15)); \ - -#define XORdata21(X, index, dataAsLanes) \ - XORdata16(X, index, dataAsLanes) \ - XOReq(X##me, LOAD_GATHER8_64(index, (dataAsLanes) + 16)); \ - XOReq(X##mi, LOAD_GATHER8_64(index, (dataAsLanes) + 17)); \ - XOReq(X##mo, LOAD_GATHER8_64(index, (dataAsLanes) + 18)); \ - XOReq(X##mu, LOAD_GATHER8_64(index, (dataAsLanes) + 19)); \ - XOReq(X##sa, LOAD_GATHER8_64(index, (dataAsLanes) + 20)); \ - -void KangarooTwelve_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output) -{ - KeccakP_DeclareVars(__m512i); - unsigned int j; - const uint64_t *outputAsLanes = (const uint64_t *)output; - __m256i index; - - initializeState(_); - - index = LOAD8_32(7*(chunkSize / 8), 6*(chunkSize / 8), 5*(chunkSize / 8), 4*(chunkSize / 8), 3*(chunkSize / 8), 2*(chunkSize / 8), 1*(chunkSize / 8), 0*(chunkSize / 8)); - for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { - XORdata21(_, index, (const uint64_t *)input); - rounds12 - input += rateInBytes; - } - - XORdata16(_, index, (const uint64_t *)input); - XOReq(_me, CONST_64(0x0BULL)); - XOReq(_sa, CONST_64(0x8000000000000000ULL)); - rounds12 - - index = LOAD8_32(7*4, 6*4, 5*4, 4*4, 3*4, 2*4, 1*4, 0*4); - STORE_SCATTER8_64(outputAsLanes+0, index, _ba); - STORE_SCATTER8_64(outputAsLanes+1, index, _be); - STORE_SCATTER8_64(outputAsLanes+2, index, _bi); - STORE_SCATTER8_64(outputAsLanes+3, index, _bo); -} diff --git a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c b/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c deleted file mode 100644 index 5067f29..0000000 --- a/benches/kangarootwelve/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c +++ /dev/null @@ -1,446 +0,0 @@ -/* -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to our website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#include -#include -#include "KeccakP-1600-SnP.h" - -#define KeccakP1600times2_SSSE3_unrolling 2 - -#ifdef ALIGN -#undef ALIGN -#endif - -#if defined(__GNUC__) -#define ALIGN(x) __attribute__ ((aligned(x))) -#elif defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#elif defined(__ARMCC_VERSION) -#define ALIGN(x) __align(x) -#else -#define ALIGN(x) -#endif - -#define SSSE3alignment 16 - -#define ANDnu128(a, b) _mm_andnot_si128(a, b) -#define CONST128(a) _mm_load_si128((const __m128i *)&(a)) -#define LOAD128(a) _mm_load_si128((const __m128i *)&(a)) -#define LOAD6464(a, b) _mm_set_epi64((__m64)(a), (__m64)(b)) -#define CONST128_64(a) _mm_set1_epi64((__m64)(a)) -#define ROL64in128(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) -#define ROL64in128_8(a) _mm_shuffle_epi8(a, CONST128(rho8)) -#define ROL64in128_56(a) _mm_shuffle_epi8(a, CONST128(rho56)) -static const uint64_t rho8[2] = {0x0605040302010007, 0x0E0D0C0B0A09080F}; -static const uint64_t rho56[2] = {0x0007060504030201, 0x080F0E0D0C0B0A09}; -#define STORE128(a, b) _mm_store_si128((__m128i *)&(a), b) -#define STORE128u(a, b) _mm_storeu_si128((__m128i *)&(a), b) -#define XOR128(a, b) _mm_xor_si128(a, b) -#define XOReq128(a, b) a = _mm_xor_si128(a, b) -#define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) -#define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) -#define ZERO() _mm_setzero_si128() - -static ALIGN(SSSE3alignment) const uint64_t KeccakP1600RoundConstants[24] = { - 0x0000000000000001ULL, - 0x0000000000008082ULL, - 0x800000000000808aULL, - 0x8000000080008000ULL, - 0x000000000000808bULL, - 0x0000000080000001ULL, - 0x8000000080008081ULL, - 0x8000000000008009ULL, - 0x000000000000008aULL, - 0x0000000000000088ULL, - 0x0000000080008009ULL, - 0x000000008000000aULL, - 0x000000008000808bULL, - 0x800000000000008bULL, - 0x8000000000008089ULL, - 0x8000000000008003ULL, - 0x8000000000008002ULL, - 0x8000000000000080ULL, - 0x000000000000800aULL, - 0x800000008000000aULL, - 0x8000000080008081ULL, - 0x8000000000008080ULL, - 0x0000000080000001ULL, - 0x8000000080008008ULL}; - -#define declareABCDE \ - __m128i Aba, Abe, Abi, Abo, Abu; \ - __m128i Aga, Age, Agi, Ago, Agu; \ - __m128i Aka, Ake, Aki, Ako, Aku; \ - __m128i Ama, Ame, Ami, Amo, Amu; \ - __m128i Asa, Ase, Asi, Aso, Asu; \ - __m128i Bba, Bbe, Bbi, Bbo, Bbu; \ - __m128i Bga, Bge, Bgi, Bgo, Bgu; \ - __m128i Bka, Bke, Bki, Bko, Bku; \ - __m128i Bma, Bme, Bmi, Bmo, Bmu; \ - __m128i Bsa, Bse, Bsi, Bso, Bsu; \ - __m128i Ca, Ce, Ci, Co, Cu; \ - __m128i Da, De, Di, Do, Du; \ - __m128i Eba, Ebe, Ebi, Ebo, Ebu; \ - __m128i Ega, Ege, Egi, Ego, Egu; \ - __m128i Eka, Eke, Eki, Eko, Eku; \ - __m128i Ema, Eme, Emi, Emo, Emu; \ - __m128i Esa, Ese, Esi, Eso, Esu; \ - -#define prepareTheta \ - Ca = XOR128(Aba, XOR128(Aga, XOR128(Aka, XOR128(Ama, Asa)))); \ - Ce = XOR128(Abe, XOR128(Age, XOR128(Ake, XOR128(Ame, Ase)))); \ - Ci = XOR128(Abi, XOR128(Agi, XOR128(Aki, XOR128(Ami, Asi)))); \ - Co = XOR128(Abo, XOR128(Ago, XOR128(Ako, XOR128(Amo, Aso)))); \ - Cu = XOR128(Abu, XOR128(Agu, XOR128(Aku, XOR128(Amu, Asu)))); \ - -/* --- Theta Rho Pi Chi Iota Prepare-theta */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - Da = XOR128(Cu, ROL64in128(Ce, 1)); \ - De = XOR128(Ca, ROL64in128(Ci, 1)); \ - Di = XOR128(Ce, ROL64in128(Co, 1)); \ - Do = XOR128(Ci, ROL64in128(Cu, 1)); \ - Du = XOR128(Co, ROL64in128(Ca, 1)); \ -\ - XOReq128(A##ba, Da); \ - Bba = A##ba; \ - XOReq128(A##ge, De); \ - Bbe = ROL64in128(A##ge, 44); \ - XOReq128(A##ki, Di); \ - Bbi = ROL64in128(A##ki, 43); \ - E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ - XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \ - Ca = E##ba; \ - XOReq128(A##mo, Do); \ - Bbo = ROL64in128(A##mo, 21); \ - E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ - Ce = E##be; \ - XOReq128(A##su, Du); \ - Bbu = ROL64in128(A##su, 14); \ - E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ - Ci = E##bi; \ - E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ - Co = E##bo; \ - E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ - Cu = E##bu; \ -\ - XOReq128(A##bo, Do); \ - Bga = ROL64in128(A##bo, 28); \ - XOReq128(A##gu, Du); \ - Bge = ROL64in128(A##gu, 20); \ - XOReq128(A##ka, Da); \ - Bgi = ROL64in128(A##ka, 3); \ - E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ - XOReq128(Ca, E##ga); \ - XOReq128(A##me, De); \ - Bgo = ROL64in128(A##me, 45); \ - E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ - XOReq128(Ce, E##ge); \ - XOReq128(A##si, Di); \ - Bgu = ROL64in128(A##si, 61); \ - E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ - XOReq128(Ci, E##gi); \ - E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ - XOReq128(Co, E##go); \ - E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ - XOReq128(Cu, E##gu); \ -\ - XOReq128(A##be, De); \ - Bka = ROL64in128(A##be, 1); \ - XOReq128(A##gi, Di); \ - Bke = ROL64in128(A##gi, 6); \ - XOReq128(A##ko, Do); \ - Bki = ROL64in128(A##ko, 25); \ - E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ - XOReq128(Ca, E##ka); \ - XOReq128(A##mu, Du); \ - Bko = ROL64in128_8(A##mu); \ - E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ - XOReq128(Ce, E##ke); \ - XOReq128(A##sa, Da); \ - Bku = ROL64in128(A##sa, 18); \ - E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \ - XOReq128(Ci, E##ki); \ - E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ - XOReq128(Co, E##ko); \ - E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ - XOReq128(Cu, E##ku); \ -\ - XOReq128(A##bu, Du); \ - Bma = ROL64in128(A##bu, 27); \ - XOReq128(A##ga, Da); \ - Bme = ROL64in128(A##ga, 36); \ - XOReq128(A##ke, De); \ - Bmi = ROL64in128(A##ke, 10); \ - E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ - XOReq128(Ca, E##ma); \ - XOReq128(A##mi, Di); \ - Bmo = ROL64in128(A##mi, 15); \ - E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ - XOReq128(Ce, E##me); \ - XOReq128(A##so, Do); \ - Bmu = ROL64in128_56(A##so); \ - E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ - XOReq128(Ci, E##mi); \ - E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ - XOReq128(Co, E##mo); \ - E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ - XOReq128(Cu, E##mu); \ -\ - XOReq128(A##bi, Di); \ - Bsa = ROL64in128(A##bi, 62); \ - XOReq128(A##go, Do); \ - Bse = ROL64in128(A##go, 55); \ - XOReq128(A##ku, Du); \ - Bsi = ROL64in128(A##ku, 39); \ - E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ - XOReq128(Ca, E##sa); \ - XOReq128(A##ma, Da); \ - Bso = ROL64in128(A##ma, 41); \ - E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ - XOReq128(Ce, E##se); \ - XOReq128(A##se, De); \ - Bsu = ROL64in128(A##se, 2); \ - E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \ - XOReq128(Ci, E##si); \ - E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ - XOReq128(Co, E##so); \ - E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ - XOReq128(Cu, E##su); \ -\ - -/* --- Theta Rho Pi Chi Iota */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIota(i, A, E) \ - Da = XOR128(Cu, ROL64in128(Ce, 1)); \ - De = XOR128(Ca, ROL64in128(Ci, 1)); \ - Di = XOR128(Ce, ROL64in128(Co, 1)); \ - Do = XOR128(Ci, ROL64in128(Cu, 1)); \ - Du = XOR128(Co, ROL64in128(Ca, 1)); \ -\ - XOReq128(A##ba, Da); \ - Bba = A##ba; \ - XOReq128(A##ge, De); \ - Bbe = ROL64in128(A##ge, 44); \ - XOReq128(A##ki, Di); \ - Bbi = ROL64in128(A##ki, 43); \ - E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ - XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \ - XOReq128(A##mo, Do); \ - Bbo = ROL64in128(A##mo, 21); \ - E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ - XOReq128(A##su, Du); \ - Bbu = ROL64in128(A##su, 14); \ - E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ - E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ - E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ -\ - XOReq128(A##bo, Do); \ - Bga = ROL64in128(A##bo, 28); \ - XOReq128(A##gu, Du); \ - Bge = ROL64in128(A##gu, 20); \ - XOReq128(A##ka, Da); \ - Bgi = ROL64in128(A##ka, 3); \ - E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ - XOReq128(A##me, De); \ - Bgo = ROL64in128(A##me, 45); \ - E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ - XOReq128(A##si, Di); \ - Bgu = ROL64in128(A##si, 61); \ - E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ - E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ - E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ -\ - XOReq128(A##be, De); \ - Bka = ROL64in128(A##be, 1); \ - XOReq128(A##gi, Di); \ - Bke = ROL64in128(A##gi, 6); \ - XOReq128(A##ko, Do); \ - Bki = ROL64in128(A##ko, 25); \ - E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ - XOReq128(A##mu, Du); \ - Bko = ROL64in128_8(A##mu); \ - E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ - XOReq128(A##sa, Da); \ - Bku = ROL64in128(A##sa, 18); \ - E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \ - E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ - E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ -\ - XOReq128(A##bu, Du); \ - Bma = ROL64in128(A##bu, 27); \ - XOReq128(A##ga, Da); \ - Bme = ROL64in128(A##ga, 36); \ - XOReq128(A##ke, De); \ - Bmi = ROL64in128(A##ke, 10); \ - E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ - XOReq128(A##mi, Di); \ - Bmo = ROL64in128(A##mi, 15); \ - E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ - XOReq128(A##so, Do); \ - Bmu = ROL64in128_56(A##so); \ - E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ - E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ - E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ -\ - XOReq128(A##bi, Di); \ - Bsa = ROL64in128(A##bi, 62); \ - XOReq128(A##go, Do); \ - Bse = ROL64in128(A##go, 55); \ - XOReq128(A##ku, Du); \ - Bsi = ROL64in128(A##ku, 39); \ - E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ - XOReq128(A##ma, Da); \ - Bso = ROL64in128(A##ma, 41); \ - E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ - XOReq128(A##se, De); \ - Bsu = ROL64in128(A##se, 2); \ - E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \ - E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ - E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ -\ - -#define initializeState(X) \ - X##ba = ZERO(); \ - X##be = ZERO(); \ - X##bi = ZERO(); \ - X##bo = ZERO(); \ - X##bu = ZERO(); \ - X##ga = ZERO(); \ - X##ge = ZERO(); \ - X##gi = ZERO(); \ - X##go = ZERO(); \ - X##gu = ZERO(); \ - X##ka = ZERO(); \ - X##ke = ZERO(); \ - X##ki = ZERO(); \ - X##ko = ZERO(); \ - X##ku = ZERO(); \ - X##ma = ZERO(); \ - X##me = ZERO(); \ - X##mi = ZERO(); \ - X##mo = ZERO(); \ - X##mu = ZERO(); \ - X##sa = ZERO(); \ - X##se = ZERO(); \ - X##si = ZERO(); \ - X##so = ZERO(); \ - X##su = ZERO(); \ - -#define XORdata16(X, data0, data1) \ - XOReq128(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ - XOReq128(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ - XOReq128(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ - XOReq128(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ - XOReq128(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \ - XOReq128(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \ - XOReq128(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \ - XOReq128(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \ - XOReq128(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \ - XOReq128(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \ - XOReq128(X##ka, LOAD6464((data1)[10], (data0)[10])); \ - XOReq128(X##ke, LOAD6464((data1)[11], (data0)[11])); \ - XOReq128(X##ki, LOAD6464((data1)[12], (data0)[12])); \ - XOReq128(X##ko, LOAD6464((data1)[13], (data0)[13])); \ - XOReq128(X##ku, LOAD6464((data1)[14], (data0)[14])); \ - XOReq128(X##ma, LOAD6464((data1)[15], (data0)[15])); \ - -#define XORdata21(X, data0, data1) \ - XORdata16(X, data0, data1) \ - XOReq128(X##me, LOAD6464((data1)[16], (data0)[16])); \ - XOReq128(X##mi, LOAD6464((data1)[17], (data0)[17])); \ - XOReq128(X##mo, LOAD6464((data1)[18], (data0)[18])); \ - XOReq128(X##mu, LOAD6464((data1)[19], (data0)[19])); \ - XOReq128(X##sa, LOAD6464((data1)[20], (data0)[20])); \ - -#if ((defined(KeccakP1600times2_SSSE3_fullUnrolling)) || (KeccakP1600times2_SSSE3_unrolling == 12)) -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (KeccakP1600times2_SSSE3_unrolling == 6) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#elif (KeccakP1600times2_SSSE3_unrolling == 4) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#elif (KeccakP1600times2_SSSE3_unrolling == 2) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#else -#error "KeccakP1600times2_SSSE3_unrolling is not correctly specified!" -#endif - -#define chunkSize 8192 -#define rateInBytes (21*8) - -void KangarooTwelve_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output) -{ - declareABCDE - #ifndef KeccakP1600times2_SSSE3_fullUnrolling - unsigned int i; - #endif - unsigned int j; - - initializeState(A); - - for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { - XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); - rounds12 - input += rateInBytes; - } - - XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); - XOReq128(Ame, _mm_set1_epi64x(0x0BULL)); - XOReq128(Asa, _mm_set1_epi64x(0x8000000000000000ULL)); - rounds12 - - STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( Aba, Abe ) ); - STORE128u( *(__m128i*)&(output[16]), UNPACKL( Abi, Abo ) ); - STORE128u( *(__m128i*)&(output[32]), UNPACKH( Aba, Abe ) ); - STORE128u( *(__m128i*)&(output[48]), UNPACKH( Abi, Abo ) ); -} diff --git a/benches/kangarootwelve/K12/lib/brg_endian.h b/benches/kangarootwelve/K12/lib/brg_endian.h deleted file mode 100644 index 7c640b9..0000000 --- a/benches/kangarootwelve/K12/lib/brg_endian.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The redistribution and use of this software (with or without changes) - is allowed without the payment of fees or royalties provided that: - - 1. source code distributions include the above copyright notice, this - list of conditions and the following disclaimer; - - 2. binary distributions include the above copyright notice, this list - of conditions and the following disclaimer in their documentation; - - 3. the name of the copyright holder is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue Date: 20/12/2007 - Changes for ARM 9/9/2010 -*/ - -#ifndef _BRG_ENDIAN_H -#define _BRG_ENDIAN_H - -#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ -#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ - -#if 0 -/* Include files where endian defines and byteswap functions may reside */ -#if defined( __sun ) -# include -#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) -# include -#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ - defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) -# include -#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) -# if !defined( __MINGW32__ ) && !defined( _AIX ) -# include -# if !defined( __BEOS__ ) -# include -# endif -# endif -#endif -#endif - -/* Now attempt to set the define for platform byte order using any */ -/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ -/* seem to encompass most endian symbol definitions */ - -#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) -# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) -# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( _BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( _LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) -# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) -# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -/* if the platform byte order could not be determined, then try to */ -/* set this define using common machine defines */ -#if !defined(PLATFORM_BYTE_ORDER) - -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( __VMS ) || defined( _M_X64 ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN - -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ - defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ - defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) || \ - defined( __s390__ ) || defined( __s390x__ ) || defined( __zarch__ ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN - -#elif defined(__arm__) -# ifdef __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# else -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif 1 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#else -# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order -#endif - -#endif - -#endif diff --git a/benches/kangarootwelve/K12/support/Build/ExpandProducts.xsl b/benches/kangarootwelve/K12/support/Build/ExpandProducts.xsl deleted file mode 100644 index 9d3f64f..0000000 --- a/benches/kangarootwelve/K12/support/Build/ExpandProducts.xsl +++ /dev/null @@ -1,76 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - diff --git a/benches/kangarootwelve/K12/support/Build/ToGlobalMakefile.xsl b/benches/kangarootwelve/K12/support/Build/ToGlobalMakefile.xsl deleted file mode 100644 index c7fe1b0..0000000 --- a/benches/kangarootwelve/K12/support/Build/ToGlobalMakefile.xsl +++ /dev/null @@ -1,182 +0,0 @@ - - - - - - - - - - - - - - - - - - : - - - - - .packs: - - - - - - .PHONY: - - - - - - - - - - - - : - - - - - - : - - - $(MAKE) -f - - - - - - - - : - - - $(MAKE) -f - - - - - - - - : - - - - - - : - - - mkdir -p $(dir $@) - xsltproc -o $@ support/Build/ToVCXProj.xsl - - - - - - : - - support/Build/ToTargetMakefile.xsl - mkdir -p $(dir $@) - xsltproc -o $@ support/Build/ToTargetMakefile.xsl $< - - - - : support/Build/ToOneTarget.xsl bin/.build/Makefile.expanded Makefile.build - mkdir -p $(dir $@) - xsltproc -o $@ -param nameTarget "' - - '" support/Build/ToOneTarget.xsl bin/.build/Makefile.expanded - - - - - - - .PHONY: - - - .packs - - - - - : - - - - .packs - : - .packs - - - - - - - - - - - @echo "+ - - [.packs]" - - - - - - - @echo "- - - [.pack|.vcxproj]" - - - - -.PHONY: _list -_list: - @echo "The defined targets (-) and groups of targets (+) are:" - - - @echo "+ - - [.packs]" - - - - - - - - .PHONY: - - - .packs - - - - - - - - diff --git a/benches/kangarootwelve/K12/support/Build/ToOneTarget.xsl b/benches/kangarootwelve/K12/support/Build/ToOneTarget.xsl deleted file mode 100644 index 0d230e4..0000000 --- a/benches/kangarootwelve/K12/support/Build/ToOneTarget.xsl +++ /dev/null @@ -1,86 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/benches/kangarootwelve/K12/support/Build/ToTargetMakefile.xsl b/benches/kangarootwelve/K12/support/Build/ToTargetMakefile.xsl deleted file mode 100644 index 5e01231..0000000 --- a/benches/kangarootwelve/K12/support/Build/ToTargetMakefile.xsl +++ /dev/null @@ -1,208 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - CFLAGS := $(CFLAGS) - - - - - - - - CFLAGS := $(CFLAGS) -D - - ="" - - - - - - - - CFLAGS := $(CFLAGS) -I - - - - - - - - - HEADERS := $(HEADERS) - - - - SOURCES := $(SOURCES) - - - - - - - - - - SOURCES := $(SOURCES) - - - - - $(BINDIR)/ - - - - .o - - - : - - $(HEADERS) - $(CC) $(INCLUDES) $(CFLAGS) - - -c $< -o $@ -OBJECTS := $(OBJECTS) - - - - - - - - - - - - - - all: - - - - - : - - - - - .pack: - - - - - BINDIR = bin/.build/ - - -$(BINDIR): - mkdir -p $(BINDIR) - -MAKE ?= gmake -CC ?= gcc -AR = ar - - - - - CFLAGS := $(CFLAGS) -fpic - - - - - - - - bin/ - - : $(BINDIR) $(OBJECTS) - mkdir -p $(dir $@) - - - - - mkdir -p $@.headers - cp -f $(HEADERS) $@.headers/ - $(AR) rcsv $@ $(OBJECTS) - - - - mkdir -p $@.headers - cp -f $(HEADERS) $@.headers/ - $(CC) -shared -o $@ $(OBJECTS) $(CFLAGS) - - - - $(CC) -o $@ $(OBJECTS) $(CFLAGS) - - - - - - - - - : $(SOURCES) - mkdir -p bin/.pack/ - - - rm -rf bin/.pack/ - - /* - cp $(SOURCES) bin/.pack/ - - / - cd bin/.pack/ ; tar -czf - - - - /* - - - - - diff --git a/benches/kangarootwelve/K12/support/Build/ToVCXProj.xsl b/benches/kangarootwelve/K12/support/Build/ToVCXProj.xsl deleted file mode 100644 index 03aaaf5..0000000 --- a/benches/kangarootwelve/K12/support/Build/ToVCXProj.xsl +++ /dev/null @@ -1,131 +0,0 @@ - - - - - - - - - - - - - Debug - Win32 - - - Release - Win32 - - - - {6F1C9407-7A01-444D-A07B-7DAE147F22A1} - XKCP - - - - Application - true - v110 - MultiByte - - - Application - false - v110 - true - MultiByte - - - - - - - - - - - - - $(SolutionDir)$(ProjectName)\$(Configuration)\ - $(SolutionDir)$(ProjectName)\$(Configuration)\ - - - $(SolutionDir)$(ProjectName)\$(Configuration)\ - $(SolutionDir)$(ProjectName)\$(Configuration)\ - - - - Level3 - Disabled - - - - true - - - - - Level3 - MaxSpeed - true - true - - - - true - true - true - - - - - - - - - - - - - - - - - $(ProjectDir)..\..\ - - ; - - - - - - - - - - - - - - - - - - - - - - diff --git a/benches/kangarootwelve/K12/tests/main.c b/benches/kangarootwelve/K12/tests/main.c deleted file mode 100644 index efd384d..0000000 --- a/benches/kangarootwelve/K12/tests/main.c +++ /dev/null @@ -1,70 +0,0 @@ -/* -Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, -Michaël Peeters, Gilles Van Assche and Ronny Van Keer, -hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to our website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#include -#include -#include -#include -#include "testKangarooTwelve.h" -#include "testPerformance.h" - -void printHelp() -{ - printf("Usage: KeccakTests command(s), where the commands can be\n"); - printf(" --help or -h To display this page\n"); - printf(" --all or -a All tests\n"); - printf(" --KangarooTwelve or -K12 Tests on KangarooTwelve\n"); - printf(" --speed or -s Speed measuresments\n"); -} - -int process(int argc, char* argv[]) -{ - int i; - int help = 0; - int KangarooTwelve = 0; - int speed = 0; - - if (argc == 1) - help = 1; - - for(i=1; i -#endif -#include -#include - -#if defined(EMBEDDED) -static void assert(int condition) -{ - if (!condition) - { - for ( ; ; ) ; - } -} -#else -#include -#endif - -static void generateSimpleRawMaterial(unsigned char* data, unsigned int length, unsigned char seed1, unsigned int seed2) -{ - unsigned int i; - - for(i=0; i> (8-seed2)); - byte = seed1 + 161*length - iRolled + i; - data[i] = byte; - } -} - -static void performTestKangarooTwelveOneInput(unsigned int inputLen, unsigned int outputLen, unsigned int customLen, KangarooTwelve_Instance *pSpongeChecksum, unsigned int mode, unsigned int useSqueeze) -{ - unsigned char input[inputByteSize]; - unsigned char output[outputByteSize]; - unsigned char customization[customizationByteSize]; - int result; - unsigned int i; - - generateSimpleRawMaterial(customization, customizationByteSize, customLen, 97); - generateSimpleRawMaterial(input, inputLen, outputLen, inputLen + customLen); - - #ifdef VERBOSE - printf( "outputLen %5u, inputLen %5u, customLen %3u\n", outputLen, inputLen, customLen); - #endif - if (!useSqueeze) - { - if (mode == 0) - { - /* Input/Output full size in one call */ - result = KangarooTwelve( input, inputLen, output, outputLen, customization, customLen ); - assert(result == 0); - } - else if (mode == 1) - { - /* Input/Output one byte per call */ - KangarooTwelve_Instance kt; - result = KangarooTwelve_Initialize(&kt, outputLen); - assert(result == 0); - for (i = 0; i < inputLen; ++i ) - { - result = KangarooTwelve_Update(&kt, input + i, 1); - assert(result == 0); - } - result = KangarooTwelve_Final(&kt, output, customization, customLen ); - assert(result == 0); - } - else if (mode == 2) - { - /* Input/Output random number of bytes per call */ - KangarooTwelve_Instance kt; - unsigned char *pInput = input; - result = KangarooTwelve_Initialize(&kt, outputLen); - assert(result == 0); - while (inputLen) - { - unsigned int len = ((rand() << 15) ^ rand()) % (inputLen + 1); - result = KangarooTwelve_Update(&kt, pInput, len); - assert(result == 0); - pInput += len; - inputLen -= len; - } - result = KangarooTwelve_Final(&kt, output, customization, customLen); - assert(result == 0); - } - } - else - { - if (mode == 0) - { - KangarooTwelve_Instance kt; - result = KangarooTwelve_Initialize(&kt, 0); - assert(result == 0); - result = KangarooTwelve_Update(&kt, input, inputLen); - assert(result == 0); - result = KangarooTwelve_Final(&kt, 0, customization, customLen); - assert(result == 0); - result = KangarooTwelve_Squeeze(&kt, output, outputLen); - assert(result == 0); - } - else if (mode == 1) - { - KangarooTwelve_Instance kt; - result = KangarooTwelve_Initialize(&kt, 0); - assert(result == 0); - result = KangarooTwelve_Update(&kt, input, inputLen); - assert(result == 0); - result = KangarooTwelve_Final(&kt, 0, customization, customLen); - assert(result == 0); - - for (i = 0; i < outputLen; ++i) - { - result = KangarooTwelve_Squeeze(&kt, output + i, 1); - assert(result == 0); - } - } - else if (mode == 2) - { - KangarooTwelve_Instance kt; - unsigned int len; - result = KangarooTwelve_Initialize(&kt, 0); - assert(result == 0); - result = KangarooTwelve_Update(&kt, input, inputLen); - assert(result == 0); - result = KangarooTwelve_Final(&kt, 0, customization, customLen); - assert(result == 0); - - for (i = 0; i < outputLen; i += len) - { - len = ((rand() << 15) ^ rand()) % ((outputLen-i) + 1); - result = KangarooTwelve_Squeeze(&kt, output+i, len); - assert(result == 0); - } - } - } - - #ifdef VERBOSE - { - unsigned int i; - - printf("KangarooTwelve\n"); - printf("Input of %d bytes:", inputLen); - for(i=0; (i 16) - printf(" ..."); - printf("\n"); - printf("Output of %d bytes:", outputLen); - for(i=0; i -#include -#include -#include -#include "KangarooTwelve.h" -#include "timing.h" -#include "testPerformance.h" - -void displayMeasurements1101001000(uint_32t *measurements, uint_32t *laneCounts, unsigned int numberOfColumns, unsigned int laneLengthInBytes); - -#define xstr(s) str(s) -#define str(s) #s - -uint_32t measureKangarooTwelve(uint_32t dtMin, unsigned int inputLen) -{ - ALIGN(32) unsigned char input[2*1024*1024]; - ALIGN(32) unsigned char output[32]; - measureTimingDeclare - - assert(inputLen <= 2*1024*1024); - - memset(input, 0xA5, 16); - - measureTimingBeginDeclared - KangarooTwelve(input, inputLen, output, 32, (const unsigned char *)"", 0); - measureTimingEnd -} - -void KangarooTwelve_SetProcessorCapabilities(); - -void printKangarooTwelvePerformanceHeader( void ) -{ - KangarooTwelve_SetProcessorCapabilities(); - printf("*** KangarooTwelve ***\n"); - printf("Using Keccak-p[1600,12] implementations:\n"); - printf("- \303\2271: %s\n", KeccakP1600_GetImplementation()); - #if defined(KeccakP1600_12rounds_FastLoop_supported) - printf(" + KeccakP1600_12rounds_FastLoop_Absorb()\n"); - #endif - -#ifndef KeccakP1600_disableParallelism - if (KeccakP1600times2_IsAvailable()) { - printf("- \303\2272: %s\n", KeccakP1600times2_GetImplementation()); - #if defined(KeccakP1600times2_12rounds_FastLoop_supported) - printf(" + KeccakP1600times2_12rounds_FastLoop_Absorb()\n"); - #endif - } - else - printf("- \303\2272: not used\n"); - - if (KeccakP1600times4_IsAvailable()) { - printf("- \303\2274: %s\n", KeccakP1600times4_GetImplementation()); - #if defined(KeccakP1600times4_12rounds_FastLoop_supported) - printf(" + KeccakP1600times4_12rounds_FastLoop_Absorb()\n"); - #endif - } - else - printf("- \303\2274: not used\n"); - - if (KeccakP1600times8_IsAvailable()) { - printf("- \303\2278: %s\n", KeccakP1600times8_GetImplementation()); - #if defined(KeccakP1600times8_12rounds_FastLoop_supported) - printf(" + KeccakP1600times8_12rounds_FastLoop_Absorb()\n"); - #endif - } - else - printf("- \303\2278: not used\n"); -#endif - - printf("\n"); -} - -void testKangarooTwelvePerformanceOne( void ) -{ - const unsigned int chunkSize = 8192; - unsigned halfTones; - uint_32t calibration = calibrate(); - unsigned int chunkSizeLog = (unsigned int)floor(log(chunkSize)/log(2.0)+0.5); - int displaySlope = 0; - - measureKangarooTwelve(calibration, 500000); - for(halfTones=chunkSizeLog*12-28; halfTones<=13*12; halfTones+=4) { - double I = pow(2.0, halfTones/12.0); - unsigned int i = (unsigned int)floor(I+0.5); - uint_32t time, timePlus1Block, timePlus2Blocks, timePlus4Blocks, timePlus8Blocks; - uint_32t timePlus84Blocks, timePlus168Blocks; - time = measureKangarooTwelve(calibration, i); - if (i == chunkSize) { - displaySlope = 1; - timePlus1Block = measureKangarooTwelve(calibration, i+1*chunkSize); - timePlus2Blocks = measureKangarooTwelve(calibration, i+2*chunkSize); - timePlus4Blocks = measureKangarooTwelve(calibration, i+4*chunkSize); - timePlus8Blocks = measureKangarooTwelve(calibration, i+8*chunkSize); - timePlus84Blocks = measureKangarooTwelve(calibration, i+84*chunkSize); - timePlus168Blocks = measureKangarooTwelve(calibration, i+168*chunkSize); - } - printf("%8d bytes: %9d cycles, %6.3f cycles/byte\n", i, time, time*1.0/i); - if (displaySlope) { - printf(" +1 block: %9d cycles, %6.3f cycles/byte (slope)\n", timePlus1Block, (timePlus1Block-(double)(time))*1.0/chunkSize/1.0); - printf(" +2 blocks: %9d cycles, %6.3f cycles/byte (slope)\n", timePlus2Blocks, (timePlus2Blocks-(double)(time))*1.0/chunkSize/2.0); - printf(" +4 blocks: %9d cycles, %6.3f cycles/byte (slope)\n", timePlus4Blocks, (timePlus4Blocks-(double)(time))*1.0/chunkSize/4.0); - printf(" +8 blocks: %9d cycles, %6.3f cycles/byte (slope)\n", timePlus8Blocks, (timePlus8Blocks-(double)(time))*1.0/chunkSize/8.0); - printf(" +84 blocks: %9d cycles, %6.3f cycles/byte (slope)\n", timePlus84Blocks, (timePlus84Blocks-(double)(time))*1.0/chunkSize/84.0); - printf(" +168 blocks: %9d cycles, %6.3f cycles/byte (slope)\n", timePlus168Blocks, (timePlus168Blocks-(double)(time))*1.0/chunkSize/168.0); - displaySlope = 0; - } - } - for(halfTones=12*12; halfTones<=20*12; halfTones+=4) { - double I = chunkSize + pow(2.0, halfTones/12.0); - unsigned int i = (unsigned int)floor(I+0.5); - uint_32t time; - time = measureKangarooTwelve(calibration, i); - printf("%8d bytes: %9d cycles, %6.3f cycles/byte\n", i, time, time*1.0/i); - } - printf("\n\n"); -} - -void testKangarooTwelvePerformance() -{ - printKangarooTwelvePerformanceHeader(); - testKangarooTwelvePerformanceOne(); -} -void testPerformance() -{ - testKangarooTwelvePerformance(); -} - -void bubbleSort(double *list, unsigned int size) -{ - unsigned int n = size; - - do { - unsigned int newn = 0; - unsigned int i; - - for(i=1; i list[i]) { - double temp = list[i-1]; - list[i-1] = list[i]; - list[i] = temp; - newn = i; - } - } - n = newn; - } - while(n > 0); -} - -double med4(double x0, double x1, double x2, double x3) -{ - double list[4]; - list[0] = x0; - list[1] = x1; - list[2] = x2; - list[3] = x3; - bubbleSort(list, 4); - if (fabs(list[2]-list[0]) < fabs(list[3]-list[1])) - return 0.25*list[0]+0.375*list[1]+0.25*list[2]+0.125*list[3]; - else - return 0.125*list[0]+0.25*list[1]+0.375*list[2]+0.25*list[3]; -} - -void displayMeasurements1101001000(uint_32t *measurements, uint_32t *laneCounts, unsigned int numberOfColumns, unsigned int laneLengthInBytes) -{ - double cpb[4]; - unsigned int i; - - for(i=0; i -Reply-To: hash-forum at nist.gov -To: Multiple recipients of list - -Sorry for the earlier empty email. I pushed send by mistake while starting my message. - -Yes, it's a real shame that C doesn't have a standard way to do this. Below is some code that you are free to copy if you wish I have used variants of this function for years, all the way back to AES days, and the code is entirely mine, so I hereby release it to the public domain. If you keep reading below, I also give some concrete suggestions on how to use it. - -This code works on x86 family CPUs (32-big and 64-bit), under MSVC, gcc, and BorlandC, including older compiler versions where the __rdtsc() function is not defined. It also checks for ANSI compiles (i.e., -ansi using gcc, /Za using MSVC, and -A using Borland) and disables the call, to avoid compile-time warnings/errors. The function HiResTime() currently returns only 32 bits, mostly for historical reasons. However, that's enough to do most timing measurements, and you could easily enhance it to return 64 bits if desired. I normally compile with multiple compilers -- e.g., three versions of MSVC (v4.2, v6.0 and v9.0), at least two versions of gcc, plus Borland -- and take performance measurements on all of them. - -[…] - -*/ - -/************** Timing routine (for performance measurements) ***********/ -/* By Doug Whiting */ -/* unfortunately, this is generally assembly code and not very portable */ -#if defined(_M_IX86) || defined(__i386) || defined(_i386) || defined(__i386__) || defined(i386) || \ - defined(_X86_) || defined(__x86_64__) || defined(_M_X64) || defined(__x86_64) -#define _Is_X86_ 1 -#endif - -#if defined(_Is_X86_) && (!defined(__STRICT_ANSI__)) && (defined(__GNUC__) || !defined(__STDC__)) && \ - (defined(__BORLANDC__) || defined(_MSC_VER) || defined(__MINGW_H) || defined(__GNUC__)) -#define HI_RES_CLK_OK 1 /* it's ok to use RDTSC opcode */ - -#if defined(_MSC_VER) /* && defined(_M_X64) */ -#include -#pragma intrinsic(__rdtsc) /* use MSVC rdtsc call where defined */ -#endif - -#endif - -typedef unsigned int uint_32t; - -static uint_32t HiResTime(void) /* return the current value of time stamp counter */ - { -#if defined(HI_RES_CLK_OK) - uint_32t x[2]; -#if defined(__BORLANDC__) -#define COMPILER_ID "BCC" - __emit__(0x0F,0x31); /* RDTSC instruction */ - _asm { mov x[0],eax }; -#elif defined(_MSC_VER) -#define COMPILER_ID "MSC" -#if defined(_MSC_VER) /* && defined(_M_X64) */ - x[0] = (uint_32t) __rdtsc(); -#else - _asm { _emit 0fh }; _asm { _emit 031h }; - _asm { mov x[0],eax }; -#endif -#elif defined(__MINGW_H) || defined(__GNUC__) -#define COMPILER_ID "GCC" - asm volatile("rdtsc" : "=a"(x[0]), "=d"(x[1])); -#else -#error "HI_RES_CLK_OK -- but no assembler code for this platform (?)" -#endif - return x[0]; -#else - /* avoid annoying MSVC 9.0 compiler warning #4720 in ANSI mode! */ -#if (!defined(_MSC_VER)) || (!defined(__STDC__)) || (_MSC_VER < 1300) - FatalError("No support for RDTSC on this CPU platform\n"); -#endif - return 0; -#endif /* defined(HI_RES_CLK_OK) */ - } - -#define TIMER_SAMPLE_CNT (100) - -static uint_32t calibrate() -{ - uint_32t dtMin = 0xFFFFFFFF; /* big number to start */ - uint_32t t0,t1,i; - - for (i=0;i < TIMER_SAMPLE_CNT;i++) /* calibrate the overhead for measuring time */ - { - t0 = HiResTime(); - t1 = HiResTime(); - if (dtMin > t1-t0) /* keep only the minimum time */ - dtMin = t1-t0; - } - return dtMin; -} - -#define measureTimingDeclare \ - uint_32t tMin = 0xFFFFFFFF; \ - uint_32t t0,t1,i; - -#define measureTimingBeginDeclared \ - for (i=0;i < TIMER_SAMPLE_CNT;i++) \ - { \ - t0 = HiResTime(); - -#define measureTimingBegin \ - uint_32t tMin = 0xFFFFFFFF; \ - uint_32t t0,t1,i; \ - for (i=0;i < TIMER_SAMPLE_CNT;i++) \ - { \ - t0 = HiResTime(); - -#define measureTimingEnd \ - t1 = HiResTime(); \ - if (tMin > t1-t0 - dtMin) \ - tMin = t1-t0 - dtMin; \ - } \ - return tMin; diff --git a/benches/kangarootwelve/README.md b/benches/kangarootwelve/README.md deleted file mode 100644 index 4f391d7..0000000 --- a/benches/kangarootwelve/README.md +++ /dev/null @@ -1,10 +0,0 @@ -A Rust FFI wrapper around the [XKCP/K12](https://github.com/XKCP/K12) C -implementation of KangarooTwelve, which is vendored here (version -[ad51d21](https://github.com/XKCP/K12/commit/ad51d21e52dd2ffc1315d1a76a9cd229a23ebe5c), -2020-02-16) and statically linked. It's intended for testing and -benchmarking only. - -The build is hardcoded to use the `generic64` target, which includes -runtime feature detection for AVX2 and AVX-512. If you're on a 32-bit -machine or cross-compiling, you'll need to manually edit `build.rs` to -build the `generic32` target. diff --git a/benches/kangarootwelve/build.rs b/benches/kangarootwelve/build.rs deleted file mode 100644 index ecce11d..0000000 --- a/benches/kangarootwelve/build.rs +++ /dev/null @@ -1,19 +0,0 @@ -use std::env; -use std::path::PathBuf; -use std::process::Command; - -fn main() { - // For 32-bit targets, replace this with "generic32". - let target = "generic64"; - let manifest_dir: PathBuf = env::var("CARGO_MANIFEST_DIR").unwrap().into(); - let k12_dir = manifest_dir.join("K12"); - let build_dir = k12_dir.join(format!("bin/{}", target)); - let build_status = Command::new("make") - .arg(format!("{}/libk12.a", target)) - .current_dir(&k12_dir) - .status() - .unwrap(); - assert!(build_status.success()); - println!("cargo:rustc-link-search={}", build_dir.to_str().unwrap()); - println!("cargo:rustc-link-lib=static=k12"); -} diff --git a/benches/kangarootwelve/src/lib.rs b/benches/kangarootwelve/src/lib.rs deleted file mode 100644 index acfd0e8..0000000 --- a/benches/kangarootwelve/src/lib.rs +++ /dev/null @@ -1,55 +0,0 @@ -pub fn kangarootwelve(input: &[u8]) -> [u8; 32] { - let mut hash = [0u8; 32]; - let ret = unsafe { - KangarooTwelve( - input.as_ptr(), - input.len(), - hash.as_mut_ptr(), - hash.len(), - std::ptr::null(), - 0, - ) - }; - debug_assert_eq!(0, ret, "KangarooTwelve returned an error code"); - hash -} - -extern "C" { - #[doc = " Extendable ouput function KangarooTwelve."] - #[doc = " @param input Pointer to the input message (M)."] - #[doc = " @param inputByteLen The length of the input message in bytes."] - #[doc = " @param output Pointer to the output buffer."] - #[doc = " @param outputByteLen The desired number of output bytes."] - #[doc = " @param customization Pointer to the customization string (C)."] - #[doc = " @param customByteLen The length of the customization string in bytes."] - #[doc = " @return 0 if successful, 1 otherwise."] - fn KangarooTwelve( - input: *const ::std::os::raw::c_uchar, - inputByteLen: usize, - output: *mut ::std::os::raw::c_uchar, - outputByteLen: usize, - customization: *const ::std::os::raw::c_uchar, - customByteLen: usize, - ) -> ::std::os::raw::c_int; -} - -#[cfg(test)] -mod test { - use super::*; - - // from https://eprint.iacr.org/2016/770.pdf - #[test] - fn test_vectors() { - let empty_expected = "1ac2d450fc3b4205d19da7bfca1b37513c0803577ac7167f06fe2ce1f0ef39e5"; - let empty_output = kangarootwelve(&[]); - assert_eq!(empty_expected, hex::encode(&empty_output)); - - let seventeen_expected = "6bf75fa2239198db4772e36478f8e19b0f371205f6a9a93a273f51df37122888"; - let mut input = Vec::new(); - for i in 0..17u8 { - input.push(i); - } - let seventeen_output = kangarootwelve(&input); - assert_eq!(seventeen_expected, hex::encode(&seventeen_output)); - } -}