Skip to content

Commit

Permalink
Add AVX implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
chfast committed Feb 21, 2022
1 parent 7ce00d6 commit 8550d74
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 0 deletions.
1 change: 1 addition & 0 deletions test/benchmarks/benchmarks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ BENCHMARK_TEMPLATE(shift, uint256, uint64_t, shl_halves)->DenseRange(-1, 3);
// Alternative left-shift implementations from the experimental namespace. Each one is
// registered with the same `shift` benchmark template (uint256 value, uint64_t shift
// amount) and the same DenseRange(-1, 3) argument set, so their results are comparable.
BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_c)->DenseRange(-1, 3);
BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_e)->DenseRange(-1, 3);
BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_w)->DenseRange(-1, 3);
BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_avx)->DenseRange(-1, 3);
BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_bits_1)->DenseRange(-1, 3);
BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_bits_2)->DenseRange(-1, 3);
BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_bits_3)->DenseRange(-1, 3);
Expand Down
58 changes: 58 additions & 0 deletions test/experimental/shift.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,68 @@
// Licensed under the Apache License, Version 2.0.
#pragma once

#include <immintrin.h>
#include <intx/intx.hpp>

namespace intx::experimental
{
/// Shifts a 256-bit value left by a whole number of 64-bit words using AVX2.
///
/// @param x   The value to shift.
/// @param sw  Number of 64-bit words to shift by. Any value of 4 or more is
///            clamped to 4, which shifts every word out and yields zero.
/// @return    x shifted left by sw words, with the vacated low words set to zero.
inline uint256 shl_words_avx(const uint256& x, uint64_t sw) noexcept
{
    // One row of 32-bit lane indices per word shift (0..4). A non-negative entry
    // selects the source lane for that destination lane. A negative entry marks a
    // lane that must become zero: its sign bit is reused as the blend mask below.
    static constexpr int idxs[][8] = {
        {0, 1, 2, 3, 4, 5, 6, 7},
        {-1, -1, 0, 1, 2, 3, 4, 5},
        {-1, -1, -1, -1, 0, 1, 2, 3},
        {-1, -1, -1, -1, -1, -1, 0, 1},
        {-1, -1, -1, -1, -1, -1, -1, -1},
    };

    sw = (sw < 4) ? sw : 4;

    // Unaligned loads/stores on purpose: nothing here guarantees that the index
    // table, x or res sit on a 32-byte boundary, which the aligned variants require.
    const auto idx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(idxs[sw]));
    const auto a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&x));

    // Move the lanes up. Lanes with a negative index pick up a don't-care value here...
    const auto p = _mm256_permutevar8x32_epi32(a, idx);

    // ...and are replaced by zero here (blendv selects the second operand wherever
    // the mask lane has its sign bit set). The casts only reinterpret the register.
    const auto bf =
        _mm256_blendv_ps(_mm256_castsi256_ps(p), _mm256_setzero_ps(), _mm256_castsi256_ps(idx));
    const auto b = _mm256_castps_si256(bf);

    uint256 res;
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&res), b);

    return res;
}

/// Shifts a 256-bit value left by a bit count of at most one 64-bit word using AVX2.
///
/// @param x   The value to shift.
/// @param sb  Number of bits to shift by. The accompanying unit tests cover 0..64.
/// @return    x shifted left by sb bits.
inline uint256 shl_bits_avx(const uint256& x, uint64_t sb) noexcept
{
    const auto a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&x));

    // Build the "lower neighbour" vector: every 64-bit word is moved one position up
    // (w0, w0, w1, w2) and the lowest word is then cleared, giving (0, w0, w1, w2).
    const auto p = _mm256_permute4x64_epi64(a, 0b10010000);
    const auto b = _mm256_blend_epi32(p, _mm256_setzero_si256(), 0b11);

    // Bits carried in from the lower neighbour. For sb == 0 the count is 64, and a
    // shift count above 63 makes the instruction produce zero, i.e. no carry.
    const auto rcount = _mm_set_epi64x(0, static_cast<long long>(64 - sb));
    const auto c = _mm256_srl_epi64(b, rcount);

    // Each word shifted in place.
    const auto count = _mm_set_epi64x(0, static_cast<long long>(sb));
    const auto d = _mm256_sll_epi64(a, count);

    const auto e = _mm256_or_si256(c, d);

    uint256 res;
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&res), e);

    return res;
}

/// Full 256-bit left shift built from the two AVX helpers: the shift amount is split
/// into whole 64-bit words and the remaining bits, and the two stages are applied in
/// that order.
///
/// @param x      The value to shift.
/// @param shift  Total number of bits to shift by.
/// @return       The result of the word shift followed by the bit shift.
[[gnu::noinline]] inline uint256 shl_avx(const uint256& x, uint64_t shift) noexcept
{
    const uint64_t word_count = shift >> 6;  // shift / 64
    const uint64_t bit_count = shift & 63;   // shift % 64
    return shl_bits_avx(shl_words_avx(x, word_count), bit_count);
}

inline constexpr uint64_t shld(uint64_t x1, uint64_t x2, uint64_t c)
{
if (c == 0)
Expand Down
42 changes: 42 additions & 0 deletions test/unittests/test_bitwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,3 +347,45 @@ TYPED_TEST(uint_test, shift_against_mul)
auto y = a * s;
EXPECT_EQ(x, y);
}

// shl_words_avx must agree with the generic shift for every whole-word count and
// must return zero once all four words have been shifted out.
TEST(avx, shl_words)
{
    using experimental::shl_words_avx;

    const auto x = 0x18191a1b1c1d1e1f28292a2b2c2d2e2f38393a3b3c3d3e3f48494a4b4c4d4e4f_u256;

    // Shifting by zero words is the identity.
    EXPECT_EQ(shl_words_avx(x, 0), x);

    // Word counts that keep part of the value.
    for (uint64_t words = 1; words < 4; ++words)
        EXPECT_EQ(shl_words_avx(x, words), x << (words * 64));

    // Word counts that shift everything out, including a very large one.
    constexpr uint64_t overshooting[] = {4, 5, 123131231};
    for (const auto words : overshooting)
        EXPECT_EQ(shl_words_avx(x, words), 0);
}

// shl_bits_avx must agree with the generic shift for bit counts around the 32-bit
// lane boundary and at both ends of the 0..64 range.
TEST(avx, shl_bits)
{
    using experimental::shl_bits_avx;

    const auto x = 0x18191a1b1c1d1e1f28292a2b2c2d2e2f38393a3b3c3d3e3f48494a4b4c4d4e4f_u256;

    // Shifting by zero bits is the identity.
    EXPECT_EQ(shl_bits_avx(x, 0), x);

    constexpr uint64_t bit_counts[] = {1, 2, 3, 31, 32, 33, 63, 64};
    for (const auto bits : bit_counts)
        EXPECT_EQ(shl_bits_avx(x, bits), x << bits);
}

// The combined shl_avx must agree with the generic shift, including amounts that
// cross a 64-bit word boundary and the largest in-range shift.
TEST(avx, shl_avx)
{
    using experimental::shl_avx;

    const auto x = 0x18191a1b1c1d1e1f28292a2b2c2d2e2f38393a3b3c3d3e3f48494a4b4c4d4e4f_u256;

    // Shifting by zero is the identity.
    EXPECT_EQ(shl_avx(x, 0), x);

    constexpr uint64_t shifts[] = {1, 2, 3, 31, 32, 33, 63, 64, 65, 255};
    for (const auto shift : shifts)
        EXPECT_EQ(shl_avx(x, shift), x << shift);
}

0 comments on commit 8550d74

Please sign in to comment.