Skip to content

Commit

Permalink
[cilksan] Fix logic for combining reducer views to handle cases where…
Browse files Browse the repository at this point in the history
… reducers are registered and used in nested Cilk contexts, such as in nested parallel loops. Fix miscellaneous typos.
  • Loading branch information
neboat committed Jan 15, 2025
1 parent ea02835 commit 0a455ab
Show file tree
Hide file tree
Showing 3 changed files with 227 additions and 6 deletions.
30 changes: 25 additions & 5 deletions cilksan/reducers.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "cilksan_internal.h"
#include "debug_util.h"
#include "driver.h"
#include "vector.h"
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
Expand All @@ -17,6 +18,9 @@ static void reducer_register(const csi_id_t call_id, unsigned MAAP_count,
reducer_views->insert((hyper_table::bucket){
.key = (uintptr_t)key,
.value = {.view = key, .reduce_fn = (__cilk_reduce_fn)reduce_ptr}});
DBG_TRACE(REDUCER,
"reducer_register: registered %p, reducer_views %p, occupancy %d\n",
key, reducer_views, reducer_views->occupancy);
}

if (!is_execution_parallel())
Expand Down Expand Up @@ -58,6 +62,10 @@ CILKSAN_API void __csan_llvm_reducer_unregister(const csi_id_t call_id,

// Remove this reducer from the table.
if (hyper_table *reducer_views = CilkSanImpl.get_reducer_views()) {
DBG_TRACE(
REDUCER,
"reducer_unregister: unregistering %p, reducer_views %p, occupancy %d\n",
key, reducer_views, reducer_views->occupancy);
reducer_views->remove((uintptr_t)key);
}

Expand Down Expand Up @@ -139,10 +147,16 @@ void CilkSanImpl_t::reduce_local_views() {
// Reduce every reducer view in the table with its leftmost view.
int32_t capacity = reducer_views->capacity;
hyper_table::bucket *buckets = reducer_views->buckets;
bool holdsLeftmostViews = false;
Vector_t<int32_t> keysToRemove;
for (int32_t i = 0; i < capacity; ++i) {
hyper_table::bucket b = buckets[i];
if (!is_valid(b.key))
continue;
if (b.key == (uintptr_t)(b.value.view)) {
holdsLeftmostViews = true;
continue;
}

DBG_TRACE(REDUCER,
"reduce_local_views: found view to reduce at %d: %p -> %p\n", i,
Expand All @@ -154,14 +168,20 @@ void CilkSanImpl_t::reduce_local_views() {
// Delete the right view.
free(rb.view);
mark_free(rb.view);
keysToRemove.push_back(b.key);
}
enable_checking();

// Delete the table of local reducer views
DBG_TRACE(REDUCER, "reduce_local_views: delete reducer_views %p\n",
reducer_views);
delete reducer_views;
f->reducer_views = nullptr;
if (!holdsLeftmostViews) {
// Delete the table of local reducer views
DBG_TRACE(REDUCER, "reduce_local_views: delete reducer_views %p\n",
reducer_views);
delete reducer_views;
f->reducer_views = nullptr;
} else {
for (int32_t i = 0; i < keysToRemove.size(); ++i)
reducer_views->remove(buckets[keysToRemove[i]].key);
}
}

hyper_table *
Expand Down
2 changes: 1 addition & 1 deletion test/cilksan/TestCases/alloctypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ void global_test() {

// CHECK-GLOBAL: Race detected on location [[GLOBAL]]
// CHECK-GLOBAL: * Write {{[0-9a-f]+}} global_test
// CHECK-GLOBALOB: * Write {{[0-9a-f]+}} global_test
// CHECK-GLOBAL: * Write {{[0-9a-f]+}} global_test
// CHECK-GLOBAL: Common calling context
// CHECK-GLOBAL-NEXT: Parfor

Expand Down
201 changes: 201 additions & 0 deletions test/cilksan/TestCases/nested-loops-with-reducers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
// RUN: %clangxx_cilksan -std=c++20 -fopencilk -O3 -g %s -o %t
// RUN: %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,CILKSAN
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <random>

#include <cilk/cilk.h>

struct Scalar {
constexpr static uint64_t PRIME = 0xffffffff00000001ull;

// Scalar() { /* Deliberately skip initialization of _raw to allow more
// aggresive optimization */
// }

Scalar() = default;

explicit Scalar(uint64_t raw) : _raw{raw} {}

explicit operator uint64_t() const { return _raw; }

auto operator+(Scalar other) const -> Scalar {
const uint64_t sum = _raw + other._raw;
const Scalar ret{
sum < _raw || sum < other._raw || sum >= PRIME ? sum - PRIME : sum};
return ret;
}

auto operator-(Scalar other) const -> Scalar {
const uint64_t diff = _raw - other._raw;
const Scalar ret{(diff > _raw) ? diff + PRIME : diff};
return ret;
}

auto operator*(Scalar other) const -> Scalar {
// Start by carrying out an ordinary 64x64->128 bit multiplication
const uint32_t a0 = _raw, a1 = _raw >> 32;
const uint32_t b0 = other._raw, b1 = other._raw >> 32;
const uint64_t p0 = static_cast<uint64_t>(a0) * b0,
p1 = static_cast<uint64_t>(a0) * b1,
p2 = static_cast<uint64_t>(a1) * b0,
p3 = static_cast<uint64_t>(a1) * b1;
const uint32_t cy = ((p0 >> 32) + static_cast<uint32_t>(p1) +
static_cast<uint32_t>(p2)) >>
32;
const uint64_t x = p0 + (p1 << 32) + (p2 << 32),
y = p3 + (p1 >> 32) + (p2 >> 32) + cy;
// Store result in 4 32-bit words
const uint32_t c0 = x, c1 = x >> 32, c2 = y, c3 = y >> 32;
// Now perform reduction: modulus is phi^2 - phi + 1 where phi = 2^32
// ab = c0 + c1*phi + c2*phi^2 + c3*phi^3
// Exploit phi^2 = phi-1 and phi^3 = phi * (phi-1) = (phi-1) - phi = -1
// ab = c0 + c1*phi + c2*(phi-1) - c3
// = (c0-c2-c3) + (c1+c2)*phi
const Scalar ret = (Scalar(c0) - Scalar(c2) - Scalar(c3)) +
(Scalar(static_cast<uint64_t>(c1) << 32) +
Scalar(static_cast<uint64_t>(c2) << 32));
return ret;
}

auto operator==(Scalar other) const -> bool { return _raw == other._raw; }

auto operator!=(Scalar other) const -> bool { return _raw != other._raw; }

// No comparison operators as they make no sense for finite fields

auto operator+=(Scalar other) -> Scalar & { return *this = *this + other; }

auto operator-=(Scalar other) -> Scalar & { return *this = *this - other; }

auto operator*=(Scalar other) -> Scalar & { return *this = *this * other; }

template <typename RNG> inline static auto random(RNG &rng) -> Scalar {
static std::uniform_int_distribution<uint64_t> dist(0, PRIME - 1);
return Scalar{dist(rng)};
}

auto is_valid() const -> bool { return _raw < PRIME; }

private:
uint64_t _raw;
};

static inline void zero_scalar(void *view) {
*reinterpret_cast<Scalar *>(view) = Scalar{0};
}

static inline void add_scalar(void *left, void *right) {
*reinterpret_cast<Scalar *>(left) += *reinterpret_cast<Scalar *>(right);
}

using ScalarAddReducer = Scalar cilk_reducer(zero_scalar, add_scalar);

auto reduce_with_cilk(Scalar **as, Scalar **bs, Scalar *c, Scalar *coeffs,
size_t n, size_t m) -> std::array<Scalar, 3> {
ScalarAddReducer p0{0}, p2{0}, p3{0};
cilk_for (size_t i = 0; i < n; i++) {
// Obtain dense representations of the polynomials
const Scalar *a = as[i];
const Scalar *b = bs[i];
const size_t half = m / 2;

ScalarAddReducer lp0{0}, lp2{0}, lp3{0};
cilk_for (size_t j = 0; j < half; j++) {
lp0 += a[j] * b[j] * c[j];
const Scalar a2 = a[j + half] + a[j + half] - a[j],
b2 = b[j + half] + b[j + half] - b[j],
c2 = c[j + half] + c[j + half] - c[j];
lp2 += a2 * b2 * c2;
const Scalar a3 = a2 + a[j + half] - a[j],
b3 = b2 + b[j + half] - b[j],
c3 = c2 + c[j + half] - c[j];
lp3 += a3 * b3 * c3;
}
p0 += coeffs[i] * lp0;
p2 += coeffs[i] * lp2;
p3 += coeffs[i] * lp3;
}
return {p0, p2, p3};
}

auto reduce_serial(Scalar **as, Scalar **bs, Scalar *c, Scalar *coeffs,
size_t n, size_t m) -> std::array<Scalar, 3> {
Scalar p0{0}, p2{0}, p3{0};
for (size_t i = 0; i < n; i++) {
// Obtain dense representations of the polynomials
const Scalar *a = as[i];
const Scalar *b = bs[i];
const size_t half = m / 2;

Scalar lp0{0}, lp2{0}, lp3{0};
for (size_t j = 0; j < half; j++) {
lp0 += a[j] * b[j] * c[j];
const Scalar a2 = a[j + half] + a[j + half] - a[j],
b2 = b[j + half] + b[j + half] - b[j],
c2 = c[j + half] + c[j + half] - c[j];
lp2 += a2 * b2 * c2;
const Scalar a3 = a2 + a[j + half] - a[j],
b3 = b2 + b[j + half] - b[j],
c3 = c2 + c[j + half] - c[j];
lp3 += a3 * b3 * c3;
}

p0 += coeffs[i] * lp0;
p2 += coeffs[i] * lp2;
p3 += coeffs[i] * lp3;
}
return {p0, p2, p3};
}

auto main() -> int {
const size_t N = 12;
const size_t M = 128;

std::mt19937_64 rng{42};

Scalar *as[N];
Scalar *bs[N];
Scalar c[M];
Scalar coeffs[N];

for (size_t i = 0; i < N; i++) {
as[i] = new Scalar[M];
bs[i] = new Scalar[M];
coeffs[i] = Scalar::random(rng);
for (size_t j = 0; j < M; j++) {
as[i][j] = Scalar::random(rng);
bs[i][j] = Scalar::random(rng);
}
}
for (size_t i = 0; i < M; i++) {
c[i] = Scalar::random(rng);
}

const auto res_cilk = reduce_with_cilk(as, bs, c, coeffs, N, M);
const auto res_serial = reduce_serial(as, bs, c, coeffs, N, M);
if (res_cilk != res_serial) {
printf("res_cilk = %lu, %lu, %lu\n", static_cast<uint64_t>(res_cilk[0]),
static_cast<uint64_t>(res_cilk[1]),
static_cast<uint64_t>(res_cilk[2]));
printf("res_serial = %lu, %lu, %lu\n",
static_cast<uint64_t>(res_serial[0]),
static_cast<uint64_t>(res_serial[1]),
static_cast<uint64_t>(res_serial[2]));
}
for (size_t i = 0; i < N; i++) {
delete[] as[i];
delete[] bs[i];
}
return 0;
}

// NOLINTEND

// CHECK-NOT: res_cilk =
// CHECK-NOT: res_serial =

// CILKSAN: Cilksan detected 0 distinct races.
// CILKSAN-NEXT: Cilksan suppressed 0 duplicate race reports.

0 comments on commit 0a455ab

Please sign in to comment.