Commit

safe thrust cached mr

cuteday committed Apr 15, 2024
1 parent ed9c175 commit 48598dc
Showing 2 changed files with 80 additions and 57 deletions.
55 changes: 33 additions & 22 deletions src/core/device/thrust.h
@@ -23,51 +23,62 @@ class thrust_cached_resource final :
 
 private:
     typedef typename Upstream::pointer void_ptr;
-    typedef std::tuple<std::ptrdiff_t, std::size_t, void_ptr> block_t;
-    std::vector<block_t> blocks{};
+    using block_key_type = std::pair<std::ptrdiff_t, std::size_t>; // size, alignment
+    using free_blocks_container = std::multimap<block_key_type, void_ptr>;
+    using allocated_blocks_container = std::vector<std::pair<void_ptr, block_key_type>>;
+
+    free_blocks_container free_blocks;
+    allocated_blocks_container allocated_blocks;
     Upstream *m_upstream;
 
 public:
     void release() {
         Log(Info, "thrust_cached_resource::release()");
-        std::for_each(blocks.begin(), blocks.end(), [this](block_t &block) {
-            auto &[bytes, alignment, ptr] = block;
-            m_upstream->do_deallocate(ptr, bytes, alignment);
-        });
+        // Deallocate all outstanding blocks in both lists.
+        for (typename free_blocks_container::iterator i = free_blocks.begin(); i != free_blocks.end(); ++i)
+            m_upstream->do_deallocate(i->second, i->first.first, i->first.second);
+
+        for (typename allocated_blocks_container::iterator i = allocated_blocks.begin();
+             i != allocated_blocks.end(); ++i)
+            m_upstream->do_deallocate(i->first, i->second.first, i->second.second);
     }
 
     void_ptr do_allocate(std::size_t bytes, std::size_t alignment) override {
         Log(Info, "thrust_cached_resource::do_allocate(): num_bytes == %zu", bytes);
 
         void_ptr result = nullptr;
 
-        auto const fitting_block =
-            std::find_if(blocks.cbegin(), blocks.cend(), [bytes, alignment](block_t const &block) {
-                auto &[b_bytes, b_alignment, _] = block;
-                return b_bytes == bytes && b_alignment == alignment;
-            });
+        typename free_blocks_container::iterator free_block = free_blocks.find({bytes, alignment});
 
-        if (fitting_block != blocks.end()) {
+        if (free_block != free_blocks.end()) {
             Log(Info, "thrust_cached_resource::do_allocate(): found a free block of %zd bytes", bytes);
-            result = std::get<2>(*fitting_block);
+            result = free_block->second;
+            free_blocks.erase(free_block);
         } else {
             Log(Info, "thrust_cached_resource::do_allocate(): allocating new block of %zd bytes", bytes);
             result = m_upstream->do_allocate(bytes, alignment);
-            blocks.emplace_back(bytes, alignment, result);
         }
 
+        allocated_blocks.push_back(std::make_pair(result, block_key_type{bytes, alignment}));
         return result;
     }
 
     void do_deallocate(void_ptr ptr, std::size_t bytes, std::size_t alignment) override {
-        Log(Info, "thrust_cached_resource::do_deallocate(): ptr == %p",
-            reinterpret_cast<void *>(ptr.get()));
-        auto const fitting_block =
-            std::find_if(blocks.cbegin(), blocks.cend(),
-                         [ptr](block_t const &block) { return std::get<2>(block) == ptr; });
+        Log(Info, "thrust_cached_resource::do_deallocate(): ptr == %p", reinterpret_cast<void *>(ptr.get()));
 
-        if (fitting_block == blocks.end())
+        //typename allocated_blocks_container::iterator iter = allocated_blocks.find(ptr);
+        typename allocated_blocks_container::iterator iter = std::find_if(allocated_blocks.begin(),
+            allocated_blocks.end(), [ptr](const typename allocated_blocks_container::value_type& pair){
+                return pair.first == ptr; });
+        if (iter == allocated_blocks.end()) {
             Log(Error, "Pointer `%p` was not allocated by this allocator",
-                thrust::raw_pointer_cast(ptr));
+                reinterpret_cast<void *>(ptr.get()));
+            return;
+        }
+
+        block_key_type key = iter->second;
+
+        allocated_blocks.erase(iter);
+        free_blocks.insert(std::make_pair(key, ptr));
     }
 };

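Note on the new caching scheme (not part of the diff): do_allocate() first looks for a parked block with the same (size, alignment) key in free_blocks and only falls back to the upstream resource on a miss; do_deallocate() moves the block from allocated_blocks back into free_blocks instead of freeing it, and release() hands everything back upstream. A minimal sketch of that round trip, assuming a constructor (declared above this hunk) that records the upstream pointer in m_upstream; the function name, sizes, and constructor call below are illustrative only:

// Minimal sketch of the cache round trip (not code from the repository).
// Assumes a constructor, declared above this hunk, that stores the upstream
// pointer in m_upstream; the constructor call below is illustrative.
template <typename Upstream>
void cache_roundtrip_sketch(Upstream *upstream) {
    thrust_cached_resource<Upstream> cached(upstream);

    // First request of this (size, alignment) key: forwarded to the upstream.
    auto p0 = cached.do_allocate(1 << 20, 256);
    // Parked in free_blocks rather than freed upstream.
    cached.do_deallocate(p0, 1 << 20, 256);
    // Same key again: served from free_blocks, no upstream allocation.
    auto p1 = cached.do_allocate(1 << 20, 256);
    cached.do_deallocate(p1, 1 << 20, 256);

    // Hand every cached block back to the upstream resource.
    cached.release();
}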
82 changes: 47 additions & 35 deletions src/render/wavefront/integrator.cpp
@@ -3,6 +3,11 @@
 #include <thrust/sort.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_ptr.h>
+#include <thrust/functional.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/device_allocator.h>
+#include <thrust/mr/device_memory_resource.h>
 
 #include "device/cuda.h"
 #include "device/thrust.h"
@@ -108,47 +113,54 @@ void WavefrontPathTracer::handleMiss() {
 
 void WavefrontPathTracer::generateScatterRays(int depth) {
     PROFILE("Generate scatter rays");
-    static thrust::cuda::vector<ScatterRayKeyIndex> scatterRayKeys1;
-    if (scatterRayKeys1.size() != maxQueueSize) {
-        scatterRayKeys1.resize(maxQueueSize);
-        Log(Info, "resized ");
+    using MemRes = thrust::device_ptr_memory_resource<thrust_cached_resource<thrust::device_memory_resource>>;
+    using Alloc = thrust::mr::allocator<ScatterRayKeyIndex, MemRes>;
+    static std::unique_ptr<MemRes> memory;
+    static std::unique_ptr<Alloc> alloc;
+    if (!memory) {
+        memory = std::make_unique<MemRes>();
+        alloc = std::make_unique<Alloc>(memory.get());
     }
     {
         PROFILE("Sort scatter rays");
         auto *queue = scatterRayQueue;
         auto *auxBuffer = scatterRaySortBuffer;
         ScatterRayKeyIndex *keys = scatterRayKeys->data();
-        GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
-            if (index >= queue->size())
-                keys[index].key = std::numeric_limits<int64_t>::max();
-            else {
-                ScatterRayQueue::GetSetIndirector w = queue->operator[](index);
-                keys[index].key = static_cast<int64_t>(w.soa->intr.sd.bsdfType[w.i]);
-            }
-            keys[index].index = index;
-        }, KRR_DEFAULT_STREAM);
-        cudaMemcpyAsync(thrust::raw_pointer_cast(scatterRayKeys1.data()), keys,
-            maxQueueSize * sizeof(ScatterRayKeyIndex),
-            cudaMemcpyDeviceToDevice, KRR_DEFAULT_STREAM);
-        thrust::sort(thrust::cuda::par_nosync.on(KRR_DEFAULT_STREAM),
-            thrust::retag<cached_allocator_tag>(scatterRayKeys1.begin()),
-            thrust::retag<cached_allocator_tag>(scatterRayKeys1.end()),
-            //keys, keys + maxQueueSize,
-            [] KRR_DEVICE(const ScatterRayKeyIndex &a, const ScatterRayKeyIndex &b) {
-                return a.key < b.key;
-            });
-        //// sorted to auxiliary buffer
-        GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
-            if (index >= queue->size()) return;
-            ScatterRayQueue::GetSetIndirector w = queue->operator[](keys[index].index);
-            auxBuffer->operator[](index) = w.operator krr::ScatterRayWorkItem();
-        }, KRR_DEFAULT_STREAM);
-        // blit back
-        GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
-            if (index >= queue->size()) return;
-            queue->operator[](index) =
-                auxBuffer->operator[](index).operator krr::ScatterRayWorkItem();
-        }, KRR_DEFAULT_STREAM);
+        {
+            PROFILE("Update keys");
+            GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
+                if (index >= queue->size())
+                    keys[index].key = std::numeric_limits<int64_t>::max();
+                else {
+                    ScatterRayQueue::GetSetIndirector w = queue->operator[](index);
+                    keys[index].key = static_cast<int64_t>(w.soa->intr.sd.bsdfType[w.i]);
+                }
+                keys[index].index = index;
+            }, KRR_DEFAULT_STREAM);
+        }
+        {
+            PROFILE("Sort indices");
+            thrust::sort(thrust::cuda::par_nosync(*alloc).on(KRR_DEFAULT_STREAM),
+                keys, keys + maxQueueSize,
+                [] KRR_DEVICE(const ScatterRayKeyIndex &a, const ScatterRayKeyIndex &b) {
+                    return a.key < b.key;
+                });
+        }
+        {
+            PROFILE("Reorder and blit");
+            // sorted to auxiliary buffer
+            GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
+                if (index >= queue->size()) return;
+                ScatterRayQueue::GetSetIndirector w = queue->operator[](keys[index].index);
+                auxBuffer->operator[](index) = w.operator krr::ScatterRayWorkItem();
+            }, KRR_DEFAULT_STREAM);
+            // blit back
+            GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
+                if (index >= queue->size()) return;
+                queue->operator[](index) =
+                    auxBuffer->operator[](index).operator krr::ScatterRayWorkItem();
+            }, KRR_DEFAULT_STREAM);
+        }
     }
     ForAllQueued(
         scatterRayQueue, maxQueueSize, KRR_DEVICE_LAMBDA(ScatterRayWorkItem& w) {
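Note on the integrator change (not part of the diff): instead of copying the keys into a thrust::cuda::vector and sorting through thrust::retag<cached_allocator_tag>, the keys are now sorted in place and the cached resource is handed to the execution policy through thrust::mr::allocator, so the temporary storage thrust::sort requests each frame is served from the cache rather than by fresh cudaMalloc/cudaFree calls. A standalone sketch of the same plumbing on a plain key array, without the renderer types; the function name, element type, and stream argument are illustrative:

// Standalone sketch (not from the repository) of the allocation pattern the new
// generateScatterRays() uses: thrust::sort's scratch memory is served by the
// cached resource supplied through the execution policy.
#include <cstddef>
#include <cuda_runtime_api.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/mr/allocator.h>
#include <thrust/mr/device_memory_resource.h>
#include "device/thrust.h" // thrust_cached_resource, as included by integrator.cpp

void sort_with_cached_temporaries(int *keys, std::size_t count, cudaStream_t stream) {
    using MemRes = thrust::device_ptr_memory_resource<
        thrust_cached_resource<thrust::device_memory_resource>>;
    using Alloc = thrust::mr::allocator<int, MemRes>;

    // Keep the resource and allocator alive across calls so the cache is reused.
    static MemRes memory;
    static Alloc alloc(&memory);

    // The allocator supplied to the policy provides thrust's internal scratch
    // allocations; after the first call, same-sized blocks come from the cache.
    thrust::sort(thrust::cuda::par_nosync(alloc).on(stream), keys, keys + count);
}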
