diff --git a/src/core/device/thrust.h b/src/core/device/thrust.h index b537447..81a3423 100644 --- a/src/core/device/thrust.h +++ b/src/core/device/thrust.h @@ -23,51 +23,62 @@ class thrust_cached_resource final : private: typedef typename Upstream::pointer void_ptr; - typedef std::tuple block_t; - std::vector blocks{}; + using block_key_type = std::pair; // size, alignment + using free_blocks_container = std::multimap; + using allocated_blocks_container = std::vector>; + + free_blocks_container free_blocks; + allocated_blocks_container allocated_blocks; Upstream *m_upstream; public: void release() { Log(Info, "thrust_cached_resource::release()"); - std::for_each(blocks.begin(), blocks.end(), [this](block_t &block) { - auto &[bytes, alignment, ptr] = block; - m_upstream->do_deallocate(ptr, bytes, alignment); - }); + // Deallocate all outstanding blocks in both lists. + for (typename free_blocks_container::iterator i = free_blocks.begin(); i != free_blocks.end(); ++i) + m_upstream->do_deallocate(i->second, i->first.first, i->first.second); + + for (typename allocated_blocks_container::iterator i = allocated_blocks.begin(); + i != allocated_blocks.end(); ++i) + m_upstream->do_deallocate(i->first, i->second.first, i->second.second); } void_ptr do_allocate(std::size_t bytes, std::size_t alignment) override { Log(Info, "thrust_cached_resource::do_allocate(): num_bytes == %zu", bytes); - void_ptr result = nullptr; - auto const fitting_block = - std::find_if(blocks.cbegin(), blocks.cend(), [bytes, alignment](block_t const &block) { - auto &[b_bytes, b_alignment, _] = block; - return b_bytes == bytes && b_alignment == alignment; - }); + typename free_blocks_container::iterator free_block = free_blocks.find({bytes, alignment}); - if (fitting_block != blocks.end()) { + if (free_block != free_blocks.end()) { Log(Info, "thrust_cached_resource::do_allocate(): found a free block of %zd bytes", bytes); - result = std::get<2>(*fitting_block); + result = free_block->second; + free_blocks.erase(free_block); } else { Log(Info, "thrust_cached_resource::do_allocate(): allocating new block of %zd bytes", bytes); result = m_upstream->do_allocate(bytes, alignment); - blocks.emplace_back(bytes, alignment, result); } + + allocated_blocks.push_back(std::make_pair(result, block_key_type{bytes, alignment})); return result; } void do_deallocate(void_ptr ptr, std::size_t bytes, std::size_t alignment) override { - Log(Info, "thrust_cached_resource::do_deallocate(): ptr == %p", - reinterpret_cast(ptr.get())); - auto const fitting_block = - std::find_if(blocks.cbegin(), blocks.cend(), - [ptr](block_t const &block) { return std::get<2>(block) == ptr; }); + Log(Info, "thrust_cached_resource::do_deallocate(): ptr == %p", reinterpret_cast(ptr.get())); - if (fitting_block == blocks.end()) + //typename allocated_blocks_container::iterator iter = allocated_blocks.find(ptr); + typename allocated_blocks_container::iterator iter = std::find_if(allocated_blocks.begin(), + allocated_blocks.end(), [ptr](const typename allocated_blocks_container::value_type& pair){ + return pair.first == ptr; }); + if (iter == allocated_blocks.end()) { Log(Error, "Pointer `%p` was not allocated by this allocator", - thrust::raw_pointer_cast(ptr)); + reinterpret_cast(ptr.get())); + return; + } + + block_key_type key = iter->second; + + allocated_blocks.erase(iter); + free_blocks.insert(std::make_pair(key, ptr)); } }; diff --git a/src/render/wavefront/integrator.cpp b/src/render/wavefront/integrator.cpp index a47a41b..87d8f05 100644 --- a/src/render/wavefront/integrator.cpp +++ b/src/render/wavefront/integrator.cpp @@ -3,6 +3,11 @@ #include #include #include +#include +#include +#include +#include +#include #include "device/cuda.h" #include "device/thrust.h" @@ -108,47 +113,54 @@ void WavefrontPathTracer::handleMiss() { void WavefrontPathTracer::generateScatterRays(int depth) { PROFILE("Generate scatter rays"); - static thrust::cuda::vector scatterRayKeys1; - if (scatterRayKeys1.size() != maxQueueSize) { - scatterRayKeys1.resize(maxQueueSize); - Log(Info, "resized "); + using MemRes = thrust::device_ptr_memory_resource>; + using Alloc = thrust::mr::allocator; + static std::unique_ptr memory; + static std::unique_ptr alloc; + if (!memory) { + memory = std::make_unique(); + alloc = std::make_unique(memory.get()); } { PROFILE("Sort scatter rays"); auto *queue = scatterRayQueue; auto *auxBuffer = scatterRaySortBuffer; ScatterRayKeyIndex *keys = scatterRayKeys->data(); - GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) { - if (index >= queue->size()) - keys[index].key = std::numeric_limits::max(); - else { - ScatterRayQueue::GetSetIndirector w = queue->operator[](index); - keys[index].key = static_cast(w.soa->intr.sd.bsdfType[w.i]); - } - keys[index].index = index; - }, KRR_DEFAULT_STREAM); - cudaMemcpyAsync(thrust::raw_pointer_cast(scatterRayKeys1.data()), keys, - maxQueueSize * sizeof(ScatterRayKeyIndex), - cudaMemcpyDeviceToDevice, KRR_DEFAULT_STREAM); - thrust::sort(thrust::cuda::par_nosync.on(KRR_DEFAULT_STREAM), - thrust::retag(scatterRayKeys1.begin()), - thrust::retag(scatterRayKeys1.end()), - //keys, keys + maxQueueSize, - [] KRR_DEVICE(const ScatterRayKeyIndex &a, const ScatterRayKeyIndex &b) { - return a.key < b.key; - }); - //// sorted to auxiliary buffer - GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) { - if (index >= queue->size()) return; - ScatterRayQueue::GetSetIndirector w = queue->operator[](keys[index].index); - auxBuffer->operator[](index) = w.operator krr::ScatterRayWorkItem(); - }, KRR_DEFAULT_STREAM); - // blit back - GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) { - if (index >= queue->size()) return; - queue->operator[](index) = - auxBuffer->operator[](index).operator krr::ScatterRayWorkItem(); - }, KRR_DEFAULT_STREAM); + { + PROFILE("Update keys"); + GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) { + if (index >= queue->size()) + keys[index].key = std::numeric_limits::max(); + else { + ScatterRayQueue::GetSetIndirector w = queue->operator[](index); + keys[index].key = static_cast(w.soa->intr.sd.bsdfType[w.i]); + } + keys[index].index = index; + }, KRR_DEFAULT_STREAM); + } + { + PROFILE("Sort indices"); + thrust::sort(thrust::cuda::par_nosync(*alloc).on(KRR_DEFAULT_STREAM), + keys, keys + maxQueueSize, + [] KRR_DEVICE(const ScatterRayKeyIndex &a, const ScatterRayKeyIndex &b) { + return a.key < b.key; + }); + } + { + PROFILE("Reorder and blit"); + // sorted to auxiliary buffer + GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) { + if (index >= queue->size()) return; + ScatterRayQueue::GetSetIndirector w = queue->operator[](keys[index].index); + auxBuffer->operator[](index) = w.operator krr::ScatterRayWorkItem(); + }, KRR_DEFAULT_STREAM); + // blit back + GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) { + if (index >= queue->size()) return; + queue->operator[](index) = + auxBuffer->operator[](index).operator krr::ScatterRayWorkItem(); + }, KRR_DEFAULT_STREAM); + } } ForAllQueued( scatterRayQueue, maxQueueSize, KRR_DEVICE_LAMBDA(ScatterRayWorkItem& w) {