Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement thrust cached memory resource (useful for gpu async sorting) #54

Closed
wants to merge 13 commits into from
14 changes: 7 additions & 7 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
[submodule "src/ext/assimp"]
path = src/ext/assimp
url = git@github.com:assimp/assimp.git
url = https://github.com/assimp/assimp.git
[submodule "src/ext/glfw"]
path = src/ext/glfw
url = git@github.com:glfw/glfw.git
url = https://github.com/glfw/glfw.git
[submodule "src/ext/pybind11"]
path = src/ext/pybind11
url = git@github.com:pybind/pybind11.git
url = https://github.com/pybind/pybind11.git
branch = stable
[submodule "src/ext/pbrtparser"]
path = src/ext/pbrtparser
url = git@github.com:cuteday/pbrt-parser.git
url = https://github.com/cuteday/pbrt-parser.git
branch = cute
[submodule "src/core/math/3rdparty/eigen"]
path = src/core/math/3rdparty/eigen
url = git@github.com:cuteday/Eigen.git
url = https://github.com/cuteday/Eigen.git
[submodule "src/ext/nvrhi"]
path = src/ext/nvrhi
url = git@github.com:cuteday/nvrhi.git
url = https://github.com/cuteday/nvrhi.git
[submodule "src/ext/openvdb"]
path = src/ext/openvdb
url = git@github.com:cuteday/openvdb_win.git
url = https://github.com/cuteday/openvdb_win.git
61 changes: 61 additions & 0 deletions src/core/device/soa.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#pragma once
#include "common.h"
#include <type_traits>

NAMESPACE_BEGIN(krr)

// specifications for SOA::GetSetIndirector
template <typename T>
class SOA {
public:
struct GetSetIndirector {
GetSetIndirector() = default;
KRR_CALLABLE operator T() const;
KRR_CALLABLE void operator=(const T &val);
KRR_CALLABLE void operator=(const GetSetIndirector &other);
SOA<T> *soa; int i;
};
};

// https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator
template <typename T>
class SOAIterator {
public:
using difference_type = int;
using value_type = T;
using reference = typename SOA<T>::GetSetIndirector;
using pointer = void;
using iterator_category = std::random_access_iterator_tag;

KRR_CALLABLE SOAIterator() : m_soa(nullptr), m_index(0) {}
KRR_CALLABLE SOAIterator(SOA<T> *soa, int index) : m_soa(soa), m_index(index) {}
KRR_CALLABLE SOAIterator(const SOA<T> *soa, int index) : m_soa(const_cast<SOA<T>*>(soa)), m_index(index) {}

KRR_CALLABLE SOAIterator& operator +=(int n) { m_index += n; return *this; }
KRR_CALLABLE SOAIterator& operator -=(int n) { m_index -= n; return *this; }
KRR_CALLABLE SOAIterator& operator ++() { ++m_index; return *this; }
KRR_CALLABLE SOAIterator& operator --() { --m_index; return *this; }
KRR_CALLABLE SOAIterator operator ++(int) { SOAIterator it = *this; ++m_index; return it; }
KRR_CALLABLE SOAIterator operator --(int) { SOAIterator it = *this; --m_index; return it; }
KRR_CALLABLE SOAIterator operator+(difference_type n) const { return SOAIterator(m_soa, m_index + n); }
KRR_CALLABLE SOAIterator operator-(difference_type n) const { return SOAIterator(m_soa, m_index - n); }
KRR_CALLABLE difference_type operator-(const SOAIterator& it) const { return m_index - it.m_index; }
KRR_CALLABLE friend SOAIterator operator+(difference_type n, const SOAIterator& it) { return it + n; }
KRR_CALLABLE friend SOAIterator operator-(difference_type n, const SOAIterator &it) { return it - n; }

KRR_CALLABLE bool operator==(const SOAIterator& it) const { return m_index == it.m_index; }
KRR_CALLABLE bool operator!=(const SOAIterator &it) const { return m_index != it.m_index; }
KRR_CALLABLE bool operator<(const SOAIterator &it) const { return m_index < it.m_index; }
KRR_CALLABLE bool operator<=(const SOAIterator &it) const { return m_index <= it.m_index; }
KRR_CALLABLE bool operator>(const SOAIterator &it) const { return m_index > it.m_index; }
KRR_CALLABLE bool operator>=(const SOAIterator &it) const { return m_index >= it.m_index; }

KRR_CALLABLE reference operator*() { return {m_soa, m_index}; }
KRR_CALLABLE reference operator[](difference_type n) { return {m_soa, m_index + n}; }

private:
std::conditional_t<std::is_const_v<T>, const SOA<T>*, SOA<T>*> m_soa;
difference_type m_index;
};

NAMESPACE_END(krr)
85 changes: 85 additions & 0 deletions src/core/device/thrust.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#include <thrust/system/cuda/vector.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/pair.h>
#include <cstdlib>
#include <iostream>
#include <map>
#include <cassert>

#include "common.h"
#include "logger.h"

NAMESPACE_BEGIN(krr)

template <class Upstream>
class thrust_cached_resource final :
public thrust::mr::memory_resource<typename Upstream::pointer> {
public:
thrust_cached_resource(Upstream *upstream) : m_upstream(upstream) {}
thrust_cached_resource() : m_upstream(thrust::mr::get_global_resource<Upstream>()) {}
~thrust_cached_resource() { release(); }

private:
typedef typename Upstream::pointer void_ptr;
using block_key_type = std::pair<std::ptrdiff_t, std::size_t>; // size, alignment
using free_blocks_container = std::multimap<block_key_type, void_ptr>;
using allocated_blocks_container = std::vector<std::pair<void_ptr, block_key_type>>;

free_blocks_container free_blocks;
allocated_blocks_container allocated_blocks;
Upstream *m_upstream;

public:
void release() {
Log(Info, "thrust_cached_resource::release()");
// Deallocate all outstanding blocks in both lists.
for (typename free_blocks_container::iterator i = free_blocks.begin(); i != free_blocks.end(); ++i)
m_upstream->do_deallocate(i->second, i->first.first, i->first.second);

for (typename allocated_blocks_container::iterator i = allocated_blocks.begin();
i != allocated_blocks.end(); ++i)
m_upstream->do_deallocate(i->first, i->second.first, i->second.second);
}

void_ptr do_allocate(std::size_t bytes, std::size_t alignment) override {
Log(Info, "thrust_cached_resource::do_allocate(): num_bytes == %zu", bytes);
void_ptr result = nullptr;

typename free_blocks_container::iterator free_block = free_blocks.find({bytes, alignment});

if (free_block != free_blocks.end()) {
Log(Info, "thrust_cached_resource::do_allocate(): found a free block of %zd bytes", bytes);
result = free_block->second;
free_blocks.erase(free_block);
} else {
Log(Info, "thrust_cached_resource::do_allocate(): allocating new block of %zd bytes", bytes);
result = m_upstream->do_allocate(bytes, alignment);
}

allocated_blocks.push_back(std::make_pair(result, block_key_type{bytes, alignment}));
return result;
}

void do_deallocate(void_ptr ptr, std::size_t bytes, std::size_t alignment) override {
Log(Info, "thrust_cached_resource::do_deallocate(): ptr == %p", reinterpret_cast<void *>(ptr.get()));

//typename allocated_blocks_container::iterator iter = allocated_blocks.find(ptr);
typename allocated_blocks_container::iterator iter = std::find_if(allocated_blocks.begin(),
allocated_blocks.end(), [ptr](const typename allocated_blocks_container::value_type& pair){
return pair.first == ptr; });
if (iter == allocated_blocks.end()) {
Log(Error, "Pointer `%p` was not allocated by this allocator",
reinterpret_cast<void *>(ptr.get()));
return;
}

block_key_type key = iter->second;

allocated_blocks.erase(iter);
free_blocks.insert(std::make_pair(key, ptr));
}
};

NAMESPACE_END(krr)
2 changes: 1 addition & 1 deletion src/core/light.h
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ class InfiniteLight {
}

KRR_DEVICE Spectrum Li(Vector3f wi, const SampledWavelengths &lambda) const {
Vector2f uv = worldToLatLong(rotation.transpose() * wi);
Vector2f uv = utils::worldToLatLong(rotation.transpose() * wi);
RGB L = image.isValid() ? tint * image.evaluate(uv).head<3>() : tint;
return scale * Spectrum::fromRGB(L, SpectrumType::RGBIlluminant, lambda, *colorSpace);
}
Expand Down
5 changes: 4 additions & 1 deletion src/core/logger.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,11 @@ class Logger {

static void log(Level level, const string& msg, bool terminate = false);
};

#ifndef KRR_DEBUG_BUILD
inline void logDebug(const std::string &msg) {}
#else
inline void logDebug(const std::string& msg) { Logger::log(Logger::Level::Debug, msg); }
#endif
inline void logInfo(const std::string& msg) { Logger::log(Logger::Level::Info, msg); }
inline void logSuccess(const std::string& msg) { Logger::log(Logger::Level::Success, msg); }
inline void logWarning(const std::string& msg) { Logger::log(Logger::Level::Warning, msg); }
Expand Down
4 changes: 4 additions & 0 deletions src/core/renderpass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ RenderTexture::RenderTexture(vkrhi::IDevice *device, vkrhi::TextureHandle textur
mCudaSurface = cudaHandler->mapVulkanTextureToCudaSurface(mTexture, cudaArrayColorAttachment);
}

RenderTexture::~RenderTexture() {
CUDA_CHECK(cudaDestroySurfaceObject(mCudaSurface));
}

vkrhi::TextureDesc RenderTexture::getVulkanDesc(const Vector2i size, vkrhi::Format format,
const std::string name) {
vkrhi::TextureDesc textureDesc;
Expand Down
2 changes: 1 addition & 1 deletion src/core/renderpass.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class RenderTexture {
using SharedPtr = std::shared_ptr<RenderTexture>;

RenderTexture(vkrhi::IDevice *device, vkrhi::TextureHandle texture);
~RenderTexture() = default;
~RenderTexture();

static RenderTexture::SharedPtr create(vkrhi::IDevice *device,
const Vector2i size, vkrhi::Format format, const std::string name = "");
Expand Down
12 changes: 10 additions & 2 deletions src/core/window.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,11 @@ void DeviceManager::getFrameSize(int &width, int &height) const {
void DeviceManager::updateWindowSize() {
if (mWindow == nullptr) return;
int width, height;
glfwGetWindowSize(mWindow, &width, &height);
// as the validation layer needs us to get the window extent through this way.
// https://www.reddit.com/r/vulkan/comments/14uen19/do_i_need_to_recreate_glfw_surfaces_on_window/
auto surfaceCap = mVulkanPhysicalDevice.getSurfaceCapabilitiesKHR(mWindowSurface);
width = surfaceCap.currentExtent.width;
height = surfaceCap.currentExtent.height;

if (width == 0 || height == 0) {
// window is minimized
Expand Down Expand Up @@ -1075,6 +1079,9 @@ bool DeviceManager::createSwapChain() {

const bool enableSwapChainSharing = queues.size() > 1;

auto nextDesc = vk::SwapchainPresentScalingCreateInfoEXT().setScalingBehavior(
vk::PresentScalingFlagBitsEXT::eOneToOne);

auto desc =
vk::SwapchainCreateInfoKHR()
.setSurface(mWindowSurface)
Expand All @@ -1094,7 +1101,8 @@ bool DeviceManager::createSwapChain() {
.setPresentMode(mDeviceParams.vsyncEnabled ? vk::PresentModeKHR::eFifo
: vk::PresentModeKHR::eImmediate)
.setClipped(true)
.setOldSwapchain(nullptr);
.setOldSwapchain(nullptr)
.setPNext(&nextDesc);

const vk::Result res = mVulkanDevice.createSwapchainKHR(&desc, nullptr, &mSwapChain);
if (res != vk::Result::eSuccess) {
Expand Down
4 changes: 2 additions & 2 deletions src/main/kiraray.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ extern "C" int main(int argc, char *argv[]) {
Log(Info, "Working directory: %s", KRR_PROJECT_DIR);
Log(Info, "Kiraray build type: %s", KRR_BUILD_TYPE);
#ifdef KRR_DEBUG_BUILD
Log(Warning, "Running in debug mode, the performance may be extremely slow. "
"Switch to Release build for normal performance!");
Log(Warning, "Running in debug mode, the performance may be slow."
"Switch to Release build for faster performance!");
#endif

string configFile = "common/configs/example_cbox.json";
Expand Down
4 changes: 1 addition & 3 deletions src/render/sampling.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

NAMESPACE_BEGIN(krr)

using namespace utils;

KRR_CALLABLE float evalMIS(float p0, float p1) {
#if MIS_POWER_HEURISTIC
return p0 * p0 / (p0 * p0 + p1 * p1);
Expand Down Expand Up @@ -94,7 +92,7 @@ KRR_CALLABLE int sampleDiscrete(gpu::span<const float> weights, float u, float *
for (float w : weights) sumWeights += w;

float up = u * sumWeights;
if (up == sumWeights) up = nextFloatDown(up);
if (up == sumWeights) up = utils::nextFloatDown(up);

int offset = 0;
float sum = 0;
Expand Down
1 change: 0 additions & 1 deletion src/render/shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

NAMESPACE_BEGIN(krr)

using namespace utils;
using namespace rt;

struct HitInfo {
Expand Down
4 changes: 1 addition & 3 deletions src/render/spectrum.h
Original file line number Diff line number Diff line change
Expand Up @@ -467,9 +467,7 @@ KRR_CALLABLE float RGBColorSpace::lum(const SpectrumType& s, const SampledWavele
SampledSpectrum pdf = lambda.pdf();
return SampledSpectrum(Ys * s).safeDiv(pdf).mean() / CIE_Y_integral;
}
else
static_assert(!std::is_same_v<SpectrumType, SpectrumType>,
"SpectrumType must be either RGB or SampledSpectrum");
else static_assert(false, "SpectrumType must be either RGB or SampledSpectrum");
}

KRR_CALLABLE float luminance(RGB color) {
Expand Down
65 changes: 63 additions & 2 deletions src/render/wavefront/integrator.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/retag.h>
#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#include <thrust/mr/allocator.h>
#include <thrust/device_allocator.h>
#include <thrust/mr/device_memory_resource.h>

#include "device/cuda.h"
#include "device/thrust.h"
#include "integrator.h"
#include "wavefront.h"
#include "render/spectrum.h"
#include "render/profiler/profiler.h"
#include "workqueue.h"

NAMESPACE_BEGIN(krr)
extern "C" char WAVEFRONT_PTX[];
Expand Down Expand Up @@ -35,6 +43,10 @@ void WavefrontPathTracer::initialize() {
if (mediumScatterQueue) mediumScatterQueue->resize(maxQueueSize, alloc);
else mediumScatterQueue = alloc.new_object<MediumScatterQueue>(maxQueueSize, alloc);
}
if (scatterRayKeys) scatterRayKeys->resize(maxQueueSize);
else scatterRayKeys = new TypedBuffer<ScatterRayKeyIndex>(maxQueueSize);
if (scatterRaySortBuffer) scatterRaySortBuffer->resize(maxQueueSize, alloc);
else scatterRaySortBuffer = alloc.new_object<ScatterRayQueue>(maxQueueSize, alloc);
if (!camera) camera = alloc.new_object<rt::CameraData>();
CUDA_SYNC_CHECK();
}
Expand Down Expand Up @@ -101,8 +113,57 @@ void WavefrontPathTracer::handleMiss() {

void WavefrontPathTracer::generateScatterRays(int depth) {
PROFILE("Generate scatter rays");
using MemRes = thrust::device_ptr_memory_resource<thrust_cached_resource<thrust::device_memory_resource>>;
using Alloc = thrust::mr::allocator<ScatterRayKeyIndex, MemRes>;
static std::unique_ptr<MemRes> memory;
static std::unique_ptr<Alloc> alloc;
if (!memory) {
memory = std::make_unique<MemRes>();
alloc = std::make_unique<Alloc>(memory.get());
}
{
PROFILE("Sort scatter rays");
auto *queue = scatterRayQueue;
auto *auxBuffer = scatterRaySortBuffer;
ScatterRayKeyIndex *keys = scatterRayKeys->data();
{
PROFILE("Update keys");
GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
if (index >= queue->size())
keys[index].key = std::numeric_limits<int64_t>::max();
else {
ScatterRayQueue::GetSetIndirector w = queue->operator[](index);
keys[index].key = static_cast<int64_t>(w.soa->intr.sd.bsdfType[w.i]);
}
keys[index].index = index;
}, KRR_DEFAULT_STREAM);
}
{
PROFILE("Sort indices");
thrust::sort(thrust::cuda::par_nosync(*alloc).on(KRR_DEFAULT_STREAM),
keys, keys + maxQueueSize,
[] KRR_DEVICE(const ScatterRayKeyIndex &a, const ScatterRayKeyIndex &b) {
return a.key < b.key;
});
}
{
PROFILE("Reorder and blit");
// sorted to auxiliary buffer
GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
if (index >= queue->size()) return;
ScatterRayQueue::GetSetIndirector w = queue->operator[](keys[index].index);
auxBuffer->operator[](index) = w.operator krr::ScatterRayWorkItem();
}, KRR_DEFAULT_STREAM);
// blit back
GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
if (index >= queue->size()) return;
queue->operator[](index) =
auxBuffer->operator[](index).operator krr::ScatterRayWorkItem();
}, KRR_DEFAULT_STREAM);
}
}
ForAllQueued(
scatterRayQueue, maxQueueSize, KRR_DEVICE_LAMBDA(ScatterRayWorkItem & w) {
scatterRayQueue, maxQueueSize, KRR_DEVICE_LAMBDA(ScatterRayWorkItem& w) {
Sampler sampler = &pixelState->sampler[w.pixelId];
/* Russian Roulette: If the path is terminated by this vertex,
then NEE should not be evaluated */
Expand Down
Loading
Loading