cuteday · cuteday · Apr 10, 2024 · Apr 10, 2024 · Apr 12, 2024 · Apr 12, 2024
diff --git a/.gitmodules b/.gitmodules
@@ -1,23 +1,23 @@
 [submodule "src/ext/assimp"]
 	path = src/ext/assimp
-	url = git@github.com:assimp/assimp.git
+	url = https://github.com/assimp/assimp.git
 [submodule "src/ext/glfw"]
 	path = src/ext/glfw
-	url = git@github.com:glfw/glfw.git
+	url = https://github.com/glfw/glfw.git
 [submodule "src/ext/pybind11"]
 	path = src/ext/pybind11
-	url = git@github.com:pybind/pybind11.git
+	url = https://github.com/pybind/pybind11.git
 	branch = stable
 [submodule "src/ext/pbrtparser"]
 	path = src/ext/pbrtparser
-	url = git@github.com:cuteday/pbrt-parser.git
+	url = https://github.com/cuteday/pbrt-parser.git
 	branch = cute
 [submodule "src/core/math/3rdparty/eigen"]
 	path = src/core/math/3rdparty/eigen
-	url = git@github.com:cuteday/Eigen.git
+	url = https://github.com/cuteday/Eigen.git
 [submodule "src/ext/nvrhi"]
 	path = src/ext/nvrhi
-	url = git@github.com:cuteday/nvrhi.git
+	url = https://github.com/cuteday/nvrhi.git
 [submodule "src/ext/openvdb"]
 	path = src/ext/openvdb
-	url = git@github.com:cuteday/openvdb_win.git
+	url = https://github.com/cuteday/openvdb_win.git
diff --git a/src/core/device/soa.h b/src/core/device/soa.h
@@ -0,0 +1,61 @@
+#pragma once
+#include "common.h"
+#include <type_traits>
+
+NAMESPACE_BEGIN(krr)
+
+// specifications for SOA::GetSetIndirector
+template <typename T> 
+class SOA {
+public:
+	struct GetSetIndirector { 
+		GetSetIndirector() = default; 
+		KRR_CALLABLE operator T() const;
+		KRR_CALLABLE void operator=(const T &val);
+		KRR_CALLABLE void operator=(const GetSetIndirector &other);
+		SOA<T> *soa; int i; 
+	};
+};
+
+// https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator
+template <typename T> 
+class SOAIterator {
+public:
+	using difference_type	= int;
+	using value_type		= T;
+	using reference			= typename SOA<T>::GetSetIndirector;
+	using pointer			= void;
+	using iterator_category = std::random_access_iterator_tag;
+
+	KRR_CALLABLE SOAIterator() : m_soa(nullptr), m_index(0) {}
+	KRR_CALLABLE SOAIterator(SOA<T> *soa, int index) : m_soa(soa), m_index(index) {}
+	KRR_CALLABLE SOAIterator(const SOA<T> *soa, int index) : m_soa(const_cast<SOA<T>*>(soa)), m_index(index) {}
+
+	KRR_CALLABLE SOAIterator& operator +=(int n) { m_index += n; return *this; }
+	KRR_CALLABLE SOAIterator& operator -=(int n) { m_index -= n; return *this; }
+	KRR_CALLABLE SOAIterator& operator ++() { ++m_index; return *this; }
+	KRR_CALLABLE SOAIterator& operator --() { --m_index; return *this; }
+	KRR_CALLABLE SOAIterator operator ++(int) { SOAIterator it = *this; ++m_index; return it; }
+	KRR_CALLABLE SOAIterator operator --(int) { SOAIterator it = *this; --m_index; return it; }
+	KRR_CALLABLE SOAIterator operator+(difference_type n) const { return SOAIterator(m_soa, m_index + n); }
+	KRR_CALLABLE SOAIterator operator-(difference_type n) const { return SOAIterator(m_soa, m_index - n); }
+	KRR_CALLABLE difference_type operator-(const SOAIterator& it) const { return m_index - it.m_index; }
+	KRR_CALLABLE friend SOAIterator operator+(difference_type n, const SOAIterator& it) { return it + n; }
+	KRR_CALLABLE friend SOAIterator operator-(difference_type n, const SOAIterator &it) { return it - n; }
+
+	KRR_CALLABLE bool operator==(const SOAIterator& it) const { return m_index == it.m_index; }
+	KRR_CALLABLE bool operator!=(const SOAIterator &it) const { return m_index != it.m_index; }
+	KRR_CALLABLE bool operator<(const SOAIterator &it) const { return m_index < it.m_index; }
+	KRR_CALLABLE bool operator<=(const SOAIterator &it) const { return m_index <= it.m_index; }
+	KRR_CALLABLE bool operator>(const SOAIterator &it) const { return m_index > it.m_index; }
+	KRR_CALLABLE bool operator>=(const SOAIterator &it) const { return m_index >= it.m_index; }
+
+	KRR_CALLABLE reference operator*() { return {m_soa, m_index}; }
+	KRR_CALLABLE reference operator[](difference_type n) { return {m_soa, m_index + n}; }
+
+private:
+	std::conditional_t<std::is_const_v<T>, const SOA<T>*, SOA<T>*> m_soa;
+	difference_type m_index;
+};
+
+NAMESPACE_END(krr)
diff --git a/src/core/device/thrust.h b/src/core/device/thrust.h
@@ -0,0 +1,85 @@
+#include <thrust/system/cuda/vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/generate.h>
+#include <thrust/sort.h>
+#include <thrust/pair.h>
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <cassert>
+
+#include "common.h"
+#include "logger.h"
+
+NAMESPACE_BEGIN(krr)
+
+template <class Upstream>
+class thrust_cached_resource final :
+	public thrust::mr::memory_resource<typename Upstream::pointer> {
+public:
+	thrust_cached_resource(Upstream *upstream) : m_upstream(upstream) {}
+	thrust_cached_resource() : m_upstream(thrust::mr::get_global_resource<Upstream>()) {}
+	~thrust_cached_resource() { release(); }
+
+private:
+	typedef typename Upstream::pointer void_ptr;
+	using block_key_type = std::pair<std::ptrdiff_t, std::size_t>;	// size, alignment
+	using free_blocks_container = std::multimap<block_key_type, void_ptr>;
+	using allocated_blocks_container = std::vector<std::pair<void_ptr, block_key_type>>;
+
+	free_blocks_container free_blocks;
+	allocated_blocks_container allocated_blocks;
+	Upstream *m_upstream;
+
+public:
+	void release() {
+		Log(Info, "thrust_cached_resource::release()");
+		// Deallocate all outstanding blocks in both lists.
+		for (typename free_blocks_container::iterator i = free_blocks.begin(); i != free_blocks.end(); ++i)
+			m_upstream->do_deallocate(i->second, i->first.first, i->first.second);
+
+		for (typename allocated_blocks_container::iterator i = allocated_blocks.begin();
+			 i != allocated_blocks.end(); ++i)
+			m_upstream->do_deallocate(i->first, i->second.first, i->second.second);
+	}
+
+	void_ptr do_allocate(std::size_t bytes, std::size_t alignment) override {
+		Log(Info, "thrust_cached_resource::do_allocate(): num_bytes == %zu", bytes);
+		void_ptr result = nullptr;
+
+		typename free_blocks_container::iterator free_block = free_blocks.find({bytes, alignment});
+
+		if (free_block != free_blocks.end()) {
+			Log(Info, "thrust_cached_resource::do_allocate(): found a free block of %zd bytes", bytes);
+			result = free_block->second;
+			free_blocks.erase(free_block);
+		} else {
+			Log(Info, "thrust_cached_resource::do_allocate(): allocating new block of %zd bytes", bytes);
+			result = m_upstream->do_allocate(bytes, alignment);
+		}
+
+		allocated_blocks.push_back(std::make_pair(result, block_key_type{bytes, alignment}));
+		return result;
+	}
+
+	void do_deallocate(void_ptr ptr, std::size_t bytes, std::size_t alignment) override {
+		Log(Info, "thrust_cached_resource::do_deallocate(): ptr == %p", reinterpret_cast<void *>(ptr.get()));
+
+		//typename allocated_blocks_container::iterator iter = allocated_blocks.find(ptr);
+		typename allocated_blocks_container::iterator iter = std::find_if(allocated_blocks.begin(), 
+			allocated_blocks.end(), [ptr](const typename allocated_blocks_container::value_type& pair){
+							 return pair.first == ptr; });
+		if (iter == allocated_blocks.end()) {
+			Log(Error, "Pointer `%p` was not allocated by this allocator",
+				reinterpret_cast<void *>(ptr.get()));
+			return;
+		}
+
+		block_key_type key = iter->second;
+
+		allocated_blocks.erase(iter);
+		free_blocks.insert(std::make_pair(key, ptr));
+	}
+};
+
+NAMESPACE_END(krr)
diff --git a/src/core/light.h b/src/core/light.h
@@ -240,7 +240,7 @@ class InfiniteLight {
 	}
 
 	KRR_DEVICE Spectrum Li(Vector3f wi, const SampledWavelengths &lambda) const {
-		Vector2f uv = worldToLatLong(rotation.transpose() * wi);
+		Vector2f uv = utils::worldToLatLong(rotation.transpose() * wi);
 		RGB L		= image.isValid() ? tint * image.evaluate(uv).head<3>() : tint;
 		return scale * Spectrum::fromRGB(L, SpectrumType::RGBIlluminant, lambda, *colorSpace);
 	}

diff --git a/src/core/logger.h b/src/core/logger.h
@@ -61,8 +61,11 @@ class Logger {
 
 	static void log(Level level, const string& msg, bool terminate = false);
 };
-
+#ifndef KRR_DEBUG_BUILD
+inline void logDebug(const std::string &msg) {}	
+#else
 inline void logDebug(const std::string& msg) { Logger::log(Logger::Level::Debug, msg); }
+#endif
 inline void logInfo(const std::string& msg) { Logger::log(Logger::Level::Info, msg); }
 inline void logSuccess(const std::string& msg) { Logger::log(Logger::Level::Success, msg); }
 inline void logWarning(const std::string& msg) { Logger::log(Logger::Level::Warning, msg); }

diff --git a/src/core/renderpass.cpp b/src/core/renderpass.cpp
@@ -11,6 +11,10 @@ RenderTexture::RenderTexture(vkrhi::IDevice *device, vkrhi::TextureHandle textur
 	mCudaSurface = cudaHandler->mapVulkanTextureToCudaSurface(mTexture, cudaArrayColorAttachment);
 }
 
+RenderTexture::~RenderTexture() { 
+	CUDA_CHECK(cudaDestroySurfaceObject(mCudaSurface)); 
+}
+
 vkrhi::TextureDesc RenderTexture::getVulkanDesc(const Vector2i size, vkrhi::Format format,
 										const std::string name) {
 	vkrhi::TextureDesc textureDesc;

diff --git a/src/core/renderpass.h b/src/core/renderpass.h
@@ -22,7 +22,7 @@ class RenderTexture {
 	using SharedPtr = std::shared_ptr<RenderTexture>;
 
 	RenderTexture(vkrhi::IDevice *device, vkrhi::TextureHandle texture);
-	~RenderTexture() = default;
+	~RenderTexture();
 
 	static RenderTexture::SharedPtr create(vkrhi::IDevice *device, 
 		const Vector2i size, vkrhi::Format format, const std::string name = "");

diff --git a/src/core/window.cpp b/src/core/window.cpp
@@ -497,7 +497,11 @@ void DeviceManager::getFrameSize(int &width, int &height) const {
 void DeviceManager::updateWindowSize() {
 	if (mWindow == nullptr) return;
 	int width, height;
-	glfwGetWindowSize(mWindow, &width, &height);
+	// as the validation layer needs us to get the window extent through this way.
+	// https://www.reddit.com/r/vulkan/comments/14uen19/do_i_need_to_recreate_glfw_surfaces_on_window/
+	auto surfaceCap = mVulkanPhysicalDevice.getSurfaceCapabilitiesKHR(mWindowSurface);
+	width			= surfaceCap.currentExtent.width;
+	height			= surfaceCap.currentExtent.height;
 
 	if (width == 0 || height == 0) {
 		// window is minimized
@@ -1075,6 +1079,9 @@ bool DeviceManager::createSwapChain() {
 
 	const bool enableSwapChainSharing = queues.size() > 1;
 
+	auto nextDesc = vk::SwapchainPresentScalingCreateInfoEXT().setScalingBehavior(
+		vk::PresentScalingFlagBitsEXT::eOneToOne);
+
 	auto desc =
 		vk::SwapchainCreateInfoKHR()
 			.setSurface(mWindowSurface)
@@ -1094,7 +1101,8 @@ bool DeviceManager::createSwapChain() {
 			.setPresentMode(mDeviceParams.vsyncEnabled ? vk::PresentModeKHR::eFifo
 														: vk::PresentModeKHR::eImmediate)
 			.setClipped(true)
-			.setOldSwapchain(nullptr);
+			.setOldSwapchain(nullptr)
+			.setPNext(&nextDesc);
 
 	const vk::Result res = mVulkanDevice.createSwapchainKHR(&desc, nullptr, &mSwapChain);
 	if (res != vk::Result::eSuccess) {

diff --git a/src/main/kiraray.cpp b/src/main/kiraray.cpp
@@ -8,8 +8,8 @@ extern "C" int main(int argc, char *argv[]) {
 	Log(Info, "Working directory: %s", KRR_PROJECT_DIR);
 	Log(Info, "Kiraray build type: %s", KRR_BUILD_TYPE);
 #ifdef KRR_DEBUG_BUILD
-	Log(Warning, "Running in debug mode, the performance may be extremely slow. "
-			   "Switch to Release build for normal performance!");
+	Log(Warning, "Running in debug mode, the performance may be slow."
+			   "Switch to Release build for faster performance!");
 #endif
 
 	string configFile = "common/configs/example_cbox.json";

diff --git a/src/render/sampling.h b/src/render/sampling.h
@@ -9,8 +9,6 @@
 
 NAMESPACE_BEGIN(krr)
 
-using namespace utils;
-
 KRR_CALLABLE float evalMIS(float p0, float p1) {
 #if MIS_POWER_HEURISTIC
 	return p0 * p0 / (p0 * p0 + p1 * p1);
@@ -94,7 +92,7 @@ KRR_CALLABLE int sampleDiscrete(gpu::span<const float> weights, float u, float *
 	for (float w : weights) sumWeights += w;
 
 	float up = u * sumWeights;
-	if (up == sumWeights) up = nextFloatDown(up);
+	if (up == sumWeights) up = utils::nextFloatDown(up);
 
 	int offset = 0;
 	float sum  = 0;

diff --git a/src/render/shared.h b/src/render/shared.h
@@ -21,7 +21,6 @@
 
 NAMESPACE_BEGIN(krr)
 
-using namespace utils;
 using namespace rt;
 
 struct HitInfo {

diff --git a/src/render/spectrum.h b/src/render/spectrum.h
@@ -467,9 +467,7 @@ KRR_CALLABLE float RGBColorSpace::lum(const SpectrumType& s, const SampledWavele
 		SampledSpectrum pdf = lambda.pdf();
 		return SampledSpectrum(Ys * s).safeDiv(pdf).mean() / CIE_Y_integral;
 	}
-	else 
-		static_assert(!std::is_same_v<SpectrumType, SpectrumType>, 
-			"SpectrumType must be either RGB or SampledSpectrum");
+	else static_assert(false, "SpectrumType must be either RGB or SampledSpectrum");
 }
 
 KRR_CALLABLE float luminance(RGB color) {

diff --git a/src/render/wavefront/integrator.cpp b/src/render/wavefront/integrator.cpp
@@ -1,12 +1,20 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <thrust/sort.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/device_ptr.h>
+#include <thrust/functional.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/device_allocator.h>
+#include <thrust/mr/device_memory_resource.h>
 
 #include "device/cuda.h"
+#include "device/thrust.h"
 #include "integrator.h"
 #include "wavefront.h"
 #include "render/spectrum.h"
 #include "render/profiler/profiler.h"
-#include "workqueue.h"
 
 NAMESPACE_BEGIN(krr)
 extern "C" char WAVEFRONT_PTX[];
@@ -35,6 +43,10 @@ void WavefrontPathTracer::initialize() {
 		if (mediumScatterQueue) mediumScatterQueue->resize(maxQueueSize, alloc);
 		else mediumScatterQueue = alloc.new_object<MediumScatterQueue>(maxQueueSize, alloc);
 	}
+	if (scatterRayKeys) scatterRayKeys->resize(maxQueueSize);
+	else scatterRayKeys = new TypedBuffer<ScatterRayKeyIndex>(maxQueueSize);
+	if (scatterRaySortBuffer) scatterRaySortBuffer->resize(maxQueueSize, alloc);
+	else scatterRaySortBuffer = alloc.new_object<ScatterRayQueue>(maxQueueSize, alloc);
 	if (!camera) camera = alloc.new_object<rt::CameraData>();
 	CUDA_SYNC_CHECK();
 }
@@ -101,8 +113,57 @@ void WavefrontPathTracer::handleMiss() {
 
 void WavefrontPathTracer::generateScatterRays(int depth) {
 	PROFILE("Generate scatter rays");
+	using MemRes = thrust::device_ptr_memory_resource<thrust_cached_resource<thrust::device_memory_resource>>;
+	using Alloc = thrust::mr::allocator<ScatterRayKeyIndex, MemRes>;
+	static std::unique_ptr<MemRes> memory;
+	static std::unique_ptr<Alloc> alloc;
+	if (!memory) {
+		memory = std::make_unique<MemRes>();
+		alloc = std::make_unique<Alloc>(memory.get());
+	}
+	{
+		PROFILE("Sort scatter rays");
+		auto *queue				 = scatterRayQueue;
+		auto *auxBuffer			 = scatterRaySortBuffer;
+		ScatterRayKeyIndex *keys = scatterRayKeys->data();
+		{
+			PROFILE("Update keys");
+			GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
+					if (index >= queue->size()) 
+						keys[index].key = std::numeric_limits<int64_t>::max();
+					else {
+						ScatterRayQueue::GetSetIndirector w = queue->operator[](index);
+						keys[index].key = static_cast<int64_t>(w.soa->intr.sd.bsdfType[w.i]);
+					}	
+					keys[index].index = index;
+				}, KRR_DEFAULT_STREAM);
+		}
+		{
+			PROFILE("Sort indices");
+			thrust::sort(thrust::cuda::par_nosync(*alloc).on(KRR_DEFAULT_STREAM),
+						 keys, keys + maxQueueSize,
+						 [] KRR_DEVICE(const ScatterRayKeyIndex &a, const ScatterRayKeyIndex &b) {
+							 return a.key < b.key;
+						 });
+		}
+		{
+			PROFILE("Reorder and blit");
+			// sorted to auxiliary buffer
+			GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
+					if (index >= queue->size()) return;
+					ScatterRayQueue::GetSetIndirector w = queue->operator[](keys[index].index);
+					auxBuffer->operator[](index)		= w.operator krr::ScatterRayWorkItem();
+				}, KRR_DEFAULT_STREAM);
+			// blit back
+			GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
+					if (index >= queue->size()) return;
+					queue->operator[](index) =
+						auxBuffer->operator[](index).operator krr::ScatterRayWorkItem();
+				}, KRR_DEFAULT_STREAM);
+		}
+	}
 	ForAllQueued(
-		scatterRayQueue, maxQueueSize, KRR_DEVICE_LAMBDA(ScatterRayWorkItem & w) {
+		scatterRayQueue, maxQueueSize, KRR_DEVICE_LAMBDA(ScatterRayWorkItem& w) {
 			Sampler sampler = &pixelState->sampler[w.pixelId];
 			/*  Russian Roulette: If the path is terminated by this vertex,
 				then NEE should not be evaluated */