diff --git a/src/core/device/thrust.h b/src/core/device/thrust.h
index b537447..81a3423 100644
--- a/src/core/device/thrust.h
+++ b/src/core/device/thrust.h
@@ -23,51 +23,62 @@ class thrust_cached_resource final :
 
 private:
 	typedef typename Upstream::pointer void_ptr;
-	typedef std::tuple<std::ptrdiff_t, std::size_t, void_ptr> block_t;
-	std::vector<block_t> blocks{};
+	using block_key_type = std::pair<std::ptrdiff_t, std::size_t>;	// size, alignment
+	using free_blocks_container = std::multimap<block_key_type, void_ptr>;
+	using allocated_blocks_container = std::vector<std::pair<void_ptr, block_key_type>>;
+
+	free_blocks_container free_blocks;
+	allocated_blocks_container allocated_blocks;
 	Upstream *m_upstream;
 
 public:
 	void release() {
 		Log(Info, "thrust_cached_resource::release()");
-		std::for_each(blocks.begin(), blocks.end(), [this](block_t &block) {
-			auto &[bytes, alignment, ptr] = block;
-			m_upstream->do_deallocate(ptr, bytes, alignment);
-		});
+		// Deallocate all outstanding blocks in both lists.
+		for (typename free_blocks_container::iterator i = free_blocks.begin(); i != free_blocks.end(); ++i)
+			m_upstream->do_deallocate(i->second, i->first.first, i->first.second);
+
+		for (typename allocated_blocks_container::iterator i = allocated_blocks.begin();
+			 i != allocated_blocks.end(); ++i)
+			m_upstream->do_deallocate(i->first, i->second.first, i->second.second);
 	}
 
 	void_ptr do_allocate(std::size_t bytes, std::size_t alignment) override {
 		Log(Info, "thrust_cached_resource::do_allocate(): num_bytes == %zu", bytes);
-
 		void_ptr result = nullptr;
 
-		auto const fitting_block =
-			std::find_if(blocks.cbegin(), blocks.cend(), [bytes, alignment](block_t const &block) {
-				auto &[b_bytes, b_alignment, _] = block;
-				return b_bytes == bytes && b_alignment == alignment;
-			});
+		typename free_blocks_container::iterator free_block = free_blocks.find({bytes, alignment});
 
-		if (fitting_block != blocks.end()) {
+		if (free_block != free_blocks.end()) {
 			Log(Info, "thrust_cached_resource::do_allocate(): found a free block of %zd bytes", bytes);
-			result = std::get<2>(*fitting_block);
+			result = free_block->second;
+			free_blocks.erase(free_block);
 		} else {
 			Log(Info, "thrust_cached_resource::do_allocate(): allocating new block of %zd bytes", bytes);
 			result = m_upstream->do_allocate(bytes, alignment);
-			blocks.emplace_back(bytes, alignment, result);
 		}
+
+		allocated_blocks.push_back(std::make_pair(result, block_key_type{bytes, alignment}));
 		return result;
 	}
 
 	void do_deallocate(void_ptr ptr, std::size_t bytes, std::size_t alignment) override {
-		Log(Info, "thrust_cached_resource::do_deallocate(): ptr == %p",
-			reinterpret_cast<void *>(ptr.get()));
-		auto const fitting_block =
-			std::find_if(blocks.cbegin(), blocks.cend(),
-						 [ptr](block_t const &block) { return std::get<2>(block) == ptr; });
+		Log(Info, "thrust_cached_resource::do_deallocate(): ptr == %p", reinterpret_cast<void *>(ptr.get()));
 
-		if (fitting_block == blocks.end())
+		//typename allocated_blocks_container::iterator iter = allocated_blocks.find(ptr);
+		typename allocated_blocks_container::iterator iter = std::find_if(allocated_blocks.begin(), 
+			allocated_blocks.end(), [ptr](const typename allocated_blocks_container::value_type& pair){
+							 return pair.first == ptr; });
+		if (iter == allocated_blocks.end()) {
 			Log(Error, "Pointer `%p` was not allocated by this allocator",
-				thrust::raw_pointer_cast(ptr));
+				reinterpret_cast<void *>(ptr.get()));
+			return;
+		}
+
+		block_key_type key = iter->second;
+
+		allocated_blocks.erase(iter);
+		free_blocks.insert(std::make_pair(key, ptr));
 	}
 };
 
diff --git a/src/render/wavefront/integrator.cpp b/src/render/wavefront/integrator.cpp
index a47a41b..87d8f05 100644
--- a/src/render/wavefront/integrator.cpp
+++ b/src/render/wavefront/integrator.cpp
@@ -3,6 +3,11 @@
 #include <thrust/sort.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_ptr.h>
+#include <thrust/functional.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/device_allocator.h>
+#include <thrust/mr/device_memory_resource.h>
 
 #include "device/cuda.h"
 #include "device/thrust.h"
@@ -108,47 +113,54 @@ void WavefrontPathTracer::handleMiss() {
 
 void WavefrontPathTracer::generateScatterRays(int depth) {
 	PROFILE("Generate scatter rays");
-	static thrust::cuda::vector<ScatterRayKeyIndex> scatterRayKeys1;
-	if (scatterRayKeys1.size() != maxQueueSize) {
-		scatterRayKeys1.resize(maxQueueSize);
-		Log(Info, "resized ");
+	using MemRes = thrust::device_ptr_memory_resource<thrust_cached_resource<thrust::device_memory_resource>>;
+	using Alloc = thrust::mr::allocator<ScatterRayKeyIndex, MemRes>;
+	static std::unique_ptr<MemRes> memory;
+	static std::unique_ptr<Alloc> alloc;
+	if (!memory) {
+		memory = std::make_unique<MemRes>();
+		alloc = std::make_unique<Alloc>(memory.get());
 	}
 	{
 		PROFILE("Sort scatter rays");
 		auto *queue				 = scatterRayQueue;
 		auto *auxBuffer			 = scatterRaySortBuffer;
 		ScatterRayKeyIndex *keys = scatterRayKeys->data();
-		GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
-				if (index >= queue->size()) 
-					keys[index].key = std::numeric_limits<int64_t>::max();
-				else {
-					ScatterRayQueue::GetSetIndirector w = queue->operator[](index);
-					keys[index].key = static_cast<int64_t>(w.soa->intr.sd.bsdfType[w.i]);
-				}	
-				keys[index].index = index;
-			}, KRR_DEFAULT_STREAM);
-		cudaMemcpyAsync(thrust::raw_pointer_cast(scatterRayKeys1.data()), keys,
-			maxQueueSize * sizeof(ScatterRayKeyIndex), 
-			cudaMemcpyDeviceToDevice, KRR_DEFAULT_STREAM);
-		thrust::sort(thrust::cuda::par_nosync.on(KRR_DEFAULT_STREAM), 
-			thrust::retag<cached_allocator_tag>(scatterRayKeys1.begin()),
-			thrust::retag<cached_allocator_tag>(scatterRayKeys1.end()),
-				//keys, keys + maxQueueSize,
-				[] KRR_DEVICE(const ScatterRayKeyIndex &a, const ScatterRayKeyIndex &b) {
-					return a.key < b.key;
-				});
-		//// sorted to auxiliary buffer
-		GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
-				if (index >= queue->size()) return;
-				ScatterRayQueue::GetSetIndirector w = queue->operator[](keys[index].index);
-				auxBuffer->operator[](index)		= w.operator krr::ScatterRayWorkItem();
-			}, KRR_DEFAULT_STREAM);
-		// blit back
-		GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
-				if (index >= queue->size()) return;
-				queue->operator[](index) =
-					auxBuffer->operator[](index).operator krr::ScatterRayWorkItem();
-			}, KRR_DEFAULT_STREAM);
+		{
+			PROFILE("Update keys");
+			GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
+					if (index >= queue->size()) 
+						keys[index].key = std::numeric_limits<int64_t>::max();
+					else {
+						ScatterRayQueue::GetSetIndirector w = queue->operator[](index);
+						keys[index].key = static_cast<int64_t>(w.soa->intr.sd.bsdfType[w.i]);
+					}	
+					keys[index].index = index;
+				}, KRR_DEFAULT_STREAM);
+		}
+		{
+			PROFILE("Sort indices");
+			thrust::sort(thrust::cuda::par_nosync(*alloc).on(KRR_DEFAULT_STREAM),
+						 keys, keys + maxQueueSize,
+						 [] KRR_DEVICE(const ScatterRayKeyIndex &a, const ScatterRayKeyIndex &b) {
+							 return a.key < b.key;
+						 });
+		}
+		{
+			PROFILE("Reorder and blit");
+			// sorted to auxiliary buffer
+			GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
+					if (index >= queue->size()) return;
+					ScatterRayQueue::GetSetIndirector w = queue->operator[](keys[index].index);
+					auxBuffer->operator[](index)		= w.operator krr::ScatterRayWorkItem();
+				}, KRR_DEFAULT_STREAM);
+			// blit back
+			GPUParallelFor(maxQueueSize, [=] KRR_DEVICE (int index) {
+					if (index >= queue->size()) return;
+					queue->operator[](index) =
+						auxBuffer->operator[](index).operator krr::ScatterRayWorkItem();
+				}, KRR_DEFAULT_STREAM);
+		}
 	}
 	ForAllQueued(
 		scatterRayQueue, maxQueueSize, KRR_DEVICE_LAMBDA(ScatterRayWorkItem& w) {