From 13979d490f88c33c8d3bc98dff5a4bacbc93e374 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Sat, 23 Nov 2024 18:25:58 +0100 Subject: [PATCH 1/4] Latte/Vulkan: Add multiple entry LRU cache support for indices --- src/Cafe/HW/Latte/Core/LatteIndices.cpp | 114 ++++++++----- src/Cafe/HW/Latte/Core/LatteIndices.h | 2 +- src/Cafe/HW/Latte/Core/LatteOverlay.cpp | 6 + .../HW/Latte/Core/LattePerformanceMonitor.cpp | 3 +- .../HW/Latte/Core/LattePerformanceMonitor.h | 6 + src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp | 1 - .../HW/Latte/Renderer/OpenGL/OpenGLRenderer.h | 17 +- src/Cafe/HW/Latte/Renderer/Renderer.h | 11 +- .../Renderer/Vulkan/VKRMemoryManager.cpp | 147 +++++++++++++++-- .../Latte/Renderer/Vulkan/VKRMemoryManager.h | 154 +++++++++++++++--- .../Latte/Renderer/Vulkan/VulkanRenderer.cpp | 6 +- .../HW/Latte/Renderer/Vulkan/VulkanRenderer.h | 5 +- .../Renderer/Vulkan/VulkanRendererCore.cpp | 29 ++-- 13 files changed, 396 insertions(+), 105 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index 6e1d74559..aec51725f 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h" #include "Common/cpu_features.h" #if defined(ARCH_X86_64) && defined(__GNUC__) @@ -9,32 +10,53 @@ struct { - const void* lastPtr; - uint32 lastCount; - LattePrimitiveMode lastPrimitiveMode; - LatteIndexType lastIndexType; - // output - uint32 indexMin; - uint32 indexMax; - Renderer::INDEX_TYPE renderIndexType; - uint32 outputCount; - uint32 indexBufferOffset; - uint32 indexBufferIndex; + struct CacheEntry + { + // input data + const void* lastPtr; + uint32 lastCount; + LattePrimitiveMode lastPrimitiveMode; + LatteIndexType lastIndexType; + uint64 lastUsed; + // output + uint32 indexMin; + uint32 indexMax; + Renderer::INDEX_TYPE renderIndexType; + uint32 outputCount; + Renderer::IndexAllocation indexAllocation; + }; + std::array<CacheEntry, 8> entry; + uint64 currentUsageCounter{0}; }LatteIndexCache{}; void LatteIndices_invalidate(const void* memPtr, uint32 size) { - if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) ) + for(auto& entry : LatteIndexCache.entry) { - LatteIndexCache.lastPtr = nullptr; - LatteIndexCache.lastCount = 0; + if (entry.lastPtr >= memPtr && (entry.lastPtr < ((uint8*)memPtr + size)) ) + { + if(entry.lastPtr != nullptr) + g_renderer->indexData_releaseIndexMemory(entry.indexAllocation); + entry.lastPtr = nullptr; + entry.lastCount = 0; + } } } void LatteIndices_invalidateAll() { - LatteIndexCache.lastPtr = nullptr; - LatteIndexCache.lastCount = 0; + for(auto& entry : LatteIndexCache.entry) + { + if (entry.lastPtr != nullptr) + g_renderer->indexData_releaseIndexMemory(entry.indexAllocation); + entry.lastPtr = nullptr; + entry.lastCount = 0; + } +} + +uint64 LatteIndices_GetNextUsageIndex() +{ + return LatteIndexCache.currentUsageCounter++; } uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count) @@ -532,7 +554,7 @@ void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIn } } -void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& 
renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex) +void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation) { // what this should do: // [x] use fast SIMD-based index decoding @@ -542,17 +564,18 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 // [ ] better cache implementation, allow to cache across frames // reuse from cache if data didn't change - if (LatteIndexCache.lastPtr == indexData && - LatteIndexCache.lastCount == count && - LatteIndexCache.lastPrimitiveMode == primitiveMode && - LatteIndexCache.lastIndexType == indexType) - { - indexMin = LatteIndexCache.indexMin; - indexMax = LatteIndexCache.indexMax; - renderIndexType = LatteIndexCache.renderIndexType; - outputCount = LatteIndexCache.outputCount; - indexBufferOffset = LatteIndexCache.indexBufferOffset; - indexBufferIndex = LatteIndexCache.indexBufferIndex; + auto cacheEntry = std::find_if(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [indexData, count, primitiveMode, indexType](const auto& entry) + { + return entry.lastPtr == indexData && entry.lastCount == count && entry.lastPrimitiveMode == primitiveMode && entry.lastIndexType == indexType; + }); + if (cacheEntry != LatteIndexCache.entry.end()) + { + indexMin = cacheEntry->indexMin; + indexMax = cacheEntry->indexMax; + renderIndexType = cacheEntry->renderIndexType; + outputCount = cacheEntry->outputCount; + indexAllocation = cacheEntry->indexAllocation; + cacheEntry->lastUsed = LatteIndices_GetNextUsageIndex(); return; } @@ -576,10 +599,12 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 indexMin = 0; indexMax = std::max(count, 1u)-1; renderIndexType = Renderer::INDEX_TYPE::NONE; + indexAllocation = {}; return; // no indices } // query index buffer from renderer - void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex); + indexAllocation = g_renderer->indexData_reserveIndexMemory(indexOutputSize); + void* indexOutputPtr = indexAllocation.mem; // decode indices indexMin = std::numeric_limits<uint32>::max(); @@ -704,16 +729,25 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 // recalculate index range but filter out primitive restart index LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax); } - g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize); + g_renderer->indexData_uploadIndexMemory(indexAllocation); + performanceMonitor.cycle[performanceMonitor.cycleIndex].indexDataUploaded += indexOutputSize; + // get least recently used cache entry + auto lruEntry = std::min_element(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [](const auto& a, const auto& b) + { + return a.lastUsed < b.lastUsed; + }); + // invalidate previous allocation + if(lruEntry->lastPtr != nullptr) + g_renderer->indexData_releaseIndexMemory(lruEntry->indexAllocation); // update cache - LatteIndexCache.lastPtr = indexData; - LatteIndexCache.lastCount = count; - LatteIndexCache.lastPrimitiveMode = primitiveMode; - LatteIndexCache.lastIndexType = indexType; - LatteIndexCache.indexMin = indexMin; - LatteIndexCache.indexMax = indexMax; - LatteIndexCache.renderIndexType = renderIndexType; - LatteIndexCache.outputCount = 
outputCount; - LatteIndexCache.indexBufferOffset = indexBufferOffset; - LatteIndexCache.indexBufferIndex = indexBufferIndex; + lruEntry->lastPtr = indexData; + lruEntry->lastCount = count; + lruEntry->lastPrimitiveMode = primitiveMode; + lruEntry->lastIndexType = indexType; + lruEntry->indexMin = indexMin; + lruEntry->indexMax = indexMax; + lruEntry->renderIndexType = renderIndexType; + lruEntry->outputCount = outputCount; + lruEntry->indexAllocation = indexAllocation; + lruEntry->lastUsed = LatteIndices_GetNextUsageIndex(); } diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.h b/src/Cafe/HW/Latte/Core/LatteIndices.h index 917d7991b..8aace24e4 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.h +++ b/src/Cafe/HW/Latte/Core/LatteIndices.h @@ -4,4 +4,4 @@ void LatteIndices_invalidate(const void* memPtr, uint32 size); void LatteIndices_invalidateAll(); -void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex); \ No newline at end of file +void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation); \ No newline at end of file diff --git a/src/Cafe/HW/Latte/Core/LatteOverlay.cpp b/src/Cafe/HW/Latte/Core/LatteOverlay.cpp index 238f85e80..e6edb904d 100644 --- a/src/Cafe/HW/Latte/Core/LatteOverlay.cpp +++ b/src/Cafe/HW/Latte/Core/LatteOverlay.cpp @@ -107,7 +107,13 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio ImGui::Text("VRAM: %dMB / %dMB", g_state.vramUsage, g_state.vramTotal); if (config.overlay.debug) + { + // general debug info + ImGui::Text("--- Debug info ---"); + ImGui::Text("IndexUploadPerFrame: %dKB", (performanceMonitor.stats.indexDataUploadPerFrame+1023)/1024); + // backend specific info g_renderer->AppendOverlayDebugInfo(); + } position.y += (ImGui::GetWindowSize().y + 10.0f) * direction; } diff --git a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp index f27674464..14dfe9a97 100644 --- a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp +++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp @@ -74,7 +74,6 @@ void LattePerformanceMonitor_frameEnd() uniformBankDataUploadedPerFrame /= 1024ULL; uint32 uniformBankCountUploadedPerFrame = (uint32)(uniformBankUploadedCount / (uint64)elapsedFrames); uint64 indexDataUploadPerFrame = (indexDataUploaded / (uint64)elapsedFrames); - indexDataUploadPerFrame /= 1024ULL; double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS; uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames; @@ -82,7 +81,7 @@ void LattePerformanceMonitor_frameEnd() uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime); uint32 tlps = (uint32)((uint64)threadLeaveCount * 1000ULL / (uint64)totalElapsedTime); // set stats - + performanceMonitor.stats.indexDataUploadPerFrame = indexDataUploadPerFrame; // next counter cycle sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES; performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0; diff --git a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h index 713e094e0..7252e6734 
100644 --- a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h +++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h @@ -131,6 +131,12 @@ typedef struct LattePerfStatCounter numDrawBarriersPerFrame; LattePerfStatCounter numBeginRenderpassPerFrame; }vk; + + // calculated stats (per frame) + struct + { + uint32 indexDataUploadPerFrame; + }stats; }performanceMonitor_t; extern performanceMonitor_t performanceMonitor; diff --git a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp index 3bb6c7e30..2efef5bff 100644 --- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp +++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp @@ -11,7 +11,6 @@ #include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h" #include "Cafe/GraphicPack/GraphicPack2.h" #include "config/ActiveSettings.h" -#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" #include "gui/guiWrapper.h" #include "Cafe/OS/libs/erreula/erreula.h" #include "input/InputManager.h" diff --git a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h index 313ea3c0a..e29e9d4c4 100644 --- a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h @@ -102,16 +102,21 @@ class OpenGLRenderer : public Renderer static void SetAttributeArrayState(uint32 index, bool isEnabled, sint32 aluDivisor); static void SetArrayElementBuffer(GLuint arrayElementBuffer); - // index - void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override + // index (not used by OpenGL renderer yet) + IndexAllocation indexData_reserveIndexMemory(uint32 size) override { - assert_dbg(); - return nullptr; + cemu_assert_unimplemented(); + return {}; } - void indexData_uploadIndexMemory(uint32 offset, uint32 size) override + void indexData_releaseIndexMemory(IndexAllocation& allocation) override { - assert_dbg(); + cemu_assert_unimplemented(); + } + + void indexData_uploadIndexMemory(IndexAllocation& allocation) override + { + cemu_assert_unimplemented(); } // uniform diff --git a/src/Cafe/HW/Latte/Renderer/Renderer.h b/src/Cafe/HW/Latte/Renderer/Renderer.h index 0b694bb95..77d588b96 100644 --- a/src/Cafe/HW/Latte/Renderer/Renderer.h +++ b/src/Cafe/HW/Latte/Renderer/Renderer.h @@ -138,8 +138,15 @@ class Renderer virtual void draw_endSequence() = 0; // index - virtual void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) = 0; - virtual void indexData_uploadIndexMemory(uint32 offset, uint32 size) = 0; + struct IndexAllocation + { + void* mem; // pointer to index data inside buffer + void* rendererInternal; // for renderer use + }; + + virtual IndexAllocation indexData_reserveIndexMemory(uint32 size) = 0; + virtual void indexData_releaseIndexMemory(IndexAllocation& allocation) = 0; + virtual void indexData_uploadIndexMemory(IndexAllocation& allocation) = 0; // occlusion queries virtual LatteQueryObject* occlusionQuery_create() = 0; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp index c4f47a2bd..33af36515 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp @@ -23,11 +23,11 @@ void VKRSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeReq AllocatorBuffer_t newBuffer{}; newBuffer.writeIndex = 0; newBuffer.basePtr = nullptr; - if (m_bufferType == BUFFER_TYPE::STAGING) + if (m_bufferType == VKR_BUFFER_TYPE::STAGING) 
m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, newBuffer.vk_buffer, newBuffer.vk_mem); - else if (m_bufferType == BUFFER_TYPE::INDEX) + else if (m_bufferType == VKR_BUFFER_TYPE::INDEX) m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem); - else if (m_bufferType == BUFFER_TYPE::STRIDE) + else if (m_bufferType == VKR_BUFFER_TYPE::STRIDE) m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem); else cemu_assert_debug(false); @@ -53,7 +53,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato uint32 distanceToSyncPoint; if (!itr.queue_syncPoints.empty()) { - if(itr.queue_syncPoints.front().offset < itr.writeIndex) + if (itr.queue_syncPoints.front().offset < itr.writeIndex) distanceToSyncPoint = 0xFFFFFFFF; else distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex; @@ -100,7 +100,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato void VKRSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation) { - cemu_assert_debug(m_bufferType == BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent + cemu_assert_debug(m_bufferType == VKR_BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent // todo - use nonCoherentAtomSize for flush size (instead of hardcoded constant) VkMappedMemoryRange flushedRange{}; flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; @@ -167,6 +167,70 @@ void VKRSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBuf } } +/* VKRSynchronizedHeapAllocator */ + +VKRSynchronizedHeapAllocator::VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize) + : m_vkrMemMgr(vkMemoryManager), m_chunkedHeap(bufferType, minimumBufferAllocSize) {}; + +VKRSynchronizedHeapAllocator::AllocatorReservation* VKRSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment) +{ + CHAddr addr = m_chunkedHeap.alloc(size, alignment); + m_activeAllocations.emplace_back(addr); + AllocatorReservation* res = m_poolAllocatorReservation.allocObj(); + res->bufferIndex = addr.chunkIndex; + res->bufferOffset = addr.offset; + res->size = size; + res->memPtr = m_chunkedHeap.GetChunkPtr(addr.chunkIndex) + addr.offset; + m_chunkedHeap.GetChunkVkMemInfo(addr.chunkIndex, res->vkBuffer, res->vkMem); + return res; +} + +void VKRSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation) +{ + // put the allocation on a delayed release queue for the current command buffer + uint64 currentCommandBufferId = VulkanRenderer::GetInstance()->GetCurrentCommandBufferId(); + auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [&uploadReservation](const TrackedAllocation& allocation) { return allocation.allocation.chunkIndex == uploadReservation->bufferIndex && allocation.allocation.offset == uploadReservation->bufferOffset; }); + cemu_assert_debug(it != m_activeAllocations.end()); + m_releaseQueue[currentCommandBufferId].emplace_back(it->allocation); + m_activeAllocations.erase(it); + m_poolAllocatorReservation.freeObj(uploadReservation); +} + +void 
VKRSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation) +{ + if (m_chunkedHeap.RequiresFlush(uploadReservation->bufferIndex)) + { + VkMappedMemoryRange flushedRange{}; + flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + flushedRange.memory = uploadReservation->vkMem; + flushedRange.offset = uploadReservation->bufferOffset; + flushedRange.size = uploadReservation->size; + vkFlushMappedMemoryRanges(VulkanRenderer::GetInstance()->GetLogicalDevice(), 1, &flushedRange); + } +} + +void VKRSynchronizedHeapAllocator::CleanupBuffer(uint64 latestFinishedCommandBufferId) +{ + auto it = m_releaseQueue.begin(); + while (it != m_releaseQueue.end()) + { + if (it->first <= latestFinishedCommandBufferId) + { + // release allocations + for(auto& addr : it->second) + m_chunkedHeap.free(addr); + it = m_releaseQueue.erase(it); + continue; + } + it++; + } +} + +void VKRSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const +{ + m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize); +} + /* VkTextureChunkedHeap */ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA @@ -175,7 +239,7 @@ m_list_chunkInfo.resize(m_list_chunkInfo.size() + 1); // pad minimumAllocationSize to 32KB alignment - minimumAllocationSize = (minimumAllocationSize + (32*1024-1)) & ~(32 * 1024 - 1); + minimumAllocationSize = (minimumAllocationSize + (32 * 1024 - 1)) & ~(32 * 1024 - 1); uint32 allocationSize = 1024 * 1024 * 128; if (chunkIndex == 0) @@ -189,8 +253,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA std::vector<uint32> deviceLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); std::vector<uint32> hostLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, 0); // remove device local memory types from host local vector - auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) ->bool - { + auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) -> bool { return std::find(deviceLocalMemoryTypeIndices.begin(), deviceLocalMemoryTypeIndices.end(), v) != deviceLocalMemoryTypeIndices.end(); }; hostLocalMemoryTypeIndices.erase(std::remove_if(hostLocalMemoryTypeIndices.begin(), hostLocalMemoryTypeIndices.end(), pred), hostLocalMemoryTypeIndices.end()); @@ -206,7 +269,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA allocInfo.memoryTypeIndex = memType; VkDeviceMemory imageMemory; - VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory); + VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory); if (r != VK_SUCCESS) continue; m_list_chunkInfo[chunkIndex].mem = imageMemory; @@ -221,7 +284,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA allocInfo.memoryTypeIndex = memType; VkDeviceMemory imageMemory; - VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory); + VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory); if (r != VK_SUCCESS) continue; m_list_chunkInfo[chunkIndex].mem = imageMemory; @@ -238,6 +301,66 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA return 0; } +/* VkBufferChunkedHeap */ + +VKRBuffer* 
VKRBuffer::Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties) +{ + auto* memMgr = VulkanRenderer::GetInstance()->GetMemoryManager(); + VkBuffer buffer; + VkDeviceMemory bufferMemory; + bool allocSuccess; + if (bufferType == VKR_BUFFER_TYPE::STAGING) + allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, properties, buffer, bufferMemory); + else if (bufferType == VKR_BUFFER_TYPE::INDEX) + allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, properties, buffer, bufferMemory); + else if (bufferType == VKR_BUFFER_TYPE::STRIDE) + allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, properties, buffer, bufferMemory); + else + cemu_assert_debug(false); + if (!allocSuccess) + return nullptr; + + VKRBuffer* bufferObj = new VKRBuffer(buffer, bufferMemory); + // if host visible, then map buffer + void* data = nullptr; + if (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) + { + vkMapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), bufferMemory, 0, bufferSize, 0, &data); + bufferObj->m_requiresFlush = !HAS_FLAG(properties, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + } + bufferObj->m_mappedMemory = (uint8*)data; + return bufferObj; +} + +VKRBuffer::~VKRBuffer() +{ + if(m_mappedMemory) + vkUnmapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory); + vkFreeMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory, nullptr); + vkDestroyBuffer(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_buffer, nullptr); +} + +VkBufferChunkedHeap::~VkBufferChunkedHeap() +{ + for (auto& chunk : m_chunkBuffers) + delete chunk; +} + +uint32 VkBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) +{ + size_t allocationSize = std::max<size_t>(m_minimumBufferAllocationSize, minimumAllocationSize); + VKRBuffer* buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + if(!buffer) + buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + if(!buffer) + VulkanRenderer::GetInstance()->UnrecoverableError("Failed to allocate buffer memory for VkBufferChunkedHeap"); + cemu_assert_debug(buffer); + cemu_assert_debug(m_chunkBuffers.size() == chunkIndex); + m_chunkBuffers.emplace_back(buffer); + // todo - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT might be worth it? + return allocationSize; +} + uint32_t VKRMemoryManager::FindMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) const { VkPhysicalDeviceMemoryProperties memProperties; @@ -423,7 +546,7 @@ bool VKRMemoryManager::CreateBufferFromHostMemory(void* hostPointer, VkDeviceSiz importHostMem.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT; importHostMem.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT; importHostMem.pHostPointer = hostPointer; - // VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or + // VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or // VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT // whats the difference ? 
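For orientation, the caller-side lifecycle these pieces add up to looks roughly like the sketch below. This is a minimal illustration, not code from the patch: it assumes the Cemu integer typedefs (uint16/uint32), <cstring> for memcpy, and only the interfaces introduced in this diff (Renderer::IndexAllocation and the indexData_* virtuals); UploadIndices itself is a hypothetical helper.

// hypothetical helper, not part of the patch
void UploadIndices(Renderer* renderer, const uint16* srcIndices, uint32 count)
{
	// reserve space inside one of the heap-managed, host-visible index buffers
	Renderer::IndexAllocation alloc = renderer->indexData_reserveIndexMemory(count * sizeof(uint16));
	// write the decoded indices through the persistently mapped pointer
	memcpy(alloc.mem, srcIndices, count * sizeof(uint16));
	// flush the written range if the backing memory is not host-coherent (no-op otherwise)
	renderer->indexData_uploadIndexMemory(alloc);
	// the draw then binds the buffer via the AllocatorReservation kept in alloc.rendererInternal;
	// when the LRU cache later evicts or invalidates the entry, the allocation is queued
	// for release and only freed once the associated command buffer has finished on the GPU
	renderer->indexData_releaseIndexMemory(alloc);
}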
@@ -469,7 +592,7 @@ VkImageMemAllocation* VKRMemoryManager::imageMemoryAllocate(VkImage image) auto it = map_textureHeap.find(typeFilter); if (it == map_textureHeap.end()) { - texHeap = new VkTextureChunkedHeap(this, typeFilter, m_vkr->GetLogicalDevice()); + texHeap = new VkTextureChunkedHeap(this, typeFilter); map_textureHeap.emplace(typeFilter, texHeap); } else diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h index bf2d919b3..ecf539961 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h @@ -2,6 +2,36 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" #include "util/ChunkedHeap/ChunkedHeap.h" +#include "util/helpers/MemoryPool.h" + +enum class VKR_BUFFER_TYPE +{ + STAGING, // staging upload buffer + INDEX, // buffer for index data + STRIDE, // buffer for stride-adjusted vertex data +}; + +class VKRBuffer +{ + public: + static VKRBuffer* Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties); + ~VKRBuffer(); + + VkBuffer GetVkBuffer() const { return m_buffer; } + VkDeviceMemory GetVkBufferMemory() const { return m_bufferMemory; } + + uint8* GetPtr() const { return m_mappedMemory; } + + bool RequiresFlush() const { return m_requiresFlush; } + + private: + VKRBuffer(VkBuffer buffer, VkDeviceMemory bufferMem) : m_buffer(buffer), m_bufferMemory(bufferMem) { }; + + VkBuffer m_buffer; + VkDeviceMemory m_bufferMemory; + uint8* m_mappedMemory; + bool m_requiresFlush{false}; +}; struct VkImageMemAllocation { @@ -17,15 +47,13 @@ struct VkImageMemAllocation class VkTextureChunkedHeap : private ChunkedHeap { public: - VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter, VkDevice device) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter), m_device(device) { }; + VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { }; struct ChunkInfo { VkDeviceMemory mem; }; - uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override; - CHAddr allocMem(uint32 size, uint32 alignment) { if (alignment < 4) @@ -43,11 +71,6 @@ class VkTextureChunkedHeap : private ChunkedHeap this->free(addr); } - void setDevice(VkDevice dev) - { - m_device = dev; - } - VkDeviceMemory getChunkMem(uint32 index) { if (index >= m_list_chunkInfo.size()) @@ -61,24 +84,69 @@ class VkTextureChunkedHeap : private ChunkedHeap allocatedBytes = numAllocatedBytes; } - VkDevice m_device; + private: + uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override; + uint32 m_typeFilter{ 0xFFFFFFFF }; class VKRMemoryManager* m_vkrMemoryManager; std::vector<ChunkInfo> m_list_chunkInfo; }; +class VkBufferChunkedHeap : private ChunkedHeap +{ + public: + VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { }; + ~VkBufferChunkedHeap(); + + using ChunkedHeap::alloc; + using ChunkedHeap::free; + + uint8* GetChunkPtr(uint32 index) const + { + if (index >= m_chunkBuffers.size()) + return nullptr; + return m_chunkBuffers[index]->GetPtr(); + } + + void GetChunkVkMemInfo(uint32 index, VkBuffer& buffer, VkDeviceMemory& mem) + { + if (index >= m_chunkBuffers.size()) + { + buffer = VK_NULL_HANDLE; + mem = VK_NULL_HANDLE; + return; + } + buffer = 
m_chunkBuffers[index]->GetVkBuffer(); + mem = m_chunkBuffers[index]->GetVkBufferMemory(); + } + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const + { + numBuffers = m_chunkBuffers.size(); + totalBufferSize = numHeapBytes; + freeBufferSize = numHeapBytes - numAllocatedBytes; + } + + bool RequiresFlush(uint32 index) const + { + if (index >= m_chunkBuffers.size()) + return false; + return m_chunkBuffers[index]->RequiresFlush(); + } + + private: + uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override; + + VKR_BUFFER_TYPE m_bufferType; + std::vector<VKRBuffer*> m_chunkBuffers; + size_t m_minimumBufferAllocationSize; +}; + // a circular ring-buffer which tracks and releases memory per command-buffer class VKRSynchronizedRingAllocator { public: - enum class BUFFER_TYPE - { - STAGING, // staging upload buffer - INDEX, // buffer for index data - STRIDE, // buffer for stride-adjusted vertex data - }; - - VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {}; + VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {}; VKRSynchronizedRingAllocator(const VKRSynchronizedRingAllocator&) = delete; // disallow copy struct BufferSyncPoint_t @@ -126,13 +194,53 @@ class VKRSynchronizedRingAllocator const class VulkanRenderer* m_vkr; const class VKRMemoryManager* m_vkrMemMgr; - const BUFFER_TYPE m_bufferType; + const VKR_BUFFER_TYPE m_bufferType; const uint32 m_minimumBufferAllocSize; std::vector<AllocatorBuffer_t> m_buffers; }; +// heap style allocator with released memory being freed after the current command buffer finishes +class VKRSynchronizedHeapAllocator +{ + struct TrackedAllocation + { + TrackedAllocation(CHAddr allocation) : allocation(allocation) {}; + CHAddr allocation; + }; + + public: + VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize); + VKRSynchronizedHeapAllocator(const VKRSynchronizedHeapAllocator&) = delete; // disallow copy + + struct AllocatorReservation + { + VkBuffer vkBuffer; + VkDeviceMemory vkMem; + uint8* memPtr; + uint32 bufferOffset; + uint32 size; + uint32 bufferIndex; + }; + + AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment); + void FreeReservation(AllocatorReservation* uploadReservation); + void FlushReservation(AllocatorReservation* uploadReservation); + + void CleanupBuffer(uint64 latestFinishedCommandBufferId); + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const; + private: + const class VKRMemoryManager* m_vkrMemMgr; + VkBufferChunkedHeap m_chunkedHeap; + // allocations + std::vector<TrackedAllocation> m_activeAllocations; + MemoryPool<AllocatorReservation> m_poolAllocatorReservation{32}; + // release queue + std::unordered_map<uint64, std::vector<CHAddr>> m_releaseQueue; +}; + void LatteIndices_invalidateAll(); class VKRMemoryManager { friend class VKRSynchronizedRingAllocator; public: VKRMemoryManager(class VulkanRenderer* renderer) : - m_stagingBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STAGING, 32u * 1024 * 1024), - 
m_indexBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::INDEX, 4u * 1024 * 1024), - m_vertexStrideMetalBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STRIDE, 4u * 1024 * 1024) + m_stagingBuffer(renderer, this, VKR_BUFFER_TYPE::STAGING, 32u * 1024 * 1024), + m_indexBuffer(this, VKR_BUFFER_TYPE::INDEX, 4u * 1024 * 1024), + m_vertexStrideMetalBuffer(renderer, this, VKR_BUFFER_TYPE::STRIDE, 4u * 1024 * 1024) { m_vkr = renderer; } @@ -167,7 +275,7 @@ class VKRMemoryManager } VKRSynchronizedRingAllocator& getStagingAllocator() { return m_stagingBuffer; }; // allocator for texture/attribute/uniform uploads - VKRSynchronizedRingAllocator& getIndexAllocator() { return m_indexBuffer; }; // allocator for index data + VKRSynchronizedHeapAllocator& GetIndexAllocator() { return m_indexBuffer; }; // allocator for index data VKRSynchronizedRingAllocator& getMetalStrideWorkaroundAllocator() { return m_vertexStrideMetalBuffer; }; // allocator for stride-adjusted vertex data void cleanupBuffers(uint64 latestFinishedCommandBufferId) @@ -202,6 +310,6 @@ class VKRMemoryManager private: class VulkanRenderer* m_vkr; VKRSynchronizedRingAllocator m_stagingBuffer; - VKRSynchronizedRingAllocator m_indexBuffer; + VKRSynchronizedHeapAllocator m_indexBuffer; VKRSynchronizedRingAllocator m_vertexStrideMetalBuffer; }; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index 37432eebe..201639875 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -3699,7 +3699,7 @@ void VulkanRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uin void VulkanRenderer::AppendOverlayDebugInfo() { - ImGui::Text("--- Vulkan info ---"); + ImGui::Text("--- Vulkan debug info ---"); ImGui::Text("GfxPipelines %u", performanceMonitor.vk.numGraphicPipelines.get()); ImGui::Text("DescriptorSets %u", performanceMonitor.vk.numDescriptorSets.get()); ImGui::Text("DS ImgSamplers %u", performanceMonitor.vk.numDescriptorSamplerTextures.get()); @@ -3716,7 +3716,7 @@ void VulkanRenderer::AppendOverlayDebugInfo() ImGui::Text("BeginRP/f %u", performanceMonitor.vk.numBeginRenderpassPerFrame.get()); ImGui::Text("Barriers/f %u", performanceMonitor.vk.numDrawBarriersPerFrame.get()); - ImGui::Text("--- Cache info ---"); + ImGui::Text("--- Cache debug info ---"); uint32 bufferCacheHeapSize = 0; uint32 bufferCacheAllocationSize = 0; @@ -3736,7 +3736,7 @@ void VulkanRenderer::AppendOverlayDebugInfo() ImGui::SameLine(60.0f); ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers); - memoryManager->getIndexAllocator().GetStats(numBuffers, totalSize, freeSize); + memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize); ImGui::Text("Index"); ImGui::SameLine(60.0f); ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers); diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h index 52c1c6ed2..5ef4558da 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h @@ -328,8 +328,9 @@ class VulkanRenderer : public Renderer RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool 
isGfxPackShader) override; - void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override; - void indexData_uploadIndexMemory(uint32 offset, uint32 size) override; + IndexAllocation indexData_reserveIndexMemory(uint32 size) override; + void indexData_releaseIndexMemory(IndexAllocation& allocation) override; + void indexData_uploadIndexMemory(IndexAllocation& allocation) override; // externally callable void GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, sint32 width, sint32 height, FormatInfoVK* formatInfoOut); diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp index 3a6840728..a72b093ba 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp @@ -357,18 +357,20 @@ PipelineInfo* VulkanRenderer::draw_getOrCreateGraphicsPipeline(uint32 indexCount return draw_createGraphicsPipeline(indexCount); } -void* VulkanRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) +Renderer::IndexAllocation VulkanRenderer::indexData_reserveIndexMemory(uint32 size) { - auto& indexAllocator = this->memoryManager->getIndexAllocator(); - auto resv = indexAllocator.AllocateBufferMemory(size, 32); - offset = resv.bufferOffset; - bufferIndex = resv.bufferIndex; - return resv.memPtr; + VKRSynchronizedHeapAllocator::AllocatorReservation* resv = memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 32); + return { resv->memPtr, resv }; } -void VulkanRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) +void VulkanRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation) { - // does nothing since the index buffer memory is coherent + memoryManager->GetIndexAllocator().FreeReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal); +} + +void VulkanRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation) +{ + memoryManager->GetIndexAllocator().FlushReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal); } float s_vkUniformData[512 * 4]; @@ -1415,14 +1417,15 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 hostIndexCount; uint32 indexMin = 0; uint32 indexMax = 0; - uint32 indexBufferOffset = 0; - uint32 indexBufferIndex = 0; - LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); - + Renderer::IndexAllocation indexAllocation; + LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation); + VKRSynchronizedHeapAllocator::AllocatorReservation* indexReservation = (VKRSynchronizedHeapAllocator::AllocatorReservation*)indexAllocation.rendererInternal; // update index binding bool isPrevIndexData = false; if (hostIndexType != INDEX_TYPE::NONE) { + uint32 indexBufferIndex = indexReservation->bufferIndex; + uint32 indexBufferOffset = indexReservation->bufferOffset; if (m_state.activeIndexBufferOffset != indexBufferOffset || m_state.activeIndexBufferIndex != indexBufferIndex || m_state.activeIndexType != hostIndexType) { m_state.activeIndexType = hostIndexType; @@ -1435,7 +1438,7 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 vkType = 
VK_INDEX_TYPE_UINT32; else cemu_assert(false); - vkCmdBindIndexBuffer(m_state.currentCommandBuffer, memoryManager->getIndexAllocator().GetBufferByIndex(indexBufferIndex), indexBufferOffset, vkType); + vkCmdBindIndexBuffer(m_state.currentCommandBuffer, indexReservation->vkBuffer, indexBufferOffset, vkType); } else isPrevIndexData = true; From e97493b2a1687b66cd283eddec375d459118e6fd Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Thu, 5 Dec 2024 12:17:18 +0100 Subject: [PATCH 2/4] Optimize ChunkedHeap --- .../Latte/Renderer/Vulkan/VKRMemoryManager.h | 12 +- src/Common/precompiled.h | 19 ++ src/util/ChunkedHeap/ChunkedHeap.h | 170 +++++++++--------- 3 files changed, 114 insertions(+), 87 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h index ecf539961..08af5882d 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h @@ -44,7 +44,7 @@ struct VkImageMemAllocation uint32 getAllocationSize() { return allocationSize; } }; -class VkTextureChunkedHeap : private ChunkedHeap +class VkTextureChunkedHeap : private ChunkedHeap<> { public: VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { }; @@ -80,8 +80,8 @@ class VkTextureChunkedHeap : private ChunkedHeap void getStatistics(uint32& totalHeapSize, uint32& allocatedBytes) const { - totalHeapSize = numHeapBytes; - allocatedBytes = numAllocatedBytes; + totalHeapSize = m_numHeapBytes; + allocatedBytes = m_numAllocatedBytes; } private: @@ -92,7 +92,7 @@ class VkTextureChunkedHeap : private ChunkedHeap std::vector m_list_chunkInfo; }; -class VkBufferChunkedHeap : private ChunkedHeap +class VkBufferChunkedHeap : private ChunkedHeap<> { public: VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { }; @@ -123,8 +123,8 @@ class VkBufferChunkedHeap : private ChunkedHeap void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const { numBuffers = m_chunkBuffers.size(); - totalBufferSize = numHeapBytes; - freeBufferSize = numHeapBytes - numAllocatedBytes; + totalBufferSize = m_numHeapBytes; + freeBufferSize = m_numHeapBytes - m_numAllocatedBytes; } bool RequiresFlush(uint32 index) const diff --git a/src/Common/precompiled.h b/src/Common/precompiled.h index d4df43437..3dfeaf74f 100644 --- a/src/Common/precompiled.h +++ b/src/Common/precompiled.h @@ -274,6 +274,25 @@ inline uint64 _udiv128(uint64 highDividend, uint64 lowDividend, uint64 divisor, #define NOEXPORT __attribute__ ((visibility ("hidden"))) #endif +#if defined(_MSC_VER) +#define FORCE_INLINE __forceinline +#elif defined(__GNUC__) || defined(__clang__) +#define FORCE_INLINE inline __attribute__((always_inline)) +#else +#define FORCE_INLINE +#endif + +FORCE_INLINE inline int BSF(uint32 v) // returns index of first bit set, counting from LSB. If v is 0 then result is undefined +{ +#if defined(_MSC_VER) + return _tzcnt_u32(v); // TZCNT requires BMI1. 
But if not supported it will execute as BSF +#elif defined(__GNUC__) || defined(__clang__) + return __builtin_ctz(v); +#else + return std::countr_zero(v); +#endif +} + // On aarch64 we handle some of the x86 intrinsics by implementing them as wrappers #if defined(__aarch64__) diff --git a/src/util/ChunkedHeap/ChunkedHeap.h b/src/util/ChunkedHeap/ChunkedHeap.h index abc454293..21a1b868e 100644 --- a/src/util/ChunkedHeap/ChunkedHeap.h +++ b/src/util/ChunkedHeap/ChunkedHeap.h @@ -1,35 +1,39 @@ #pragma once +#include <bit> + struct CHAddr { uint32 offset; uint32 chunkIndex; + void* internal; // AllocRange - CHAddr(uint32 _offset, uint32 _chunkIndex) : offset(_offset), chunkIndex(_chunkIndex) {}; + CHAddr(uint32 _offset, uint32 _chunkIndex, void* internal = nullptr) : offset(_offset), chunkIndex(_chunkIndex), internal(internal) {}; CHAddr() : offset(0xFFFFFFFF), chunkIndex(0xFFFFFFFF) {}; bool isValid() { return chunkIndex != 0xFFFFFFFF; }; static CHAddr getInvalid() { return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); }; }; +template<uint32 TMinimumAlignment = 32> class ChunkedHeap { - struct allocRange_t + struct AllocRange { - allocRange_t* nextFree{}; - allocRange_t* prevFree{}; - allocRange_t* prevOrdered{}; - allocRange_t* nextOrdered{}; + AllocRange* nextFree{}; + AllocRange* prevFree{}; + AllocRange* prevOrdered{}; + AllocRange* nextOrdered{}; uint32 offset; uint32 chunkIndex; uint32 size; bool isFree; - allocRange_t(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {}; + AllocRange(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {}; }; - struct chunk_t + struct Chunk { - std::unordered_map<uint32, allocRange_t*> map_allocatedRange; + uint32 size; }; public: @@ -47,45 +51,32 @@ class ChunkedHeap _free(addr); } - virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) - { - return 0; - } + virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) = 0; private: unsigned ulog2(uint32 v) { - static const unsigned MUL_DE_BRUIJN_BIT[] = - { - 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, - 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 - }; - - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - - return MUL_DE_BRUIJN_BIT[(v * 0x07C4ACDDu) >> 27]; + cemu_assert_debug(v != 0); + return 31 - std::countl_zero(v); } - void trackFreeRange(allocRange_t* range) + void trackFreeRange(AllocRange* range) { // get index of msb cemu_assert_debug(range->size != 0); // size of zero is not allowed uint32 bucketIndex = ulog2(range->size); - range->nextFree = bucketFreeRange[bucketIndex]; - if (bucketFreeRange[bucketIndex]) - bucketFreeRange[bucketIndex]->prevFree = range; + range->nextFree = m_bucketFreeRange[bucketIndex]; + if (m_bucketFreeRange[bucketIndex]) + m_bucketFreeRange[bucketIndex]->prevFree = range; range->prevFree = nullptr; - bucketFreeRange[bucketIndex] = range; + m_bucketFreeRange[bucketIndex] = range; + m_bucketUseMask |= (1u << bucketIndex); } - void forgetFreeRange(allocRange_t* range, uint32 bucketIndex) + void forgetFreeRange(AllocRange* range, uint32 bucketIndex) { - allocRange_t* prevRange = range->prevFree; - allocRange_t* nextRange = range->nextFree; + AllocRange* prevRange = range->prevFree; + AllocRange* nextRange = range->nextFree; if (prevRange) { prevRange->nextFree = nextRange; @@ -94,36 +85,42 @@ class ChunkedHeap } else { - if 
(bucketFreeRange[bucketIndex] != range) - assert_dbg(); - bucketFreeRange[bucketIndex] = nextRange; + cemu_assert_debug(m_bucketFreeRange[bucketIndex] == range); + m_bucketFreeRange[bucketIndex] = nextRange; if (nextRange) nextRange->prevFree = nullptr; + else + m_bucketUseMask &= ~(1u << bucketIndex); } } bool allocateChunk(uint32 minimumAllocationSize) { - uint32 chunkIndex = (uint32)list_chunks.size(); - list_chunks.emplace_back(new chunk_t()); + uint32 chunkIndex = (uint32)m_chunks.size(); + m_chunks.emplace_back(); uint32 chunkSize = allocateNewChunk(chunkIndex, minimumAllocationSize); + cemu_assert_debug((chunkSize%TMinimumAlignment) == 0); // chunk size should be a multiple of the minimum alignment if (chunkSize == 0) return false; - allocRange_t* range = new allocRange_t(0, chunkIndex, chunkSize, true); + cemu_assert_debug(chunkSize < 0x80000000u); // chunk size must be below 2GB + AllocRange* range = m_allocEntriesPool.allocObj(0, chunkIndex, chunkSize, true); trackFreeRange(range); - numHeapBytes += chunkSize; + m_numHeapBytes += chunkSize; return true; } - void _allocFrom(allocRange_t* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize) + void _allocFrom(AllocRange* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize) { + cemu_assert_debug(allocSize > 0); // remove the range from the chain of free ranges forgetFreeRange(range, bucketIndex); // split head, allocation and tail into separate ranges - if (allocOffset > range->offset) + uint32 headBytes = allocOffset - range->offset; + if (headBytes > 0) { // alignment padding -> create free range - allocRange_t* head = new allocRange_t(range->offset, range->chunkIndex, allocOffset - range->offset, true); + cemu_assert_debug(headBytes >= TMinimumAlignment); + AllocRange* head = m_allocEntriesPool.allocObj(range->offset, range->chunkIndex, headBytes, true); trackFreeRange(head); if (range->prevOrdered) range->prevOrdered->nextOrdered = head; @@ -131,10 +128,12 @@ class ChunkedHeap head->nextOrdered = range; range->prevOrdered = head; } - if ((allocOffset + allocSize) < (range->offset + range->size)) // todo - create only if it's more than a couple of bytes? 
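// Worked example for the split above (hypothetical numbers, not from the patch):
// a free range covers [0,256) and _allocFrom is asked for size=64 at alignment=128,
// with TMinimumAlignment assumed to be 32:
//   alignedOffset = (0 + 127) & ~127 = 128
//   headBytes = 128 - 0   -> alignment padding [0,128) becomes a new free range
//   allocation            -> [128,192), the original range marked !isFree
//   tailBytes = 256 - 192 -> leftover [192,256) becomes a new free range
// Both fragments are re-inserted into their ulog2(size) bucket, so later
// allocations can reuse them without scanning the whole heap.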
+ uint32 tailBytes = (range->offset + range->size) - (allocOffset + allocSize); + if (tailBytes > 0) { // tail -> create free range - allocRange_t* tail = new allocRange_t((allocOffset + allocSize), range->chunkIndex, (range->offset + range->size) - (allocOffset + allocSize), true); + cemu_assert_debug(tailBytes >= TMinimumAlignment); + AllocRange* tail = m_allocEntriesPool.allocObj((allocOffset + allocSize), range->chunkIndex, tailBytes, true); trackFreeRange(tail); if (range->nextOrdered) range->nextOrdered->prevOrdered = tail; @@ -149,36 +148,51 @@ class ChunkedHeap CHAddr _alloc(uint32 size, uint32 alignment) { + cemu_assert_debug(size <= (0x7FFFFFFFu-TMinimumAlignment)); + // make sure size is not zero and align it + if(size == 0) [[unlikely]] + size = TMinimumAlignment; + else + size = (size + (TMinimumAlignment - 1)) & ~(TMinimumAlignment - 1); // find smallest bucket to scan uint32 alignmentM1 = alignment - 1; uint32 bucketIndex = ulog2(size); - while (bucketIndex < 32) + // check if the bucket is available + if( !(m_bucketUseMask & (1u << bucketIndex)) ) { - allocRange_t* range = bucketFreeRange[bucketIndex]; + // skip to next non-empty bucket + uint32 nextIndex = BSF(m_bucketUseMask>>bucketIndex); + bucketIndex += nextIndex; + } + while (bucketIndex < 31) + { + AllocRange* range = m_bucketFreeRange[bucketIndex]; while (range) { if (range->size >= size) { // verify if aligned allocation fits uint32 alignedOffset = (range->offset + alignmentM1) & ~alignmentM1; - uint32 alignmentLoss = alignedOffset - range->offset; - if (alignmentLoss < range->size && (range->size - alignmentLoss) >= size) + uint32 endOffset = alignedOffset + size; + if((range->offset+range->size) >= endOffset) { _allocFrom(range, bucketIndex, alignedOffset, size); - list_chunks[range->chunkIndex]->map_allocatedRange.emplace(alignedOffset, range); - numAllocatedBytes += size; - return CHAddr(alignedOffset, range->chunkIndex); + m_numAllocatedBytes += size; + return CHAddr(alignedOffset, range->chunkIndex, range); } } range = range->nextFree; } - bucketIndex++; // try higher bucket + // check next non-empty bucket or skip to end + bucketIndex++; + uint32 emptyBuckets = BSF(m_bucketUseMask>>bucketIndex); + bucketIndex += emptyBuckets; } - if(allocationLimitReached) + if(m_allocationLimitReached) return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); if (!allocateChunk(size)) { - allocationLimitReached = true; + m_allocationLimitReached = true; return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); } return _alloc(size, alignment); @@ -186,24 +200,16 @@ class ChunkedHeap void _free(CHAddr addr) { - auto it = list_chunks[addr.chunkIndex]->map_allocatedRange.find(addr.offset); - if (it == list_chunks[addr.chunkIndex]->map_allocatedRange.end()) + if(!addr.internal) { cemuLog_log(LogType::Force, "Internal heap error. 
{:08x} {:08x}", addr.chunkIndex, addr.offset); - cemuLog_log(LogType::Force, "Debug info:"); - for (auto& rangeItr : list_chunks[addr.chunkIndex]->map_allocatedRange) - { - cemuLog_log(LogType::Force, "{:08x} {:08x}", rangeItr.second->offset, rangeItr.second->size); - } return; } - - allocRange_t* range = it->second; - numAllocatedBytes -= it->second->size; - list_chunks[range->chunkIndex]->map_allocatedRange.erase(it); + AllocRange* range = (AllocRange*)addr.internal; + m_numAllocatedBytes -= range->size; // try merge left or right - allocRange_t* prevRange = range->prevOrdered; - allocRange_t* nextRange = range->nextOrdered; + AllocRange* prevRange = range->prevOrdered; + AllocRange* nextRange = range->nextOrdered; if (prevRange && prevRange->isFree) { if (nextRange && nextRange->isFree) @@ -216,8 +222,8 @@ class ChunkedHeap forgetFreeRange(prevRange, ulog2(prevRange->size)); prevRange->size = newSize; trackFreeRange(prevRange); - delete range; - delete nextRange; + m_allocEntriesPool.freeObj(range); + m_allocEntriesPool.freeObj(nextRange); } else { @@ -228,7 +234,7 @@ class ChunkedHeap forgetFreeRange(prevRange, ulog2(prevRange->size)); prevRange->size = newSize; trackFreeRange(prevRange); - delete range; + m_allocEntriesPool.freeObj(range); } } else if (nextRange && nextRange->isFree) @@ -242,7 +248,7 @@ class ChunkedHeap range->prevOrdered->nextOrdered = nextRange; nextRange->prevOrdered = range->prevOrdered; trackFreeRange(nextRange); - delete range; + m_allocEntriesPool.freeObj(range); } else { @@ -265,7 +271,7 @@ class ChunkedHeap for (uint32 i = 0; i < 32; i++) { - allocRange_t* ar = bucketFreeRange[i]; + AllocRange* ar = m_bucketFreeRange[i]; while (ar) { availableRange_t dbgRange; @@ -278,7 +284,7 @@ class ChunkedHeap if (itr.chunkIndex != dbgRange.chunkIndex) continue; if (itr.offset < (dbgRange.offset + dbgRange.size) && (itr.offset + itr.size) >(dbgRange.offset)) - assert_dbg(); + cemu_assert_error(); } availRanges.emplace_back(dbgRange); @@ -290,14 +296,16 @@ class ChunkedHeap } private: - std::vector list_chunks; - allocRange_t* bucketFreeRange[32]{}; - bool allocationLimitReached = false; + std::vector m_chunks; + uint32 m_bucketUseMask{0x80000000}; // bitmask indicating non-empty buckets. 
MSB always set to provide an upper bound for BSF instruction + AllocRange* m_bucketFreeRange[32]{}; // we are only using 31 entries since the MSB is reserved (thus chunks equal or larger than 2^31 are not allowed) + bool m_allocationLimitReached = false; + MemoryPool<AllocRange> m_allocEntriesPool{64}; public: // statistics - uint32 numHeapBytes{}; // total size of the heap - uint32 numAllocatedBytes{}; + uint32 m_numHeapBytes{}; // total size of the heap + uint32 m_numAllocatedBytes{}; }; class VGenericHeap @@ -633,7 +641,7 @@ class ChunkedFlatAllocator uint32 getCurrentBlockOffset() const { return m_currentBlockOffset; } uint8* getCurrentBlockPtr() const { return m_currentBlockPtr; } - + private: void allocateAdditionalChunk() { From d59e17b105f65ba63deddae5affc8cc1f51d7a48 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Thu, 5 Dec 2024 12:33:21 +0100 Subject: [PATCH 3/4] Properly handle resource clean up --- src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp | 8 +++++--- src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp index 33af36515..3494dbc5e 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp @@ -334,10 +334,12 @@ VKRBuffer* VKRBuffer::Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMe VKRBuffer::~VKRBuffer() { - if(m_mappedMemory) + if (m_mappedMemory) vkUnmapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory); - vkFreeMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory, nullptr); - vkDestroyBuffer(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_buffer, nullptr); + if (m_bufferMemory != VK_NULL_HANDLE) + vkFreeMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory, nullptr); + if (m_buffer != VK_NULL_HANDLE) + vkDestroyBuffer(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_buffer, nullptr); } VkBufferChunkedHeap::~VkBufferChunkedHeap() diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index 201639875..589047914 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -679,6 +679,9 @@ VulkanRenderer::~VulkanRenderer() vkDestroyDebugUtilsMessengerEXT(m_instance, m_debugCallback, nullptr); } + // destroy memory manager + delete memoryManager; + // destroy instance, devices if (m_instance != VK_NULL_HANDLE) { @@ -690,9 +693,6 @@ VulkanRenderer::~VulkanRenderer() vkDestroyInstance(m_instance, nullptr); } - // destroy memory manager - delete memoryManager; - // crashes? 
//glslang::FinalizeProcess(); } From 04cad5677bf0e3cb7e7ce0a1d75e09899864faab Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Fri, 6 Dec 2024 20:53:52 +0100 Subject: [PATCH 4/4] Handle flush heuristics --- src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp b/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp index 167911b6a..a8f819019 100644 --- a/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp +++ b/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp @@ -141,6 +141,14 @@ class DrawPassContext void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx); +// called whenever the GPU runs out of commands or hits a wait condition (semaphores, HLE waits) +void LatteCP_signalEnterWait() +{ + // based on the assumption that games won't do a rugpull and swap out buffer data in the middle of an uninterrupted sequence of drawcalls, + // we only flush caches when the GPU goes idle or has to wait for any operation + LatteIndices_invalidateAll(); +} + /* * Read a U32 from the command buffer * If no data is available then wait in a busy loop @@ -466,6 +474,8 @@ LatteCMDPtr LatteCP_itWaitRegMem(LatteCMDPtr cmd, uint32 nWords) const uint32 GPU7_WAIT_MEM_OP_GREATER = 6; const uint32 GPU7_WAIT_MEM_OP_NEVER = 7; + LatteCP_signalEnterWait(); + bool stalls = false; if ((word0 & 0x10) != 0) { @@ -594,6 +604,7 @@ LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords) else if(SEM_SIGNAL == 7) { // wait + LatteCP_signalEnterWait(); size_t loopCount = 0; while (true) { @@ -1305,11 +1316,13 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx) } case IT_HLE_TRIGGER_SCANBUFFER_SWAP: { + LatteCP_signalEnterWait(); LatteCP_itHLESwapScanBuffer(cmdData, nWords); break; } case IT_HLE_WAIT_FOR_FLIP: { + LatteCP_signalEnterWait(); LatteCP_itHLEWaitForFlip(cmdData, nWords); break; } @@ -1594,12 +1607,14 @@ void LatteCP_ProcessRingbuffer() } case IT_HLE_TRIGGER_SCANBUFFER_SWAP: { + LatteCP_signalEnterWait(); LatteCP_itHLESwapScanBuffer(cmd, nWords); timerRecheck += CP_TIMER_RECHECK / 64; break; } case IT_HLE_WAIT_FOR_FLIP: { + LatteCP_signalEnterWait(); LatteCP_itHLEWaitForFlip(cmd, nWords); timerRecheck += CP_TIMER_RECHECK / 1; break;
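Taken together, patch 1 turns the previous single-entry index cache into a small LRU set keyed by pointer, count, primitive mode, and index type. The standalone sketch below distills just that replacement policy (a monotonic usage stamp refreshed on every hit, std::min_element for eviction); the entry count of 8 mirrors LatteIndices.cpp above, while the simplified Entry type and the LruCache/Lookup/EvictLeastRecentlyUsed names are illustrative only, not code from the patch.

#include <algorithm>
#include <array>
#include <cstdint>

struct Entry
{
	const void* key = nullptr; // nullptr marks an unused slot
	uint64_t lastUsed = 0;     // monotonic usage stamp, not a wall clock
};

struct LruCache
{
	std::array<Entry, 8> entries{};
	uint64_t usageCounter = 0;

	Entry* Lookup(const void* key)
	{
		auto it = std::find_if(entries.begin(), entries.end(),
			[&](const Entry& e) { return e.key == key; });
		if (it == entries.end())
			return nullptr;
		it->lastUsed = usageCounter++; // refresh stamp on every hit
		return &*it;
	}

	Entry* EvictLeastRecentlyUsed()
	{
		// unused slots keep stamp 0, so they are picked before any live entry
		auto it = std::min_element(entries.begin(), entries.end(),
			[](const Entry& a, const Entry& b) { return a.lastUsed < b.lastUsed; });
		// the caller releases whatever *it still owns before reusing the slot,
		// mirroring indexData_releaseIndexMemory() in the patch
		return &*it;
	}
};

Because the stamps only ever grow, eviction never needs a linked list or reordering on hit; with a fixed array of eight entries the linear std::find_if/std::min_element scans stay trivially cheap, which is presumably why the patch prefers this layout over a classic list-based LRU.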