From 13979d490f88c33c8d3bc98dff5a4bacbc93e374 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Sat, 23 Nov 2024 18:25:58 +0100 Subject: [PATCH 1/4] Latte/Vulkan: Add multiple entry LRU cache support for indices --- src/Cafe/HW/Latte/Core/LatteIndices.cpp | 114 ++++++++----- src/Cafe/HW/Latte/Core/LatteIndices.h | 2 +- src/Cafe/HW/Latte/Core/LatteOverlay.cpp | 6 + .../HW/Latte/Core/LattePerformanceMonitor.cpp | 3 +- .../HW/Latte/Core/LattePerformanceMonitor.h | 6 + src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp | 1 - .../HW/Latte/Renderer/OpenGL/OpenGLRenderer.h | 17 +- src/Cafe/HW/Latte/Renderer/Renderer.h | 11 +- .../Renderer/Vulkan/VKRMemoryManager.cpp | 147 +++++++++++++++-- .../Latte/Renderer/Vulkan/VKRMemoryManager.h | 154 +++++++++++++++--- .../Latte/Renderer/Vulkan/VulkanRenderer.cpp | 6 +- .../HW/Latte/Renderer/Vulkan/VulkanRenderer.h | 5 +- .../Renderer/Vulkan/VulkanRendererCore.cpp | 29 ++-- 13 files changed, 396 insertions(+), 105 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index 6e1d74559..aec51725f 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h" #include "Common/cpu_features.h" #if defined(ARCH_X86_64) && defined(__GNUC__) @@ -9,32 +10,53 @@ struct { - const void* lastPtr; - uint32 lastCount; - LattePrimitiveMode lastPrimitiveMode; - LatteIndexType lastIndexType; - // output - uint32 indexMin; - uint32 indexMax; - Renderer::INDEX_TYPE renderIndexType; - uint32 outputCount; - uint32 indexBufferOffset; - uint32 indexBufferIndex; + struct CacheEntry + { + // input data + const void* lastPtr; + uint32 lastCount; + LattePrimitiveMode lastPrimitiveMode; + LatteIndexType lastIndexType; + uint64 lastUsed; + // output + uint32 indexMin; + uint32 indexMax; + Renderer::INDEX_TYPE renderIndexType; + uint32 outputCount; + Renderer::IndexAllocation indexAllocation; + }; + std::array<CacheEntry, 8> entry; + uint64 currentUsageCounter{0}; }LatteIndexCache{}; void LatteIndices_invalidate(const void* memPtr, uint32 size) { - if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) ) + for(auto& entry : LatteIndexCache.entry) { - LatteIndexCache.lastPtr = nullptr; - LatteIndexCache.lastCount = 0; + if (entry.lastPtr >= memPtr && (entry.lastPtr < ((uint8*)memPtr + size)) ) + { + if(entry.lastPtr != nullptr) + g_renderer->indexData_releaseIndexMemory(entry.indexAllocation); + entry.lastPtr = nullptr; + entry.lastCount = 0; + } } } void LatteIndices_invalidateAll() { - LatteIndexCache.lastPtr = nullptr; - LatteIndexCache.lastCount = 0; + for(auto& entry : LatteIndexCache.entry) + { + if (entry.lastPtr != nullptr) + g_renderer->indexData_releaseIndexMemory(entry.indexAllocation); + entry.lastPtr = nullptr; + entry.lastCount = 0; + } +} + +uint64 LatteIndices_GetNextUsageIndex() +{ + return LatteIndexCache.currentUsageCounter++; } uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count) @@ -532,7 +554,7 @@ void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIn } } -void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& 
renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex) +void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation) { // what this should do: // [x] use fast SIMD-based index decoding @@ -542,17 +564,18 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 // [ ] better cache implementation, allow to cache across frames // reuse from cache if data didn't change - if (LatteIndexCache.lastPtr == indexData && - LatteIndexCache.lastCount == count && - LatteIndexCache.lastPrimitiveMode == primitiveMode && - LatteIndexCache.lastIndexType == indexType) - { - indexMin = LatteIndexCache.indexMin; - indexMax = LatteIndexCache.indexMax; - renderIndexType = LatteIndexCache.renderIndexType; - outputCount = LatteIndexCache.outputCount; - indexBufferOffset = LatteIndexCache.indexBufferOffset; - indexBufferIndex = LatteIndexCache.indexBufferIndex; + auto cacheEntry = std::find_if(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [indexData, count, primitiveMode, indexType](const auto& entry) + { + return entry.lastPtr == indexData && entry.lastCount == count && entry.lastPrimitiveMode == primitiveMode && entry.lastIndexType == indexType; + }); + if (cacheEntry != LatteIndexCache.entry.end()) + { + indexMin = cacheEntry->indexMin; + indexMax = cacheEntry->indexMax; + renderIndexType = cacheEntry->renderIndexType; + outputCount = cacheEntry->outputCount; + indexAllocation = cacheEntry->indexAllocation; + cacheEntry->lastUsed = LatteIndices_GetNextUsageIndex(); return; } @@ -576,10 +599,12 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 indexMin = 0; indexMax = std::max(count, 1u)-1; renderIndexType = Renderer::INDEX_TYPE::NONE; + indexAllocation = {}; return; // no indices } // query index buffer from renderer - void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex); + indexAllocation = g_renderer->indexData_reserveIndexMemory(indexOutputSize); + void* indexOutputPtr = indexAllocation.mem; // decode indices indexMin = std::numeric_limits<uint32>::max(); @@ -704,16 +729,25 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 // recalculate index range but filter out primitive restart index LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax); } - g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize); + g_renderer->indexData_uploadIndexMemory(indexAllocation); + performanceMonitor.cycle[performanceMonitor.cycleIndex].indexDataUploaded += indexOutputSize; + // get least recently used cache entry + auto lruEntry = std::min_element(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [](const auto& a, const auto& b) + { + return a.lastUsed < b.lastUsed; + }); + // invalidate previous allocation + if(lruEntry->lastPtr != nullptr) + g_renderer->indexData_releaseIndexMemory(lruEntry->indexAllocation); // update cache - LatteIndexCache.lastPtr = indexData; - LatteIndexCache.lastCount = count; - LatteIndexCache.lastPrimitiveMode = primitiveMode; - LatteIndexCache.lastIndexType = indexType; - LatteIndexCache.indexMin = indexMin; - LatteIndexCache.indexMax = indexMax; - LatteIndexCache.renderIndexType = renderIndexType; - LatteIndexCache.outputCount = 
outputCount; - LatteIndexCache.indexBufferOffset = indexBufferOffset; - LatteIndexCache.indexBufferIndex = indexBufferIndex; + lruEntry->lastPtr = indexData; + lruEntry->lastCount = count; + lruEntry->lastPrimitiveMode = primitiveMode; + lruEntry->lastIndexType = indexType; + lruEntry->indexMin = indexMin; + lruEntry->indexMax = indexMax; + lruEntry->renderIndexType = renderIndexType; + lruEntry->outputCount = outputCount; + lruEntry->indexAllocation = indexAllocation; + lruEntry->lastUsed = LatteIndices_GetNextUsageIndex(); } diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.h b/src/Cafe/HW/Latte/Core/LatteIndices.h index 917d7991b..8aace24e4 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.h +++ b/src/Cafe/HW/Latte/Core/LatteIndices.h @@ -4,4 +4,4 @@ void LatteIndices_invalidate(const void* memPtr, uint32 size); void LatteIndices_invalidateAll(); -void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex); \ No newline at end of file +void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation); \ No newline at end of file diff --git a/src/Cafe/HW/Latte/Core/LatteOverlay.cpp b/src/Cafe/HW/Latte/Core/LatteOverlay.cpp index 238f85e80..e6edb904d 100644 --- a/src/Cafe/HW/Latte/Core/LatteOverlay.cpp +++ b/src/Cafe/HW/Latte/Core/LatteOverlay.cpp @@ -107,7 +107,13 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio ImGui::Text("VRAM: %dMB / %dMB", g_state.vramUsage, g_state.vramTotal); if (config.overlay.debug) + { + // general debug info + ImGui::Text("--- Debug info ---"); + ImGui::Text("IndexUploadPerFrame: %dKB", (performanceMonitor.stats.indexDataUploadPerFrame+1023)/1024); + // backend specific info g_renderer->AppendOverlayDebugInfo(); + } position.y += (ImGui::GetWindowSize().y + 10.0f) * direction; } diff --git a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp index f27674464..14dfe9a97 100644 --- a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp +++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp @@ -74,7 +74,6 @@ void LattePerformanceMonitor_frameEnd() uniformBankDataUploadedPerFrame /= 1024ULL; uint32 uniformBankCountUploadedPerFrame = (uint32)(uniformBankUploadedCount / (uint64)elapsedFrames); uint64 indexDataUploadPerFrame = (indexDataUploaded / (uint64)elapsedFrames); - indexDataUploadPerFrame /= 1024ULL; double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS; uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames; @@ -82,7 +81,7 @@ void LattePerformanceMonitor_frameEnd() uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime); uint32 tlps = (uint32)((uint64)threadLeaveCount * 1000ULL / (uint64)totalElapsedTime); // set stats - + performanceMonitor.stats.indexDataUploadPerFrame = indexDataUploadPerFrame; // next counter cycle sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES; performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0; diff --git a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h index 713e094e0..7252e6734 
100644 --- a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h +++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h @@ -131,6 +131,12 @@ typedef struct LattePerfStatCounter numDrawBarriersPerFrame; LattePerfStatCounter numBeginRenderpassPerFrame; }vk; + + // calculated stats (per frame) + struct + { + uint32 indexDataUploadPerFrame; + }stats; }performanceMonitor_t; extern performanceMonitor_t performanceMonitor; diff --git a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp index 3bb6c7e30..2efef5bff 100644 --- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp +++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp @@ -11,7 +11,6 @@ #include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h" #include "Cafe/GraphicPack/GraphicPack2.h" #include "config/ActiveSettings.h" -#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" #include "gui/guiWrapper.h" #include "Cafe/OS/libs/erreula/erreula.h" #include "input/InputManager.h" diff --git a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h index 313ea3c0a..e29e9d4c4 100644 --- a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h @@ -102,16 +102,21 @@ class OpenGLRenderer : public Renderer static void SetAttributeArrayState(uint32 index, bool isEnabled, sint32 aluDivisor); static void SetArrayElementBuffer(GLuint arrayElementBuffer); - // index - void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override + // index (not used by OpenGL renderer yet) + IndexAllocation indexData_reserveIndexMemory(uint32 size) override { - assert_dbg(); - return nullptr; + cemu_assert_unimplemented(); + return {}; } - void indexData_uploadIndexMemory(uint32 offset, uint32 size) override + void indexData_releaseIndexMemory(IndexAllocation& allocation) override { - assert_dbg(); + cemu_assert_unimplemented(); + } + + void indexData_uploadIndexMemory(IndexAllocation& allocation) override + { + cemu_assert_unimplemented(); } // uniform diff --git a/src/Cafe/HW/Latte/Renderer/Renderer.h b/src/Cafe/HW/Latte/Renderer/Renderer.h index 0b694bb95..77d588b96 100644 --- a/src/Cafe/HW/Latte/Renderer/Renderer.h +++ b/src/Cafe/HW/Latte/Renderer/Renderer.h @@ -138,8 +138,15 @@ class Renderer virtual void draw_endSequence() = 0; // index - virtual void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) = 0; - virtual void indexData_uploadIndexMemory(uint32 offset, uint32 size) = 0; + struct IndexAllocation + { + void* mem; // pointer to index data inside buffer + void* rendererInternal; // for renderer use + }; + + virtual IndexAllocation indexData_reserveIndexMemory(uint32 size) = 0; + virtual void indexData_releaseIndexMemory(IndexAllocation& allocation) = 0; + virtual void indexData_uploadIndexMemory(IndexAllocation& allocation) = 0; // occlusion queries virtual LatteQueryObject* occlusionQuery_create() = 0; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp index c4f47a2bd..33af36515 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp @@ -23,11 +23,11 @@ void VKRSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeReq AllocatorBuffer_t newBuffer{}; newBuffer.writeIndex = 0; newBuffer.basePtr = nullptr; - if (m_bufferType == BUFFER_TYPE::STAGING) + if (m_bufferType == VKR_BUFFER_TYPE::STAGING) 
m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, newBuffer.vk_buffer, newBuffer.vk_mem); - else if (m_bufferType == BUFFER_TYPE::INDEX) + else if (m_bufferType == VKR_BUFFER_TYPE::INDEX) m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem); - else if (m_bufferType == BUFFER_TYPE::STRIDE) + else if (m_bufferType == VKR_BUFFER_TYPE::STRIDE) m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem); else cemu_assert_debug(false); @@ -53,7 +53,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato uint32 distanceToSyncPoint; if (!itr.queue_syncPoints.empty()) { - if(itr.queue_syncPoints.front().offset < itr.writeIndex) + if (itr.queue_syncPoints.front().offset < itr.writeIndex) distanceToSyncPoint = 0xFFFFFFFF; else distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex; @@ -100,7 +100,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato void VKRSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation) { - cemu_assert_debug(m_bufferType == BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent + cemu_assert_debug(m_bufferType == VKR_BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent // todo - use nonCoherentAtomSize for flush size (instead of hardcoded constant) VkMappedMemoryRange flushedRange{}; flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; @@ -167,6 +167,70 @@ void VKRSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBuf } } +/* VKRSynchronizedHeapAllocator */ + +VKRSynchronizedHeapAllocator::VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize) + : m_vkrMemMgr(vkMemoryManager), m_chunkedHeap(bufferType, minimumBufferAllocSize) {}; + +VKRSynchronizedHeapAllocator::AllocatorReservation* VKRSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment) +{ + CHAddr addr = m_chunkedHeap.alloc(size, alignment); + m_activeAllocations.emplace_back(addr); + AllocatorReservation* res = m_poolAllocatorReservation.allocObj(); + res->bufferIndex = addr.chunkIndex; + res->bufferOffset = addr.offset; + res->size = size; + res->memPtr = m_chunkedHeap.GetChunkPtr(addr.chunkIndex) + addr.offset; + m_chunkedHeap.GetChunkVkMemInfo(addr.chunkIndex, res->vkBuffer, res->vkMem); + return res; +} + +void VKRSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation) +{ + // put the allocation on a delayed release queue for the current command buffer + uint64 currentCommandBufferId = VulkanRenderer::GetInstance()->GetCurrentCommandBufferId(); + auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [&uploadReservation](const TrackedAllocation& allocation) { return allocation.allocation.chunkIndex == uploadReservation->bufferIndex && allocation.allocation.offset == uploadReservation->bufferOffset; }); + cemu_assert_debug(it != m_activeAllocations.end()); + m_releaseQueue[currentCommandBufferId].emplace_back(it->allocation); + m_activeAllocations.erase(it); + m_poolAllocatorReservation.freeObj(uploadReservation); +} + +void 
VKRSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation) +{ + if (m_chunkedHeap.RequiresFlush(uploadReservation->bufferIndex)) + { + VkMappedMemoryRange flushedRange{}; + flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + flushedRange.memory = uploadReservation->vkMem; + flushedRange.offset = uploadReservation->bufferOffset; + flushedRange.size = uploadReservation->size; + vkFlushMappedMemoryRanges(VulkanRenderer::GetInstance()->GetLogicalDevice(), 1, &flushedRange); + } +} + +void VKRSynchronizedHeapAllocator::CleanupBuffer(uint64 latestFinishedCommandBufferId) +{ + auto it = m_releaseQueue.begin(); + while (it != m_releaseQueue.end()) + { + if (it->first <= latestFinishedCommandBufferId) + { + // release allocations + for(auto& addr : it->second) + m_chunkedHeap.free(addr); + it = m_releaseQueue.erase(it); + continue; + } + it++; + } +} + +void VKRSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const +{ + m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize); +} + /* VkTextureChunkedHeap */ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA @@ -175,7 +239,7 @@ m_list_chunkInfo.resize(m_list_chunkInfo.size() + 1); // pad minimumAllocationSize to 32KB alignment - minimumAllocationSize = (minimumAllocationSize + (32*1024-1)) & ~(32 * 1024 - 1); + minimumAllocationSize = (minimumAllocationSize + (32 * 1024 - 1)) & ~(32 * 1024 - 1); uint32 allocationSize = 1024 * 1024 * 128; if (chunkIndex == 0) @@ -189,8 +253,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA std::vector<uint32> deviceLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); std::vector<uint32> hostLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, 0); // remove device local memory types from host local vector - auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) ->bool - { + auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) -> bool { return std::find(deviceLocalMemoryTypeIndices.begin(), deviceLocalMemoryTypeIndices.end(), v) != deviceLocalMemoryTypeIndices.end(); }; hostLocalMemoryTypeIndices.erase(std::remove_if(hostLocalMemoryTypeIndices.begin(), hostLocalMemoryTypeIndices.end(), pred), hostLocalMemoryTypeIndices.end()); @@ -206,7 +269,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA allocInfo.memoryTypeIndex = memType; VkDeviceMemory imageMemory; - VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory); + VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory); if (r != VK_SUCCESS) continue; m_list_chunkInfo[chunkIndex].mem = imageMemory; @@ -221,7 +284,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA allocInfo.memoryTypeIndex = memType; VkDeviceMemory imageMemory; - VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory); + VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory); if (r != VK_SUCCESS) continue; m_list_chunkInfo[chunkIndex].mem = imageMemory; @@ -238,6 +301,66 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA return 0; } +/* VkBufferChunkedHeap */ + +VKRBuffer* 
VKRBuffer::Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties) +{ + auto* memMgr = VulkanRenderer::GetInstance()->GetMemoryManager(); + VkBuffer buffer; + VkDeviceMemory bufferMemory; + bool allocSuccess; + if (bufferType == VKR_BUFFER_TYPE::STAGING) + allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, properties, buffer, bufferMemory); + else if (bufferType == VKR_BUFFER_TYPE::INDEX) + allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, properties, buffer, bufferMemory); + else if (bufferType == VKR_BUFFER_TYPE::STRIDE) + allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, properties, buffer, bufferMemory); + else + cemu_assert_debug(false); + if (!allocSuccess) + return nullptr; + + VKRBuffer* bufferObj = new VKRBuffer(buffer, bufferMemory); + // if host visible, then map buffer + void* data = nullptr; + if (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) + { + vkMapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), bufferMemory, 0, bufferSize, 0, &data); + bufferObj->m_requiresFlush = !HAS_FLAG(properties, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + } + bufferObj->m_mappedMemory = (uint8*)data; + return bufferObj; +} + +VKRBuffer::~VKRBuffer() +{ + if(m_mappedMemory) + vkUnmapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory); + vkFreeMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory, nullptr); + vkDestroyBuffer(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_buffer, nullptr); +} + +VkBufferChunkedHeap::~VkBufferChunkedHeap() +{ + for (auto& chunk : m_chunkBuffers) + delete chunk; +} + +uint32 VkBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) +{ + size_t allocationSize = std::max<size_t>(m_minimumBufferAllocationSize, minimumAllocationSize); + VKRBuffer* buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + if(!buffer) + buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + if(!buffer) + VulkanRenderer::GetInstance()->UnrecoverableError("Failed to allocate buffer memory for VkBufferChunkedHeap"); + cemu_assert_debug(buffer); + cemu_assert_debug(m_chunkBuffers.size() == chunkIndex); + m_chunkBuffers.emplace_back(buffer); + // todo - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT might be worth it? + return allocationSize; +} + uint32_t VKRMemoryManager::FindMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) const { VkPhysicalDeviceMemoryProperties memProperties; @@ -423,7 +546,7 @@ bool VKRMemoryManager::CreateBufferFromHostMemory(void* hostPointer, VkDeviceSiz importHostMem.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT; importHostMem.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT; importHostMem.pHostPointer = hostPointer; - // VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or + // VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or // VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT // whats the difference ? 
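For orientation, the caller-side lifecycle these pieces add up to looks roughly like the sketch below. This is a minimal illustration, not code from the patch: it assumes the Cemu integer typedefs (uint16/uint32), <cstring> for memcpy, and only the interfaces introduced in this diff (Renderer::IndexAllocation and the indexData_* virtuals); UploadIndices itself is a hypothetical helper.

// hypothetical helper, not part of the patch
void UploadIndices(Renderer* renderer, const uint16* srcIndices, uint32 count)
{
	// reserve space inside one of the heap-managed, host-visible index buffers
	Renderer::IndexAllocation alloc = renderer->indexData_reserveIndexMemory(count * sizeof(uint16));
	// write the decoded indices through the persistently mapped pointer
	memcpy(alloc.mem, srcIndices, count * sizeof(uint16));
	// flush the written range if the backing memory is not host-coherent (no-op otherwise)
	renderer->indexData_uploadIndexMemory(alloc);
	// the draw then binds the buffer via the AllocatorReservation kept in alloc.rendererInternal;
	// when the LRU cache later evicts or invalidates the entry, the allocation is queued
	// for release and only freed once the associated command buffer has finished on the GPU
	renderer->indexData_releaseIndexMemory(alloc);
}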
@@ -469,7 +592,7 @@ VkImageMemAllocation* VKRMemoryManager::imageMemoryAllocate(VkImage image) auto it = map_textureHeap.find(typeFilter); if (it == map_textureHeap.end()) { - texHeap = new VkTextureChunkedHeap(this, typeFilter, m_vkr->GetLogicalDevice()); + texHeap = new VkTextureChunkedHeap(this, typeFilter); map_textureHeap.emplace(typeFilter, texHeap); } else diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h index bf2d919b3..ecf539961 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h @@ -2,6 +2,36 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" #include "util/ChunkedHeap/ChunkedHeap.h" +#include "util/helpers/MemoryPool.h" + +enum class VKR_BUFFER_TYPE +{ + STAGING, // staging upload buffer + INDEX, // buffer for index data + STRIDE, // buffer for stride-adjusted vertex data +}; + +class VKRBuffer +{ + public: + static VKRBuffer* Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties); + ~VKRBuffer(); + + VkBuffer GetVkBuffer() const { return m_buffer; } + VkDeviceMemory GetVkBufferMemory() const { return m_bufferMemory; } + + uint8* GetPtr() const { return m_mappedMemory; } + + bool RequiresFlush() const { return m_requiresFlush; } + + private: + VKRBuffer(VkBuffer buffer, VkDeviceMemory bufferMem) : m_buffer(buffer), m_bufferMemory(bufferMem) { }; + + VkBuffer m_buffer; + VkDeviceMemory m_bufferMemory; + uint8* m_mappedMemory; + bool m_requiresFlush{false}; +}; struct VkImageMemAllocation { @@ -17,15 +47,13 @@ struct VkImageMemAllocation class VkTextureChunkedHeap : private ChunkedHeap { public: - VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter, VkDevice device) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter), m_device(device) { }; + VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { }; struct ChunkInfo { VkDeviceMemory mem; }; - uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override; - CHAddr allocMem(uint32 size, uint32 alignment) { if (alignment < 4) @@ -43,11 +71,6 @@ class VkTextureChunkedHeap : private ChunkedHeap this->free(addr); } - void setDevice(VkDevice dev) - { - m_device = dev; - } - VkDeviceMemory getChunkMem(uint32 index) { if (index >= m_list_chunkInfo.size()) @@ -61,24 +84,69 @@ class VkTextureChunkedHeap : private ChunkedHeap allocatedBytes = numAllocatedBytes; } - VkDevice m_device; + private: + uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override; + uint32 m_typeFilter{ 0xFFFFFFFF }; class VKRMemoryManager* m_vkrMemoryManager; std::vector<ChunkInfo> m_list_chunkInfo; }; +class VkBufferChunkedHeap : private ChunkedHeap +{ + public: + VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { }; + ~VkBufferChunkedHeap(); + + using ChunkedHeap::alloc; + using ChunkedHeap::free; + + uint8* GetChunkPtr(uint32 index) const + { + if (index >= m_chunkBuffers.size()) + return nullptr; + return m_chunkBuffers[index]->GetPtr(); + } + + void GetChunkVkMemInfo(uint32 index, VkBuffer& buffer, VkDeviceMemory& mem) + { + if (index >= m_chunkBuffers.size()) + { + buffer = VK_NULL_HANDLE; + mem = VK_NULL_HANDLE; + return; + } + buffer = 
m_chunkBuffers[index]->GetVkBuffer(); + mem = m_chunkBuffers[index]->GetVkBufferMemory(); + } + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const + { + numBuffers = m_chunkBuffers.size(); + totalBufferSize = numHeapBytes; + freeBufferSize = numHeapBytes - numAllocatedBytes; + } + + bool RequiresFlush(uint32 index) const + { + if (index >= m_chunkBuffers.size()) + return false; + return m_chunkBuffers[index]->RequiresFlush(); + } + + private: + uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override; + + VKR_BUFFER_TYPE m_bufferType; + std::vector<VKRBuffer*> m_chunkBuffers; + size_t m_minimumBufferAllocationSize; +}; + // a circular ring-buffer which tracks and releases memory per command-buffer class VKRSynchronizedRingAllocator { public: - enum class BUFFER_TYPE - { - STAGING, // staging upload buffer - INDEX, // buffer for index data - STRIDE, // buffer for stride-adjusted vertex data - }; - - VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {}; + VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {}; VKRSynchronizedRingAllocator(const VKRSynchronizedRingAllocator&) = delete; // disallow copy struct BufferSyncPoint_t @@ -126,13 +194,53 @@ class VKRSynchronizedRingAllocator const class VulkanRenderer* m_vkr; const class VKRMemoryManager* m_vkrMemMgr; - const BUFFER_TYPE m_bufferType; + const VKR_BUFFER_TYPE m_bufferType; const uint32 m_minimumBufferAllocSize; std::vector<AllocatorBuffer_t> m_buffers; }; +// heap style allocator with released memory being freed after the current command buffer finishes +class VKRSynchronizedHeapAllocator +{ + struct TrackedAllocation + { + TrackedAllocation(CHAddr allocation) : allocation(allocation) {}; + CHAddr allocation; + }; + + public: + VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize); + VKRSynchronizedHeapAllocator(const VKRSynchronizedHeapAllocator&) = delete; // disallow copy + + struct AllocatorReservation + { + VkBuffer vkBuffer; + VkDeviceMemory vkMem; + uint8* memPtr; + uint32 bufferOffset; + uint32 size; + uint32 bufferIndex; + }; + + AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment); + void FreeReservation(AllocatorReservation* uploadReservation); + void FlushReservation(AllocatorReservation* uploadReservation); + + void CleanupBuffer(uint64 latestFinishedCommandBufferId); + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const; + private: + const class VKRMemoryManager* m_vkrMemMgr; + VkBufferChunkedHeap m_chunkedHeap; + // allocations + std::vector<TrackedAllocation> m_activeAllocations; + MemoryPool<AllocatorReservation> m_poolAllocatorReservation{32}; + // release queue + std::unordered_map<uint64, std::vector<CHAddr>> m_releaseQueue; +}; + void LatteIndices_invalidateAll(); class VKRMemoryManager { friend class VKRSynchronizedRingAllocator; public: VKRMemoryManager(class VulkanRenderer* renderer) : - m_stagingBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STAGING, 32u * 1024 * 1024), - 
m_indexBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::INDEX, 4u * 1024 * 1024), - m_vertexStrideMetalBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STRIDE, 4u * 1024 * 1024) + m_stagingBuffer(renderer, this, VKR_BUFFER_TYPE::STAGING, 32u * 1024 * 1024), + m_indexBuffer(this, VKR_BUFFER_TYPE::INDEX, 4u * 1024 * 1024), + m_vertexStrideMetalBuffer(renderer, this, VKR_BUFFER_TYPE::STRIDE, 4u * 1024 * 1024) { m_vkr = renderer; } @@ -167,7 +275,7 @@ class VKRMemoryManager } VKRSynchronizedRingAllocator& getStagingAllocator() { return m_stagingBuffer; }; // allocator for texture/attribute/uniform uploads - VKRSynchronizedRingAllocator& getIndexAllocator() { return m_indexBuffer; }; // allocator for index data + VKRSynchronizedHeapAllocator& GetIndexAllocator() { return m_indexBuffer; }; // allocator for index data VKRSynchronizedRingAllocator& getMetalStrideWorkaroundAllocator() { return m_vertexStrideMetalBuffer; }; // allocator for stride-adjusted vertex data void cleanupBuffers(uint64 latestFinishedCommandBufferId) @@ -202,6 +310,6 @@ class VKRMemoryManager private: class VulkanRenderer* m_vkr; VKRSynchronizedRingAllocator m_stagingBuffer; - VKRSynchronizedRingAllocator m_indexBuffer; + VKRSynchronizedHeapAllocator m_indexBuffer; VKRSynchronizedRingAllocator m_vertexStrideMetalBuffer; }; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index 37432eebe..201639875 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -3699,7 +3699,7 @@ void VulkanRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uin void VulkanRenderer::AppendOverlayDebugInfo() { - ImGui::Text("--- Vulkan info ---"); + ImGui::Text("--- Vulkan debug info ---"); ImGui::Text("GfxPipelines %u", performanceMonitor.vk.numGraphicPipelines.get()); ImGui::Text("DescriptorSets %u", performanceMonitor.vk.numDescriptorSets.get()); ImGui::Text("DS ImgSamplers %u", performanceMonitor.vk.numDescriptorSamplerTextures.get()); @@ -3716,7 +3716,7 @@ void VulkanRenderer::AppendOverlayDebugInfo() ImGui::Text("BeginRP/f %u", performanceMonitor.vk.numBeginRenderpassPerFrame.get()); ImGui::Text("Barriers/f %u", performanceMonitor.vk.numDrawBarriersPerFrame.get()); - ImGui::Text("--- Cache info ---"); + ImGui::Text("--- Cache debug info ---"); uint32 bufferCacheHeapSize = 0; uint32 bufferCacheAllocationSize = 0; @@ -3736,7 +3736,7 @@ void VulkanRenderer::AppendOverlayDebugInfo() ImGui::SameLine(60.0f); ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers); - memoryManager->getIndexAllocator().GetStats(numBuffers, totalSize, freeSize); + memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize); ImGui::Text("Index"); ImGui::SameLine(60.0f); ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers); diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h index 52c1c6ed2..5ef4558da 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h @@ -328,8 +328,9 @@ class VulkanRenderer : public Renderer RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool 
isGfxPackShader) override; - void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override; - void indexData_uploadIndexMemory(uint32 offset, uint32 size) override; + IndexAllocation indexData_reserveIndexMemory(uint32 size) override; + void indexData_releaseIndexMemory(IndexAllocation& allocation) override; + void indexData_uploadIndexMemory(IndexAllocation& allocation) override; // externally callable void GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, sint32 width, sint32 height, FormatInfoVK* formatInfoOut); diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp index 3a6840728..a72b093ba 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp @@ -357,18 +357,20 @@ PipelineInfo* VulkanRenderer::draw_getOrCreateGraphicsPipeline(uint32 indexCount return draw_createGraphicsPipeline(indexCount); } -void* VulkanRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) +Renderer::IndexAllocation VulkanRenderer::indexData_reserveIndexMemory(uint32 size) { - auto& indexAllocator = this->memoryManager->getIndexAllocator(); - auto resv = indexAllocator.AllocateBufferMemory(size, 32); - offset = resv.bufferOffset; - bufferIndex = resv.bufferIndex; - return resv.memPtr; + VKRSynchronizedHeapAllocator::AllocatorReservation* resv = memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 32); + return { resv->memPtr, resv }; } -void VulkanRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) +void VulkanRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation) { - // does nothing since the index buffer memory is coherent + memoryManager->GetIndexAllocator().FreeReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal); +} + +void VulkanRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation) +{ + memoryManager->GetIndexAllocator().FlushReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal); } float s_vkUniformData[512 * 4]; @@ -1415,14 +1417,15 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 hostIndexCount; uint32 indexMin = 0; uint32 indexMax = 0; - uint32 indexBufferOffset = 0; - uint32 indexBufferIndex = 0; - LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); - + Renderer::IndexAllocation indexAllocation; + LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation); + VKRSynchronizedHeapAllocator::AllocatorReservation* indexReservation = (VKRSynchronizedHeapAllocator::AllocatorReservation*)indexAllocation.rendererInternal; // update index binding bool isPrevIndexData = false; if (hostIndexType != INDEX_TYPE::NONE) { + uint32 indexBufferIndex = indexReservation->bufferIndex; + uint32 indexBufferOffset = indexReservation->bufferOffset; if (m_state.activeIndexBufferOffset != indexBufferOffset || m_state.activeIndexBufferIndex != indexBufferIndex || m_state.activeIndexType != hostIndexType) { m_state.activeIndexType = hostIndexType; @@ -1435,7 +1438,7 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 vkType = 
VK_INDEX_TYPE_UINT32; else cemu_assert(false); - vkCmdBindIndexBuffer(m_state.currentCommandBuffer, memoryManager->getIndexAllocator().GetBufferByIndex(indexBufferIndex), indexBufferOffset, vkType); + vkCmdBindIndexBuffer(m_state.currentCommandBuffer, indexReservation->vkBuffer, indexBufferOffset, vkType); } else isPrevIndexData = true; From e97493b2a1687b66cd283eddec375d459118e6fd Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Thu, 5 Dec 2024 12:17:18 +0100 Subject: [PATCH 2/4] Optimize ChunkedHeap --- .../Latte/Renderer/Vulkan/VKRMemoryManager.h | 12 +- src/Common/precompiled.h | 19 ++ src/util/ChunkedHeap/ChunkedHeap.h | 170 +++++++++--------- 3 files changed, 114 insertions(+), 87 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h index ecf539961..08af5882d 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h @@ -44,7 +44,7 @@ struct VkImageMemAllocation uint32 getAllocationSize() { return allocationSize; } }; -class VkTextureChunkedHeap : private ChunkedHeap +class VkTextureChunkedHeap : private ChunkedHeap<> { public: VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { }; @@ -80,8 +80,8 @@ class VkTextureChunkedHeap : private ChunkedHeap void getStatistics(uint32& totalHeapSize, uint32& allocatedBytes) const { - totalHeapSize = numHeapBytes; - allocatedBytes = numAllocatedBytes; + totalHeapSize = m_numHeapBytes; + allocatedBytes = m_numAllocatedBytes; } private: @@ -92,7 +92,7 @@ class VkTextureChunkedHeap : private ChunkedHeap std::vector m_list_chunkInfo; }; -class VkBufferChunkedHeap : private ChunkedHeap +class VkBufferChunkedHeap : private ChunkedHeap<> { public: VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { }; @@ -123,8 +123,8 @@ class VkBufferChunkedHeap : private ChunkedHeap void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const { numBuffers = m_chunkBuffers.size(); - totalBufferSize = numHeapBytes; - freeBufferSize = numHeapBytes - numAllocatedBytes; + totalBufferSize = m_numHeapBytes; + freeBufferSize = m_numHeapBytes - m_numAllocatedBytes; } bool RequiresFlush(uint32 index) const diff --git a/src/Common/precompiled.h b/src/Common/precompiled.h index d4df43437..3dfeaf74f 100644 --- a/src/Common/precompiled.h +++ b/src/Common/precompiled.h @@ -274,6 +274,25 @@ inline uint64 _udiv128(uint64 highDividend, uint64 lowDividend, uint64 divisor, #define NOEXPORT __attribute__ ((visibility ("hidden"))) #endif +#if defined(_MSC_VER) +#define FORCE_INLINE __forceinline +#elif defined(__GNUC__) || defined(__clang__) +#define FORCE_INLINE inline __attribute__((always_inline)) +#else +#define FORCE_INLINE +#endif + +FORCE_INLINE inline int BSF(uint32 v) // returns index of first bit set, counting from LSB. If v is 0 then result is undefined +{ +#if defined(_MSC_VER) + return _tzcnt_u32(v); // TZCNT requires BMI1. 
But if not supported it will execute as BSF +#elif defined(__GNUC__) || defined(__clang__) + return __builtin_ctz(v); +#else + return std::countr_zero(v); +#endif +} + // On aarch64 we handle some of the x86 intrinsics by implementing them as wrappers #if defined(__aarch64__) diff --git a/src/util/ChunkedHeap/ChunkedHeap.h b/src/util/ChunkedHeap/ChunkedHeap.h index abc454293..21a1b868e 100644 --- a/src/util/ChunkedHeap/ChunkedHeap.h +++ b/src/util/ChunkedHeap/ChunkedHeap.h @@ -1,35 +1,39 @@ #pragma once +#include <bit> + struct CHAddr { uint32 offset; uint32 chunkIndex; + void* internal; // AllocRange - CHAddr(uint32 _offset, uint32 _chunkIndex) : offset(_offset), chunkIndex(_chunkIndex) {}; + CHAddr(uint32 _offset, uint32 _chunkIndex, void* internal = nullptr) : offset(_offset), chunkIndex(_chunkIndex), internal(internal) {}; CHAddr() : offset(0xFFFFFFFF), chunkIndex(0xFFFFFFFF) {}; bool isValid() { return chunkIndex != 0xFFFFFFFF; }; static CHAddr getInvalid() { return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); }; }; +template<uint32 TMinimumAlignment = 32> class ChunkedHeap { - struct allocRange_t + struct AllocRange { - allocRange_t* nextFree{}; - allocRange_t* prevFree{}; - allocRange_t* prevOrdered{}; - allocRange_t* nextOrdered{}; + AllocRange* nextFree{}; + AllocRange* prevFree{}; + AllocRange* prevOrdered{}; + AllocRange* nextOrdered{}; uint32 offset; uint32 chunkIndex; uint32 size; bool isFree; - allocRange_t(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {}; + AllocRange(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {}; }; - struct chunk_t + struct Chunk { - std::unordered_map<uint32, allocRange_t*> map_allocatedRange; + uint32 size; }; public: @@ -47,45 +51,32 @@ class ChunkedHeap _free(addr); } - virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) - { - return 0; - } + virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) = 0; private: unsigned ulog2(uint32 v) { - static const unsigned MUL_DE_BRUIJN_BIT[] = - { - 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, - 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 - }; - - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - - return MUL_DE_BRUIJN_BIT[(v * 0x07C4ACDDu) >> 27]; + cemu_assert_debug(v != 0); + return 31 - std::countl_zero(v); } - void trackFreeRange(allocRange_t* range) + void trackFreeRange(AllocRange* range) { // get index of msb cemu_assert_debug(range->size != 0); // size of zero is not allowed uint32 bucketIndex = ulog2(range->size); - range->nextFree = bucketFreeRange[bucketIndex]; - if (bucketFreeRange[bucketIndex]) - bucketFreeRange[bucketIndex]->prevFree = range; + range->nextFree = m_bucketFreeRange[bucketIndex]; + if (m_bucketFreeRange[bucketIndex]) + m_bucketFreeRange[bucketIndex]->prevFree = range; range->prevFree = nullptr; - bucketFreeRange[bucketIndex] = range; + m_bucketFreeRange[bucketIndex] = range; + m_bucketUseMask |= (1u << bucketIndex); } - void forgetFreeRange(allocRange_t* range, uint32 bucketIndex) + void forgetFreeRange(AllocRange* range, uint32 bucketIndex) { - allocRange_t* prevRange = range->prevFree; - allocRange_t* nextRange = range->nextFree; + AllocRange* prevRange = range->prevFree; + AllocRange* nextRange = range->nextFree; if (prevRange) { prevRange->nextFree = nextRange; @@ -94,36 +85,42 @@ class ChunkedHeap } else { - if 
(bucketFreeRange[bucketIndex] != range) - assert_dbg(); - bucketFreeRange[bucketIndex] = nextRange; + cemu_assert_debug(m_bucketFreeRange[bucketIndex] == range); + m_bucketFreeRange[bucketIndex] = nextRange; if (nextRange) nextRange->prevFree = nullptr; + else + m_bucketUseMask &= ~(1u << bucketIndex); } } bool allocateChunk(uint32 minimumAllocationSize) { - uint32 chunkIndex = (uint32)list_chunks.size(); - list_chunks.emplace_back(new chunk_t()); + uint32 chunkIndex = (uint32)m_chunks.size(); + m_chunks.emplace_back(); uint32 chunkSize = allocateNewChunk(chunkIndex, minimumAllocationSize); + cemu_assert_debug((chunkSize%TMinimumAlignment) == 0); // chunk size should be a multiple of the minimum alignment if (chunkSize == 0) return false; - allocRange_t* range = new allocRange_t(0, chunkIndex, chunkSize, true); + cemu_assert_debug(chunkSize < 0x80000000u); // chunk size must be below 2GB + AllocRange* range = m_allocEntriesPool.allocObj(0, chunkIndex, chunkSize, true); trackFreeRange(range); - numHeapBytes += chunkSize; + m_numHeapBytes += chunkSize; return true; } - void _allocFrom(allocRange_t* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize) + void _allocFrom(AllocRange* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize) { + cemu_assert_debug(allocSize > 0); // remove the range from the chain of free ranges forgetFreeRange(range, bucketIndex); // split head, allocation and tail into separate ranges - if (allocOffset > range->offset) + uint32 headBytes = allocOffset - range->offset; + if (headBytes > 0) { // alignment padding -> create free range - allocRange_t* head = new allocRange_t(range->offset, range->chunkIndex, allocOffset - range->offset, true); + cemu_assert_debug(headBytes >= TMinimumAlignment); + AllocRange* head = m_allocEntriesPool.allocObj(range->offset, range->chunkIndex, headBytes, true); trackFreeRange(head); if (range->prevOrdered) range->prevOrdered->nextOrdered = head; @@ -131,10 +128,12 @@ class ChunkedHeap head->nextOrdered = range; range->prevOrdered = head; } - if ((allocOffset + allocSize) < (range->offset + range->size)) // todo - create only if it's more than a couple of bytes? 
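// Worked example for the split above (hypothetical numbers, not from the patch):
// a free range covers [0,256) and _allocFrom is asked for size=64 at alignment=128,
// with TMinimumAlignment assumed to be 32:
//   alignedOffset = (0 + 127) & ~127 = 128
//   headBytes = 128 - 0   -> alignment padding [0,128) becomes a new free range
//   allocation            -> [128,192), the original range marked !isFree
//   tailBytes = 256 - 192 -> leftover [192,256) becomes a new free range
// Both fragments are re-inserted into their ulog2(size) bucket, so later
// allocations can reuse them without scanning the whole heap.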
+ uint32 tailBytes = (range->offset + range->size) - (allocOffset + allocSize); + if (tailBytes > 0) { // tail -> create free range - allocRange_t* tail = new allocRange_t((allocOffset + allocSize), range->chunkIndex, (range->offset + range->size) - (allocOffset + allocSize), true); + cemu_assert_debug(tailBytes >= TMinimumAlignment); + AllocRange* tail = m_allocEntriesPool.allocObj((allocOffset + allocSize), range->chunkIndex, tailBytes, true); trackFreeRange(tail); if (range->nextOrdered) range->nextOrdered->prevOrdered = tail; @@ -149,36 +148,51 @@ class ChunkedHeap CHAddr _alloc(uint32 size, uint32 alignment) { + cemu_assert_debug(size <= (0x7FFFFFFFu-TMinimumAlignment)); + // make sure size is not zero and align it + if(size == 0) [[unlikely]] + size = TMinimumAlignment; + else + size = (size + (TMinimumAlignment - 1)) & ~(TMinimumAlignment - 1); // find smallest bucket to scan uint32 alignmentM1 = alignment - 1; uint32 bucketIndex = ulog2(size); - while (bucketIndex < 32) + // check if the bucket is available + if( !(m_bucketUseMask & (1u << bucketIndex)) ) { - allocRange_t* range = bucketFreeRange[bucketIndex]; + // skip to next non-empty bucket + uint32 nextIndex = BSF(m_bucketUseMask>>bucketIndex); + bucketIndex += nextIndex; + } + while (bucketIndex < 31) + { + AllocRange* range = m_bucketFreeRange[bucketIndex]; while (range) { if (range->size >= size) { // verify if aligned allocation fits uint32 alignedOffset = (range->offset + alignmentM1) & ~alignmentM1; - uint32 alignmentLoss = alignedOffset - range->offset; - if (alignmentLoss < range->size && (range->size - alignmentLoss) >= size) + uint32 endOffset = alignedOffset + size; + if((range->offset+range->size) >= endOffset) { _allocFrom(range, bucketIndex, alignedOffset, size); - list_chunks[range->chunkIndex]->map_allocatedRange.emplace(alignedOffset, range); - numAllocatedBytes += size; - return CHAddr(alignedOffset, range->chunkIndex); + m_numAllocatedBytes += size; + return CHAddr(alignedOffset, range->chunkIndex, range); } } range = range->nextFree; } - bucketIndex++; // try higher bucket + // check next non-empty bucket or skip to end + bucketIndex++; + uint32 emptyBuckets = BSF(m_bucketUseMask>>bucketIndex); + bucketIndex += emptyBuckets; } - if(allocationLimitReached) + if(m_allocationLimitReached) return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); if (!allocateChunk(size)) { - allocationLimitReached = true; + m_allocationLimitReached = true; return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); } return _alloc(size, alignment); @@ -186,24 +200,16 @@ class ChunkedHeap void _free(CHAddr addr) { - auto it = list_chunks[addr.chunkIndex]->map_allocatedRange.find(addr.offset); - if (it == list_chunks[addr.chunkIndex]->map_allocatedRange.end()) + if(!addr.internal) { cemuLog_log(LogType::Force, "Internal heap error. 
{:08x} {:08x}", addr.chunkIndex, addr.offset); - cemuLog_log(LogType::Force, "Debug info:"); - for (auto& rangeItr : list_chunks[addr.chunkIndex]->map_allocatedRange) - { - cemuLog_log(LogType::Force, "{:08x} {:08x}", rangeItr.second->offset, rangeItr.second->size); - } return; } - - allocRange_t* range = it->second; - numAllocatedBytes -= it->second->size; - list_chunks[range->chunkIndex]->map_allocatedRange.erase(it); + AllocRange* range = (AllocRange*)addr.internal; + m_numAllocatedBytes -= range->size; // try merge left or right - allocRange_t* prevRange = range->prevOrdered; - allocRange_t* nextRange = range->nextOrdered; + AllocRange* prevRange = range->prevOrdered; + AllocRange* nextRange = range->nextOrdered; if (prevRange && prevRange->isFree) { if (nextRange && nextRange->isFree) @@ -216,8 +222,8 @@ class ChunkedHeap forgetFreeRange(prevRange, ulog2(prevRange->size)); prevRange->size = newSize; trackFreeRange(prevRange); - delete range; - delete nextRange; + m_allocEntriesPool.freeObj(range); + m_allocEntriesPool.freeObj(nextRange); } else { @@ -228,7 +234,7 @@ class ChunkedHeap forgetFreeRange(prevRange, ulog2(prevRange->size)); prevRange->size = newSize; trackFreeRange(prevRange); - delete range; + m_allocEntriesPool.freeObj(range); } } else if (nextRange && nextRange->isFree) @@ -242,7 +248,7 @@ class ChunkedHeap range->prevOrdered->nextOrdered = nextRange; nextRange->prevOrdered = range->prevOrdered; trackFreeRange(nextRange); - delete range; + m_allocEntriesPool.freeObj(range); } else { @@ -265,7 +271,7 @@ class ChunkedHeap for (uint32 i = 0; i < 32; i++) { - allocRange_t* ar = bucketFreeRange[i]; + AllocRange* ar = m_bucketFreeRange[i]; while (ar) { availableRange_t dbgRange; @@ -278,7 +284,7 @@ class ChunkedHeap if (itr.chunkIndex != dbgRange.chunkIndex) continue; if (itr.offset < (dbgRange.offset + dbgRange.size) && (itr.offset + itr.size) >(dbgRange.offset)) - assert_dbg(); + cemu_assert_error(); } availRanges.emplace_back(dbgRange); @@ -290,14 +296,16 @@ class ChunkedHeap } private: - std::vector list_chunks; - allocRange_t* bucketFreeRange[32]{}; - bool allocationLimitReached = false; + std::vector m_chunks; + uint32 m_bucketUseMask{0x80000000}; // bitmask indicating non-empty buckets. 
MSB always set to provide an upper bound for BSF instruction + AllocRange* m_bucketFreeRange[32]{}; // we are only using 31 entries since the MSB is reserved (thus chunks equal or larger than 2^31 are not allowed) + bool m_allocationLimitReached = false; + MemoryPool<AllocRange> m_allocEntriesPool{64}; public: // statistics - uint32 numHeapBytes{}; // total size of the heap - uint32 numAllocatedBytes{}; + uint32 m_numHeapBytes{}; // total size of the heap + uint32 m_numAllocatedBytes{}; }; class VGenericHeap @@ -633,7 +641,7 @@ class ChunkedFlatAllocator uint32 getCurrentBlockOffset() const { return m_currentBlockOffset; } uint8* getCurrentBlockPtr() const { return m_currentBlockPtr; } - + private: void allocateAdditionalChunk() { From d59e17b105f65ba63deddae5affc8cc1f51d7a48 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Thu, 5 Dec 2024 12:33:21 +0100 Subject: [PATCH 3/4] Properly handle resource clean up --- src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp | 8 +++++--- src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp index 33af36515..3494dbc5e 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp @@ -334,10 +334,12 @@ VKRBuffer* VKRBuffer::Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMe VKRBuffer::~VKRBuffer() { - if(m_mappedMemory) + if (m_mappedMemory) vkUnmapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory); - vkFreeMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory, nullptr); - vkDestroyBuffer(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_buffer, nullptr); + if (m_bufferMemory != VK_NULL_HANDLE) + vkFreeMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory, nullptr); + if (m_buffer != VK_NULL_HANDLE) + vkDestroyBuffer(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_buffer, nullptr); } VkBufferChunkedHeap::~VkBufferChunkedHeap() diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index 201639875..589047914 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -679,6 +679,9 @@ VulkanRenderer::~VulkanRenderer() vkDestroyDebugUtilsMessengerEXT(m_instance, m_debugCallback, nullptr); } + // destroy memory manager + delete memoryManager; + // destroy instance, devices if (m_instance != VK_NULL_HANDLE) { @@ -690,9 +693,6 @@ VulkanRenderer::~VulkanRenderer() vkDestroyInstance(m_instance, nullptr); } - // destroy memory manager - delete memoryManager; - // crashes? 
//glslang::FinalizeProcess(); } From 04cad5677bf0e3cb7e7ce0a1d75e09899864faab Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Fri, 6 Dec 2024 20:53:52 +0100 Subject: [PATCH 4/4] Handle flush heuristics --- src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp b/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp index 167911b6a..a8f819019 100644 --- a/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp +++ b/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp @@ -141,6 +141,14 @@ class DrawPassContext void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx); +// called whenever the GPU runs out of commands or hits a wait condition (semaphores, HLE waits) +void LatteCP_signalEnterWait() +{ + // based on the assumption that games won't do a rugpull and swap out buffer data in the middle of an uninterrupted sequence of drawcalls, + // we only flush caches when the GPU goes idle or has to wait for any operation + LatteIndices_invalidateAll(); +} + /* * Read a U32 from the command buffer * If no data is available then wait in a busy loop @@ -466,6 +474,8 @@ LatteCMDPtr LatteCP_itWaitRegMem(LatteCMDPtr cmd, uint32 nWords) const uint32 GPU7_WAIT_MEM_OP_GREATER = 6; const uint32 GPU7_WAIT_MEM_OP_NEVER = 7; + LatteCP_signalEnterWait(); + bool stalls = false; if ((word0 & 0x10) != 0) { @@ -594,6 +604,7 @@ LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords) else if(SEM_SIGNAL == 7) { // wait + LatteCP_signalEnterWait(); size_t loopCount = 0; while (true) { @@ -1305,11 +1316,13 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx) } case IT_HLE_TRIGGER_SCANBUFFER_SWAP: { + LatteCP_signalEnterWait(); LatteCP_itHLESwapScanBuffer(cmdData, nWords); break; } case IT_HLE_WAIT_FOR_FLIP: { + LatteCP_signalEnterWait(); LatteCP_itHLEWaitForFlip(cmdData, nWords); break; } @@ -1594,12 +1607,14 @@ void LatteCP_ProcessRingbuffer() } case IT_HLE_TRIGGER_SCANBUFFER_SWAP: { + LatteCP_signalEnterWait(); LatteCP_itHLESwapScanBuffer(cmd, nWords); timerRecheck += CP_TIMER_RECHECK / 64; break; } case IT_HLE_WAIT_FOR_FLIP: { + LatteCP_signalEnterWait(); LatteCP_itHLEWaitForFlip(cmd, nWords); timerRecheck += CP_TIMER_RECHECK / 1; break;
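Taken together, patch 1 turns the previous single-entry index cache into a small LRU set keyed by pointer, count, primitive mode, and index type. The standalone sketch below distills just that replacement policy (a monotonic usage stamp refreshed on every hit, std::min_element for eviction); the entry count of 8 mirrors LatteIndices.cpp above, while the simplified Entry type and the LruCache/Lookup/EvictLeastRecentlyUsed names are illustrative only, not code from the patch.

#include <algorithm>
#include <array>
#include <cstdint>

struct Entry
{
	const void* key = nullptr; // nullptr marks an unused slot
	uint64_t lastUsed = 0;     // monotonic usage stamp, not a wall clock
};

struct LruCache
{
	std::array<Entry, 8> entries{};
	uint64_t usageCounter = 0;

	Entry* Lookup(const void* key)
	{
		auto it = std::find_if(entries.begin(), entries.end(),
			[&](const Entry& e) { return e.key == key; });
		if (it == entries.end())
			return nullptr;
		it->lastUsed = usageCounter++; // refresh stamp on every hit
		return &*it;
	}

	Entry* EvictLeastRecentlyUsed()
	{
		// unused slots keep stamp 0, so they are picked before any live entry
		auto it = std::min_element(entries.begin(), entries.end(),
			[](const Entry& a, const Entry& b) { return a.lastUsed < b.lastUsed; });
		// the caller releases whatever *it still owns before reusing the slot,
		// mirroring indexData_releaseIndexMemory() in the patch
		return &*it;
	}
};

Because the stamps only ever grow, eviction never needs a linked list or reordering on hit; with a fixed array of eight entries the linear std::find_if/std::min_element scans stay trivially cheap, which is presumably why the patch prefers this layout over a classic list-based LRU.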