Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement better index caching in Latte #1443

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,14 @@ class DrawPassContext

void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx);

// Called whenever the GPU runs out of commands or hits a wait condition (semaphores, HLE waits).
// Drops every cached index decode result by calling LatteIndices_invalidateAll().
void LatteCP_signalEnterWait()
{
// Based on the assumption that games won't swap out buffer data in the middle of an uninterrupted
// sequence of drawcalls, caches are only flushed when the GPU goes idle or has to wait for an operation.
LatteIndices_invalidateAll();
}

/*
* Read a U32 from the command buffer
* If no data is available then wait in a busy loop
Expand Down Expand Up @@ -466,6 +474,8 @@ LatteCMDPtr LatteCP_itWaitRegMem(LatteCMDPtr cmd, uint32 nWords)
const uint32 GPU7_WAIT_MEM_OP_GREATER = 6;
const uint32 GPU7_WAIT_MEM_OP_NEVER = 7;

LatteCP_signalEnterWait();

bool stalls = false;
if ((word0 & 0x10) != 0)
{
Expand Down Expand Up @@ -594,6 +604,7 @@ LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords)
else if(SEM_SIGNAL == 7)
{
// wait
LatteCP_signalEnterWait();
size_t loopCount = 0;
while (true)
{
Expand Down Expand Up @@ -1305,11 +1316,13 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
}
case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
{
LatteCP_signalEnterWait();
LatteCP_itHLESwapScanBuffer(cmdData, nWords);
break;
}
case IT_HLE_WAIT_FOR_FLIP:
{
LatteCP_signalEnterWait();
LatteCP_itHLEWaitForFlip(cmdData, nWords);
break;
}
Expand Down Expand Up @@ -1594,12 +1607,14 @@ void LatteCP_ProcessRingbuffer()
}
case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
{
LatteCP_signalEnterWait();
LatteCP_itHLESwapScanBuffer(cmd, nWords);
timerRecheck += CP_TIMER_RECHECK / 64;
break;
}
case IT_HLE_WAIT_FOR_FLIP:
{
LatteCP_signalEnterWait();
LatteCP_itHLEWaitForFlip(cmd, nWords);
timerRecheck += CP_TIMER_RECHECK / 1;
break;
Expand Down
114 changes: 74 additions & 40 deletions src/Cafe/HW/Latte/Core/LatteIndices.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
#include "Common/cpu_features.h"

#if defined(ARCH_X86_64) && defined(__GNUC__)
Expand All @@ -9,32 +10,53 @@

// Cache of decoded index data, matched by source pointer, index count, primitive mode and index type.
// NOTE(review): this text is a rendered diff without +/- markers; the flat fields directly below look like
// the removed single-entry layout shown next to the new CacheEntry array. Confirm against the actual file.
struct
{
const void* lastPtr;
uint32 lastCount;
LattePrimitiveMode lastPrimitiveMode;
LatteIndexType lastIndexType;
// output
uint32 indexMin;
uint32 indexMax;
Renderer::INDEX_TYPE renderIndexType;
uint32 outputCount;
uint32 indexBufferOffset;
uint32 indexBufferIndex;
// one cached decode result
struct CacheEntry
{
// input data (compared against the decode request to detect a cache hit)
const void* lastPtr;
uint32 lastCount;
LattePrimitiveMode lastPrimitiveMode;
LatteIndexType lastIndexType;
// usage counter value taken when the entry was last filled or hit; the entry with the smallest value is replaced first
uint64 lastUsed;
// output (copied back to the caller on a cache hit)
uint32 indexMin;
uint32 indexMax;
Renderer::INDEX_TYPE renderIndexType;
uint32 outputCount;
Renderer::IndexAllocation indexAllocation;
};
// fixed pool of entries, searched linearly by LatteIndices_decode()
std::array<CacheEntry, 8> entry;
// source of the lastUsed values, advanced by LatteIndices_GetNextUsageIndex()
uint64 currentUsageCounter{0};
}LatteIndexCache{};

// Invalidates cached index data whose source pointer lies inside [memPtr, memPtr + size).
// Matching entries hand their allocation back to the renderer before being cleared.
// NOTE(review): this text is a rendered diff without +/- markers; the statements that touch
// LatteIndexCache.lastPtr / lastCount directly look like the removed single-entry version shown
// next to the new per-entry loop. Confirm against the actual file.
void LatteIndices_invalidate(const void* memPtr, uint32 size)
{
if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) )
for(auto& entry : LatteIndexCache.entry)
{
LatteIndexCache.lastPtr = nullptr;
LatteIndexCache.lastCount = 0;
// only the start address of the cached source data is tested against the range
if (entry.lastPtr >= memPtr && (entry.lastPtr < ((uint8*)memPtr + size)) )
{
// entries whose lastPtr is already null have been cleared, so the release is skipped for them
if(entry.lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
entry.lastPtr = nullptr;
entry.lastCount = 0;
}
}
}

// Clears every cache entry; entries that are still in use release their renderer allocation first.
// NOTE(review): this text is a rendered diff without +/- markers; the two assignments to
// LatteIndexCache.lastPtr / lastCount look like the removed single-entry version shown next to
// the new per-entry loop. Confirm against the actual file.
void LatteIndices_invalidateAll()
{
LatteIndexCache.lastPtr = nullptr;
LatteIndexCache.lastCount = 0;
for(auto& entry : LatteIndexCache.entry)
{
// entries whose lastPtr is already null have been cleared, so the release is skipped for them
if (entry.lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
entry.lastPtr = nullptr;
entry.lastCount = 0;
}
}

// Hands out the current value of the cache's usage counter and then advances the counter by one.
// The returned value is stored in CacheEntry::lastUsed so the least recently used entry can be found.
uint64 LatteIndices_GetNextUsageIndex()
{
	const uint64 usageIndex = LatteIndexCache.currentUsageCounter;
	LatteIndexCache.currentUsageCounter = usageIndex + 1;
	return usageIndex;
}

uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count)
Expand Down Expand Up @@ -532,7 +554,7 @@ void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIn
}
}

void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex)
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation)
{
// what this should do:
// [x] use fast SIMD-based index decoding
Expand All @@ -542,17 +564,18 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
// [ ] better cache implementation, allow to cache across frames

// reuse from cache if data didn't change
if (LatteIndexCache.lastPtr == indexData &&
LatteIndexCache.lastCount == count &&
LatteIndexCache.lastPrimitiveMode == primitiveMode &&
LatteIndexCache.lastIndexType == indexType)
{
indexMin = LatteIndexCache.indexMin;
indexMax = LatteIndexCache.indexMax;
renderIndexType = LatteIndexCache.renderIndexType;
outputCount = LatteIndexCache.outputCount;
indexBufferOffset = LatteIndexCache.indexBufferOffset;
indexBufferIndex = LatteIndexCache.indexBufferIndex;
auto cacheEntry = std::find_if(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [indexData, count, primitiveMode, indexType](const auto& entry)
{
return entry.lastPtr == indexData && entry.lastCount == count && entry.lastPrimitiveMode == primitiveMode && entry.lastIndexType == indexType;
});
if (cacheEntry != LatteIndexCache.entry.end())
{
indexMin = cacheEntry->indexMin;
indexMax = cacheEntry->indexMax;
renderIndexType = cacheEntry->renderIndexType;
outputCount = cacheEntry->outputCount;
indexAllocation = cacheEntry->indexAllocation;
cacheEntry->lastUsed = LatteIndices_GetNextUsageIndex();
return;
}

Expand All @@ -576,10 +599,12 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
indexMin = 0;
indexMax = std::max(count, 1u)-1;
renderIndexType = Renderer::INDEX_TYPE::NONE;
indexAllocation = {};
return; // no indices
}
// query index buffer from renderer
void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex);
indexAllocation = g_renderer->indexData_reserveIndexMemory(indexOutputSize);
void* indexOutputPtr = indexAllocation.mem;

// decode indices
indexMin = std::numeric_limits<uint32>::max();
Expand Down Expand Up @@ -704,16 +729,25 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
// recalculate index range but filter out primitive restart index
LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax);
}
g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize);
g_renderer->indexData_uploadIndexMemory(indexAllocation);
performanceMonitor.cycle[performanceMonitor.cycleIndex].indexDataUploaded += indexOutputSize;
// get least recently used cache entry
auto lruEntry = std::min_element(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [](const auto& a, const auto& b)
{
return a.lastUsed < b.lastUsed;
});
// invalidate previous allocation
if(lruEntry->lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(lruEntry->indexAllocation);
// update cache
LatteIndexCache.lastPtr = indexData;
LatteIndexCache.lastCount = count;
LatteIndexCache.lastPrimitiveMode = primitiveMode;
LatteIndexCache.lastIndexType = indexType;
LatteIndexCache.indexMin = indexMin;
LatteIndexCache.indexMax = indexMax;
LatteIndexCache.renderIndexType = renderIndexType;
LatteIndexCache.outputCount = outputCount;
LatteIndexCache.indexBufferOffset = indexBufferOffset;
LatteIndexCache.indexBufferIndex = indexBufferIndex;
lruEntry->lastPtr = indexData;
lruEntry->lastCount = count;
lruEntry->lastPrimitiveMode = primitiveMode;
lruEntry->lastIndexType = indexType;
lruEntry->indexMin = indexMin;
lruEntry->indexMax = indexMax;
lruEntry->renderIndexType = renderIndexType;
lruEntry->outputCount = outputCount;
lruEntry->indexAllocation = indexAllocation;
lruEntry->lastUsed = LatteIndices_GetNextUsageIndex();
}
2 changes: 1 addition & 1 deletion src/Cafe/HW/Latte/Core/LatteIndices.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@

void LatteIndices_invalidate(const void* memPtr, uint32 size);
void LatteIndices_invalidateAll();
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex);
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation);
6 changes: 6 additions & 0 deletions src/Cafe/HW/Latte/Core/LatteOverlay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,13 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
ImGui::Text("VRAM: %dMB / %dMB", g_state.vramUsage, g_state.vramTotal);

if (config.overlay.debug)
{
// general debug info
ImGui::Text("--- Debug info ---");
ImGui::Text("IndexUploadPerFrame: %dKB", (performanceMonitor.stats.indexDataUploadPerFrame+1023)/1024);
// backend specific info
g_renderer->AppendOverlayDebugInfo();
}

position.y += (ImGui::GetWindowSize().y + 10.0f) * direction;
}
Expand Down
3 changes: 1 addition & 2 deletions src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,14 @@ void LattePerformanceMonitor_frameEnd()
uniformBankDataUploadedPerFrame /= 1024ULL;
uint32 uniformBankCountUploadedPerFrame = (uint32)(uniformBankUploadedCount / (uint64)elapsedFrames);
uint64 indexDataUploadPerFrame = (indexDataUploaded / (uint64)elapsedFrames);
indexDataUploadPerFrame /= 1024ULL;

double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;
passedCycles = passedCycles * 1000ULL / totalElapsedTime;
uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
uint32 tlps = (uint32)((uint64)threadLeaveCount * 1000ULL / (uint64)totalElapsedTime);
// set stats

performanceMonitor.stats.indexDataUploadPerFrame = indexDataUploadPerFrame;
// next counter cycle
sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;
Expand Down
6 changes: 6 additions & 0 deletions src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,12 @@ typedef struct
LattePerfStatCounter numDrawBarriersPerFrame;
LattePerfStatCounter numBeginRenderpassPerFrame;
}vk;

// calculated stats (per frame)
struct
{
uint32 indexDataUploadPerFrame;
}stats;
}performanceMonitor_t;

extern performanceMonitor_t performanceMonitor;
Expand Down
1 change: 0 additions & 1 deletion src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
#include "Cafe/GraphicPack/GraphicPack2.h"
#include "config/ActiveSettings.h"
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h"
#include "gui/guiWrapper.h"
#include "Cafe/OS/libs/erreula/erreula.h"
#include "input/InputManager.h"
Expand Down
17 changes: 11 additions & 6 deletions src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,16 +102,21 @@ class OpenGLRenderer : public Renderer
static void SetAttributeArrayState(uint32 index, bool isEnabled, sint32 aluDivisor);
static void SetArrayElementBuffer(GLuint arrayElementBuffer);

// index
void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override
// index (not used by OpenGL renderer yet)
IndexAllocation indexData_reserveIndexMemory(uint32 size) override
{
assert_dbg();
return nullptr;
cemu_assert_unimplemented();
return {};
}

void indexData_uploadIndexMemory(uint32 offset, uint32 size) override
void indexData_releaseIndexMemory(IndexAllocation& allocation) override
{
assert_dbg();
cemu_assert_unimplemented();
}

void indexData_uploadIndexMemory(IndexAllocation& allocation) override
{
cemu_assert_unimplemented();
}

// uniform
Expand Down
11 changes: 9 additions & 2 deletions src/Cafe/HW/Latte/Renderer/Renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,15 @@ class Renderer
virtual void draw_endSequence() = 0;

// index
virtual void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) = 0;
virtual void indexData_uploadIndexMemory(uint32 offset, uint32 size) = 0;
// Handle for a block of index memory returned by indexData_reserveIndexMemory().
// Both members default to nullptr, so a default-constructed handle is a well-defined
// "no allocation" value instead of carrying indeterminate pointers.
struct IndexAllocation
{
	void* mem{nullptr}; // pointer to index data inside buffer
	void* rendererInternal{nullptr}; // for renderer use
};

virtual IndexAllocation indexData_reserveIndexMemory(uint32 size) = 0;
virtual void indexData_releaseIndexMemory(IndexAllocation& allocation) = 0;
virtual void indexData_uploadIndexMemory(IndexAllocation& allocation) = 0;

// occlusion queries
virtual LatteQueryObject* occlusionQuery_create() = 0;
Expand Down
Loading
Loading