From 50d9d9f1394ac9137fe6914af4905300f0de2f9e Mon Sep 17 00:00:00 2001
From: Mathias Agopian
Date: Tue, 6 Feb 2024 10:01:41 -0800
Subject: [PATCH 01/19] Improve memory allocations (#7540)

* Automatically flush CommandStream

  When generating commands, we now automatically flush the CommandStream, so
  that we're guaranteed not to overrun the circular buffer.

* Clean up the CircularBuffer implementation and API

  Also fix a bug in DEBUG mode that could corrupt the CircularBuffer: it was
  caused by debugging code attempting to clear the unused area of the buffer,
  which is wrong because in "ashmem" mode there are no guaranteed unused areas.

* Fix a couple of threading vs. allocation issues

  - prepareVisibleLights was run on a dedicated thread (via JobSystem), but was
    using its own local ArenaScope. This is wrong because it could reset the
    root arena at any later point. This is fixed by simply not using a local
    ArenaScope.
  - Related to the above, the root Arena (LinearAllocatorArena) didn't use a
    locking policy, which could also cause problems since some allocations are
    done off the main thread. We now pre-allocate the one buffer we need.

  This PR also renames some variables and types to improve readability.

* Rework RenderPass to improve allocations and API

  RenderPass is now a fully immutable object that gets constructed with a
  RenderPassBuilder. RenderPassBuilder can be passed around and doesn't do any
  (major) allocations. All RenderPass allocations and heavy lifting are done in
  RenderPassBuilder::build(). Additionally, RenderPass can no longer be copied.
  Where allocations happen is now much clearer.

* New LinearAllocatorWithFallback

  LinearAllocatorWithFallback is a linear allocator that can fall back to the
  heap allocator. We use it for the high-level command buffer to avoid crashing
  when running out of memory.
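A minimal sketch (not part of the patch) of how the reworked CircularBuffer is
meant to be used, based only on the new header in this diff; getBuffer()
replaces the old getHead()/getTail()/circularize() trio, and the buffer size
used here is an arbitrary placeholder:

    #include "private/backend/CircularBuffer.h"

    #include <cassert>

    void circularBufferSketch() {
        using filament::backend::CircularBuffer;

        // size() is now a constant set at construction time.
        CircularBuffer buffer(CircularBuffer::getBlockSize() * 4);

        // The producer may allocate at most size() bytes between two getBuffer()
        // calls; CommandStream now flushes automatically before that limit is hit.
        void* const cmd = buffer.allocate(64);
        assert(buffer.getUsed() == 64);
        (void)cmd;

        // getBuffer() returns the written [tail, head) range and resets the buffer
        // in one step. The caller must stop using the range before allocate() can
        // wrap back into it, i.e. before another size() - getUsed() bytes are
        // allocated.
        CircularBuffer::Range const range = buffer.getBuffer();
        assert(buffer.empty());
        (void)range;
    }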
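For reviewers, a minimal sketch (not part of the patch) of the new
builder-based flow, mirroring the call sites in ShadowMapManager.cpp and
PostProcessManager.cpp below; the function name and its parameters are
placeholders for whatever the real per-frame/per-view setup already provides:

    #include "RenderPass.h"       // RenderPass, RenderPassBuilder
    #include "details/Engine.h"   // FEngine

    namespace filament {

    void renderPassBuilderSketch(FEngine& engine, RenderPass::Arena& commandArena,
            FScene::RenderableSoa const& soa,
            utils::Range<uint32_t> visibleRenderables,
            backend::Handle<backend::HwBufferObject> renderablesUbo,
            CameraInfo const& cameraInfo,
            backend::Handle<backend::HwRenderTarget> renderTarget,
            backend::RenderPassParams const& params) {

        // The builder is cheap to create and to pass around; it does no significant
        // allocation until build() is called.
        RenderPassBuilder passBuilder(commandArena);
        passBuilder
                .camera(cameraInfo)
                .geometry(soa, visibleRenderables, renderablesUbo)
                .commandTypeFlags(RenderPass::CommandTypeFlags::COLOR);

        // build() does the heavy lifting: it allocates the commands from the arena
        // (falling back to the heap via LinearAllocatorWithFallback if the arena is
        // exhausted), then generates and sorts them.
        RenderPass const pass = passBuilder.build(engine);

        // RenderPass is immutable and move-only; execution is now a static helper
        // that brackets the Executor with beginRenderPass()/endRenderPass().
        RenderPass::execute(pass, engine, "sketchPass", renderTarget, params);
    }

    } // namespace filament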
FIXES=[277115740] * Update filament/src/RenderPass.h Co-authored-by: Powei Feng * Update libs/utils/include/utils/Allocator.h Co-authored-by: Powei Feng --------- Co-authored-by: Powei Feng --- .../include/private/backend/CircularBuffer.h | 45 +- .../private/backend/CommandBufferQueue.h | 13 +- .../include/private/backend/CommandStream.h | 2 + .../include/private/backend/HandleAllocator.h | 2 +- filament/backend/src/CircularBuffer.cpp | 71 ++- filament/backend/src/CommandBufferQueue.cpp | 62 ++- filament/src/Allocators.h | 2 +- filament/src/Froxelizer.cpp | 7 +- filament/src/Froxelizer.h | 2 +- filament/src/PostProcessManager.cpp | 34 +- filament/src/PostProcessManager.h | 5 +- filament/src/RenderPass.cpp | 467 +++++++++++------- filament/src/RenderPass.h | 250 ++++++---- filament/src/RendererUtils.cpp | 4 - filament/src/ShadowMapManager.cpp | 32 +- filament/src/ShadowMapManager.h | 4 +- filament/src/details/Engine.cpp | 2 +- filament/src/details/Engine.h | 26 +- filament/src/details/Renderer.cpp | 93 ++-- filament/src/details/Renderer.h | 5 +- filament/src/details/Scene.cpp | 10 +- filament/src/details/Scene.h | 6 +- filament/src/details/View.cpp | 52 +- filament/src/details/View.h | 9 +- libs/utils/include/utils/Allocator.h | 116 ++++- libs/utils/src/Allocator.cpp | 58 ++- 26 files changed, 856 insertions(+), 523 deletions(-) diff --git a/filament/backend/include/private/backend/CircularBuffer.h b/filament/backend/include/private/backend/CircularBuffer.h index aae6e69c03b..7d2de52b009 100644 --- a/filament/backend/include/private/backend/CircularBuffer.h +++ b/filament/backend/include/private/backend/CircularBuffer.h @@ -17,7 +17,10 @@ #ifndef TNT_FILAMENT_BACKEND_PRIVATE_CIRCULARBUFFER_H #define TNT_FILAMENT_BACKEND_PRIVATE_CIRCULARBUFFER_H +#include + #include +#include namespace filament::backend { @@ -37,28 +40,36 @@ class CircularBuffer { ~CircularBuffer() noexcept; - // allocates 'size' bytes in the circular buffer and returns a pointer to the memory - // return the current head and moves it forward by size bytes - inline void* allocate(size_t size) noexcept { + static size_t getBlockSize() noexcept { return sPageSize; } + + // Total size of circular buffer. This is a constant. + size_t size() const noexcept { return mSize; } + + // Allocates `s` bytes in the circular buffer and returns a pointer to the memory. All + // allocations must not exceed size() bytes. + inline void* allocate(size_t s) noexcept { + // We can never allocate more that size(). + assert_invariant(getUsed() + s <= size()); char* const cur = static_cast(mHead); - mHead = cur + size; + mHead = cur + s; return cur; } - // Total size of circular buffer - size_t size() const noexcept { return mSize; } - - // returns true if the buffer is empty (e.g. after calling flush) + // Returns true if the buffer is empty, i.e.: no allocations were made since + // calling getBuffer(); bool empty() const noexcept { return mTail == mHead; } - void* getHead() const noexcept { return mHead; } - - void* getTail() const noexcept { return mTail; } + // Returns the size used since the last call to getBuffer() + size_t getUsed() const noexcept { return intptr_t(mHead) - intptr_t(mTail); } - // call at least once every getRequiredSize() bytes allocated from the buffer - void circularize() noexcept; - - static size_t getBlockSize() noexcept { return sPageSize; } + // Retrieves the current allocated range and frees it. 
It is the responsibility of the caller + // to make sure the returned range is no longer in use by the time allocate() allocates + // (size() - getUsed()) bytes. + struct Range { + void* tail; + void* head; + }; + Range getBuffer() noexcept; private: void* alloc(size_t size) noexcept; @@ -66,10 +77,10 @@ class CircularBuffer { // pointer to the beginning of the circular buffer (constant) void* mData = nullptr; - int mUsesAshmem = -1; + int mAshmemFd = -1; // size of the circular buffer (constant) - size_t mSize = 0; + size_t const mSize; // pointer to the beginning of recorded data void* mTail = nullptr; diff --git a/filament/backend/include/private/backend/CommandBufferQueue.h b/filament/backend/include/private/backend/CommandBufferQueue.h index 6a434477789..28122452386 100644 --- a/filament/backend/include/private/backend/CommandBufferQueue.h +++ b/filament/backend/include/private/backend/CommandBufferQueue.h @@ -33,7 +33,7 @@ namespace filament::backend { * A producer-consumer command queue that uses a CircularBuffer as main storage */ class CommandBufferQueue { - struct Slice { + struct Range { void* begin; void* end; }; @@ -46,7 +46,7 @@ class CommandBufferQueue { mutable utils::Mutex mLock; mutable utils::Condition mCondition; - mutable std::vector mCommandBuffersToExecute; + mutable std::vector mCommandBuffersToExecute; size_t mFreeSpace = 0; size_t mHighWatermark = 0; uint32_t mExitRequested = 0; @@ -58,17 +58,20 @@ class CommandBufferQueue { CommandBufferQueue(size_t requiredSize, size_t bufferSize); ~CommandBufferQueue(); - CircularBuffer& getCircularBuffer() { return mCircularBuffer; } + CircularBuffer& getCircularBuffer() noexcept { return mCircularBuffer; } + CircularBuffer const& getCircularBuffer() const noexcept { return mCircularBuffer; } + + size_t getCapacity() const noexcept { return mRequiredSize; } size_t getHighWatermark() const noexcept { return mHighWatermark; } // wait for commands to be available and returns an array containing these commands - std::vector waitForCommands() const; + std::vector waitForCommands() const; // return the memory used by this command buffer to the circular buffer // WARNING: releaseBuffer() must be called in sequence of the Slices returned by // waitForCommands() - void releaseBuffer(Slice const& buffer); + void releaseBuffer(Range const& buffer); // all commands buffers (Slices) written to this point are returned by waitForCommand(). This // call blocks until the CircularBuffer has at least mRequiredSize bytes available. 
diff --git a/filament/backend/include/private/backend/CommandStream.h b/filament/backend/include/private/backend/CommandStream.h index be84b323ad0..985fa5fcd6e 100644 --- a/filament/backend/include/private/backend/CommandStream.h +++ b/filament/backend/include/private/backend/CommandStream.h @@ -213,6 +213,8 @@ class CommandStream { CommandStream(CommandStream const& rhs) noexcept = delete; CommandStream& operator=(CommandStream const& rhs) noexcept = delete; + CircularBuffer const& getCircularBuffer() const noexcept { return mCurrentBuffer; } + public: #define DECL_DRIVER_API(methodName, paramsDecl, params) \ inline void methodName(paramsDecl) { \ diff --git a/filament/backend/include/private/backend/HandleAllocator.h b/filament/backend/include/private/backend/HandleAllocator.h index aa5f53be695..3a336e8d6e6 100644 --- a/filament/backend/include/private/backend/HandleAllocator.h +++ b/filament/backend/include/private/backend/HandleAllocator.h @@ -231,7 +231,7 @@ class HandleAllocator { explicit Allocator(const utils::AreaPolicy::HeapArea& area); // this is in fact always called with a constexpr size argument - [[nodiscard]] inline void* alloc(size_t size, size_t, size_t extra) noexcept { + [[nodiscard]] inline void* alloc(size_t size, size_t, size_t extra = 0) noexcept { void* p = nullptr; if (size <= mPool0.getSize()) p = mPool0.alloc(size, 16, extra); else if (size <= mPool1.getSize()) p = mPool1.alloc(size, 16, extra); diff --git a/filament/backend/src/CircularBuffer.cpp b/filament/backend/src/CircularBuffer.cpp index d9a877d3f59..41dd4173008 100644 --- a/filament/backend/src/CircularBuffer.cpp +++ b/filament/backend/src/CircularBuffer.cpp @@ -16,6 +16,14 @@ #include "private/backend/CircularBuffer.h" +#include +#include +#include +#include +#include +#include +#include + #if !defined(WIN32) && !defined(__EMSCRIPTEN__) && !defined(IOS) # include # include @@ -24,23 +32,20 @@ # define HAS_MMAP 0 #endif +#include +#include +#include #include -#include -#include -#include -#include -#include - using namespace utils; namespace filament::backend { size_t CircularBuffer::sPageSize = arch::getPageSize(); -CircularBuffer::CircularBuffer(size_t size) { +CircularBuffer::CircularBuffer(size_t size) + : mSize(size) { mData = alloc(size); - mSize = size; mTail = mData; mHead = mData; } @@ -85,7 +90,7 @@ void* CircularBuffer::alloc(size_t size) noexcept { MAP_PRIVATE, fd, (off_t)size); if (vaddr_guard != MAP_FAILED && (vaddr_guard == (char*)vaddr_shadow + size)) { // woo-hoo success! 
- mUsesAshmem = fd; + mAshmemFd = fd; data = vaddr; } } @@ -93,7 +98,7 @@ void* CircularBuffer::alloc(size_t size) noexcept { } } - if (UTILS_UNLIKELY(mUsesAshmem < 0)) { + if (UTILS_UNLIKELY(mAshmemFd < 0)) { // ashmem failed if (vaddr_guard != MAP_FAILED) { munmap(vaddr_guard, size); @@ -137,9 +142,9 @@ void CircularBuffer::dealloc() noexcept { if (mData) { size_t const BLOCK_SIZE = getBlockSize(); munmap(mData, mSize * 2 + BLOCK_SIZE); - if (mUsesAshmem >= 0) { - close(mUsesAshmem); - mUsesAshmem = -1; + if (mAshmemFd >= 0) { + close(mAshmemFd); + mAshmemFd = -1; } } #else @@ -149,23 +154,37 @@ void CircularBuffer::dealloc() noexcept { } -void CircularBuffer::circularize() noexcept { - if (mUsesAshmem > 0) { - intptr_t const overflow = intptr_t(mHead) - (intptr_t(mData) + ssize_t(mSize)); - if (overflow >= 0) { - assert_invariant(size_t(overflow) <= mSize); - mHead = (void *) (intptr_t(mData) + overflow); - #ifndef NDEBUG - memset(mData, 0xA5, size_t(overflow)); - #endif - } - } else { - // Only circularize if mHead if in the second buffer. - if (intptr_t(mHead) - intptr_t(mData) > ssize_t(mSize)) { +CircularBuffer::Range CircularBuffer::getBuffer() noexcept { + Range const range{ .tail = mTail, .head = mHead }; + + char* const pData = static_cast(mData); + char const* const pEnd = pData + mSize; + char const* const pHead = static_cast(mHead); + if (UTILS_UNLIKELY(pHead >= pEnd)) { + size_t const overflow = pHead - pEnd; + if (UTILS_LIKELY(mAshmemFd > 0)) { + assert_invariant(overflow <= mSize); + mHead = static_cast(pData + overflow); + // Data Tail End Head [virtual] + // v v v v + // +-------------:----+-----:--------------+ + // | : | : | + // +-----:------------+--------------------+ + // Head |<------ copy ------>| [physical] + } else { + // Data Tail End Head + // v v v v + // +-------------:----+-----:--------------+ + // | : | : | + // +-----|------------+-----|--------------+ + // |<---------------->| + // sliding window mHead = mData; } } mTail = mHead; + + return range; } } // namespace filament::backend diff --git a/filament/backend/src/CommandBufferQueue.cpp b/filament/backend/src/CommandBufferQueue.cpp index ccf9d33a0d7..e3e5de045c8 100644 --- a/filament/backend/src/CommandBufferQueue.cpp +++ b/filament/backend/src/CommandBufferQueue.cpp @@ -15,14 +15,25 @@ */ #include "private/backend/CommandBufferQueue.h" +#include "private/backend/CircularBuffer.h" +#include "private/backend/CommandStream.h" +#include #include -#include +#include +#include #include +#include #include -#include "private/backend/BackendUtils.h" -#include "private/backend/CommandStream.h" +#include +#include +#include +#include +#include + +#include +#include using namespace utils; @@ -65,50 +76,53 @@ void CommandBufferQueue::flush() noexcept { // always guaranteed to have enough space for the NoopCommand new(circularBuffer.allocate(sizeof(NoopCommand))) NoopCommand(nullptr); - // end of this slice - void* const head = circularBuffer.getHead(); + const size_t requiredSize = mRequiredSize; - // beginning of this slice - void* const tail = circularBuffer.getTail(); + // get the current buffer + auto const [begin, end] = circularBuffer.getBuffer(); - // size of this slice - uint32_t const used = uint32_t(intptr_t(head) - intptr_t(tail)); + assert_invariant(circularBuffer.empty()); - circularBuffer.circularize(); + // size of the current buffer + size_t const used = std::distance( + static_cast(begin), static_cast(end)); std::unique_lock lock(mLock); - mCommandBuffersToExecute.push_back({ tail, head }); + 
mCommandBuffersToExecute.push_back({ begin, end }); + mCondition.notify_one(); // circular buffer is too small, we corrupted the stream ASSERT_POSTCONDITION(used <= mFreeSpace, "Backend CommandStream overflow. Commands are corrupted and unrecoverable.\n" "Please increase minCommandBufferSizeMB inside the Config passed to Engine::create.\n" - "Space used at this time: %u bytes", - (unsigned)used); + "Space used at this time: %u bytes, overflow: %u bytes", + (unsigned)used, unsigned(used - mFreeSpace)); // wait until there is enough space in the buffer mFreeSpace -= used; - const size_t requiredSize = mRequiredSize; + if (UTILS_UNLIKELY(mFreeSpace < requiredSize)) { + #ifndef NDEBUG - size_t totalUsed = circularBuffer.size() - mFreeSpace; - mHighWatermark = std::max(mHighWatermark, totalUsed); - if (UTILS_UNLIKELY(totalUsed > requiredSize)) { - slog.d << "CommandStream used too much space: " << totalUsed - << ", out of " << requiredSize << " (will block)" << io::endl; - } + size_t const totalUsed = circularBuffer.size() - mFreeSpace; + slog.d << "CommandStream used too much space (will block): " + << "needed space " << requiredSize << " out of " << mFreeSpace + << ", totalUsed=" << totalUsed << ", current=" << used + << ", queue size=" << mCommandBuffersToExecute.size() << " buffers" + << io::endl; + + mHighWatermark = std::max(mHighWatermark, totalUsed); #endif - mCondition.notify_one(); - if (UTILS_LIKELY(mFreeSpace < requiredSize)) { SYSTRACE_NAME("waiting: CircularBuffer::flush()"); mCondition.wait(lock, [this, requiredSize]() -> bool { + // TODO: on macOS, we need to call pumpEvents from time to time return mFreeSpace >= requiredSize; }); } } -std::vector CommandBufferQueue::waitForCommands() const { +std::vector CommandBufferQueue::waitForCommands() const { if (!UTILS_HAS_THREADING) { return std::move(mCommandBuffersToExecute); } @@ -123,7 +137,7 @@ std::vector CommandBufferQueue::waitForCommands() con return std::move(mCommandBuffersToExecute); } -void CommandBufferQueue::releaseBuffer(CommandBufferQueue::Slice const& buffer) { +void CommandBufferQueue::releaseBuffer(CommandBufferQueue::Range const& buffer) { std::lock_guard const lock(mLock); mFreeSpace += uintptr_t(buffer.end) - uintptr_t(buffer.begin); mCondition.notify_one(); diff --git a/filament/src/Allocators.h b/filament/src/Allocators.h index eb354b8d329..84962e30c0e 100644 --- a/filament/src/Allocators.h +++ b/filament/src/Allocators.h @@ -54,7 +54,7 @@ using LinearAllocatorArena = utils::Arena< #endif -using ArenaScope = utils::ArenaScope; +using RootArenaScope = utils::ArenaScope; } // namespace filament diff --git a/filament/src/Froxelizer.cpp b/filament/src/Froxelizer.cpp index c469932c251..47bd0d343dd 100644 --- a/filament/src/Froxelizer.cpp +++ b/filament/src/Froxelizer.cpp @@ -168,7 +168,8 @@ void Froxelizer::setProjection(const mat4f& projection, } bool Froxelizer::prepare( - FEngine::DriverApi& driverApi, ArenaScope& arena, filament::Viewport const& viewport, + FEngine::DriverApi& driverApi, RootArenaScope& rootArenaScope, + filament::Viewport const& viewport, const mat4f& projection, float projectionNear, float projectionFar) noexcept { setViewport(viewport); setProjection(projection, projectionNear, projectionFar); @@ -199,12 +200,12 @@ bool Froxelizer::prepare( // light records per froxel (~256 KiB) mLightRecords = { - arena.allocate(getFroxelBufferEntryCount(), CACHELINE_SIZE), + rootArenaScope.allocate(getFroxelBufferEntryCount(), CACHELINE_SIZE), getFroxelBufferEntryCount() }; // froxel thread data (~256 KiB) 
mFroxelShardedData = { - arena.allocate(GROUP_COUNT, CACHELINE_SIZE), + rootArenaScope.allocate(GROUP_COUNT, CACHELINE_SIZE), uint32_t(GROUP_COUNT) }; diff --git a/filament/src/Froxelizer.h b/filament/src/Froxelizer.h index 27885e24bc7..27ba3c57641 100644 --- a/filament/src/Froxelizer.h +++ b/filament/src/Froxelizer.h @@ -110,7 +110,7 @@ class Froxelizer { * * return true if updateUniforms() needs to be called */ - bool prepare(backend::DriverApi& driverApi, ArenaScope& arena, Viewport const& viewport, + bool prepare(backend::DriverApi& driverApi, RootArenaScope& rootArenaScope, Viewport const& viewport, const math::mat4f& projection, float projectionNear, float projectionFar) noexcept; Froxel getFroxelAt(size_t x, size_t y, size_t z) const noexcept; diff --git a/filament/src/PostProcessManager.cpp b/filament/src/PostProcessManager.cpp index 78814f74852..f186ee9cb6d 100644 --- a/filament/src/PostProcessManager.cpp +++ b/filament/src/PostProcessManager.cpp @@ -414,7 +414,7 @@ void PostProcessManager::commitAndRender(FrameGraphResources::RenderPassInfo con // ------------------------------------------------------------------------------------------------ PostProcessManager::StructurePassOutput PostProcessManager::structure(FrameGraph& fg, - RenderPass const& pass, uint8_t structureRenderFlags, + RenderPassBuilder const& passBuilder, uint8_t structureRenderFlags, uint32_t width, uint32_t height, StructurePassConfig const& config) noexcept { @@ -466,17 +466,19 @@ PostProcessManager::StructurePassOutput PostProcessManager::structure(FrameGraph .clearFlags = TargetBufferFlags::COLOR0 | TargetBufferFlags::DEPTH }); }, - [=, renderPass = pass](FrameGraphResources const& resources, + [=, passBuilder = passBuilder](FrameGraphResources const& resources, auto const&, DriverApi&) mutable { Variant structureVariant(Variant::DEPTH_VARIANT); structureVariant.setPicking(config.picking); auto out = resources.getRenderPassInfo(); - renderPass.setRenderFlags(structureRenderFlags); - renderPass.setVariant(structureVariant); - renderPass.appendCommands(mEngine, RenderPass::CommandTypeFlags::SSAO); - renderPass.sortCommands(mEngine); - renderPass.execute(mEngine, resources.getPassName(), out.target, out.params); + + passBuilder.renderFlags(structureRenderFlags); + passBuilder.variant(structureVariant); + passBuilder.commandTypeFlags(RenderPass::CommandTypeFlags::SSAO); + + RenderPass const pass{ passBuilder.build(mEngine) }; + RenderPass::execute(pass, mEngine, resources.getPassName(), out.target, out.params); }); auto depth = structurePass->depth; @@ -523,7 +525,7 @@ PostProcessManager::StructurePassOutput PostProcessManager::structure(FrameGraph // ------------------------------------------------------------------------------------------------ FrameGraphId PostProcessManager::ssr(FrameGraph& fg, - RenderPass const& pass, + RenderPassBuilder const& passBuilder, FrameHistory const& frameHistory, CameraInfo const& cameraInfo, PerViewUniforms& uniforms, @@ -586,7 +588,7 @@ FrameGraphId PostProcessManager::ssr(FrameGraph& fg, }, [this, projection = cameraInfo.projection, userViewMatrix = cameraInfo.getUserViewMatrix(), uvFromClipMatrix, historyProjection, - options, &uniforms, renderPass = pass] + options, &uniforms, passBuilder = passBuilder] (FrameGraphResources const& resources, auto const& data, DriverApi& driver) mutable { // set structure sampler uniforms.prepareStructure(data.structure ? 
@@ -607,17 +609,17 @@ FrameGraphId PostProcessManager::ssr(FrameGraph& fg, auto out = resources.getRenderPassInfo(); // Remove the HAS_SHADOWING RenderFlags, since it's irrelevant when rendering reflections - RenderPass::RenderFlags flags = renderPass.getRenderFlags(); - flags &= ~RenderPass::HAS_SHADOWING; - renderPass.setRenderFlags(flags); + passBuilder.renderFlags(~RenderPass::HAS_SHADOWING, 0); // use our special SSR variant, it can only be applied to object that have // the SCREEN_SPACE ReflectionMode. - renderPass.setVariant(Variant{Variant::SPECIAL_SSR}); + passBuilder.variant(Variant{ Variant::SPECIAL_SSR }); + // generate all our drawing commands, except blended objects. - renderPass.appendCommands(mEngine, RenderPass::CommandTypeFlags::SCREEN_SPACE_REFLECTIONS); - renderPass.sortCommands(mEngine); - renderPass.execute(mEngine, resources.getPassName(), out.target, out.params); + passBuilder.commandTypeFlags(RenderPass::CommandTypeFlags::SCREEN_SPACE_REFLECTIONS); + + RenderPass const pass{ passBuilder.build(mEngine) }; + RenderPass::execute(pass, mEngine, resources.getPassName(), out.target, out.params); }); return ssrPass->reflections; diff --git a/filament/src/PostProcessManager.h b/filament/src/PostProcessManager.h index 081e795f061..12b211dc238 100644 --- a/filament/src/PostProcessManager.h +++ b/filament/src/PostProcessManager.h @@ -50,6 +50,7 @@ class FMaterialInstance; class FrameGraph; class PerViewUniforms; class RenderPass; +class RenderPassBuilder; struct CameraInfo; class PostProcessManager { @@ -99,12 +100,12 @@ class PostProcessManager { FrameGraphId picking; }; StructurePassOutput structure(FrameGraph& fg, - RenderPass const& pass, uint8_t structureRenderFlags, + RenderPassBuilder const& passBuilder, uint8_t structureRenderFlags, uint32_t width, uint32_t height, StructurePassConfig const& config) noexcept; // reflections pass FrameGraphId ssr(FrameGraph& fg, - RenderPass const& pass, + RenderPassBuilder const& passBuilder, FrameHistory const& frameHistory, CameraInfo const& cameraInfo, PerViewUniforms& uniforms, diff --git a/filament/src/RenderPass.cpp b/filament/src/RenderPass.cpp index 2932fcf481b..b585c4d328b 100644 --- a/filament/src/RenderPass.cpp +++ b/filament/src/RenderPass.cpp @@ -19,17 +19,43 @@ #include "RenderPrimitive.h" #include "ShadowMap.h" +#include "details/Camera.h" #include "details/Material.h" #include "details/MaterialInstance.h" #include "details/View.h" +#include "components/RenderableManager.h" + +#include #include +#include + +#include + +#include +#include +#include +#include + +#include "private/backend/CircularBuffer.h" +#include +#include #include +#include +#include #include +#include +#include +#include +#include #include +#include +#include +#include + using namespace utils; using namespace filament::math; @@ -37,60 +63,103 @@ namespace filament { using namespace backend; -RenderPass::RenderPass(FEngine& engine, - RenderPass::Arena& arena) noexcept - : mCommandArena(arena), - mCustomCommands(engine.getPerRenderPassAllocator()) { +RenderPassBuilder& RenderPassBuilder::customCommand( + FEngine& engine, + uint8_t channel, + RenderPass::Pass pass, + RenderPass::CustomCommand custom, + uint32_t order, + RenderPass::Executor::CustomCommandFn const& command) { + if (!mCustomCommands.has_value()) { + // construct the vector the first time + mCustomCommands.emplace(engine.getPerRenderPassArena()); + } + mCustomCommands->emplace_back(channel, pass, custom, order, command); + return *this; } -RenderPass::RenderPass(RenderPass const& rhs) = 
default; +RenderPass RenderPassBuilder::build(FEngine& engine) { + ASSERT_POSTCONDITION(mRenderableSoa, "RenderPassBuilder::geometry() hasn't been called"); + assert_invariant(mScissorViewport.width <= std::numeric_limits::max()); + assert_invariant(mScissorViewport.height <= std::numeric_limits::max()); + return RenderPass{ engine, *this }; +} -// this destructor is actually heavy because it inlines ~vector<> -RenderPass::~RenderPass() noexcept = default; +// ------------------------------------------------------------------------------------------------ + +RenderPass::RenderPass(FEngine& engine, RenderPassBuilder const& builder) noexcept + : mRenderableSoa(*builder.mRenderableSoa), + mVisibleRenderables(builder.mVisibleRenderables), + mUboHandle(builder.mUboHandle), + mCameraPosition(builder.mCameraPosition), + mCameraForwardVector(builder.mCameraForwardVector), + mFlags(builder.mFlags), + mVariant(builder.mVariant), + mVisibilityMask(builder.mVisibilityMask), + mScissorViewport(builder.mScissorViewport), + mCustomCommands(engine.getPerRenderPassArena()) { + + // compute the number of commands we need + updateSummedPrimitiveCounts( + const_cast(mRenderableSoa), mVisibleRenderables); + + uint32_t commandCount = + FScene::getPrimitiveCount(mRenderableSoa, mVisibleRenderables.last); + const bool colorPass = bool(builder.mCommandTypeFlags & CommandTypeFlags::COLOR); + const bool depthPass = bool(builder.mCommandTypeFlags & CommandTypeFlags::DEPTH); + commandCount *= uint32_t(colorPass * 2 + depthPass); + commandCount += 1; // for the sentinel + + uint32_t const customCommandCount = + builder.mCustomCommands.has_value() ? builder.mCustomCommands->size() : 0; -RenderPass::Command* RenderPass::append(size_t count) noexcept { - // this is like an "in-place" realloc(). Works only with LinearAllocator. - Command* const curr = mCommandArena.alloc(count); + Command* const curr = builder.mArena.alloc(commandCount + customCommandCount); assert_invariant(curr); - assert_invariant(mCommandBegin == nullptr || curr == mCommandEnd); - if (mCommandBegin == nullptr) { - mCommandBegin = mCommandEnd = curr; + + if (UTILS_UNLIKELY(builder.mArena.getAllocator().isHeapAllocation(curr))) { + static bool sLogOnce = true; + if (UTILS_UNLIKELY(sLogOnce)) { + sLogOnce = false; + PANIC_LOG("RenderPass arena is full, using slower system heap. Please increase " + "the appropriate constant (e.g. 
FILAMENT_PER_RENDER_PASS_ARENA_SIZE_IN_MB)."); + } } - mCommandEnd += count; - return curr; -} -void RenderPass::resize(size_t count) noexcept { - if (mCommandBegin) { - mCommandEnd = mCommandBegin + count; - mCommandArena.rewind(mCommandEnd); + mCommandBegin = curr; + mCommandEnd = curr + commandCount + customCommandCount; + + appendCommands(engine, { curr, commandCount }, builder.mCommandTypeFlags); + + if (builder.mCustomCommands.has_value()) { + Command* p = curr + commandCount; + for (auto [channel, passId, command, order, fn]: builder.mCustomCommands.value()) { + appendCustomCommand(p++, channel, passId, command, order, fn); + } } -} -void RenderPass::setGeometry(FScene::RenderableSoa const& soa, Range vr, - backend::Handle uboHandle) noexcept { - mRenderableSoa = &soa; - mVisibleRenderables = vr; - mUboHandle = uboHandle; -} + // sort commands once we're done adding commands + sortCommands(builder.mArena); -void RenderPass::setCamera(const CameraInfo& camera) noexcept { - mCameraPosition = camera.getPosition(); - mCameraForwardVector = camera.getForwardVector(); + if (engine.isAutomaticInstancingEnabled()) { + instanceify(engine, builder.mArena); + } } -void RenderPass::setScissorViewport(backend::Viewport viewport) noexcept { - assert_invariant(viewport.width <= std::numeric_limits::max()); - assert_invariant(viewport.height <= std::numeric_limits::max()); - mScissorViewport = viewport; +// this destructor is actually heavy because it inlines ~vector<> +RenderPass::~RenderPass() noexcept = default; + +void RenderPass::resize(Arena& arena, size_t count) noexcept { + if (mCommandBegin) { + mCommandEnd = mCommandBegin + count; + arena.rewind(mCommandEnd); + } } -void RenderPass::appendCommands(FEngine& engine, CommandTypeFlags const commandTypeFlags) noexcept { +void RenderPass::appendCommands(FEngine& engine, + Slice commands, CommandTypeFlags const commandTypeFlags) noexcept { SYSTRACE_CALL(); SYSTRACE_CONTEXT(); - assert_invariant(mRenderableSoa); - utils::Range const vr = mVisibleRenderables; // trace the number of visible renderables SYSTRACE_VALUE32("visibleRenderables", vr.size()); @@ -104,17 +173,10 @@ void RenderPass::appendCommands(FEngine& engine, CommandTypeFlags const commandT const FScene::VisibleMaskType visibilityMask = mVisibilityMask; // up-to-date summed primitive counts needed for generateCommands() - FScene::RenderableSoa const& soa = *mRenderableSoa; - updateSummedPrimitiveCounts(const_cast(soa), vr); + FScene::RenderableSoa const& soa = mRenderableSoa; - // compute how much maximum storage we need for this pass - uint32_t commandCount = FScene::getPrimitiveCount(soa, vr.last); - // double the color pass for transparent objects that need to render twice - const bool colorPass = bool(commandTypeFlags & CommandTypeFlags::COLOR); - const bool depthPass = bool(commandTypeFlags & CommandTypeFlags::DEPTH); - commandCount *= uint32_t(colorPass * 2 + depthPass); - commandCount += 1; // for the sentinel - Command* const curr = append(commandCount); + Command* curr = commands.data(); + size_t const commandCount = commands.size(); auto stereoscopicEyeCount = renderFlags & IS_STEREOSCOPIC ? 
engine.getConfig().stereoscopicEyeCount : 1; @@ -152,7 +214,8 @@ void RenderPass::appendCommands(FEngine& engine, CommandTypeFlags const commandT } } -void RenderPass::appendCustomCommand(uint8_t channel, Pass pass, CustomCommand custom, uint32_t order, +void RenderPass::appendCustomCommand(Command* commands, + uint8_t channel, Pass pass, CustomCommand custom, uint32_t order, Executor::CustomCommandFn command) { assert_invariant((uint64_t(order) << CUSTOM_ORDER_SHIFT) <= CUSTOM_ORDER_MASK); @@ -168,11 +231,10 @@ void RenderPass::appendCustomCommand(uint8_t channel, Pass pass, CustomCommand c cmd |= uint64_t(order) << CUSTOM_ORDER_SHIFT; cmd |= uint64_t(index); - Command* const curr = append(1); - curr->key = cmd; + commands->key = cmd; } -void RenderPass::sortCommands(FEngine& engine) noexcept { +void RenderPass::sortCommands(Arena& arena) noexcept { SYSTRACE_NAME("sort and trim commands"); std::sort(mCommandBegin, mCommandEnd); @@ -183,30 +245,20 @@ void RenderPass::sortCommands(FEngine& engine) noexcept { return c.key != uint64_t(Pass::SENTINEL); }); - resize(uint32_t(last - mCommandBegin)); - - if (engine.isAutomaticInstancingEnabled()) { - instanceify(engine); - } + resize(arena, uint32_t(last - mCommandBegin)); } -void RenderPass::execute(FEngine& engine, const char* name, +void RenderPass::execute(RenderPass const& pass, + FEngine& engine, const char* name, backend::Handle renderTarget, - backend::RenderPassParams params) const noexcept { - + backend::RenderPassParams params) noexcept { DriverApi& driver = engine.getDriverApi(); - - // this is a good time to flush the CommandStream, because we're about to potentially - // output a lot of commands. This guarantees here that we have at least - // FILAMENT_MIN_COMMAND_BUFFERS_SIZE_IN_MB bytes (1MiB by default). - engine.flush(); - driver.beginRenderPass(renderTarget, params); - getExecutor().execute(engine, name); + pass.getExecutor().execute(engine, name); driver.endRenderPass(); } -void RenderPass::instanceify(FEngine& engine) noexcept { +void RenderPass::instanceify(FEngine& engine, Arena& arena) noexcept { SYSTRACE_NAME("instanceify"); // instanceify works by scanning the **sorted** command stream, looking for repeat draw @@ -262,7 +314,8 @@ void RenderPass::instanceify(FEngine& engine) noexcept { // buffer large enough for all instances data stagingBufferSize = sizeof(PerRenderableData) * (last - curr); stagingBuffer = (PerRenderableData*)::malloc(stagingBufferSize); - uboData = mRenderableSoa->data(); + uboData = mRenderableSoa.data(); + assert_invariant(uboData); } // copy the ubo data to a staging buffer @@ -315,7 +368,7 @@ void RenderPass::instanceify(FEngine& engine) noexcept { return command.key == uint64_t(Pass::SENTINEL); }); - resize(uint32_t(lastCommand - mCommandBegin)); + resize(arena, uint32_t(lastCommand - mCommandBegin)); } assert_invariant(stagingBuffer == nullptr); @@ -323,7 +376,7 @@ void RenderPass::instanceify(FEngine& engine) noexcept { /* static */ -UTILS_ALWAYS_INLINE // this function exists only to make the code more readable. we want it inlined. +UTILS_ALWAYS_INLINE // This function exists only to make the code more readable. we want it inlined. 
inline // and we don't need it in the compilation unit void RenderPass::setupColorCommand(Command& cmdDraw, Variant variant, FMaterialInstance const* const UTILS_RESTRICT mi, bool inverseFrontFaces) noexcept { @@ -374,7 +427,7 @@ void RenderPass::setupColorCommand(Command& cmdDraw, Variant variant, /* static */ UTILS_NOINLINE -void RenderPass::generateCommands(uint32_t commandTypeFlags, Command* const commands, +void RenderPass::generateCommands(CommandTypeFlags commandTypeFlags, Command* const commands, FScene::RenderableSoa const& soa, Range range, Variant variant, RenderFlags renderFlags, FScene::VisibleMaskType visibilityMask, float3 cameraPosition, float3 cameraForward, @@ -432,9 +485,9 @@ void RenderPass::generateCommands(uint32_t commandTypeFlags, Command* const comm } /* static */ -template +template UTILS_NOINLINE -RenderPass::Command* RenderPass::generateCommandsImpl(uint32_t extraFlags, +RenderPass::Command* RenderPass::generateCommandsImpl(RenderPass::CommandTypeFlags extraFlags, Command* UTILS_RESTRICT curr, FScene::RenderableSoa const& UTILS_RESTRICT soa, Range range, Variant const variant, RenderFlags renderFlags, FScene::VisibleMaskType visibilityMask, @@ -737,13 +790,13 @@ void RenderPass::updateSummedPrimitiveCounts( // ------------------------------------------------------------------------------------------------ void RenderPass::Executor::overridePolygonOffset(backend::PolygonOffset const* polygonOffset) noexcept { - if ((mPolygonOffsetOverride = (polygonOffset != nullptr))) { + if ((mPolygonOffsetOverride = (polygonOffset != nullptr))) { // NOLINT(*-assignment-in-if-condition) mPolygonOffset = *polygonOffset; } } void RenderPass::Executor::overrideScissor(backend::Viewport const* scissor) noexcept { - if ((mScissorOverride = (scissor != nullptr))) { + if ((mScissorOverride = (scissor != nullptr))) { // NOLINT(*-assignment-in-if-condition) mScissor = *scissor; } } @@ -754,15 +807,20 @@ void RenderPass::Executor::overrideScissor(backend::Viewport const& scissor) noe } void RenderPass::Executor::execute(FEngine& engine, const char*) const noexcept { - execute(engine.getDriverApi(), mCommands.begin(), mCommands.end()); + execute(engine, mCommands.begin(), mCommands.end()); } UTILS_NOINLINE // no need to be inlined -void RenderPass::Executor::execute(backend::DriverApi& driver, +void RenderPass::Executor::execute(FEngine& engine, const Command* first, const Command* last) const noexcept { + SYSTRACE_CALL(); SYSTRACE_CONTEXT(); + DriverApi& driver = engine.getDriverApi(); + size_t const capacity = engine.getMinCommandBufferSize(); + CircularBuffer const& circularBuffer = driver.getCircularBuffer(); + if (first != last) { SYSTRACE_VALUE32("commandCount", last - first); @@ -781,126 +839,163 @@ void RenderPass::Executor::execute(backend::DriverApi& driver, FMaterial const* UTILS_RESTRICT ma = nullptr; auto const* UTILS_RESTRICT pCustomCommands = mCustomCommands.data(); - first--; - while (++first != last) { - assert_invariant(first->key != uint64_t(Pass::SENTINEL)); - - /* - * Be careful when changing code below, this is the hot inner-loop - */ - - if (UTILS_UNLIKELY((first->key & CUSTOM_MASK) != uint64_t(CustomCommand::PASS))) { - mi = nullptr; // custom command could change the currently bound MaterialInstance - uint32_t const index = (first->key & CUSTOM_INDEX_MASK) >> CUSTOM_INDEX_SHIFT; - assert_invariant(index < mCustomCommands.size()); - pCustomCommands[index](); - continue; + // Maximum space occupied in the CircularBuffer by a single `Command`. 
This must be + // reevaluated when the inner loop below adds DriverApi commands or when we change the + // CommandStream protocol. Currently, the maximum is 240 bytes, and we use 256 to be on + // the safer side. + size_t const maxCommandSizeInBytes = 256; + + // Number of Commands that can be issued and guaranteed to fit in the current + // CircularBuffer allocation. In practice, we'll have tons of headroom especially if + // skinning and morphing aren't used. With a 2 MiB buffer (the default) a batch is + // 8192 commands (i.e. draw calls). + size_t const batchCommandCount = capacity / maxCommandSizeInBytes; + while(first != last) { + Command const* const batchLast = std::min(first + batchCommandCount, last); + + // actual number of commands we need to write (can be smaller than batchCommandCount) + size_t const commandCount = batchLast - first; + size_t const commandSizeInBytes = commandCount * maxCommandSizeInBytes; + + // check we have enough capacity to write these commandCount commands, if not, + // request a new CircularBuffer allocation of `capacity` bytes. + if (UTILS_UNLIKELY(circularBuffer.getUsed() > capacity - commandSizeInBytes)) { + engine.flush(); // TODO: we should use a "fast" flush if possible } - // primitiveHandle may be invalid if no geometry was set on the renderable. - if (UTILS_UNLIKELY(!first->primitive.primitiveHandle)) { - continue; - } + first--; + while (++first != batchLast) { + assert_invariant(first->key != uint64_t(Pass::SENTINEL)); - // per-renderable uniform - const PrimitiveInfo info = first->primitive; - pipeline.rasterState = info.rasterState; - - if (UTILS_UNLIKELY(mi != info.mi)) { - // this is always taken the first time - mi = info.mi; - ma = mi->getMaterial(); - - auto const& scissor = mi->getScissor(); - if (UTILS_UNLIKELY(mi->hasScissor())) { - // scissor is set, we need to apply the offset/clip - // clang vectorizes this! - constexpr int32_t maxvali = std::numeric_limits::max(); - const backend::Viewport scissorViewport = mScissorViewport; - // compute new left/bottom, assume no overflow - int32_t l = scissor.left + scissorViewport.left; - int32_t b = scissor.bottom + scissorViewport.bottom; - // compute right/top without overflowing, scissor.width/height guaranteed - // to convert to int32 - int32_t r = (l > maxvali - int32_t(scissor.width)) ? - maxvali : l + int32_t(scissor.width); - int32_t t = (b > maxvali - int32_t(scissor.height)) ? - maxvali : b + int32_t(scissor.height); - // clip to the viewport - l = std::max(l, scissorViewport.left); - b = std::max(b, scissorViewport.bottom); - r = std::min(r, scissorViewport.left + int32_t(scissorViewport.width)); - t = std::min(t, scissorViewport.bottom + int32_t(scissorViewport.height)); - assert_invariant(r >= l && t >= b); - *pScissor = { l, b, uint32_t(r - l), uint32_t(t - b) }; - } else { - // no scissor set (common case), 'scissor' has its default value, use that. 
- *pScissor = scissor; + /* + * Be careful when changing code below, this is the hot inner-loop + */ + + if (UTILS_UNLIKELY((first->key & CUSTOM_MASK) != uint64_t(CustomCommand::PASS))) { + mi = nullptr; // custom command could change the currently bound MaterialInstance + uint32_t const index = (first->key & CUSTOM_INDEX_MASK) >> CUSTOM_INDEX_SHIFT; + assert_invariant(index < mCustomCommands.size()); + pCustomCommands[index](); + continue; } - *pPipelinePolygonOffset = mi->getPolygonOffset(); - pipeline.stencilState = mi->getStencilState(); - mi->use(driver); - } + // primitiveHandle may be invalid if no geometry was set on the renderable. + if (UTILS_UNLIKELY(!first->primitive.primitiveHandle)) { + continue; + } - pipeline.program = ma->getProgram(info.materialVariant); + // per-renderable uniform + const PrimitiveInfo info = first->primitive; + pipeline.rasterState = info.rasterState; + + if (UTILS_UNLIKELY(mi != info.mi)) { + // this is always taken the first time + mi = info.mi; + assert_invariant(mi); + + ma = mi->getMaterial(); + + auto const& scissor = mi->getScissor(); + if (UTILS_UNLIKELY(mi->hasScissor())) { + // scissor is set, we need to apply the offset/clip + // clang vectorizes this! + constexpr int32_t maxvali = std::numeric_limits::max(); + const backend::Viewport scissorViewport = mScissorViewport; + // compute new left/bottom, assume no overflow + int32_t l = scissor.left + scissorViewport.left; + int32_t b = scissor.bottom + scissorViewport.bottom; + // compute right/top without overflowing, scissor.width/height guaranteed + // to convert to int32 + int32_t r = (l > maxvali - int32_t(scissor.width)) ? + maxvali : l + int32_t(scissor.width); + int32_t t = (b > maxvali - int32_t(scissor.height)) ? + maxvali : b + int32_t(scissor.height); + // clip to the viewport + l = std::max(l, scissorViewport.left); + b = std::max(b, scissorViewport.bottom); + r = std::min(r, scissorViewport.left + int32_t(scissorViewport.width)); + t = std::min(t, scissorViewport.bottom + int32_t(scissorViewport.height)); + assert_invariant(r >= l && t >= b); + *pScissor = { l, b, uint32_t(r - l), uint32_t(t - b) }; + } else { + // no scissor set (common case), 'scissor' has its default value, use that. + *pScissor = scissor; + } + + *pPipelinePolygonOffset = mi->getPolygonOffset(); + pipeline.stencilState = mi->getStencilState(); + mi->use(driver); + } - uint16_t const instanceCount = info.instanceCount & PrimitiveInfo::INSTANCE_COUNT_MASK; - auto getPerObjectUboHandle = - [this, &info, &instanceCount]() -> std::pair, uint32_t> { - if (info.instanceBufferHandle) { - // "hybrid" instancing -- instanceBufferHandle takes the place of the UBO - return { info.instanceBufferHandle, 0 }; + assert_invariant(ma); + pipeline.program = ma->getProgram(info.materialVariant); + + uint16_t const instanceCount = + info.instanceCount & PrimitiveInfo::INSTANCE_COUNT_MASK; + auto getPerObjectUboHandle = + [this, &info, &instanceCount]() -> std::pair, uint32_t> { + if (info.instanceBufferHandle) { + // "hybrid" instancing -- instanceBufferHandle takes the place of the UBO + return { info.instanceBufferHandle, 0 }; + } + bool const userInstancing = + (info.instanceCount & PrimitiveInfo::USER_INSTANCE_MASK) != 0u; + if (!userInstancing && instanceCount > 1) { + // automatic instancing + return { + mInstancedUboHandle, + info.index * sizeof(PerRenderableData) }; + } else { + // manual instancing + return { mUboHandle, info.index * sizeof(PerRenderableData) }; + } + }; + + // Bind per-renderable uniform block. 
There is no need to attempt to skip this command + // because the backends already do this. + auto const [perObjectUboHandle, offset] = getPerObjectUboHandle(); + assert_invariant(perObjectUboHandle); + driver.bindBufferRange(BufferObjectBinding::UNIFORM, + +UniformBindingPoints::PER_RENDERABLE, + perObjectUboHandle, + offset, + sizeof(PerRenderableUib)); + + if (UTILS_UNLIKELY(info.skinningHandle)) { + // note: we can't bind less than sizeof(PerRenderableBoneUib) due to glsl limitations + driver.bindBufferRange(BufferObjectBinding::UNIFORM, + +UniformBindingPoints::PER_RENDERABLE_BONES, + info.skinningHandle, + info.skinningOffset * sizeof(PerRenderableBoneUib::BoneData), + sizeof(PerRenderableBoneUib)); + // note: always bind the skinningTexture because the shader needs it. + driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_SKINNING, + info.skinningTexture); + // note: even if only skinning is enabled, binding morphTargetBuffer is needed. + driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_MORPHING, + info.morphTargetBuffer); } - bool const userInstancing = - (info.instanceCount & PrimitiveInfo::USER_INSTANCE_MASK) != 0u; - if (!userInstancing && instanceCount > 1) { - // automatic instancing - return { mInstancedUboHandle, info.index * sizeof(PerRenderableData) }; - } else { - // manual instancing - return { mUboHandle, info.index * sizeof(PerRenderableData) }; + + if (UTILS_UNLIKELY(info.morphWeightBuffer)) { + // Instead of using a UBO per primitive, we could also have a single UBO for all + // primitives and use bindUniformBufferRange which might be more efficient. + driver.bindUniformBuffer(+UniformBindingPoints::PER_RENDERABLE_MORPHING, + info.morphWeightBuffer); + driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_MORPHING, + info.morphTargetBuffer); + // note: even if only morphing is enabled, binding skinningTexture is needed. + driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_SKINNING, + info.skinningTexture); } - }; - - // bind per-renderable uniform block. there is no need to attempt to skip this command - // because the backends already do this. - auto const [perObjectUboHandle, offset] = getPerObjectUboHandle(); - assert_invariant(perObjectUboHandle); - driver.bindBufferRange(BufferObjectBinding::UNIFORM, - +UniformBindingPoints::PER_RENDERABLE, - perObjectUboHandle, - offset, - sizeof(PerRenderableUib)); - - if (UTILS_UNLIKELY(info.skinningHandle)) { - // note: we can't bind less than sizeof(PerRenderableBoneUib) due to glsl limitations - driver.bindBufferRange(BufferObjectBinding::UNIFORM, - +UniformBindingPoints::PER_RENDERABLE_BONES, - info.skinningHandle, - info.skinningOffset * sizeof(PerRenderableBoneUib::BoneData), - sizeof(PerRenderableBoneUib)); - // note: always bind the skinningTexture because the shader needs it. - driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_SKINNING, - info.skinningTexture); - // note: even if only skinning is enabled, binding morphTargetBuffer is needed. - driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_MORPHING, - info.morphTargetBuffer); - } - - if (UTILS_UNLIKELY(info.morphWeightBuffer)) { - // Instead of using a UBO per primitive, we could also have a single UBO for all - // primitives and use bindUniformBufferRange which might be more efficient. 
- driver.bindUniformBuffer(+UniformBindingPoints::PER_RENDERABLE_MORPHING, - info.morphWeightBuffer); - driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_MORPHING, - info.morphTargetBuffer); - // note: even if only morphing is enabled, binding skinningTexture is needed. - driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_SKINNING, - info.skinningTexture); + + driver.draw(pipeline, info.primitiveHandle, instanceCount); } + } - driver.draw(pipeline, info.primitiveHandle, instanceCount); + // If the remaining space is less than half the capacity, we flush right away to + // allow some headroom for commands that might come later. + if (UTILS_UNLIKELY(circularBuffer.getUsed() > capacity / 2)) { + engine.flush(); } } diff --git a/filament/src/RenderPass.h b/filament/src/RenderPass.h index 4474079594f..646171efd58 100644 --- a/filament/src/RenderPass.h +++ b/filament/src/RenderPass.h @@ -22,26 +22,38 @@ #include "details/Camera.h" #include "details/Scene.h" -#include "backend/DriverApiForward.h" - -#include +#include "private/filament/Variant.h" +#include "utils/BitmaskEnum.h" #include #include #include #include +#include #include -#include #include +#include + #include #include +#include +#include +#include #include +#include +#include + namespace filament { +namespace backend { +class CommandBufferQueue; +} + class FMaterialInstance; +class RenderPassBuilder; class RenderPass { public: @@ -171,7 +183,7 @@ class RenderPass { EPILOG = uint64_t(0x2) << CUSTOM_SHIFT }; - enum CommandTypeFlags : uint8_t { + enum class CommandTypeFlags : uint32_t { COLOR = 0x1, // generate the color pass only DEPTH = 0x2, // generate the depth pass only ( e.g. shadowmap) @@ -191,7 +203,6 @@ class RenderPass { SCREEN_SPACE_REFLECTIONS = COLOR | FILTER_TRANSLUCENT_OBJECTS }; - /* * The sorting material key is 32 bits and encoded as: * @@ -240,7 +251,6 @@ class RenderPass { uint32_t skinningOffset = 0; // 4 bytes uint16_t instanceCount; // 2 bytes [MSb: user] Variant materialVariant; // 1 byte -// uint8_t reserved[0] = {}; // 0 bytes static const uint16_t USER_INSTANCE_MASK = 0x8000u; static const uint16_t INSTANCE_COUNT_MASK = 0x7fffu; @@ -253,7 +263,7 @@ class RenderPass { uint64_t reserved[1] = {}; // 8 bytes bool operator < (Command const& rhs) const noexcept { return key < rhs.key; } // placement new declared as "throw" to avoid the compiler's null-check - inline void* operator new (std::size_t, void* ptr) { + inline void* operator new (size_t, void* ptr) { assert_invariant(ptr); return ptr; } @@ -269,61 +279,31 @@ class RenderPass { // Arena used for commands using Arena = utils::Arena< - utils::LinearAllocator, // note: can't change this allocator + utils::LinearAllocatorWithFallback, utils::LockingPolicy::NoLock, utils::TrackingPolicy::HighWatermark, utils::AreaPolicy::StaticArea>; - /* - * Create a RenderPass. - * The Arena is used to allocate commands which are then owned by the Arena. - */ - RenderPass(FEngine& engine, Arena& arena) noexcept; + // RenderPass can only be moved + RenderPass(RenderPass&& rhs) = default; - // Copy the RenderPass as is. This can be used to create a RenderPass from a "template" - // by copying from an "empty" RenderPass. 
- RenderPass(RenderPass const& rhs); + // RenderPass can't be copied + RenderPass(RenderPass const& rhs) = delete; + RenderPass& operator=(RenderPass const& rhs) = delete; + RenderPass& operator=(RenderPass&& rhs) = delete; // allocated commands ARE NOT freed, they're owned by the Arena ~RenderPass() noexcept; - // a box that both offsets the viewport and clips it - void setScissorViewport(backend::Viewport viewport) noexcept; - - // specifies the geometry to generate commands for - void setGeometry(FScene::RenderableSoa const& soa, utils::Range vr, - backend::Handle uboHandle) noexcept; - - // specifies camera information (e.g. used for sorting commands) - void setCamera(const CameraInfo& camera) noexcept; - - // flags controlling how commands are generated - void setRenderFlags(RenderFlags flags) noexcept { mFlags = flags; } - RenderFlags getRenderFlags() const noexcept { return mFlags; } - - // variant to use - void setVariant(Variant variant) noexcept { mVariant = variant; } - - // Sets the visibility mask, which is AND-ed against each Renderable's VISIBLE_MASK to determine - // if the renderable is visible for this pass. - // Defaults to all 1's, which means all renderables in this render pass will be rendered. - void setVisibilityMask(FScene::VisibleMaskType mask) noexcept { mVisibilityMask = mask; } - Command const* begin() const noexcept { return mCommandBegin; } Command const* end() const noexcept { return mCommandEnd; } bool empty() const noexcept { return begin() == end(); } - // This is the main function of this class, this appends commands to the pass using - // the current camera, geometry and flags set. This can be called multiple times if needed. - void appendCommands(FEngine& engine, CommandTypeFlags commandTypeFlags) noexcept; - - // sorts and instanceify commands then trims sentinels - void sortCommands(FEngine& engine) noexcept; - // Helper to execute all the commands generated by this RenderPass - void execute(FEngine& engine, const char* name, + static void execute(RenderPass const& pass, + FEngine& engine, const char* name, backend::Handle renderTarget, - backend::RenderPassParams params) const noexcept; + backend::RenderPassParams params) noexcept; /* * Executor holds the range of commands to execute for a given pass @@ -331,6 +311,7 @@ class RenderPass { class Executor { using CustomCommandFn = std::function; friend class RenderPass; + friend class RenderPassBuilder; // these fields are constant after creation utils::Slice mCommands; @@ -346,8 +327,7 @@ class RenderPass { Executor(RenderPass const* pass, Command const* b, Command const* e) noexcept; - void execute(backend::DriverApi& driver, - const Command* first, const Command* last) const noexcept; + void execute(FEngine& engine, const Command* first, const Command* last) const noexcept; public: Executor() = default; @@ -366,37 +346,39 @@ class RenderPass { }; // returns a new executor for this pass - Executor getExecutor() { - return { this, mCommandBegin, mCommandEnd }; - } - Executor getExecutor() const { return { this, mCommandBegin, mCommandEnd }; } - // returns a new executor for this pass with a custom range - Executor getExecutor(Command const* b, Command const* e) { - return { this, b, e }; - } - Executor getExecutor(Command const* b, Command const* e) const { return { this, b, e }; } +private: + friend class FRenderer; + friend class RenderPassBuilder; + RenderPass(FEngine& engine, RenderPassBuilder const& builder) noexcept; + + // This is the main function of this class, this appends commands to the 
pass using + // the current camera, geometry and flags set. This can be called multiple times if needed. + void appendCommands(FEngine& engine, + utils::Slice commands, CommandTypeFlags commandTypeFlags) noexcept; + // Appends a custom command. - void appendCustomCommand(uint8_t channel, Pass pass, CustomCommand custom, uint32_t order, + void appendCustomCommand(Command* commands, + uint8_t channel, Pass pass, CustomCommand custom, uint32_t order, Executor::CustomCommandFn command); + void resize(Arena& arena, size_t count) noexcept; -private: - friend class FRenderer; + // sorts commands then trims sentinels + void sortCommands(Arena& arena) noexcept; - Command* append(size_t count) noexcept; - void resize(size_t count) noexcept; - void instanceify(FEngine& engine) noexcept; + // instanceify commands then trims sentinels + void instanceify(FEngine& engine, Arena& arena) noexcept; - // we choose the command count per job to minimize JobSystem overhead. - // on a Pixel 4, 2048 commands is about half a millisecond of processing. + // We choose the command count per job to minimize JobSystem overhead. + // On a Pixel 4, 2048 commands is about half a millisecond of processing. static constexpr size_t JOBS_PARALLEL_FOR_COMMANDS_COUNT = 2048; static constexpr size_t JOBS_PARALLEL_FOR_COMMANDS_SIZE = sizeof(Command) * JOBS_PARALLEL_FOR_COMMANDS_COUNT; @@ -404,15 +386,15 @@ class RenderPass { static_assert(JOBS_PARALLEL_FOR_COMMANDS_SIZE % utils::CACHELINE_SIZE == 0, "Size of Commands jobs must be multiple of a cache-line size"); - static inline void generateCommands(uint32_t commandTypeFlags, Command* commands, + static inline void generateCommands(CommandTypeFlags commandTypeFlags, Command* commands, FScene::RenderableSoa const& soa, utils::Range range, Variant variant, RenderFlags renderFlags, FScene::VisibleMaskType visibilityMask, math::float3 cameraPosition, math::float3 cameraForward, uint8_t instancedStereoEyeCount) noexcept; - template - static inline Command* generateCommandsImpl(uint32_t extraFlags, Command* curr, + template + static inline Command* generateCommandsImpl(RenderPass::CommandTypeFlags extraFlags, Command* curr, FScene::RenderableSoa const& soa, utils::Range range, Variant variant, RenderFlags renderFlags, FScene::VisibleMaskType visibilityMask, math::float3 cameraPosition, math::float3 cameraForward, @@ -424,50 +406,128 @@ class RenderPass { static void updateSummedPrimitiveCounts( FScene::RenderableSoa& renderableData, utils::Range vr) noexcept; - // a reference to the Engine, mostly to get to things like JobSystem - // Arena where all Commands are allocated. The Arena owns the commands. 
- Arena& mCommandArena; + FScene::RenderableSoa const& mRenderableSoa; + utils::Range const mVisibleRenderables; + backend::Handle const mUboHandle; + math::float3 const mCameraPosition; + math::float3 const mCameraForwardVector; + RenderFlags const mFlags; + Variant const mVariant; + FScene::VisibleMaskType const mVisibilityMask; + backend::Viewport const mScissorViewport; // Pointer to the first command Command* mCommandBegin = nullptr; - // Pointer to one past the last command Command* mCommandEnd = nullptr; + // a UBO for instanced primitives + backend::Handle mInstancedUboHandle; + // a vector for our custom commands + using CustomCommandVector = std::vector>; + mutable CustomCommandVector mCustomCommands; +}; - // the SOA containing the renderables we're interested in - FScene::RenderableSoa const* mRenderableSoa = nullptr; +class RenderPassBuilder { + friend class RenderPass; - // The range of visible renderables in the SOA above + RenderPass::Arena& mArena; + RenderPass::CommandTypeFlags mCommandTypeFlags{}; + backend::Viewport mScissorViewport{ 0, 0, INT32_MAX, INT32_MAX }; + FScene::RenderableSoa const* mRenderableSoa = nullptr; utils::Range mVisibleRenderables{}; - - // the UBO containing the data for the renderables backend::Handle mUboHandle; - backend::Handle mInstancedUboHandle; - - // info about the camera math::float3 mCameraPosition{}; math::float3 mCameraForwardVector{}; + RenderPass::RenderFlags mFlags{}; + Variant mVariant{}; + FScene::VisibleMaskType mVisibilityMask = std::numeric_limits::max(); - // info about the scene features (e.g.: has shadows, lighting, etc...) - RenderFlags mFlags{}; + using CustomCommandRecord = std::tuple< + uint8_t, + RenderPass::Pass, + RenderPass::CustomCommand, + uint32_t, + RenderPass::Executor::CustomCommandFn>; - // Variant to use - Variant mVariant{}; + using CustomCommandContainer = std::vector>; - // Additional visibility mask - FScene::VisibleMaskType mVisibilityMask = std::numeric_limits::max(); + // we make this optional because it's not used often, and we don't want to have + // to construct it by default. + std::optional mCustomCommands; - backend::Viewport mScissorViewport{ 0, 0, - std::numeric_limits::max(), - std::numeric_limits::max() }; +public: + explicit RenderPassBuilder(RenderPass::Arena& arena) : mArena(arena) { } - // a vector for our custom commands - using CustomCommandVector = std::vector>; - mutable CustomCommandVector mCustomCommands; + RenderPassBuilder& commandTypeFlags(RenderPass::CommandTypeFlags commandTypeFlags) noexcept { + mCommandTypeFlags = commandTypeFlags; + return *this; + } + + RenderPassBuilder& scissorViewport(backend::Viewport viewport) noexcept { + mScissorViewport = viewport; + return *this; + } + + // specifies the geometry to generate commands for + RenderPassBuilder& geometry(FScene::RenderableSoa const& soa, utils::Range vr, + backend::Handle uboHandle) noexcept { + mRenderableSoa = &soa; + mVisibleRenderables = vr; + mUboHandle = uboHandle; + return *this; + } + + // Specifies camera information (e.g. 
used for sorting commands) + RenderPassBuilder& camera(const CameraInfo& camera) noexcept { + mCameraPosition = camera.getPosition(); + mCameraForwardVector = camera.getForwardVector(); + return *this; + } + + // flags controlling how commands are generated + RenderPassBuilder& renderFlags(RenderPass::RenderFlags flags) noexcept { + mFlags = flags; + return *this; + } + + // like above but allows to set specific flags + RenderPassBuilder& renderFlags( + RenderPass::RenderFlags mask, RenderPass::RenderFlags value) noexcept { + mFlags = (mFlags & mask) | (value & mask); + return *this; + } + + // variant to use + RenderPassBuilder& variant(Variant variant) noexcept { + mVariant = variant; + return *this; + } + + // Sets the visibility mask, which is AND-ed against each Renderable's VISIBLE_MASK to + // determine if the renderable is visible for this pass. + // Defaults to all 1's, which means all renderables in this render pass will be rendered. + RenderPassBuilder& visibilityMask(FScene::VisibleMaskType mask) noexcept { + mVisibilityMask = mask; + return *this; + } + + RenderPassBuilder& customCommand(FEngine& engine, + uint8_t channel, + RenderPass::Pass pass, + RenderPass::CustomCommand custom, + uint32_t order, + const RenderPass::Executor::CustomCommandFn& command); + + RenderPass build(FEngine& engine); }; + } // namespace filament +template<> struct utils::EnableBitMaskOperators + : public std::true_type {}; + #endif // TNT_FILAMENT_RENDERPASS_H diff --git a/filament/src/RendererUtils.cpp b/filament/src/RendererUtils.cpp index 2707b9201be..a26b9b7b53e 100644 --- a/filament/src/RendererUtils.cpp +++ b/filament/src/RendererUtils.cpp @@ -228,10 +228,6 @@ FrameGraphId RendererUtils::colorPass( out.params.subpassMask = 1; } - // this is a good time to flush the CommandStream, because we're about to potentially - // output a lot of commands. This guarantees here that we have at least - // FILAMENT_MIN_COMMAND_BUFFERS_SIZE_IN_MB bytes (1MiB by default). 
- engine.flush(); driver.beginRenderPass(out.target, out.params); passExecutor.execute(engine, resources.getPassName()); driver.endRenderPass(); diff --git a/filament/src/ShadowMapManager.cpp b/filament/src/ShadowMapManager.cpp index 9fa72d54b8b..ebf88427774 100644 --- a/filament/src/ShadowMapManager.cpp +++ b/filament/src/ShadowMapManager.cpp @@ -19,14 +19,21 @@ #include "RenderPass.h" #include "ShadowMap.h" +#include "details/DebugRegistry.h" #include "details/Texture.h" #include "details/View.h" #include +#include + +#include #include #include +#include +#include + namespace filament { using namespace backend; @@ -128,7 +135,8 @@ void ShadowMapManager::addShadowMap(size_t lightIndex, bool spotlight, } FrameGraphId ShadowMapManager::render(FEngine& engine, FrameGraph& fg, - RenderPass const& pass, FView& view, CameraInfo const& mainCameraInfo, + RenderPassBuilder const& passBuilder, + FView& view, CameraInfo const& mainCameraInfo, float4 const& userTime) noexcept { const float moment2 = std::numeric_limits::max(); @@ -206,8 +214,8 @@ FrameGraphId ShadowMapManager::render(FEngine& engine, FrameG builder.sideEffect(); }, [this, &engine, &view, vsmShadowOptions, - scene, mainCameraInfo, userTime, passTemplate = pass]( - FrameGraphResources const&, auto const& data, DriverApi& driver) { + scene, mainCameraInfo, userTime, passBuilder = passBuilder]( + FrameGraphResources const&, auto const& data, DriverApi& driver) mutable { // Note: we could almost parallel_for the loop below, the problem currently is // that updatePrimitivesLod() updates temporary global state. @@ -262,19 +270,20 @@ FrameGraphId ShadowMapManager::render(FEngine& engine, FrameG cameraInfo, scene->getRenderableData(), entry.range); // generate and sort the commands for rendering the shadow map - RenderPass pass(passTemplate); - pass.setCamera(cameraInfo); - pass.setVisibilityMask(entry.visibilityMask); - pass.setGeometry(scene->getRenderableData(), - entry.range, scene->getRenderableUBO()); - pass.appendCommands(engine, RenderPass::SHADOW); - pass.sortCommands(engine); + + RenderPass const pass = passBuilder + .camera(cameraInfo) + .visibilityMask(entry.visibilityMask) + .geometry(scene->getRenderableData(), + entry.range, scene->getRenderableUBO()) + .commandTypeFlags(RenderPass::CommandTypeFlags::SHADOW) + .build(engine); entry.executor = pass.getExecutor(); if (!view.hasVSM()) { auto const* options = shadowMap.getShadowOptions(); - const PolygonOffset polygonOffset = { // handle reversed Z + PolygonOffset const polygonOffset = { // handle reversed Z .slope = -options->polygonOffsetSlope, .constant = -options->polygonOffsetConstant }; @@ -395,7 +404,6 @@ FrameGraphId ShadowMapManager::render(FEngine& engine, FrameG auto rt = resources.getRenderPassInfo(data.rt); - engine.flush(); driver.beginRenderPass(rt.target, rt.params); entry.shadowMap->bind(driver); entry.executor.overrideScissor(entry.shadowMap->getScissor()); diff --git a/filament/src/ShadowMapManager.h b/filament/src/ShadowMapManager.h index 5c38048ebed..6a6f752e42a 100644 --- a/filament/src/ShadowMapManager.h +++ b/filament/src/ShadowMapManager.h @@ -43,6 +43,7 @@ namespace filament { class FView; class FrameGraph; class RenderPass; +class RenderPassBuilder; struct ShadowMappingUniforms { math::float4 cascadeSplits; @@ -86,7 +87,8 @@ class ShadowMapManager { FScene::RenderableSoa& renderableData, FScene::LightSoa const& lightData) noexcept; // Renders all the shadow maps. 
- FrameGraphId render(FEngine& engine, FrameGraph& fg, RenderPass const& pass, + FrameGraphId render(FEngine& engine, FrameGraph& fg, + RenderPassBuilder const& passBuilder, FView& view, CameraInfo const& mainCameraInfo, math::float4 const& userTime) noexcept; // valid after calling update() above diff --git a/filament/src/details/Engine.cpp b/filament/src/details/Engine.cpp index c09711afa19..82b9399ec6b 100644 --- a/filament/src/details/Engine.cpp +++ b/filament/src/details/Engine.cpp @@ -198,7 +198,7 @@ FEngine::FEngine(Engine::Builder const& builder) : mCommandBufferQueue( builder->mConfig.minCommandBufferSizeMB * MiB, builder->mConfig.commandBufferSizeMB * MiB), - mPerRenderPassAllocator( + mPerRenderPassArena( "FEngine::mPerRenderPassAllocator", builder->mConfig.perRenderPassArenaSizeMB * MiB), mHeapAllocator("FEngine::mHeapAllocator", AreaPolicy::NullArea{}), diff --git a/filament/src/details/Engine.h b/filament/src/details/Engine.h index 03889f13936..03de2ce8414 100644 --- a/filament/src/details/Engine.h +++ b/filament/src/details/Engine.h @@ -58,17 +58,6 @@ #include #include -#if FILAMENT_ENABLE_MATDBG -#include -#else -namespace filament { -namespace matdbg { -class DebugServer; -using MaterialKey = uint32_t; -} // namespace matdbg -} // namespace filament -#endif - #include #include #include @@ -78,8 +67,19 @@ using MaterialKey = uint32_t; #include #include #include +#include +#include #include +#if FILAMENT_ENABLE_MATDBG +#include +#else +namespace filament::matdbg { +class DebugServer; +using MaterialKey = uint32_t; +} // namespace filament::matdbg +#endif + namespace filament { class Renderer; @@ -142,7 +142,7 @@ class FEngine : public Engine { // the per-frame Area is used by all Renderer, so they must run in sequence and // have freed all allocated memory when done. If this needs to change in the future, // we'll simply have to use separate Areas (for instance). - LinearAllocatorArena& getPerRenderPassAllocator() noexcept { return mPerRenderPassAllocator; } + LinearAllocatorArena& getPerRenderPassArena() noexcept { return mPerRenderPassArena; } // Material IDs... 
uint32_t getMaterialId() const noexcept { return mMaterialId++; } @@ -508,7 +508,7 @@ class FEngine : public Engine { uint32_t mFlushCounter = 0; - LinearAllocatorArena mPerRenderPassAllocator; + RootArenaScope::Arena mPerRenderPassArena; HeapAllocatorArena mHeapAllocator; utils::JobSystem mJobSystem; diff --git a/filament/src/details/Renderer.cpp b/filament/src/details/Renderer.cpp index 6611a48a832..c014be6eb66 100644 --- a/filament/src/details/Renderer.cpp +++ b/filament/src/details/Renderer.cpp @@ -16,6 +16,9 @@ #include "details/Renderer.h" +#include "Allocators.h" +#include "DebugRegistry.h" +#include "FrameHistory.h" #include "PostProcessManager.h" #include "RendererUtils.h" #include "RenderPass.h" @@ -28,21 +31,40 @@ #include "details/Texture.h" #include "details/View.h" +#include +#include +#include #include +#include +#include +#include #include #include "fg/FrameGraph.h" #include "fg/FrameGraphId.h" #include "fg/FrameGraphResources.h" +#include "fg/FrameGraphTexture.h" + +#include +#include +#include #include #include +#include +#include #include #include -#include #include +#include +#include +#include + +#include +#include + // this helps visualize what dynamic-scaling is doing #define DEBUG_DYNAMIC_SCALING false @@ -62,8 +84,7 @@ FRenderer::FRenderer(FEngine& engine) : mHdrQualityMedium(TextureFormat::R11F_G11F_B10F), mHdrQualityHigh(TextureFormat::RGB16F), mIsRGB8Supported(false), - mUserEpoch(engine.getEngineEpoch()), - mPerRenderPassArena(engine.getPerRenderPassAllocator()) + mUserEpoch(engine.getEngineEpoch()) { FDebugRegistry& debugRegistry = engine.getDebugRegistry(); debugRegistry.registerProperty("d.renderer.doFrameCapture", @@ -442,7 +463,7 @@ void FRenderer::render(FView const* view) { if (UTILS_LIKELY(view && view->getScene())) { if (mViewRenderedCount) { - // this is a good place to kick the GPU, since we've rendered a View before, + // This is a good place to kick the GPU, since we've rendered a View before, // and we're about to render another one. 
mEngine.getDriverApi().flush(); } @@ -452,17 +473,17 @@ void FRenderer::render(FView const* view) { } void FRenderer::renderInternal(FView const* view) { - // per-renderpass data - ArenaScope rootArena(mPerRenderPassArena); - FEngine& engine = mEngine; - JobSystem& js = engine.getJobSystem(); + + // per-renderpass data + RootArenaScope rootArenaScope(engine.getPerRenderPassArena()); // create a root job so no other job can escape + JobSystem& js = engine.getJobSystem(); auto *rootJob = js.setRootJob(js.createJob()); // execute the render pass - renderJob(rootArena, const_cast(*view)); + renderJob(rootArenaScope, const_cast(*view)); // make sure to flush the command buffer engine.flush(); @@ -471,7 +492,7 @@ void FRenderer::renderInternal(FView const* view) { js.runAndWait(rootJob); } -void FRenderer::renderJob(ArenaScope& arena, FView& view) { +void FRenderer::renderJob(RootArenaScope& rootArenaScope, FView& view) { FEngine& engine = mEngine; JobSystem& js = engine.getJobSystem(); FEngine::DriverApi& driver = engine.getDriverApi(); @@ -636,7 +657,7 @@ void FRenderer::renderJob(ArenaScope& arena, FView& view) { xvp.bottom = int32_t(guardBand); } - view.prepare(engine, driver, arena, svp, cameraInfo, getShaderUserTime(), needsAlphaChannel); + view.prepare(engine, driver, rootArenaScope, svp, cameraInfo, getShaderUserTime(), needsAlphaChannel); view.prepareUpscaler(scale, taaOptions, dsrOptions); @@ -649,8 +670,10 @@ void FRenderer::renderJob(ArenaScope& arena, FView& view) { // Allocate some space for our commands in the per-frame Arena, and use that space as // an Arena for commands. All this space is released when we exit this method. size_t const perFrameCommandsSize = engine.getPerFrameCommandsSize(); - void* const arenaBegin = arena.allocate(perFrameCommandsSize, CACHELINE_SIZE); + void* const arenaBegin = rootArenaScope.allocate(perFrameCommandsSize, CACHELINE_SIZE); void* const arenaEnd = pointermath::add(arenaBegin, perFrameCommandsSize); + + // This arena *must* stay valid until all commands have been processed RenderPass::Arena commandArena("Command Arena", { arenaBegin, arenaEnd }); RenderPass::RenderFlags renderFlags = 0; @@ -658,8 +681,8 @@ void FRenderer::renderJob(ArenaScope& arena, FView& view) { if (view.isFrontFaceWindingInverted()) renderFlags |= RenderPass::HAS_INVERSE_FRONT_FACES; if (view.hasInstancedStereo()) renderFlags |= RenderPass::IS_STEREOSCOPIC; - RenderPass pass(engine, commandArena); - pass.setRenderFlags(renderFlags); + RenderPassBuilder passBuilder(commandArena); + passBuilder.renderFlags(renderFlags); Variant variant; variant.setDirectionalLighting(view.hasDirectionalLight()); @@ -682,10 +705,10 @@ void FRenderer::renderJob(ArenaScope& arena, FView& view) { if (view.needsShadowMap()) { Variant shadowVariant(Variant::DEPTH_VARIANT); shadowVariant.setVsm(view.getShadowType() == ShadowType::VSM); - - RenderPass shadowPass(pass); - shadowPass.setVariant(shadowVariant); - auto shadows = view.renderShadowMaps(engine, fg, cameraInfo, mShaderUserTime, shadowPass); + auto shadows = view.renderShadowMaps(engine, fg, cameraInfo, mShaderUserTime, + RenderPassBuilder{ commandArena } + .renderFlags(renderFlags) + .variant(shadowVariant)); blackboard["shadows"] = shadows; } @@ -771,8 +794,9 @@ void FRenderer::renderJob(ArenaScope& arena, FView& view) { view.updatePrimitivesLod(engine, cameraInfo, scene.getRenderableData(), view.getVisibleRenderables()); - pass.setCamera(cameraInfo); - pass.setGeometry(scene.getRenderableData(), view.getVisibleRenderables(), 
scene.getRenderableUBO()); + passBuilder.camera(cameraInfo); + passBuilder.geometry(scene.getRenderableData(), + view.getVisibleRenderables(), scene.getRenderableUBO()); // view set-ups that need to happen before rendering fg.addTrivialSideEffectPass("Prepare View Uniforms", @@ -818,7 +842,8 @@ void FRenderer::renderJob(ArenaScope& arena, FView& view) { // This is normally used by SSAO and contact-shadows // TODO: the scaling should depends on all passes that need the structure pass - const auto [structure, picking_] = ppm.structure(fg, pass, renderFlags, svp.width, svp.height, { + const auto [structure, picking_] = ppm.structure(fg, + passBuilder, renderFlags, svp.width, svp.height, { .scale = aoOptions.resolution, .picking = view.hasPicking() }); @@ -876,7 +901,7 @@ void FRenderer::renderJob(ArenaScope& arena, FView& view) { // screen-space reflections pass if (ssReflectionsOptions.enabled) { - auto reflections = ppm.ssr(fg, pass, + auto reflections = ppm.ssr(fg, passBuilder, view.getFrameHistory(), cameraInfo, view.getPerViewUniforms(), structure, @@ -894,10 +919,15 @@ void FRenderer::renderJob(ArenaScope& arena, FView& view) { // -------------------------------------------------------------------------------------------- // Color passes + // this makes the viewport relative to xvp + // FIXME: we should use 'vp' when rendering directly into the swapchain, but that's hard to + // know at this point. This will usually be the case when post-process is disabled. + // FIXME: we probably should take the dynamic scaling into account too + passBuilder.scissorViewport(hasPostProcess ? xvp : vp); + // This one doesn't need to be a FrameGraph pass because it always happens by construction // (i.e. it won't be culled, unless everything is culled), so no need to complexify things. - pass.setVariant(variant); - pass.appendCommands(engine, RenderPass::COLOR); + passBuilder.variant(variant); // color-grading as subpass is done either by the color pass or the TAA pass if any auto colorGradingConfigForColor = colorGradingConfig; @@ -905,7 +935,7 @@ void FRenderer::renderJob(ArenaScope& arena, FView& view) { if (colorGradingConfigForColor.asSubpass) { // append color grading subpass after all other passes - pass.appendCustomCommand(3, + passBuilder.customCommand(engine, 3, RenderPass::Pass::BLENDED, RenderPass::CustomCommand::EPILOG, 0, [&ppm, &driver, colorGradingConfigForColor]() { @@ -913,7 +943,7 @@ void FRenderer::renderJob(ArenaScope& arena, FView& view) { }); } else if (colorGradingConfig.customResolve) { // append custom resolve subpass after all other passes - pass.appendCustomCommand(3, + passBuilder.customCommand(engine, 3, RenderPass::Pass::BLENDED, RenderPass::CustomCommand::EPILOG, 0, [&ppm, &driver]() { @@ -921,16 +951,9 @@ void FRenderer::renderJob(ArenaScope& arena, FView& view) { }); } - // sort commands once we're done adding commands - pass.sortCommands(engine); - - - // this makes the viewport relative to xvp - // FIXME: we should use 'vp' when rendering directly into the swapchain, but that's hard to - // know at this point. This will usually be the case when post-process is disabled. - // FIXME: we probably should take the dynamic scaling into account too - pass.setScissorViewport(hasPostProcess ? 
xvp : vp); + passBuilder.commandTypeFlags(RenderPass::CommandTypeFlags::COLOR); + RenderPass const pass{ passBuilder.build(engine) }; FrameGraphTexture::Descriptor const desc = { .width = config.physicalViewport.width, diff --git a/filament/src/details/Renderer.h b/filament/src/details/Renderer.h index 2d08b6cfe0d..056d5599770 100644 --- a/filament/src/details/Renderer.h +++ b/filament/src/details/Renderer.h @@ -163,7 +163,7 @@ class FRenderer : public Renderer { } void renderInternal(FView const* view); - void renderJob(ArenaScope& arena, FView& view); + void renderJob(RootArenaScope& rootArenaScope, FView& view); // keep a reference to our engine FEngine& mEngine; @@ -187,9 +187,6 @@ class FRenderer : public Renderer { backend::TargetBufferFlags mClearFlags{}; tsl::robin_set mPreviousRenderTargets; std::function mBeginFrameInternal; - - // per-frame arena for this Renderer - LinearAllocatorArena& mPerRenderPassArena; }; FILAMENT_DOWNCAST(Renderer) diff --git a/filament/src/details/Scene.cpp b/filament/src/details/Scene.cpp index 942fb1e24fc..21840fa60cf 100644 --- a/filament/src/details/Scene.cpp +++ b/filament/src/details/Scene.cpp @@ -53,7 +53,7 @@ FScene::~FScene() noexcept = default; void FScene::prepare(utils::JobSystem& js, - LinearAllocatorArena& allocator, + RootArenaScope& rootArenaScope, mat4 const& worldTransform, bool shadowReceiversAreCasters) noexcept { // TODO: can we skip this in most cases? Since we rely on indices staying the same, @@ -64,7 +64,7 @@ void FScene::prepare(utils::JobSystem& js, SYSTRACE_CONTEXT(); // This will reset the allocator upon exiting - ArenaScope const arena(allocator); + ArenaScope localArenaScope(rootArenaScope.getArena()); FEngine& engine = mEngine; EntityManager const& em = engine.getEntityManager(); @@ -85,10 +85,10 @@ void FScene::prepare(utils::JobSystem& js, utils::STLAllocator< LightContainerData, LinearAllocatorArena >, false>; RenderableInstanceContainer renderableInstances{ - RenderableInstanceContainer::with_capacity(entities.size(), allocator) }; + RenderableInstanceContainer::with_capacity(entities.size(), localArenaScope.getArena()) }; LightInstanceContainer lightInstances{ - LightInstanceContainer::with_capacity(entities.size(), allocator) }; + LightInstanceContainer::with_capacity(entities.size(), localArenaScope.getArena()) }; SYSTRACE_NAME_BEGIN("InstanceLoop"); @@ -454,7 +454,7 @@ void FScene::terminate(FEngine&) { mRenderableViewUbh.clear(); } -void FScene::prepareDynamicLights(const CameraInfo& camera, ArenaScope&, +void FScene::prepareDynamicLights(const CameraInfo& camera, Handle lightUbh) noexcept { FEngine::DriverApi& driver = mEngine.getDriverApi(); FLightManager const& lcm = mEngine.getLightManager(); diff --git a/filament/src/details/Scene.h b/filament/src/details/Scene.h index 1882bb4dc30..490d115af3c 100644 --- a/filament/src/details/Scene.h +++ b/filament/src/details/Scene.h @@ -31,6 +31,8 @@ #include #include +#include + #include #include #include @@ -70,12 +72,12 @@ class FScene : public Scene { ~FScene() noexcept; void terminate(FEngine& engine); - void prepare(utils::JobSystem& js, LinearAllocatorArena& allocator, + void prepare(utils::JobSystem& js, RootArenaScope& rootArenaScope, math::mat4 const& worldTransform, bool shadowReceiversAreCasters) noexcept; void prepareVisibleRenderables(utils::Range visibleRenderables) noexcept; - void prepareDynamicLights(const CameraInfo& camera, ArenaScope& arena, + void prepareDynamicLights(const CameraInfo& camera, backend::Handle lightUbh) noexcept; backend::Handle 
getRenderableUBO() const noexcept { diff --git a/filament/src/details/View.cpp b/filament/src/details/View.cpp index 7cc641de093..c3ddfe63678 100644 --- a/filament/src/details/View.cpp +++ b/filament/src/details/View.cpp @@ -341,8 +341,7 @@ void FView::prepareShadowing(FEngine& engine, FScene::RenderableSoa& renderableD mNeedsShadowMap = any(shadowTechnique & ShadowMapManager::ShadowTechnique::SHADOW_MAP); } -void FView::prepareLighting(FEngine& engine, ArenaScope& arena, - CameraInfo const& cameraInfo) noexcept { +void FView::prepareLighting(FEngine& engine, CameraInfo const& cameraInfo) noexcept { SYSTRACE_CALL(); SYSTRACE_CONTEXT(); @@ -354,7 +353,7 @@ void FView::prepareLighting(FEngine& engine, ArenaScope& arena, */ if (hasDynamicLighting()) { - scene->prepareDynamicLights(cameraInfo, arena, mLightUbh); + scene->prepareDynamicLights(cameraInfo, mLightUbh); } // here the array of visible lights has been shrunk to CONFIG_MAX_LIGHT_COUNT @@ -427,7 +426,7 @@ CameraInfo FView::computeCameraInfo(FEngine& engine) const noexcept { return { *camera, mat4{ rotation } * mat4::translation(translation) }; } -void FView::prepare(FEngine& engine, DriverApi& driver, ArenaScope& arena, +void FView::prepare(FEngine& engine, DriverApi& driver, RootArenaScope& rootArenaScope, filament::Viewport viewport, CameraInfo cameraInfo, float4 const& userTime, bool needsAlphaChannel) noexcept { @@ -465,7 +464,7 @@ void FView::prepare(FEngine& engine, DriverApi& driver, ArenaScope& arena, * Gather all information needed to render this scene. Apply the world origin to all * objects in the scene. */ - scene->prepare(js, arena.getAllocator(), + scene->prepare(js, rootArenaScope, cameraInfo.worldTransform, hasVSM()); @@ -475,14 +474,22 @@ void FView::prepare(FEngine& engine, DriverApi& driver, ArenaScope& arena, JobSystem::Job* froxelizeLightsJob = nullptr; JobSystem::Job* prepareVisibleLightsJob = nullptr; - if (scene->getLightData().size() > FScene::DIRECTIONAL_LIGHTS_COUNT) { + size_t const lightCount = scene->getLightData().size(); + if (lightCount > FScene::DIRECTIONAL_LIGHTS_COUNT) { // create and start the prepareVisibleLights job // note: this job updates LightData (non const) + // allocate a scratch buffer for distances outside the job below, so we don't need + // to use a locked allocator; the downside is that we need to account for the worst case. 
+ size_t const positionalLightCount = lightCount - FScene::DIRECTIONAL_LIGHTS_COUNT; + float* const distances = rootArenaScope.allocate( + (positionalLightCount + 3u) & ~3u, CACHELINE_SIZE); + prepareVisibleLightsJob = js.runAndRetain(js.createJob(nullptr, - [&engine, &arena, &viewMatrix = cameraInfo.view, &cullingFrustum, + [&engine, distances, positionalLightCount, &viewMatrix = cameraInfo.view, &cullingFrustum, &lightData = scene->getLightData()] (JobSystem&, JobSystem::Job*) { - FView::prepareVisibleLights(engine.getLightManager(), arena, + FView::prepareVisibleLights(engine.getLightManager(), + { distances, distances + positionalLightCount }, viewMatrix, cullingFrustum, lightData); })); } @@ -530,7 +537,7 @@ void FView::prepare(FEngine& engine, DriverApi& driver, ArenaScope& arena, // As soon as prepareVisibleLight finishes, we can kick-off the froxelization if (hasDynamicLighting()) { auto& froxelizer = mFroxelizer; - if (froxelizer.prepare(driver, arena, viewport, + if (froxelizer.prepare(driver, rootArenaScope, viewport, cameraInfo.projection, cameraInfo.zn, cameraInfo.zf)) { // TODO: might be more consistent to do this in prepareLighting(), but it's not // strictly necessary @@ -645,7 +652,7 @@ void FView::prepare(FEngine& engine, DriverApi& driver, ArenaScope& arena, * Relies on FScene::prepare() and prepareVisibleLights() */ - prepareLighting(engine, arena, cameraInfo); + prepareLighting(engine, cameraInfo); /* * Update driver state @@ -850,7 +857,8 @@ void FView::cullRenderables(JobSystem&, functor(0, renderableData.size()); } -void FView::prepareVisibleLights(FLightManager const& lcm, ArenaScope& rootArena, +void FView::prepareVisibleLights(FLightManager const& lcm, + utils::Slice scratch, mat4f const& viewMatrix, Frustum const& frustum, FScene::LightSoa& lightData) noexcept { SYSTRACE_CALL(); @@ -918,28 +926,25 @@ void FView::prepareVisibleLights(FLightManager const& lcm, ArenaScope& rootArena * - This helps our limited numbers of spot-shadow as well. 
*/ - ArenaScope arena(rootArena.getAllocator()); - size_t const size = visibleLightCount; // number of point/spotlights - size_t const positionalLightCount = size - FScene::DIRECTIONAL_LIGHTS_COUNT; + size_t const positionalLightCount = visibleLightCount - FScene::DIRECTIONAL_LIGHTS_COUNT; if (positionalLightCount) { - // always allocate at least 4 entries, because the vectorized loops below rely on that - float* const UTILS_RESTRICT distances = - arena.allocate((size + 3u) & ~3u, CACHELINE_SIZE); - + assert_invariant(positionalLightCount <= scratch.size()); // pre-compute the lights' distance to the camera, for sorting below // - we don't skip the directional light, because we don't care, it's ignored during sorting + float* const UTILS_RESTRICT distances = scratch.data(); float4 const* const UTILS_RESTRICT spheres = lightData.data(); - computeLightCameraDistances(distances, viewMatrix, spheres, size); + computeLightCameraDistances(distances, viewMatrix, spheres, visibleLightCount); // skip directional light Zip2Iterator b = { lightData.begin(), distances }; - std::sort(b + FScene::DIRECTIONAL_LIGHTS_COUNT, b + size, + std::sort(b + FScene::DIRECTIONAL_LIGHTS_COUNT, b + visibleLightCount, [](auto const& lhs, auto const& rhs) { return lhs.second < rhs.second; }); } // drop excess lights - lightData.resize(std::min(size, CONFIG_MAX_LIGHT_COUNT + FScene::DIRECTIONAL_LIGHTS_COUNT)); + lightData.resize(std::min(visibleLightCount, + CONFIG_MAX_LIGHT_COUNT + FScene::DIRECTIONAL_LIGHTS_COUNT)); } // These methods need to exist so clang honors the __restrict__ keyword, which in turn @@ -972,8 +977,9 @@ void FView::updatePrimitivesLod(FEngine& engine, const CameraInfo&, } FrameGraphId FView::renderShadowMaps(FEngine& engine, FrameGraph& fg, - CameraInfo const& cameraInfo, float4 const& userTime, RenderPass const& pass) noexcept { - return mShadowMapManager.render(engine, fg, pass, *this, cameraInfo, userTime); + CameraInfo const& cameraInfo, float4 const& userTime, + RenderPassBuilder const& passBuilder) noexcept { + return mShadowMapManager.render(engine, fg, passBuilder, *this, cameraInfo, userTime); } void FView::commitFrameHistory(FEngine& engine) noexcept { diff --git a/filament/src/details/View.h b/filament/src/details/View.h index 204f11b9d56..d3b2c59fa73 100644 --- a/filament/src/details/View.h +++ b/filament/src/details/View.h @@ -88,7 +88,7 @@ class FView : public View { // note: viewport/cameraInfo are passed by value to make it clear that prepare cannot // keep references on them that would outlive the scope of prepare() (e.g. with JobSystem). 
- void prepare(FEngine& engine, backend::DriverApi& driver, ArenaScope& arena, + void prepare(FEngine& engine, backend::DriverApi& driver, RootArenaScope& rootArenaScope, filament::Viewport viewport, CameraInfo cameraInfo, math::float4 const& userTime, bool needsAlphaChannel) noexcept; @@ -144,7 +144,7 @@ class FView : public View { void prepareShadowing(FEngine& engine, FScene::RenderableSoa& renderableData, FScene::LightSoa const& lightData, CameraInfo const& cameraInfo) noexcept; - void prepareLighting(FEngine& engine, ArenaScope& arena, CameraInfo const& cameraInfo) noexcept; + void prepareLighting(FEngine& engine, CameraInfo const& cameraInfo) noexcept; void prepareSSAO(backend::Handle ssao) const noexcept; void prepareSSR(backend::Handle ssr, bool disableSSR, @@ -176,7 +176,7 @@ class FView : public View { FrameGraphId renderShadowMaps(FEngine& engine, FrameGraph& fg, CameraInfo const& cameraInfo, math::float4 const& userTime, - RenderPass const& pass) noexcept; + RenderPassBuilder const& passBuilder) noexcept; void updatePrimitivesLod( FEngine& engine, const CameraInfo& camera, @@ -460,7 +460,8 @@ class FView : public View { void prepareVisibleRenderables(utils::JobSystem& js, Frustum const& frustum, FScene::RenderableSoa& renderableData) const noexcept; - static void prepareVisibleLights(FLightManager const& lcm, ArenaScope& rootArena, + static void prepareVisibleLights(FLightManager const& lcm, + utils::Slice scratch, math::mat4f const& viewMatrix, Frustum const& frustum, FScene::LightSoa& lightData) noexcept; diff --git a/libs/utils/include/utils/Allocator.h b/libs/utils/include/utils/Allocator.h index c726ac5ffe6..02479bb49ab 100644 --- a/libs/utils/include/utils/Allocator.h +++ b/libs/utils/include/utils/Allocator.h @@ -30,6 +30,7 @@ #include #include #include +#include namespace utils { @@ -43,14 +44,14 @@ static inline P* add(P* a, T b) noexcept { template static inline P* align(P* p, size_t alignment) noexcept { // alignment must be a power-of-two - assert(alignment && !(alignment & alignment-1)); + assert_invariant(alignment && !(alignment & alignment-1)); return (P*)((uintptr_t(p) + alignment - 1) & ~(alignment - 1)); } template static inline P* align(P* p, size_t alignment, size_t offset) noexcept { P* const r = align(add(p, offset), alignment); - assert(r >= add(p, offset)); + assert_invariant(r >= add(p, offset)); return r; } @@ -89,20 +90,19 @@ class LinearAllocator { // branch-less allocation void* const p = pointermath::align(current(), alignment, extra); void* const c = pointermath::add(p, size); - bool success = c <= end(); + bool const success = c <= end(); set_current(success ? c : current()); return success ? 
p : nullptr; } // API specific to this allocator - void *getCurrent() UTILS_RESTRICT noexcept { return current(); } // free memory back to the specified point void rewind(void* p) UTILS_RESTRICT noexcept { - assert(p>=mBegin && p= mBegin && p < end()); set_current(p); } @@ -122,16 +122,21 @@ class LinearAllocator { void swap(LinearAllocator& rhs) noexcept; void *base() noexcept { return mBegin; } + void const *base() const noexcept { return mBegin; } void free(void*, size_t) UTILS_RESTRICT noexcept { } -private: +protected: void* end() UTILS_RESTRICT noexcept { return pointermath::add(mBegin, mSize); } + void const* end() const UTILS_RESTRICT noexcept { return pointermath::add(mBegin, mSize); } + void* current() UTILS_RESTRICT noexcept { return pointermath::add(mBegin, mCur); } + void const* current() const UTILS_RESTRICT noexcept { return pointermath::add(mBegin, mCur); } + +private: void set_current(void* p) UTILS_RESTRICT noexcept { mCur = uint32_t(uintptr_t(p) - uintptr_t(mBegin)); } - void* mBegin = nullptr; uint32_t mSize = 0; uint32_t mCur = 0; @@ -152,9 +157,7 @@ class HeapAllocator { explicit HeapAllocator(const AREA&) { } // our allocator concept - void* alloc(size_t size, size_t alignment = alignof(std::max_align_t), size_t extra = 0) { - // this allocator doesn't support 'extra' - assert(extra == 0); + void* alloc(size_t size, size_t alignment = alignof(std::max_align_t)) { return aligned_alloc(size, alignment); } @@ -171,6 +174,50 @@ class HeapAllocator { void swap(HeapAllocator&) noexcept { } }; +/* ------------------------------------------------------------------------------------------------ + * LinearAllocatorWithFallback + * + * This is a LinearAllocator that falls back to a HeapAllocator when allocation fail. The Heap + * allocator memory is freed only when the LinearAllocator is reset or destroyed. + * ------------------------------------------------------------------------------------------------ + */ +class LinearAllocatorWithFallback : private LinearAllocator, private HeapAllocator { + std::vector mHeapAllocations; +public: + LinearAllocatorWithFallback(void* begin, void* end) noexcept + : LinearAllocator(begin, end) { + } + + template + explicit LinearAllocatorWithFallback(const AREA& area) + : LinearAllocatorWithFallback(area.begin(), area.end()) { + } + + ~LinearAllocatorWithFallback() noexcept { + LinearAllocatorWithFallback::reset(); + } + + void* alloc(size_t size, size_t alignment = alignof(std::max_align_t)); + + void *getCurrent() noexcept { + return LinearAllocator::getCurrent(); + } + + void rewind(void* p) noexcept { + if (p >= LinearAllocator::base() && p < LinearAllocator::end()) { + LinearAllocator::rewind(p); + } + } + + void reset() noexcept; + + void free(void*, size_t) noexcept { } + + bool isHeapAllocation(void* p) const noexcept { + return p < LinearAllocator::base() || p >= LinearAllocator::end(); + } +}; + // ------------------------------------------------------------------------------------------------ class FreeList { @@ -186,13 +233,13 @@ class FreeList { Node* const head = mHead; mHead = head ? 
head->next : nullptr; // this could indicate a use after free - assert(!mHead || mHead >= mBegin && mHead < mEnd); + assert_invariant(!mHead || mHead >= mBegin && mHead < mEnd); return head; } void push(void* p) noexcept { - assert(p); - assert(p >= mBegin && p < mEnd); + assert_invariant(p); + assert_invariant(p >= mBegin && p < mEnd); // TODO: assert this is one of our pointer (i.e.: it's address match one of ours) Node* const head = static_cast(p); head->next = mHead; @@ -229,16 +276,16 @@ class AtomicFreeList { AtomicFreeList& operator=(const FreeList& rhs) = delete; void* pop() noexcept { - Node* const storage = mStorage; + Node* const pStorage = mStorage; HeadPtr currentHead = mHead.load(); while (currentHead.offset >= 0) { - // The value of "next" we load here might already contain application data if another + // The value of "pNext" we load here might already contain application data if another // thread raced ahead of us. But in that case, the computed "newHead" will be discarded // since compare_exchange_weak fails. Then this thread will loop with the updated // value of currentHead, and try again. - Node* const next = storage[currentHead.offset].next.load(std::memory_order_relaxed); - const HeadPtr newHead{ next ? int32_t(next - storage) : -1, currentHead.tag + 1 }; + Node* const pNext = pStorage[currentHead.offset].next.load(std::memory_order_relaxed); + const HeadPtr newHead{ pNext ? int32_t(pNext - pStorage) : -1, currentHead.tag + 1 }; // In the rare case that the other thread that raced ahead of us already returned the // same mHead we just loaded, but it now has a different "next" value, the tag field will not // match, and compare_exchange_weak will fail and prevent that particular race condition. @@ -246,18 +293,18 @@ class AtomicFreeList { // This assert needs to occur after we have validated that there was no race condition // Otherwise, next might already contain application data, if another thread // raced ahead of us after we loaded mHead, but before we loaded mHead->next. - assert(!next || next >= storage); + assert_invariant(!pNext || pNext >= pStorage); break; } } - void* p = (currentHead.offset >= 0) ? (storage + currentHead.offset) : nullptr; - assert(!p || p >= storage); + void* p = (currentHead.offset >= 0) ? 
(pStorage + currentHead.offset) : nullptr; + assert_invariant(!p || p >= pStorage); return p; } void push(void* p) noexcept { Node* const storage = mStorage; - assert(p && p >= storage); + assert_invariant(p && p >= storage); Node* const node = static_cast(p); HeadPtr currentHead = mHead.load(); HeadPtr newHead = { int32_t(node - storage), currentHead.tag + 1 }; @@ -330,9 +377,9 @@ class PoolAllocator { // our allocator concept void* alloc(size_t size = ELEMENT_SIZE, size_t alignment = ALIGNMENT, size_t offset = OFFSET) noexcept { - assert(size <= ELEMENT_SIZE); - assert(alignment <= ALIGNMENT); - assert(offset == OFFSET); + assert_invariant(size <= ELEMENT_SIZE); + assert_invariant(alignment <= ALIGNMENT); + assert_invariant(offset == OFFSET); return mFreeList.pop(); } @@ -587,23 +634,36 @@ class Arena { // allocate memory from arena with given size and alignment // (acceptable size/alignment may depend on the allocator provided) - void* alloc(size_t size, size_t alignment = alignof(std::max_align_t), size_t extra = 0) noexcept { + void* alloc(size_t size, size_t alignment, size_t extra) noexcept { std::lock_guard guard(mLock); void* p = mAllocator.alloc(size, alignment, extra); mListener.onAlloc(p, size, alignment, extra); return p; } + void* alloc(size_t size, size_t alignment = alignof(std::max_align_t)) noexcept { + std::lock_guard guard(mLock); + void* p = mAllocator.alloc(size, alignment); + mListener.onAlloc(p, size, alignment, 0); + return p; + } + // Allocate an array of trivially destructible objects // for safety, we disable the object-based alloc method if the object type is not // trivially destructible, since free() won't call the destructor and this is allocating // an array. template ::value>::type> - T* alloc(size_t count, size_t alignment = alignof(T), size_t extra = 0) noexcept { + T* alloc(size_t count, size_t alignment, size_t extra) noexcept { return (T*)alloc(count * sizeof(T), alignment, extra); } + template ::value>::type> + T* alloc(size_t count, size_t alignment = alignof(T)) noexcept { + return (T*)alloc(count * sizeof(T), alignment); + } + // return memory pointed by p to the arena // (actual behaviour may depend on allocator provided) void free(void* p) noexcept { @@ -720,6 +780,8 @@ class ArenaScope { } public: + using Arena = ARENA; + explicit ArenaScope(ARENA& allocator) : mArena(allocator), mRewind(allocator.getCurrent()) { } @@ -771,7 +833,7 @@ class ArenaScope { } // use with caution - ARENA& getAllocator() noexcept { return mArena; } + ARENA& getArena() noexcept { return mArena; } private: ARENA& mArena; diff --git a/libs/utils/src/Allocator.cpp b/libs/utils/src/Allocator.cpp index 2d7a8fcbe92..fd6e5945691 100644 --- a/libs/utils/src/Allocator.cpp +++ b/libs/utils/src/Allocator.cpp @@ -16,6 +16,8 @@ #include +#include +#include #include #include @@ -52,6 +54,29 @@ void LinearAllocator::swap(LinearAllocator& rhs) noexcept { std::swap(mCur, rhs.mCur); } + +// ------------------------------------------------------------------------------------------------ +// LinearAllocatorWithFallback +// ------------------------------------------------------------------------------------------------ + +void* LinearAllocatorWithFallback::alloc(size_t size, size_t alignment) { + void* p = LinearAllocator::alloc(size, alignment); + if (UTILS_UNLIKELY(!p)) { + p = HeapAllocator::alloc(size, alignment); + mHeapAllocations.push_back(p); + } + assert_invariant(p); + return p; +} + +void LinearAllocatorWithFallback::reset() noexcept { + LinearAllocator::reset(); + for (auto* p 
: mHeapAllocations) { + HeapAllocator::free(p); + } + mHeapAllocations.clear(); +} + // ------------------------------------------------------------------------------------------------ // FreeList // ------------------------------------------------------------------------------------------------ @@ -61,8 +86,8 @@ FreeList::Node* FreeList::init(void* begin, void* end, { void* const p = pointermath::align(begin, alignment, extra); void* const n = pointermath::align(pointermath::add(p, elementSize), alignment, extra); - assert(p >= begin && p < end); - assert(n >= begin && n < end && n > p); + assert_invariant(p >= begin && p < end); + assert_invariant(n >= begin && n < end && n > p); const size_t d = uintptr_t(n) - uintptr_t(p); const size_t num = (uintptr_t(end) - uintptr_t(p)) / d; @@ -77,8 +102,8 @@ FreeList::Node* FreeList::init(void* begin, void* end, cur->next = next; cur = next; } - assert(cur < end); - assert(pointermath::add(cur, d) <= end); + assert_invariant(cur < end); + assert_invariant(pointermath::add(cur, d) <= end); cur->next = nullptr; return head; } @@ -97,13 +122,13 @@ AtomicFreeList::AtomicFreeList(void* begin, void* end, { #ifdef __ANDROID__ // on some platform (e.g. web) this returns false. we really only care about mobile though. - assert(mHead.is_lock_free()); + assert_invariant(mHead.is_lock_free()); #endif void* const p = pointermath::align(begin, alignment, extra); void* const n = pointermath::align(pointermath::add(p, elementSize), alignment, extra); - assert(p >= begin && p < end); - assert(n >= begin && n < end && n > p); + assert_invariant(p >= begin && p < end); + assert_invariant(n >= begin && n < end && n > p); const size_t d = uintptr_t(n) - uintptr_t(p); const size_t num = (uintptr_t(end) - uintptr_t(p)) / d; @@ -119,8 +144,8 @@ AtomicFreeList::AtomicFreeList(void* begin, void* end, cur->next = next; cur = next; } - assert(cur < end); - assert(pointermath::add(cur, d) <= end); + assert_invariant(cur < end); + assert_invariant(pointermath::add(cur, d) <= end); cur->next = nullptr; mHead.store({ int32_t(head - mStorage), 0 }); @@ -148,22 +173,25 @@ TrackingPolicy::HighWatermark::~HighWatermark() noexcept { } void TrackingPolicy::HighWatermark::onFree(void* p, size_t size) noexcept { - assert(mCurrent >= size); + // FIXME: this code is incorrect with LinearAllocators because free() is a no-op for them + assert_invariant(mCurrent >= size); mCurrent -= uint32_t(size); } void TrackingPolicy::HighWatermark::onReset() noexcept { // we should never be here if mBase is nullptr because compilation would have failed when // Arena::onReset() tries to call the underlying allocator's onReset() - assert(mBase); + assert_invariant(mBase); mCurrent = 0; } void TrackingPolicy::HighWatermark::onRewind(void const* addr) noexcept { // we should never be here if mBase is nullptr because compilation would have failed when // Arena::onRewind() tries to call the underlying allocator's onReset() - assert(mBase); - assert(addr >= mBase); - mCurrent = uint32_t(uintptr_t(addr) - uintptr_t(mBase)); + assert_invariant(mBase); + // for LinearAllocatorWithFallback we could get pointers outside the range + if (addr >= mBase && addr < pointermath::add(mBase, mSize)) { + mCurrent = uint32_t(uintptr_t(addr) - uintptr_t(mBase)); + } } // ------------------------------------------------------------------------------------------------ @@ -183,7 +211,7 @@ void TrackingPolicy::Debug::onFree(void* p, size_t size) noexcept { void TrackingPolicy::Debug::onReset() noexcept { // we should never be here 
if mBase is nullptr because compilation would have failed when // Arena::onReset() tries to call the underlying allocator's onReset() - assert(mBase); + assert_invariant(mBase); memset(mBase, 0xec, mSize); }
From d640ba853bb06e98cfdb4a441b400a6be96cf5eb Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Mon, 5 Feb 2024 23:32:14 -0800 Subject: [PATCH 02/19] rework how we size the HandleAllocator's pools
- update the pool sizes for metal and vulkan, which were very outdated. - add debug code on all backends to print the size of each handle (with a compile-time switch)
The most important change is that now the 3 pools of HandleAllocator are sized so that each can accommodate about the same number of handles. This makes it easier to reason about. The total number of handles is three times that, since there are 3 pools. We also try to allocate the buckets so that handles are evenly distributed, however, that's very hand-wavy.
With the current setup the number of handles per pool is as follows: - GL : 3640 / pool / MiB - VK : 1820 / pool / MiB - MTL: 1310 / pool / MiB
--- .../include/private/backend/HandleAllocator.h | 12 ++++--- filament/backend/src/HandleAllocator.cpp | 25 ++++++++----- filament/backend/src/metal/MetalDriver.mm | 34 ++++++++++++++++++ filament/backend/src/opengl/OpenGLDriver.cpp | 22 ++++++------ filament/backend/src/vulkan/VulkanDriver.cpp | 35 +++++++++++++++++++ libs/utils/include/utils/Allocator.h | 4 +++ 6 files changed, 109 insertions(+), 23 deletions(-)
diff --git a/filament/backend/include/private/backend/HandleAllocator.h b/filament/backend/include/private/backend/HandleAllocator.h index 3a336e8d6e6..2e7c8d1530f 100644 --- a/filament/backend/include/private/backend/HandleAllocator.h +++ b/filament/backend/include/private/backend/HandleAllocator.h @@ -40,19 +40,23 @@ # define HANDLE_TYPE_SAFETY 0 #endif -#define HandleAllocatorGL HandleAllocator<16, 64, 208> -#define HandleAllocatorVK HandleAllocator<16, 64, 880> -#define HandleAllocatorMTL HandleAllocator<16, 64, 584> +#define HandleAllocatorGL HandleAllocator<16, 64, 208> // ~3640 / pool / MiB +#define HandleAllocatorVK HandleAllocator<80, 176, 320> // ~1820 / pool / MiB +#define HandleAllocatorMTL HandleAllocator<48, 160, 592> // ~1310 / pool / MiB namespace filament::backend { /* * A utility class to efficiently allocate and manage Handle<> */ -template +template class HandleAllocator { public: + static_assert(P0 % 16 == 0, "HandleAllocator Pools must be multiple of 16 bytes"); + static_assert(P1 % 16 == 0, "HandleAllocator Pools must be multiple of 16 bytes"); + static_assert(P2 % 16 == 0, "HandleAllocator Pools must be multiple of 16 bytes"); + HandleAllocator(const char* name, size_t size) noexcept; HandleAllocator(HandleAllocator const& rhs) = delete; HandleAllocator& operator=(HandleAllocator const& rhs) = delete;
diff --git a/filament/backend/src/HandleAllocator.cpp b/filament/backend/src/HandleAllocator.cpp index 3257e4e2c94..d1b568af194 100644 --- a/filament/backend/src/HandleAllocator.cpp +++ b/filament/backend/src/HandleAllocator.cpp @@ -16,10 +16,17 @@ #include "private/backend/HandleAllocator.h" +#include + +#include +#include +#include #include #include +#include + namespace filament::backend { using namespace utils; @@ -28,14 +35,16 @@ template UTILS_NOINLINE HandleAllocator::Allocator::Allocator(AreaPolicy::HeapArea const& area) : mArea(area) { - // TODO: we probably need a better way to set the size of these pools - const size_t unit = area.size() / 32; - const size_t offsetPool1 = unit; - const
size_t offsetPool2 = 16 * unit; - char* const p = (char*)area.begin(); - mPool0 = PoolAllocator< P0, 16>(p, p + offsetPool1); - mPool1 = PoolAllocator< P1, 16>(p + offsetPool1, p + offsetPool2); - mPool2 = PoolAllocator< P2, 16>(p + offsetPool2, area.end()); + + // size the different pools so that they can all contain the same number of handles + size_t const count = area.size() / (P0 + P1 + P2); + char* const p0 = static_cast(area.begin()); + char* const p1 = p0 + count * P0; + char* const p2 = p1 + count * P1; + + mPool0 = PoolAllocator< P0, 16>(p0, count * P0); + mPool1 = PoolAllocator< P1, 16>(p1 + count * P0, count * P1); + mPool2 = PoolAllocator< P2, 16>(p2 + count * P0, count * P2); } // ------------------------------------------------------------------------------------------------ diff --git a/filament/backend/src/metal/MetalDriver.mm b/filament/backend/src/metal/MetalDriver.mm index ef5c35e1080..b1e3d7574f7 100644 --- a/filament/backend/src/metal/MetalDriver.mm +++ b/filament/backend/src/metal/MetalDriver.mm @@ -43,6 +43,40 @@ namespace backend { Driver* MetalDriverFactory::create(MetalPlatform* const platform, const Platform::DriverConfig& driverConfig) { +#if 0 + // this is useful for development, but too verbose even for debug builds + // For reference on a 64-bits machine in Release mode: + // MetalTimerQuery : 16 few + // HwStream : 24 few + // MetalIndexBuffer : 40 moderate + // MetalFence : 48 few + // MetalBufferObject : 48 many + // -- less than or equal 48 bytes + // MetalSamplerGroup : 112 few + // MetalProgram : 144 moderate + // MetalTexture : 152 moderate + // MetalVertexBuffer : 152 moderate + // -- less than or equal 160 bytes + // MetalSwapChain : 184 few + // MetalRenderTarget : 272 few + // MetalRenderPrimitive : 584 many + // -- less than or equal to 592 bytes + + utils::slog.d + << "\nMetalSwapChain: " << sizeof(MetalSwapChain) + << "\nMetalBufferObject: " << sizeof(MetalBufferObject) + << "\nMetalVertexBuffer: " << sizeof(MetalVertexBuffer) + << "\nMetalIndexBuffer: " << sizeof(MetalIndexBuffer) + << "\nMetalSamplerGroup: " << sizeof(MetalSamplerGroup) + << "\nMetalRenderPrimitive: " << sizeof(MetalRenderPrimitive) + << "\nMetalTexture: " << sizeof(MetalTexture) + << "\nMetalTimerQuery: " << sizeof(MetalTimerQuery) + << "\nHwStream: " << sizeof(HwStream) + << "\nMetalRenderTarget: " << sizeof(MetalRenderTarget) + << "\nMetalFence: " << sizeof(MetalFence) + << "\nMetalProgram: " << sizeof(MetalProgram) + << utils::io::endl; +#endif return MetalDriver::create(platform, driverConfig); } diff --git a/filament/backend/src/opengl/OpenGLDriver.cpp b/filament/backend/src/opengl/OpenGLDriver.cpp index 1d3e06282c4..047f28383db 100644 --- a/filament/backend/src/opengl/OpenGLDriver.cpp +++ b/filament/backend/src/opengl/OpenGLDriver.cpp @@ -90,24 +90,24 @@ Driver* OpenGLDriver::create(OpenGLPlatform* const platform, #if 0 // this is useful for development, but too verbose even for debug builds // For reference on a 64-bits machine in Release mode: - // GLFence : 8 few // GLIndexBuffer : 8 moderate - // GLSamplerGroup : 8 few + // GLSamplerGroup : 16 few + // GLSwapChain : 16 few + // GLTimerQuery : 16 few // -- less than or equal 16 bytes - // GLBufferObject : 24 many - // GLSync : 24 few - // GLTimerQuery : 32 few - // OpenGLProgram : 32 moderate - // GLRenderPrimitive : 48 many + // GLFence : 24 few + // GLBufferObject : 32 many + // GLRenderPrimitive : 40 many + // OpenGLProgram : 56 moderate + // GLTexture : 64 moderate // -- less than or equal 64 bytes - // 
GLTexture : 72 moderate + // GLStream : 104 few // GLRenderTarget : 112 few - // GLStream : 184 few // GLVertexBuffer : 200 moderate // -- less than or equal to 208 bytes slog.d - << "HwFence: " << sizeof(HwFence) + << "\nGLSwapChain: " << sizeof(GLSwapChain) << "\nGLBufferObject: " << sizeof(GLBufferObject) << "\nGLVertexBuffer: " << sizeof(GLVertexBuffer) << "\nGLIndexBuffer: " << sizeof(GLIndexBuffer) @@ -117,7 +117,7 @@ Driver* OpenGLDriver::create(OpenGLPlatform* const platform, << "\nGLTimerQuery: " << sizeof(GLTimerQuery) << "\nGLStream: " << sizeof(GLStream) << "\nGLRenderTarget: " << sizeof(GLRenderTarget) - << "\nGLSync: " << sizeof(GLSync) + << "\nGLFence: " << sizeof(GLFence) << "\nOpenGLProgram: " << sizeof(OpenGLProgram) << io::endl; #endif diff --git a/filament/backend/src/vulkan/VulkanDriver.cpp b/filament/backend/src/vulkan/VulkanDriver.cpp index 9680320f929..3f48b44ad10 100644 --- a/filament/backend/src/vulkan/VulkanDriver.cpp +++ b/filament/backend/src/vulkan/VulkanDriver.cpp @@ -213,6 +213,41 @@ VulkanDriver::~VulkanDriver() noexcept = default; UTILS_NOINLINE Driver* VulkanDriver::create(VulkanPlatform* platform, VulkanContext const& context, Platform::DriverConfig const& driverConfig) noexcept { +#if 0 + // this is useful for development, but too verbose even for debug builds + // For reference on a 64-bits machine in Release mode: + // VulkanSamplerGroup : 24 few + // HwStream : 24 few + // VulkanFence : 40 few + // VulkanProgram : 40 moderate + // VulkanIndexBuffer : 72 moderate + // VulkanBufferObject : 72 many + // -- less than or equal 80 bytes + // VulkanRenderPrimitive : 104 many + // VulkanSwapChain : 112 few + // VulkanTimerQuery : 168 few + // -- less than or equal 176 bytes + // VulkanTexture : 232 moderate + // VulkanVertexBuffer : 312 moderate + // VulkanRenderTarget : 320 few + // -- less than or equal to 320 bytes + + utils::slog.d + << "\nVulkanSwapChain: " << sizeof(VulkanSwapChain) + << "\nVulkanBufferObject: " << sizeof(VulkanBufferObject) + << "\nVulkanVertexBuffer: " << sizeof(VulkanVertexBuffer) + << "\nVulkanIndexBuffer: " << sizeof(VulkanIndexBuffer) + << "\nVulkanSamplerGroup: " << sizeof(VulkanSamplerGroup) + << "\nVulkanRenderPrimitive: " << sizeof(VulkanRenderPrimitive) + << "\nVulkanTexture: " << sizeof(VulkanTexture) + << "\nVulkanTimerQuery: " << sizeof(VulkanTimerQuery) + << "\nHwStream: " << sizeof(HwStream) + << "\nVulkanRenderTarget: " << sizeof(VulkanRenderTarget) + << "\nVulkanFence: " << sizeof(VulkanFence) + << "\nVulkanProgram: " << sizeof(VulkanProgram) + << utils::io::endl; +#endif + assert_invariant(platform); size_t defaultSize = FVK_HANDLE_ARENA_SIZE_IN_MB * 1024U * 1024U; Platform::DriverConfig validConfig {driverConfig}; diff --git a/libs/utils/include/utils/Allocator.h b/libs/utils/include/utils/Allocator.h index 02479bb49ab..2b02eb4cc0f 100644 --- a/libs/utils/include/utils/Allocator.h +++ b/libs/utils/include/utils/Allocator.h @@ -393,6 +393,10 @@ class PoolAllocator { : mFreeList(begin, end, ELEMENT_SIZE, ALIGNMENT, OFFSET) { } + PoolAllocator(void* begin, size_t size) noexcept + : mFreeList(begin, static_cast(begin) + size, ELEMENT_SIZE, ALIGNMENT, OFFSET) { + } + template explicit PoolAllocator(const AREA& area) noexcept : PoolAllocator(area.begin(), area.end()) { From 20acc01fcd66da13309fe9ca12803519408f2b1a Mon Sep 17 00:00:00 2001 From: Powei Feng Date: Wed, 7 Feb 2024 09:17:09 -0800 Subject: [PATCH 03/19] [release] update `base64` command (#7559) Seems like a `-i` is now necessary for the command. 
Note that we recently started using mac-mx machines.
--- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e7e0cb19d6f..bf7211a1039 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -129,7 +129,7 @@ jobs: - name: Sign sample-gltf-viewer run: | echo "${APK_KEYSTORE_BASE64}" > filament.jks.base64 - base64 --decode filament.jks.base64 > filament.jks + base64 --decode -i filament.jks.base64 > filament.jks BUILD_TOOLS_VERSION=$(ls ${ANDROID_HOME}/build-tools | sort -V | tail -n 1) APKSIGNER=${ANDROID_HOME}/build-tools/${BUILD_TOOLS_VERSION}/apksigner IN_FILE="out/sample-gltf-viewer-release.apk"
From c43051728c2dcce5df372e95adc26798df3695ae Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Thu, 8 Feb 2024 16:00:48 -0800 Subject: [PATCH 04/19] fix a typo in handleallocator that could cause corruptions
Fixes #7563
--- filament/backend/src/HandleAllocator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/filament/backend/src/HandleAllocator.cpp b/filament/backend/src/HandleAllocator.cpp index d1b568af194..7362066ca77 100644 --- a/filament/backend/src/HandleAllocator.cpp +++ b/filament/backend/src/HandleAllocator.cpp @@ -42,9 +42,9 @@ HandleAllocator::Allocator::Allocator(AreaPolicy::HeapArea const& ar char* const p1 = p0 + count * P0; char* const p2 = p1 + count * P1; - mPool0 = PoolAllocator< P0, 16>(p0, count * P0); - mPool1 = PoolAllocator< P1, 16>(p1 + count * P0, count * P1); - mPool2 = PoolAllocator< P2, 16>(p2 + count * P0, count * P2); + mPool0 = PoolAllocator(p0, count * P0); + mPool1 = PoolAllocator(p1, count * P1); + mPool2 = PoolAllocator(p2, count * P2); } // ------------------------------------------------------------------------------------------------
From f9c8e65ef3ca747d71166a205a774176bc819c13 Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Thu, 8 Feb 2024 00:17:44 -0800 Subject: [PATCH 05/19] fix velocity update in FreeFlightManipulator
when the time step was getting too large, the velocity update could become unstable and the camera would oscillate and eventually fly off.
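For context only (an editorial sketch, not part of this patch): the old update applied the damping as a single explicit Euler step of the form v += (target_v - v) * dampingFactor * dt, which overshoots the target once dampingFactor * dt exceeds 1 and diverges past 2, hence the oscillation. Integrating the friction term dv/dt = -k v in closed form is stable for any time step; the helper name below is hypothetical:

    #include <cmath>

    // Closed-form damping of one velocity component over a time step dt.
    // std::exp(-k * dt) always lies in (0, 1], so the result can never overshoot
    // or change sign, unlike the explicit Euler factor (1 - k * dt).
    inline float dampVelocity(float v, float dampingFactor, float dt) {
        return v * std::exp(-dampingFactor * dt);
    }

The patch below takes a different route: it keeps the explicit form but clamps the decay factor with saturate() and subdivides deltaTime into 16 sub-steps, which keeps dampingFactor * dt small per iteration.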
--- libs/camutils/src/FreeFlightManipulator.h | 119 +++++++++++++++------- 1 file changed, 80 insertions(+), 39 deletions(-) diff --git a/libs/camutils/src/FreeFlightManipulator.h b/libs/camutils/src/FreeFlightManipulator.h index 1df4dd56f73..c20b2578295 100644 --- a/libs/camutils/src/FreeFlightManipulator.h +++ b/libs/camutils/src/FreeFlightManipulator.h @@ -24,6 +24,7 @@ #include #include +#include #include namespace filament { @@ -121,50 +122,90 @@ class FreeFlightManipulator : public Manipulator { } void update(FLOAT deltaTime) override { - vec3 forceLocal { 0.0, 0.0, 0.0 }; - if (mKeyDown[(int) Base::Key::FORWARD]) { - forceLocal += vec3{ 0.0, 0.0, -1.0 }; - } - if (mKeyDown[(int) Base::Key::LEFT]) { - forceLocal += vec3{ -1.0, 0.0, 0.0 }; - } - if (mKeyDown[(int) Base::Key::BACKWARD]) { - forceLocal += vec3{ 0.0, 0.0, 1.0 }; - } - if (mKeyDown[(int) Base::Key::RIGHT]) { - forceLocal += vec3{ 1.0, 0.0, 0.0 }; - } - - const mat4 orientation = mat4::lookAt(Base::mEye, Base::mTarget, Base::mProps.upVector); - vec3 forceWorld = (orientation * vec4{ forceLocal, 0.0f }).xyz; - - if (mKeyDown[(int) Base::Key::UP]) { - forceWorld += vec3{ 0.0, 1.0, 0.0 }; - } - if (mKeyDown[(int) Base::Key::DOWN]) { - forceWorld += vec3{ 0.0, -1.0, 0.0 }; - } - - forceWorld *= mMoveSpeed; - - const auto dampingFactor = Base::mProps.flightMoveDamping; + auto getLocalDirection = [this]() -> vec3 { + vec3 directionLocal{ 0.0, 0.0, 0.0 }; + if (mKeyDown[(int)Base::Key::FORWARD]) { + directionLocal += vec3{ 0.0, 0.0, -1.0 }; + } + if (mKeyDown[(int)Base::Key::LEFT]) { + directionLocal += vec3{ -1.0, 0.0, 0.0 }; + } + if (mKeyDown[(int)Base::Key::BACKWARD]) { + directionLocal += vec3{ 0.0, 0.0, 1.0 }; + } + if (mKeyDown[(int)Base::Key::RIGHT]) { + directionLocal += vec3{ 1.0, 0.0, 0.0 }; + } + return directionLocal; + }; + + auto getWorldDirection = [this](vec3 directionLocal) -> vec3 { + const mat4 orientation = mat4::lookAt(Base::mEye, Base::mTarget, Base::mProps.upVector); + vec3 directionWorld = (orientation * vec4{ directionLocal, 0.0f }).xyz; + if (mKeyDown[(int)Base::Key::UP]) { + directionWorld += vec3{ 0.0, 1.0, 0.0 }; + } + if (mKeyDown[(int)Base::Key::DOWN]) { + directionWorld += vec3{ 0.0, -1.0, 0.0 }; + } + return directionWorld; + }; + + vec3 const localDirection = getLocalDirection(); + vec3 const worldDirection = getWorldDirection(localDirection); + + // unit of dampingFactor is [1/s] + FLOAT const dampingFactor = Base::mProps.flightMoveDamping; if (dampingFactor == 0.0) { // Without damping, we simply treat the force as our velocity. - mEyeVelocity = forceWorld; + vec3 const speed = worldDirection * mMoveSpeed; + mEyeVelocity = speed; + vec3 const positionDelta = mEyeVelocity * deltaTime; + Base::mEye += positionDelta; + Base::mTarget += positionDelta; } else { - // The dampingFactor acts as "friction", which acts upon the camera in the direction - // opposite its velocity. - // Force is also multiplied by the dampingFactor, to "make up" for the friction. - // This ensures that the max velocity still approaches mMoveSpeed; - vec3 velocityDelta = (forceWorld - mEyeVelocity) * dampingFactor; - mEyeVelocity += velocityDelta * deltaTime; + auto dt = deltaTime / 16.0; + for (size_t i = 0; i < 16; i++) { + // Note: the algorithm below doesn't work well for large time steps because + // we're not using a closed form for updating the position, so we need + // to loop a few times. We could make this better by having a dynamic + // loop count. 
What we're really doing is evaluation the solution to + // a differential equation numerically. + + // Kinetic friction is a force opposing velocity and proportional to it.: + // F = -kv + // F = ma + // ==> ma = -kv + // a = -vk/m [m.s^-2] = [m/s] * [Kg/s] / [Kg] + // ==> dampingFactor = k/m [1/s] = [Kg/s] / [Kg] + // + // The velocity update for dt due to friction is then: + // v = v + a.dt + // = v - v * dampingFactor * dt + // = v * (1.0 - dampingFactor * dt) + mEyeVelocity = mEyeVelocity * saturate(1.0 - dampingFactor * dt); + + // We also undergo an acceleration proportional to the distance to the target speed + // (the closer we are the less we accelerate, similar to a car). + // F = k * (target_v - v) + // F = ma + // ==> ma = k * (target_v - v) + // a = k/m * (target_v - v) [m.s^-2] = [Kg/s] / [Kg] * [m/s] + // + // The velocity update for dt due to the acceleration (the gas basically) is then: + // v = v + a.dt + // = v + k/m * (target_v - v).dt + // We're using the same dampingFactor here, but we don't have to. + auto const accelerationFactor = dampingFactor; + vec3 const acceleration = worldDirection * + (accelerationFactor * std::max(mMoveSpeed - length(mEyeVelocity), FLOAT(0))); + mEyeVelocity += acceleration * dt; + vec3 const positionDelta = mEyeVelocity * dt; + Base::mEye += positionDelta; + Base::mTarget += positionDelta; + } } - - const vec3 positionDelta = mEyeVelocity * deltaTime; - - Base::mEye += positionDelta; - Base::mTarget += positionDelta; } Bookmark getCurrentBookmark() const override { From ef703bb4be46fc50e9decae297a5592d9da1b967 Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Tue, 6 Feb 2024 10:03:07 -0800 Subject: [PATCH 06/19] Better handle collisions and use-after-free detection - each handle now has a 4-bits "age", meaning that handles are recycled only after 16 alloc/free cycles. This is used to detect double-free and use-after free. This should also allow us to compare handles, because freeing and reallocating an object, won't produce the same Handle (at least for 16 rounds). - removed "type safety" checks because it's almost impossible to get it wrong thanks to our compile time type safety checks. This didn't provide a useful value added. - This feature is built on top of being able to set/get a 8 bits tag associated with the memory block returned by the pool allocator. We use the "extra" parameter of the allocator to allocate a "hidden" structure containing the age of that memory block. 
- Also we don't allow to compare Handle<> of different types --- filament/backend/include/backend/Handle.h | 16 +- .../include/private/backend/HandleAllocator.h | 177 +++++++++--------- filament/backend/src/HandleAllocator.cpp | 41 +++- libs/utils/include/utils/Allocator.h | 40 +++- 4 files changed, 157 insertions(+), 117 deletions(-) diff --git a/filament/backend/include/backend/Handle.h b/filament/backend/include/backend/Handle.h index 7b8846ba7bc..ffc16133fd2 100644 --- a/filament/backend/include/backend/Handle.h +++ b/filament/backend/include/backend/Handle.h @@ -62,14 +62,6 @@ class HandleBase { // clear the handle, this doesn't free associated resources void clear() noexcept { object = nullid; } - // compare handles - bool operator==(const HandleBase& rhs) const noexcept { return object == rhs.object; } - bool operator!=(const HandleBase& rhs) const noexcept { return object != rhs.object; } - bool operator<(const HandleBase& rhs) const noexcept { return object < rhs.object; } - bool operator<=(const HandleBase& rhs) const noexcept { return object <= rhs.object; } - bool operator>(const HandleBase& rhs) const noexcept { return object > rhs.object; } - bool operator>=(const HandleBase& rhs) const noexcept { return object >= rhs.object; } - // get this handle's handleId HandleId getId() const noexcept { return object; } @@ -101,6 +93,14 @@ struct Handle : public HandleBase { explicit Handle(HandleId id) noexcept : HandleBase(id) { } + // compare handles of the same type + bool operator==(const Handle& rhs) const noexcept { return getId() == rhs.getId(); } + bool operator!=(const Handle& rhs) const noexcept { return getId() != rhs.getId(); } + bool operator<(const Handle& rhs) const noexcept { return getId() < rhs.getId(); } + bool operator<=(const Handle& rhs) const noexcept { return getId() <= rhs.getId(); } + bool operator>(const Handle& rhs) const noexcept { return getId() > rhs.getId(); } + bool operator>=(const Handle& rhs) const noexcept { return getId() >= rhs.getId(); } + // type-safe Handle cast template::value> > Handle(Handle const& base) noexcept : HandleBase(base) { } // NOLINT(hicpp-explicit-conversions,google-explicit-constructor) diff --git a/filament/backend/include/private/backend/HandleAllocator.h b/filament/backend/include/private/backend/HandleAllocator.h index 2e7c8d1530f..04e66d85774 100644 --- a/filament/backend/include/private/backend/HandleAllocator.h +++ b/filament/backend/include/private/backend/HandleAllocator.h @@ -24,22 +24,19 @@ #include #include #include +#include #include +#include #include #include #include +#include #include #include -#if !defined(NDEBUG) && UTILS_HAS_RTTI -# define HANDLE_TYPE_SAFETY 1 -#else -# define HANDLE_TYPE_SAFETY 0 -#endif - #define HandleAllocatorGL HandleAllocator<16, 64, 208> // ~3640 / pool / MiB #define HandleAllocatorVK HandleAllocator<80, 176, 320> // ~1820 / pool / MiB #define HandleAllocatorMTL HandleAllocator<48, 160, 592> // ~1310 / pool / MiB @@ -52,11 +49,6 @@ namespace filament::backend { template class HandleAllocator { public: - - static_assert(P0 % 16 == 0, "HandleAllocator Pools must be multiple of 16 bytes"); - static_assert(P1 % 16 == 0, "HandleAllocator Pools must be multiple of 16 bytes"); - static_assert(P2 % 16 == 0, "HandleAllocator Pools must be multiple of 16 bytes"); - HandleAllocator(const char* name, size_t size) noexcept; HandleAllocator(HandleAllocator const& rhs) = delete; HandleAllocator& operator=(HandleAllocator const& rhs) = delete; @@ -74,14 +66,9 @@ class HandleAllocator { */ template 
Handle allocateAndConstruct(ARGS&& ... args) noexcept { - Handle h{ allocateHandle() }; + Handle h{ allocateHandle() }; D* addr = handle_cast(h); new(addr) D(std::forward(args)...); -#if HANDLE_TYPE_SAFETY - mLock.lock(); - mHandleTypeId[addr] = typeid(D).name(); - mLock.unlock(); -#endif return h; } @@ -97,13 +84,7 @@ class HandleAllocator { */ template Handle allocate() noexcept { - Handle h{ allocateHandle() }; -#if HANDLE_TYPE_SAFETY - D* addr = handle_cast(h); - mLock.lock(); - mHandleTypeId[addr] = typeid(D).name(); - mLock.unlock(); -#endif + Handle h{ allocateHandle() }; return h; } @@ -120,17 +101,10 @@ class HandleAllocator { assert_invariant(handle); D* addr = handle_cast(const_cast&>(handle)); assert_invariant(addr); - // currently we implement construct<> with dtor+ctor, we could use operator= also // but all our dtors are trivial, ~D() is actually a noop. addr->~D(); new(addr) D(std::forward(args)...); - -#if HANDLE_TYPE_SAFETY - mLock.lock(); - mHandleTypeId[addr] = typeid(D).name(); - mLock.unlock(); -#endif return addr; } @@ -147,12 +121,6 @@ class HandleAllocator { D* addr = handle_cast(const_cast&>(handle)); assert_invariant(addr); new(addr) D(std::forward(args)...); - -#if HANDLE_TYPE_SAFETY - mLock.lock(); - mHandleTypeId[addr] = typeid(D).name(); - mLock.unlock(); -#endif return addr; } @@ -168,19 +136,8 @@ class HandleAllocator { void deallocate(Handle& handle, D const* p) noexcept { // allow to destroy the nullptr, similarly to operator delete if (p) { -#if HANDLE_TYPE_SAFETY - mLock.lock(); - auto typeId = mHandleTypeId[p]; - mHandleTypeId.erase(p); - mLock.unlock(); - if (UTILS_UNLIKELY(typeId != typeid(D).name())) { - utils::slog.e << "Destroying handle " << handle.getId() << ", type " << typeid(D).name() - << ", but handle's actual type is " << typeId << utils::io::endl; - std::terminate(); - } -#endif p->~D(); - deallocateHandle(handle.getId()); + deallocateHandle(handle.getId()); } } @@ -208,7 +165,17 @@ class HandleAllocator { std::is_base_of_v>, Dp> handle_cast(Handle& handle) noexcept { assert_invariant(handle); - void* const p = handleToPointer(handle.getId()); + auto [p, tag] = handleToPointer(handle.getId()); + + if (isPoolHandle(handle.getId())) { + // check for use after free + uint8_t const age = (tag & HANDLE_AGE_MASK) >> HANDLE_AGE_SHIFT; + auto const pNode = static_cast(p); + uint8_t const expectedAge = pNode[-1].age; + ASSERT_POSTCONDITION(expectedAge == age, + "use-after-free of Handle with id=%d", handle.getId()); + } + return static_cast(p); } @@ -223,29 +190,57 @@ class HandleAllocator { private: - // template + template + static constexpr size_t getBucketSize() noexcept { + if constexpr (sizeof(D) <= P0) { return P0; } + if constexpr (sizeof(D) <= P1) { return P1; } + static_assert(sizeof(D) <= P2); + return P2; + } + class Allocator { friend class HandleAllocator; - utils::PoolAllocator mPool0; - utils::PoolAllocator mPool1; - utils::PoolAllocator mPool2; + static constexpr size_t MIN_ALIGNMENT = alignof(std::max_align_t); + struct Node { uint8_t age; }; + // Note: using the `extra` parameter of PoolAllocator<>, even with a 1-byte structure, + // generally increases all pool allocations by 8-bytes because of alignment restrictions. 
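+ // For example (assuming 8-byte element alignment): the 1-byte Node lives just in front of
+ // each element, but to keep the elements themselves aligned the pool has to reserve a full
+ // alignment unit for it, i.e. roughly align_up(sizeof(Node), 8) == 8 bytes per allocation.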
+ template + using Pool = utils::PoolAllocator; + Pool mPool0; + Pool mPool1; + Pool mPool2; UTILS_UNUSED_IN_RELEASE const utils::AreaPolicy::HeapArea& mArea; public: - static constexpr size_t MIN_ALIGNMENT_SHIFT = 4; explicit Allocator(const utils::AreaPolicy::HeapArea& area); + static constexpr size_t getAlignment() noexcept { return MIN_ALIGNMENT; } + // this is in fact always called with a constexpr size argument - [[nodiscard]] inline void* alloc(size_t size, size_t, size_t extra = 0) noexcept { + [[nodiscard]] inline void* alloc(size_t size, size_t, size_t, uint8_t* outAge) noexcept { void* p = nullptr; - if (size <= mPool0.getSize()) p = mPool0.alloc(size, 16, extra); - else if (size <= mPool1.getSize()) p = mPool1.alloc(size, 16, extra); - else if (size <= mPool2.getSize()) p = mPool2.alloc(size, 16, extra); + if (size <= mPool0.getSize()) p = mPool0.alloc(size); + else if (size <= mPool1.getSize()) p = mPool1.alloc(size); + else if (size <= mPool2.getSize()) p = mPool2.alloc(size); + if (UTILS_LIKELY(p)) { + Node const* const pNode = static_cast(p); + // we are guaranteed to have at least sizeof bytes of extra storage before + // the allocation address. + *outAge = pNode[-1].age; + } return p; } // this is in fact always called with a constexpr size argument - inline void free(void* p, size_t size) noexcept { + inline void free(void* p, size_t size, uint8_t age) noexcept { assert_invariant(p >= mArea.begin() && (char*)p + size <= (char*)mArea.end()); + + // check for double-free + Node* const pNode = static_cast(p); + uint8_t& expectedAge = pNode[-1].age; + ASSERT_POSTCONDITION(expectedAge == age, + "double-free of Handle of size %d at %p", size, p); + expectedAge = (expectedAge + 1) & 0xF; // fixme + if (size <= mPool0.getSize()) { mPool0.free(p); return; } if (size <= mPool1.getSize()) { mPool1.free(p); return; } if (size <= mPool2.getSize()) { mPool2.free(p); return; } @@ -267,24 +262,16 @@ class HandleAllocator { // allocateHandle()/deallocateHandle() selects the pool to use at compile-time based on the // allocation size this is always inlined, because all these do is to call // allocateHandleInPool()/deallocateHandleFromPool() with the right pool size. 
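// For example, with HandleAllocatorGL == HandleAllocator<16, 64, 208>: a hypothetical handle
// type D with sizeof(D) == 40 fails the P0 (16-byte) check and selects the 64-byte bucket, so
// allocateHandle<D>() reduces to a single call to allocateHandleInPool<64>().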
- template + template HandleBase::HandleId allocateHandle() noexcept { - if constexpr (SIZE <= P0) { return allocateHandleInPool(); } - if constexpr (SIZE <= P1) { return allocateHandleInPool(); } - static_assert(SIZE <= P2); - return allocateHandleInPool(); + constexpr size_t BUCKET_SIZE = getBucketSize(); + return allocateHandleInPool(); } - template + template void deallocateHandle(HandleBase::HandleId id) noexcept { - if constexpr (SIZE <= P0) { - deallocateHandleFromPool(id); - } else if constexpr (SIZE <= P1) { - deallocateHandleFromPool(id); - } else { - static_assert(SIZE <= P2); - deallocateHandleFromPool(id); - } + constexpr size_t BUCKET_SIZE = getBucketSize(); + deallocateHandleFromPool(id); } // allocateHandleInPool()/deallocateHandleFromPool() is NOT inlined, which will cause three @@ -293,9 +280,11 @@ class HandleAllocator { template UTILS_NOINLINE HandleBase::HandleId allocateHandleInPool() noexcept { - void* p = mHandleArena.alloc(SIZE); + uint8_t age; + void* p = mHandleArena.alloc(SIZE, alignof(std::max_align_t), 0, &age); if (UTILS_LIKELY(p)) { - return pointerToHandle(p); + uint32_t const tag = (uint32_t(age) << HANDLE_AGE_SHIFT) & HANDLE_AGE_MASK; + return arenaPointerToHandle(p, tag); } else { return allocateHandleSlow(SIZE); } @@ -305,42 +294,51 @@ class HandleAllocator { UTILS_NOINLINE void deallocateHandleFromPool(HandleBase::HandleId id) noexcept { if (UTILS_LIKELY(isPoolHandle(id))) { - void* p = handleToPointer(id); - mHandleArena.free(p, SIZE); + auto [p, tag] = handleToPointer(id); + uint8_t const age = (tag & HANDLE_AGE_MASK) >> HANDLE_AGE_SHIFT; + mHandleArena.free(p, SIZE, age); } else { deallocateHandleSlow(id, SIZE); } } - static constexpr uint32_t HEAP_HANDLE_FLAG = 0x80000000u; + // we handle a 4 bits age per address + static constexpr uint32_t HANDLE_HEAP_FLAG = 0x80000000u; // pool vs heap handle + static constexpr uint32_t HANDLE_AGE_MASK = 0x78000000u; // handle's age + static constexpr uint32_t HANDLE_INDEX_MASK = 0x07FFFFFFu; // handle index + static constexpr uint32_t HANDLE_TAG_MASK = HANDLE_AGE_MASK; + static constexpr uint32_t HANDLE_AGE_SHIFT = 27; static bool isPoolHandle(HandleBase::HandleId id) noexcept { - return (id & HEAP_HANDLE_FLAG) == 0u; + return (id & HANDLE_HEAP_FLAG) == 0u; } HandleBase::HandleId allocateHandleSlow(size_t size) noexcept; void deallocateHandleSlow(HandleBase::HandleId id, size_t size) noexcept; // We inline this because it's just 4 instructions in the fast case - inline void* handleToPointer(HandleBase::HandleId id) const noexcept { + inline std::pair handleToPointer(HandleBase::HandleId id) const noexcept { // note: the null handle will end-up returning nullptr b/c it'll be handled as // a non-pool handle. 
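// Pool-handle id layout implied by the masks below:
//   bit  31     HANDLE_HEAP_FLAG  -- 0 for pool handles, 1 for heap (overflow) handles
//   bits 30..27 HANDLE_AGE_MASK   -- 4-bit age used for double-free / use-after-free detection
//   bits 26..0  HANDLE_INDEX_MASK -- slot index; arena offset == index * alignment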
if (UTILS_LIKELY(isPoolHandle(id))) { char* const base = (char*)mHandleArena.getArea().begin(); - size_t offset = id << Allocator::MIN_ALIGNMENT_SHIFT; - return static_cast(base + offset); + uint32_t const tag = id & HANDLE_TAG_MASK; + size_t const offset = (id & HANDLE_INDEX_MASK) * Allocator::getAlignment(); + return { static_cast(base + offset), tag }; } - return handleToPointerSlow(id); + return { handleToPointerSlow(id), 0 }; } void* handleToPointerSlow(HandleBase::HandleId id) const noexcept; // We inline this because it's just 3 instructions - inline HandleBase::HandleId pointerToHandle(void* p) const noexcept { + inline HandleBase::HandleId arenaPointerToHandle(void* p, uint32_t tag) const noexcept { char* const base = (char*)mHandleArena.getArea().begin(); - size_t offset = (char*)p - base; - auto id = HandleBase::HandleId(offset >> Allocator::MIN_ALIGNMENT_SHIFT); - assert_invariant((id & HEAP_HANDLE_FLAG) == 0); + size_t const offset = (char*)p - base; + assert_invariant((offset % Allocator::getAlignment()) == 0); + auto id = HandleBase::HandleId(offset / Allocator::getAlignment()); + id |= tag & HANDLE_TAG_MASK; + assert_invariant((id & HANDLE_HEAP_FLAG) == 0); return id; } @@ -350,9 +348,6 @@ class HandleAllocator { mutable utils::Mutex mLock; tsl::robin_map mOverflowMap; HandleBase::HandleId mId = 0; -#if HANDLE_TYPE_SAFETY - mutable std::unordered_map mHandleTypeId; -#endif }; } // namespace filament::backend diff --git a/filament/backend/src/HandleAllocator.cpp b/filament/backend/src/HandleAllocator.cpp index 7362066ca77..12bd26b84c9 100644 --- a/filament/backend/src/HandleAllocator.cpp +++ b/filament/backend/src/HandleAllocator.cpp @@ -19,12 +19,17 @@ #include #include +#include +#include #include #include -#include +#include #include +#include +#include +#include #include namespace filament::backend { @@ -36,15 +41,29 @@ UTILS_NOINLINE HandleAllocator::Allocator::Allocator(AreaPolicy::HeapArea const& area) : mArea(area) { + // The largest handle this allocator can generate currently depends on the architecture's + // min alignment, typically 8 or 16 bytes. + // e.g. On Android armv8, the alignment is 16 bytes, so for a 1 MiB heap, the largest handle + // index will be 65536. Note that this is not the same as the number of handles (which + // will always be less). + // Because our maximum representable handle currently is 0x07FFFFFF, the maximum no-nonsensical + // heap size is 2 GiB, which amounts to 7.6 millions handles per pool (in the GL case). 
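+ // Back-of-the-envelope, assuming a 16-byte minimum alignment: 2^27 representable indices
+ // * 16 bytes == 2 GiB of addressable arena; spread across the three GL pools
+ // (16 + 64 + 208 == 288 bytes per handle) that is on the order of 7.5 million handles per pool.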
+ size_t const maxHeapSize = std::min(area.size(), HANDLE_INDEX_MASK * getAlignment()); + + if (UTILS_UNLIKELY(maxHeapSize != area.size())) { + slog.w << "HandleAllocator heap size reduced to " + << maxHeapSize << " from " << area.size() << io::endl; + } + // size the different pools so that they can all contain the same number of handles - size_t const count = area.size() / (P0 + P1 + P2); + size_t const count = maxHeapSize / (P0 + P1 + P2); char* const p0 = static_cast(area.begin()); char* const p1 = p0 + count * P0; char* const p2 = p1 + count * P1; - mPool0 = PoolAllocator(p0, count * P0); - mPool1 = PoolAllocator(p1, count * P1); - mPool2 = PoolAllocator(p2, count * P2); + mPool0 = Pool(p0, count * P0); + mPool1 = Pool(p1, count * P1); + mPool2 = Pool(p2, count * P2); } // ------------------------------------------------------------------------------------------------ @@ -82,11 +101,17 @@ template HandleBase::HandleId HandleAllocator::allocateHandleSlow(size_t size) noexcept { void* p = ::malloc(size); std::unique_lock lock(mLock); - HandleBase::HandleId id = (++mId) | HEAP_HANDLE_FLAG; + + HandleBase::HandleId id = (++mId) | HANDLE_HEAP_FLAG; + + ASSERT_POSTCONDITION(mId < HANDLE_HEAP_FLAG, + "No more Handle ids available! This can happen if HandleAllocator arena has been full" + " for a while. Please increase FILAMENT_OPENGL_HANDLE_ARENA_SIZE_IN_MB"); + mOverflowMap.emplace(id, p); lock.unlock(); - if (UTILS_UNLIKELY(id == (HEAP_HANDLE_FLAG|1u))) { // meaning id was zero + if (UTILS_UNLIKELY(id == (HANDLE_HEAP_FLAG | 1u))) { // meaning id was zero PANIC_LOG("HandleAllocator arena is full, using slower system heap. Please increase " "the appropriate constant (e.g. FILAMENT_OPENGL_HANDLE_ARENA_SIZE_IN_MB)."); } @@ -95,7 +120,7 @@ HandleBase::HandleId HandleAllocator::allocateHandleSlow(size_t size template void HandleAllocator::deallocateHandleSlow(HandleBase::HandleId id, size_t) noexcept { - assert_invariant(id & HEAP_HANDLE_FLAG); + assert_invariant(id & HANDLE_HEAP_FLAG); void* p = nullptr; auto& overflowMap = mOverflowMap; diff --git a/libs/utils/include/utils/Allocator.h b/libs/utils/include/utils/Allocator.h index 2b02eb4cc0f..073206f48ba 100644 --- a/libs/utils/include/utils/Allocator.h +++ b/libs/utils/include/utils/Allocator.h @@ -250,11 +250,11 @@ class FreeList { return mHead; } -private: struct Node { Node* next; }; +private: static Node* init(void* begin, void* end, size_t elementSize, size_t alignment, size_t extra) noexcept; @@ -272,8 +272,8 @@ class AtomicFreeList { AtomicFreeList() noexcept = default; AtomicFreeList(void* begin, void* end, size_t elementSize, size_t alignment, size_t extra) noexcept; - AtomicFreeList(const FreeList& rhs) = delete; - AtomicFreeList& operator=(const FreeList& rhs) = delete; + AtomicFreeList(const AtomicFreeList& rhs) = delete; + AtomicFreeList& operator=(const AtomicFreeList& rhs) = delete; void* pop() noexcept { Node* const pStorage = mStorage; @@ -319,7 +319,6 @@ class AtomicFreeList { return mStorage + mHead.load(std::memory_order_relaxed).offset; } -private: struct Node { // This should be a regular (non-atomic) pointer, but this causes TSAN to complain // about a data-race that exists but is benin. We always use this atomic<> in @@ -350,6 +349,7 @@ class AtomicFreeList { std::atomic next; }; +private: // This struct is using a 32-bit offset into the arena rather than // a direct pointer, because together with the 32-bit tag, it needs to // fit into 8 bytes. 
If it was any larger, it would not be possible to @@ -372,7 +372,8 @@ template < size_t OFFSET = 0, typename FREELIST = FreeList> class PoolAllocator { - static_assert(ELEMENT_SIZE >= sizeof(void*), "ELEMENT_SIZE must accommodate at least a pointer"); + static_assert(ELEMENT_SIZE >= sizeof(typename FREELIST::Node), + "ELEMENT_SIZE must accommodate at least a FreeList::Node"); public: // our allocator concept void* alloc(size_t size = ELEMENT_SIZE, @@ -636,6 +637,15 @@ class Arena { mListener(name, mArea.data(), mArea.size()) { } + template + void* alloc(size_t size, size_t alignment, size_t extra, ARGS&& ... args) noexcept { + std::lock_guard guard(mLock); + void* p = mAllocator.alloc(size, alignment, extra, std::forward(args) ...); + mListener.onAlloc(p, size, alignment, extra); + return p; + } + + // allocate memory from arena with given size and alignment // (acceptable size/alignment may depend on the allocator provided) void* alloc(size_t size, size_t alignment, size_t extra) noexcept { @@ -668,13 +678,13 @@ class Arena { return (T*)alloc(count * sizeof(T), alignment); } - // return memory pointed by p to the arena - // (actual behaviour may depend on allocator provided) - void free(void* p) noexcept { + // some allocators require more parameters + template + void free(void* p, size_t size, ARGS&& ... args) noexcept { if (p) { std::lock_guard guard(mLock); - mListener.onFree(p); - mAllocator.free(p); + mListener.onFree(p, size); + mAllocator.free(p, size, std::forward(args) ...); } } @@ -687,6 +697,16 @@ class Arena { } } + // return memory pointed by p to the arena + // (actual behaviour may depend on allocator provided) + void free(void* p) noexcept { + if (p) { + std::lock_guard guard(mLock); + mListener.onFree(p); + mAllocator.free(p); + } + } + // some allocators don't have a free() call, but a single reset() or rewind() instead void reset() noexcept { std::lock_guard guard(mLock); From 653a0159910baab2767fc202e40948e51ab27f07 Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Fri, 9 Feb 2024 15:02:32 -0800 Subject: [PATCH 07/19] fix uninitialized memory access --- filament/backend/src/HandleAllocator.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/filament/backend/src/HandleAllocator.cpp b/filament/backend/src/HandleAllocator.cpp index 12bd26b84c9..bf8e779614c 100644 --- a/filament/backend/src/HandleAllocator.cpp +++ b/filament/backend/src/HandleAllocator.cpp @@ -25,13 +25,14 @@ #include #include -#include - #include #include #include #include +#include +#include + namespace filament::backend { using namespace utils; @@ -55,6 +56,10 @@ HandleAllocator::Allocator::Allocator(AreaPolicy::HeapArea const& ar << maxHeapSize << " from " << area.size() << io::endl; } + // make sure we start with a clean arena. This is needed to ensure that all blocks start + // with an age of 0. + memset(area.data(), 0, maxHeapSize); + // size the different pools so that they can all contain the same number of handles size_t const count = maxHeapSize / (P0 + P1 + P2); char* const p0 = static_cast(area.begin()); From a27260b87f5ff12a6ce5d2e3f68a2377983d71f5 Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Fri, 9 Feb 2024 14:44:17 -0800 Subject: [PATCH 08/19] lazy initialization of the ShadowMap cache in ShadowMapManager This reduces resource utilisation for Views that never need shadows. It saves a UBO, two Entities and about 10KB memory. We also lazily allocate the debugging DataSource, which saves about 10K per View in debug builds. 
Overall this change makes "simple" Views less than 4KB heavy down from about 24KB (debug, 14KB release). The main changes: - ShadowMapManager is now allocated lazily - the ShadowMap cache object is also allocated lazily - debug DataSource is allocated lazily - ShadowMaps are prepared/initialized with a Builder, which makes it clearer that some APIs are only for preparing the ShadowMap cache. --- filament/include/filament/View.h | 2 +- filament/src/ShadowMap.cpp | 25 ++++- filament/src/ShadowMap.h | 18 ++- filament/src/ShadowMapManager.cpp | 148 ++++++++++++++++++------- filament/src/ShadowMapManager.h | 77 ++++++++++--- filament/src/View.cpp | 4 +- filament/src/details/DebugRegistry.cpp | 23 +++- filament/src/details/DebugRegistry.h | 13 ++- filament/src/details/View.cpp | 86 ++++++++------ filament/src/details/View.h | 12 +- libs/filamentapp/src/FilamentApp.cpp | 11 +- libs/math/include/math/mat3.h | 8 +- 12 files changed, 320 insertions(+), 107 deletions(-) diff --git a/filament/include/filament/View.h b/filament/include/filament/View.h index e4ba827aad2..3cdd527fac7 100644 --- a/filament/include/filament/View.h +++ b/filament/include/filament/View.h @@ -719,7 +719,7 @@ class UTILS_PUBLIC View : public FilamentAPI { void setDebugCamera(Camera* UTILS_NULLABLE camera) noexcept; //! debugging: returns a Camera from the point of view of *the* dominant directional light used for shadowing. - Camera const* UTILS_NULLABLE getDirectionalLightCamera() const noexcept; + Camera const* UTILS_NULLABLE getDirectionalShadowCamera() const noexcept; /** Result of a picking query */ diff --git a/filament/src/ShadowMap.cpp b/filament/src/ShadowMap.cpp index e4d7f67be01..7015e14466f 100644 --- a/filament/src/ShadowMap.cpp +++ b/filament/src/ShadowMap.cpp @@ -16,19 +16,39 @@ #include "ShadowMap.h" -#include "RenderPass.h" +#include +#include +#include #include "components/LightManager.h" +#include "details/DebugRegistry.h" #include "details/Engine.h" #include "details/Scene.h" +#include #include +#include #include +#include +#include #include +#include +#include +#include +#include +#include + +#include +#include +#include #include +#include + +#include +#include using namespace utils; @@ -239,7 +259,8 @@ ShadowMap::ShaderParameters ShadowMap::updateDirectional(FEngine& engine, ShadowMap::ShaderParameters ShadowMap::updatePunctual( mat4f const& Mv, float outerConeAngle, float nearPlane, float farPlane, const ShadowMapInfo& shadowMapInfo, const FLightManager::ShadowParams& params) noexcept { - const mat4f Mp = mat4f::perspective(outerConeAngle * f::RAD_TO_DEG * 2.0f, 1.0f, nearPlane, farPlane); + const mat4f Mp = mat4f::perspective( + outerConeAngle * f::RAD_TO_DEG * 2.0f, 1.0f, nearPlane, farPlane); assert_invariant(shadowMapInfo.textureDimension == mOptions->mapSize); diff --git a/filament/src/ShadowMap.h b/filament/src/ShadowMap.h index ce1ee860622..d0ca26945f9 100644 --- a/filament/src/ShadowMap.h +++ b/filament/src/ShadowMap.h @@ -17,20 +17,30 @@ #ifndef TNT_FILAMENT_DETAILS_SHADOWMAP_H #define TNT_FILAMENT_DETAILS_SHADOWMAP_H -#include "components/LightManager.h" +#include +#include "Culler.h" #include "PerShadowMapUniforms.h" #include "details/Camera.h" #include "details/Scene.h" +#include "components/LightManager.h" + #include +#include -#include -#include +#include -#include +#include +#include #include +#include + +#include + +#include +#include namespace filament { diff --git a/filament/src/ShadowMapManager.cpp b/filament/src/ShadowMapManager.cpp index ebf88427774..f2c7c603190 100644 --- 
a/filament/src/ShadowMapManager.cpp +++ b/filament/src/ShadowMapManager.cpp @@ -15,25 +15,49 @@ */ #include "ShadowMapManager.h" - #include "RenderPass.h" #include "ShadowMap.h" +#include +#include +#include + +#include + +#include "components/RenderableManager.h" + +#include "details/Camera.h" #include "details/DebugRegistry.h" #include "details/Texture.h" #include "details/View.h" -#include +#include "fg/FrameGraph.h" +#include "fg/FrameGraphId.h" +#include "fg/FrameGraphRenderPass.h" +#include "fg/FrameGraphTexture.h" +#include #include +#include +#include +#include #include #include -#include +#include +#include +#include +#include + +#include +#include #include #include +#include +#include + namespace filament { using namespace backend; @@ -41,15 +65,6 @@ using namespace math; ShadowMapManager::ShadowMapManager(FEngine& engine) : mEngine(engine) { - // initialize our ShadowMap array in-place - UTILS_NOUNROLL - for (auto& entry : mShadowMapCache) { - new (&entry) ShadowMap(engine); - } - - mShadowUbh = engine.getDriverApi().createBufferObject(mShadowUb.getSize(), - BufferObjectBinding::UNIFORM, BufferUsage::DYNAMIC); - FDebugRegistry& debugRegistry = engine.getDebugRegistry(); debugRegistry.registerProperty("d.shadowmap.visualize_cascades", &engine.debug.shadowmap.visualize_cascades); @@ -59,25 +74,75 @@ ShadowMapManager::ShadowMapManager(FEngine& engine) ShadowMapManager::~ShadowMapManager() { // destroy the ShadowMap array in-place - UTILS_NOUNROLL - for (auto& entry : mShadowMapCache) { - std::destroy_at(std::launder(reinterpret_cast(&entry))); + if (UTILS_UNLIKELY(mInitialized)) { + UTILS_NOUNROLL + for (auto& entry: mShadowMapCache) { + std::destroy_at(std::launder(reinterpret_cast(&entry))); + } } } -void ShadowMapManager::terminate(FEngine& engine) { - DriverApi& driver = engine.getDriverApi(); - driver.destroyBufferObject(mShadowUbh); - UTILS_NOUNROLL - for (auto& entry : mShadowMapCache) { - std::launder(reinterpret_cast(&entry))->terminate(engine); +void ShadowMapManager::createIfNeeded(FEngine& engine, + std::unique_ptr& inOutShadowMapManager) { + if (UTILS_UNLIKELY(!inOutShadowMapManager)) { + inOutShadowMapManager.reset(new ShadowMapManager(engine)); } } +void ShadowMapManager::terminate(FEngine& engine, + std::unique_ptr& shadowMapManager) { + if (shadowMapManager) { + shadowMapManager->terminate(engine); + } +} -ShadowMapManager::ShadowTechnique ShadowMapManager::update(FEngine& engine, FView& view, +void ShadowMapManager::terminate(FEngine& engine) { + if (UTILS_UNLIKELY(mInitialized)) { + DriverApi& driver = engine.getDriverApi(); + driver.destroyBufferObject(mShadowUbh); + UTILS_NOUNROLL + for (auto& entry: mShadowMapCache) { + std::launder(reinterpret_cast(&entry))->terminate(engine); + } + } +} + +ShadowMapManager::ShadowTechnique ShadowMapManager::update( + Builder const& builder, + FEngine& engine, FView& view, CameraInfo const& cameraInfo, FScene::RenderableSoa& renderableData, FScene::LightSoa const& lightData) noexcept { + + if (!builder.mDirectionalShadowMapCount && !builder.mSpotShadowMapCount) { + // no shadows were recorder + return ShadowTechnique::NONE; + } + + // initialize the shadowmap array the first time + if (UTILS_UNLIKELY(!mInitialized)) { + mInitialized = true; + // initialize our ShadowMap array in-place + mShadowUbh = engine.getDriverApi().createBufferObject(mShadowUb.getSize(), + BufferObjectBinding::UNIFORM, BufferUsage::DYNAMIC); + UTILS_NOUNROLL + for (auto& entry: mShadowMapCache) { + new(&entry) ShadowMap(engine); + } + } + + 
mDirectionalShadowMapCount = builder.mDirectionalShadowMapCount; + mSpotShadowMapCount = builder.mSpotShadowMapCount; + + for (auto const& entry : builder.mShadowMaps) { + auto& shadowMap = getShadowMap(entry.shadowIndex); + shadowMap.initialize( + entry.lightIndex, + entry.shadowType, + entry.shadowIndex, + entry.face, + entry.options); + } + ShadowTechnique shadowTechnique = {}; calculateTextureRequirements(engine, view, lightData); @@ -96,42 +161,49 @@ ShadowMapManager::ShadowTechnique ShadowMapManager::update(FEngine& engine, FVie return shadowTechnique; } -void ShadowMapManager::reset() noexcept { - mDirectionalShadowMapCount = 0; - mSpotShadowMapCount = 0; -} - -void ShadowMapManager::setDirectionalShadowMap(size_t lightIndex, +ShadowMapManager::Builder& ShadowMapManager::Builder::directionalShadowMap(size_t lightIndex, LightManager::ShadowOptions const* options) noexcept { assert_invariant(options->shadowCascades <= CONFIG_MAX_SHADOW_CASCADES); - // this updates getCascadedShadowMap() mDirectionalShadowMapCount = options->shadowCascades; - utils::Slice cascadedShadowMap = getCascadedShadowMap(); for (size_t c = 0; c < options->shadowCascades; c++) { - ShadowMap& shadowMap = cascadedShadowMap[c]; - shadowMap.initialize(lightIndex, ShadowType::DIRECTIONAL, c, 0, options); + mShadowMaps.push_back({ + .lightIndex = lightIndex, + .shadowType = ShadowType::DIRECTIONAL, + .shadowIndex = uint8_t(c), + .face = 0, + .options = options }); } + return *this; } -void ShadowMapManager::addShadowMap(size_t lightIndex, bool spotlight, +ShadowMapManager::Builder& ShadowMapManager::Builder::shadowMap(size_t lightIndex, bool spotlight, LightManager::ShadowOptions const* options) noexcept { if (spotlight) { const size_t c = mSpotShadowMapCount++; const size_t i = c + CONFIG_MAX_SHADOW_CASCADES; assert_invariant(i < CONFIG_MAX_SHADOWMAPS); - auto& shadowMap = getShadowMap(i); - shadowMap.initialize(lightIndex, ShadowType::SPOT, i, 0, options); + mShadowMaps.push_back({ + .lightIndex = lightIndex, + .shadowType = ShadowType::SPOT, + .shadowIndex = uint8_t(i), + .face = 0, + .options = options }); } else { // point-light, generate 6 independent shadowmaps for (size_t face = 0; face < 6; face++) { const size_t c = mSpotShadowMapCount++; const size_t i = c + CONFIG_MAX_SHADOW_CASCADES; assert_invariant(i < CONFIG_MAX_SHADOWMAPS); - auto& shadowMap = getShadowMap(i); - shadowMap.initialize(lightIndex, ShadowType::POINT, i, face, options); + mShadowMaps.push_back({ + .lightIndex = lightIndex, + .shadowType = ShadowType::POINT, + .shadowIndex = uint8_t(i), + .face = uint8_t(face), + .options = options }); } } + return *this; } FrameGraphId ShadowMapManager::render(FEngine& engine, FrameGraph& fg, @@ -773,7 +845,7 @@ void ShadowMapManager::preparePointShadowMap(ShadowMap& shadowMap, FEngine& engine, FView& view, CameraInfo const& mainCameraInfo, FScene::RenderableSoa& renderableData, utils::Range range, FScene::LightSoa& lightData, - ShadowMap::SceneInfo const& sceneInfo) noexcept { + ShadowMap::SceneInfo const&) noexcept { const uint8_t face = shadowMap.getFace(); const size_t lightIndex = shadowMap.getLightIndex(); diff --git a/filament/src/ShadowMapManager.h b/filament/src/ShadowMapManager.h index 6a6f752e42a..8f154a4d3b4 100644 --- a/filament/src/ShadowMapManager.h +++ b/filament/src/ShadowMapManager.h @@ -17,29 +17,48 @@ #ifndef TNT_FILAMENT_DETAILS_SHADOWMAPMANAGER_H #define TNT_FILAMENT_DETAILS_SHADOWMAPMANAGER_H -#include - +#include "Culler.h" #include "ShadowMap.h" #include "TypedUniformBuffer.h" 
+#include +#include + +#include +#include + +#include "components/RenderableManager.h" + #include "details/Engine.h" #include "details/Scene.h" -#include +#include "fg/FrameGraphId.h" +#include "fg/FrameGraphTexture.h" -#include #include #include +#include +#include +#include +#include #include -#include +#include +#include #include #include +#include +#include +#include + +#include +#include namespace filament { +class FCamera; class FView; class FrameGraph; class RenderPass; @@ -55,7 +74,7 @@ struct ShadowMappingUniforms { class ShadowMapManager { public: - using ShadowMappingUniforms = ShadowMappingUniforms; + using ShadowMappingUniforms = filament::ShadowMappingUniforms; using ShadowType = ShadowMap::ShadowType; @@ -65,24 +84,42 @@ class ShadowMapManager { SCREEN_SPACE = 0x2u, }; + class Builder { + friend class ShadowMapManager; + uint32_t mDirectionalShadowMapCount = 0; + uint32_t mSpotShadowMapCount = 0; + struct ShadowMap { + size_t lightIndex; + ShadowType shadowType; + uint16_t shadowIndex; + uint8_t face; + LightManager::ShadowOptions const* options; + }; + std::vector mShadowMaps; + public: + Builder& directionalShadowMap(size_t lightIndex, + LightManager::ShadowOptions const* options) noexcept; - explicit ShadowMapManager(FEngine& engine); - ~ShadowMapManager(); + Builder& shadowMap(size_t lightIndex, bool spotlight, + LightManager::ShadowOptions const* options) noexcept; - void terminate(FEngine& engine); + bool hasShadowMaps() const noexcept { + return mDirectionalShadowMapCount || mSpotShadowMapCount; + } + }; - // Reset shadow map layout. - void reset() noexcept; + ~ShadowMapManager(); - void setDirectionalShadowMap(size_t lightIndex, - LightManager::ShadowOptions const* options) noexcept; + static void createIfNeeded(FEngine& engine, + std::unique_ptr& inOutShadowMapManager); - void addShadowMap(size_t lightIndex, bool spotlight, - LightManager::ShadowOptions const* options) noexcept; + static void terminate(FEngine& engine, + std::unique_ptr& shadowMapManager); // Updates all the shadow maps and performs culling. // Returns true if any of the shadow maps have visible shadows. 
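+ // Typical calling sequence (a sketch following FView::prepareShadowing further down in this
+ // patch; variable names are placeholders):
+ //     ShadowMapManager::Builder builder;
+ //     builder.directionalShadowMap(0, &shadowOptions);
+ //     builder.shadowMap(lightIndex, isSpotLight, &spotShadowOptions);
+ //     if (builder.hasShadowMaps()) {
+ //         ShadowMapManager::createIfNeeded(engine, mShadowMapManager);
+ //         auto technique = mShadowMapManager->update(builder, engine, view,
+ //                 cameraInfo, renderableData, lightData);
+ //     }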
- ShadowMapManager::ShadowTechnique update(FEngine& engine, FView& view, + ShadowMapManager::ShadowTechnique update(Builder const& builder, + FEngine& engine, FView& view, CameraInfo const& cameraInfo, FScene::RenderableSoa& renderableData, FScene::LightSoa const& lightData) noexcept; @@ -101,11 +138,16 @@ class ShadowMapManager { bool hasSpotShadows() const { return !mSpotShadowMapCount; } // for debugging only - FCamera const* getDirectionalLightCamera() const noexcept { + FCamera const* getDirectionalShadowCamera() const noexcept { + if (!mInitialized) return nullptr; return getShadowMap(0).getDebugCamera(); } private: + explicit ShadowMapManager(FEngine& engine); + + void terminate(FEngine& engine); + ShadowMapManager::ShadowTechnique updateCascadeShadowMaps(FEngine& engine, FView& view, CameraInfo cameraInfo, FScene::RenderableSoa& renderableData, FScene::LightSoa const& lightData, ShadowMap::SceneInfo sceneInfo) noexcept; @@ -190,6 +232,7 @@ class ShadowMapManager { ShadowMapCacheContainer mShadowMapCache; uint32_t mDirectionalShadowMapCount = 0; uint32_t mSpotShadowMapCount = 0; + bool mInitialized = false; ShadowMap& getShadowMap(size_t index) noexcept { assert_invariant(index < CONFIG_MAX_SHADOWMAPS); diff --git a/filament/src/View.cpp b/filament/src/View.cpp index bc5da818290..2de966ea0c9 100644 --- a/filament/src/View.cpp +++ b/filament/src/View.cpp @@ -67,8 +67,8 @@ const char* View::getName() const noexcept { return downcast(this)->getName(); } -Camera const* View::getDirectionalLightCamera() const noexcept { - return downcast(this)->getDirectionalLightCamera(); +Camera const* View::getDirectionalShadowCamera() const noexcept { + return downcast(this)->getDirectionalShadowCamera(); } void View::setShadowingEnabled(bool enabled) noexcept { diff --git a/filament/src/details/DebugRegistry.cpp b/filament/src/details/DebugRegistry.cpp index decd59610a9..ad1a54df74a 100644 --- a/filament/src/details/DebugRegistry.cpp +++ b/filament/src/details/DebugRegistry.cpp @@ -16,12 +16,18 @@ #include "details/DebugRegistry.h" +#include +#include #include #include #include #include +#include +#include +#include + #ifndef NDEBUG # define DEBUG_PROPERTIES_WRITABLE true #else @@ -120,12 +126,25 @@ void FDebugRegistry::registerDataSource(std::string_view name, } } +void FDebugRegistry::registerDataSource(std::string_view name, + utils::Invocable&& creator) noexcept { + mDataSourceCreatorMap[name] = std::move(creator); +} + DebugRegistry::DataSource FDebugRegistry::getDataSource(const char* name) const noexcept { std::string_view const key{ name }; auto& dataSourceMap = mDataSourceMap; auto const& it = dataSourceMap.find(key); - if (it == dataSourceMap.end()) { - return { nullptr, 0u }; + if (UTILS_UNLIKELY(it == dataSourceMap.end())) { + auto& dataSourceCreatorMap = mDataSourceCreatorMap; + auto const& pos = dataSourceCreatorMap.find(key); + if (pos == dataSourceCreatorMap.end()) { + return { nullptr, 0u }; + } + DataSource dataSource{ pos->second() }; + dataSourceMap[key] = dataSource; + dataSourceCreatorMap.erase(pos); + return dataSource; } return it->second; } diff --git a/filament/src/details/DebugRegistry.h b/filament/src/details/DebugRegistry.h index 94dfec19414..b60a1c69949 100644 --- a/filament/src/details/DebugRegistry.h +++ b/filament/src/details/DebugRegistry.h @@ -22,12 +22,17 @@ #include #include +#include + +#include #include #include #include #include +#include + namespace filament { class FEngine; @@ -95,8 +100,13 @@ class FDebugRegistry : public DebugRegistry { 
registerProperty(name, p, FLOAT4, std::move(fn)); } + // registers a DataSource directly void registerDataSource(std::string_view name, void const* data, size_t count) noexcept; + // registers a DataSource lazily + void registerDataSource(std::string_view name, + utils::Invocable&& creator) noexcept; + #if !defined(_MSC_VER) private: #endif @@ -113,7 +123,8 @@ class FDebugRegistry : public DebugRegistry { void const* getPropertyAddress(const char* name) const noexcept; DataSource getDataSource(const char* name) const noexcept; std::unordered_map mPropertyMap; - std::unordered_map mDataSourceMap; + mutable std::unordered_map mDataSourceMap; + mutable std::unordered_map> mDataSourceCreatorMap; }; FILAMENT_DOWNCAST(DebugRegistry) diff --git a/filament/src/details/View.cpp b/filament/src/details/View.cpp index c3ddfe63678..8188390f7b5 100644 --- a/filament/src/details/View.cpp +++ b/filament/src/details/View.cpp @@ -20,6 +20,7 @@ #include "Froxelizer.h" #include "RenderPrimitive.h" #include "ResourceAllocator.h" +#include "ShadowMapManager.h" #include "details/Engine.h" #include "details/IndirectLight.h" @@ -43,6 +44,7 @@ #include #include +#include #include using namespace utils; @@ -59,8 +61,8 @@ FView::FView(FEngine& engine) : mFroxelizer(engine), mFogEntity(engine.getEntityManager().create()), mIsStereoSupported(engine.getDriverApi().isStereoSupported()), - mPerViewUniforms(engine), - mShadowMapManager(engine) { + mPerViewUniforms(engine) { + DriverApi& driver = engine.getDriverApi(); FDebugRegistry& debugRegistry = engine.getDebugRegistry(); @@ -76,7 +78,11 @@ FView::FView(FEngine& engine) #ifndef NDEBUG debugRegistry.registerDataSource("d.view.frame_info", - mDebugFrameHistory.data(), mDebugFrameHistory.size()); + [this]() -> DebugRegistry::DataSource { + assert_invariant(!mDebugFrameHistory); + mDebugFrameHistory = std::make_unique>(); + return { mDebugFrameHistory->data(), mDebugFrameHistory->size() }; + }); debugRegistry.registerProperty("d.view.pid.kp", &engine.debug.view.pid.kp); debugRegistry.registerProperty("d.view.pid.ki", &engine.debug.view.pid.ki); debugRegistry.registerProperty("d.view.pid.kd", &engine.debug.view.pid.kd); @@ -113,7 +119,8 @@ void FView::terminate(FEngine& engine) { driver.destroyBufferObject(mLightUbh); driver.destroyBufferObject(mRenderableUbh); drainFrameHistory(engine); - mShadowMapManager.terminate(engine); + + ShadowMapManager::terminate(engine, mShadowMapManager); mPerViewUniforms.terminate(driver); mFroxelizer.terminate(driver); @@ -242,21 +249,24 @@ float2 FView::updateScale(FEngine& engine, #ifndef NDEBUG // only for debugging... 
- using duration_ms = std::chrono::duration; - const float target = (1000.0f * float(frameRateOptions.interval)) / displayInfo.refreshRate; - const float targetWithHeadroom = target * (1.0f - frameRateOptions.headRoomRatio); - std::move(mDebugFrameHistory.begin() + 1, - mDebugFrameHistory.end(), mDebugFrameHistory.begin()); - mDebugFrameHistory.back() = { - .target = target, - .targetWithHeadroom = targetWithHeadroom, - .frameTime = std::chrono::duration_cast(info.frameTime).count(), - .frameTimeDenoised = std::chrono::duration_cast(info.denoisedFrameTime).count(), - .scale = mScale.x * mScale.y, - .pid_e = mPidController.getError(), - .pid_i = mPidController.getIntegral(), - .pid_d = mPidController.getDerivative() - }; + if (mDebugFrameHistory) { + using namespace std::chrono; + using duration_ms = duration; + const float target = (1000.0f * float(frameRateOptions.interval)) / displayInfo.refreshRate; + const float targetWithHeadroom = target * (1.0f - frameRateOptions.headRoomRatio); + std::move(mDebugFrameHistory->begin() + 1, + mDebugFrameHistory->end(), mDebugFrameHistory->begin()); + mDebugFrameHistory->back() = { + .target = target, + .targetWithHeadroom = targetWithHeadroom, + .frameTime = duration_cast(info.frameTime).count(), + .frameTimeDenoised = duration_cast(info.denoisedFrameTime).count(), + .scale = mScale.x * mScale.y, + .pid_e = mPidController.getError(), + .pid_i = mPidController.getIntegral(), + .pid_d = mPidController.getDerivative() + }; + } #endif return mScale; @@ -281,10 +291,10 @@ void FView::prepareShadowing(FEngine& engine, FScene::RenderableSoa& renderableD return; } - mShadowMapManager.reset(); - auto& lcm = engine.getLightManager(); + ShadowMapManager::Builder builder; + // dominant directional light is always as index 0 FLightManager::Instance const directionalLight = lightData.elementAt(0); const bool hasDirectionalShadows = directionalLight && lcm.isShadowCaster(directionalLight); @@ -292,7 +302,7 @@ void FView::prepareShadowing(FEngine& engine, FScene::RenderableSoa& renderableD const auto& shadowOptions = lcm.getShadowOptions(directionalLight); assert_invariant(shadowOptions.shadowCascades >= 1 && shadowOptions.shadowCascades <= CONFIG_MAX_SHADOW_CASCADES); - mShadowMapManager.setDirectionalShadowMap(0, &shadowOptions); + builder.directionalShadowMap(0, &shadowOptions); } // Find all shadow-casting spotlights. 
@@ -326,7 +336,7 @@ void FView::prepareShadowing(FEngine& engine, FScene::RenderableSoa& renderableD if (shadowMapCount + shadowMapCountNeeded <= CONFIG_MAX_SHADOWMAPS) { shadowMapCount += shadowMapCountNeeded; const auto& shadowOptions = lcm.getShadowOptions(li); - mShadowMapManager.addShadowMap(l, spotLight, &shadowOptions); + builder.shadowMap(l, spotLight, &shadowOptions); } if (shadowMapCount >= CONFIG_MAX_SHADOWMAPS) { @@ -334,11 +344,14 @@ void FView::prepareShadowing(FEngine& engine, FScene::RenderableSoa& renderableD } } - auto shadowTechnique = mShadowMapManager.update(engine, *this, cameraInfo, - renderableData, lightData); + if (builder.hasShadowMaps()) { + ShadowMapManager::createIfNeeded(engine, mShadowMapManager); + auto shadowTechnique = mShadowMapManager->update(builder, engine, *this, + cameraInfo, renderableData, lightData); - mHasShadowing = any(shadowTechnique); - mNeedsShadowMap = any(shadowTechnique & ShadowMapManager::ShadowTechnique::SHADOW_MAP); + mHasShadowing = any(shadowTechnique); + mNeedsShadowMap = any(shadowTechnique & ShadowMapManager::ShadowTechnique::SHADOW_MAP); + } } void FView::prepareLighting(FEngine& engine, CameraInfo const& cameraInfo) noexcept { @@ -613,7 +626,7 @@ void FView::prepare(FEngine& engine, DriverApi& driver, RootArenaScope& rootAren uint32_t(endDirCastersOnly - beginRenderables)}; merged = { 0, uint32_t(endPotentialSpotCastersOnly - beginRenderables) }; - if (!mShadowMapManager.hasSpotShadows()) { + if (!needsShadowMap() || !mShadowMapManager->hasSpotShadows()) { // we know we don't have spot shadows, we can reduce the range to not even include // the potential spot casters merged = { 0, uint32_t(endDirCastersOnly - beginRenderables) }; @@ -679,8 +692,11 @@ void FView::bindPerViewUniformsAndSamplers(FEngine::DriverApi& driver) const noe driver.bindUniformBuffer(+UniformBindingPoints::LIGHTS, mLightUbh); - driver.bindUniformBuffer(+UniformBindingPoints::SHADOW, - mShadowMapManager.getShadowUniformsHandle()); + if (needsShadowMap()) { + assert_invariant(mShadowMapManager->getShadowUniformsHandle()); + driver.bindUniformBuffer(+UniformBindingPoints::SHADOW, + mShadowMapManager->getShadowUniformsHandle()); + } driver.bindUniformBuffer(+UniformBindingPoints::FROXEL_RECORDS, mFroxelizer.getRecordBuffer()); @@ -781,7 +797,12 @@ void FView::prepareStructure(Handle structure) const noexcept { } void FView::prepareShadow(Handle texture) const noexcept { - const auto& uniforms = mShadowMapManager.getShadowMappingUniforms(); + // when needsShadowMap() is not set, this method only just sets a dummy texture + // in the needed samplers (in that case `texture` is actually a dummy texture). 
+ ShadowMapManager::ShadowMappingUniforms uniforms; + if (needsShadowMap()) { + uniforms = mShadowMapManager->getShadowMappingUniforms(); + } switch (mShadowType) { case filament::ShadowType::PCF: mPerViewUniforms.prepareShadowPCF(texture, uniforms); @@ -979,7 +1000,8 @@ void FView::updatePrimitivesLod(FEngine& engine, const CameraInfo&, FrameGraphId FView::renderShadowMaps(FEngine& engine, FrameGraph& fg, CameraInfo const& cameraInfo, float4 const& userTime, RenderPassBuilder const& passBuilder) noexcept { - return mShadowMapManager.render(engine, fg, passBuilder, *this, cameraInfo, userTime); + assert_invariant(needsShadowMap()); + return mShadowMapManager->render(engine, fg, passBuilder, *this, cameraInfo, userTime); } void FView::commitFrameHistory(FEngine& engine) noexcept { diff --git a/filament/src/details/View.h b/filament/src/details/View.h index d3b2c59fa73..6680306296d 100644 --- a/filament/src/details/View.h +++ b/filament/src/details/View.h @@ -53,6 +53,9 @@ #include #include +#include +#include + namespace utils { class JobSystem; } // namespace utils; @@ -198,8 +201,9 @@ class FView : public View { void setStereoscopicOptions(StereoscopicOptions const& options) noexcept; - FCamera const* getDirectionalLightCamera() const noexcept { - return mShadowMapManager.getDirectionalLightCamera(); + FCamera const* getDirectionalShadowCamera() const noexcept { + if (!mShadowMapManager) return nullptr; + return mShadowMapManager->getDirectionalShadowCamera(); } void setRenderTarget(FRenderTarget* renderTarget) noexcept { @@ -555,7 +559,7 @@ class FView : public View { mutable bool mHasShadowing = false; mutable bool mNeedsShadowMap = false; - ShadowMapManager mShadowMapManager; + std::unique_ptr mShadowMapManager; std::array mMaterialGlobals = {{ { 0, 0, 0, 1 }, @@ -565,7 +569,7 @@ class FView : public View { }}; #ifndef NDEBUG - std::array mDebugFrameHistory; + std::unique_ptr> mDebugFrameHistory; #endif }; diff --git a/libs/filamentapp/src/FilamentApp.cpp b/libs/filamentapp/src/FilamentApp.cpp index 108bb0ac535..7e315b0da47 100644 --- a/libs/filamentapp/src/FilamentApp.cpp +++ b/libs/filamentapp/src/FilamentApp.cpp @@ -430,8 +430,10 @@ void FilamentApp::run(const Config& config, SetupCallback setupCallback, window->mDebugCamera->lookAt(eye, center, up); // Update the cube distortion matrix used for frustum visualization. 
- const Camera* lightmapCamera = window->mMainView->getView()->getDirectionalLightCamera(); - lightmapCube->mapFrustum(*mEngine, lightmapCamera); + const Camera* lightmapCamera = window->mMainView->getView()->getDirectionalShadowCamera(); + if (lightmapCamera) { + lightmapCube->mapFrustum(*mEngine, lightmapCamera); + } cameraCube->mapFrustum(*mEngine, window->mMainCamera); // Delay rendering for roughly one monitor refresh interval @@ -713,7 +715,10 @@ FilamentApp::Window::Window(FilamentApp* filamentApp, mGodView->setCameraManipulator(mDebugCameraMan); // Ortho view obviously uses an ortho camera - mOrthoView->setCamera( (Camera *)mMainView->getView()->getDirectionalLightCamera() ); + Camera const* debugDirectionalShadowCamera = mMainView->getView()->getDirectionalShadowCamera(); + if (debugDirectionalShadowCamera) { + mOrthoView->setCamera(const_cast(debugDirectionalShadowCamera)); + } } // configure the cameras diff --git a/libs/math/include/math/mat3.h b/libs/math/include/math/mat3.h index 5ad06bdf4bc..035865fe2bb 100644 --- a/libs/math/include/math/mat3.h +++ b/libs/math/include/math/mat3.h @@ -17,15 +17,21 @@ #ifndef TNT_MATH_MAT3_H #define TNT_MATH_MAT3_H -#include #include #include #include +#include +#include #include #include #include +#include + +#include +#include + namespace filament { namespace math { // ------------------------------------------------------------------------------------- From e912dc2dc5f984ca5fdb165183eb1e3a2323f2ed Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Mon, 12 Feb 2024 12:55:11 -0800 Subject: [PATCH 09/19] PipelineCache didn't need to store a copy of RasterState --- filament/backend/src/vulkan/VulkanDriver.cpp | 42 +++++++++---------- .../src/vulkan/VulkanPipelineCache.cpp | 24 +---------- .../backend/src/vulkan/VulkanPipelineCache.h | 10 ----- 3 files changed, 23 insertions(+), 53 deletions(-) diff --git a/filament/backend/src/vulkan/VulkanDriver.cpp b/filament/backend/src/vulkan/VulkanDriver.cpp index 3f48b44ad10..c700ce9c6a3 100644 --- a/filament/backend/src/vulkan/VulkanDriver.cpp +++ b/filament/backend/src/vulkan/VulkanDriver.cpp @@ -1676,26 +1676,26 @@ void VulkanDriver::draw(PipelineState pipelineState, Handle r // Update the VK raster state. const VulkanRenderTarget* rt = mCurrentRenderPass.renderTarget; - auto vkraster = mPipelineCache.getCurrentRasterState(); - vkraster.cullMode = getCullMode(rasterState.culling); - vkraster.frontFace = getFrontFace(rasterState.inverseFrontFaces); - vkraster.depthBiasEnable = (depthOffset.constant || depthOffset.slope) ? true : false; - vkraster.depthBiasConstantFactor = depthOffset.constant; - vkraster.depthBiasSlopeFactor = depthOffset.slope; - vkraster.blendEnable = rasterState.hasBlending(); - vkraster.srcColorBlendFactor = getBlendFactor(rasterState.blendFunctionSrcRGB); - vkraster.dstColorBlendFactor = getBlendFactor(rasterState.blendFunctionDstRGB); - vkraster.colorBlendOp = rasterState.blendEquationRGB; - vkraster.srcAlphaBlendFactor = getBlendFactor(rasterState.blendFunctionSrcAlpha); - vkraster.dstAlphaBlendFactor = getBlendFactor(rasterState.blendFunctionDstAlpha); - vkraster.alphaBlendOp = rasterState.blendEquationAlpha; - vkraster.colorWriteMask = (VkColorComponentFlags) (rasterState.colorWrite ? 
0xf : 0x0); - vkraster.depthWriteEnable = rasterState.depthWrite; - vkraster.depthCompareOp = rasterState.depthFunc; - vkraster.rasterizationSamples = rt->getSamples(); - vkraster.alphaToCoverageEnable = rasterState.alphaToCoverage; - vkraster.colorTargetCount = rt->getColorTargetCount(mCurrentRenderPass); - mPipelineCache.setCurrentRasterState(vkraster); + VulkanPipelineCache::RasterState const vulkanRasterState{ + .cullMode = getCullMode(rasterState.culling), + .frontFace = getFrontFace(rasterState.inverseFrontFaces), + .depthBiasEnable = (depthOffset.constant || depthOffset.slope) ? true : false, + .blendEnable = rasterState.hasBlending(), + .depthWriteEnable = rasterState.depthWrite, + .alphaToCoverageEnable = rasterState.alphaToCoverage, + .srcColorBlendFactor = getBlendFactor(rasterState.blendFunctionSrcRGB), + .dstColorBlendFactor = getBlendFactor(rasterState.blendFunctionDstRGB), + .srcAlphaBlendFactor = getBlendFactor(rasterState.blendFunctionSrcAlpha), + .dstAlphaBlendFactor = getBlendFactor(rasterState.blendFunctionDstAlpha), + .colorWriteMask = (VkColorComponentFlags) (rasterState.colorWrite ? 0xf : 0x0), + .rasterizationSamples = rt->getSamples(), + .colorTargetCount = rt->getColorTargetCount(mCurrentRenderPass), + .colorBlendOp = rasterState.blendEquationRGB, + .alphaBlendOp = rasterState.blendEquationAlpha, + .depthCompareOp = rasterState.depthFunc, + .depthBiasConstantFactor = depthOffset.constant, + .depthBiasSlopeFactor = depthOffset.slope + }; // Declare fixed-size arrays that get passed to the pipeCache and to vkCmdBindVertexBuffers. uint32_t const bufferCount = prim.vertexBuffer->attributes.size(); @@ -1706,7 +1706,7 @@ void VulkanDriver::draw(PipelineState pipelineState, Handle r // Push state changes to the VulkanPipelineCache instance. This is fast and does not make VK calls. mPipelineCache.bindProgram(program); - mPipelineCache.bindRasterState(mPipelineCache.getCurrentRasterState()); + mPipelineCache.bindRasterState(vulkanRasterState); mPipelineCache.bindPrimitiveTopology(prim.primitiveTopology); mPipelineCache.bindVertexArray(attribDesc, bufferDesc, bufferCount); diff --git a/filament/backend/src/vulkan/VulkanPipelineCache.cpp b/filament/backend/src/vulkan/VulkanPipelineCache.cpp index 889888cd083..2d976f66ff2 100644 --- a/filament/backend/src/vulkan/VulkanPipelineCache.cpp +++ b/filament/backend/src/vulkan/VulkanPipelineCache.cpp @@ -34,8 +34,6 @@ using namespace bluevk; namespace filament::backend { -static VulkanPipelineCache::RasterState createDefaultRasterState(); - static VkShaderStageFlags getShaderStageFlags(VulkanPipelineCache::UsageFlags key, uint16_t binding) { // NOTE: if you modify this function, you also need to modify getUsageFlags. 
assert_invariant(binding < MAX_SAMPLER_COUNT); @@ -73,8 +71,7 @@ VulkanPipelineCache::UsageFlags VulkanPipelineCache::disableUsageFlags(uint16_t } VulkanPipelineCache::VulkanPipelineCache(VulkanResourceAllocator* allocator) - : mCurrentRasterState(createDefaultRasterState()), - mResourceAllocator(allocator), + : mResourceAllocator(allocator), mPipelineBoundResources(allocator) { mDummyBufferWriteInfo.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; mDummyBufferWriteInfo.pNext = nullptr; @@ -569,7 +566,7 @@ void VulkanPipelineCache::bindProgram(VulkanProgram* program) noexcept { } void VulkanPipelineCache::bindRasterState(const RasterState& rasterState) noexcept { - mPipelineRequirements.rasterState = mCurrentRasterState = rasterState; + mPipelineRequirements.rasterState = rasterState; } void VulkanPipelineCache::bindRenderPass(VkRenderPass renderPass, int subpassIndex) noexcept { @@ -917,23 +914,6 @@ bool VulkanPipelineCache::DescEqual::operator()(const DescriptorKey& k1, return true; } -static VulkanPipelineCache::RasterState createDefaultRasterState() { - return VulkanPipelineCache::RasterState { - .cullMode = VK_CULL_MODE_NONE, - .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE, - .depthBiasEnable = VK_FALSE, - .blendEnable = VK_FALSE, - .depthWriteEnable = VK_TRUE, - .alphaToCoverageEnable = true, - .colorWriteMask = 0xf, - .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, - .colorTargetCount = 1, - .depthCompareOp = SamplerCompareFunc::LE, - .depthBiasConstantFactor = 0.0f, - .depthBiasSlopeFactor = 0.0f, - }; -} - } // namespace filament::backend #pragma clang diagnostic pop diff --git a/filament/backend/src/vulkan/VulkanPipelineCache.h b/filament/backend/src/vulkan/VulkanPipelineCache.h index 018fd00efec..a28327c5d9e 100644 --- a/filament/backend/src/vulkan/VulkanPipelineCache.h +++ b/filament/backend/src/vulkan/VulkanPipelineCache.h @@ -199,15 +199,6 @@ class VulkanPipelineCache : public CommandBufferObserver { mPipelineBoundResources.acquire(resource); } - inline RasterState getCurrentRasterState() const noexcept { - return mCurrentRasterState; - } - - // We need to update this outside of bindRasterState due to VulkanDriver::draw. - inline void setCurrentRasterState(RasterState const& rasterState) noexcept { - mCurrentRasterState = rasterState; - } - private: // PIPELINE LAYOUT CACHE KEY // ------------------------- @@ -413,7 +404,6 @@ class VulkanPipelineCache : public CommandBufferObserver { VmaAllocator mAllocator = VK_NULL_HANDLE; // Current requirements for the pipeline layout, pipeline, and descriptor sets. - RasterState mCurrentRasterState; PipelineKey mPipelineRequirements = {}; DescriptorKey mDescriptorRequirements = {}; From 9c6020a77a30222ebb85fa2bb43288ed15b10408 Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Mon, 12 Feb 2024 12:56:16 -0800 Subject: [PATCH 10/19] Make VulkanResourceBase 8 bytes instead of 16. --- filament/backend/src/vulkan/VulkanResources.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/filament/backend/src/vulkan/VulkanResources.h b/filament/backend/src/vulkan/VulkanResources.h index 77b6498b860..9421e11a14d 100644 --- a/filament/backend/src/vulkan/VulkanResources.h +++ b/filament/backend/src/vulkan/VulkanResources.h @@ -63,7 +63,8 @@ struct VulkanResourceBase { explicit VulkanResourceBase(VulkanResourceType type) : mRefCount(IS_HEAP_ALLOC_TYPE(type) ? 
1 : 0), mType(type), - mHandleId(0) {} + mHandleId(0) { + } private: inline VulkanResourceType getType() { @@ -82,6 +83,7 @@ struct VulkanResourceBase { if (IS_HEAP_ALLOC_TYPE(mType)) { return; } + assert_invariant(mRefCount < ((1<<24) - 1)); ++mRefCount; } @@ -89,6 +91,7 @@ struct VulkanResourceBase { if (IS_HEAP_ALLOC_TYPE(mType)) { return; } + assert_invariant(mRefCount > 0); --mRefCount; } @@ -96,8 +99,8 @@ struct VulkanResourceBase { return mRefCount; } - size_t mRefCount = 0; - VulkanResourceType mType = VulkanResourceType::BUFFER_OBJECT; + uint32_t mRefCount : 24; // 16M is enough for the refcount + VulkanResourceType mType : 8; HandleBase::HandleId mHandleId; friend struct VulkanThreadSafeResource; From 6ccfeddf26cf85db0df21ea92aa0d620159ea4c2 Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Mon, 12 Feb 2024 16:02:20 -0800 Subject: [PATCH 11/19] fix a typo that broke the resourceallocator cache the cache size is given in MiB not bytes, so we needed to convert it to bytes. --- filament/src/ResourceAllocator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filament/src/ResourceAllocator.cpp b/filament/src/ResourceAllocator.cpp index eeff5011a64..b8d1e6698e5 100644 --- a/filament/src/ResourceAllocator.cpp +++ b/filament/src/ResourceAllocator.cpp @@ -109,7 +109,7 @@ size_t ResourceAllocator::TextureKey::getSize() const noexcept { } ResourceAllocator::ResourceAllocator(Engine::Config const& config, DriverApi& driverApi) noexcept - : mCacheCapacity(config.resourceAllocatorCacheSizeMB), + : mCacheCapacity(config.resourceAllocatorCacheSizeMB << 20), mCacheMaxAge(config.resourceAllocatorCacheMaxAge), mBackend(driverApi) { } From a9e8f40287a6d067b2f2884f56308f4b67a10fb3 Mon Sep 17 00:00:00 2001 From: Sungun Park Date: Tue, 13 Feb 2024 00:44:36 +0000 Subject: [PATCH 12/19] Release Filament 1.50.3 --- README.md | 4 ++-- RELEASE_NOTES.md | 3 +++ android/gradle.properties | 2 +- ios/CocoaPods/Filament.podspec | 4 ++-- web/filament-js/package.json | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 01bfe81f16f..bb2c074eb15 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ repositories { } dependencies { - implementation 'com.google.android.filament:filament-android:1.50.2' + implementation 'com.google.android.filament:filament-android:1.50.3' } ``` @@ -51,7 +51,7 @@ Here are all the libraries available in the group `com.google.android.filament`: iOS projects can use CocoaPods to install the latest release: ```shell -pod 'Filament', '~> 1.50.2' +pod 'Filament', '~> 1.50.3' ``` ### Snapshots diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 209a0afdec2..f7fa37e7f58 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -7,6 +7,9 @@ A new header is inserted each time a *tag* is created. Instead, if you are authoring a PR for the main branch, add your release note to [NEW_RELEASE_NOTES.md](./NEW_RELEASE_NOTES.md). +## v1.50.4 + + ## v1.50.3 diff --git a/android/gradle.properties b/android/gradle.properties index e9b4bb3f457..53e64758c22 100644 --- a/android/gradle.properties +++ b/android/gradle.properties @@ -1,5 +1,5 @@ GROUP=com.google.android.filament -VERSION_NAME=1.50.2 +VERSION_NAME=1.50.3 POM_DESCRIPTION=Real-time physically based rendering engine for Android. 
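[Editor's note] A minimal sketch of the space optimization in PATCH 10 above: a 24-bit reference count and an 8-bit type share one 32-bit word, so the resource header fits in 8 bytes next to a 32-bit handle id. Names below are illustrative stand-ins, not the real VulkanResourceBase.

    #include <cassert>
    #include <cstdint>

    enum class ResourceTypeSketch : uint8_t { BUFFER_OBJECT, TEXTURE /* ... */ };

    struct ResourceBaseSketch {
        explicit ResourceBaseSketch(ResourceTypeSketch t) noexcept
                : refCount(0), type(uint32_t(t)), handleId(0) {}

        void ref() noexcept   { assert(refCount < (1u << 24) - 1u); ++refCount; }  // guard against wrap-around
        void unref() noexcept { assert(refCount > 0); --refCount; }                // guard against underflow

        uint32_t refCount : 24;  // ~16M references is plenty
        uint32_t type     : 8;   // holds a ResourceTypeSketch value
        uint32_t handleId;
    };

    // Two 32-bit words total, down from 16 bytes with a size_t refcount.
    static_assert(sizeof(ResourceBaseSketch) == 8, "expected to pack into 8 bytes");

The same range also carries the one-line MiB fix (PATCH 11): a capacity configured in MiB must be shifted into bytes before it is compared against byte counts, i.e. `resourceAllocatorCacheSizeMB << 20` (1 MiB = 1,048,576 bytes).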
diff --git a/ios/CocoaPods/Filament.podspec b/ios/CocoaPods/Filament.podspec index ebefa012fb3..5b53805fa63 100644 --- a/ios/CocoaPods/Filament.podspec +++ b/ios/CocoaPods/Filament.podspec @@ -1,12 +1,12 @@ Pod::Spec.new do |spec| spec.name = "Filament" - spec.version = "1.50.2" + spec.version = "1.50.3" spec.license = { :type => "Apache 2.0", :file => "LICENSE" } spec.homepage = "https://google.github.io/filament" spec.authors = "Google LLC." spec.summary = "Filament is a real-time physically based rendering engine for Android, iOS, Windows, Linux, macOS, and WASM/WebGL." spec.platform = :ios, "11.0" - spec.source = { :http => "https://github.com/google/filament/releases/download/v1.50.2/filament-v1.50.2-ios.tgz" } + spec.source = { :http => "https://github.com/google/filament/releases/download/v1.50.3/filament-v1.50.3-ios.tgz" } # Fix linking error with Xcode 12; we do not yet support the simulator on Apple silicon. spec.pod_target_xcconfig = { diff --git a/web/filament-js/package.json b/web/filament-js/package.json index c7ad9bd95fc..7ebc0a0ff4b 100644 --- a/web/filament-js/package.json +++ b/web/filament-js/package.json @@ -1,6 +1,6 @@ { "name": "filament", - "version": "1.50.2", + "version": "1.50.3", "description": "Real-time physically based rendering engine", "main": "filament.js", "module": "filament.js", From 70b87510a2d3f1731b3d8cd5fc808917a32b272e Mon Sep 17 00:00:00 2001 From: Sungun Park Date: Tue, 13 Feb 2024 00:45:01 +0000 Subject: [PATCH 13/19] Bump version to 1.50.4 --- README.md | 4 ++-- android/gradle.properties | 2 +- ios/CocoaPods/Filament.podspec | 4 ++-- web/filament-js/package.json | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index bb2c074eb15..66f0976391a 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ repositories { } dependencies { - implementation 'com.google.android.filament:filament-android:1.50.3' + implementation 'com.google.android.filament:filament-android:1.50.4' } ``` @@ -51,7 +51,7 @@ Here are all the libraries available in the group `com.google.android.filament`: iOS projects can use CocoaPods to install the latest release: ```shell -pod 'Filament', '~> 1.50.3' +pod 'Filament', '~> 1.50.4' ``` ### Snapshots diff --git a/android/gradle.properties b/android/gradle.properties index 53e64758c22..34340a20b8a 100644 --- a/android/gradle.properties +++ b/android/gradle.properties @@ -1,5 +1,5 @@ GROUP=com.google.android.filament -VERSION_NAME=1.50.3 +VERSION_NAME=1.50.4 POM_DESCRIPTION=Real-time physically based rendering engine for Android. diff --git a/ios/CocoaPods/Filament.podspec b/ios/CocoaPods/Filament.podspec index 5b53805fa63..df0911beb7d 100644 --- a/ios/CocoaPods/Filament.podspec +++ b/ios/CocoaPods/Filament.podspec @@ -1,12 +1,12 @@ Pod::Spec.new do |spec| spec.name = "Filament" - spec.version = "1.50.3" + spec.version = "1.50.4" spec.license = { :type => "Apache 2.0", :file => "LICENSE" } spec.homepage = "https://google.github.io/filament" spec.authors = "Google LLC." spec.summary = "Filament is a real-time physically based rendering engine for Android, iOS, Windows, Linux, macOS, and WASM/WebGL." spec.platform = :ios, "11.0" - spec.source = { :http => "https://github.com/google/filament/releases/download/v1.50.3/filament-v1.50.3-ios.tgz" } + spec.source = { :http => "https://github.com/google/filament/releases/download/v1.50.4/filament-v1.50.4-ios.tgz" } # Fix linking error with Xcode 12; we do not yet support the simulator on Apple silicon. 
spec.pod_target_xcconfig = { diff --git a/web/filament-js/package.json b/web/filament-js/package.json index 7ebc0a0ff4b..52b661bd10e 100644 --- a/web/filament-js/package.json +++ b/web/filament-js/package.json @@ -1,6 +1,6 @@ { "name": "filament", - "version": "1.50.3", + "version": "1.50.4", "description": "Real-time physically based rendering engine", "main": "filament.js", "module": "filament.js", From b48b6136baa4f3a18fa6db7e41c6ac0a725fb98b Mon Sep 17 00:00:00 2001 From: Powei Feng Date: Fri, 16 Feb 2024 13:06:01 -0800 Subject: [PATCH 14/19] geometry: properly reference memcpy usage (#7576) --- libs/geometry/src/MikktspaceImpl.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/geometry/src/MikktspaceImpl.cpp b/libs/geometry/src/MikktspaceImpl.cpp index f9f54c332cb..19942ada3bd 100644 --- a/libs/geometry/src/MikktspaceImpl.cpp +++ b/libs/geometry/src/MikktspaceImpl.cpp @@ -19,12 +19,13 @@ #include #include - #include #include #include +#include // memcpy + namespace filament::geometry { using namespace filament::math; @@ -98,7 +99,7 @@ void MikktspaceImpl::setTSpaceBasic(SMikkTSpaceContext const* context, float con cursor += 36; for (auto [attribArray, attribStride, attribSize]: wrapper->mInputAttribArrays) { uint8_t const* input = pointerAdd(attribArray, vertInd, attribStride); - std::memcpy(cursor, input, attribSize); + memcpy(cursor, input, attribSize); cursor += attribSize; } } From fadd5eb953deb4eed64d3fc84965acda05d2c0e1 Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Wed, 21 Feb 2024 10:50:14 -0800 Subject: [PATCH 15/19] fix an uninitialized memory access when no renderables are visible --- filament/src/RenderPass.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/filament/src/RenderPass.cpp b/filament/src/RenderPass.cpp index b585c4d328b..d5063043f5a 100644 --- a/filament/src/RenderPass.cpp +++ b/filament/src/RenderPass.cpp @@ -164,6 +164,11 @@ void RenderPass::appendCommands(FEngine& engine, // trace the number of visible renderables SYSTRACE_VALUE32("visibleRenderables", vr.size()); if (UTILS_UNLIKELY(vr.empty())) { + // no renderables, we still need the sentinel and the command buffer size should be + // exactly 1.
+ assert_invariant(commands.size() == 1); + Command* curr = commands.data(); + curr->key = uint64_t(Pass::SENTINEL); return; } From 3e644b25f09ecbe2e06a3d8ca664dda9d054f701 Mon Sep 17 00:00:00 2001 From: Benjamin Doherty Date: Wed, 21 Feb 2024 13:32:30 -0800 Subject: [PATCH 16/19] Fix an out-of-bounds memory access when no renderables are visible --- filament/src/RenderPass.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/filament/src/RenderPass.cpp b/filament/src/RenderPass.cpp index d5063043f5a..5282fb32ca4 100644 --- a/filament/src/RenderPass.cpp +++ b/filament/src/RenderPass.cpp @@ -99,6 +99,10 @@ RenderPass::RenderPass(FEngine& engine, RenderPassBuilder const& builder) noexce mScissorViewport(builder.mScissorViewport), mCustomCommands(engine.getPerRenderPassArena()) { + if (mVisibleRenderables.empty()) { + return; + } + // compute the number of commands we need updateSummedPrimitiveCounts( const_cast(mRenderableSoa), mVisibleRenderables); From 9e119937afcf3a88537a5d1048d630755a6f5d20 Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Wed, 21 Feb 2024 23:51:57 -0800 Subject: [PATCH 17/19] Better fix for OOB when we have no renderables The OOB would happen if the scene never had any renderables: in that case the scene's SoA would stay unallocated, but the summedAreaTable code relies on it having a capacity of at least 1. It was incorrect to skip the RenderPass entirely because it might have had some custom commands that needed to be executed (e.g. for applying post-processing in subpass mode). --- filament/src/RenderPass.cpp | 4 ---- filament/src/details/Scene.cpp | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/filament/src/RenderPass.cpp b/filament/src/RenderPass.cpp index 5282fb32ca4..d5063043f5a 100644 --- a/filament/src/RenderPass.cpp +++ b/filament/src/RenderPass.cpp @@ -99,10 +99,6 @@ RenderPass::RenderPass(FEngine& engine, RenderPassBuilder const& builder) noexce mScissorViewport(builder.mScissorViewport), mCustomCommands(engine.getPerRenderPassArena()) { - if (mVisibleRenderables.empty()) { - return; - } - // compute the number of commands we need updateSummedPrimitiveCounts( const_cast(mRenderableSoa), mVisibleRenderables); diff --git a/filament/src/details/Scene.cpp b/filament/src/details/Scene.cpp index 21840fa60cf..ff6af6293a5 100644 --- a/filament/src/details/Scene.cpp +++ b/filament/src/details/Scene.cpp @@ -148,7 +148,7 @@ void FScene::prepare(utils::JobSystem& js, // TODO: the resize below could happen in a job - if (sceneData.size() != renderableInstances.size()) { + if (!sceneData.capacity() || sceneData.size() != renderableInstances.size()) { sceneData.clear(); if (sceneData.capacity() < renderableDataCapacity) { sceneData.setCapacity(renderableDataCapacity); From 65dfac963706ef9e963baca301dd99dc11f42b80 Mon Sep 17 00:00:00 2001 From: Sungun Park Date: Wed, 14 Feb 2024 04:40:30 +0000 Subject: [PATCH 18/19] Add stereoscopic type to Engine::Config (#7574) * Add stereoscopic type to Engine::Config This new type value will determine the algorithm used when stereoscopic rendering is enabled.
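[Editor's note] An aside on the RenderPass fixes above (PATCHes 15 through 17) before the stereoscopic diff itself. The three commits converge on this behavior: the pass is not skipped (custom commands such as post-processing subpasses must still run), the scene's SoA is guaranteed a non-zero capacity, and an empty visible range still writes exactly one SENTINEL command so command iteration terminates on initialized data. A minimal sketch follows; types, names and the sentinel value are simplified stand-ins, not Filament's actual encodings.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    using CommandKey = uint64_t;
    // Illustrative terminator value; not Filament's actual Pass::SENTINEL encoding.
    constexpr CommandKey kSentinel = ~CommandKey(0);

    struct CommandSketch { CommandKey key; };

    void appendCommandsSketch(std::vector<CommandSketch>& commands, size_t visibleCount) {
        if (visibleCount == 0) {
            // No renderables: the buffer was sized for exactly one command, and that
            // command must be the sentinel rather than uninitialized memory.
            assert(commands.size() == 1);
            commands.front().key = kSentinel;
            return;
        }
        // ...otherwise generate commands for the visible renderables, then terminate:
        commands.push_back({ kSentinel });
    }

The Scene.cpp half of the final fix simply forces the first prepare() to allocate the SoA (the `!sceneData.capacity()` test), which is the capacity the summed-primitive-count code relies on.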
--- .../filament-android/src/main/cpp/Engine.cpp | 3 +- .../com/google/android/filament/Engine.java | 28 +++++++++++++++++-- .../backend/include/backend/DriverEnums.h | 8 ++++++ filament/include/filament/Engine.h | 14 ++++++++++ 4 files changed, 50 insertions(+), 3 deletions(-) diff --git a/android/filament-android/src/main/cpp/Engine.cpp b/android/filament-android/src/main/cpp/Engine.cpp index 05893cbd5bf..9a2180e022e 100644 --- a/android/filament-android/src/main/cpp/Engine.cpp +++ b/android/filament-android/src/main/cpp/Engine.cpp @@ -484,7 +484,7 @@ extern "C" JNIEXPORT void JNICALL Java_com_google_android_filament_Engine_nSetBu extern "C" JNIEXPORT void JNICALL Java_com_google_android_filament_Engine_nSetBuilderConfig(JNIEnv*, jclass, jlong nativeBuilder, jlong commandBufferSizeMB, jlong perRenderPassArenaSizeMB, jlong driverHandleArenaSizeMB, jlong minCommandBufferSizeMB, jlong perFrameCommandsSizeMB, - jlong jobSystemThreadCount, jlong stereoscopicEyeCount, + jlong jobSystemThreadCount, jint stereoscopicType, jlong stereoscopicEyeCount, jlong resourceAllocatorCacheSizeMB, jlong resourceAllocatorCacheMaxAge) { Engine::Builder* builder = (Engine::Builder*) nativeBuilder; Engine::Config config = { @@ -494,6 +494,7 @@ extern "C" JNIEXPORT void JNICALL Java_com_google_android_filament_Engine_nSetBu .minCommandBufferSizeMB = (uint32_t) minCommandBufferSizeMB, .perFrameCommandsSizeMB = (uint32_t) perFrameCommandsSizeMB, .jobSystemThreadCount = (uint32_t) jobSystemThreadCount, + .stereoscopicType = (Engine::StereoscopicType) stereoscopicType, .stereoscopicEyeCount = (uint8_t) stereoscopicEyeCount, .resourceAllocatorCacheSizeMB = (uint32_t) resourceAllocatorCacheSizeMB, .resourceAllocatorCacheMaxAge = (uint8_t) resourceAllocatorCacheMaxAge, diff --git a/android/filament-android/src/main/java/com/google/android/filament/Engine.java b/android/filament-android/src/main/java/com/google/android/filament/Engine.java index 6b4647e2ac1..393b711ac97 100644 --- a/android/filament-android/src/main/java/com/google/android/filament/Engine.java +++ b/android/filament-android/src/main/java/com/google/android/filament/Engine.java @@ -158,6 +158,16 @@ public enum FeatureLevel { FEATURE_LEVEL_3, }; + /** + * The type of technique for stereoscopic rendering + */ + public enum StereoscopicType { + /** Stereoscopic rendering is performed using instanced rendering technique. */ + INSTANCED, + /** Stereoscopic rendering is performed using the multiview feature from the graphics backend. */ + MULTIVIEW, + }; + /** * Constructs Engine objects using a builder pattern. */ @@ -211,7 +221,8 @@ public Builder config(Config config) { nSetBuilderConfig(mNativeBuilder, config.commandBufferSizeMB, config.perRenderPassArenaSizeMB, config.driverHandleArenaSizeMB, config.minCommandBufferSizeMB, config.perFrameCommandsSizeMB, - config.jobSystemThreadCount, config.stereoscopicEyeCount, + config.jobSystemThreadCount, + config.stereoscopicType.ordinal(), config.stereoscopicEyeCount, config.resourceAllocatorCacheSizeMB, config.resourceAllocatorCacheMaxAge); return this; } @@ -349,6 +360,19 @@ public static class Config { */ public long jobSystemThreadCount = 0; + /** + * The type of technique for stereoscopic rendering. + * + * This setting determines the algorithm used when stereoscopic rendering is enabled. This + * decision applies to the entire Engine for the lifetime of the Engine. E.g., multiple + * Views created from the Engine must use the same stereoscopic type. 
+ * + * Each view can enable stereoscopic rendering via the StereoscopicOptions::enable flag. + * + * @see View#setStereoscopicOptions + */ + public StereoscopicType stereoscopicType = StereoscopicType.INSTANCED; + /** * The number of eyes to render when stereoscopic rendering is enabled. Supported values are * between 1 and Engine#getMaxStereoscopicEyes() (inclusive). @@ -1240,7 +1264,7 @@ private static void assertDestroy(boolean success) { private static native void nSetBuilderConfig(long nativeBuilder, long commandBufferSizeMB, long perRenderPassArenaSizeMB, long driverHandleArenaSizeMB, long minCommandBufferSizeMB, long perFrameCommandsSizeMB, long jobSystemThreadCount, - long stereoscopicEyeCount, + int stereoscopicType, long stereoscopicEyeCount, long resourceAllocatorCacheSizeMB, long resourceAllocatorCacheMaxAge); private static native void nSetBuilderFeatureLevel(long nativeBuilder, int ordinal); private static native void nSetBuilderSharedContext(long nativeBuilder, long sharedContext); diff --git a/filament/backend/include/backend/DriverEnums.h b/filament/backend/include/backend/DriverEnums.h index aba2b404145..c41d1b83049 100644 --- a/filament/backend/include/backend/DriverEnums.h +++ b/filament/backend/include/backend/DriverEnums.h @@ -1212,6 +1212,14 @@ enum class Workaround : uint16_t { DISABLE_THREAD_AFFINITY }; +//! The type of technique for stereoscopic rendering +enum class StereoscopicType : uint8_t { + // Stereoscopic rendering is performed using instanced rendering technique. + INSTANCED, + // Stereoscopic rendering is performed using the multiview feature from the graphics backend. + MULTIVIEW, +}; + } // namespace filament::backend template<> struct utils::EnableBitMaskOperators diff --git a/filament/include/filament/Engine.h b/filament/include/filament/Engine.h index 904cbda4a3f..90cff6283dc 100644 --- a/filament/include/filament/Engine.h +++ b/filament/include/filament/Engine.h @@ -178,6 +178,7 @@ class UTILS_PUBLIC Engine { using Backend = backend::Backend; using DriverConfig = backend::Platform::DriverConfig; using FeatureLevel = backend::FeatureLevel; + using StereoscopicType = backend::StereoscopicType; /** * Config is used to define the memory footprint used by the engine, such as the @@ -297,6 +298,19 @@ class UTILS_PUBLIC Engine { */ size_t textureUseAfterFreePoolSize = 0; + /* + * The type of technique for stereoscopic rendering. + * + * This setting determines the algorithm used when stereoscopic rendering is enabled. This + * decision applies to the entire Engine for the lifetime of the Engine. E.g., multiple + * Views created from the Engine must use the same stereoscopic type. + * + * Each view can enable stereoscopic rendering via the StereoscopicOptions::enable flag. + * + * @see View::setStereoscopicOptions + */ + StereoscopicType stereoscopicType = StereoscopicType::INSTANCED; + /* * The number of eyes to render when stereoscopic rendering is enabled. Supported values are * between 1 and Engine::getMaxStereoscopicEyes() (inclusive). 
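[Editor's note] A short usage sketch for the two Config fields added in PATCH 18 above. The field and enum names come from the diff; the Engine::Builder::config() call is assumed from Filament's usual builder pattern and may differ in detail from your version.

    #include <filament/Engine.h>

    filament::Engine* createStereoEngineSketch() {
        filament::Engine::Config config{};
        // Engine-wide choice of stereoscopic technique; all Views created from this
        // Engine must use the same type.
        config.stereoscopicType = filament::Engine::StereoscopicType::INSTANCED; // or MULTIVIEW
        // Must be between 1 and Engine::getMaxStereoscopicEyes(), inclusive.
        config.stereoscopicEyeCount = 2;
        // Assumed builder call: Engine::Builder::config(const Config*).
        return filament::Engine::Builder()
                .config(&config)
                .build();
    }

Stereoscopic rendering itself is still opted into per View through StereoscopicOptions::enable, as the documentation in the hunks above notes.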
From af48bc3c74c32b5f9a704ab7ab424baf8090378b Mon Sep 17 00:00:00 2001 From: Mathias Agopian Date: Tue, 20 Feb 2024 16:16:52 -0800 Subject: [PATCH 19/19] add the disableParallelShaderCompile option to Engine::Config --- .../filament-android/src/main/cpp/Engine.cpp | 6 +++++- .../com/google/android/filament/Engine.java | 18 ++++++++++++++++++ filament/include/filament/Engine.h | 6 ++++++ filament/src/details/Engine.cpp | 8 ++++++-- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/android/filament-android/src/main/cpp/Engine.cpp b/android/filament-android/src/main/cpp/Engine.cpp index 9a2180e022e..80409702c37 100644 --- a/android/filament-android/src/main/cpp/Engine.cpp +++ b/android/filament-android/src/main/cpp/Engine.cpp @@ -484,7 +484,9 @@ extern "C" JNIEXPORT void JNICALL Java_com_google_android_filament_Engine_nSetBu extern "C" JNIEXPORT void JNICALL Java_com_google_android_filament_Engine_nSetBuilderConfig(JNIEnv*, jclass, jlong nativeBuilder, jlong commandBufferSizeMB, jlong perRenderPassArenaSizeMB, jlong driverHandleArenaSizeMB, jlong minCommandBufferSizeMB, jlong perFrameCommandsSizeMB, - jlong jobSystemThreadCount, jint stereoscopicType, jlong stereoscopicEyeCount, + jlong jobSystemThreadCount, + jlong textureUseAfterFreePoolSize, jboolean disableParallelShaderCompile, + jint stereoscopicType, jlong stereoscopicEyeCount, jlong resourceAllocatorCacheSizeMB, jlong resourceAllocatorCacheMaxAge) { Engine::Builder* builder = (Engine::Builder*) nativeBuilder; Engine::Config config = { @@ -494,6 +496,8 @@ extern "C" JNIEXPORT void JNICALL Java_com_google_android_filament_Engine_nSetBu .minCommandBufferSizeMB = (uint32_t) minCommandBufferSizeMB, .perFrameCommandsSizeMB = (uint32_t) perFrameCommandsSizeMB, .jobSystemThreadCount = (uint32_t) jobSystemThreadCount, + .textureUseAfterFreePoolSize = (uint32_t) textureUseAfterFreePoolSize, + .disableParallelShaderCompile = (bool) disableParallelShaderCompile, .stereoscopicType = (Engine::StereoscopicType) stereoscopicType, .stereoscopicEyeCount = (uint8_t) stereoscopicEyeCount, .resourceAllocatorCacheSizeMB = (uint32_t) resourceAllocatorCacheSizeMB, diff --git a/android/filament-android/src/main/java/com/google/android/filament/Engine.java b/android/filament-android/src/main/java/com/google/android/filament/Engine.java index 393b711ac97..06f0a2b8ae9 100644 --- a/android/filament-android/src/main/java/com/google/android/filament/Engine.java +++ b/android/filament-android/src/main/java/com/google/android/filament/Engine.java @@ -222,6 +222,7 @@ public Builder config(Config config) { config.perRenderPassArenaSizeMB, config.driverHandleArenaSizeMB, config.minCommandBufferSizeMB, config.perFrameCommandsSizeMB, config.jobSystemThreadCount, + config.textureUseAfterFreePoolSize, config.disableParallelShaderCompile, config.stereoscopicType.ordinal(), config.stereoscopicEyeCount, config.resourceAllocatorCacheSizeMB, config.resourceAllocatorCacheMaxAge); return this; @@ -360,6 +361,22 @@ public static class Config { */ public long jobSystemThreadCount = 0; + /** + * Number of most-recently destroyed textures to track for use-after-free. + * + * This will cause the backend to throw an exception when a texture is freed but still bound + * to a SamplerGroup and used in a draw call. 0 disables completely. + * + * Currently only respected by the Metal backend. + */ + public long textureUseAfterFreePoolSize = 0; + + /** + * Set to `true` to forcibly disable parallel shader compilation in the backend. + * Currently only honored by the GL backend. 
+ */ + public boolean disableParallelShaderCompile = false; + /** * The type of technique for stereoscopic rendering. * @@ -1264,6 +1281,7 @@ private static void assertDestroy(boolean success) { private static native void nSetBuilderConfig(long nativeBuilder, long commandBufferSizeMB, long perRenderPassArenaSizeMB, long driverHandleArenaSizeMB, long minCommandBufferSizeMB, long perFrameCommandsSizeMB, long jobSystemThreadCount, + long textureUseAfterFreePoolSize, boolean disableParallelShaderCompile, int stereoscopicType, long stereoscopicEyeCount, long resourceAllocatorCacheSizeMB, long resourceAllocatorCacheMaxAge); private static native void nSetBuilderFeatureLevel(long nativeBuilder, int ordinal); diff --git a/filament/include/filament/Engine.h b/filament/include/filament/Engine.h index 90cff6283dc..2f8c6d4af74 100644 --- a/filament/include/filament/Engine.h +++ b/filament/include/filament/Engine.h @@ -298,6 +298,12 @@ class UTILS_PUBLIC Engine { */ size_t textureUseAfterFreePoolSize = 0; + /** + * Set to `true` to forcibly disable parallel shader compilation in the backend. + * Currently only honored by the GL backend. + */ + bool disableParallelShaderCompile = false; + /* * The type of technique for stereoscopic rendering. * diff --git a/filament/src/details/Engine.cpp b/filament/src/details/Engine.cpp index 82b9399ec6b..2fce2349c39 100644 --- a/filament/src/details/Engine.cpp +++ b/filament/src/details/Engine.cpp @@ -98,7 +98,10 @@ Engine* FEngine::create(Engine::Builder const& builder) { return nullptr; } DriverConfig const driverConfig{ - .handleArenaSize = instance->getRequestedDriverHandleArenaSize() }; + .handleArenaSize = instance->getRequestedDriverHandleArenaSize(), + .textureUseAfterFreePoolSize = instance->getConfig().textureUseAfterFreePoolSize, + .disableParallelShaderCompile = instance->getConfig().disableParallelShaderCompile + }; instance->mDriver = platform->createDriver(sharedContext, driverConfig); } else { @@ -651,7 +654,8 @@ int FEngine::loop() { DriverConfig const driverConfig { .handleArenaSize = getRequestedDriverHandleArenaSize(), - .textureUseAfterFreePoolSize = mConfig.textureUseAfterFreePoolSize + .textureUseAfterFreePoolSize = mConfig.textureUseAfterFreePoolSize, + .disableParallelShaderCompile = mConfig.disableParallelShaderCompile }; mDriver = mPlatform->createDriver(mSharedGLContext, driverConfig);
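[Editor's note] To close, a sketch of the forwarding pattern the last hunks apply in both Engine creation paths (FEngine::create() and FEngine::loop()): the engine-level Config flags are copied field-by-field into the backend's DriverConfig. The structs below are reduced, hypothetical stand-ins for the real Engine::Config and backend DriverConfig; the patch itself repeats the aggregate initializer at both construction sites, whereas the sketch centralizes it in one helper to show the idea.

    #include <cstddef>

    struct EngineConfigSketch {
        size_t textureUseAfterFreePoolSize = 0;
        bool   disableParallelShaderCompile = false;
    };

    struct DriverConfigSketch {
        size_t handleArenaSize = 0;
        size_t textureUseAfterFreePoolSize = 0;      // use-after-free tracking (Metal backend)
        bool   disableParallelShaderCompile = false; // honored by the GL backend
    };

    // Building the backend config from the engine config in one place keeps the
    // two construction sites from drifting apart.
    DriverConfigSketch makeDriverConfigSketch(EngineConfigSketch const& cfg,
            size_t handleArenaSize) {
        return {
            .handleArenaSize = handleArenaSize,
            .textureUseAfterFreePoolSize = cfg.textureUseAfterFreePoolSize,
            .disableParallelShaderCompile = cfg.disableParallelShaderCompile,
        };
    }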