diff --git a/README.md b/README.md
index bb2c074eb15..66f0976391a 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ repositories {
}
dependencies {
- implementation 'com.google.android.filament:filament-android:1.50.3'
+ implementation 'com.google.android.filament:filament-android:1.50.4'
}
```
@@ -51,7 +51,7 @@ Here are all the libraries available in the group `com.google.android.filament`:
iOS projects can use CocoaPods to install the latest release:
```shell
-pod 'Filament', '~> 1.50.3'
+pod 'Filament', '~> 1.50.4'
```
### Snapshots
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 209a0afdec2..f7fa37e7f58 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -7,6 +7,9 @@ A new header is inserted each time a *tag* is created.
Instead, if you are authoring a PR for the main branch, add your release note to
[NEW_RELEASE_NOTES.md](./NEW_RELEASE_NOTES.md).
+## v1.50.4
+
+
## v1.50.3
diff --git a/android/filament-android/src/main/cpp/Engine.cpp b/android/filament-android/src/main/cpp/Engine.cpp
index 05893cbd5bf..80409702c37 100644
--- a/android/filament-android/src/main/cpp/Engine.cpp
+++ b/android/filament-android/src/main/cpp/Engine.cpp
@@ -484,7 +484,9 @@ extern "C" JNIEXPORT void JNICALL Java_com_google_android_filament_Engine_nSetBu
extern "C" JNIEXPORT void JNICALL Java_com_google_android_filament_Engine_nSetBuilderConfig(JNIEnv*,
jclass, jlong nativeBuilder, jlong commandBufferSizeMB, jlong perRenderPassArenaSizeMB,
jlong driverHandleArenaSizeMB, jlong minCommandBufferSizeMB, jlong perFrameCommandsSizeMB,
- jlong jobSystemThreadCount, jlong stereoscopicEyeCount,
+ jlong jobSystemThreadCount,
+ jlong textureUseAfterFreePoolSize, jboolean disableParallelShaderCompile,
+ jint stereoscopicType, jlong stereoscopicEyeCount,
jlong resourceAllocatorCacheSizeMB, jlong resourceAllocatorCacheMaxAge) {
Engine::Builder* builder = (Engine::Builder*) nativeBuilder;
Engine::Config config = {
@@ -494,6 +496,9 @@ extern "C" JNIEXPORT void JNICALL Java_com_google_android_filament_Engine_nSetBu
.minCommandBufferSizeMB = (uint32_t) minCommandBufferSizeMB,
.perFrameCommandsSizeMB = (uint32_t) perFrameCommandsSizeMB,
.jobSystemThreadCount = (uint32_t) jobSystemThreadCount,
+ .textureUseAfterFreePoolSize = (uint32_t) textureUseAfterFreePoolSize,
+ .disableParallelShaderCompile = (bool) disableParallelShaderCompile,
+ .stereoscopicType = (Engine::StereoscopicType) stereoscopicType,
.stereoscopicEyeCount = (uint8_t) stereoscopicEyeCount,
.resourceAllocatorCacheSizeMB = (uint32_t) resourceAllocatorCacheSizeMB,
.resourceAllocatorCacheMaxAge = (uint8_t) resourceAllocatorCacheMaxAge,
diff --git a/android/filament-android/src/main/java/com/google/android/filament/Engine.java b/android/filament-android/src/main/java/com/google/android/filament/Engine.java
index 6b4647e2ac1..06f0a2b8ae9 100644
--- a/android/filament-android/src/main/java/com/google/android/filament/Engine.java
+++ b/android/filament-android/src/main/java/com/google/android/filament/Engine.java
@@ -158,6 +158,16 @@ public enum FeatureLevel {
FEATURE_LEVEL_3,
};
+ /**
+ * The type of technique for stereoscopic rendering
+ */
+ public enum StereoscopicType {
+ /** Stereoscopic rendering is performed using instanced rendering technique. */
+ INSTANCED,
+ /** Stereoscopic rendering is performed using the multiview feature from the graphics backend. */
+ MULTIVIEW,
+ };
+
/**
* Constructs Engine
objects using a builder pattern.
*/
@@ -211,7 +221,9 @@ public Builder config(Config config) {
nSetBuilderConfig(mNativeBuilder, config.commandBufferSizeMB,
config.perRenderPassArenaSizeMB, config.driverHandleArenaSizeMB,
config.minCommandBufferSizeMB, config.perFrameCommandsSizeMB,
- config.jobSystemThreadCount, config.stereoscopicEyeCount,
+ config.jobSystemThreadCount,
+ config.textureUseAfterFreePoolSize, config.disableParallelShaderCompile,
+ config.stereoscopicType.ordinal(), config.stereoscopicEyeCount,
config.resourceAllocatorCacheSizeMB, config.resourceAllocatorCacheMaxAge);
return this;
}
@@ -349,6 +361,35 @@ public static class Config {
*/
public long jobSystemThreadCount = 0;
+ /**
+ * Number of most-recently destroyed textures to track for use-after-free.
+ *
+ * This will cause the backend to throw an exception when a texture is freed but still bound
+ * to a SamplerGroup and used in a draw call. A value of 0 disables this check completely.
+ *
+ * Currently only respected by the Metal backend.
+ */
+ public long textureUseAfterFreePoolSize = 0;
+
+ /**
+ * Set to `true` to forcibly disable parallel shader compilation in the backend.
+ * Currently only honored by the GL backend.
+ */
+ public boolean disableParallelShaderCompile = false;
+
+ /**
+ * The type of technique for stereoscopic rendering.
+ *
+ * This setting determines the algorithm used when stereoscopic rendering is enabled. This
+ * decision applies to the entire Engine for the lifetime of the Engine. E.g., multiple
+ * Views created from the Engine must use the same stereoscopic type.
+ *
+ * Each view can enable stereoscopic rendering via the StereoscopicOptions::enable flag.
+ *
+ * @see View#setStereoscopicOptions
+ */
+ public StereoscopicType stereoscopicType = StereoscopicType.INSTANCED;
+
/**
* The number of eyes to render when stereoscopic rendering is enabled. Supported values are
* between 1 and Engine#getMaxStereoscopicEyes() (inclusive).
@@ -1240,7 +1281,8 @@ private static void assertDestroy(boolean success) {
private static native void nSetBuilderConfig(long nativeBuilder, long commandBufferSizeMB,
long perRenderPassArenaSizeMB, long driverHandleArenaSizeMB,
long minCommandBufferSizeMB, long perFrameCommandsSizeMB, long jobSystemThreadCount,
- long stereoscopicEyeCount,
+ long textureUseAfterFreePoolSize, boolean disableParallelShaderCompile,
+ int stereoscopicType, long stereoscopicEyeCount,
long resourceAllocatorCacheSizeMB, long resourceAllocatorCacheMaxAge);
private static native void nSetBuilderFeatureLevel(long nativeBuilder, int ordinal);
private static native void nSetBuilderSharedContext(long nativeBuilder, long sharedContext);
diff --git a/android/gradle.properties b/android/gradle.properties
index 53e64758c22..34340a20b8a 100644
--- a/android/gradle.properties
+++ b/android/gradle.properties
@@ -1,5 +1,5 @@
GROUP=com.google.android.filament
-VERSION_NAME=1.50.3
+VERSION_NAME=1.50.4
POM_DESCRIPTION=Real-time physically based rendering engine for Android.
diff --git a/filament/backend/include/backend/DriverEnums.h b/filament/backend/include/backend/DriverEnums.h
index aba2b404145..c41d1b83049 100644
--- a/filament/backend/include/backend/DriverEnums.h
+++ b/filament/backend/include/backend/DriverEnums.h
@@ -1212,6 +1212,14 @@ enum class Workaround : uint16_t {
DISABLE_THREAD_AFFINITY
};
+//! The type of technique for stereoscopic rendering
+enum class StereoscopicType : uint8_t {
+ // Stereoscopic rendering is performed using instanced rendering technique.
+ INSTANCED,
+ // Stereoscopic rendering is performed using the multiview feature from the graphics backend.
+ MULTIVIEW,
+};
+
} // namespace filament::backend
template<> struct utils::EnableBitMaskOperators
diff --git a/filament/backend/include/backend/Handle.h b/filament/backend/include/backend/Handle.h
index 7b8846ba7bc..ffc16133fd2 100644
--- a/filament/backend/include/backend/Handle.h
+++ b/filament/backend/include/backend/Handle.h
@@ -62,14 +62,6 @@ class HandleBase {
// clear the handle, this doesn't free associated resources
void clear() noexcept { object = nullid; }
- // compare handles
- bool operator==(const HandleBase& rhs) const noexcept { return object == rhs.object; }
- bool operator!=(const HandleBase& rhs) const noexcept { return object != rhs.object; }
- bool operator<(const HandleBase& rhs) const noexcept { return object < rhs.object; }
- bool operator<=(const HandleBase& rhs) const noexcept { return object <= rhs.object; }
- bool operator>(const HandleBase& rhs) const noexcept { return object > rhs.object; }
- bool operator>=(const HandleBase& rhs) const noexcept { return object >= rhs.object; }
-
// get this handle's handleId
HandleId getId() const noexcept { return object; }
@@ -101,6 +93,14 @@ struct Handle : public HandleBase {
explicit Handle(HandleId id) noexcept : HandleBase(id) { }
+ // compare handles of the same type
+ bool operator==(const Handle& rhs) const noexcept { return getId() == rhs.getId(); }
+ bool operator!=(const Handle& rhs) const noexcept { return getId() != rhs.getId(); }
+ bool operator<(const Handle& rhs) const noexcept { return getId() < rhs.getId(); }
+ bool operator<=(const Handle& rhs) const noexcept { return getId() <= rhs.getId(); }
+ bool operator>(const Handle& rhs) const noexcept { return getId() > rhs.getId(); }
+ bool operator>=(const Handle& rhs) const noexcept { return getId() >= rhs.getId(); }
+
// type-safe Handle cast
template::value> >
Handle(Handle const& base) noexcept : HandleBase(base) { } // NOLINT(hicpp-explicit-conversions,google-explicit-constructor)
diff --git a/filament/backend/include/private/backend/CircularBuffer.h b/filament/backend/include/private/backend/CircularBuffer.h
index aae6e69c03b..7d2de52b009 100644
--- a/filament/backend/include/private/backend/CircularBuffer.h
+++ b/filament/backend/include/private/backend/CircularBuffer.h
@@ -17,7 +17,10 @@
#ifndef TNT_FILAMENT_BACKEND_PRIVATE_CIRCULARBUFFER_H
#define TNT_FILAMENT_BACKEND_PRIVATE_CIRCULARBUFFER_H
+#include
+
#include
+#include
namespace filament::backend {
@@ -37,28 +40,36 @@ class CircularBuffer {
~CircularBuffer() noexcept;
- // allocates 'size' bytes in the circular buffer and returns a pointer to the memory
- // return the current head and moves it forward by size bytes
- inline void* allocate(size_t size) noexcept {
+ static size_t getBlockSize() noexcept { return sPageSize; }
+
+ // Total size of circular buffer. This is a constant.
+ size_t size() const noexcept { return mSize; }
+
+ // Allocates `s` bytes in the circular buffer and returns a pointer to the memory. All
+ // allocations must not exceed size() bytes.
+ inline void* allocate(size_t s) noexcept {
+ // We can never allocate more than size().
+ assert_invariant(getUsed() + s <= size());
char* const cur = static_cast(mHead);
- mHead = cur + size;
+ mHead = cur + s;
return cur;
}
- // Total size of circular buffer
- size_t size() const noexcept { return mSize; }
-
- // returns true if the buffer is empty (e.g. after calling flush)
+ // Returns true if the buffer is empty, i.e.: no allocations were made since
+ // calling getBuffer().
bool empty() const noexcept { return mTail == mHead; }
- void* getHead() const noexcept { return mHead; }
-
- void* getTail() const noexcept { return mTail; }
+ // Returns the size used since the last call to getBuffer()
+ size_t getUsed() const noexcept { return intptr_t(mHead) - intptr_t(mTail); }
- // call at least once every getRequiredSize() bytes allocated from the buffer
- void circularize() noexcept;
-
- static size_t getBlockSize() noexcept { return sPageSize; }
+ // Retrieves the current allocated range and frees it. It is the responsibility of the caller
+ // to make sure the returned range is no longer in use by the time allocate() allocates
+ // (size() - getUsed()) bytes.
+ struct Range {
+ void* tail;
+ void* head;
+ };
+ Range getBuffer() noexcept;
private:
void* alloc(size_t size) noexcept;
@@ -66,10 +77,10 @@ class CircularBuffer {
// pointer to the beginning of the circular buffer (constant)
void* mData = nullptr;
- int mUsesAshmem = -1;
+ int mAshmemFd = -1;
// size of the circular buffer (constant)
- size_t mSize = 0;
+ size_t const mSize;
// pointer to the beginning of recorded data
void* mTail = nullptr;
diff --git a/filament/backend/include/private/backend/CommandBufferQueue.h b/filament/backend/include/private/backend/CommandBufferQueue.h
index 6a434477789..28122452386 100644
--- a/filament/backend/include/private/backend/CommandBufferQueue.h
+++ b/filament/backend/include/private/backend/CommandBufferQueue.h
@@ -33,7 +33,7 @@ namespace filament::backend {
* A producer-consumer command queue that uses a CircularBuffer as main storage
*/
class CommandBufferQueue {
- struct Slice {
+ struct Range {
void* begin;
void* end;
};
@@ -46,7 +46,7 @@ class CommandBufferQueue {
mutable utils::Mutex mLock;
mutable utils::Condition mCondition;
- mutable std::vector mCommandBuffersToExecute;
+ mutable std::vector mCommandBuffersToExecute;
size_t mFreeSpace = 0;
size_t mHighWatermark = 0;
uint32_t mExitRequested = 0;
@@ -58,17 +58,20 @@ class CommandBufferQueue {
CommandBufferQueue(size_t requiredSize, size_t bufferSize);
~CommandBufferQueue();
- CircularBuffer& getCircularBuffer() { return mCircularBuffer; }
+ CircularBuffer& getCircularBuffer() noexcept { return mCircularBuffer; }
+ CircularBuffer const& getCircularBuffer() const noexcept { return mCircularBuffer; }
+
+ size_t getCapacity() const noexcept { return mRequiredSize; }
size_t getHighWatermark() const noexcept { return mHighWatermark; }
// wait for commands to be available and returns an array containing these commands
- std::vector waitForCommands() const;
+ std::vector waitForCommands() const;
// return the memory used by this command buffer to the circular buffer
// WARNING: releaseBuffer() must be called in sequence of the Slices returned by
// waitForCommands()
- void releaseBuffer(Slice const& buffer);
+ void releaseBuffer(Range const& buffer);
// all commands buffers (Slices) written to this point are returned by waitForCommand(). This
// call blocks until the CircularBuffer has at least mRequiredSize bytes available.
diff --git a/filament/backend/include/private/backend/CommandStream.h b/filament/backend/include/private/backend/CommandStream.h
index be84b323ad0..985fa5fcd6e 100644
--- a/filament/backend/include/private/backend/CommandStream.h
+++ b/filament/backend/include/private/backend/CommandStream.h
@@ -213,6 +213,8 @@ class CommandStream {
CommandStream(CommandStream const& rhs) noexcept = delete;
CommandStream& operator=(CommandStream const& rhs) noexcept = delete;
+ CircularBuffer const& getCircularBuffer() const noexcept { return mCurrentBuffer; }
+
public:
#define DECL_DRIVER_API(methodName, paramsDecl, params) \
inline void methodName(paramsDecl) { \
diff --git a/filament/backend/include/private/backend/HandleAllocator.h b/filament/backend/include/private/backend/HandleAllocator.h
index aa5f53be695..04e66d85774 100644
--- a/filament/backend/include/private/backend/HandleAllocator.h
+++ b/filament/backend/include/private/backend/HandleAllocator.h
@@ -24,35 +24,31 @@
#include
#include
#include
+#include
#include
+#include
#include
#include
#include
+#include
#include
#include
-#if !defined(NDEBUG) && UTILS_HAS_RTTI
-# define HANDLE_TYPE_SAFETY 1
-#else
-# define HANDLE_TYPE_SAFETY 0
-#endif
-
-#define HandleAllocatorGL HandleAllocator<16, 64, 208>
-#define HandleAllocatorVK HandleAllocator<16, 64, 880>
-#define HandleAllocatorMTL HandleAllocator<16, 64, 584>
+#define HandleAllocatorGL HandleAllocator<16, 64, 208> // ~3640 / pool / MiB
+#define HandleAllocatorVK HandleAllocator<80, 176, 320> // ~1820 / pool / MiB
+#define HandleAllocatorMTL HandleAllocator<48, 160, 592> // ~1310 / pool / MiB
namespace filament::backend {
/*
* A utility class to efficiently allocate and manage Handle<>
*/
-template
+template
class HandleAllocator {
public:
-
HandleAllocator(const char* name, size_t size) noexcept;
HandleAllocator(HandleAllocator const& rhs) = delete;
HandleAllocator& operator=(HandleAllocator const& rhs) = delete;
@@ -70,14 +66,9 @@ class HandleAllocator {
*/
template
Handle allocateAndConstruct(ARGS&& ... args) noexcept {
- Handle h{ allocateHandle() };
+ Handle h{ allocateHandle() };
D* addr = handle_cast(h);
new(addr) D(std::forward(args)...);
-#if HANDLE_TYPE_SAFETY
- mLock.lock();
- mHandleTypeId[addr] = typeid(D).name();
- mLock.unlock();
-#endif
return h;
}
@@ -93,13 +84,7 @@ class HandleAllocator {
*/
template
Handle allocate() noexcept {
- Handle h{ allocateHandle() };
-#if HANDLE_TYPE_SAFETY
- D* addr = handle_cast(h);
- mLock.lock();
- mHandleTypeId[addr] = typeid(D).name();
- mLock.unlock();
-#endif
+ Handle h{ allocateHandle() };
return h;
}
@@ -116,17 +101,10 @@ class HandleAllocator {
assert_invariant(handle);
D* addr = handle_cast(const_cast&>(handle));
assert_invariant(addr);
-
// currently we implement construct<> with dtor+ctor, we could use operator= also
// but all our dtors are trivial, ~D() is actually a noop.
addr->~D();
new(addr) D(std::forward(args)...);
-
-#if HANDLE_TYPE_SAFETY
- mLock.lock();
- mHandleTypeId[addr] = typeid(D).name();
- mLock.unlock();
-#endif
return addr;
}
@@ -143,12 +121,6 @@ class HandleAllocator {
D* addr = handle_cast(const_cast&>(handle));
assert_invariant(addr);
new(addr) D(std::forward(args)...);
-
-#if HANDLE_TYPE_SAFETY
- mLock.lock();
- mHandleTypeId[addr] = typeid(D).name();
- mLock.unlock();
-#endif
return addr;
}
@@ -164,19 +136,8 @@ class HandleAllocator {
void deallocate(Handle& handle, D const* p) noexcept {
// allow to destroy the nullptr, similarly to operator delete
if (p) {
-#if HANDLE_TYPE_SAFETY
- mLock.lock();
- auto typeId = mHandleTypeId[p];
- mHandleTypeId.erase(p);
- mLock.unlock();
- if (UTILS_UNLIKELY(typeId != typeid(D).name())) {
- utils::slog.e << "Destroying handle " << handle.getId() << ", type " << typeid(D).name()
- << ", but handle's actual type is " << typeId << utils::io::endl;
- std::terminate();
- }
-#endif
p->~D();
- deallocateHandle(handle.getId());
+ deallocateHandle(handle.getId());
}
}
@@ -204,7 +165,17 @@ class HandleAllocator {
std::is_base_of_v>, Dp>
handle_cast(Handle& handle) noexcept {
assert_invariant(handle);
- void* const p = handleToPointer(handle.getId());
+ auto [p, tag] = handleToPointer(handle.getId());
+
+ if (isPoolHandle(handle.getId())) {
+ // check for use after free
+ uint8_t const age = (tag & HANDLE_AGE_MASK) >> HANDLE_AGE_SHIFT;
+ auto const pNode = static_cast(p);
+ uint8_t const expectedAge = pNode[-1].age;
+ ASSERT_POSTCONDITION(expectedAge == age,
+ "use-after-free of Handle with id=%d", handle.getId());
+ }
+
return static_cast(p);
}
@@ -219,29 +190,57 @@ class HandleAllocator {
private:
- // template
+ template
+ static constexpr size_t getBucketSize() noexcept {
+ if constexpr (sizeof(D) <= P0) { return P0; }
+ if constexpr (sizeof(D) <= P1) { return P1; }
+ static_assert(sizeof(D) <= P2);
+ return P2;
+ }
+
class Allocator {
friend class HandleAllocator;
- utils::PoolAllocator mPool0;
- utils::PoolAllocator mPool1;
- utils::PoolAllocator mPool2;
+ static constexpr size_t MIN_ALIGNMENT = alignof(std::max_align_t);
+ struct Node { uint8_t age; };
+ // Note: using the `extra` parameter of PoolAllocator<>, even with a 1-byte structure,
+ // generally increases all pool allocations by 8-bytes because of alignment restrictions.
+ template
+ using Pool = utils::PoolAllocator;
+ Pool mPool0;
+ Pool mPool1;
+ Pool mPool2;
UTILS_UNUSED_IN_RELEASE const utils::AreaPolicy::HeapArea& mArea;
public:
- static constexpr size_t MIN_ALIGNMENT_SHIFT = 4;
explicit Allocator(const utils::AreaPolicy::HeapArea& area);
+ static constexpr size_t getAlignment() noexcept { return MIN_ALIGNMENT; }
+
// this is in fact always called with a constexpr size argument
- [[nodiscard]] inline void* alloc(size_t size, size_t, size_t extra) noexcept {
+ [[nodiscard]] inline void* alloc(size_t size, size_t, size_t, uint8_t* outAge) noexcept {
void* p = nullptr;
- if (size <= mPool0.getSize()) p = mPool0.alloc(size, 16, extra);
- else if (size <= mPool1.getSize()) p = mPool1.alloc(size, 16, extra);
- else if (size <= mPool2.getSize()) p = mPool2.alloc(size, 16, extra);
+ if (size <= mPool0.getSize()) p = mPool0.alloc(size);
+ else if (size <= mPool1.getSize()) p = mPool1.alloc(size);
+ else if (size <= mPool2.getSize()) p = mPool2.alloc(size);
+ if (UTILS_LIKELY(p)) {
+ Node const* const pNode = static_cast(p);
+ // we are guaranteed to have at least sizeof(Node) bytes of extra storage before
+ // the allocation address.
+ *outAge = pNode[-1].age;
+ }
return p;
}
// this is in fact always called with a constexpr size argument
- inline void free(void* p, size_t size) noexcept {
+ inline void free(void* p, size_t size, uint8_t age) noexcept {
assert_invariant(p >= mArea.begin() && (char*)p + size <= (char*)mArea.end());
+
+ // check for double-free
+ Node* const pNode = static_cast(p);
+ uint8_t& expectedAge = pNode[-1].age;
+ ASSERT_POSTCONDITION(expectedAge == age,
+ "double-free of Handle of size %d at %p", size, p);
+ expectedAge = (expectedAge + 1) & 0xF; // fixme
+
if (size <= mPool0.getSize()) { mPool0.free(p); return; }
if (size <= mPool1.getSize()) { mPool1.free(p); return; }
if (size <= mPool2.getSize()) { mPool2.free(p); return; }
@@ -263,24 +262,16 @@ class HandleAllocator {
// allocateHandle()/deallocateHandle() selects the pool to use at compile-time based on the
// allocation size this is always inlined, because all these do is to call
// allocateHandleInPool()/deallocateHandleFromPool() with the right pool size.
- template
+ template
HandleBase::HandleId allocateHandle() noexcept {
- if constexpr (SIZE <= P0) { return allocateHandleInPool(); }
- if constexpr (SIZE <= P1) { return allocateHandleInPool(); }
- static_assert(SIZE <= P2);
- return allocateHandleInPool();
+ constexpr size_t BUCKET_SIZE = getBucketSize();
+ return allocateHandleInPool();
}
- template
+ template
void deallocateHandle(HandleBase::HandleId id) noexcept {
- if constexpr (SIZE <= P0) {
- deallocateHandleFromPool(id);
- } else if constexpr (SIZE <= P1) {
- deallocateHandleFromPool(id);
- } else {
- static_assert(SIZE <= P2);
- deallocateHandleFromPool(id);
- }
+ constexpr size_t BUCKET_SIZE = getBucketSize();
+ deallocateHandleFromPool(id);
}
// allocateHandleInPool()/deallocateHandleFromPool() is NOT inlined, which will cause three
@@ -289,9 +280,11 @@ class HandleAllocator {
template
UTILS_NOINLINE
HandleBase::HandleId allocateHandleInPool() noexcept {
- void* p = mHandleArena.alloc(SIZE);
+ uint8_t age;
+ void* p = mHandleArena.alloc(SIZE, alignof(std::max_align_t), 0, &age);
if (UTILS_LIKELY(p)) {
- return pointerToHandle(p);
+ uint32_t const tag = (uint32_t(age) << HANDLE_AGE_SHIFT) & HANDLE_AGE_MASK;
+ return arenaPointerToHandle(p, tag);
} else {
return allocateHandleSlow(SIZE);
}
@@ -301,42 +294,51 @@ class HandleAllocator {
UTILS_NOINLINE
void deallocateHandleFromPool(HandleBase::HandleId id) noexcept {
if (UTILS_LIKELY(isPoolHandle(id))) {
- void* p = handleToPointer(id);
- mHandleArena.free(p, SIZE);
+ auto [p, tag] = handleToPointer(id);
+ uint8_t const age = (tag & HANDLE_AGE_MASK) >> HANDLE_AGE_SHIFT;
+ mHandleArena.free(p, SIZE, age);
} else {
deallocateHandleSlow(id, SIZE);
}
}
- static constexpr uint32_t HEAP_HANDLE_FLAG = 0x80000000u;
+ // we handle a 4 bits age per address
+ static constexpr uint32_t HANDLE_HEAP_FLAG = 0x80000000u; // pool vs heap handle
+ static constexpr uint32_t HANDLE_AGE_MASK = 0x78000000u; // handle's age
+ static constexpr uint32_t HANDLE_INDEX_MASK = 0x07FFFFFFu; // handle index
+ static constexpr uint32_t HANDLE_TAG_MASK = HANDLE_AGE_MASK;
+ static constexpr uint32_t HANDLE_AGE_SHIFT = 27;
static bool isPoolHandle(HandleBase::HandleId id) noexcept {
- return (id & HEAP_HANDLE_FLAG) == 0u;
+ return (id & HANDLE_HEAP_FLAG) == 0u;
}
HandleBase::HandleId allocateHandleSlow(size_t size) noexcept;
void deallocateHandleSlow(HandleBase::HandleId id, size_t size) noexcept;
// We inline this because it's just 4 instructions in the fast case
- inline void* handleToPointer(HandleBase::HandleId id) const noexcept {
+ inline std::pair handleToPointer(HandleBase::HandleId id) const noexcept {
// note: the null handle will end-up returning nullptr b/c it'll be handled as
// a non-pool handle.
if (UTILS_LIKELY(isPoolHandle(id))) {
char* const base = (char*)mHandleArena.getArea().begin();
- size_t offset = id << Allocator::MIN_ALIGNMENT_SHIFT;
- return static_cast(base + offset);
+ uint32_t const tag = id & HANDLE_TAG_MASK;
+ size_t const offset = (id & HANDLE_INDEX_MASK) * Allocator::getAlignment();
+ return { static_cast(base + offset), tag };
}
- return handleToPointerSlow(id);
+ return { handleToPointerSlow(id), 0 };
}
void* handleToPointerSlow(HandleBase::HandleId id) const noexcept;
// We inline this because it's just 3 instructions
- inline HandleBase::HandleId pointerToHandle(void* p) const noexcept {
+ inline HandleBase::HandleId arenaPointerToHandle(void* p, uint32_t tag) const noexcept {
char* const base = (char*)mHandleArena.getArea().begin();
- size_t offset = (char*)p - base;
- auto id = HandleBase::HandleId(offset >> Allocator::MIN_ALIGNMENT_SHIFT);
- assert_invariant((id & HEAP_HANDLE_FLAG) == 0);
+ size_t const offset = (char*)p - base;
+ assert_invariant((offset % Allocator::getAlignment()) == 0);
+ auto id = HandleBase::HandleId(offset / Allocator::getAlignment());
+ id |= tag & HANDLE_TAG_MASK;
+ assert_invariant((id & HANDLE_HEAP_FLAG) == 0);
return id;
}
@@ -346,9 +348,6 @@ class HandleAllocator {
mutable utils::Mutex mLock;
tsl::robin_map mOverflowMap;
HandleBase::HandleId mId = 0;
-#if HANDLE_TYPE_SAFETY
- mutable std::unordered_map mHandleTypeId;
-#endif
};
} // namespace filament::backend
diff --git a/filament/backend/src/CircularBuffer.cpp b/filament/backend/src/CircularBuffer.cpp
index d9a877d3f59..41dd4173008 100644
--- a/filament/backend/src/CircularBuffer.cpp
+++ b/filament/backend/src/CircularBuffer.cpp
@@ -16,6 +16,14 @@
#include "private/backend/CircularBuffer.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
#if !defined(WIN32) && !defined(__EMSCRIPTEN__) && !defined(IOS)
# include
# include
@@ -24,23 +32,20 @@
# define HAS_MMAP 0
#endif
+#include
+#include
+#include
#include
-#include
-#include
-#include
-#include
-#include
-
using namespace utils;
namespace filament::backend {
size_t CircularBuffer::sPageSize = arch::getPageSize();
-CircularBuffer::CircularBuffer(size_t size) {
+CircularBuffer::CircularBuffer(size_t size)
+ : mSize(size) {
mData = alloc(size);
- mSize = size;
mTail = mData;
mHead = mData;
}
@@ -85,7 +90,7 @@ void* CircularBuffer::alloc(size_t size) noexcept {
MAP_PRIVATE, fd, (off_t)size);
if (vaddr_guard != MAP_FAILED && (vaddr_guard == (char*)vaddr_shadow + size)) {
// woo-hoo success!
- mUsesAshmem = fd;
+ mAshmemFd = fd;
data = vaddr;
}
}
@@ -93,7 +98,7 @@ void* CircularBuffer::alloc(size_t size) noexcept {
}
}
- if (UTILS_UNLIKELY(mUsesAshmem < 0)) {
+ if (UTILS_UNLIKELY(mAshmemFd < 0)) {
// ashmem failed
if (vaddr_guard != MAP_FAILED) {
munmap(vaddr_guard, size);
@@ -137,9 +142,9 @@ void CircularBuffer::dealloc() noexcept {
if (mData) {
size_t const BLOCK_SIZE = getBlockSize();
munmap(mData, mSize * 2 + BLOCK_SIZE);
- if (mUsesAshmem >= 0) {
- close(mUsesAshmem);
- mUsesAshmem = -1;
+ if (mAshmemFd >= 0) {
+ close(mAshmemFd);
+ mAshmemFd = -1;
}
}
#else
@@ -149,23 +154,37 @@ void CircularBuffer::dealloc() noexcept {
}
-void CircularBuffer::circularize() noexcept {
- if (mUsesAshmem > 0) {
- intptr_t const overflow = intptr_t(mHead) - (intptr_t(mData) + ssize_t(mSize));
- if (overflow >= 0) {
- assert_invariant(size_t(overflow) <= mSize);
- mHead = (void *) (intptr_t(mData) + overflow);
- #ifndef NDEBUG
- memset(mData, 0xA5, size_t(overflow));
- #endif
- }
- } else {
- // Only circularize if mHead if in the second buffer.
- if (intptr_t(mHead) - intptr_t(mData) > ssize_t(mSize)) {
+CircularBuffer::Range CircularBuffer::getBuffer() noexcept {
+ Range const range{ .tail = mTail, .head = mHead };
+
+ char* const pData = static_cast(mData);
+ char const* const pEnd = pData + mSize;
+ char const* const pHead = static_cast(mHead);
+ if (UTILS_UNLIKELY(pHead >= pEnd)) {
+ size_t const overflow = pHead - pEnd;
+ if (UTILS_LIKELY(mAshmemFd > 0)) {
+ assert_invariant(overflow <= mSize);
+ mHead = static_cast(pData + overflow);
+ // Data Tail End Head [virtual]
+ // v v v v
+ // +-------------:----+-----:--------------+
+ // | : | : |
+ // +-----:------------+--------------------+
+ // Head |<------ copy ------>| [physical]
+ } else {
+ // Data Tail End Head
+ // v v v v
+ // +-------------:----+-----:--------------+
+ // | : | : |
+ // +-----|------------+-----|--------------+
+ // |<---------------->|
+ // sliding window
mHead = mData;
}
}
mTail = mHead;
+
+ return range;
}
} // namespace filament::backend
diff --git a/filament/backend/src/CommandBufferQueue.cpp b/filament/backend/src/CommandBufferQueue.cpp
index ccf9d33a0d7..e3e5de045c8 100644
--- a/filament/backend/src/CommandBufferQueue.cpp
+++ b/filament/backend/src/CommandBufferQueue.cpp
@@ -15,14 +15,25 @@
*/
#include "private/backend/CommandBufferQueue.h"
+#include "private/backend/CircularBuffer.h"
+#include "private/backend/CommandStream.h"
+#include
#include
-#include
+#include
+#include
#include
+#include
#include
-#include "private/backend/BackendUtils.h"
-#include "private/backend/CommandStream.h"
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
using namespace utils;
@@ -65,50 +76,53 @@ void CommandBufferQueue::flush() noexcept {
// always guaranteed to have enough space for the NoopCommand
new(circularBuffer.allocate(sizeof(NoopCommand))) NoopCommand(nullptr);
- // end of this slice
- void* const head = circularBuffer.getHead();
+ const size_t requiredSize = mRequiredSize;
- // beginning of this slice
- void* const tail = circularBuffer.getTail();
+ // get the current buffer
+ auto const [begin, end] = circularBuffer.getBuffer();
- // size of this slice
- uint32_t const used = uint32_t(intptr_t(head) - intptr_t(tail));
+ assert_invariant(circularBuffer.empty());
- circularBuffer.circularize();
+ // size of the current buffer
+ size_t const used = std::distance(
+ static_cast(begin), static_cast(end));
std::unique_lock lock(mLock);
- mCommandBuffersToExecute.push_back({ tail, head });
+ mCommandBuffersToExecute.push_back({ begin, end });
+ mCondition.notify_one();
// circular buffer is too small, we corrupted the stream
ASSERT_POSTCONDITION(used <= mFreeSpace,
"Backend CommandStream overflow. Commands are corrupted and unrecoverable.\n"
"Please increase minCommandBufferSizeMB inside the Config passed to Engine::create.\n"
- "Space used at this time: %u bytes",
- (unsigned)used);
+ "Space used at this time: %u bytes, overflow: %u bytes",
+ (unsigned)used, unsigned(used - mFreeSpace));
// wait until there is enough space in the buffer
mFreeSpace -= used;
- const size_t requiredSize = mRequiredSize;
+ if (UTILS_UNLIKELY(mFreeSpace < requiredSize)) {
+
#ifndef NDEBUG
- size_t totalUsed = circularBuffer.size() - mFreeSpace;
- mHighWatermark = std::max(mHighWatermark, totalUsed);
- if (UTILS_UNLIKELY(totalUsed > requiredSize)) {
- slog.d << "CommandStream used too much space: " << totalUsed
- << ", out of " << requiredSize << " (will block)" << io::endl;
- }
+ size_t const totalUsed = circularBuffer.size() - mFreeSpace;
+ slog.d << "CommandStream used too much space (will block): "
+ << "needed space " << requiredSize << " out of " << mFreeSpace
+ << ", totalUsed=" << totalUsed << ", current=" << used
+ << ", queue size=" << mCommandBuffersToExecute.size() << " buffers"
+ << io::endl;
+
+ mHighWatermark = std::max(mHighWatermark, totalUsed);
#endif
- mCondition.notify_one();
- if (UTILS_LIKELY(mFreeSpace < requiredSize)) {
SYSTRACE_NAME("waiting: CircularBuffer::flush()");
mCondition.wait(lock, [this, requiredSize]() -> bool {
+ // TODO: on macOS, we need to call pumpEvents from time to time
return mFreeSpace >= requiredSize;
});
}
}
-std::vector CommandBufferQueue::waitForCommands() const {
+std::vector CommandBufferQueue::waitForCommands() const {
if (!UTILS_HAS_THREADING) {
return std::move(mCommandBuffersToExecute);
}
@@ -123,7 +137,7 @@ std::vector CommandBufferQueue::waitForCommands() con
return std::move(mCommandBuffersToExecute);
}
-void CommandBufferQueue::releaseBuffer(CommandBufferQueue::Slice const& buffer) {
+void CommandBufferQueue::releaseBuffer(CommandBufferQueue::Range const& buffer) {
std::lock_guard const lock(mLock);
mFreeSpace += uintptr_t(buffer.end) - uintptr_t(buffer.begin);
mCondition.notify_one();
diff --git a/filament/backend/src/HandleAllocator.cpp b/filament/backend/src/HandleAllocator.cpp
index 3257e4e2c94..bf8e779614c 100644
--- a/filament/backend/src/HandleAllocator.cpp
+++ b/filament/backend/src/HandleAllocator.cpp
@@ -16,9 +16,22 @@
#include "private/backend/HandleAllocator.h"
+#include
+
+#include
+#include
#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
#include
+#include
namespace filament::backend {
@@ -28,14 +41,34 @@ template
UTILS_NOINLINE
HandleAllocator::Allocator::Allocator(AreaPolicy::HeapArea const& area)
: mArea(area) {
- // TODO: we probably need a better way to set the size of these pools
- const size_t unit = area.size() / 32;
- const size_t offsetPool1 = unit;
- const size_t offsetPool2 = 16 * unit;
- char* const p = (char*)area.begin();
- mPool0 = PoolAllocator< P0, 16>(p, p + offsetPool1);
- mPool1 = PoolAllocator< P1, 16>(p + offsetPool1, p + offsetPool2);
- mPool2 = PoolAllocator< P2, 16>(p + offsetPool2, area.end());
+
+ // The largest handle this allocator can generate currently depends on the architecture's
+ // min alignment, typically 8 or 16 bytes.
+ // e.g. On Android armv8, the alignment is 16 bytes, so for a 1 MiB heap, the largest handle
+ // index will be 65536. Note that this is not the same as the number of handles (which
+ // will always be less).
+ // Because our maximum representable handle currently is 0x07FFFFFF, the maximum sensible
+ heap size is 2 GiB, which amounts to 7.6 million handles per pool (in the GL case).
+ size_t const maxHeapSize = std::min(area.size(), HANDLE_INDEX_MASK * getAlignment());
+
+ if (UTILS_UNLIKELY(maxHeapSize != area.size())) {
+ slog.w << "HandleAllocator heap size reduced to "
+ << maxHeapSize << " from " << area.size() << io::endl;
+ }
+
+ // make sure we start with a clean arena. This is needed to ensure that all blocks start
+ // with an age of 0.
+ memset(area.data(), 0, maxHeapSize);
+
+ // size the different pools so that they can all contain the same number of handles
+ size_t const count = maxHeapSize / (P0 + P1 + P2);
+ char* const p0 = static_cast(area.begin());
+ char* const p1 = p0 + count * P0;
+ char* const p2 = p1 + count * P1;
+
+ mPool0 = Pool(p0, count * P0);
+ mPool1 = Pool(p1, count * P1);
+ mPool2 = Pool(p2, count * P2);
}
// ------------------------------------------------------------------------------------------------
@@ -73,11 +106,17 @@ template
HandleBase::HandleId HandleAllocator::allocateHandleSlow(size_t size) noexcept {
void* p = ::malloc(size);
std::unique_lock lock(mLock);
- HandleBase::HandleId id = (++mId) | HEAP_HANDLE_FLAG;
+
+ HandleBase::HandleId id = (++mId) | HANDLE_HEAP_FLAG;
+
+ ASSERT_POSTCONDITION(mId < HANDLE_HEAP_FLAG,
+ "No more Handle ids available! This can happen if HandleAllocator arena has been full"
+ " for a while. Please increase FILAMENT_OPENGL_HANDLE_ARENA_SIZE_IN_MB");
+
mOverflowMap.emplace(id, p);
lock.unlock();
- if (UTILS_UNLIKELY(id == (HEAP_HANDLE_FLAG|1u))) { // meaning id was zero
+ if (UTILS_UNLIKELY(id == (HANDLE_HEAP_FLAG | 1u))) { // meaning id was zero
PANIC_LOG("HandleAllocator arena is full, using slower system heap. Please increase "
"the appropriate constant (e.g. FILAMENT_OPENGL_HANDLE_ARENA_SIZE_IN_MB).");
}
@@ -86,7 +125,7 @@ HandleBase::HandleId HandleAllocator::allocateHandleSlow(size_t size
template
void HandleAllocator::deallocateHandleSlow(HandleBase::HandleId id, size_t) noexcept {
- assert_invariant(id & HEAP_HANDLE_FLAG);
+ assert_invariant(id & HANDLE_HEAP_FLAG);
void* p = nullptr;
auto& overflowMap = mOverflowMap;
diff --git a/filament/backend/src/metal/MetalDriver.mm b/filament/backend/src/metal/MetalDriver.mm
index ef5c35e1080..b1e3d7574f7 100644
--- a/filament/backend/src/metal/MetalDriver.mm
+++ b/filament/backend/src/metal/MetalDriver.mm
@@ -43,6 +43,40 @@
namespace backend {
Driver* MetalDriverFactory::create(MetalPlatform* const platform, const Platform::DriverConfig& driverConfig) {
+#if 0
+ // this is useful for development, but too verbose even for debug builds
+ // For reference on a 64-bits machine in Release mode:
+ // MetalTimerQuery : 16 few
+ // HwStream : 24 few
+ // MetalIndexBuffer : 40 moderate
+ // MetalFence : 48 few
+ // MetalBufferObject : 48 many
+ // -- less than or equal 48 bytes
+ // MetalSamplerGroup : 112 few
+ // MetalProgram : 144 moderate
+ // MetalTexture : 152 moderate
+ // MetalVertexBuffer : 152 moderate
+ // -- less than or equal 160 bytes
+ // MetalSwapChain : 184 few
+ // MetalRenderTarget : 272 few
+ // MetalRenderPrimitive : 584 many
+ // -- less than or equal to 592 bytes
+
+ utils::slog.d
+ << "\nMetalSwapChain: " << sizeof(MetalSwapChain)
+ << "\nMetalBufferObject: " << sizeof(MetalBufferObject)
+ << "\nMetalVertexBuffer: " << sizeof(MetalVertexBuffer)
+ << "\nMetalIndexBuffer: " << sizeof(MetalIndexBuffer)
+ << "\nMetalSamplerGroup: " << sizeof(MetalSamplerGroup)
+ << "\nMetalRenderPrimitive: " << sizeof(MetalRenderPrimitive)
+ << "\nMetalTexture: " << sizeof(MetalTexture)
+ << "\nMetalTimerQuery: " << sizeof(MetalTimerQuery)
+ << "\nHwStream: " << sizeof(HwStream)
+ << "\nMetalRenderTarget: " << sizeof(MetalRenderTarget)
+ << "\nMetalFence: " << sizeof(MetalFence)
+ << "\nMetalProgram: " << sizeof(MetalProgram)
+ << utils::io::endl;
+#endif
return MetalDriver::create(platform, driverConfig);
}
diff --git a/filament/backend/src/opengl/OpenGLDriver.cpp b/filament/backend/src/opengl/OpenGLDriver.cpp
index 1d3e06282c4..047f28383db 100644
--- a/filament/backend/src/opengl/OpenGLDriver.cpp
+++ b/filament/backend/src/opengl/OpenGLDriver.cpp
@@ -90,24 +90,24 @@ Driver* OpenGLDriver::create(OpenGLPlatform* const platform,
#if 0
// this is useful for development, but too verbose even for debug builds
// For reference on a 64-bits machine in Release mode:
- // GLFence : 8 few
// GLIndexBuffer : 8 moderate
- // GLSamplerGroup : 8 few
+ // GLSamplerGroup : 16 few
+ // GLSwapChain : 16 few
+ // GLTimerQuery : 16 few
// -- less than or equal 16 bytes
- // GLBufferObject : 24 many
- // GLSync : 24 few
- // GLTimerQuery : 32 few
- // OpenGLProgram : 32 moderate
- // GLRenderPrimitive : 48 many
+ // GLFence : 24 few
+ // GLBufferObject : 32 many
+ // GLRenderPrimitive : 40 many
+ // OpenGLProgram : 56 moderate
+ // GLTexture : 64 moderate
// -- less than or equal 64 bytes
- // GLTexture : 72 moderate
+ // GLStream : 104 few
// GLRenderTarget : 112 few
- // GLStream : 184 few
// GLVertexBuffer : 200 moderate
// -- less than or equal to 208 bytes
slog.d
- << "HwFence: " << sizeof(HwFence)
+ << "\nGLSwapChain: " << sizeof(GLSwapChain)
<< "\nGLBufferObject: " << sizeof(GLBufferObject)
<< "\nGLVertexBuffer: " << sizeof(GLVertexBuffer)
<< "\nGLIndexBuffer: " << sizeof(GLIndexBuffer)
@@ -117,7 +117,7 @@ Driver* OpenGLDriver::create(OpenGLPlatform* const platform,
<< "\nGLTimerQuery: " << sizeof(GLTimerQuery)
<< "\nGLStream: " << sizeof(GLStream)
<< "\nGLRenderTarget: " << sizeof(GLRenderTarget)
- << "\nGLSync: " << sizeof(GLSync)
+ << "\nGLFence: " << sizeof(GLFence)
<< "\nOpenGLProgram: " << sizeof(OpenGLProgram)
<< io::endl;
#endif
diff --git a/filament/backend/src/vulkan/VulkanDriver.cpp b/filament/backend/src/vulkan/VulkanDriver.cpp
index 9680320f929..c700ce9c6a3 100644
--- a/filament/backend/src/vulkan/VulkanDriver.cpp
+++ b/filament/backend/src/vulkan/VulkanDriver.cpp
@@ -213,6 +213,41 @@ VulkanDriver::~VulkanDriver() noexcept = default;
UTILS_NOINLINE
Driver* VulkanDriver::create(VulkanPlatform* platform, VulkanContext const& context,
Platform::DriverConfig const& driverConfig) noexcept {
+#if 0
+ // this is useful for development, but too verbose even for debug builds
+ // For reference on a 64-bits machine in Release mode:
+ // VulkanSamplerGroup : 24 few
+ // HwStream : 24 few
+ // VulkanFence : 40 few
+ // VulkanProgram : 40 moderate
+ // VulkanIndexBuffer : 72 moderate
+ // VulkanBufferObject : 72 many
+ // -- less than or equal 80 bytes
+ // VulkanRenderPrimitive : 104 many
+ // VulkanSwapChain : 112 few
+ // VulkanTimerQuery : 168 few
+ // -- less than or equal 176 bytes
+ // VulkanTexture : 232 moderate
+ // VulkanVertexBuffer : 312 moderate
+ // VulkanRenderTarget : 320 few
+ // -- less than or equal to 320 bytes
+
+ utils::slog.d
+ << "\nVulkanSwapChain: " << sizeof(VulkanSwapChain)
+ << "\nVulkanBufferObject: " << sizeof(VulkanBufferObject)
+ << "\nVulkanVertexBuffer: " << sizeof(VulkanVertexBuffer)
+ << "\nVulkanIndexBuffer: " << sizeof(VulkanIndexBuffer)
+ << "\nVulkanSamplerGroup: " << sizeof(VulkanSamplerGroup)
+ << "\nVulkanRenderPrimitive: " << sizeof(VulkanRenderPrimitive)
+ << "\nVulkanTexture: " << sizeof(VulkanTexture)
+ << "\nVulkanTimerQuery: " << sizeof(VulkanTimerQuery)
+ << "\nHwStream: " << sizeof(HwStream)
+ << "\nVulkanRenderTarget: " << sizeof(VulkanRenderTarget)
+ << "\nVulkanFence: " << sizeof(VulkanFence)
+ << "\nVulkanProgram: " << sizeof(VulkanProgram)
+ << utils::io::endl;
+#endif
+
assert_invariant(platform);
size_t defaultSize = FVK_HANDLE_ARENA_SIZE_IN_MB * 1024U * 1024U;
Platform::DriverConfig validConfig {driverConfig};
@@ -1641,26 +1676,26 @@ void VulkanDriver::draw(PipelineState pipelineState, Handle r
// Update the VK raster state.
const VulkanRenderTarget* rt = mCurrentRenderPass.renderTarget;
- auto vkraster = mPipelineCache.getCurrentRasterState();
- vkraster.cullMode = getCullMode(rasterState.culling);
- vkraster.frontFace = getFrontFace(rasterState.inverseFrontFaces);
- vkraster.depthBiasEnable = (depthOffset.constant || depthOffset.slope) ? true : false;
- vkraster.depthBiasConstantFactor = depthOffset.constant;
- vkraster.depthBiasSlopeFactor = depthOffset.slope;
- vkraster.blendEnable = rasterState.hasBlending();
- vkraster.srcColorBlendFactor = getBlendFactor(rasterState.blendFunctionSrcRGB);
- vkraster.dstColorBlendFactor = getBlendFactor(rasterState.blendFunctionDstRGB);
- vkraster.colorBlendOp = rasterState.blendEquationRGB;
- vkraster.srcAlphaBlendFactor = getBlendFactor(rasterState.blendFunctionSrcAlpha);
- vkraster.dstAlphaBlendFactor = getBlendFactor(rasterState.blendFunctionDstAlpha);
- vkraster.alphaBlendOp = rasterState.blendEquationAlpha;
- vkraster.colorWriteMask = (VkColorComponentFlags) (rasterState.colorWrite ? 0xf : 0x0);
- vkraster.depthWriteEnable = rasterState.depthWrite;
- vkraster.depthCompareOp = rasterState.depthFunc;
- vkraster.rasterizationSamples = rt->getSamples();
- vkraster.alphaToCoverageEnable = rasterState.alphaToCoverage;
- vkraster.colorTargetCount = rt->getColorTargetCount(mCurrentRenderPass);
- mPipelineCache.setCurrentRasterState(vkraster);
+ VulkanPipelineCache::RasterState const vulkanRasterState{
+ .cullMode = getCullMode(rasterState.culling),
+ .frontFace = getFrontFace(rasterState.inverseFrontFaces),
+ .depthBiasEnable = (depthOffset.constant || depthOffset.slope) ? true : false,
+ .blendEnable = rasterState.hasBlending(),
+ .depthWriteEnable = rasterState.depthWrite,
+ .alphaToCoverageEnable = rasterState.alphaToCoverage,
+ .srcColorBlendFactor = getBlendFactor(rasterState.blendFunctionSrcRGB),
+ .dstColorBlendFactor = getBlendFactor(rasterState.blendFunctionDstRGB),
+ .srcAlphaBlendFactor = getBlendFactor(rasterState.blendFunctionSrcAlpha),
+ .dstAlphaBlendFactor = getBlendFactor(rasterState.blendFunctionDstAlpha),
+ .colorWriteMask = (VkColorComponentFlags) (rasterState.colorWrite ? 0xf : 0x0),
+ .rasterizationSamples = rt->getSamples(),
+ .colorTargetCount = rt->getColorTargetCount(mCurrentRenderPass),
+ .colorBlendOp = rasterState.blendEquationRGB,
+ .alphaBlendOp = rasterState.blendEquationAlpha,
+ .depthCompareOp = rasterState.depthFunc,
+ .depthBiasConstantFactor = depthOffset.constant,
+ .depthBiasSlopeFactor = depthOffset.slope
+ };
// Declare fixed-size arrays that get passed to the pipeCache and to vkCmdBindVertexBuffers.
uint32_t const bufferCount = prim.vertexBuffer->attributes.size();
@@ -1671,7 +1706,7 @@ void VulkanDriver::draw(PipelineState pipelineState, Handle r
// Push state changes to the VulkanPipelineCache instance. This is fast and does not make VK calls.
mPipelineCache.bindProgram(program);
- mPipelineCache.bindRasterState(mPipelineCache.getCurrentRasterState());
+ mPipelineCache.bindRasterState(vulkanRasterState);
mPipelineCache.bindPrimitiveTopology(prim.primitiveTopology);
mPipelineCache.bindVertexArray(attribDesc, bufferDesc, bufferCount);
diff --git a/filament/backend/src/vulkan/VulkanPipelineCache.cpp b/filament/backend/src/vulkan/VulkanPipelineCache.cpp
index 889888cd083..2d976f66ff2 100644
--- a/filament/backend/src/vulkan/VulkanPipelineCache.cpp
+++ b/filament/backend/src/vulkan/VulkanPipelineCache.cpp
@@ -34,8 +34,6 @@ using namespace bluevk;
namespace filament::backend {
-static VulkanPipelineCache::RasterState createDefaultRasterState();
-
static VkShaderStageFlags getShaderStageFlags(VulkanPipelineCache::UsageFlags key, uint16_t binding) {
// NOTE: if you modify this function, you also need to modify getUsageFlags.
assert_invariant(binding < MAX_SAMPLER_COUNT);
@@ -73,8 +71,7 @@ VulkanPipelineCache::UsageFlags VulkanPipelineCache::disableUsageFlags(uint16_t
}
VulkanPipelineCache::VulkanPipelineCache(VulkanResourceAllocator* allocator)
- : mCurrentRasterState(createDefaultRasterState()),
- mResourceAllocator(allocator),
+ : mResourceAllocator(allocator),
mPipelineBoundResources(allocator) {
mDummyBufferWriteInfo.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
mDummyBufferWriteInfo.pNext = nullptr;
@@ -569,7 +566,7 @@ void VulkanPipelineCache::bindProgram(VulkanProgram* program) noexcept {
}
void VulkanPipelineCache::bindRasterState(const RasterState& rasterState) noexcept {
- mPipelineRequirements.rasterState = mCurrentRasterState = rasterState;
+ mPipelineRequirements.rasterState = rasterState;
}
void VulkanPipelineCache::bindRenderPass(VkRenderPass renderPass, int subpassIndex) noexcept {
@@ -917,23 +914,6 @@ bool VulkanPipelineCache::DescEqual::operator()(const DescriptorKey& k1,
return true;
}
-static VulkanPipelineCache::RasterState createDefaultRasterState() {
- return VulkanPipelineCache::RasterState {
- .cullMode = VK_CULL_MODE_NONE,
- .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE,
- .depthBiasEnable = VK_FALSE,
- .blendEnable = VK_FALSE,
- .depthWriteEnable = VK_TRUE,
- .alphaToCoverageEnable = true,
- .colorWriteMask = 0xf,
- .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
- .colorTargetCount = 1,
- .depthCompareOp = SamplerCompareFunc::LE,
- .depthBiasConstantFactor = 0.0f,
- .depthBiasSlopeFactor = 0.0f,
- };
-}
-
} // namespace filament::backend
#pragma clang diagnostic pop
diff --git a/filament/backend/src/vulkan/VulkanPipelineCache.h b/filament/backend/src/vulkan/VulkanPipelineCache.h
index 018fd00efec..a28327c5d9e 100644
--- a/filament/backend/src/vulkan/VulkanPipelineCache.h
+++ b/filament/backend/src/vulkan/VulkanPipelineCache.h
@@ -199,15 +199,6 @@ class VulkanPipelineCache : public CommandBufferObserver {
mPipelineBoundResources.acquire(resource);
}
- inline RasterState getCurrentRasterState() const noexcept {
- return mCurrentRasterState;
- }
-
- // We need to update this outside of bindRasterState due to VulkanDriver::draw.
- inline void setCurrentRasterState(RasterState const& rasterState) noexcept {
- mCurrentRasterState = rasterState;
- }
-
private:
// PIPELINE LAYOUT CACHE KEY
// -------------------------
@@ -413,7 +404,6 @@ class VulkanPipelineCache : public CommandBufferObserver {
VmaAllocator mAllocator = VK_NULL_HANDLE;
// Current requirements for the pipeline layout, pipeline, and descriptor sets.
- RasterState mCurrentRasterState;
PipelineKey mPipelineRequirements = {};
DescriptorKey mDescriptorRequirements = {};
diff --git a/filament/backend/src/vulkan/VulkanResources.h b/filament/backend/src/vulkan/VulkanResources.h
index 77b6498b860..9421e11a14d 100644
--- a/filament/backend/src/vulkan/VulkanResources.h
+++ b/filament/backend/src/vulkan/VulkanResources.h
@@ -63,7 +63,8 @@ struct VulkanResourceBase {
explicit VulkanResourceBase(VulkanResourceType type)
: mRefCount(IS_HEAP_ALLOC_TYPE(type) ? 1 : 0),
mType(type),
- mHandleId(0) {}
+ mHandleId(0) {
+ }
private:
inline VulkanResourceType getType() {
@@ -82,6 +83,7 @@ struct VulkanResourceBase {
if (IS_HEAP_ALLOC_TYPE(mType)) {
return;
}
+ assert_invariant(mRefCount < ((1<<24) - 1));
++mRefCount;
}
@@ -89,6 +91,7 @@ struct VulkanResourceBase {
if (IS_HEAP_ALLOC_TYPE(mType)) {
return;
}
+ assert_invariant(mRefCount > 0);
--mRefCount;
}
@@ -96,8 +99,8 @@ struct VulkanResourceBase {
return mRefCount;
}
- size_t mRefCount = 0;
- VulkanResourceType mType = VulkanResourceType::BUFFER_OBJECT;
+ uint32_t mRefCount : 24; // 16M is enough for the refcount
+ VulkanResourceType mType : 8;
HandleBase::HandleId mHandleId;
friend struct VulkanThreadSafeResource;
diff --git a/filament/include/filament/Engine.h b/filament/include/filament/Engine.h
index 904cbda4a3f..2f8c6d4af74 100644
--- a/filament/include/filament/Engine.h
+++ b/filament/include/filament/Engine.h
@@ -178,6 +178,7 @@ class UTILS_PUBLIC Engine {
using Backend = backend::Backend;
using DriverConfig = backend::Platform::DriverConfig;
using FeatureLevel = backend::FeatureLevel;
+ using StereoscopicType = backend::StereoscopicType;
/**
* Config is used to define the memory footprint used by the engine, such as the
@@ -297,6 +298,25 @@ class UTILS_PUBLIC Engine {
*/
size_t textureUseAfterFreePoolSize = 0;
+ /**
+ * Set to `true` to forcibly disable parallel shader compilation in the backend.
+ * Currently only honored by the GL backend.
+ */
+ bool disableParallelShaderCompile = false;
+
+ /*
+ * The type of technique for stereoscopic rendering.
+ *
+ * This setting determines the algorithm used when stereoscopic rendering is enabled. This
+ * decision applies to the entire Engine for the lifetime of the Engine. E.g., multiple
+ * Views created from the Engine must use the same stereoscopic type.
+ *
+ * Each view can enable stereoscopic rendering via the StereoscopicOptions::enable flag.
+ *
+ * @see View::setStereoscopicOptions
+ */
+ StereoscopicType stereoscopicType = StereoscopicType::INSTANCED;
+
/*
* The number of eyes to render when stereoscopic rendering is enabled. Supported values are
* between 1 and Engine::getMaxStereoscopicEyes() (inclusive).
diff --git a/filament/include/filament/View.h b/filament/include/filament/View.h
index e4ba827aad2..3cdd527fac7 100644
--- a/filament/include/filament/View.h
+++ b/filament/include/filament/View.h
@@ -719,7 +719,7 @@ class UTILS_PUBLIC View : public FilamentAPI {
void setDebugCamera(Camera* UTILS_NULLABLE camera) noexcept;
//! debugging: returns a Camera from the point of view of *the* dominant directional light used for shadowing.
- Camera const* UTILS_NULLABLE getDirectionalLightCamera() const noexcept;
+ Camera const* UTILS_NULLABLE getDirectionalShadowCamera() const noexcept;
/** Result of a picking query */
diff --git a/filament/src/Allocators.h b/filament/src/Allocators.h
index eb354b8d329..84962e30c0e 100644
--- a/filament/src/Allocators.h
+++ b/filament/src/Allocators.h
@@ -54,7 +54,7 @@ using LinearAllocatorArena = utils::Arena<
#endif
-using ArenaScope = utils::ArenaScope;
+using RootArenaScope = utils::ArenaScope;
} // namespace filament
diff --git a/filament/src/Froxelizer.cpp b/filament/src/Froxelizer.cpp
index c469932c251..47bd0d343dd 100644
--- a/filament/src/Froxelizer.cpp
+++ b/filament/src/Froxelizer.cpp
@@ -168,7 +168,8 @@ void Froxelizer::setProjection(const mat4f& projection,
}
bool Froxelizer::prepare(
- FEngine::DriverApi& driverApi, ArenaScope& arena, filament::Viewport const& viewport,
+ FEngine::DriverApi& driverApi, RootArenaScope& rootArenaScope,
+ filament::Viewport const& viewport,
const mat4f& projection, float projectionNear, float projectionFar) noexcept {
setViewport(viewport);
setProjection(projection, projectionNear, projectionFar);
@@ -199,12 +200,12 @@ bool Froxelizer::prepare(
// light records per froxel (~256 KiB)
mLightRecords = {
- arena.allocate(getFroxelBufferEntryCount(), CACHELINE_SIZE),
+ rootArenaScope.allocate(getFroxelBufferEntryCount(), CACHELINE_SIZE),
getFroxelBufferEntryCount() };
// froxel thread data (~256 KiB)
mFroxelShardedData = {
- arena.allocate(GROUP_COUNT, CACHELINE_SIZE),
+ rootArenaScope.allocate(GROUP_COUNT, CACHELINE_SIZE),
uint32_t(GROUP_COUNT)
};
diff --git a/filament/src/Froxelizer.h b/filament/src/Froxelizer.h
index 27885e24bc7..27ba3c57641 100644
--- a/filament/src/Froxelizer.h
+++ b/filament/src/Froxelizer.h
@@ -110,7 +110,7 @@ class Froxelizer {
*
* return true if updateUniforms() needs to be called
*/
- bool prepare(backend::DriverApi& driverApi, ArenaScope& arena, Viewport const& viewport,
+ bool prepare(backend::DriverApi& driverApi, RootArenaScope& rootArenaScope, Viewport const& viewport,
const math::mat4f& projection, float projectionNear, float projectionFar) noexcept;
Froxel getFroxelAt(size_t x, size_t y, size_t z) const noexcept;
diff --git a/filament/src/PostProcessManager.cpp b/filament/src/PostProcessManager.cpp
index 78814f74852..f186ee9cb6d 100644
--- a/filament/src/PostProcessManager.cpp
+++ b/filament/src/PostProcessManager.cpp
@@ -414,7 +414,7 @@ void PostProcessManager::commitAndRender(FrameGraphResources::RenderPassInfo con
// ------------------------------------------------------------------------------------------------
PostProcessManager::StructurePassOutput PostProcessManager::structure(FrameGraph& fg,
- RenderPass const& pass, uint8_t structureRenderFlags,
+ RenderPassBuilder const& passBuilder, uint8_t structureRenderFlags,
uint32_t width, uint32_t height,
StructurePassConfig const& config) noexcept {
@@ -466,17 +466,19 @@ PostProcessManager::StructurePassOutput PostProcessManager::structure(FrameGraph
.clearFlags = TargetBufferFlags::COLOR0 | TargetBufferFlags::DEPTH
});
},
- [=, renderPass = pass](FrameGraphResources const& resources,
+ [=, passBuilder = passBuilder](FrameGraphResources const& resources,
auto const&, DriverApi&) mutable {
Variant structureVariant(Variant::DEPTH_VARIANT);
structureVariant.setPicking(config.picking);
auto out = resources.getRenderPassInfo();
- renderPass.setRenderFlags(structureRenderFlags);
- renderPass.setVariant(structureVariant);
- renderPass.appendCommands(mEngine, RenderPass::CommandTypeFlags::SSAO);
- renderPass.sortCommands(mEngine);
- renderPass.execute(mEngine, resources.getPassName(), out.target, out.params);
+
+ passBuilder.renderFlags(structureRenderFlags);
+ passBuilder.variant(structureVariant);
+ passBuilder.commandTypeFlags(RenderPass::CommandTypeFlags::SSAO);
+
+ RenderPass const pass{ passBuilder.build(mEngine) };
+ RenderPass::execute(pass, mEngine, resources.getPassName(), out.target, out.params);
});
auto depth = structurePass->depth;
@@ -523,7 +525,7 @@ PostProcessManager::StructurePassOutput PostProcessManager::structure(FrameGraph
// ------------------------------------------------------------------------------------------------
FrameGraphId PostProcessManager::ssr(FrameGraph& fg,
- RenderPass const& pass,
+ RenderPassBuilder const& passBuilder,
FrameHistory const& frameHistory,
CameraInfo const& cameraInfo,
PerViewUniforms& uniforms,
@@ -586,7 +588,7 @@ FrameGraphId PostProcessManager::ssr(FrameGraph& fg,
},
[this, projection = cameraInfo.projection,
userViewMatrix = cameraInfo.getUserViewMatrix(), uvFromClipMatrix, historyProjection,
- options, &uniforms, renderPass = pass]
+ options, &uniforms, passBuilder = passBuilder]
(FrameGraphResources const& resources, auto const& data, DriverApi& driver) mutable {
// set structure sampler
uniforms.prepareStructure(data.structure ?
@@ -607,17 +609,17 @@ FrameGraphId PostProcessManager::ssr(FrameGraph& fg,
auto out = resources.getRenderPassInfo();
// Remove the HAS_SHADOWING RenderFlags, since it's irrelevant when rendering reflections
- RenderPass::RenderFlags flags = renderPass.getRenderFlags();
- flags &= ~RenderPass::HAS_SHADOWING;
- renderPass.setRenderFlags(flags);
+ passBuilder.renderFlags(~RenderPass::HAS_SHADOWING, 0);
// use our special SSR variant, it can only be applied to object that have
// the SCREEN_SPACE ReflectionMode.
- renderPass.setVariant(Variant{Variant::SPECIAL_SSR});
+ passBuilder.variant(Variant{ Variant::SPECIAL_SSR });
+
// generate all our drawing commands, except blended objects.
- renderPass.appendCommands(mEngine, RenderPass::CommandTypeFlags::SCREEN_SPACE_REFLECTIONS);
- renderPass.sortCommands(mEngine);
- renderPass.execute(mEngine, resources.getPassName(), out.target, out.params);
+ passBuilder.commandTypeFlags(RenderPass::CommandTypeFlags::SCREEN_SPACE_REFLECTIONS);
+
+ RenderPass const pass{ passBuilder.build(mEngine) };
+ RenderPass::execute(pass, mEngine, resources.getPassName(), out.target, out.params);
});
return ssrPass->reflections;
diff --git a/filament/src/PostProcessManager.h b/filament/src/PostProcessManager.h
index 081e795f061..12b211dc238 100644
--- a/filament/src/PostProcessManager.h
+++ b/filament/src/PostProcessManager.h
@@ -50,6 +50,7 @@ class FMaterialInstance;
class FrameGraph;
class PerViewUniforms;
class RenderPass;
+class RenderPassBuilder;
struct CameraInfo;
class PostProcessManager {
@@ -99,12 +100,12 @@ class PostProcessManager {
FrameGraphId picking;
};
StructurePassOutput structure(FrameGraph& fg,
- RenderPass const& pass, uint8_t structureRenderFlags,
+ RenderPassBuilder const& passBuilder, uint8_t structureRenderFlags,
uint32_t width, uint32_t height, StructurePassConfig const& config) noexcept;
// reflections pass
FrameGraphId ssr(FrameGraph& fg,
- RenderPass const& pass,
+ RenderPassBuilder const& passBuilder,
FrameHistory const& frameHistory,
CameraInfo const& cameraInfo,
PerViewUniforms& uniforms,
diff --git a/filament/src/RenderPass.cpp b/filament/src/RenderPass.cpp
index 2932fcf481b..d5063043f5a 100644
--- a/filament/src/RenderPass.cpp
+++ b/filament/src/RenderPass.cpp
@@ -19,17 +19,43 @@
#include "RenderPrimitive.h"
#include "ShadowMap.h"
+#include "details/Camera.h"
#include "details/Material.h"
#include "details/MaterialInstance.h"
#include "details/View.h"
+#include "components/RenderableManager.h"
+
+#include
#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include "private/backend/CircularBuffer.h"
+#include
+#include
#include
+#include
+#include
#include
+#include
+#include
+#include
+#include
#include
+#include
+#include
+#include
+
using namespace utils;
using namespace filament::math;
@@ -37,64 +63,112 @@ namespace filament {
using namespace backend;
-RenderPass::RenderPass(FEngine& engine,
- RenderPass::Arena& arena) noexcept
- : mCommandArena(arena),
- mCustomCommands(engine.getPerRenderPassAllocator()) {
+RenderPassBuilder& RenderPassBuilder::customCommand(
+ FEngine& engine,
+ uint8_t channel,
+ RenderPass::Pass pass,
+ RenderPass::CustomCommand custom,
+ uint32_t order,
+ RenderPass::Executor::CustomCommandFn const& command) {
+ if (!mCustomCommands.has_value()) {
+ // construct the vector the first time
+ mCustomCommands.emplace(engine.getPerRenderPassArena());
+ }
+ mCustomCommands->emplace_back(channel, pass, custom, order, command);
+ return *this;
}
-RenderPass::RenderPass(RenderPass const& rhs) = default;
+RenderPass RenderPassBuilder::build(FEngine& engine) {
+ ASSERT_POSTCONDITION(mRenderableSoa, "RenderPassBuilder::geometry() hasn't been called");
+ assert_invariant(mScissorViewport.width <= std::numeric_limits::max());
+ assert_invariant(mScissorViewport.height <= std::numeric_limits::max());
+ return RenderPass{ engine, *this };
+}
-// this destructor is actually heavy because it inlines ~vector<>
-RenderPass::~RenderPass() noexcept = default;
+// ------------------------------------------------------------------------------------------------
+
+RenderPass::RenderPass(FEngine& engine, RenderPassBuilder const& builder) noexcept
+ : mRenderableSoa(*builder.mRenderableSoa),
+ mVisibleRenderables(builder.mVisibleRenderables),
+ mUboHandle(builder.mUboHandle),
+ mCameraPosition(builder.mCameraPosition),
+ mCameraForwardVector(builder.mCameraForwardVector),
+ mFlags(builder.mFlags),
+ mVariant(builder.mVariant),
+ mVisibilityMask(builder.mVisibilityMask),
+ mScissorViewport(builder.mScissorViewport),
+ mCustomCommands(engine.getPerRenderPassArena()) {
+
+ // compute the number of commands we need
+ updateSummedPrimitiveCounts(
+ const_cast(mRenderableSoa), mVisibleRenderables);
+
+ uint32_t commandCount =
+ FScene::getPrimitiveCount(mRenderableSoa, mVisibleRenderables.last);
+ const bool colorPass = bool(builder.mCommandTypeFlags & CommandTypeFlags::COLOR);
+ const bool depthPass = bool(builder.mCommandTypeFlags & CommandTypeFlags::DEPTH);
+ commandCount *= uint32_t(colorPass * 2 + depthPass);
+ commandCount += 1; // for the sentinel
+
+ uint32_t const customCommandCount =
+ builder.mCustomCommands.has_value() ? builder.mCustomCommands->size() : 0;
-RenderPass::Command* RenderPass::append(size_t count) noexcept {
- // this is like an "in-place" realloc(). Works only with LinearAllocator.
- Command* const curr = mCommandArena.alloc(count);
+ Command* const curr = builder.mArena.alloc(commandCount + customCommandCount);
assert_invariant(curr);
- assert_invariant(mCommandBegin == nullptr || curr == mCommandEnd);
- if (mCommandBegin == nullptr) {
- mCommandBegin = mCommandEnd = curr;
+
+ if (UTILS_UNLIKELY(builder.mArena.getAllocator().isHeapAllocation(curr))) {
+ static bool sLogOnce = true;
+ if (UTILS_UNLIKELY(sLogOnce)) {
+ sLogOnce = false;
+ PANIC_LOG("RenderPass arena is full, using slower system heap. Please increase "
+ "the appropriate constant (e.g. FILAMENT_PER_RENDER_PASS_ARENA_SIZE_IN_MB).");
+ }
}
- mCommandEnd += count;
- return curr;
-}
-void RenderPass::resize(size_t count) noexcept {
- if (mCommandBegin) {
- mCommandEnd = mCommandBegin + count;
- mCommandArena.rewind(mCommandEnd);
+ mCommandBegin = curr;
+ mCommandEnd = curr + commandCount + customCommandCount;
+
+ appendCommands(engine, { curr, commandCount }, builder.mCommandTypeFlags);
+
+ if (builder.mCustomCommands.has_value()) {
+ Command* p = curr + commandCount;
+ for (auto [channel, passId, command, order, fn]: builder.mCustomCommands.value()) {
+ appendCustomCommand(p++, channel, passId, command, order, fn);
+ }
}
-}
-void RenderPass::setGeometry(FScene::RenderableSoa const& soa, Range vr,
- backend::Handle uboHandle) noexcept {
- mRenderableSoa = &soa;
- mVisibleRenderables = vr;
- mUboHandle = uboHandle;
-}
+ // sort commands once we're done adding commands
+ sortCommands(builder.mArena);
-void RenderPass::setCamera(const CameraInfo& camera) noexcept {
- mCameraPosition = camera.getPosition();
- mCameraForwardVector = camera.getForwardVector();
+ if (engine.isAutomaticInstancingEnabled()) {
+ instanceify(engine, builder.mArena);
+ }
}
-void RenderPass::setScissorViewport(backend::Viewport viewport) noexcept {
- assert_invariant(viewport.width <= std::numeric_limits::max());
- assert_invariant(viewport.height <= std::numeric_limits::max());
- mScissorViewport = viewport;
+// this destructor is actually heavy because it inlines ~vector<>
+RenderPass::~RenderPass() noexcept = default;
+
+void RenderPass::resize(Arena& arena, size_t count) noexcept {
+ if (mCommandBegin) {
+ mCommandEnd = mCommandBegin + count;
+ arena.rewind(mCommandEnd);
+ }
}
-void RenderPass::appendCommands(FEngine& engine, CommandTypeFlags const commandTypeFlags) noexcept {
+void RenderPass::appendCommands(FEngine& engine,
+ Slice commands, CommandTypeFlags const commandTypeFlags) noexcept {
SYSTRACE_CALL();
SYSTRACE_CONTEXT();
- assert_invariant(mRenderableSoa);
-
utils::Range const vr = mVisibleRenderables;
// trace the number of visible renderables
SYSTRACE_VALUE32("visibleRenderables", vr.size());
if (UTILS_UNLIKELY(vr.empty())) {
+ // no renderables, we still need the sentinel and the command buffer size should be
+ // exactly 1.
+ assert_invariant(commands.size() == 1);
+ Command* curr = commands.data();
+ curr->key = uint64_t(Pass::SENTINEL);
return;
}
@@ -104,17 +178,10 @@ void RenderPass::appendCommands(FEngine& engine, CommandTypeFlags const commandT
const FScene::VisibleMaskType visibilityMask = mVisibilityMask;
// up-to-date summed primitive counts needed for generateCommands()
- FScene::RenderableSoa const& soa = *mRenderableSoa;
- updateSummedPrimitiveCounts(const_cast(soa), vr);
+ FScene::RenderableSoa const& soa = mRenderableSoa;
- // compute how much maximum storage we need for this pass
- uint32_t commandCount = FScene::getPrimitiveCount(soa, vr.last);
- // double the color pass for transparent objects that need to render twice
- const bool colorPass = bool(commandTypeFlags & CommandTypeFlags::COLOR);
- const bool depthPass = bool(commandTypeFlags & CommandTypeFlags::DEPTH);
- commandCount *= uint32_t(colorPass * 2 + depthPass);
- commandCount += 1; // for the sentinel
- Command* const curr = append(commandCount);
+ Command* curr = commands.data();
+ size_t const commandCount = commands.size();
auto stereoscopicEyeCount =
renderFlags & IS_STEREOSCOPIC ? engine.getConfig().stereoscopicEyeCount : 1;
@@ -152,7 +219,8 @@ void RenderPass::appendCommands(FEngine& engine, CommandTypeFlags const commandT
}
}
-void RenderPass::appendCustomCommand(uint8_t channel, Pass pass, CustomCommand custom, uint32_t order,
+void RenderPass::appendCustomCommand(Command* commands,
+ uint8_t channel, Pass pass, CustomCommand custom, uint32_t order,
Executor::CustomCommandFn command) {
assert_invariant((uint64_t(order) << CUSTOM_ORDER_SHIFT) <= CUSTOM_ORDER_MASK);
@@ -168,11 +236,10 @@ void RenderPass::appendCustomCommand(uint8_t channel, Pass pass, CustomCommand c
cmd |= uint64_t(order) << CUSTOM_ORDER_SHIFT;
cmd |= uint64_t(index);
- Command* const curr = append(1);
- curr->key = cmd;
+ commands->key = cmd;
}
-void RenderPass::sortCommands(FEngine& engine) noexcept {
+void RenderPass::sortCommands(Arena& arena) noexcept {
SYSTRACE_NAME("sort and trim commands");
std::sort(mCommandBegin, mCommandEnd);
@@ -183,30 +250,20 @@ void RenderPass::sortCommands(FEngine& engine) noexcept {
return c.key != uint64_t(Pass::SENTINEL);
});
- resize(uint32_t(last - mCommandBegin));
-
- if (engine.isAutomaticInstancingEnabled()) {
- instanceify(engine);
- }
+ resize(arena, uint32_t(last - mCommandBegin));
}
-void RenderPass::execute(FEngine& engine, const char* name,
+void RenderPass::execute(RenderPass const& pass,
+ FEngine& engine, const char* name,
backend::Handle renderTarget,
- backend::RenderPassParams params) const noexcept {
-
+ backend::RenderPassParams params) noexcept {
DriverApi& driver = engine.getDriverApi();
-
- // this is a good time to flush the CommandStream, because we're about to potentially
- // output a lot of commands. This guarantees here that we have at least
- // FILAMENT_MIN_COMMAND_BUFFERS_SIZE_IN_MB bytes (1MiB by default).
- engine.flush();
-
driver.beginRenderPass(renderTarget, params);
- getExecutor().execute(engine, name);
+ pass.getExecutor().execute(engine, name);
driver.endRenderPass();
}
-void RenderPass::instanceify(FEngine& engine) noexcept {
+void RenderPass::instanceify(FEngine& engine, Arena& arena) noexcept {
SYSTRACE_NAME("instanceify");
// instanceify works by scanning the **sorted** command stream, looking for repeat draw
@@ -262,7 +319,8 @@ void RenderPass::instanceify(FEngine& engine) noexcept {
// buffer large enough for all instances data
stagingBufferSize = sizeof(PerRenderableData) * (last - curr);
stagingBuffer = (PerRenderableData*)::malloc(stagingBufferSize);
- uboData = mRenderableSoa->data();
+ uboData = mRenderableSoa.data();
+ assert_invariant(uboData);
}
// copy the ubo data to a staging buffer
@@ -315,7 +373,7 @@ void RenderPass::instanceify(FEngine& engine) noexcept {
return command.key == uint64_t(Pass::SENTINEL);
});
- resize(uint32_t(lastCommand - mCommandBegin));
+ resize(arena, uint32_t(lastCommand - mCommandBegin));
}
assert_invariant(stagingBuffer == nullptr);
@@ -323,7 +381,7 @@ void RenderPass::instanceify(FEngine& engine) noexcept {
/* static */
-UTILS_ALWAYS_INLINE // this function exists only to make the code more readable. we want it inlined.
+UTILS_ALWAYS_INLINE // This function exists only to make the code more readable. we want it inlined.
inline // and we don't need it in the compilation unit
void RenderPass::setupColorCommand(Command& cmdDraw, Variant variant,
FMaterialInstance const* const UTILS_RESTRICT mi, bool inverseFrontFaces) noexcept {
@@ -374,7 +432,7 @@ void RenderPass::setupColorCommand(Command& cmdDraw, Variant variant,
/* static */
UTILS_NOINLINE
-void RenderPass::generateCommands(uint32_t commandTypeFlags, Command* const commands,
+void RenderPass::generateCommands(CommandTypeFlags commandTypeFlags, Command* const commands,
FScene::RenderableSoa const& soa, Range range,
Variant variant, RenderFlags renderFlags,
FScene::VisibleMaskType visibilityMask, float3 cameraPosition, float3 cameraForward,
@@ -432,9 +490,9 @@ void RenderPass::generateCommands(uint32_t commandTypeFlags, Command* const comm
}
/* static */
-template
+template
UTILS_NOINLINE
-RenderPass::Command* RenderPass::generateCommandsImpl(uint32_t extraFlags,
+RenderPass::Command* RenderPass::generateCommandsImpl(RenderPass::CommandTypeFlags extraFlags,
Command* UTILS_RESTRICT curr,
FScene::RenderableSoa const& UTILS_RESTRICT soa, Range range,
Variant const variant, RenderFlags renderFlags, FScene::VisibleMaskType visibilityMask,
@@ -737,13 +795,13 @@ void RenderPass::updateSummedPrimitiveCounts(
// ------------------------------------------------------------------------------------------------
void RenderPass::Executor::overridePolygonOffset(backend::PolygonOffset const* polygonOffset) noexcept {
- if ((mPolygonOffsetOverride = (polygonOffset != nullptr))) {
+ if ((mPolygonOffsetOverride = (polygonOffset != nullptr))) { // NOLINT(*-assignment-in-if-condition)
mPolygonOffset = *polygonOffset;
}
}
void RenderPass::Executor::overrideScissor(backend::Viewport const* scissor) noexcept {
- if ((mScissorOverride = (scissor != nullptr))) {
+ if ((mScissorOverride = (scissor != nullptr))) { // NOLINT(*-assignment-in-if-condition)
mScissor = *scissor;
}
}
@@ -754,15 +812,20 @@ void RenderPass::Executor::overrideScissor(backend::Viewport const& scissor) noe
}
void RenderPass::Executor::execute(FEngine& engine, const char*) const noexcept {
- execute(engine.getDriverApi(), mCommands.begin(), mCommands.end());
+ execute(engine, mCommands.begin(), mCommands.end());
}
UTILS_NOINLINE // no need to be inlined
-void RenderPass::Executor::execute(backend::DriverApi& driver,
+void RenderPass::Executor::execute(FEngine& engine,
const Command* first, const Command* last) const noexcept {
+
SYSTRACE_CALL();
SYSTRACE_CONTEXT();
+ DriverApi& driver = engine.getDriverApi();
+ size_t const capacity = engine.getMinCommandBufferSize();
+ CircularBuffer const& circularBuffer = driver.getCircularBuffer();
+
if (first != last) {
SYSTRACE_VALUE32("commandCount", last - first);
@@ -781,126 +844,163 @@ void RenderPass::Executor::execute(backend::DriverApi& driver,
FMaterial const* UTILS_RESTRICT ma = nullptr;
auto const* UTILS_RESTRICT pCustomCommands = mCustomCommands.data();
- first--;
- while (++first != last) {
- assert_invariant(first->key != uint64_t(Pass::SENTINEL));
-
- /*
- * Be careful when changing code below, this is the hot inner-loop
- */
-
- if (UTILS_UNLIKELY((first->key & CUSTOM_MASK) != uint64_t(CustomCommand::PASS))) {
- mi = nullptr; // custom command could change the currently bound MaterialInstance
- uint32_t const index = (first->key & CUSTOM_INDEX_MASK) >> CUSTOM_INDEX_SHIFT;
- assert_invariant(index < mCustomCommands.size());
- pCustomCommands[index]();
- continue;
+ // Maximum space occupied in the CircularBuffer by a single `Command`. This must be
+ // reevaluated when the inner loop below adds DriverApi commands or when we change the
+ // CommandStream protocol. Currently, the maximum is 240 bytes, and we use 256 to be on
+ // the safer side.
+ size_t const maxCommandSizeInBytes = 256;
+
+ // Number of Commands that can be issued and guaranteed to fit in the current
+ // CircularBuffer allocation. In practice, we'll have tons of headroom especially if
+ // skinning and morphing aren't used. With a 2 MiB buffer (the default) a batch is
+ // 8192 commands (i.e. draw calls).
+ size_t const batchCommandCount = capacity / maxCommandSizeInBytes;
+ while(first != last) {
+ Command const* const batchLast = std::min(first + batchCommandCount, last);
+
+ // actual number of commands we need to write (can be smaller than batchCommandCount)
+ size_t const commandCount = batchLast - first;
+ size_t const commandSizeInBytes = commandCount * maxCommandSizeInBytes;
+
+ // check we have enough capacity to write these commandCount commands, if not,
+ // request a new CircularBuffer allocation of `capacity` bytes.
+ if (UTILS_UNLIKELY(circularBuffer.getUsed() > capacity - commandSizeInBytes)) {
+ engine.flush(); // TODO: we should use a "fast" flush if possible
}
- // primitiveHandle may be invalid if no geometry was set on the renderable.
- if (UTILS_UNLIKELY(!first->primitive.primitiveHandle)) {
- continue;
- }
+ first--;
+ while (++first != batchLast) {
+ assert_invariant(first->key != uint64_t(Pass::SENTINEL));
- // per-renderable uniform
- const PrimitiveInfo info = first->primitive;
- pipeline.rasterState = info.rasterState;
-
- if (UTILS_UNLIKELY(mi != info.mi)) {
- // this is always taken the first time
- mi = info.mi;
- ma = mi->getMaterial();
-
- auto const& scissor = mi->getScissor();
- if (UTILS_UNLIKELY(mi->hasScissor())) {
- // scissor is set, we need to apply the offset/clip
- // clang vectorizes this!
- constexpr int32_t maxvali = std::numeric_limits::max();
- const backend::Viewport scissorViewport = mScissorViewport;
- // compute new left/bottom, assume no overflow
- int32_t l = scissor.left + scissorViewport.left;
- int32_t b = scissor.bottom + scissorViewport.bottom;
- // compute right/top without overflowing, scissor.width/height guaranteed
- // to convert to int32
- int32_t r = (l > maxvali - int32_t(scissor.width)) ?
- maxvali : l + int32_t(scissor.width);
- int32_t t = (b > maxvali - int32_t(scissor.height)) ?
- maxvali : b + int32_t(scissor.height);
- // clip to the viewport
- l = std::max(l, scissorViewport.left);
- b = std::max(b, scissorViewport.bottom);
- r = std::min(r, scissorViewport.left + int32_t(scissorViewport.width));
- t = std::min(t, scissorViewport.bottom + int32_t(scissorViewport.height));
- assert_invariant(r >= l && t >= b);
- *pScissor = { l, b, uint32_t(r - l), uint32_t(t - b) };
- } else {
- // no scissor set (common case), 'scissor' has its default value, use that.
- *pScissor = scissor;
+ /*
+ * Be careful when changing code below, this is the hot inner-loop
+ */
+
+ if (UTILS_UNLIKELY((first->key & CUSTOM_MASK) != uint64_t(CustomCommand::PASS))) {
+ mi = nullptr; // custom command could change the currently bound MaterialInstance
+ uint32_t const index = (first->key & CUSTOM_INDEX_MASK) >> CUSTOM_INDEX_SHIFT;
+ assert_invariant(index < mCustomCommands.size());
+ pCustomCommands[index]();
+ continue;
}
- *pPipelinePolygonOffset = mi->getPolygonOffset();
- pipeline.stencilState = mi->getStencilState();
- mi->use(driver);
- }
+ // primitiveHandle may be invalid if no geometry was set on the renderable.
+ if (UTILS_UNLIKELY(!first->primitive.primitiveHandle)) {
+ continue;
+ }
- pipeline.program = ma->getProgram(info.materialVariant);
+ // per-renderable uniform
+ const PrimitiveInfo info = first->primitive;
+ pipeline.rasterState = info.rasterState;
+
+ if (UTILS_UNLIKELY(mi != info.mi)) {
+ // this is always taken the first time
+ mi = info.mi;
+ assert_invariant(mi);
+
+ ma = mi->getMaterial();
+
+ auto const& scissor = mi->getScissor();
+ if (UTILS_UNLIKELY(mi->hasScissor())) {
+ // scissor is set, we need to apply the offset/clip
+ // clang vectorizes this!
+ constexpr int32_t maxvali = std::numeric_limits::max();
+ const backend::Viewport scissorViewport = mScissorViewport;
+ // compute new left/bottom, assume no overflow
+ int32_t l = scissor.left + scissorViewport.left;
+ int32_t b = scissor.bottom + scissorViewport.bottom;
+ // compute right/top without overflowing, scissor.width/height guaranteed
+ // to convert to int32
+ int32_t r = (l > maxvali - int32_t(scissor.width)) ?
+ maxvali : l + int32_t(scissor.width);
+ int32_t t = (b > maxvali - int32_t(scissor.height)) ?
+ maxvali : b + int32_t(scissor.height);
+ // clip to the viewport
+ l = std::max(l, scissorViewport.left);
+ b = std::max(b, scissorViewport.bottom);
+ r = std::min(r, scissorViewport.left + int32_t(scissorViewport.width));
+ t = std::min(t, scissorViewport.bottom + int32_t(scissorViewport.height));
+ assert_invariant(r >= l && t >= b);
+ *pScissor = { l, b, uint32_t(r - l), uint32_t(t - b) };
+ } else {
+ // no scissor set (common case), 'scissor' has its default value, use that.
+ *pScissor = scissor;
+ }
+
+ *pPipelinePolygonOffset = mi->getPolygonOffset();
+ pipeline.stencilState = mi->getStencilState();
+ mi->use(driver);
+ }
- uint16_t const instanceCount = info.instanceCount & PrimitiveInfo::INSTANCE_COUNT_MASK;
- auto getPerObjectUboHandle =
- [this, &info, &instanceCount]() -> std::pair, uint32_t> {
- if (info.instanceBufferHandle) {
- // "hybrid" instancing -- instanceBufferHandle takes the place of the UBO
- return { info.instanceBufferHandle, 0 };
+ assert_invariant(ma);
+ pipeline.program = ma->getProgram(info.materialVariant);
+
+ uint16_t const instanceCount =
+ info.instanceCount & PrimitiveInfo::INSTANCE_COUNT_MASK;
+ auto getPerObjectUboHandle =
+ [this, &info, &instanceCount]() -> std::pair, uint32_t> {
+ if (info.instanceBufferHandle) {
+ // "hybrid" instancing -- instanceBufferHandle takes the place of the UBO
+ return { info.instanceBufferHandle, 0 };
+ }
+ bool const userInstancing =
+ (info.instanceCount & PrimitiveInfo::USER_INSTANCE_MASK) != 0u;
+ if (!userInstancing && instanceCount > 1) {
+ // automatic instancing
+ return {
+ mInstancedUboHandle,
+ info.index * sizeof(PerRenderableData) };
+ } else {
+ // manual instancing
+ return { mUboHandle, info.index * sizeof(PerRenderableData) };
+ }
+ };
+
+ // Bind per-renderable uniform block. There is no need to attempt to skip this command
+ // because the backends already do this.
+ auto const [perObjectUboHandle, offset] = getPerObjectUboHandle();
+ assert_invariant(perObjectUboHandle);
+ driver.bindBufferRange(BufferObjectBinding::UNIFORM,
+ +UniformBindingPoints::PER_RENDERABLE,
+ perObjectUboHandle,
+ offset,
+ sizeof(PerRenderableUib));
+
+ if (UTILS_UNLIKELY(info.skinningHandle)) {
+ // note: we can't bind less than sizeof(PerRenderableBoneUib) due to glsl limitations
+ driver.bindBufferRange(BufferObjectBinding::UNIFORM,
+ +UniformBindingPoints::PER_RENDERABLE_BONES,
+ info.skinningHandle,
+ info.skinningOffset * sizeof(PerRenderableBoneUib::BoneData),
+ sizeof(PerRenderableBoneUib));
+ // note: always bind the skinningTexture because the shader needs it.
+ driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_SKINNING,
+ info.skinningTexture);
+ // note: even if only skinning is enabled, binding morphTargetBuffer is needed.
+ driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_MORPHING,
+ info.morphTargetBuffer);
}
- bool const userInstancing =
- (info.instanceCount & PrimitiveInfo::USER_INSTANCE_MASK) != 0u;
- if (!userInstancing && instanceCount > 1) {
- // automatic instancing
- return { mInstancedUboHandle, info.index * sizeof(PerRenderableData) };
- } else {
- // manual instancing
- return { mUboHandle, info.index * sizeof(PerRenderableData) };
+
+ if (UTILS_UNLIKELY(info.morphWeightBuffer)) {
+ // Instead of using a UBO per primitive, we could also have a single UBO for all
+ // primitives and use bindUniformBufferRange which might be more efficient.
+ driver.bindUniformBuffer(+UniformBindingPoints::PER_RENDERABLE_MORPHING,
+ info.morphWeightBuffer);
+ driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_MORPHING,
+ info.morphTargetBuffer);
+ // note: even if only morphing is enabled, binding skinningTexture is needed.
+ driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_SKINNING,
+ info.skinningTexture);
}
- };
-
- // bind per-renderable uniform block. there is no need to attempt to skip this command
- // because the backends already do this.
- auto const [perObjectUboHandle, offset] = getPerObjectUboHandle();
- assert_invariant(perObjectUboHandle);
- driver.bindBufferRange(BufferObjectBinding::UNIFORM,
- +UniformBindingPoints::PER_RENDERABLE,
- perObjectUboHandle,
- offset,
- sizeof(PerRenderableUib));
-
- if (UTILS_UNLIKELY(info.skinningHandle)) {
- // note: we can't bind less than sizeof(PerRenderableBoneUib) due to glsl limitations
- driver.bindBufferRange(BufferObjectBinding::UNIFORM,
- +UniformBindingPoints::PER_RENDERABLE_BONES,
- info.skinningHandle,
- info.skinningOffset * sizeof(PerRenderableBoneUib::BoneData),
- sizeof(PerRenderableBoneUib));
- // note: always bind the skinningTexture because the shader needs it.
- driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_SKINNING,
- info.skinningTexture);
- // note: even if only skinning is enabled, binding morphTargetBuffer is needed.
- driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_MORPHING,
- info.morphTargetBuffer);
- }
-
- if (UTILS_UNLIKELY(info.morphWeightBuffer)) {
- // Instead of using a UBO per primitive, we could also have a single UBO for all
- // primitives and use bindUniformBufferRange which might be more efficient.
- driver.bindUniformBuffer(+UniformBindingPoints::PER_RENDERABLE_MORPHING,
- info.morphWeightBuffer);
- driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_MORPHING,
- info.morphTargetBuffer);
- // note: even if only morphing is enabled, binding skinningTexture is needed.
- driver.bindSamplers(+SamplerBindingPoints::PER_RENDERABLE_SKINNING,
- info.skinningTexture);
+
+ driver.draw(pipeline, info.primitiveHandle, instanceCount);
}
+ }
- driver.draw(pipeline, info.primitiveHandle, instanceCount);
+ // If the remaining space is less than half the capacity, we flush right away to
+ // allow some headroom for commands that might come later.
+ if (UTILS_UNLIKELY(circularBuffer.getUsed() > capacity / 2)) {
+ engine.flush();
}
}
diff --git a/filament/src/RenderPass.h b/filament/src/RenderPass.h
index 4474079594f..646171efd58 100644
--- a/filament/src/RenderPass.h
+++ b/filament/src/RenderPass.h
@@ -22,26 +22,38 @@
#include "details/Camera.h"
#include "details/Scene.h"
-#include "backend/DriverApiForward.h"
-
-#include
+#include "private/filament/Variant.h"
+#include "utils/BitmaskEnum.h"
#include
#include
#include
#include
+#include
#include
-#include
#include
+#include