From c64f295e8133d0a7241b526e3ee9e79eb4f4f147 Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Fri, 16 Feb 2024 15:56:28 -0300
Subject: [PATCH 01/12] Add sub-allocated descriptor set example

---
 67_SubAllocatedDescriptorSet/CMakeLists.txt   |  24 +++
 .../app_resources/common.hlsl                 |  20 ++
 .../app_resources/shader.comp.hlsl            |  33 +++
 .../config.json.template                      |  28 +++
 67_SubAllocatedDescriptorSet/main.cpp         | 191 ++++++++++++++++++
 67_SubAllocatedDescriptorSet/pipeline.groovy  |  50 +++++
 CMakeLists.txt                                |   1 +
 7 files changed, 347 insertions(+)
 create mode 100644 67_SubAllocatedDescriptorSet/CMakeLists.txt
 create mode 100644 67_SubAllocatedDescriptorSet/app_resources/common.hlsl
 create mode 100644 67_SubAllocatedDescriptorSet/app_resources/shader.comp.hlsl
 create mode 100644 67_SubAllocatedDescriptorSet/config.json.template
 create mode 100644 67_SubAllocatedDescriptorSet/main.cpp
 create mode 100644 67_SubAllocatedDescriptorSet/pipeline.groovy

diff --git a/67_SubAllocatedDescriptorSet/CMakeLists.txt b/67_SubAllocatedDescriptorSet/CMakeLists.txt
new file mode 100644
index 000000000..bc1624875
--- /dev/null
+++ b/67_SubAllocatedDescriptorSet/CMakeLists.txt
@@ -0,0 +1,24 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/67_SubAllocatedDescriptorSet/app_resources/common.hlsl b/67_SubAllocatedDescriptorSet/app_resources/common.hlsl
new file mode 100644
index 000000000..456dc6740
--- /dev/null
+++ b/67_SubAllocatedDescriptorSet/app_resources/common.hlsl
@@ -0,0 +1,20 @@
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+// Unfortunately not every piece of C++14 metaprogramming syntax is available in HLSL 202x
+// https://github.com/microsoft/DirectXShaderCompiler/issues/5751#issuecomment-1800847954
+typedef nbl::hlsl::float32_t3 input_t;
+typedef nbl::hlsl::float32_t output_t;
+
+NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxPossibleElementCount = 1 << 20;
+
+struct PushConstantData
+{
+	uint64_t inputAddress;
+	uint64_t outputAddress;
+	uint32_t dataElementCount;
+};
+
+NBL_CONSTEXPR uint32_t WorkgroupSize = 256;
+
+// Yes we do have our own re-creation of C++'s STL in HLSL2021 !
+#include "nbl/builtin/hlsl/limits.hlsl"
\ No newline at end of file
diff --git a/67_SubAllocatedDescriptorSet/app_resources/shader.comp.hlsl b/67_SubAllocatedDescriptorSet/app_resources/shader.comp.hlsl
new file mode 100644
index 000000000..4aeef0e0f
--- /dev/null
+++ b/67_SubAllocatedDescriptorSet/app_resources/shader.comp.hlsl
@@ -0,0 +1,33 @@
+#include "common.hlsl"
+
+// just a small test
+#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+
+[[vk::push_constant]] PushConstantData pushConstants;
+
+// does absolutely nothing, a later example will show how it gets used
+template<typename capability_traits=nbl::hlsl::jit::device_capabilities_traits>
+void dummyTraitTest() {}
+
+[numthreads(WorkgroupSize,1,1)]
+void main(uint32_t3 ID : SV_DispatchThreadID)
+{
+	dummyTraitTest();
+	if (ID.x>=pushConstants.dataElementCount)
+		return;
+
+	const input_t self = vk::RawBufferLoad<input_t>(pushConstants.inputAddress+sizeof(input_t)*ID.x);
+
+	nbl::hlsl::Xoroshiro64StarStar rng = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(pushConstants.dataElementCount,ID.x)^0xdeadbeefu);
+
+	float32_t acc = nbl::hlsl::numeric_limits<float32_t>::max;
+	const static uint32_t OthersToTest = 15;
+	[[unroll(OthersToTest)]]
+	for (uint32_t i=0; i<OthersToTest; i++)
+	{
+		const uint32_t offset = rng() % pushConstants.dataElementCount;
+		const input_t other = vk::RawBufferLoad<input_t>(pushConstants.inputAddress+sizeof(input_t)*offset);
+		acc = min(length(other-self),acc);
+	}
+	vk::RawBufferStore<float32_t>(pushConstants.outputAddress+sizeof(float32_t)*ID.x,acc);
+}
\ No newline at end of file
diff --git a/67_SubAllocatedDescriptorSet/config.json.template b/67_SubAllocatedDescriptorSet/config.json.template
new file mode 100644
index 000000000..717d05d53
--- /dev/null
+++ b/67_SubAllocatedDescriptorSet/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan", // should be none
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release", // we also need to run in Debug and RWDI because foundational example
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/67_SubAllocatedDescriptorSet/main.cpp b/67_SubAllocatedDescriptorSet/main.cpp
new file mode 100644
index 000000000..fe021929b
--- /dev/null
+++ b/67_SubAllocatedDescriptorSet/main.cpp
@@ -0,0 +1,191 @@
+// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+
+#include "nbl/video/surface/CSurfaceVulkan.h"
+#include "nbl/video/alloc/SubAllocatedDescriptorSet.h"
+
+#include "../common/BasicMultiQueueApplication.hpp"
+#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace ui;
+using namespace asset;
+using namespace video;
+
+#include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/bit.hlsl"
+
+// In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants 
+class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
+{
+		using device_base_t = examples::MonoDeviceApplication;
+		using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
+
+		// The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished.
+		// Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools.
+		smart_refctd_ptr<nbl::video::ICommandPoolCache> m_poolCache;
+
+		smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet<core::GeneralpurposeAddressAllocator<uint32_t>>> m_subAllocDescriptorSet;
+
+		// This example really lets the advantages of a timeline semaphore shine through!
+		smart_refctd_ptr<ISemaphore> m_timeline;
+		uint64_t m_iteration = 0;
+		constexpr static inline uint64_t MaxIterations = 200;
+
+		constexpr static inline uint32_t MaxDescriptorSetAllocationAlignment = 64u*1024u; // if you need larger alignments then you're not right in the head
+		constexpr static inline uint32_t MinDescriptorSetAllocationSize = 1u;
+
+	public:
+		// Yay thanks to multiple inheritance we cannot forward ctors anymore
+		SubAllocatedDescriptorSetApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+			system::IApplicationFramework(_localInputCWD,_localOutputCWD,_sharedInputCWD,_sharedOutputCWD) {}
+
+		// we stuff all our work here because its a "single shot" app
+		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		{
+			using nbl::video::IGPUDescriptorSetLayout;
+
+			// Remember to call the base class initialization!
+			if (!device_base_t::onAppInitialized(std::move(system)))
+				return false;
+			if (!asset_base_t::onAppInitialized(std::move(system)))
+				return false;
+
+
+			// We'll allow subsequent iterations to overlap each other on the GPU, the only limiting factors are
+			// the amount of memory in the streaming buffers and the number of commandpools we can use simultaneously.
+			constexpr auto MaxConcurrency = 64;
+
+			// Since this time we don't throw the Command Pools away and we'll reset them instead, we don't create the pools with the transient flag
+			m_poolCache = ICommandPoolCache::create(core::smart_refctd_ptr(m_device),getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::NONE,MaxConcurrency);
+
+			// In contrast to fences, we just need one semaphore to rule all dispatches
+			m_timeline = m_device->createSemaphore(m_iteration);
+
+			// Descriptor set sub allocator
+
+			video::IGPUDescriptorSetLayout::SBinding bindings[1];
+			{
+				bindings[0].binding = 0;
+				bindings[0].count = 65536u;
+				bindings[0].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) 
+					| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT 
+					| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT;
+				bindings[0].type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE;
+				bindings[0].stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE;
+			}
+
+			std::span<video::IGPUDescriptorSetLayout::SBinding> bindingsSpan(bindings);
+
+			// TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1)
+			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet<core::GeneralpurposeAddressAllocator<uint32_t>>>(
+				bindings, MaxDescriptorSetAllocationAlignment, MinDescriptorSetAllocationSize
+			);
+
+			std::vector<uint32_t> allocation, size;
+			{
+				for (uint32_t i = 0; i < 512; i++)
+				{
+					allocation.push_back(core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address);
+					size.push_back(4);
+				}
+				subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]);
+				for (uint32_t i = 0; i < allocation.size(); i++)
+				{
+					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]);
+					assert(allocation[i] != core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address);
+				}
+			}
+			{
+				std::vector<uint32_t> addr, freeSize;
+				for (uint32_t i = 0; i < 512; i+=2)
+				{
+					addr.push_back(allocation[i]);
+					freeSize.push_back(4);
+				}
+				subAllocatedDescriptorSet->multi_deallocate(addr.size(), &addr[0], &freeSize[0]);
+			}
+
+			m_logger->log("Freed some allocations", system::ILogger::ELL_INFO);
+			allocation.clear();
+			size.clear();
+			{
+				for (uint32_t i = 0; i < 512; i++)
+				{
+					allocation.push_back(core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address);
+					size.push_back(2);
+				}
+				subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]);
+				for (uint32_t i = 0; i < allocation.size(); i++)
+				{
+					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]);
+					assert(allocation[i] != core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address);
+				}
+			}
+			
+			return true;
+		}
+
+		// Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script)
+		bool keepRunning() override { return m_iteration<MaxIterations; }
+
+		// Finally the first actual work-loop
+		void workLoopBody() override
+		{
+			IQueue* const queue = getComputeQueue();
+
+			// Obtain our command pool once one gets recycled
+			uint32_t poolIx;
+			do
+			{
+				poolIx = m_poolCache->acquirePool();
+			} while (poolIx==ICommandPoolCache::invalid_index);
+
+			smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+			{
+				m_poolCache->getPool(poolIx)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1},core::smart_refctd_ptr(m_logger));
+				// lets record, its still a one time submit because we have to re-record with different push constants each time
+				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+				// COMMAND RECORDING
+
+				auto result = cmdbuf->end();
+				assert(result);
+			}
+
+
+			const auto savedIterNum = m_iteration++;
+			{
+				const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo =
+				{
+					.cmdbuf = cmdbuf.get()
+				};
+				const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo =
+				{
+					.semaphore = m_timeline.get(),
+					.value = m_iteration,
+					.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+				};
+				// Generally speaking we don't need to wait on any semaphore because in this example every dispatch gets its own clean piece of memory to use
+				// from the point of view of the GPU. Implicit domain operations between Host and Device happen upon a submit and a semaphore/fence signal operation,
+				// this ensures we can touch the input and get accurate values from the output memory using the CPU before and after respectively, each submit becoming PENDING.
+				// If we actually cared about this submit seeing the memory accesses of a previous dispatch we could add a semaphore wait
+				const IQueue::SSubmitInfo submitInfo = {
+					.waitSemaphores = {},
+					.commandBuffers = {&cmdbufInfo,1},
+					.signalSemaphores = {&signalInfo,1}
+				};
+
+				queue->startCapture();
+				auto statusCode = queue->submit({ &submitInfo,1 });
+				queue->endCapture();
+				assert(statusCode == IQueue::RESULT::SUCCESS);
+			}
+		}
+};
+
+NBL_MAIN_FUNC(SubAllocatedDescriptorSetApp)
\ No newline at end of file
diff --git a/67_SubAllocatedDescriptorSet/pipeline.groovy b/67_SubAllocatedDescriptorSet/pipeline.groovy
new file mode 100644
index 000000000..1a7b043a4
--- /dev/null
+++ b/67_SubAllocatedDescriptorSet/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CStreamingAndBufferDeviceAddressBuilder extends IBuilder
+{
+	public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6a20a33a9..4a9c2b376 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,5 +65,6 @@ if(NBL_BUILD_EXAMPLES)
 	#add_subdirectory(61_UI EXCLUDE_FROM_ALL)
 	add_subdirectory(62_CAD EXCLUDE_FROM_ALL)
 	add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
+	add_subdirectory(67_SubAllocatedDescriptorSet EXCLUDE_FROM_ALL)
 	add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
 endif()
\ No newline at end of file

From 5005a4c4038953f782f5e9c0b79f25025769231d Mon Sep 17 00:00:00 2001
From: devsh <devsh.graphicsprogramming@gmail.com>
Date: Fri, 16 Feb 2024 14:08:08 +0100
Subject: [PATCH 02/12] Change the way device filtering works

---
 08_HelloSwapchain/main.cpp       | 33 +++++++++++++++++++-------------
 common/MonoDeviceApplication.hpp |  7 ++++---
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/08_HelloSwapchain/main.cpp b/08_HelloSwapchain/main.cpp
index e6581d525..89f843b18 100644
--- a/08_HelloSwapchain/main.cpp
+++ b/08_HelloSwapchain/main.cpp
@@ -17,6 +17,10 @@ class WindowedApplication : public virtual BasicMultiQueueApplication
 	public:
 		using base_t::base_t;
 
+		// We inherit from an application that tries to find Graphics and Compute queues
+		// because applications with presentable images often want to perform Graphics family operations
+		virtual bool isComputeOnly() const {return false;}
+
 		virtual video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override
 		{
 			auto retval = base_t::getAPIFeaturesToEnable();
@@ -26,22 +30,23 @@ class WindowedApplication : public virtual BasicMultiQueueApplication
 		}
 
 		// New function, we neeed to know about surfaces to create ahead of time
-		virtual core::vector<const video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const = 0;
+		virtual core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const = 0;
 
-		virtual core::set<video::IPhysicalDevice*> filterDevices(const core::SRange<video::IPhysicalDevice* const>& physicalDevices) const
+		// We have a very simple heuristic, the device must be able to render to all windows!
+		// (want to make something more complex? you're on your own!)
+		virtual void filterDevices(core::set<video::IPhysicalDevice*>& physicalDevices) const
 		{
-			const auto firstFilter = base_t::filterDevices(physicalDevices);
+			base_t::filterDevices(physicalDevices);
 
 			video::SPhysicalDeviceFilter deviceFilter = {};
 			
-			const auto surfaces = getSurfaces();
-			deviceFilter.requiredSurfaceCompatibilities = surfaces.data();
-			deviceFilter.requiredSurfaceCompatibilitiesCount = surfaces.size();
+			auto surfaces = getSurfaces();
+			deviceFilter.requiredSurfaceCompatibilities = {surfaces};
 
 			return deviceFilter(physicalDevices);
 		}
 		
-		virtual bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		virtual bool onAppInitialized(core::smart_refctd_ptr<system::ISystem>&& system) override
 		{
 			// Remember to call the base class initialization!
 			if (!base_t::onAppInitialized(std::move(system)))
@@ -52,6 +57,7 @@ class WindowedApplication : public virtual BasicMultiQueueApplication
 		#else
 			#error "Unimplemented!"
 		#endif
+			return true;
 		}
 
 		core::smart_refctd_ptr<ui::IWindowManager> m_winMgr;
@@ -87,7 +93,7 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
 	public:
 		using base_t::base_t;
 
-		virtual bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		virtual bool onAppInitialized(core::smart_refctd_ptr<system::ISystem>&& system) override
 		{
 			// Remember to call the base class initialization!
 			if (!base_t::onAppInitialized(std::move(system)))
@@ -98,7 +104,7 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
 			return true;
 		}
 
-		virtual core::vector<const video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const
+		virtual core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const
 		{
 			return {{m_surface.get()/*,EQF_NONE*/}};
 		}
@@ -112,15 +118,15 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
 		}
 
 	protected:
-		virtual IWindow::SCreationParams getWindowCreationParams() const
+		virtual ui::IWindow::SCreationParams getWindowCreationParams() const
 		{
-			IWindow::SCreationParams params = {};
-			params.callback = make_smart_refctd_ptr<IWindowClosedCallback>();
+			ui::IWindow::SCreationParams params = {};
+			params.callback = core::make_smart_refctd_ptr<IWindowClosedCallback>();
 			params.width = 640;
 			params.height = 480;
 			params.x = 32;
 			params.y = 32;
-			params.flags = IWindow::ECF_NONE;
+			params.flags = ui::IWindow::ECF_NONE;
 			params.windowCaption = "SingleNonResizableWindowApplication";
 			return params;
 		}
@@ -130,6 +136,7 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
 };
 }
 
+#include "nbl/video/CVulkanSwapchain.h"
 
 using namespace nbl;
 using namespace core;
diff --git a/common/MonoDeviceApplication.hpp b/common/MonoDeviceApplication.hpp
index ca4e6d449..64728d892 100644
--- a/common/MonoDeviceApplication.hpp
+++ b/common/MonoDeviceApplication.hpp
@@ -40,7 +40,8 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 			if (gpus.empty())
 				return logFail("Failed to find any Nabla Core Profile Vulkan devices!");
 
-			const core::set<video::IPhysicalDevice*> suitablePhysicalDevices = filterDevices(gpus);
+			core::set<video::IPhysicalDevice*> suitablePhysicalDevices(gpus.begin(),gpus.end());
+			filterDevices(suitablePhysicalDevices);
 			if (suitablePhysicalDevices.empty())
 				return logFail("No PhysicalDevice met the feature requirements of the application!");
 
@@ -78,7 +79,7 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 		}
 
 		// a device filter helps you create a set of physical devices that satisfy your requirements in terms of features, limits etc.
-		virtual core::set<video::IPhysicalDevice*> filterDevices(const core::SRange<video::IPhysicalDevice* const>& physicalDevices) const
+		virtual void filterDevices(core::set<video::IPhysicalDevice*>& physicalDevices) const
 		{
 			video::SPhysicalDeviceFilter deviceFilter = {};
 
@@ -96,7 +97,7 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 			const auto queueReqs = getQueueRequirements();
 			deviceFilter.queueRequirements = queueReqs;
 			
-			return deviceFilter(physicalDevices);
+			deviceFilter(physicalDevices);
 		}
 
 		// virtual function so you can override as needed for some example father down the line

From f18077bc181132afe687b3d3fec3783c3c272a4b Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Mon, 19 Feb 2024 16:53:39 -0300
Subject: [PATCH 03/12] Update example to match changes

---
 .../app_resources/common.hlsl                 | 20 -----------
 .../app_resources/shader.comp.hlsl            | 33 -------------------
 67_SubAllocatedDescriptorSet/main.cpp         | 11 +++----
 3 files changed, 5 insertions(+), 59 deletions(-)
 delete mode 100644 67_SubAllocatedDescriptorSet/app_resources/common.hlsl
 delete mode 100644 67_SubAllocatedDescriptorSet/app_resources/shader.comp.hlsl

diff --git a/67_SubAllocatedDescriptorSet/app_resources/common.hlsl b/67_SubAllocatedDescriptorSet/app_resources/common.hlsl
deleted file mode 100644
index 456dc6740..000000000
--- a/67_SubAllocatedDescriptorSet/app_resources/common.hlsl
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "nbl/builtin/hlsl/cpp_compat.hlsl"
-
-// Unfortunately not every piece of C++14 metaprogramming syntax is available in HLSL 202x
-// https://github.com/microsoft/DirectXShaderCompiler/issues/5751#issuecomment-1800847954
-typedef nbl::hlsl::float32_t3 input_t;
-typedef nbl::hlsl::float32_t output_t;
-
-NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxPossibleElementCount = 1 << 20;
-
-struct PushConstantData
-{
-	uint64_t inputAddress;
-	uint64_t outputAddress;
-	uint32_t dataElementCount;
-};
-
-NBL_CONSTEXPR uint32_t WorkgroupSize = 256;
-
-// Yes we do have our own re-creation of C++'s STL in HLSL2021 !
-#include "nbl/builtin/hlsl/limits.hlsl"
\ No newline at end of file
diff --git a/67_SubAllocatedDescriptorSet/app_resources/shader.comp.hlsl b/67_SubAllocatedDescriptorSet/app_resources/shader.comp.hlsl
deleted file mode 100644
index 4aeef0e0f..000000000
--- a/67_SubAllocatedDescriptorSet/app_resources/shader.comp.hlsl
+++ /dev/null
@@ -1,33 +0,0 @@
-#include "common.hlsl"
-
-// just a small test
-#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
-
-[[vk::push_constant]] PushConstantData pushConstants;
-
-// does absolutely nothing, a later example will show how it gets used
-template<typename capability_traits=nbl::hlsl::jit::device_capabilities_traits>
-void dummyTraitTest() {}
-
-[numthreads(WorkgroupSize,1,1)]
-void main(uint32_t3 ID : SV_DispatchThreadID)
-{
-	dummyTraitTest();
-	if (ID.x>=pushConstants.dataElementCount)
-		return;
-
-	const input_t self = vk::RawBufferLoad<input_t>(pushConstants.inputAddress+sizeof(input_t)*ID.x);
-
-	nbl::hlsl::Xoroshiro64StarStar rng = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(pushConstants.dataElementCount,ID.x)^0xdeadbeefu);
-
-	float32_t acc = nbl::hlsl::numeric_limits<float32_t>::max;
-	const static uint32_t OthersToTest = 15;
-	[[unroll(OthersToTest)]]
-	for (uint32_t i=0; i<OthersToTest; i++)
-	{
-		const uint32_t offset = rng() % pushConstants.dataElementCount;
-		const input_t other = vk::RawBufferLoad<input_t>(pushConstants.inputAddress+sizeof(input_t)*offset);
-		acc = min(length(other-self),acc);
-	}
-	vk::RawBufferStore<float32_t>(pushConstants.outputAddress+sizeof(float32_t)*ID.x,acc);
-}
\ No newline at end of file
diff --git a/67_SubAllocatedDescriptorSet/main.cpp b/67_SubAllocatedDescriptorSet/main.cpp
index fe021929b..398c842dc 100644
--- a/67_SubAllocatedDescriptorSet/main.cpp
+++ b/67_SubAllocatedDescriptorSet/main.cpp
@@ -16,7 +16,6 @@ using namespace ui;
 using namespace asset;
 using namespace video;
 
-#include "app_resources/common.hlsl"
 #include "nbl/builtin/hlsl/bit.hlsl"
 
 // In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants 
@@ -29,7 +28,7 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 		// Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools.
 		smart_refctd_ptr<nbl::video::ICommandPoolCache> m_poolCache;
 
-		smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet<core::GeneralpurposeAddressAllocator<uint32_t>>> m_subAllocDescriptorSet;
+		smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet> m_subAllocDescriptorSet;
 
 		// This example really lets the advantages of a timeline semaphore shine through!
 		smart_refctd_ptr<ISemaphore> m_timeline;
@@ -82,7 +81,7 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 			std::span<video::IGPUDescriptorSetLayout::SBinding> bindingsSpan(bindings);
 
 			// TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1)
-			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet<core::GeneralpurposeAddressAllocator<uint32_t>>>(
+			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(
 				bindings, MaxDescriptorSetAllocationAlignment, MinDescriptorSetAllocationSize
 			);
 
@@ -93,7 +92,7 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 					allocation.push_back(core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address);
 					size.push_back(4);
 				}
-				subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]);
+				subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), &allocation[0], &size[0]);
 				for (uint32_t i = 0; i < allocation.size(); i++)
 				{
 					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]);
@@ -107,7 +106,7 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 					addr.push_back(allocation[i]);
 					freeSize.push_back(4);
 				}
-				subAllocatedDescriptorSet->multi_deallocate(addr.size(), &addr[0], &freeSize[0]);
+				subAllocatedDescriptorSet->multi_deallocate(0, addr.size(), &addr[0], &freeSize[0]);
 			}
 
 			m_logger->log("Freed some allocations", system::ILogger::ELL_INFO);
@@ -119,7 +118,7 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 					allocation.push_back(core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address);
 					size.push_back(2);
 				}
-				subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]);
+				subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), &allocation[0], &size[0]);
 				for (uint32_t i = 0; i < allocation.size(); i++)
 				{
 					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]);

From 8dee3637262203a9a5e10b58e77f62d746133bf8 Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Wed, 21 Feb 2024 08:54:57 -0300
Subject: [PATCH 04/12] Conform to API change

---
 67_SubAllocatedDescriptorSet/main.cpp | 61 ++++++++++++---------------
 1 file changed, 27 insertions(+), 34 deletions(-)

diff --git a/67_SubAllocatedDescriptorSet/main.cpp b/67_SubAllocatedDescriptorSet/main.cpp
index 398c842dc..b1c22dcbd 100644
--- a/67_SubAllocatedDescriptorSet/main.cpp
+++ b/67_SubAllocatedDescriptorSet/main.cpp
@@ -67,62 +67,55 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 
 			// Descriptor set sub allocator
 
-			video::IGPUDescriptorSetLayout::SBinding bindings[1];
+			video::IGPUDescriptorSetLayout::SBinding bindings[12];
 			{
-				bindings[0].binding = 0;
-				bindings[0].count = 65536u;
-				bindings[0].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) 
-					| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT 
-					| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT;
-				bindings[0].type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE;
-				bindings[0].stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE;
+				for (uint32_t i = 0; i < 12; i++)
+				{
+					bindings[i].binding = i;
+					bindings[i].count = 16000;
+					bindings[i].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) 
+						| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT 
+						| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT;
+					if (i % 2 == 0) bindings[i].type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE;
+					else if (i % 2 == 1) bindings[i].type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER;
+					bindings[i].stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE;
+				}
 			}
 
 			std::span<video::IGPUDescriptorSetLayout::SBinding> bindingsSpan(bindings);
 
+			auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+
 			// TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1)
 			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(
-				bindings, MaxDescriptorSetAllocationAlignment, MinDescriptorSetAllocationSize
+				descriptorSetLayout.get(), MaxDescriptorSetAllocationAlignment, MinDescriptorSetAllocationSize
 			);
 
-			std::vector<uint32_t> allocation, size;
+			std::vector<uint32_t> allocation(128, core::PoolAddressAllocator<uint32_t>::invalid_address);
 			{
-				for (uint32_t i = 0; i < 512; i++)
-				{
-					allocation.push_back(core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address);
-					size.push_back(4);
-				}
-				subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), &allocation[0], &size[0]);
+				subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), &allocation[0]);
 				for (uint32_t i = 0; i < allocation.size(); i++)
 				{
 					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]);
-					assert(allocation[i] != core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address);
+					assert(allocation[i] != core::PoolAddressAllocator<uint32_t>::invalid_address);
 				}
 			}
 			{
-				std::vector<uint32_t> addr, freeSize;
-				for (uint32_t i = 0; i < 512; i+=2)
+				std::vector<uint32_t> addr;
+				for (uint32_t i = 0; i < allocation.size(); i+=2)
 				{
 					addr.push_back(allocation[i]);
-					freeSize.push_back(4);
 				}
-				subAllocatedDescriptorSet->multi_deallocate(0, addr.size(), &addr[0], &freeSize[0]);
+				subAllocatedDescriptorSet->multi_deallocate(0, addr.size(), &addr[0]);
 			}
-
-			m_logger->log("Freed some allocations", system::ILogger::ELL_INFO);
-			allocation.clear();
-			size.clear();
+			m_logger->log("freed half the descriptors", system::ILogger::ELL_INFO);
+			std::vector<uint32_t> allocation2(128, core::PoolAddressAllocator<uint32_t>::invalid_address);
 			{
-				for (uint32_t i = 0; i < 512; i++)
+				subAllocatedDescriptorSet->multi_allocate(0, allocation2.size(), &allocation2[0]);
+				for (uint32_t i = 0; i < allocation2.size(); i++)
 				{
-					allocation.push_back(core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address);
-					size.push_back(2);
-				}
-				subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), &allocation[0], &size[0]);
-				for (uint32_t i = 0; i < allocation.size(); i++)
-				{
-					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]);
-					assert(allocation[i] != core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address);
+					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation2[i]);
+					assert(allocation2[i] != core::PoolAddressAllocator<uint32_t>::invalid_address);
 				}
 			}
 			

From 0b805e0fbf13ebc85e7aafb28cc49fdad741932f Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Wed, 21 Feb 2024 13:50:27 -0300
Subject: [PATCH 05/12] PR reviews

---
 67_SubAllocatedDescriptorSet/main.cpp | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/67_SubAllocatedDescriptorSet/main.cpp b/67_SubAllocatedDescriptorSet/main.cpp
index b1c22dcbd..a1587ef5f 100644
--- a/67_SubAllocatedDescriptorSet/main.cpp
+++ b/67_SubAllocatedDescriptorSet/main.cpp
@@ -35,9 +35,6 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 		uint64_t m_iteration = 0;
 		constexpr static inline uint64_t MaxIterations = 200;
 
-		constexpr static inline uint32_t MaxDescriptorSetAllocationAlignment = 64u*1024u; // if you need larger alignments then you're not right in the head
-		constexpr static inline uint32_t MinDescriptorSetAllocationSize = 1u;
-
 	public:
 		// Yay thanks to multiple inheritance we cannot forward ctors anymore
 		SubAllocatedDescriptorSetApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
@@ -72,7 +69,7 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 				for (uint32_t i = 0; i < 12; i++)
 				{
 					bindings[i].binding = i;
-					bindings[i].count = 16000;
+					bindings[i].count = 512;
 					bindings[i].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) 
 						| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT 
 						| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT;
@@ -87,10 +84,7 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 			auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
 
 			// TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1)
-			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(
-				descriptorSetLayout.get(), MaxDescriptorSetAllocationAlignment, MinDescriptorSetAllocationSize
-			);
-
+			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(descriptorSetLayout.get()); 
 			std::vector<uint32_t> allocation(128, core::PoolAddressAllocator<uint32_t>::invalid_address);
 			{
 				subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), &allocation[0]);

From 461a7dae01c3c02687706fd7a0a9a20c70acdf10 Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Tue, 27 Feb 2024 17:40:11 -0300
Subject: [PATCH 06/12] Fix example: allocate descriptor set from a pool

---
 67_SubAllocatedDescriptorSet/main.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/67_SubAllocatedDescriptorSet/main.cpp b/67_SubAllocatedDescriptorSet/main.cpp
index a1587ef5f..e11faa1e5 100644
--- a/67_SubAllocatedDescriptorSet/main.cpp
+++ b/67_SubAllocatedDescriptorSet/main.cpp
@@ -83,8 +83,19 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 
 			auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
 
+			video::IDescriptorPool::SCreateInfo poolParams = {};
+			{
+				poolParams.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE)] = 512 * 6;
+				poolParams.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = 512 * 6;
+				poolParams.maxSets = 1;
+				poolParams.flags = core::bitflag(video::IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT);
+			}
+
+			auto descriptorPool = m_device->createDescriptorPool(std::move(poolParams));
+			auto descriptorSet = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(descriptorSetLayout));
+
 			// TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1)
-			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(descriptorSetLayout.get()); 
+			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(core::smart_refctd_ptr(descriptorSet)); 
 			std::vector<uint32_t> allocation(128, core::PoolAddressAllocator<uint32_t>::invalid_address);
 			{
 				subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), &allocation[0]);

From 5a94b7ef784f1aa81905cc4a05aab2adc64576ed Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Mon, 4 Mar 2024 19:16:34 -0300
Subject: [PATCH 07/12] Fix example: write image descriptors on multi_allocate

---
 67_SubAllocatedDescriptorSet/main.cpp | 54 +++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/67_SubAllocatedDescriptorSet/main.cpp b/67_SubAllocatedDescriptorSet/main.cpp
index e11faa1e5..93c7c486d 100644
--- a/67_SubAllocatedDescriptorSet/main.cpp
+++ b/67_SubAllocatedDescriptorSet/main.cpp
@@ -94,11 +94,57 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 			auto descriptorPool = m_device->createDescriptorPool(std::move(poolParams));
 			auto descriptorSet = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(descriptorSetLayout));
 
+
+			auto createImageDescriptor = [&](uint32_t width, uint32_t height)
+			{
+				auto image = m_device->createImage(nbl::video::IGPUImage::SCreationParams {
+					{
+						.type = nbl::video::IGPUImage::E_TYPE::ET_2D,
+						.samples = nbl::video::IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT,
+						.format = nbl::asset::E_FORMAT::EF_R8G8B8A8_UNORM,
+						.extent = { width, height, 1 },
+						.mipLevels = 1,
+						.arrayLayers = 1,
+						.usage = nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT 
+							| nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT
+							| nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT,
+					}, {}, nbl::video::IGPUImage::TILING::LINEAR,
+				});
+
+				auto reqs = image->getMemoryReqs();
+				reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
+				m_device->allocate(reqs, image.get());
+
+				auto imageView = m_device->createImageView(nbl::video::IGPUImageView::SCreationParams {
+					.image = image,
+						.viewType = nbl::video::IGPUImageView::E_TYPE::ET_2D,
+						.format = nbl::asset::E_FORMAT::EF_R8G8B8A8_UNORM,
+						// .subresourceRange = { nbl::video::IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT, 0, 1, 0, 1 },
+				});
+				
+				video::IGPUDescriptorSet::SDescriptorInfo descriptorInfo = {};
+                descriptorInfo.desc = imageView;
+                descriptorInfo.info.image.imageLayout = asset::IImage::LAYOUT::GENERAL;
+
+				return descriptorInfo;
+			};
+
 			// TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1)
-			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(core::smart_refctd_ptr(descriptorSet)); 
+			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(core::smart_refctd_ptr(descriptorSet), core::smart_refctd_ptr(m_device)); 
 			std::vector<uint32_t> allocation(128, core::PoolAddressAllocator<uint32_t>::invalid_address);
+			std::vector<video::IGPUDescriptorSet::SDescriptorInfo> descriptors;
+			std::vector<video::IGPUDescriptorSet::SWriteDescriptorSet> descriptorWrites(allocation.size(), video::IGPUDescriptorSet::SWriteDescriptorSet{});
+
+			for (uint32_t i = 0; i < allocation.size(); i++)
+			{
+				auto descriptorInfo = createImageDescriptor(80, 80);
+				descriptors.push_back(descriptorInfo);
+			}
+
 			{
-				subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), &allocation[0]);
+				auto allocNum = subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), descriptors.data(), descriptorWrites.data(), allocation.data());
+				assert(allocNum == 0);
+				m_device->updateDescriptorSets(descriptorWrites, {});
 				for (uint32_t i = 0; i < allocation.size(); i++)
 				{
 					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]);
@@ -116,7 +162,9 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 			m_logger->log("freed half the descriptors", system::ILogger::ELL_INFO);
 			std::vector<uint32_t> allocation2(128, core::PoolAddressAllocator<uint32_t>::invalid_address);
 			{
-				subAllocatedDescriptorSet->multi_allocate(0, allocation2.size(), &allocation2[0]);
+				auto allocNum = subAllocatedDescriptorSet->multi_allocate(0, allocation2.size(), descriptors.data(), descriptorWrites.data(), &allocation2[0]);
+				assert(allocNum == 0);
+				m_device->updateDescriptorSets(descriptorWrites, {});
 				for (uint32_t i = 0; i < allocation2.size(); i++)
 				{
 					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation2[i]);

From 2d9181d1879edf93bc9518e6d0ff4d0bce6f27f7 Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Mon, 4 Mar 2024 19:31:33 -0300
Subject: [PATCH 08/12] Fix conflict: remove merge markers in 08_HelloSwapchain

---
 08_HelloSwapchain/main.cpp | 135 -------------------------------------
 1 file changed, 135 deletions(-)

diff --git a/08_HelloSwapchain/main.cpp b/08_HelloSwapchain/main.cpp
index 0f28e2097..9585f4121 100644
--- a/08_HelloSwapchain/main.cpp
+++ b/08_HelloSwapchain/main.cpp
@@ -6,141 +6,6 @@
 
 //
 #include "nbl/video/surface/CSurfaceVulkan.h"
-<<<<<<< HEAD
-
-#include "../common/BasicMultiQueueApplication.hpp"
-
-namespace nbl::examples
-{
-// Virtual Inheritance because apps might end up doing diamond inheritance
-class WindowedApplication : public virtual BasicMultiQueueApplication
-{
-		using base_t = BasicMultiQueueApplication;
-
-	public:
-		using base_t::base_t;
-
-		// We inherit from an application that tries to find Graphics and Compute queues
-		// because applications with presentable images often want to perform Graphics family operations
-		virtual bool isComputeOnly() const {return false;}
-
-		virtual video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override
-		{
-			auto retval = base_t::getAPIFeaturesToEnable();
-			// We only support one swapchain mode, surface, the other one is Display which we have not implemented yet.
-			retval.swapchainMode = video::E_SWAPCHAIN_MODE::ESM_SURFACE;
-			return retval;
-		}
-
-		// New function, we neeed to know about surfaces to create ahead of time
-		virtual core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const = 0;
-
-		// We have a very simple heuristic, the device must be able to render to all windows!
-		// (want to make something more complex? you're on your own!)
-		virtual void filterDevices(core::set<video::IPhysicalDevice*>& physicalDevices) const
-		{
-			base_t::filterDevices(physicalDevices);
-
-			video::SPhysicalDeviceFilter deviceFilter = {};
-			
-			auto surfaces = getSurfaces();
-			deviceFilter.requiredSurfaceCompatibilities = {surfaces};
-
-			return deviceFilter(physicalDevices);
-		}
-		
-		virtual bool onAppInitialized(core::smart_refctd_ptr<system::ISystem>&& system) override
-		{
-			// Remember to call the base class initialization!
-			if (!base_t::onAppInitialized(std::move(system)))
-				return false;
-
-		#ifdef _NBL_PLATFORM_WINDOWS_
-			m_winMgr = nbl::ui::IWindowManagerWin32::create();
-		#else
-			#error "Unimplemented!"
-		#endif
-			return true;
-		}
-
-		core::smart_refctd_ptr<ui::IWindowManager> m_winMgr;
-};
-
-
-// Before we get onto creating a window, we need to discuss how Nabla handles input, clipboards and cursor control
-class IWindowClosedCallback : public virtual nbl::ui::IWindow::IEventCallback
-{
-	public:
-		IWindowClosedCallback() : m_gotWindowClosedMsg(false) {}
-
-		// unless you create a separate callback per window, both will "trip" this condition
-		bool windowGotClosed() const {return m_gotWindowClosedMsg;}
-
-	private:
-		bool onWindowClosed_impl() override
-		{
-			m_gotWindowClosedMsg = true;
-			return true;
-		}
-
-		bool m_gotWindowClosedMsg;
-};
-
-// We inherit from an application that tries to find Graphics and Compute queues
-// because applications with presentable images often want to perform Graphics family operations
-// Virtual Inheritance because apps might end up doing diamond inheritance
-class SingleNonResizableWindowApplication : public virtual WindowedApplication
-{
-		using base_t = WindowedApplication;
-
-	public:
-		using base_t::base_t;
-
-		virtual bool onAppInitialized(core::smart_refctd_ptr<system::ISystem>&& system) override
-		{
-			// Remember to call the base class initialization!
-			if (!base_t::onAppInitialized(std::move(system)))
-				return false;
-
-			m_window = m_winMgr->createWindow(getWindowCreationParams());
-			m_surface = video::CSurfaceVulkanWin32::create(core::smart_refctd_ptr(m_api),core::smart_refctd_ptr_static_cast<ui::IWindowWin32>(m_window));
-			return true;
-		}
-
-		virtual core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const
-		{
-			return {{m_surface.get()/*,EQF_NONE*/}};
-		}
-
-		virtual bool keepRunning() override
-		{
-			if (!m_window || reinterpret_cast<const IWindowClosedCallback*>(m_window->getEventCallback())->windowGotClosed())
-				return false;
-
-			return true;
-		}
-
-	protected:
-		virtual ui::IWindow::SCreationParams getWindowCreationParams() const
-		{
-			ui::IWindow::SCreationParams params = {};
-			params.callback = core::make_smart_refctd_ptr<IWindowClosedCallback>();
-			params.width = 640;
-			params.height = 480;
-			params.x = 32;
-			params.y = 32;
-			params.flags = ui::IWindow::ECF_NONE;
-			params.windowCaption = "SingleNonResizableWindowApplication";
-			return params;
-		}
-
-		core::smart_refctd_ptr<ui::IWindow> m_window;
-		core::smart_refctd_ptr<video::ISurfaceVulkan> m_surface;
-};
-}
-
-=======
->>>>>>> vulkan_1_3
 #include "nbl/video/CVulkanSwapchain.h"
 
 using namespace nbl;

From 04ca9e27dd7980d552939d74021f2790eff17622 Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Mon, 4 Mar 2024 19:37:22 -0300
Subject: [PATCH 09/12] Fix lack of onAppTerminated

---
 67_SubAllocatedDescriptorSet/main.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/67_SubAllocatedDescriptorSet/main.cpp b/67_SubAllocatedDescriptorSet/main.cpp
index 93c7c486d..87231a931 100644
--- a/67_SubAllocatedDescriptorSet/main.cpp
+++ b/67_SubAllocatedDescriptorSet/main.cpp
@@ -231,6 +231,11 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 				assert(statusCode == IQueue::RESULT::SUCCESS);
 			}
 		}
+
+		bool onAppTerminated() override
+		{
+			return device_base_t::onAppTerminated();
+		}
 };
 
 NBL_MAIN_FUNC(SubAllocatedDescriptorSetApp)
\ No newline at end of file

From ffb014e5a686fc2955d8858afda12294bf3c176f Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Mon, 4 Mar 2024 20:17:18 -0300
Subject: [PATCH 10/12] Example PR reviews

---
 67_SubAllocatedDescriptorSet/main.cpp        | 34 ++++----------------
 67_SubAllocatedDescriptorSet/pipeline.groovy |  6 ++--
 2 files changed, 10 insertions(+), 30 deletions(-)

diff --git a/67_SubAllocatedDescriptorSet/main.cpp b/67_SubAllocatedDescriptorSet/main.cpp
index 87231a931..a4352f1a2 100644
--- a/67_SubAllocatedDescriptorSet/main.cpp
+++ b/67_SubAllocatedDescriptorSet/main.cpp
@@ -3,7 +3,6 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 
-#include "nbl/video/surface/CSurfaceVulkan.h"
 #include "nbl/video/alloc/SubAllocatedDescriptorSet.h"
 
 #include "../common/BasicMultiQueueApplication.hpp"
@@ -16,50 +15,37 @@ using namespace ui;
 using namespace asset;
 using namespace video;
 
-#include "nbl/builtin/hlsl/bit.hlsl"
-
-// In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants 
 class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
 {
 		using device_base_t = examples::MonoDeviceApplication;
 		using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
 
-		// The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished.
-		// Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools.
 		smart_refctd_ptr<nbl::video::ICommandPoolCache> m_poolCache;
-
 		smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet> m_subAllocDescriptorSet;
 
-		// This example really lets the advantages of a timeline semaphore shine through!
 		smart_refctd_ptr<ISemaphore> m_timeline;
 		uint64_t m_iteration = 0;
 		constexpr static inline uint64_t MaxIterations = 200;
+		constexpr static uint32_t AllocatedBinding = 0;
 
 	public:
-		// Yay thanks to multiple inheritance we cannot forward ctors anymore
 		SubAllocatedDescriptorSetApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
 			system::IApplicationFramework(_localInputCWD,_localOutputCWD,_sharedInputCWD,_sharedOutputCWD) {}
 
-		// we stuff all our work here because its a "single shot" app
 		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 		{
 			using nbl::video::IGPUDescriptorSetLayout;
 
-			// Remember to call the base class initialization!
 			if (!device_base_t::onAppInitialized(std::move(system)))
 				return false;
 			if (!asset_base_t::onAppInitialized(std::move(system)))
 				return false;
 
 
-			// We'll allow subsequent iterations to overlap each other on the GPU, the only limiting factors are
-			// the amount of memory in the streaming buffers and the number of commandpools we can use simultaenously.
 			constexpr auto MaxConcurrency = 64;
 
-			// Since this time we don't throw the Command Pools away and we'll reset them instead, we don't create the pools with the transient flag
 			m_poolCache = ICommandPoolCache::create(core::smart_refctd_ptr(m_device),getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::NONE,MaxConcurrency);
 
-			// In contrast to fences, we just need one semaphore to rule all dispatches
 			m_timeline = m_device->createSemaphore(m_iteration);
 
 			// Descriptor set sub allocator
@@ -142,13 +128,14 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 			}
 
 			{
-				auto allocNum = subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), descriptors.data(), descriptorWrites.data(), allocation.data());
+				auto allocNum = subAllocatedDescriptorSet->multi_allocate(AllocatedBinding, allocation.size(), descriptors.data(), descriptorWrites.data(), allocation.data());
 				assert(allocNum == 0);
 				m_device->updateDescriptorSets(descriptorWrites, {});
 				for (uint32_t i = 0; i < allocation.size(); i++)
 				{
 					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]);
-					assert(allocation[i] != core::PoolAddressAllocator<uint32_t>::invalid_address);
+					if (allocation[i] == core::PoolAddressAllocator<uint32_t>::invalid_address)
+						return logFail("value at %d wasn't allocated", i);
 				}
 			}
 			{
@@ -162,28 +149,26 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 			m_logger->log("freed half the descriptors", system::ILogger::ELL_INFO);
 			std::vector<uint32_t> allocation2(128, core::PoolAddressAllocator<uint32_t>::invalid_address);
 			{
-				auto allocNum = subAllocatedDescriptorSet->multi_allocate(0, allocation2.size(), descriptors.data(), descriptorWrites.data(), &allocation2[0]);
+				auto allocNum = subAllocatedDescriptorSet->multi_allocate(AllocatedBinding, allocation2.size(), descriptors.data(), descriptorWrites.data(), &allocation2[0]);
 				assert(allocNum == 0);
 				m_device->updateDescriptorSets(descriptorWrites, {});
 				for (uint32_t i = 0; i < allocation2.size(); i++)
 				{
 					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation2[i]);
-					assert(allocation2[i] != core::PoolAddressAllocator<uint32_t>::invalid_address);
+					if (allocation2[i] == core::PoolAddressAllocator<uint32_t>::invalid_address)
+						return logFail("value at %d wasn't allocated", i);
 				}
 			}
 			
 			return true;
 		}
 
-		// Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script)
 		bool keepRunning() override { return m_iteration<MaxIterations; }
 
-		// Finally the first actual work-loop
 		void workLoopBody() override
 		{
 			IQueue* const queue = getComputeQueue();
 
-			// Obtain our command pool once one gets recycled
 			uint32_t poolIx;
 			do
 			{
@@ -193,7 +178,6 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 			smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
 			{
 				m_poolCache->getPool(poolIx)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1},core::smart_refctd_ptr(m_logger));
-				// lets record, its still a one time submit because we have to re-record with different push constants each time
 				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
 				// COMMAND RECORDING
@@ -215,10 +199,6 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 					.value = m_iteration,
 					.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
 				};
-				// Generally speaking we don't need to wait on any semaphore because in this example every dispatch gets its own clean piece of memory to use
-				// from the point of view of the GPU. Implicit domain operations between Host and Device happen upon a submit and a semaphore/fence signal operation,
-				// this ensures we can touch the input and get accurate values from the output memory using the CPU before and after respectively, each submit becoming PENDING.
-				// If we actually cared about this submit seeing the memory accesses of a previous dispatch we could add a semaphore wait
 				const IQueue::SSubmitInfo submitInfo = {
 					.waitSemaphores = {},
 					.commandBuffers = {&cmdbufInfo,1},
diff --git a/67_SubAllocatedDescriptorSet/pipeline.groovy b/67_SubAllocatedDescriptorSet/pipeline.groovy
index 1a7b043a4..4d7b41369 100644
--- a/67_SubAllocatedDescriptorSet/pipeline.groovy
+++ b/67_SubAllocatedDescriptorSet/pipeline.groovy
@@ -2,9 +2,9 @@ import org.DevshGraphicsProgramming.Agent
 import org.DevshGraphicsProgramming.BuilderInfo
 import org.DevshGraphicsProgramming.IBuilder
 
-class CStreamingAndBufferDeviceAddressBuilder extends IBuilder
+class CSubAllocatedDescriptorSetBuilder extends IBuilder
 {
-	public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info)
+	public CSubAllocatedDescriptorSetBuilder(Agent _agent, _info)
 	{
 		super(_agent, _info)
 	}
@@ -44,7 +44,7 @@ class CStreamingAndBufferDeviceAddressBuilder extends IBuilder
 
 def create(Agent _agent, _info)
 {
-	return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info)
+	return new CSubAllocatedDescriptorSetBuilder(_agent, _info)
 }
 
 return this
\ No newline at end of file

From 9b6764f885374b97c841f6f9e188505ca9d24601 Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Thu, 7 Mar 2024 23:55:09 -0300
Subject: [PATCH 11/12] Fix up example: remove descriptor writes, pass drop array on free

---
 67_SubAllocatedDescriptorSet/main.cpp | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/67_SubAllocatedDescriptorSet/main.cpp b/67_SubAllocatedDescriptorSet/main.cpp
index a4352f1a2..b7ec7bbdf 100644
--- a/67_SubAllocatedDescriptorSet/main.cpp
+++ b/67_SubAllocatedDescriptorSet/main.cpp
@@ -118,19 +118,11 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 			// TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1)
 			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(core::smart_refctd_ptr(descriptorSet), core::smart_refctd_ptr(m_device)); 
 			std::vector<uint32_t> allocation(128, core::PoolAddressAllocator<uint32_t>::invalid_address);
-			std::vector<video::IGPUDescriptorSet::SDescriptorInfo> descriptors;
-			std::vector<video::IGPUDescriptorSet::SWriteDescriptorSet> descriptorWrites(allocation.size(), video::IGPUDescriptorSet::SWriteDescriptorSet{});
+			std::vector<video::IGPUDescriptorSet::SDropDescriptorSet> descriptorDrops(allocation.size(), video::IGPUDescriptorSet::SDropDescriptorSet{});
 
-			for (uint32_t i = 0; i < allocation.size(); i++)
 			{
-				auto descriptorInfo = createImageDescriptor(80, 80);
-				descriptors.push_back(descriptorInfo);
-			}
-
-			{
-				auto allocNum = subAllocatedDescriptorSet->multi_allocate(AllocatedBinding, allocation.size(), descriptors.data(), descriptorWrites.data(), allocation.data());
+				auto allocNum = subAllocatedDescriptorSet->multi_allocate(AllocatedBinding, allocation.size(), allocation.data());
 				assert(allocNum == 0);
-				m_device->updateDescriptorSets(descriptorWrites, {});
 				for (uint32_t i = 0; i < allocation.size(); i++)
 				{
 					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]);
@@ -144,14 +136,13 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 				{
 					addr.push_back(allocation[i]);
 				}
-				subAllocatedDescriptorSet->multi_deallocate(0, addr.size(), &addr[0]);
+				subAllocatedDescriptorSet->multi_deallocate(descriptorDrops.data(), AllocatedBinding, addr.size(), addr.data());
 			}
 			m_logger->log("freed half the descriptors", system::ILogger::ELL_INFO);
 			std::vector<uint32_t> allocation2(128, core::PoolAddressAllocator<uint32_t>::invalid_address);
 			{
-				auto allocNum = subAllocatedDescriptorSet->multi_allocate(AllocatedBinding, allocation2.size(), descriptors.data(), descriptorWrites.data(), &allocation2[0]);
+				auto allocNum = subAllocatedDescriptorSet->multi_allocate(AllocatedBinding, allocation2.size(), allocation2.data());
 				assert(allocNum == 0);
-				m_device->updateDescriptorSets(descriptorWrites, {});
 				for (uint32_t i = 0; i < allocation2.size(); i++)
 				{
 					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation2[i]);

From 7906d1cac91881862aca8295bab0726f76350fec Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Tue, 12 Mar 2024 00:38:56 -0300
Subject: [PATCH 12/12] Use example with multi timeline stuff

---
 67_SubAllocatedDescriptorSet/main.cpp | 212 ++++++++++++++++++--------
 1 file changed, 145 insertions(+), 67 deletions(-)

diff --git a/67_SubAllocatedDescriptorSet/main.cpp b/67_SubAllocatedDescriptorSet/main.cpp
index b7ec7bbdf..ace25bd30 100644
--- a/67_SubAllocatedDescriptorSet/main.cpp
+++ b/67_SubAllocatedDescriptorSet/main.cpp
@@ -8,6 +8,8 @@
 #include "../common/BasicMultiQueueApplication.hpp"
 #include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
+#include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
+
 using namespace nbl;
 using namespace core;
 using namespace system;
@@ -26,12 +28,116 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 		smart_refctd_ptr<ISemaphore> m_timeline;
 		uint64_t m_iteration = 0;
 		constexpr static inline uint64_t MaxIterations = 200;
+		constexpr static inline uint64_t MaxDescriptors = 512;
+		constexpr static inline uint64_t MaxAllocPerFrame = 10;
 		constexpr static uint32_t AllocatedBinding = 0;
+		smart_refctd_ptr<IGPUImageView> m_descriptorImages[MaxDescriptors];
+		smart_refctd_ptr<IGPUBuffer> m_descriptorBuffers[MaxDescriptors];
 
 	public:
 		SubAllocatedDescriptorSetApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
 			system::IApplicationFramework(_localInputCWD,_localOutputCWD,_sharedInputCWD,_sharedOutputCWD) {}
 
+		// Creates one 256x256 storage image view per entry, keeps it alive in `m_descriptorImages[valueIndices[i]]`
+		// and writes it into binding `AllocatedBinding` at array element `allocationIndex[i]`.
+		// Returns false if any `allocationIndex[i]` is `invalid_address`, otherwise the result of the descriptor set update.
+		bool writeDescriptors(uint32_t count, uint32_t* valueIndices, uint32_t* allocationIndex)
+		{
+			auto createImageDescriptor = [&](uint32_t width, uint32_t height)
+			{
+				auto image = m_device->createImage(nbl::video::IGPUImage::SCreationParams {
+					{
+						.type = nbl::video::IGPUImage::E_TYPE::ET_2D,
+						.samples = nbl::video::IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT,
+						.format = nbl::asset::E_FORMAT::EF_R8G8B8A8_UNORM,
+						.extent = { width, height, 1 },
+						.mipLevels = 1,
+						.arrayLayers = 1,
+						.usage = nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT 
+							| nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT
+							| nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT,
+					}, {}, nbl::video::IGPUImage::TILING::LINEAR,
+				});
+
+				auto reqs = image->getMemoryReqs();
+				reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
+				m_device->allocate(reqs, image.get());
+
+				auto imageView = m_device->createImageView(nbl::video::IGPUImageView::SCreationParams {
+					.image = image,
+						.viewType = nbl::video::IGPUImageView::E_TYPE::ET_2D,
+						.format = nbl::asset::E_FORMAT::EF_R8G8B8A8_UNORM,
+						// .subresourceRange = { nbl::video::IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT, 0, 1, 0, 1 },
+				});
+
+				return imageView;
+			};
+
+			auto createBufferDescriptor = [&](uint32_t size) // only referenced by the commented-out storage buffer path below
+			{
+				nbl::video::IGPUBuffer::SCreationParams params;
+				{
+					params.size = size;
+					params.usage = nbl::video::IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_BUFFER_BIT
+						| nbl::video::IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT
+						| nbl::video::IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT;
+				}
+				auto buffer = m_device->createBuffer(std::move(params));
+
+				auto reqs = buffer->getMemoryReqs();
+				reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
+				m_device->allocate(reqs, buffer.get());
+
+				return buffer;
+			};
+
+			std::vector<video::IGPUDescriptorSet::SWriteDescriptorSet> descriptorWrites;
+			descriptorWrites.reserve(count);
+			std::vector<video::IGPUDescriptorSet::SDescriptorInfo> descriptorInfos;
+			descriptorInfos.reserve(count);
+			{
+				for (uint32_t i = 0; i < count; i++)
+				{
+					auto index = valueIndices[i];
+					m_logger->log("writeDescriptors[%d]: allocation[%d]: %d", system::ILogger::ELL_INFO, i, index, allocationIndex[i]);
+					if (allocationIndex[i] == core::PoolAddressAllocator<uint32_t>::invalid_address)
+						return logFail("value at %d wasn't allocated", i);
+
+					video::IGPUDescriptorSet::SDescriptorInfo descriptorInfo;
+
+					// Storage image
+					{
+						m_descriptorImages[index] = createImageDescriptor(256, 256);
+						descriptorInfo.desc = core::smart_refctd_ptr<IGPUImageView>(m_descriptorImages[index]);
+						descriptorInfo.info.image.imageLayout = asset::IImage::LAYOUT::GENERAL;
+					}
+					// Storage buffer
+					//{
+					//	m_descriptorBuffers[index] = createBufferDescriptor(1024);
+					//	descriptorInfo.desc = core::smart_refctd_ptr<IGPUBuffer>(m_descriptorBuffers[index]);
+					//	descriptorInfo.info.buffer.offset = 0u;
+					//	descriptorInfo.info.buffer.size = 1024u;
+					//}
+
+					descriptorInfos.push_back(descriptorInfo);
+				}
+				for (uint32_t i = 0; i < count; i++)
+				{
+					auto allocationIdx = allocationIndex[i];
+
+					video::IGPUDescriptorSet::SWriteDescriptorSet write;
+					write.dstSet = m_subAllocDescriptorSet->getDescriptorSet();
+					write.binding = AllocatedBinding;
+					write.arrayElement = allocationIdx; // write to the sub-allocated element so it matches what multi_deallocate later frees
+					write.count = 1u;
+					write.info = &descriptorInfos[i]; // pointer stays valid: descriptorInfos is not resized after the first loop
+					descriptorWrites.push_back(write);
+				}
+			}
+
+			return m_device->updateDescriptorSets(descriptorWrites, {});
+		}
+
 		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 		{
 			using nbl::video::IGPUDescriptorSetLayout;
@@ -55,7 +161,7 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 				for (uint32_t i = 0; i < 12; i++)
 				{
 					bindings[i].binding = i;
-					bindings[i].count = 512;
+					bindings[i].count = MaxDescriptors;
 					bindings[i].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) 
 						| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT 
 						| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT;
@@ -81,75 +187,21 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 			auto descriptorSet = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(descriptorSetLayout));
 
 
-			auto createImageDescriptor = [&](uint32_t width, uint32_t height)
-			{
-				auto image = m_device->createImage(nbl::video::IGPUImage::SCreationParams {
-					{
-						.type = nbl::video::IGPUImage::E_TYPE::ET_2D,
-						.samples = nbl::video::IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT,
-						.format = nbl::asset::E_FORMAT::EF_R8G8B8A8_UNORM,
-						.extent = { width, height, 1 },
-						.mipLevels = 1,
-						.arrayLayers = 1,
-						.usage = nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT 
-							| nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT
-							| nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT,
-					}, {}, nbl::video::IGPUImage::TILING::LINEAR,
-				});
-
-				auto reqs = image->getMemoryReqs();
-				reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-				m_device->allocate(reqs, image.get());
-
-				auto imageView = m_device->createImageView(nbl::video::IGPUImageView::SCreationParams {
-					.image = image,
-						.viewType = nbl::video::IGPUImageView::E_TYPE::ET_2D,
-						.format = nbl::asset::E_FORMAT::EF_R8G8B8A8_UNORM,
-						// .subresourceRange = { nbl::video::IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT, 0, 1, 0, 1 },
-				});
-				
-				video::IGPUDescriptorSet::SDescriptorInfo descriptorInfo = {};
-                descriptorInfo.desc = imageView;
-                descriptorInfo.info.image.imageLayout = asset::IImage::LAYOUT::GENERAL;
-
-				return descriptorInfo;
-			};
-
 			// TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1)
 			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(core::smart_refctd_ptr(descriptorSet), core::smart_refctd_ptr(m_device)); 
-			std::vector<uint32_t> allocation(128, core::PoolAddressAllocator<uint32_t>::invalid_address);
-			std::vector<video::IGPUDescriptorSet::SDropDescriptorSet> descriptorDrops(allocation.size(), video::IGPUDescriptorSet::SDropDescriptorSet{});
+			//std::vector<uint32_t> allocation(MaxDescriptors, core::PoolAddressAllocator<uint32_t>::invalid_address);
 
-			{
-				auto allocNum = subAllocatedDescriptorSet->multi_allocate(AllocatedBinding, allocation.size(), allocation.data());
-				assert(allocNum == 0);
-				for (uint32_t i = 0; i < allocation.size(); i++)
-				{
-					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]);
-					if (allocation[i] == core::PoolAddressAllocator<uint32_t>::invalid_address)
-						return logFail("value at %d wasn't allocated", i);
-				}
-			}
-			{
-				std::vector<uint32_t> addr;
-				for (uint32_t i = 0; i < allocation.size(); i+=2)
-				{
-					addr.push_back(allocation[i]);
-				}
-				subAllocatedDescriptorSet->multi_deallocate(descriptorDrops.data(), AllocatedBinding, addr.size(), addr.data());
-			}
-			m_logger->log("freed half the descriptors", system::ILogger::ELL_INFO);
-			std::vector<uint32_t> allocation2(128, core::PoolAddressAllocator<uint32_t>::invalid_address);
-			{
-				auto allocNum = subAllocatedDescriptorSet->multi_allocate(AllocatedBinding, allocation2.size(), allocation2.data());
-				assert(allocNum == 0);
-				for (uint32_t i = 0; i < allocation2.size(); i++)
-				{
-					m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation2[i]);
-					if (allocation2[i] == core::PoolAddressAllocator<uint32_t>::invalid_address)
-						return logFail("value at %d wasn't allocated", i);
-				}
-			}
+			//std::vector<uint32_t> indices;
+			//indices.reserve(MaxDescriptors);
+			//for (uint32_t i = 0; i < MaxDescriptors; i++)
+			//	indices.push_back(i);
+
+			//auto allocNum = subAllocatedDescriptorSet->multi_allocate(AllocatedBinding, allocation.size(), allocation.data());
+			//assert(allocNum == 0);
+			m_subAllocDescriptorSet = std::move(subAllocatedDescriptorSet);
+
+			//bool response = writeDescriptors(allocation.size(), indices.data(), allocation.data());
+			//if (!response) return false;
 			
 			return true;
 		}
@@ -160,6 +212,27 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 		{
 			IQueue* const queue = getComputeQueue();
 
+			// Similar idea to example 05 (streaming buffers)
+			// We will be allocating and freeing stuff, latched on previous frame's timeline semaphore
+			auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({ m_iteration ^ 0xdeadbeefu,std::hash<string>()(_NBL_APP_NAME_) });
+			const auto elementCount = rng() % MaxAllocPerFrame;
+			m_logger->log("elementCount: %d", system::ILogger::ELL_INFO, elementCount);
+
+			std::vector<SubAllocatedDescriptorSet::value_type> values(elementCount, SubAllocatedDescriptorSet::invalid_value);
+
+			{
+				std::chrono::steady_clock::time_point waitTill(std::chrono::years(45));
+				m_subAllocDescriptorSet->multi_allocate(waitTill, AllocatedBinding, elementCount, values.data());
+
+				std::vector<SubAllocatedDescriptorSet::value_type> indices;
+				indices.reserve(elementCount);
+				for (uint32_t i = 0; i < elementCount; i++)
+					indices.push_back(i);
+			
+				bool response = writeDescriptors(elementCount, indices.data(), values.data());
+				assert(response);
+			}
+
 			uint32_t poolIx;
 			do
 			{
@@ -172,6 +245,7 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
 				// COMMAND RECORDING
+				// Here we would hypothetically use the descriptors created above
 
 				auto result = cmdbuf->end();
 				assert(result);
@@ -201,6 +275,10 @@ class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplicatio
 				queue->endCapture();
 				assert(statusCode == IQueue::RESULT::SUCCESS);
 			}
+
+			const ISemaphore::SWaitInfo futureWait = {m_timeline.get(),m_iteration};
+			m_poolCache->releasePool(futureWait,poolIx);
+			m_subAllocDescriptorSet->multi_deallocate(AllocatedBinding, elementCount, values.data(), futureWait);
 		}
 
 		bool onAppTerminated() override