From c4ca1f6077c7fecc8df2ad2ed8719fb943b4e4b1 Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Mon, 30 Sep 2024 12:10:19 +0800 Subject: [PATCH] Update xgl from commit 980233f5 Update Khronos Vulkan Headers to 1.3.295 Remove fastBuildThreshold setting Fix missing SQTT calls in dynamic render pass functions Correct the pNewMode->palScreenMode.refreshRate Fix PAL_ASSERT in GetGraphicsPipelineSize Define vkCmdDrawIndirectCount and vkCmdDrawIndexedIndirectCount in SQTT layer Cleanup XGL warnings Add tuning profile for Rainbowsix Extraction on Navi33 Set the default value of rtEnableTopDownBuild to False Export GetSettingsBlobsAll function from XGL dynamic lib Fix an assert when patchControlPoints is set mistakenly Fix for case that tracker's SRD size may be larger than the device's native one Fix failure in dEQP-VK.pipeline.*.extended_dynamic_state.*.color_blend_att_count_0 Enable top-down build for RayTracingInVulkan Query typed/untyped buffer SRD sizes separately Replace time-date timestamp with a build id Enable AC01 fastclear for Talos of principle Add missing WriteBufferDescriptors Templates Support VK_KHR_pipeline_binary Remove NV31 PWS workarounds [RT] Refine split raytracing layer Enable workaroundStorageImageFormats for Houdini [RT] Fix TrivialBuilder UserNode SRV crash Bump up GPURT version to 48 Fix AcquireRelease for events Set EnableMortonCode30 to false Use PAL's gfxipProperties to indicate mixed signdot product Improve the refresh rate precision with PAL_CLIENT_INTERFACE_MAJOR_VERSION 894 [Talos principle] Force vgprLimit:48 to let 16 wave slots can be ran together Fix Pal assertion errors observed while running mesh shader cts testcases Clean up some macros Update PAL Version in XGL 892 Batch LoadOp Clears for Dynamic Rendering Add support for the VRS Experiment Enable AC01 fastclear for Serious Sam Add additional formats for acceleration structures to fix RT settings grayed out in some games Fix issue caused by barrier translation Remove AppProfile based 
LLPC compiler init options Cleanup shader module flags Add check for dynamic rasterization samples in quad sample setting Remove legacy pipeline hash functions Enabled InfiniteDeviceWaitIdle by default Add Disable Acceleration Structure Compaction and Check Buffer Overlaps Fix cache masks in renderpass barriers --- cmake/Modules/FindAMDBoost.cmake | 112 ---- cmake/Modules/FindAMDNinja.cmake | 48 -- cmake/Modules/XglSetupAmdGlobalRoots.cmake | 12 - cmake/XglCompileDefinitions.cmake | 58 +- cmake/XglOptions.cmake | 41 +- cmake/XglOverrides.cmake | 52 +- cmake/XglVersions.cmake | 4 +- icd/CMakeLists.txt | 10 +- icd/Loader/LunarG/Lnx/amd-icd.json | 4 +- icd/api/app_profile.cpp | 33 +- icd/api/app_resource_optimizer.cpp | 8 +- icd/api/app_shader_optimizer.cpp | 42 +- .../llpc/generic/Talos/profile.json | 3 +- .../Navi33/RainbowSixExtraction/profile.json | 54 ++ icd/api/appopt/split_raytracing_layer.cpp | 123 ++-- icd/api/compiler_solution.cpp | 25 + icd/api/compiler_solution_llpc.cpp | 50 +- icd/api/debug_printf.cpp | 15 +- icd/api/graphics_pipeline_common.cpp | 68 ++- icd/api/icd_main.cpp | 9 + icd/api/include/app_profile.h | 1 + icd/api/include/app_resource_optimizer.h | 4 +- icd/api/include/app_shader_optimizer.h | 4 +- icd/api/include/compiler_solution.h | 11 +- icd/api/include/compiler_solution_llpc.h | 5 +- icd/api/include/debug_printf.h | 4 +- icd/api/include/graphics_pipeline_common.h | 6 + .../khronos/devext/vk_amd_gpa_interface.h | 2 - .../khronos/sdk-1.3/vulkan/vulkan_core.h | 169 +++++- icd/api/include/pipeline_compiler.h | 37 +- icd/api/include/vk_buffer_view.h | 1 - icd/api/include/vk_cmdbuffer.h | 46 +- icd/api/include/vk_compute_pipeline.h | 1 + icd/api/include/vk_conv.h | 6 +- icd/api/include/vk_defines.h | 3 + icd/api/include/vk_descriptor_set.h | 6 +- .../include/vk_descriptor_update_template.h | 3 +- icd/api/include/vk_device.h | 37 +- icd/api/include/vk_extensions.h | 1 + icd/api/include/vk_formats.h | 20 +- icd/api/include/vk_graphics_pipeline.h | 
2 + .../include/vk_graphics_pipeline_library.h | 1 + icd/api/include/vk_indirect_commands_layout.h | 4 +- icd/api/include/vk_physical_device.h | 28 + icd/api/include/vk_pipeline.h | 35 ++ icd/api/include/vk_pipeline_binary.h | 106 ++++ icd/api/include/vk_pipeline_layout.h | 8 +- icd/api/include/vk_shader.h | 33 +- icd/api/pipeline_compiler.cpp | 205 ++----- icd/api/raytrace/ray_tracing_device.cpp | 55 +- icd/api/raytrace/ray_tracing_device.h | 21 +- .../raytrace/vk_acceleration_structure.cpp | 18 + icd/api/raytrace/vk_ray_tracing_pipeline.cpp | 144 ++++- icd/api/raytrace/vk_ray_tracing_pipeline.h | 9 +- icd/api/renderpass/renderpass_builder.cpp | 23 +- icd/api/renderpass/renderpass_types.h | 3 +- icd/api/sqtt/sqtt_layer.cpp | 52 +- icd/api/sqtt/sqtt_rgp_annotations.h | 4 + icd/api/strings/entry_points.txt | 6 + icd/api/strings/extensions.txt | 1 + icd/api/vk_buffer.cpp | 11 +- icd/api/vk_buffer_view.cpp | 19 +- icd/api/vk_cmdbuffer.cpp | 545 +++++++++++------ icd/api/vk_cmdbuffer_transfer.cpp | 28 +- icd/api/vk_compute_pipeline.cpp | 88 ++- icd/api/vk_conv.cpp | 24 +- icd/api/vk_descriptor_buffer.cpp | 13 +- icd/api/vk_descriptor_set.cpp | 130 ++-- icd/api/vk_descriptor_set_layout.cpp | 6 +- icd/api/vk_descriptor_update_template.cpp | 84 ++- icd/api/vk_device.cpp | 135 ++++- icd/api/vk_dispatch.cpp | 10 +- icd/api/vk_gpa_session.cpp | 6 +- icd/api/vk_graphics_pipeline.cpp | 161 ++++- icd/api/vk_graphics_pipeline_library.cpp | 97 +++ icd/api/vk_indirect_commands_layout.cpp | 22 +- icd/api/vk_memory.cpp | 3 +- icd/api/vk_physical_device.cpp | 337 +++++++---- icd/api/vk_pipeline.cpp | 83 ++- icd/api/vk_pipeline_binary.cpp | 558 ++++++++++++++++++ icd/api/vk_pipeline_layout.cpp | 60 +- icd/api/vk_query.cpp | 10 +- icd/api/vk_shader.cpp | 3 +- icd/api/vk_utils.cpp | 5 +- icd/imported/gputexdecoder/gpuTexDecoder.cpp | 2 +- icd/make/amdicd.so.def | 1 + icd/res/ver.h | 4 +- icd/settings/settings.cpp | 155 ++--- icd/settings/settings_xgl.json | 60 +- 
icd/tools/generate/shaderProfileTemplate.py | 13 +- 90 files changed, 3182 insertions(+), 1387 deletions(-) delete mode 100644 cmake/Modules/FindAMDBoost.cmake delete mode 100644 cmake/Modules/FindAMDNinja.cmake create mode 100644 icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi33/RainbowSixExtraction/profile.json create mode 100644 icd/api/include/vk_pipeline_binary.h create mode 100644 icd/api/vk_pipeline_binary.cpp diff --git a/cmake/Modules/FindAMDBoost.cmake b/cmake/Modules/FindAMDBoost.cmake deleted file mode 100644 index 0099c468..00000000 --- a/cmake/Modules/FindAMDBoost.cmake +++ /dev/null @@ -1,112 +0,0 @@ -## - ####################################################################################################################### - # - # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. - # - # Permission is hereby granted, free of charge, to any person obtaining a copy - # of this software and associated documentation files (the "Software"), to deal - # in the Software without restriction, including without limitation the rights - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - # copies of the Software, and to permit persons to whom the Software is - # furnished to do so, subject to the following conditions: - # - # The above copyright notice and this permission notice shall be included in all - # copies or substantial portions of the Software. - # - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - # SOFTWARE. 
- # - ####################################################################################################################### - -# Output -# Boost_FOUND -# Boost_ROOT_DIR -# Boost_INCLUDE_DIRS -# Boost_LIBRARY_DIRS - -# CMAKE-TODO: -# There is a built in FindBoost module: https://cmake.org/cmake/help/latest/module/FindBoost.html -# But our DK version is very inconsistent and is not structured the same way. More testing required. - -if(NOT DEFINED Boost_FOUND) - if(NOT DEFINED AMDBoost_FIND_VERSION) - message(FATAL_ERROR "A version to search for must be specified.") - endif() - - if(NOT DEFINED TARGET_ARCHITECTURE_BITS) - message(FATAL_ERROR "TARGET_ARCHITECTURE_BITS must be defined.") - endif() - - if(NOT DEFINED GLOBAL_ROOT_DK_DIR) - message(FATAL_ERROR "GLOBAL_ROOT_DK_DIR must be specified.") - endif() - - set(BOOST_VER ${AMDBoost_FIND_VERSION_MAJOR}.${AMDBoost_FIND_VERSION_MINOR}.${AMDBoost_FIND_VERSION_PATCH}) - - if(MSVC) - #MSVC++ 11.0 MSVC_VERSION == 1700 (Visual Studio 2012) - #MSVC++ 12.0 MSVC_VERSION == 1800 (Visual Studio 2013) - #MSVC++ 14.0 MSVC_VERSION == 1900 (Visual Studio 2015) - if(MSVC_VERSION EQUAL 1700) - set(Boost_ROOT_DIR ${GLOBAL_ROOT_DK_DIR}/boost/${BOOST_VER}/vc11 CACHE PATH "Boost root directory.") - elseif(MSVC_VERSION GREATER_EQUAL 1800) # CMAKE-TODO: Set to GREATER_EQUAL until VS projects are supported correctly. 
- set(Boost_ROOT_DIR ${GLOBAL_ROOT_DK_DIR}/boost/${BOOST_VER}/vc12 CACHE PATH "Boost root directory.") - else() - message(FATAL_ERROR "The MSVC Version: ${MSVC_VERSION} is currently unsopported for: ${CMAKE_PARENT_LIST_FILE}") - endif() - message(STATUS "Boost Version: ${BOOST_VER} for MSVC Version: ${MSVC_VERSION}") - elseif(CMAKE_COMPILER_IS_GNUCC) - set(Boost_ROOT_DIR ${GLOBAL_ROOT_DK_DIR}/boost/${BOOST_VER}/gcc-${CMAKE_CXX_COMPILER_VERSION} CACHE PATH "Boost root directory.") - message(STATUS "Boost Version: ${BOOST_VER} for GCC Version: ${CMAKE_CXX_COMPILER_VERSION}") - endif() - mark_as_advanced(Boost_ROOT_DIR) - - message(STATUS "Boost: ${Boost_ROOT_DIR}") - -if (Boost_ROOT_DIR) - set(Boost_INCLUDE_DIRS - ${Boost_ROOT_DIR}/include - CACHE PATH "Boost include directories." - ) - mark_as_advanced(Boost_INCLUDE_DIRS) - - if(WIN32) - if(TARGET_ARCHITECTURE_BITS EQUAL 64) - set(Boost_LIBRARY_DIRS - ${Boost_ROOT_DIR}/lib/x64 - CACHE PATH "Boost library directories." - ) - elseif(TARGET_ARCHITECTURE_BITS EQUAL 32) - set(Boost_LIBRARY_DIRS - ${Boost_ROOT_DIR}/lib/x86-fastcall - CACHE PATH "Boost library directories." - ) - endif() - elseif(UNIX) - if(TARGET_ARCHITECTURE_BITS EQUAL 64) - set(Boost_LIBRARY_DIRS - ${Boost_ROOT_DIR}/lib/x64-fPIC - CACHE PATH "Boost library directories." - ) - elseif(TARGET_ARCHITECTURE_BITS EQUAL 32) - set(Boost_LIBRARY_DIRS - ${Boost_ROOT_DIR}/lib/x86-fPIC - CACHE PATH "Boost library directories." 
- ) - endif() - endif() - mark_as_advanced(Boost_LIBRARY_DIRS) - - set(Boost_FOUND 1) - else() - set(Boost_FOUND 0) - endif() - - set(Boost_FOUND ${Boost_FOUND} CACHE STRING "Was Boost found?") - mark_as_advanced(Boost_FOUND) -endif() diff --git a/cmake/Modules/FindAMDNinja.cmake b/cmake/Modules/FindAMDNinja.cmake deleted file mode 100644 index a7a736dd..00000000 --- a/cmake/Modules/FindAMDNinja.cmake +++ /dev/null @@ -1,48 +0,0 @@ -## - ####################################################################################################################### - # - # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. - # - # Permission is hereby granted, free of charge, to any person obtaining a copy - # of this software and associated documentation files (the "Software"), to deal - # in the Software without restriction, including without limitation the rights - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - # copies of the Software, and to permit persons to whom the Software is - # furnished to do so, subject to the following conditions: - # - # The above copyright notice and this permission notice shall be included in all - # copies or substantial portions of the Software. - # - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - # SOFTWARE. 
- # - ####################################################################################################################### - -# Output -# Ninja_FOUND -# Ninja_DIR -# Ninja_EXECUTABLE - -if(NOT DEFINED Ninja_FOUND) - if(NOT DEFINED AMDNinja_FIND_VERSION) - message(FATAL_ERROR "A version to search for must be specified.") - endif() - if(NOT DEFINED GLOBAL_ROOT_DK_DIR) - message(FATAL_ERROR "GLOBAL_ROOT_DK_DIR must be specified.") - endif() - - set(Ninja_DIR ${GLOBAL_ROOT_DK_DIR}/ninja/${AMDNinja_FIND_VERSION} CACHE FILEPATH "Ninja Direction") - mark_as_advanced(Ninja_DIR) - set(Ninja_EXECUTABLE ${Ninja_DIR}/ninja.exe CACHE FILEPATH "Ninja Executable") - mark_as_advanced(Ninja_EXECUTABLE) - - message(STATUS "Ninja: ${Ninja_EXECUTABLE}") - - set(Ninja_FOUND ${Ninja_FOUND} CACHE STRING "Was Ninja found?") - mark_as_advanced(Ninja_FOUND) -endif() diff --git a/cmake/Modules/XglSetupAmdGlobalRoots.cmake b/cmake/Modules/XglSetupAmdGlobalRoots.cmake index fa7d9ab6..7f1718f1 100644 --- a/cmake/Modules/XglSetupAmdGlobalRoots.cmake +++ b/cmake/Modules/XglSetupAmdGlobalRoots.cmake @@ -23,18 +23,6 @@ # ####################################################################################################################### -# find_dk_root must be available -if(NOT DEFINED GLOBAL_ROOT_DK_DIR) - execute_process( - COMMAND find_dk_root - OUTPUT_VARIABLE GLOBAL_ROOT_DK_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - if(NOT ("${GLOBAL_ROOT_DK_DIR}" STREQUAL "")) - set(GLOBAL_ROOT_DK_DIR ${GLOBAL_ROOT_DK_DIR} CACHE PATH "Global root dk directory..") - endif() -endif() - if(NOT DEFINED GLOBAL_ROOT_SRC_DIR) if(EXISTS ${PROJECT_SOURCE_DIR}/../../drivers) get_filename_component(GLOBAL_ROOT_SRC_DIR ${PROJECT_SOURCE_DIR}/../.. 
ABSOLUTE) diff --git a/cmake/XglCompileDefinitions.cmake b/cmake/XglCompileDefinitions.cmake index 3fac14e5..45e6b4de 100644 --- a/cmake/XglCompileDefinitions.cmake +++ b/cmake/XglCompileDefinitions.cmake @@ -49,43 +49,23 @@ macro(xgl_set_compile_definitions) target_compile_definitions(xgl PRIVATE ICD_BUILD_LLPC) endif() - target_compile_definitions(xgl PRIVATE PAL_BUILD_GFX9=1) - - if(XGL_BUILD_NAVI12) - target_compile_definitions(xgl PRIVATE VKI_BUILD_NAVI12=1) - endif() - -#if VKI_BUILD_GFX11 - if(XGL_BUILD_GFX11) - target_compile_definitions(xgl PRIVATE VKI_BUILD_GFX11=1) +#if VKI_ENABLE_DEBUG_BARRIERS + if(VKI_ENABLE_DEBUG_BARRIERS) + target_compile_definitions(xgl PRIVATE VKI_ENABLE_DEBUG_BARRIERS) endif() #endif - -#if VKI_BUILD_NAVI31 - if(XGL_BUILD_NAVI31) - target_compile_definitions(xgl PRIVATE VKI_BUILD_NAVI31=1) +#if VKI_RUNTIME_APP_PROFILE + if(VKI_RUNTIME_APP_PROFILE) + target_compile_definitions(xgl PRIVATE VKI_RUNTIME_APP_PROFILE) endif() #endif - -#if VKI_BUILD_NAVI32 - if(XGL_BUILD_NAVI32) - target_compile_definitions(xgl PRIVATE VKI_BUILD_NAVI32=1) +#if VKI_DEVMODE_COMPILER_SETTINGS + if(VKI_DEVMODE_COMPILER_SETTINGS) + target_compile_definitions(xgl PRIVATE VKI_DEVMODE_COMPILER_SETTINGS) endif() #endif -#if VKI_BUILD_NAVI33 - if(XGL_BUILD_NAVI33) - target_compile_definitions(xgl PRIVATE VKI_BUILD_NAVI33=1) - endif() -#endif - - if(XGL_BUILD_PHOENIX1) - target_compile_definitions(xgl PRIVATE VKI_BUILD_PHOENIX1=1) - endif() - - if(XGL_BUILD_PHOENIX2) - target_compile_definitions(xgl PRIVATE VKI_BUILD_PHOENIX2=1) - endif() + target_compile_definitions(xgl PRIVATE PAL_BUILD_GFX9=1) #if VKI_BUILD_GFX115 if(XGL_BUILD_GFX115) @@ -99,18 +79,6 @@ macro(xgl_set_compile_definitions) endif() #endif - if(XGL_BUILD_REMBRANDT) - target_compile_definitions(xgl PRIVATE VKI_BUILD_REMBRANDT=1) - endif() - - if(XGL_BUILD_RAPHAEL) - target_compile_definitions(xgl PRIVATE VKI_BUILD_RAPHAEL=1) - endif() - - if(XGL_BUILD_MENDOCINO) - target_compile_definitions(xgl 
PRIVATE VKI_BUILD_MENDOCINO=1) - endif() - #if VKI_RAY_TRACING if (VKI_RAY_TRACING) target_compile_definitions(xgl PRIVATE VKI_RAY_TRACING=1) @@ -119,12 +87,6 @@ macro(xgl_set_compile_definitions) endif() #endif -#if VKI_KHR_DISPLAY - if(VKI_KHR_DISPLAY) - target_compile_definitions(xgl PRIVATE VKI_KHR_DISPLAY) - endif() -#endif - #if VKI_NORMALIZED_TRIG_FUNCTIONS if(VKI_NORMALIZED_TRIG_FUNCTIONS) target_compile_definitions(xgl PRIVATE VKI_NORMALIZED_TRIG_FUNCTIONS) diff --git a/cmake/XglOptions.cmake b/cmake/XglOptions.cmake index bac81e04..2cf1a891 100644 --- a/cmake/XglOptions.cmake +++ b/cmake/XglOptions.cmake @@ -29,46 +29,25 @@ macro(xgl_options) ### Cached Project Options ############################################################################################# - option(XGL_ENABLE_PRINTS_ASSERTS "Build with debug print enabled?" OFF) - - option(XGL_ENABLE_LTO "Build with LTO enabled?" ON) - - option(XGL_ENABLE_GCOV "Build with gcov source code coverage?" OFF) - - option(XGL_BUILD_GFX103 "Build vulkan for GFX103" ON) - - option(XGL_BUILD_NAVI12 "Build vulkan for Navi12" ON) - - option(XGL_BUILD_REMBRANDT "Build vulkan for REMBRANDT" ON) - - option(XGL_BUILD_RAPHAEL "Build vulkan for RAPHAEL" ON) - - option(XGL_BUILD_MENDOCINO "Build vulkan for MENDOCINO" ON) - -#if VKI_BUILD_GFX11 - option(XGL_BUILD_GFX11 "Build vulkan for GFX11" ON) +#if VKI_ENABLE_DEBUG_BARRIERS + option(VKI_ENABLE_DEBUG_BARRIERS "Build with debug barriers enabled?" OFF) #endif - -#if VKI_BUILD_NAVI31 - option(XGL_BUILD_NAVI31 "Build vulkan for Navi31" ON) +#if VKI_RUNTIME_APP_PROFILE + option(VKI_RUNTIME_APP_PROFILE "Build with runtime app profile?" OFF) #endif - -#if VKI_BUILD_NAVI32 - option(XGL_BUILD_NAVI32 "Build vulkan for Navi32" ON) +#if VKI_DEVMODE_COMPILER_SETTINGS + option(VKI_DEVMODE_COMPILER_SETTINGS "Build with devmode compiler settings?" 
OFF) #endif -#if VKI_BUILD_NAVI33 - option(XGL_BUILD_NAVI33 "Build vulkan for Navi33" ON) -#endif + option(XGL_ENABLE_PRINTS_ASSERTS "Build with debug print enabled?" OFF) - option(XGL_BUILD_PHOENIX1 "Build vulkan for PHOENIX1" ON) + option(XGL_ENABLE_LTO "Build with LTO enabled?" ON) - option(XGL_BUILD_PHOENIX2 "Build vulkan for PHOENIX2" ON) + option(XGL_ENABLE_GCOV "Build with gcov source code coverage?" OFF) #if VKI_BUILD_GFX115 option(XGL_BUILD_GFX115 "Build vulkan for GFX115" ON) #endif - #if VKI_BUILD_STRIX1 option(XGL_BUILD_STRIX1 "Build vulkan for STRIX1" ON) #endif @@ -85,7 +64,7 @@ macro(xgl_options) option(VKI_GPU_DECOMPRESS "Build vulkan with GPU_DECOMPRESS" ON) #endif - option(ICD_BUILD_LLPC "Build LLPC?" ON) + option(ICD_BUILD_LLPC "Build LLPC?" ON) option(XGL_LLVM_UPSTREAM "Build with upstreamed LLVM?" OFF) diff --git a/cmake/XglOverrides.cmake b/cmake/XglOverrides.cmake index b856086e..0f1ab516 100644 --- a/cmake/XglOverrides.cmake +++ b/cmake/XglOverrides.cmake @@ -103,11 +103,8 @@ macro(xgl_overrides_pal) set(PAL_BUILD_GPUOPEN ${ICD_GPUOPEN_DEVMODE_BUILD} CACHE BOOL "${PROJECT_NAME} override." FORCE) - if(XGL_BUILD_NAVI31 OR XGL_BUILD_NAVI32 OR XGL_BUILD_NAVI33 OR XGL_BUILD_PHOENIX1) - set(PAL_BUILD_GFX11 1 CACHE BOOL "${PROJECT_NAME} override." FORCE) - endif() - - set(PAL_BUILD_PHOENIX2 ${XGL_BUILD_PHOENIX2} CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(PAL_BUILD_GFX11 ON CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(PAL_BUILD_PHOENIX2 ON CACHE BOOL "${PROJECT_NAME} override." FORCE) #if VKI_BUILD_GFX115 set(PAL_BUILD_GFX115 ${XGL_BUILD_GFX115} CACHE BOOL "${PROJECT_NAME} override." FORCE) @@ -141,33 +138,16 @@ macro(xgl_overrides_vkgc) set(LLPC_BUILD_TESTS ${XGL_BUILD_TESTS} CACHE BOOL "${PROJECT_NAME} override." FORCE) - set(LLPC_BUILD_NAVI12 ${XGL_BUILD_NAVI12} CACHE BOOL "${PROJECT_NAME} override." FORCE) - - set(LLPC_BUILD_REMBRANDT ${XGL_BUILD_REMBRANDT} CACHE BOOL "${PROJECT_NAME} override." 
FORCE) - - set(LLPC_BUILD_RAPHAEL ${XGL_BUILD_RAPHAEL} CACHE BOOL "${PROJECT_NAME} override." FORCE) - - set(LLPC_BUILD_MENDOCINO ${XGL_BUILD_MENDOCINO} CACHE BOOL "${PROJECT_NAME} override." FORCE) - -#if VKI_BUILD_GFX11 - set(LLPC_BUILD_GFX11 ${XGL_BUILD_GFX11} CACHE BOOL "${PROJECT_NAME} override." FORCE) -#endif - -#if VKI_BUILD_NAVI31 - set(LLPC_BUILD_NAVI31 ${XGL_BUILD_NAVI31} CACHE BOOL "${PROJECT_NAME} override." FORCE) -#endif - -#if VKI_BUILD_NAVI32 - set(LLPC_BUILD_NAVI32 ${XGL_BUILD_NAVI32} CACHE BOOL "${PROJECT_NAME} override." FORCE) -#endif - -#if VKI_BUILD_NAVI33 - set(LLPC_BUILD_NAVI33 ${XGL_BUILD_NAVI33} CACHE BOOL "${PROJECT_NAME} override." FORCE) -#endif - - set(LLPC_BUILD_PHOENIX1 ${XGL_BUILD_PHOENIX1} CACHE BOOL "${PROJECT_NAME} override." FORCE) - - set(LLPC_BUILD_PHOENIX2 ${XGL_BUILD_PHOENIX2} CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_NAVI12 ON CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_REMBRANDT ON CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_RAPHAEL ON CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_MENDOCINO ON CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_GFX11 ON CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_NAVI31 ON CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_NAVI32 ON CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_NAVI33 ON CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_PHOENIX1 ON CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_PHOENIX2 ON CACHE BOOL "${PROJECT_NAME} override." FORCE) #if VKI_BUILD_GFX115 set(LLPC_BUILD_GFX115 ${XGL_BUILD_GFX115} CACHE BOOL "${PROJECT_NAME} override." FORCE) @@ -187,6 +167,14 @@ macro(xgl_overrides) set(GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION ${ICD_GPUOPEN_CLIENT_MAJOR_VERSION}) endif() +#if VKI_BUILD_GFX115 +#if VKI_BUILD_STRIX1 + if(XGL_BUILD_STRIX1) + set(XGL_BUILD_GFX115 ON CACHE BOOL "XGL_BUILD_GFX115 override." 
FORCE) + endif() +#endif +#endif + xgl_get_path() if(XGL_BUILD_TESTS) diff --git a/cmake/XglVersions.cmake b/cmake/XglVersions.cmake index 7b95dbdd..e0f16371 100644 --- a/cmake/XglVersions.cmake +++ b/cmake/XglVersions.cmake @@ -28,7 +28,7 @@ include_guard() # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. # It must be updated on each PAL promotion after handling all of the interface changes described in palLib.h. -set(ICD_PAL_CLIENT_MAJOR_VERSION "888") +set(ICD_PAL_CLIENT_MAJOR_VERSION "892") # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. # It describes the interface version of the gpuopen shared module (part of PAL) that the ICD supports. @@ -37,7 +37,7 @@ set(ICD_GPUOPEN_CLIENT_MAJOR_VERSION "42") #if VKI_RAY_TRACING # This will become the value of GPURT_CLIENT_INTERFACE_MAJOR_VERSION if VKI_RAY_TRACING=1. # It describes the interface version of the GpuRT shared module that the ICD supports. -set(ICD_GPURT_CLIENT_MAJOR_VERSION "47") +set(ICD_GPURT_CLIENT_MAJOR_VERSION "48") #endif # This will become the value of LLPC_CLIENT_INTERFACE_MAJOR_VERSION if ICD_BUILD_LLPC=1. diff --git a/icd/CMakeLists.txt b/icd/CMakeLists.txt index 52531fd9..00baae1d 100644 --- a/icd/CMakeLists.txt +++ b/icd/CMakeLists.txt @@ -135,6 +135,7 @@ target_sources(xgl PRIVATE api/vk_instance.cpp api/vk_memory.cpp api/vk_pipeline.cpp + api/vk_pipeline_binary.cpp api/vk_pipeline_layout.cpp api/vk_pipeline_cache.cpp api/vk_private_data_slot.cpp @@ -189,15 +190,6 @@ if(ICD_BUILD_LLPC) ) endif() -# vk_utils.cpp uses the __DATE__ and __TIME__ macros to generate a pipelineCacheUUID. The following -# rule forces vk_utils.cpp to be re-compiled on every build, so that an up-to-date time/date -# is always used regardless of which files were touched since the last build. 
-add_custom_command( - TARGET xgl PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_SOURCE_DIR}/api/vk_utils.cpp - COMMENT "Touching vk_utils.cpp" -) - ### ICD Auto-generated Shader Profiles Files ################################## # ICD_GENDIR Path to the code generation tools set(ICD_GENDIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/generate) diff --git a/icd/Loader/LunarG/Lnx/amd-icd.json b/icd/Loader/LunarG/Lnx/amd-icd.json index ba102a99..20285065 100644 --- a/icd/Loader/LunarG/Lnx/amd-icd.json +++ b/icd/Loader/LunarG/Lnx/amd-icd.json @@ -2,13 +2,13 @@ "file_format_version": "1.0.0", "ICD": { "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.293" + "api_version": "1.3.295" }, "layer": { "name": "VK_LAYER_AMD_switchable_graphics_@ISABITS@", "type": "GLOBAL", "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.293", + "api_version": "1.3.295", "implementation_version": "1", "description": "AMD switchable graphics layer", "functions": { diff --git a/icd/api/app_profile.cpp b/icd/api/app_profile.cpp index 27e608b3..7ab4adb7 100644 --- a/icd/api/app_profile.cpp +++ b/icd/api/app_profile.cpp @@ -792,6 +792,12 @@ constexpr AppProfilePatternEntry AppEngineXenon = "xenonengine" }; +constexpr AppProfilePatternEntry AppNameHoudini = +{ + PatternAppNameLower, + "houdini" +}; + // Section END of AppProfilePatternEntry for all games // This is a table of patterns. The first matching pattern in this table will be returned. 
@@ -1397,14 +1403,6 @@ AppProfilePattern AppPatternTable[] = } }, - { - AppProfile::Source2Engine, - { - AppEngineSource2, - PatternEnd - } - }, - { AppProfile::DxvkGodOfWar, { @@ -1615,6 +1613,14 @@ AppProfilePattern AppPatternTable[] = } }, + { + AppProfile::Source2Engine, + { + AppEngineSource2, + PatternEnd + } + }, + { AppProfile::WindowKill, { @@ -1631,7 +1637,16 @@ AppProfilePattern AppPatternTable[] = AppEngineXenon, PatternEnd } - } + }, + + { + AppProfile::Houdini, + { + AppNameHoudini, + PatternEnd + } + }, + }; static char* GetExecutableName(size_t* pLength, bool includeExtension = false); diff --git a/icd/api/app_resource_optimizer.cpp b/icd/api/app_resource_optimizer.cpp index 5118967b..dd4a8644 100644 --- a/icd/api/app_resource_optimizer.cpp +++ b/icd/api/app_resource_optimizer.cpp @@ -63,7 +63,7 @@ void ResourceOptimizer::Init() BuildTuningProfile(); -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE BuildRuntimeProfile(); #endif @@ -127,7 +127,7 @@ void ResourceOptimizer::OverrideImageCreateInfo( ApplyProfileToImageCreateInfo(m_tuningProfile, resourceKey, pCreateInfo); -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE ApplyProfileToImageCreateInfo(m_runtimeProfile, resourceKey, pCreateInfo); #endif @@ -141,7 +141,7 @@ void ResourceOptimizer::OverrideImageViewCreateInfo( ApplyProfileToImageViewCreateInfo(m_tuningProfile, resourceKey, pPalViewInfo); -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE ApplyProfileToImageViewCreateInfo(m_runtimeProfile, resourceKey, pPalViewInfo); #endif } @@ -457,7 +457,7 @@ void ResourceOptimizer::BuildAppProfile() } } -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE void ResourceOptimizer::BuildRuntimeProfile() { memset(&m_runtimeProfile, 0, sizeof(m_runtimeProfile)); diff --git a/icd/api/app_shader_optimizer.cpp b/icd/api/app_shader_optimizer.cpp index 6595e97a..aca59ef2 100644 --- a/icd/api/app_shader_optimizer.cpp +++ b/icd/api/app_shader_optimizer.cpp @@ -39,7 +39,7 @@ #include 
"palDbgPrint.h" #include "palFile.h" -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE #include "utils/json_reader.h" #endif @@ -68,7 +68,7 @@ void ShaderOptimizer::Init() m_appShaderProfile.PipelineProfileToJson(m_tuningProfile, m_settings.pipelineProfileDumpFile); } -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE BuildRuntimeProfile(); #endif } @@ -136,7 +136,7 @@ bool ShaderOptimizer::HasMatchingProfileEntry( foundMatch = HasMatchingProfileEntry(m_tuningProfile, pipelineKey); } -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE if (foundMatch == false) { foundMatch = HasMatchingProfileEntry(m_runtimeProfile, pipelineKey); @@ -192,7 +192,7 @@ void ShaderOptimizer::CalculateMatchingProfileEntriesHash( { CalculateMatchingProfileEntriesHash(m_appProfile, pipelineKey, pHasher); CalculateMatchingProfileEntriesHash(m_tuningProfile, pipelineKey, pHasher); -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE CalculateMatchingProfileEntriesHash(m_runtimeProfile, pipelineKey, pHasher); #endif } @@ -393,7 +393,7 @@ void ShaderOptimizer::OverrideShaderCreateInfo( ApplyProfileToShaderCreateInfo(m_tuningProfile, pipelineKey, shaderIndex, options); -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE ApplyProfileToShaderCreateInfo(m_runtimeProfile, pipelineKey, shaderIndex, options); #endif } @@ -534,7 +534,7 @@ void ShaderOptimizer::OverrideGraphicsPipelineCreateInfo( ApplyProfileToGraphicsPipelineCreateInfo( m_tuningProfile, pipelineKey, shaderStages, pPalCreateInfo, pGraphicsShaderInfos); -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE ApplyProfileToGraphicsPipelineCreateInfo( m_runtimeProfile, pipelineKey, shaderStages, pPalCreateInfo, pGraphicsShaderInfos); #endif @@ -549,7 +549,7 @@ void ShaderOptimizer::OverrideComputePipelineCreateInfo( ApplyProfileToComputePipelineCreateInfo(m_tuningProfile, pipelineKey, pDynamicCompueShaderInfo); -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE 
ApplyProfileToComputePipelineCreateInfo(m_runtimeProfile, pipelineKey, pDynamicCompueShaderInfo); #endif } @@ -567,7 +567,7 @@ ShaderOptimizer::~ShaderOptimizer() { pAllocCB->pfnFree(pAllocCB->pUserData, m_tuningProfile.pEntries); } -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE if (m_runtimeProfile.pEntries != nullptr) { pAllocCB->pfnFree(pAllocCB->pUserData, m_runtimeProfile.pEntries); @@ -1182,6 +1182,20 @@ void ShaderOptimizer::BuildAppProfileLlpc() m_appShaderProfile.BuildAppProfileLlpc(appProfile, gfxIpLevel, asicRevision, &m_appProfile); + if ((appProfile == AppProfile::MadMax) || + (appProfile == AppProfile::SedpEngine) || + (appProfile == AppProfile::ThronesOfBritannia)) + { + i = m_appProfile.entryCount++; + PipelineProfileEntry *pEntry = &m_appProfile.pEntries[i]; + pEntry->pattern.match.always = true; + for (uint32_t stage = 0; stage < ShaderStageCount; ++stage) + { + pEntry->action.shaders[stage].shaderCreate.apply.useSiScheduler = true; + pEntry->action.shaders[stage].shaderCreate.tuningOptions.useSiScheduler = true; + } + } + if (appProfile == AppProfile::ShadowOfTheTombRaider) { i = m_appProfile.entryCount++; @@ -1224,6 +1238,14 @@ void ShaderOptimizer::BuildAppProfileLlpc() pEntry->action.shaders[ShaderStage::ShaderStageCompute].shaderCreate.apply.workaroundStorageImageFormats = true; } + if (appProfile == AppProfile::Houdini) + { + i = m_appProfile.entryCount++; + PipelineProfileEntry *pEntry = &m_appProfile.pEntries[i]; + pEntry->pattern.match.always = true; + pEntry->action.shaders[ShaderStage::ShaderStageCompute].shaderCreate.apply.workaroundStorageImageFormats = true; + } + if (appProfile == AppProfile::ELEX2) { i = m_appProfile.entryCount++; @@ -1250,7 +1272,7 @@ void ShaderOptimizer::PrintProfileEntryMatch( { pProfile = "Application"; } -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE else if (&profile == &m_runtimeProfile) { pProfile = "Runtime"; @@ -1310,7 +1332,7 @@ void ShaderOptimizer::PrintProfileEntryMatch( } 
#endif -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE // ===================================================================================================================== void ShaderOptimizer::RuntimeProfileParseError() { diff --git a/icd/api/appopt/shader_profiles/llpc/generic/Talos/profile.json b/icd/api/appopt/shader_profiles/llpc/generic/Talos/profile.json index aeefe6c8..4f10b94c 100644 --- a/icd/api/appopt/shader_profiles/llpc/generic/Talos/profile.json +++ b/icd/api/appopt/shader_profiles/llpc/generic/Talos/profile.json @@ -77,7 +77,8 @@ }, "action": { "ps": { - "useSiScheduler": true + "useSiScheduler": true, + "vgprLimit": 48 } } }, diff --git a/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi33/RainbowSixExtraction/profile.json b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi33/RainbowSixExtraction/profile.json new file mode 100644 index 00000000..8e4586c2 --- /dev/null +++ b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi33/RainbowSixExtraction/profile.json @@ -0,0 +1,54 @@ +{ + "entries": [ + { + "pattern": { + "ps": { + "codeHash": "0xdf44ae88f263605d 6d21f3936125b78b" + } + }, + "action": { + "ps": { + "disableLoopUnrolls": true, + "waveSize": 64 + } + } + }, + { + "pattern": { + "ps": { + "codeHash": "0x4dab409a1cee9aee 2b8c1d18f83bc11d" + } + }, + "action": { + "ps": { + "waveSize": 32 + } + } + }, + { + "pattern": { + "ps": { + "codeHash": "0x5df94d2774fadfee 941f5d1b215994fc" + } + }, + "action": { + "ps": { + "allowReZ": 1, + "waveSize": 64 + } + } + }, + { + "pattern": { + "ps": { + "codeHash": "0x295d1b6cd2aff9c4 9a96d6ffac16b5e9" + } + }, + "action": { + "ps": { + "aggressiveInvariantLoads": "EnableOptimization" + } + } + } + ] +} \ No newline at end of file diff --git a/icd/api/appopt/split_raytracing_layer.cpp b/icd/api/appopt/split_raytracing_layer.cpp index 0578124a..d3355922 100644 --- a/icd/api/appopt/split_raytracing_layer.cpp +++ b/icd/api/appopt/split_raytracing_layer.cpp @@ -50,63 +50,92 @@ void 
SplitRaytracingLayer::TraceRaysDispatchPerDevice( const RuntimeSettings& settings = pCmdBuffer->VkDevice()->GetRuntimeSettings(); const RayTracingPipeline* pPipeline = pCmdBuffer->RenderState()->pRayTracingPipeline; + const Pal::DispatchDims traceSize = + { + .x = width, + .y = height, + .z = depth + }; + const uint32_t splitX = settings.rtDispatchSplitX; const uint32_t splitY = settings.rtDispatchSplitY; const uint32_t splitZ = settings.rtDispatchSplitZ; - const uint32_t blockW = (width + splitX - 1) / splitX; - const uint32_t blockH = (height + splitY - 1) / splitY; - const uint32_t blockD = (depth + splitZ - 1) / splitZ; - - uint32_t dispatchSizeX = 0; - uint32_t dispatchSizeY = 0; - uint32_t dispatchSizeZ = 0; + const Pal::DispatchDims blockSize = + { + .x = (traceSize.x + splitX - 1) / splitX, + .y = (traceSize.y + splitY - 1) / splitY, + .z = (traceSize.z + splitZ - 1) / splitZ + }; - pPipeline->GetDispatchSize(&dispatchSizeX, &dispatchSizeY, &dispatchSizeZ, blockW, blockH, blockD); + const Pal::DispatchDims blockDispatchSize = pPipeline->GetDispatchSize(blockSize); - for (uint32_t z = 0; z < splitZ; z++) - { - uint32_t zOffset = z * blockD; - for (uint32_t x = 0; x < splitX; x++) + // Lambda function used to help dispatch. + auto dispatch = [pCmdBuffer, deviceIdx](Pal::DispatchDims offset, Pal::DispatchDims size) + { + pCmdBuffer->PalCmdBuffer(deviceIdx)->CmdDispatchOffset( + offset, + size, + size); + + // To avoid TDR, the large dispatch is split into mulitple smaller sub-dispatches. However, + // when a MCBP event arrives, PFP may have already processed all dispatch commands, so mulitple + // smaller sub-dispatches cannot be interrupted by MCBP in this case. + // The Barrier below is used to stall the PFP and allow MCBP to happen between dispatches. 
+ Pal::BarrierTransition transition = {}; + transition.srcCacheMask = Pal::CoherShaderRead; + transition.dstCacheMask = Pal::CoherShaderRead; + const Pal::HwPipePoint postCs = Pal::HwPipePostCs; + Pal::BarrierInfo barrierInfo = {}; + barrierInfo.pipePointWaitCount = 1; + barrierInfo.pPipePoints = &postCs; + barrierInfo.waitPoint = Pal::HwPipeTop; + pCmdBuffer->PalCmdBuffer(deviceIdx)->CmdBarrier(barrierInfo); + }; + + // Lambda function used to help splitting. + auto split = [](uint32_t size, uint32_t incSize, auto&& fun) { - uint32_t xOffset = x * blockW; - for (uint32_t y = 0; y < splitY; y++) + uint32_t i = 0; + for (; i <= size - incSize; i += incSize) { - uint32_t yOffset = y * blockH; - - uint32_t dispatchOffsetX = 0; - uint32_t dispatchOffsetY = 0; - uint32_t dispatchOffsetZ = 0; - - pPipeline->GetDispatchSize(&dispatchOffsetX, - &dispatchOffsetY, - &dispatchOffsetZ, - xOffset, - yOffset, - zOffset); - - pCmdBuffer->PalCmdBuffer(deviceIdx)->CmdDispatchOffset( - { dispatchOffsetX, dispatchOffsetY, dispatchOffsetZ }, - { dispatchSizeX, dispatchSizeY, dispatchSizeZ }, - { dispatchSizeX, dispatchSizeY, dispatchSizeZ }); - - // To avoid TDR, the large dispatch is split into mulitple smaller sub-dispatches. However, - // when a MCBP event arrives, PFP may have already processed all dispatch commands, so mulitple - // smaller sub-dispatches cannot be interrupted by MCBP in this case. - // The Barrier below is used to stall the PFP and allow MCBP to happen between dispatches. 
- Pal::BarrierTransition transition = {}; - transition.srcCacheMask = Pal::CoherShaderRead; - transition.dstCacheMask = Pal::CoherShaderRead; - const Pal::HwPipePoint postCs = Pal::HwPipePostCs; - Pal::BarrierInfo barrierInfo = {}; - barrierInfo.pipePointWaitCount = 1; - barrierInfo.pPipePoints = &postCs; - barrierInfo.waitPoint = Pal::HwPipeTop; - pCmdBuffer->PalCmdBuffer(deviceIdx)->CmdBarrier(barrierInfo); - + fun(i, incSize); + } + if (i < size) + { + fun(i, size - i); } + }; + + // Split Z axis. + split(traceSize.z, blockDispatchSize.z, + [split, traceSize, blockDispatchSize, &dispatch](uint32_t offsetZ, uint32_t sizeZ) + { + // Split Y axis. + split(traceSize.y, blockDispatchSize.y, + [split, traceSize, blockDispatchSize, &dispatch, offsetZ, sizeZ](uint32_t offsetY, uint32_t sizeY) + { + //Split X axis. + split(traceSize.x, blockDispatchSize.x, + [&dispatch, offsetZ, sizeZ, offsetY, sizeY](uint32_t offsetX, uint32_t sizeX) + { + Pal::DispatchDims offset = + { + .x = offsetX, + .y = offsetY, + .z = offsetZ + }; + Pal::DispatchDims size = + { + .x = sizeX, + .y = sizeY, + .z = sizeZ + }; + dispatch(offset, size); + }); + }); } - } + ); } // ===================================================================================================================== diff --git a/icd/api/compiler_solution.cpp b/icd/api/compiler_solution.cpp index eb43bcdd..84f8929c 100644 --- a/icd/api/compiler_solution.cpp +++ b/icd/api/compiler_solution.cpp @@ -369,4 +369,29 @@ uint32_t CompilerSolution::GetRayTracingVgprLimit( } #endif +bool CompilerSolution::ClonePipelineBinary( + const Vkgc::BinaryData* pProvidedBinary, + Vkgc::BinaryData* pNewBinary) +{ + bool success = false; + + // Create memory, to be freed later, just as StoreShaderBinaryToCache does. The VkInstance allocation callbacks + // are used here for consistency with the PipelineBinaryCache that backs the PipelineCache. 
+ void* pBinaryData = m_pPhysicalDevice->Manager()->VkInstance()->AllocMem( + pProvidedBinary->codeSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (pBinaryData != nullptr) + { + memcpy(pBinaryData, pProvidedBinary->pCode, pProvidedBinary->codeSize); + + pNewBinary->pCode = pBinaryData; + pNewBinary->codeSize = pProvidedBinary->codeSize; + success = true; + } + + return success; +} + } diff --git a/icd/api/compiler_solution_llpc.cpp b/icd/api/compiler_solution_llpc.cpp index c9928c39..a5707b60 100644 --- a/icd/api/compiler_solution_llpc.cpp +++ b/icd/api/compiler_solution_llpc.cpp @@ -131,8 +131,7 @@ void CompilerSolutionLlpc::Destroy() // Builds shader module from SPIR-V binary code. VkResult CompilerSolutionLlpc::BuildShaderModule( const Device* pDevice, - VkShaderModuleCreateFlags flags, - VkShaderModuleCreateFlags internalShaderFlags, + ShaderModuleFlags flags, const Vkgc::BinaryData& shaderBinary, ShaderModuleHandle* pShaderModule, const PipelineOptimizerKey& profileKey) @@ -155,7 +154,7 @@ VkResult CompilerSolutionLlpc::BuildShaderModule( ); #if VKI_RAY_TRACING - if ((internalShaderFlags & VK_INTERNAL_SHADER_FLAGS_RAY_TRACING_INTERNAL_SHADER_BIT) != 0) + if ((flags & ShaderModuleInternalRayTracingShader) != 0) { moduleInfo.options.pipelineOptions.internalRtShaders = true; } @@ -442,6 +441,8 @@ VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary( PipelineCache* pPipelineCache, GraphicsLibraryType gplType, GraphicsPipelineBinaryCreateInfo* pCreateInfo, + const Vkgc::BinaryData* pProvidedBinary, + const Util::MetroHash::Hash* pProvidedBinaryHash, void* pPipelineDumpHandle, GplModuleState* pModuleState) { @@ -458,6 +459,8 @@ VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary( int64_t startTime = Util::GetPerfCpuTime(); bool binaryProvided = false; + binaryProvided = (pProvidedBinary != nullptr) && (pProvidedBinary->codeSize > 0); + if (binaryProvided == false) { Util::MetroHash128 hasher; @@ -466,6 +469,10 @@ VkResult 
CompilerSolutionLlpc::CreateGraphicsShaderBinary( hasher.Update(m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash()); hasher.Finalize(cacheId.bytes); } + else + { + cacheId = *pProvidedBinaryHash; + } Vkgc::BinaryData finalBinary = {}; if ((pDevice->GetRuntimeSettings().shaderReplaceMode == ShaderReplacePipelineBinaryHash) || @@ -502,6 +509,18 @@ VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary( { LoadShaderBinaryFromCache(pPipelineCache, &cacheId, &shaderLibraryBinary, &hitCache, &hitAppCache); } + else + { + if (ClonePipelineBinary(pProvidedBinary, &shaderLibraryBinary)) + { + hitCache = true; + hitAppCache = true; + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } if (pPipelineCache != nullptr) { @@ -981,7 +1000,6 @@ VkResult CompilerSolutionLlpc::CreateLlpcCompiler( const uint32_t MaxLlpcOptions = 32; Llpc::ICompiler* pCompiler = nullptr; const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); - AppProfile appProfile = m_pPhysicalDevice->GetAppProfile(); // Get the executable name and path char executableNameBuffer[PATH_MAX]; char* pExecutablePtr; @@ -1046,26 +1064,9 @@ VkResult CompilerSolutionLlpc::CreateLlpcCompiler( // NOTE: For testing consistency, these options should be kept the same as those of // "amdllpc" (Init()). - // WARNING: Do not conditionally add options based on GFXIP version as these will - // break support for systems with a mixture of ASICs. GFXIP dependent options - // should be subtarget features or handled in LLVM backend. 
- - if ((appProfile == AppProfile::SeriousSamFusion) || - (appProfile == AppProfile::Talos)) - { - llpcOptions[numOptions++] = "-unroll-partial-threshold=700"; - } - - ShaderCacheMode shaderCacheMode = settings.shaderCacheMode; - if ((appProfile == AppProfile::MadMax) || - (appProfile == AppProfile::SedpEngine) || - (appProfile == AppProfile::ThronesOfBritannia)) - { - llpcOptions[numOptions++] = "-enable-si-scheduler"; - // si-scheduler interacts badly with SIFormMemoryClauses pass, so - // disable the effect of that pass by limiting clause length to 1. - llpcOptions[numOptions++] = "-amdgpu-max-memory-clause=1"; - } + // WARNING: Do not conditionally add options! + // GFXIP or AppProfile dependent options should be set via pipeline options structure in LLPC + // or subtarget features handled in LLVM backend. #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 66 optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-executable-name=%s", pExecutablePtr); @@ -1081,6 +1082,7 @@ VkResult CompilerSolutionLlpc::CreateLlpcCompiler( pOptionBuffer += optionLength; bufSize -= optionLength; + ShaderCacheMode shaderCacheMode = settings.shaderCacheMode; optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-cache-mode=%d", shaderCacheMode); ++optionLength; llpcOptions[numOptions++] = pOptionBuffer; diff --git a/icd/api/debug_printf.cpp b/icd/api/debug_printf.cpp index 866ce8fa..2976d8cb 100644 --- a/icd/api/debug_printf.cpp +++ b/icd/api/debug_printf.cpp @@ -114,8 +114,8 @@ void DebugPrintf::BindPipeline( { m_pPipeline = pPipeline; - const size_t bufferSrdSize = - pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties().gfxipProperties.srdSizes.bufferView; + const size_t bufferSrdSize = pDevice->VkPhysicalDevice(DefaultDeviceIndex)-> + PalProperties().gfxipProperties.srdSizes.untypedBufferView; void* pTable = pCmdBuffer->CmdAllocateEmbeddedData( bufferSrdSize, bufferSrdSize, &tableVa); @@ -142,6 +142,7 @@ void DebugPrintf::BindPipeline( pSubSections->Reserve(1); 
ParseFormatStringsToSubSection(it.Get()->value.printStr, pSubSections); } + constexpr VkSemaphoreTypeCreateInfo semaphoreTypeInfo { .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, @@ -174,7 +175,7 @@ void DebugPrintf::Init( const Device* pDevice) { const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - if ((settings.enableDebugPrintf) && (m_state == Uninitialized)) + if ((pDevice->GetEnabledFeatures().enableDebugPrintf) && (m_state == Uninitialized)) { m_state = Enabled; m_pPipeline = nullptr; @@ -280,6 +281,7 @@ uint64_t DebugPrintf::ProcessDebugPrintfBuffer( outputDecodedSpecifiers.Reserve(5); // Set pPtr point to the head of the system memory pPtr = pPrintBuffer; + while ((bufferSize - decodeOffset) > 1) { // Decode entry @@ -329,11 +331,16 @@ uint64_t DebugPrintf::ProcessDebugPrintfBuffer( varIndex, &outputDecodedSpecifiers[varIndex]); } + OutputBufferString(formatString, *pSubSections, &outputBufferStr); + decodeOffset += outputsInDwords; } + WriteToFile(pFile, outputBufferStr); + pDevice->VkInstance()->FreeMem(pPrintBuffer); + m_frame++; } } @@ -739,6 +746,7 @@ void DebugPrintf::DecodeFormatStringsFromElf( } } } + bool found = true; PrintfElfString* pElfString = nullptr; result = pFormatStrings->FindAllocate(hashValue, &found, &pElfString); @@ -770,3 +778,4 @@ void DebugPrintf::DecodeFormatStringsFromElf( } } } + diff --git a/icd/api/graphics_pipeline_common.cpp b/icd/api/graphics_pipeline_common.cpp index cfe52317..cf510311 100644 --- a/icd/api/graphics_pipeline_common.cpp +++ b/icd/api/graphics_pipeline_common.cpp @@ -379,18 +379,6 @@ static VkFormat GetDepthFormat( return format; } -// ===================================================================================================================== -static uint32_t GetColorAttachmentCount( - const RenderPass* pRenderPass, - const uint32_t subpassIndex, - const VkPipelineRenderingCreateInfo* pPipelineRenderingCreateInfo -) -{ - return (pRenderPass != nullptr) ? 
pRenderPass->GetSubpassColorReferenceCount(subpassIndex) : - (pPipelineRenderingCreateInfo != nullptr) ? pPipelineRenderingCreateInfo->colorAttachmentCount : - 0u; -} - // ===================================================================================================================== static VkShaderStageFlagBits GetLibraryActiveShaderStages( const VkGraphicsPipelineLibraryFlagsEXT libFlags) @@ -1518,7 +1506,7 @@ static void BuildMultisampleState( pInfo->immedInfo.msaaCreateInfo.shaderExportMaskSamples = subpassCoverageSampleCount; pInfo->immedInfo.msaaCreateInfo.sampleMask = (pMs->pSampleMask != nullptr) ? pMs->pSampleMask[0] - : 0xffffffff; + : 0xffff; pInfo->immedInfo.msaaCreateInfo.sampleClusters = subpassCoverageSampleCount; pInfo->immedInfo.msaaCreateInfo.alphaToCoverageSamples = subpassCoverageSampleCount; pInfo->immedInfo.msaaCreateInfo.occlusionQuerySamples = subpassDepthSampleCount; @@ -1705,7 +1693,8 @@ static void BuildColorBlendState( pInfo->staticStateMask |= 1ULL << static_cast(DynamicStatesInternal::LogicOpEnable); } - if (GetColorAttachmentCount(pRenderPass, subpass, pRendering) != 0) + const uint32 numColorTargets = GraphicsPipelineCommon::GetColorAttachmentCount(pRenderPass, subpass, pRendering); + if (numColorTargets != 0) { if (pCb != nullptr) { @@ -1713,11 +1702,18 @@ static void BuildColorBlendState( pInfo->immedInfo.logicOpEnable = pCb->logicOpEnable; } - uint32_t numColorTargets = 0; + bool useBlendAttachments = false; const VkPipelineColorWriteCreateInfoEXT* pColorWriteCreateInfo = nullptr; if (pCb != nullptr) { - numColorTargets = Min(pCb->attachmentCount, Pal::MaxColorTargets); + // If the pipeline is created with these 3 states as dynamic, the attachmentCount from the + // VkPipelineColorBlendStateCreateInfo is ignored. 
+ if ((IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorBlendEnable) == false) || + (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorBlendEquation) == false) || + (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorWriteMask) == false)) + { + useBlendAttachments = true; + } const void* pNext = static_cast(pCb->pNext); @@ -1746,11 +1742,6 @@ static void BuildColorBlendState( } } - if (pRendering != nullptr) - { - numColorTargets = Min(pRendering->colorAttachmentCount, Pal::MaxColorTargets); - } - pInfo->immedInfo.colorWriteEnable = 0; pInfo->immedInfo.colorWriteMask = 0; @@ -1788,8 +1779,13 @@ static void BuildColorBlendState( // disable shader writes through that target. if (pCbDst->swizzledFormat.format != Pal::ChNumFormat::Undefined) { - const VkPipelineColorBlendAttachmentState* pSrc = - (pCb != nullptr) ? &pCb->pAttachments[i] : nullptr; + const VkPipelineColorBlendAttachmentState* pSrc = nullptr; + + if (useBlendAttachments && (i < pCb->attachmentCount)) + { + pSrc = &pCb->pAttachments[i]; + } + VkColorComponentFlags colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | @@ -1945,14 +1941,15 @@ static void BuildPreRasterizationShaderState( #endif GraphicsPipelineObjectCreateInfo* pInfo) { - if (pIn->pTessellationState != nullptr) + // Set patch control points only if tessellation shader is enabled. 
+ pInfo->immedInfo.inputAssemblyState.patchControlPoints = 0; + if (pInfo->activeStages & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) { - pInfo->immedInfo.inputAssemblyState.patchControlPoints = static_cast( - pIn->pTessellationState->patchControlPoints); - } - else - { - pInfo->immedInfo.inputAssemblyState.patchControlPoints = 0; + if (pIn->pTessellationState != nullptr) + { + pInfo->immedInfo.inputAssemblyState.patchControlPoints = static_cast( + pIn->pTessellationState->patchControlPoints); + } } // Build states via VkPipelineRasterizationStateCreateInfo @@ -3240,4 +3237,15 @@ void GraphicsPipelineCommon::HandleExtensionStructs( } } +// ===================================================================================================================== +uint32_t GraphicsPipelineCommon::GetColorAttachmentCount( + const RenderPass* pRenderPass, + const uint32_t subpassIndex, + const VkPipelineRenderingCreateInfo* pPipelineRenderingCreateInfo) +{ + return (pRenderPass != nullptr) ? pRenderPass->GetSubpassColorReferenceCount(subpassIndex) : + (pPipelineRenderingCreateInfo != nullptr) ? 
pPipelineRenderingCreateInfo->colorAttachmentCount : + 0u; +} + } diff --git a/icd/api/icd_main.cpp b/icd/api/icd_main.cpp index 5f42745a..6b845c21 100644 --- a/icd/api/icd_main.cpp +++ b/icd/api/icd_main.cpp @@ -29,6 +29,8 @@ ************************************************************************************************************************ */ +#include + #if defined(__unix__) & (__GNUC__ == 5) #include @@ -73,3 +75,10 @@ namespace std { } #endif + +extern "C" unsigned int GetSettingsBlobsAll( + unsigned char* pBuffer, + size_t bufferSize) +{ + return DevDriver::SettingsBlobNode::GetAllSettingsBlobs(pBuffer, bufferSize); +} diff --git a/icd/api/include/app_profile.h b/icd/api/include/app_profile.h index 86d1af44..3e04a17a 100644 --- a/icd/api/include/app_profile.h +++ b/icd/api/include/app_profile.h @@ -148,6 +148,7 @@ enum class AppProfile : uint32_t DXVK, // DXVK WindowKill, // Windowkill by torcado Archean, // Archean by batcholi + Houdini, // Houdini }; struct ProfileSettings diff --git a/icd/api/include/app_resource_optimizer.h b/icd/api/include/app_resource_optimizer.h index c98fa0aa..314f7b06 100644 --- a/icd/api/include/app_resource_optimizer.h +++ b/icd/api/include/app_resource_optimizer.h @@ -185,7 +185,7 @@ class ResourceOptimizer void BuildTuningProfile(); void BuildAppProfile(); -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE void BuildRuntimeProfile(); #endif @@ -195,7 +195,7 @@ class ResourceOptimizer ResourceProfile m_tuningProfile; ResourceProfile m_appProfile; -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE ResourceProfile m_runtimeProfile; #endif diff --git a/icd/api/include/app_shader_optimizer.h b/icd/api/include/app_shader_optimizer.h index 4ac850b5..cd83e834 100644 --- a/icd/api/include/app_shader_optimizer.h +++ b/icd/api/include/app_shader_optimizer.h @@ -205,7 +205,7 @@ class ShaderOptimizer void BuildAppProfileLlpc(); -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE void BuildRuntimeProfile(); void 
RuntimeProfileParseError(); #endif @@ -225,7 +225,7 @@ class ShaderOptimizer ShaderProfile m_appShaderProfile; -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE PipelineProfile m_runtimeProfile; #endif diff --git a/icd/api/include/compiler_solution.h b/icd/api/include/compiler_solution.h index 5493be7c..04429172 100644 --- a/icd/api/include/compiler_solution.h +++ b/icd/api/include/compiler_solution.h @@ -57,6 +57,8 @@ struct GraphicsPipelineLibraryInfo; struct DeferredWorkload; #endif +typedef uint32_t ShaderModuleFlags; + enum FreeCompilerBinary : uint32_t { FreeWithCompiler = 0, @@ -279,8 +281,7 @@ class CompilerSolution virtual VkResult BuildShaderModule( const Device* pDevice, - VkShaderModuleCreateFlags flags, - VkShaderModuleCreateFlags internalShaderFlags, + ShaderModuleFlags flags, const Vkgc::BinaryData& shaderBinary, ShaderModuleHandle* pShaderModule, const PipelineOptimizerKey& profileKey) = 0; @@ -304,6 +305,8 @@ class CompilerSolution PipelineCache* pPipelineCache, GraphicsLibraryType gplType, GraphicsPipelineBinaryCreateInfo* pCreateInfo, + const Vkgc::BinaryData* pProvidedBinary, + const Util::MetroHash::Hash* pProvidedBinaryHash, void* pPipelineDumpHandle, GplModuleState* pModuleState) = 0; @@ -389,6 +392,10 @@ class CompilerSolution bool hitAppCache, Vkgc::BinaryData* pCacheBinary); + bool ClonePipelineBinary( + const Vkgc::BinaryData* pProvidedBinary, + Vkgc::BinaryData* pNewBinary); + PhysicalDevice* m_pPhysicalDevice; // Vulkan physical device object Vkgc::GfxIpVersion m_gfxIp; // Graphics IP version info, used by Vkgc Pal::GfxIpLevel m_gfxIpLevel; // Graphics IP level diff --git a/icd/api/include/compiler_solution_llpc.h b/icd/api/include/compiler_solution_llpc.h index 9e49dae7..63028cf0 100644 --- a/icd/api/include/compiler_solution_llpc.h +++ b/icd/api/include/compiler_solution_llpc.h @@ -86,8 +86,7 @@ class CompilerSolutionLlpc final : public CompilerSolution virtual VkResult BuildShaderModule( const Device* pDevice, - 
VkShaderModuleCreateFlags flags, - VkShaderModuleCreateFlags internalShaderFlags, + ShaderModuleFlags flags, const Vkgc::BinaryData& shaderBinary, ShaderModuleHandle* pShaderModule, const PipelineOptimizerKey& profileKey) override; @@ -111,6 +110,8 @@ class CompilerSolutionLlpc final : public CompilerSolution PipelineCache* pPipelineCache, GraphicsLibraryType gplType, GraphicsPipelineBinaryCreateInfo* pCreateInfo, + const Vkgc::BinaryData* pProvidedBinary, + const Util::MetroHash::Hash* pProvidedBinaryHash, void* pPipelineDumpHandle, GplModuleState* pModuleState) override; diff --git a/icd/api/include/debug_printf.h b/icd/api/include/debug_printf.h index d9f1d918..d010f106 100644 --- a/icd/api/include/debug_printf.h +++ b/icd/api/include/debug_printf.h @@ -51,8 +51,8 @@ typedef Util::Vector PrintfBit; // Printf Elf string and bits position struct PrintfElfString { - PrintfString printStr; // Printf format string - PrintfBit bit64s; // Bit positions of output variables + PrintfString printStr; // Printf format string + PrintfBit bit64s; // Bit positions of output variables PrintfElfString() : printStr(nullptr), bit64s(nullptr) { } diff --git a/icd/api/include/graphics_pipeline_common.h b/icd/api/include/graphics_pipeline_common.h index bc7cab99..be1ef9a8 100644 --- a/icd/api/include/graphics_pipeline_common.h +++ b/icd/api/include/graphics_pipeline_common.h @@ -284,6 +284,12 @@ class GraphicsPipelineCommon : public Pipeline const VkGraphicsPipelineCreateInfo* pCreateInfo, GraphicsPipelineExtStructs* pExtStructs); + // Gets the color attachment count from either the renderpass or the pipeline rendering + static uint32_t GetColorAttachmentCount( + const RenderPass* pRenderPass, + const uint32_t subpassIndex, + const VkPipelineRenderingCreateInfo* pPipelineRenderingCreateInfo); + protected: // Convert API information into internal create info used to create internal pipeline object static void BuildPipelineObjectCreateInfo( diff --git 
a/icd/api/include/khronos/devext/vk_amd_gpa_interface.h b/icd/api/include/khronos/devext/vk_amd_gpa_interface.h index fd5a24b8..5aa89396 100644 --- a/icd/api/include/khronos/devext/vk_amd_gpa_interface.h +++ b/icd/api/include/khronos/devext/vk_amd_gpa_interface.h @@ -109,10 +109,8 @@ typedef enum VkGpaPerfBlockAMD VK_GPA_PERF_BLOCK_GE_DIST_AMD = 46, VK_GPA_PERF_BLOCK_GE_SE_AMD = 47, VK_GPA_PERF_BLOCK_DF_MALL_AMD = 48, -#if VKI_BUILD_GFX11 VK_GPA_PERF_BLOCK_SQ_WGP_AMD = 49, VK_GPA_PERF_BLOCK_PC_AMD = 50, -#endif VK_GPA_PERF_BLOCK_MAX_ENUM_AMD = 0x7FFFFFFF } VkGpaPerfBlockAMD; diff --git a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h index 49916033..6f1c17f2 100644 --- a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h +++ b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h @@ -69,7 +69,7 @@ extern "C" { #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should always be set to 0 // Version of this file -#define VK_HEADER_VERSION 293 +#define VK_HEADER_VERSION 295 // Complete version of this file #define VK_HEADER_VERSION_COMPLETE VK_MAKE_API_VERSION(0, 1, 3, VK_HEADER_VERSION) @@ -189,6 +189,8 @@ typedef enum VkResult { VK_ERROR_INVALID_VIDEO_STD_PARAMETERS_KHR = -1000299000, VK_ERROR_COMPRESSION_EXHAUSTED_EXT = -1000338000, VK_INCOMPATIBLE_SHADER_BINARY_EXT = 1000482000, + VK_PIPELINE_BINARY_MISSING_KHR = 1000483000, + VK_ERROR_NOT_ENOUGH_SPACE_KHR = -1000483000, VK_ERROR_OUT_OF_POOL_MEMORY_KHR = VK_ERROR_OUT_OF_POOL_MEMORY, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR = VK_ERROR_INVALID_EXTERNAL_HANDLE, VK_ERROR_FRAGMENTATION_EXT = VK_ERROR_FRAGMENTATION, @@ -694,7 +696,6 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_DEVICE_MEMORY_OVERALLOCATION_CREATE_INFO_AMD = 1000189000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT = 1000190000, VK_STRUCTURE_TYPE_PRESENT_FRAME_TOKEN_GGP = 1000191000, - 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_NV = 1000201000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_FEATURES_NV = 1000202000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_PROPERTIES_NV = 1000202001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_IMAGE_FOOTPRINT_FEATURES_NV = 1000204000, @@ -1043,6 +1044,16 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_OBJECT_FEATURES_EXT = 1000482000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_OBJECT_PROPERTIES_EXT = 1000482001, VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT = 1000482002, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_BINARY_FEATURES_KHR = 1000483000, + VK_STRUCTURE_TYPE_PIPELINE_BINARY_CREATE_INFO_KHR = 1000483001, + VK_STRUCTURE_TYPE_PIPELINE_BINARY_INFO_KHR = 1000483002, + VK_STRUCTURE_TYPE_PIPELINE_BINARY_KEY_KHR = 1000483003, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_BINARY_PROPERTIES_KHR = 1000483004, + VK_STRUCTURE_TYPE_RELEASE_CAPTURED_PIPELINE_DATA_INFO_KHR = 1000483005, + VK_STRUCTURE_TYPE_PIPELINE_BINARY_DATA_INFO_KHR = 1000483006, + VK_STRUCTURE_TYPE_PIPELINE_CREATE_INFO_KHR = 1000483007, + VK_STRUCTURE_TYPE_DEVICE_PIPELINE_BINARY_INTERNAL_CACHE_CONTROL_KHR = 1000483008, + VK_STRUCTURE_TYPE_PIPELINE_BINARY_HANDLES_INFO_KHR = 1000483009, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TILE_PROPERTIES_FEATURES_QCOM = 1000484000, VK_STRUCTURE_TYPE_TILE_PROPERTIES_QCOM = 1000484001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_AMIGO_PROFILING_FEATURES_SEC = 1000485000, @@ -1075,6 +1086,8 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR = 1000506002, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PER_VIEW_RENDER_AREAS_FEATURES_QCOM = 1000510000, VK_STRUCTURE_TYPE_MULTIVIEW_PER_VIEW_RENDER_AREAS_RENDER_PASS_BEGIN_INFO_QCOM = 1000510001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_KHR = 1000201000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_PROPERTIES_KHR = 1000511000, 
VK_STRUCTURE_TYPE_VIDEO_DECODE_AV1_CAPABILITIES_KHR = 1000512000, VK_STRUCTURE_TYPE_VIDEO_DECODE_AV1_PICTURE_INFO_KHR = 1000512001, VK_STRUCTURE_TYPE_VIDEO_DECODE_AV1_PROFILE_INFO_KHR = 1000512003, @@ -1243,6 +1256,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES, VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_NV = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_KHR, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_BARYCENTRIC_FEATURES_NV = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_BARYCENTRIC_FEATURES_KHR, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES, @@ -1421,6 +1435,7 @@ typedef enum VkObjectType { VK_OBJECT_TYPE_MICROMAP_EXT = 1000396000, VK_OBJECT_TYPE_OPTICAL_FLOW_SESSION_NV = 1000464000, VK_OBJECT_TYPE_SHADER_EXT = 1000482000, + VK_OBJECT_TYPE_PIPELINE_BINARY_KHR = 1000483000, VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_KHR = VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE, VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION_KHR = VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION, VK_OBJECT_TYPE_PRIVATE_DATA_SLOT_EXT = VK_OBJECT_TYPE_PRIVATE_DATA_SLOT, @@ -7606,6 +7621,7 @@ typedef enum VkColorSpaceKHR { VK_COLOR_SPACE_BT709_NONLINEAR_EXT = 1000104006, VK_COLOR_SPACE_BT2020_LINEAR_EXT = 1000104007, VK_COLOR_SPACE_HDR10_ST2084_EXT = 1000104008, + // VK_COLOR_SPACE_DOLBYVISION_EXT is deprecated, but no reason was given in the API XML 
VK_COLOR_SPACE_DOLBYVISION_EXT = 1000104009, VK_COLOR_SPACE_HDR10_HLG_EXT = 1000104010, VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT = 1000104011, @@ -11184,6 +11200,7 @@ static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_NO_PROTECTED_ACCE static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_PROTECTED_ACCESS_ONLY_BIT_EXT = 0x40000000ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_RAY_TRACING_DISPLACEMENT_MICROMAP_BIT_NV = 0x10000000ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_DESCRIPTOR_BUFFER_BIT_EXT = 0x20000000ULL; +static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR = 0x80000000ULL; typedef VkFlags64 VkBufferUsageFlags2KHR; @@ -11318,6 +11335,128 @@ typedef struct VkPhysicalDeviceRayTracingPositionFetchFeaturesKHR { +// VK_KHR_pipeline_binary is a preprocessor guard. Do not pass it to API calls. +#define VK_KHR_pipeline_binary 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkPipelineBinaryKHR) +#define VK_MAX_PIPELINE_BINARY_KEY_SIZE_KHR 32U +#define VK_KHR_PIPELINE_BINARY_SPEC_VERSION 1 +#define VK_KHR_PIPELINE_BINARY_EXTENSION_NAME "VK_KHR_pipeline_binary" +typedef struct VkPhysicalDevicePipelineBinaryFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 pipelineBinaries; +} VkPhysicalDevicePipelineBinaryFeaturesKHR; + +typedef struct VkPhysicalDevicePipelineBinaryPropertiesKHR { + VkStructureType sType; + void* pNext; + VkBool32 pipelineBinaryInternalCache; + VkBool32 pipelineBinaryInternalCacheControl; + VkBool32 pipelineBinaryPrefersInternalCache; + VkBool32 pipelineBinaryPrecompiledInternalCache; + VkBool32 pipelineBinaryCompressedData; +} VkPhysicalDevicePipelineBinaryPropertiesKHR; + +typedef struct VkDevicePipelineBinaryInternalCacheControlKHR { + VkStructureType sType; + const void* pNext; + VkBool32 disableInternalCache; +} VkDevicePipelineBinaryInternalCacheControlKHR; + +typedef struct VkPipelineBinaryKeyKHR { + VkStructureType sType; + void* pNext; + uint32_t 
keySize; + uint8_t key[VK_MAX_PIPELINE_BINARY_KEY_SIZE_KHR]; +} VkPipelineBinaryKeyKHR; + +typedef struct VkPipelineBinaryDataKHR { + size_t dataSize; + void* pData; +} VkPipelineBinaryDataKHR; + +typedef struct VkPipelineBinaryKeysAndDataKHR { + uint32_t binaryCount; + const VkPipelineBinaryKeyKHR* pPipelineBinaryKeys; + const VkPipelineBinaryDataKHR* pPipelineBinaryData; +} VkPipelineBinaryKeysAndDataKHR; + +typedef struct VkPipelineCreateInfoKHR { + VkStructureType sType; + void* pNext; +} VkPipelineCreateInfoKHR; + +typedef struct VkPipelineBinaryCreateInfoKHR { + VkStructureType sType; + const void* pNext; + const VkPipelineBinaryKeysAndDataKHR* pKeysAndDataInfo; + VkPipeline pipeline; + const VkPipelineCreateInfoKHR* pPipelineCreateInfo; +} VkPipelineBinaryCreateInfoKHR; + +typedef struct VkPipelineBinaryInfoKHR { + VkStructureType sType; + const void* pNext; + uint32_t binaryCount; + const VkPipelineBinaryKHR* pPipelineBinaries; +} VkPipelineBinaryInfoKHR; + +typedef struct VkReleaseCapturedPipelineDataInfoKHR { + VkStructureType sType; + void* pNext; + VkPipeline pipeline; +} VkReleaseCapturedPipelineDataInfoKHR; + +typedef struct VkPipelineBinaryDataInfoKHR { + VkStructureType sType; + void* pNext; + VkPipelineBinaryKHR pipelineBinary; +} VkPipelineBinaryDataInfoKHR; + +typedef struct VkPipelineBinaryHandlesInfoKHR { + VkStructureType sType; + const void* pNext; + uint32_t pipelineBinaryCount; + VkPipelineBinaryKHR* pPipelineBinaries; +} VkPipelineBinaryHandlesInfoKHR; + +typedef VkResult (VKAPI_PTR *PFN_vkCreatePipelineBinariesKHR)(VkDevice device, const VkPipelineBinaryCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineBinaryHandlesInfoKHR* pBinaries); +typedef void (VKAPI_PTR *PFN_vkDestroyPipelineBinaryKHR)(VkDevice device, VkPipelineBinaryKHR pipelineBinary, const VkAllocationCallbacks* pAllocator); +typedef VkResult (VKAPI_PTR *PFN_vkGetPipelineKeyKHR)(VkDevice device, const VkPipelineCreateInfoKHR* pPipelineCreateInfo, 
VkPipelineBinaryKeyKHR* pPipelineKey); +typedef VkResult (VKAPI_PTR *PFN_vkGetPipelineBinaryDataKHR)(VkDevice device, const VkPipelineBinaryDataInfoKHR* pInfo, VkPipelineBinaryKeyKHR* pPipelineBinaryKey, size_t* pPipelineBinaryDataSize, void* pPipelineBinaryData); +typedef VkResult (VKAPI_PTR *PFN_vkReleaseCapturedPipelineDataKHR)(VkDevice device, const VkReleaseCapturedPipelineDataInfoKHR* pInfo, const VkAllocationCallbacks* pAllocator); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreatePipelineBinariesKHR( + VkDevice device, + const VkPipelineBinaryCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipelineBinaryHandlesInfoKHR* pBinaries); + +VKAPI_ATTR void VKAPI_CALL vkDestroyPipelineBinaryKHR( + VkDevice device, + VkPipelineBinaryKHR pipelineBinary, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineKeyKHR( + VkDevice device, + const VkPipelineCreateInfoKHR* pPipelineCreateInfo, + VkPipelineBinaryKeyKHR* pPipelineKey); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineBinaryDataKHR( + VkDevice device, + const VkPipelineBinaryDataInfoKHR* pInfo, + VkPipelineBinaryKeyKHR* pPipelineBinaryKey, + size_t* pPipelineBinaryDataSize, + void* pPipelineBinaryData); + +VKAPI_ATTR VkResult VKAPI_CALL vkReleaseCapturedPipelineDataKHR( + VkDevice device, + const VkReleaseCapturedPipelineDataInfoKHR* pInfo, + const VkAllocationCallbacks* pAllocator); +#endif + + // VK_KHR_cooperative_matrix is a preprocessor guard. Do not pass it to API calls. #define VK_KHR_cooperative_matrix 1 #define VK_KHR_COOPERATIVE_MATRIX_SPEC_VERSION 2 @@ -11397,6 +11536,25 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR #endif +// VK_KHR_compute_shader_derivatives is a preprocessor guard. Do not pass it to API calls. 
+#define VK_KHR_compute_shader_derivatives 1 +#define VK_KHR_COMPUTE_SHADER_DERIVATIVES_SPEC_VERSION 1 +#define VK_KHR_COMPUTE_SHADER_DERIVATIVES_EXTENSION_NAME "VK_KHR_compute_shader_derivatives" +typedef struct VkPhysicalDeviceComputeShaderDerivativesFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 computeDerivativeGroupQuads; + VkBool32 computeDerivativeGroupLinear; +} VkPhysicalDeviceComputeShaderDerivativesFeaturesKHR; + +typedef struct VkPhysicalDeviceComputeShaderDerivativesPropertiesKHR { + VkStructureType sType; + void* pNext; + VkBool32 meshAndTaskShaderDerivatives; +} VkPhysicalDeviceComputeShaderDerivativesPropertiesKHR; + + + // VK_KHR_video_decode_av1 is a preprocessor guard. Do not pass it to API calls. #define VK_KHR_video_decode_av1 1 #include "vk_video/vulkan_video_codec_av1std.h" @@ -14265,12 +14423,7 @@ typedef VkPipelineCreationFeedback VkPipelineCreationFeedbackEXT; #define VK_NV_compute_shader_derivatives 1 #define VK_NV_COMPUTE_SHADER_DERIVATIVES_SPEC_VERSION 1 #define VK_NV_COMPUTE_SHADER_DERIVATIVES_EXTENSION_NAME "VK_NV_compute_shader_derivatives" -typedef struct VkPhysicalDeviceComputeShaderDerivativesFeaturesNV { - VkStructureType sType; - void* pNext; - VkBool32 computeDerivativeGroupQuads; - VkBool32 computeDerivativeGroupLinear; -} VkPhysicalDeviceComputeShaderDerivativesFeaturesNV; +typedef VkPhysicalDeviceComputeShaderDerivativesFeaturesKHR VkPhysicalDeviceComputeShaderDerivativesFeaturesNV; diff --git a/icd/api/include/pipeline_compiler.h b/icd/api/include/pipeline_compiler.h index dcd3acb1..8e6f6732 100644 --- a/icd/api/include/pipeline_compiler.h +++ b/icd/api/include/pipeline_compiler.h @@ -157,8 +157,7 @@ class PipelineCompiler VkResult BuildShaderModule( const Device* pDevice, - const VkShaderModuleCreateFlags flags, - const VkShaderModuleCreateFlags internalShaderFlags, + const ShaderModuleFlags flags, const Vkgc::BinaryData& shaderBinary, ShaderModuleHandle* pShaderModule); @@ -182,6 +181,8 @@ class 
PipelineCompiler PipelineCache* pPipelineCache, GraphicsLibraryType gplType, GraphicsPipelineBinaryCreateInfo* pCreateInfo, + const Vkgc::BinaryData* pProvidedBinary, + const Util::MetroHash::Hash* pProvidedBinaryHash, GplModuleState* pModuleState); VkResult CreateColorExportShaderLibrary( @@ -345,34 +346,10 @@ class PipelineCompiler bool needCache, GraphicsPipelineBinaryCreateInfo* pCreateInfo); - void GetComputePipelineCacheId( - uint32_t deviceIdx, - ComputePipelineBinaryCreateInfo* pCreateInfo, - uint64_t pipelineHash, - const Util::MetroHash::Hash& settingsHash, - Util::MetroHash::Hash* pCacheId); - - void GetGraphicsPipelineCacheId( - uint32_t deviceIdx, - GraphicsPipelineBinaryCreateInfo* pCreateInfo, - uint64_t pipelineHash, - const Util::MetroHash::Hash& settingsHash, - Util::MetroHash::Hash* pCacheId); - void GetColorExportShaderCacheId( GraphicsPipelineBinaryCreateInfo* pCreateInfo, Util::MetroHash::Hash* pCacheId); -#if VKI_RAY_TRACING - void GetRayTracingPipelineCacheId( - uint32_t deviceIdx, - uint32_t numDevices, - RayTracingPipelineBinaryCreateInfo* pCreateInfo, - uint64_t pipelineHash, - const Util::MetroHash::Hash& settingsHash, - Util::MetroHash::Hash* pCacheId); -#endif - static void BuildNggState( const Device* pDevice, const VkShaderStageFlagBits activeStages, @@ -520,21 +497,19 @@ class PipelineCompiler #endif VkResult LoadShaderModuleFromCache( - const VkShaderModuleCreateFlags flags, - const VkShaderModuleCreateFlags internalShaderFlags, + const ShaderModuleFlags flags, const uint32_t compilerMask, const Util::MetroHash::Hash& uniqueHash, ShaderModuleHandle* pShaderModule); void StoreShaderModuleToCache( - const VkShaderModuleCreateFlags flags, - const VkShaderModuleCreateFlags internalShaderFlags, + const ShaderModuleFlags flags, const uint32_t compilerMask, const Util::MetroHash::Hash& uniqueHash, ShaderModuleHandle* pShaderModule); Util::MetroHash::Hash GetShaderModuleCacheHash( - const VkShaderModuleCreateFlags flags, + const 
ShaderModuleFlags flags, const uint32_t compilerMask, const Util::MetroHash::Hash& uniqueHash); diff --git a/icd/api/include/vk_buffer_view.h b/icd/api/include/vk_buffer_view.h index 7375e7ec..5b385f7e 100644 --- a/icd/api/include/vk_buffer_view.h +++ b/icd/api/include/vk_buffer_view.h @@ -56,7 +56,6 @@ class BufferView final : public NonDispatchable const Pal::gpusize* bufferAddress, const VkFormat format, const uint32_t deviceNum, - const size_t srdSize, void* pSrdMemory); VkResult Destroy( diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h index 694098bf..8755dff7 100644 --- a/icd/api/include/vk_cmdbuffer.h +++ b/icd/api/include/vk_cmdbuffer.h @@ -923,7 +923,8 @@ class CmdBuffer template void PushDescriptorSetKHR( VkPipelineBindPoint pipelineBindPoint, @@ -1105,8 +1106,7 @@ class CmdBuffer void PalCmdAcquire( Pal::AcquireReleaseInfo* pAcquireReleaseInfo, - uint32_t eventCount, - const VkEvent* pEvents, + const VkEvent event, Pal::MemBarrier* const pBufferBarriers, const Buffer** const ppBuffers, Pal::ImgBarrier* const pImageBarriers, @@ -1116,8 +1116,7 @@ class CmdBuffer void PalCmdRelease( Pal::AcquireReleaseInfo* pAcquireReleaseInfo, - uint32_t eventCount, - const VkEvent* pEvents, + const VkEvent event, Pal::MemBarrier* const pBufferBarriers, const Buffer** const ppBuffers, Pal::ImgBarrier* const pImageBarriers, @@ -1316,7 +1315,7 @@ class CmdBuffer uint32_t NumDeviceEvents(uint32_t numEvents) const { return m_numPalDevices * numEvents; } -#if VK_ENABLE_DEBUG_BARRIERS +#if VKI_ENABLE_DEBUG_BARRIERS void DbgBarrierPreCmd(uint64_t cmd) { if (m_dbgBarrierPreCmdMask & (cmd)) @@ -1488,9 +1487,10 @@ class CmdBuffer const VkPushConstantsInfoKHR* pPushConstantsInfo); template + size_t samplerDescSize, + size_t typedBufferDescSize, + size_t untypedBufferDescSize, + uint32_t numPalDevices> void PushDescriptorSet2KHR( const VkPushDescriptorSetInfoKHR* pPushDescriptorSetInfo); @@ -1514,6 +1514,16 @@ class CmdBuffer bool isBegin); private: + + 
void BatchedLoadOpClears( + uint32_t clearCount, + const ImageView** pImageViews, + const Pal::ClearColor* pClearColors, + const Pal::ImageLayout* pClearLayouts, + const Pal::SubresRange* pRanges, + const Pal::SwizzledFormat* pClearFormats, + uint32_t viewMask); + PAL_DISALLOW_COPY_AND_ASSIGN(CmdBuffer); void ValidateGraphicsStates(); @@ -1585,9 +1595,8 @@ class CmdBuffer const VkImageMemoryBarrier* pImageMemoryBarriers); void ExecuteAcquireRelease( - uint32_t eventCount, - const VkEvent* pEvents, uint32_t dependencyCount, + const VkEvent* pEvents, const VkDependencyInfoKHR* pDependencyInfos, AcquireReleaseMode acquireReleaseMode, uint32_t rgpBarrierReasonType); @@ -1687,7 +1696,7 @@ class CmdBuffer #endif void ReleaseResources(); -#if VK_ENABLE_DEBUG_BARRIERS +#if VKI_ENABLE_DEBUG_BARRIERS void DbgCmdBarrier(bool preCmd); #endif @@ -1742,7 +1751,8 @@ class CmdBuffer template static VKAPI_ATTR void VKAPI_CALL CmdPushDescriptorSetKHR( VkCommandBuffer commandBuffer, @@ -1773,7 +1783,8 @@ class CmdBuffer template static VKAPI_ATTR void VKAPI_CALL CmdPushDescriptorSet2KHR( VkCommandBuffer commandBuffer, @@ -1955,8 +1966,7 @@ class CmdBuffer uint32_t preBindDefaultState : 1; uint32_t useReleaseAcquire : 1; uint32_t useSplitReleaseAcquire : 1; - uint32_t useBackupBuffer : 1; - uint32_t reserved2 : 3; + uint32_t useBackupBuffer : 1; uint32_t isRenderingSuspended : 1; #if VKI_RAY_TRACING uint32_t hasRayTracing : 1; @@ -1964,7 +1974,7 @@ class CmdBuffer uint32_t reserved4 : 1; #endif uint32_t offsetMode : 1; - uint32_t reserved : 13; + uint32_t reserved : 16; }; }; @@ -1996,7 +2006,7 @@ class CmdBuffer RenderPassInstanceState m_renderPassInstance; TransformFeedbackState* m_pTransformFeedbackState; -#if VK_ENABLE_DEBUG_BARRIERS +#if VKI_ENABLE_DEBUG_BARRIERS uint64_t m_dbgBarrierPreCmdMask; uint64_t m_dbgBarrierPostCmdMask; #endif diff --git a/icd/api/include/vk_compute_pipeline.h b/icd/api/include/vk_compute_pipeline.h index c2c0bddc..5f01590f 100644 --- 
a/icd/api/include/vk_compute_pipeline.h +++ b/icd/api/include/vk_compute_pipeline.h @@ -112,6 +112,7 @@ class ComputePipeline final : public Pipeline, public NonDispatchable(VK_GPA_PERF_BLOCK_GE1_AMD) == static_cast(Pal::GpuBlock::Ge1)) && (static_cast(VK_GPA_PERF_BLOCK_GE_DIST_AMD) == static_cast(Pal::GpuBlock::GeDist)) && (static_cast(VK_GPA_PERF_BLOCK_GE_SE_AMD) == static_cast(Pal::GpuBlock::GeSe)) && - (static_cast(VK_GPA_PERF_BLOCK_DF_MALL_AMD) == static_cast(Pal::GpuBlock::DfMall)) -#if VKI_BUILD_GFX11 - && (static_cast(VK_GPA_PERF_BLOCK_SQ_WGP_AMD) == static_cast(Pal::GpuBlock::SqWgp)) && + (static_cast(VK_GPA_PERF_BLOCK_DF_MALL_AMD) == static_cast(Pal::GpuBlock::DfMall)) && + (static_cast(VK_GPA_PERF_BLOCK_SQ_WGP_AMD) == static_cast(Pal::GpuBlock::SqWgp)) && (static_cast(VK_GPA_PERF_BLOCK_PC_AMD) == static_cast(Pal::GpuBlock::Pc)) -#endif , "Need to update function convert::GpuBlock"); diff --git a/icd/api/include/vk_defines.h b/icd/api/include/vk_defines.h index 849d5e60..ad7448b7 100644 --- a/icd/api/include/vk_defines.h +++ b/icd/api/include/vk_defines.h @@ -188,6 +188,9 @@ namespace vk // The maximum number of sets that can appear in a pipeline layout static const uint32_t MaxDescriptorSets = 32; + // The maximum size of a buffer SRD + static const uint32_t MaxBufferSrdSize = 8; + // The maximum size of push constants in bytes static const uint32_t MaxPushConstants = 256; diff --git a/icd/api/include/vk_descriptor_set.h b/icd/api/include/vk_descriptor_set.h index a51cccba..939bb2cf 100644 --- a/icd/api/include/vk_descriptor_set.h +++ b/icd/api/include/vk_descriptor_set.h @@ -324,7 +324,8 @@ class DescriptorUpdate template static VKAPI_ATTR void VKAPI_CALL UpdateDescriptorSets( VkDevice device, @@ -336,7 +337,8 @@ class DescriptorUpdate template static void WriteDescriptorSets( const Device* pDevice, diff --git a/icd/api/include/vk_descriptor_update_template.h b/icd/api/include/vk_descriptor_update_template.h index b23c2a88..84e6bbe4 100644 --- 
a/icd/api/include/vk_descriptor_update_template.h +++ b/icd/api/include/vk_descriptor_update_template.h @@ -104,7 +104,8 @@ class DescriptorUpdateTemplate final : public NonDispatchable static PfnUpdateEntry GetUpdateEntryFunc( VkDescriptorType descriptorType, diff --git a/icd/api/include/vk_device.h b/icd/api/include/vk_device.h index 9ce4d34f..d9038d27 100644 --- a/icd/api/include/vk_device.h +++ b/icd/api/include/vk_device.h @@ -168,7 +168,9 @@ class Device uint32 reserved2 : 1; uint32 deviceGeneratedCommands : 1; uint32 robustVertexBufferExtend : 1; - uint32 reserved : 11; + uint32 enableDebugPrintf : 1; + uint32 reserved3 : 1; + uint32 reserved : 9; }; uint32 u32All; @@ -193,7 +195,8 @@ class Device struct { - uint32_t bufferView; + uint32_t typedBufferView; + uint32_t untypedBufferView; uint32_t imageView; uint32_t fmaskView; uint32_t sampler; @@ -809,7 +812,7 @@ class Device const uint8_t* pCode, uint32_t numUserDataNodes, Vkgc::ResourceMappingRootNode* pUserDataNodes, - VkShaderModuleCreateFlags internalShaderFlags, + ShaderModuleFlags flags, bool forceWave64, const VkSpecializationInfo* pSpecializationInfo, InternalPipeline* pInternalPipeline); @@ -1444,6 +1447,34 @@ VKAPI_ATTR void VKAPI_CALL vkGetGeneratedCommandsMemoryRequirementsNV( const VkGeneratedCommandsMemoryRequirementsInfoNV* pInfo, VkMemoryRequirements2* pMemoryRequirements); +VKAPI_ATTR VkResult VKAPI_CALL vkCreatePipelineBinariesKHR( + VkDevice device, + const VkPipelineBinaryCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipelineBinaryHandlesInfoKHR* pBinaries); + +VKAPI_ATTR void VKAPI_CALL vkDestroyPipelineBinaryKHR( + VkDevice device, + VkPipelineBinaryKHR pipelineBinary, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineKeyKHR( + VkDevice device, + const VkPipelineCreateInfoKHR* pPipelineCreateInfo, + VkPipelineBinaryKeyKHR* pPipelineBinaryKey); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineBinaryDataKHR( + VkDevice 
device, + const VkPipelineBinaryDataInfoKHR* pInfo, + VkPipelineBinaryKeyKHR* pPipelineBinaryKey, + size_t* pPipelineBinaryDataSize, + void* pPipelineBinaryData); + +VKAPI_ATTR VkResult VKAPI_CALL vkReleaseCapturedPipelineDataKHR( + VkDevice device, + const VkReleaseCapturedPipelineDataInfoKHR* pInfo, + const VkAllocationCallbacks* pAllocator); + } // namespace entry } // namespace vk diff --git a/icd/api/include/vk_extensions.h b/icd/api/include/vk_extensions.h index 0424401f..a1a7549d 100644 --- a/icd/api/include/vk_extensions.h +++ b/icd/api/include/vk_extensions.h @@ -323,6 +323,7 @@ class DeviceExtensions final : public Extensions KHR_MAINTENANCE7, KHR_MAP_MEMORY2, KHR_MULTIVIEW, + KHR_PIPELINE_BINARY, KHR_PIPELINE_EXECUTABLE_PROPERTIES, KHR_PIPELINE_LIBRARY, KHR_PUSH_DESCRIPTOR, diff --git a/icd/api/include/vk_formats.h b/icd/api/include/vk_formats.h index 7d3a2497..452b5bd3 100755 --- a/icd/api/include/vk_formats.h +++ b/icd/api/include/vk_formats.h @@ -283,13 +283,19 @@ bool Formats::IsEtc2Format( // Returns true if the given format is a valid RT Vertex Buffer format. 
bool Formats::IsRTVertexFormat(VkFormat format) { - return (VK_FORMAT_R32G32_SFLOAT == format) || - (VK_FORMAT_R32G32B32_SFLOAT == format) || - (VK_FORMAT_R16G16_SFLOAT == format) || - (VK_FORMAT_R16G16B16A16_SFLOAT == format) || - (VK_FORMAT_R16G16_SNORM == format) || - (VK_FORMAT_R16G16B16A16_SNORM == format) || - (VK_FORMAT_R16G16B16A16_UNORM == format); + return (VK_FORMAT_R32G32_SFLOAT == format) || + (VK_FORMAT_R32G32B32_SFLOAT == format) || + (VK_FORMAT_R16G16_SFLOAT == format) || + (VK_FORMAT_R16G16B16A16_SFLOAT == format) || + (VK_FORMAT_R16G16_SNORM == format) || + (VK_FORMAT_R16G16B16A16_SNORM == format) || + (VK_FORMAT_R16G16B16A16_UNORM == format) || + (VK_FORMAT_R16G16_UNORM == format) || + (VK_FORMAT_A2B10G10R10_UNORM_PACK32 == format) || + (VK_FORMAT_R8G8B8A8_UNORM == format) || + (VK_FORMAT_R8G8_UNORM == format) || + (VK_FORMAT_R8G8B8A8_SNORM == format) || + (VK_FORMAT_R8G8_SNORM == format); } // ===================================================================================================================== diff --git a/icd/api/include/vk_graphics_pipeline.h b/icd/api/include/vk_graphics_pipeline.h index a0e9e55d..796d965a 100644 --- a/icd/api/include/vk_graphics_pipeline.h +++ b/icd/api/include/vk_graphics_pipeline.h @@ -227,6 +227,7 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch Device* const pDevice, Pal::IPipeline** pPalPipeline, const PipelineLayout* pLayout, + PipelineBinaryStorage* pBinaryStorage, const GraphicsPipelineObjectImmedInfo& immedInfo, uint64_t staticStateMask, GraphicsPipelineObjectFlags flags, @@ -277,6 +278,7 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch PipelineCache* pPipelineCache, const Util::MetroHash::Hash* pCacheIds, uint64_t apiPsoHash, + const PipelineBinaryStorage& binaryStorage, GraphicsPipelineObjectCreateInfo* pObjectCreateInfo, VkPipeline* pPipeline); diff --git a/icd/api/include/vk_graphics_pipeline_library.h 
b/icd/api/include/vk_graphics_pipeline_library.h index 7647f357..2569b1c8 100644 --- a/icd/api/include/vk_graphics_pipeline_library.h +++ b/icd/api/include/vk_graphics_pipeline_library.h @@ -96,6 +96,7 @@ class GraphicsPipelineLibrary final : public GraphicsPipelineCommon, public NonD const Util::MetroHash::Hash& elfHash, const uint64_t apiHash, const GplModuleState* pGplModuleStates, + PipelineBinaryStorage* pBinaryStorage, const PipelineLayout* pPipelineLayout); static VkResult CreatePartialPipelineBinary( diff --git a/icd/api/include/vk_indirect_commands_layout.h b/icd/api/include/vk_indirect_commands_layout.h index dc80025c..eadcb646 100644 --- a/icd/api/include/vk_indirect_commands_layout.h +++ b/icd/api/include/vk_indirect_commands_layout.h @@ -64,7 +64,8 @@ enum IndirectCommandsActionType Draw = 0, DrawIndexed, Dispatch, - DrawMeshTask + DrawMeshTask, + TraceRay }; struct IndirectCommandsInfo @@ -72,6 +73,7 @@ struct IndirectCommandsInfo IndirectCommandsActionType actionType; IndirectCommandsLayoutType layoutType; uint32_t strideInBytes; + uint32_t preActionArgSizeInBytes; }; // ===================================================================================================================== diff --git a/icd/api/include/vk_physical_device.h b/icd/api/include/vk_physical_device.h index f85f9206..98f19642 100644 --- a/icd/api/include/vk_physical_device.h +++ b/icd/api/include/vk_physical_device.h @@ -387,6 +387,26 @@ class PhysicalDevice void GetDevicePropertiesMaxBufferSize( VkDeviceSize* pMaxBufferSize) const; + void GetPhysicalDeviceLineSubPixelPrecisionBits( + uint32_t* pLineSubPixelPrecisionBits) const; + + void GetPhysicalDeviceVertexAttributeDivisorProperties( + uint32_t* pMaxVertexAttribDivisor, + VkBool32* pSupportsNonZeroFirstInstance) const; + + void GetPhysicalDeviceMaintenance5Properties( + VkBool32* pEarlyFragmentMultisampleCoverageAfterSampleCounting, + VkBool32* pEarlyFragmentSampleMaskTestBeforeSampleCounting, + VkBool32* 
pDepthStencilSwizzleOneSupport, + VkBool32* pPolygonModePointSize, + VkBool32* pNonStrictSinglePixelWideLinesUseParallelogram, + VkBool32* pNonStrictWideLinesUseParallelogram) const; + + void GetPhysicalDeviceMaintenance6Properties( + VkBool32* pBlockTexelViewCompatibleMultipleLayers, + uint32_t* pMaxCombinedImageSamplerDescriptorCount, + VkBool32* pFragmentShadingRateClampCombinerInputs) const; + void GetPhysicalDeviceDriverProperties( VkDriverId* pDriverID, char* pDriverName, @@ -522,6 +542,14 @@ template VkBool32* pVulkanMemoryModelDeviceScope, VkBool32* pVulkanMemoryModelAvailabilityVisibilityChains) const; + void GetPhysicalDeviceLineRasterizationFeatures( + VkBool32* pRectangularLines, + VkBool32* pBresenhamLines, + VkBool32* pSmoothLines, + VkBool32* pStippledRectangularLines, + VkBool32* pStippledBresenhamLines, + VkBool32* pStippledSmoothLines) const; + VkResult GetPhysicalDeviceCalibrateableTimeDomainsEXT( uint32_t* pTimeDomainCount, VkTimeDomainEXT* pTimeDomains); diff --git a/icd/api/include/vk_pipeline.h b/icd/api/include/vk_pipeline.h index c6919bc0..9118d278 100644 --- a/icd/api/include/vk_pipeline.h +++ b/icd/api/include/vk_pipeline.h @@ -72,6 +72,20 @@ struct PipelineBinaryInfo Util::MetroHash::Hash binaryHash; }; +constexpr uint32 MaxPipelineBinaryInfoCount = Util::Max(MaxPalDevices, static_cast(GraphicsLibraryCount)); + +// If a pipeline is created with VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR set, it must retain its binaries so that we +// can create VkPipelineBinaryKHR objects from it at any time. We can't rely on our in-memory cache, because it can be +// disabled or have its entries evicted. This struct lets the pipeline store up to MaxPalDevices binaries and retrieve +// them by key or device index. +struct PipelineBinaryStorage +{ + // For monolithic pipelines this stores a single packed blob per device (same as how caching works). For graphics + // pipeline libraries, this stores an elf binary blob per graphics library type. 
+ PipelineBinaryInfo binaryInfo[MaxPipelineBinaryInfoCount]; + uint32 binaryCount; +}; + enum class DynamicStatesInternal : uint32_t { Viewport = 0, @@ -133,6 +147,7 @@ enum class DynamicStatesInternal : uint32_t struct PipelineExtStructs { const VkPipelineCreationFeedbackCreateInfoEXT* pPipelineCreationFeedbackCreateInfoEXT; + const VkPipelineBinaryInfoKHR* pPipelineBinaryInfoKHR; }; // ===================================================================================================================== @@ -243,6 +258,24 @@ class Pipeline Util::MetroHash::Hash* pCacheId ); + const PipelineBinaryStorage* GetBinaryStorage() const + { return m_pBinaryStorage; } + + // See the implementation note about memory ownership behavior. + static void InsertBinaryData( + PipelineBinaryStorage* pBinaryStorage, + const uint32 binaryIndex, + const Util::MetroHash::Hash& key, + const size_t dataSize, + const void* pData); + + VkResult FreeBinaryStorage( + const VkAllocationCallbacks* pAllocator); + + static void FreeBinaryStorage( + PipelineBinaryStorage* pBinaryStorage, + const VkAllocationCallbacks* pAllocator); + static void FreeTempModules( const Device* pDevice, const uint32_t maxStageCount, @@ -259,6 +292,7 @@ class Pipeline void Init( Pal::IPipeline** pPalPipeline, const PipelineLayout* pLayout, + PipelineBinaryStorage* pBinaryStorage, uint64_t staticStateMask, #if VKI_RAY_TRACING uint32_t dispatchRaysUserDataOffset, @@ -310,6 +344,7 @@ class Pipeline private: PAL_DISALLOW_COPY_AND_ASSIGN(Pipeline); + PipelineBinaryStorage* m_pBinaryStorage; PrintfFormatMap* m_pFormatStrings; }; diff --git a/icd/api/include/vk_pipeline_binary.h b/icd/api/include/vk_pipeline_binary.h new file mode 100644 index 00000000..ec3ade3b --- /dev/null +++ b/icd/api/include/vk_pipeline_binary.h @@ -0,0 +1,106 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#ifndef __VK_PIPELINE_BINARY_H__ +#define __VK_PIPELINE_BINARY_H__ + +#pragma once + +#include "include/vk_dispatch.h" +#include "include/vk_pipeline.h" + +#include "palMetroHash.h" + +namespace vk +{ +class Device; + +class PipelineBinary final : public NonDispatchable +{ +public: + static VkResult CreatePipelineBinaries( + Device* pDevice, + const VkPipelineBinaryCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipelineBinaryHandlesInfoKHR* pBinaries); + + VkResult DestroyPipelineBinary( + Device* pDevice, + const VkAllocationCallbacks* pAllocator); + + static VkResult GetPipelineKey( + const Device* pDevice, + const VkPipelineCreateInfoKHR* pPipelineCreateInfo, + VkPipelineBinaryKeyKHR* pPipelineBinaryKey); + + VkResult GetPipelineBinaryData( + VkPipelineBinaryKeyKHR* pPipelineBinaryKey, + size_t* pPipelineBinaryDataSize, + void* pPipelineBinaryData); + + static VkResult ReleaseCapturedPipelineData( + Device* pDevice, + Pipeline* pPipeline, + const VkAllocationCallbacks* pAllocator); + + static void ReadFromPipelineBinaryKey( + const VkPipelineBinaryKeyKHR& inKey, + Util::MetroHash::Hash* pOutKey); + + const Util::MetroHash::Hash& BinaryKey() const + { return m_binaryKey; } + + const Vkgc::BinaryData& BinaryData() const + { return m_binaryData; } + +protected: + +private: + PAL_DISALLOW_COPY_AND_ASSIGN(PipelineBinary); + + PipelineBinary( + const Util::MetroHash::Hash& binaryKey, + const Vkgc::BinaryData& binaryData); + + // Pipeline binary doesn't contain the key itself. 
+ static VkResult Create( + Device* pDevice, + const Util::MetroHash::Hash& binaryKey, + const Vkgc::BinaryData& binaryData, + const VkAllocationCallbacks* pAllocator, + VkPipelineBinaryKHR* pPipelineBinary); + + static void WriteToPipelineBinaryKey( + const void* pSrcData, + const size_t dataSize, + VkPipelineBinaryKeyKHR* pDstKey); + + const Util::MetroHash::Hash m_binaryKey; + const Vkgc::BinaryData m_binaryData; +}; + +} // namespace vk + +#endif /* __VK_PIPELINE_BINARY_H__ */ diff --git a/icd/api/include/vk_pipeline_layout.h b/icd/api/include/vk_pipeline_layout.h index f7c58f78..b78a2b06 100644 --- a/icd/api/include/vk_pipeline_layout.h +++ b/icd/api/include/vk_pipeline_layout.h @@ -1,4 +1,4 @@ -/* +/* *********************************************************************************************************************** * * Copyright (c) 2014-2024 Advanced Micro Devices, Inc. All Rights Reserved. @@ -256,6 +256,12 @@ class PipelineLayout final : public NonDispatchable @@ -97,6 +99,13 @@ class ShaderModule final : public NonDispatchable static void* GetFirstValidShaderData(const ShaderModuleHandle* pHandle); + static ShaderModuleFlags ConvertVkShaderModuleCreateFlags(VkShaderModuleCreateFlags flags) + { + // There aren't any VkShaderModuleCreateFlags yet, but this function should be implemented when one is added. 
+ VK_ASSERT(flags == 0); + return 0; + } + protected: ShaderModule(size_t codeSize, const void* pCode, VkShaderModuleCreateFlags flags); VkResult Init(Device* pDevice); @@ -105,7 +114,7 @@ class ShaderModule final : public NonDispatchable const void* m_pCode; ShaderModuleHandle m_handle; Pal::ShaderHash m_codeHash; - VkShaderModuleCreateFlags m_flags; + ShaderModuleFlags m_flags; private: PAL_DISALLOW_COPY_AND_ASSIGN(ShaderModule); diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp index a0236cb1..3e8e7927 100644 --- a/icd/api/pipeline_compiler.cpp +++ b/icd/api/pipeline_compiler.cpp @@ -196,11 +196,11 @@ static void ApplyProfileOptions( static bool SupportInternalModuleCache( const PhysicalDevice* pDevice, const uint32_t compilerMask, - const VkShaderModuleCreateFlags internalShaderFlags) + const ShaderModuleFlags flags) { bool supportInternalModuleCache = false; - if (Util::TestAnyFlagSet(internalShaderFlags, VK_INTERNAL_SHADER_FLAGS_FORCE_UNCACHED_BIT)) + if (Util::TestAnyFlagSet(flags, ShaderModuleForceUncached)) { supportInternalModuleCache = false; } @@ -330,12 +330,10 @@ VkResult PipelineCompiler::Initialize() m_gfxIp.major = 10; m_gfxIp.minor = 3; break; -#if VKI_BUILD_GFX11 case Pal::GfxIpLevel::GfxIp11_0: m_gfxIp.major = 11; m_gfxIp.minor = 0; break; -#endif #if VKI_BUILD_GFX115 case Pal::GfxIpLevel::GfxIp11_5: m_gfxIp.major = 11; @@ -481,15 +479,24 @@ bool PipelineCompiler::LoadReplaceShaderBinary( // ===================================================================================================================== // Generates shader module cache hash ID Util::MetroHash::Hash PipelineCompiler::GetShaderModuleCacheHash( - const VkShaderModuleCreateFlags flags, + const ShaderModuleFlags flags, const uint32_t compilerMask, const Util::MetroHash::Hash& uniqueHash) { + // None of the internal flags require hashing, but any new API provided one might. 
+ constexpr ShaderModuleFlags AllInternalFlags = +#if VKI_RAY_TRACING + ShaderModuleInternalRayTracingShader | +#endif + ShaderModuleInternalShader | + ShaderModuleAllowDelayConversion | + ShaderModuleForceUncached; + VK_ASSERT(Util::TestAnyFlagSet(flags, ~AllInternalFlags) == false); + Util::MetroHash128 hasher; Util::MetroHash::Hash hash; hasher.Update(compilerMask); hasher.Update(uniqueHash); - hasher.Update(flags); hasher.Update(m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash()); hasher.Finalize(hash.bytes); return hash; @@ -498,8 +505,7 @@ Util::MetroHash::Hash PipelineCompiler::GetShaderModuleCacheHash( // ===================================================================================================================== // Loads shader module from cache, include both run-time cache and binary cache VkResult PipelineCompiler::LoadShaderModuleFromCache( - const VkShaderModuleCreateFlags flags, - const VkShaderModuleCreateFlags internalShaderFlags, + const ShaderModuleFlags flags, const uint32_t compilerMask, const Util::MetroHash::Hash& uniqueHash, ShaderModuleHandle* pShaderModule) @@ -507,7 +513,7 @@ VkResult PipelineCompiler::LoadShaderModuleFromCache( VkResult result = VK_ERROR_INITIALIZATION_FAILED; const bool supportInternalModuleCache = - SupportInternalModuleCache(m_pPhysicalDevice, compilerMask, internalShaderFlags); + SupportInternalModuleCache(m_pPhysicalDevice, compilerMask, flags); const bool delayConversion = false; VK_ASSERT(pShaderModule->pRefCount == nullptr); @@ -585,8 +591,7 @@ VkResult PipelineCompiler::LoadShaderModuleFromCache( // ===================================================================================================================== // Stores shader module to cache, include both run-time cache and binary cache void PipelineCompiler::StoreShaderModuleToCache( - const VkShaderModuleCreateFlags flags, - const VkShaderModuleCreateFlags internalShaderFlags, + const ShaderModuleFlags flags, const uint32_t compilerMask, 
const Util::MetroHash::Hash& uniqueHash, ShaderModuleHandle* pShaderModule) @@ -594,7 +599,7 @@ void PipelineCompiler::StoreShaderModuleToCache( VK_ASSERT(pShaderModule->pRefCount == nullptr); const bool supportInternalModuleCache = - SupportInternalModuleCache(m_pPhysicalDevice, compilerMask, internalShaderFlags); + SupportInternalModuleCache(m_pPhysicalDevice, compilerMask, flags); if (supportInternalModuleCache) { @@ -633,8 +638,7 @@ void PipelineCompiler::StoreShaderModuleToCache( // Builds shader module from SPIR-V binary code. VkResult PipelineCompiler::BuildShaderModule( const Device* pDevice, - const VkShaderModuleCreateFlags flags, - const VkShaderModuleCreateFlags internalShaderFlags, + const ShaderModuleFlags flags, const Vkgc::BinaryData& shaderBinary, ShaderModuleHandle* pShaderModule) { @@ -652,6 +656,8 @@ VkResult PipelineCompiler::BuildShaderModule( bool findReplaceShader = false; + ShaderModuleFlags shaderFlags = flags; + Vkgc::BinaryData finalData = shaderBinary; if ((pSettings->shaderReplaceMode == ShaderReplaceShaderHash) || (pSettings->shaderReplaceMode == ShaderReplaceShaderHashPipelineBinaryHash)) @@ -664,11 +670,11 @@ VkResult PipelineCompiler::BuildShaderModule( finalData = replaceBinary; Util::MetroHash64::Hash( reinterpret_cast(replaceBinary.pCode), replaceBinary.codeSize, uniqueHash.bytes); + } } - result = LoadShaderModuleFromCache( - flags, internalShaderFlags, compilerMask, uniqueHash, pShaderModule); + result = LoadShaderModuleFromCache(shaderFlags, compilerMask, uniqueHash, pShaderModule); if (result != VK_SUCCESS) { @@ -676,15 +682,14 @@ VkResult PipelineCompiler::BuildShaderModule( { result = m_compilerSolutionLlpc.BuildShaderModule( pDevice, - flags, - internalShaderFlags, + shaderFlags, finalData, pShaderModule, PipelineOptimizerKey{}); } - StoreShaderModuleToCache(flags, internalShaderFlags, compilerMask, uniqueHash, pShaderModule); + StoreShaderModuleToCache(shaderFlags, compilerMask, uniqueHash, pShaderModule); } else if 
((pSettings->enablePipelineDump) ) @@ -800,7 +805,7 @@ bool PipelineCompiler::ReplacePipelineShaderModule( if (LoadReplaceShaderBinary(hash64, &shaderBinary)) { VkResult result = - BuildShaderModule(pDevice, 0, 0, shaderBinary, pShaderModule); + BuildShaderModule(pDevice, 0, shaderBinary, pShaderModule); if (result == VK_SUCCESS) { @@ -1194,6 +1199,8 @@ VkResult PipelineCompiler::CreateGraphicsShaderBinary( PipelineCache* pPipelineCache, GraphicsLibraryType gplType, GraphicsPipelineBinaryCreateInfo* pCreateInfo, + const Vkgc::BinaryData* pProvidedBinary, + const Util::MetroHash::Hash* pProvidedBinaryHash, GplModuleState* pModuleState) { VkResult result = VK_SUCCESS; @@ -1226,6 +1233,8 @@ VkResult PipelineCompiler::CreateGraphicsShaderBinary( pPipelineCache, gplType, pCreateInfo, + pProvidedBinary, + pProvidedBinaryHash, pPipelineDumpHandle, pModuleState); @@ -1655,7 +1664,6 @@ void BuildLlpcVertexInputDescriptors( { VK_ASSERT(pVbInfo != nullptr); - const uint32_t srdDwSize = pDevice->GetProperties().descriptorSizes.bufferView / sizeof(uint32_t); uint32_t activeBindings = 0; // Sort the strides by binding slot @@ -1823,9 +1831,7 @@ static void MergePipelineOptions( pDst->extendedRobustness.nullDescriptor |= src.extendedRobustness.nullDescriptor; pDst->extendedRobustness.robustBufferAccess |= src.extendedRobustness.robustBufferAccess; pDst->extendedRobustness.robustImageAccess |= src.extendedRobustness.robustImageAccess; -#if VKI_BUILD_GFX11 pDst->optimizeTessFactor |= src.optimizeTessFactor; -#endif pDst->enableInterpModePatch |= src.enableInterpModePatch; pDst->pageMigrationEnabled |= src.pageMigrationEnabled; pDst->optimizationLevel |= src.optimizationLevel; @@ -2079,11 +2085,11 @@ void PipelineCompiler::BuildNggState( // NOTE: To support unrestrict dynamic primtive topology, we need full disable NGG on gfx10. 
bool disallowNgg = unrestrictedPrimitiveTopology; -#if VKI_BUILD_GFX11 + // On gfx11, we needn't program GS output primitive type on VsPs pipeline, so we can support unrestrict dynamic // primtive topology with NGG. disallowNgg = (disallowNgg && (deviceProp.gfxLevel < Pal::GfxIpLevel::GfxIp11_0)); -#endif + if (disallowNgg) { pCreateInfo->pipelineInfo.nggState.enableNgg = false; @@ -2399,11 +2405,18 @@ static void BuildColorBlendState( { auto pRendering = extStructs.pPipelineRenderingCreateInfo; + const uint32 numColorTargets = GraphicsPipelineCommon::GetColorAttachmentCount(pRenderPass, subpass, pRendering); if ((pCb != nullptr) || (pRendering != nullptr)) { - const uint32_t numColorTargets = (pRendering != nullptr) ? - Util::Min(pRendering->colorAttachmentCount, Pal::MaxColorTargets) : - Util::Min(pCb->attachmentCount, Pal::MaxColorTargets); + bool useBlendAttachments = false; + // If the pipeline is created with these 3 states as dynamic, the attachmentCount from the + // VkPipelineColorBlendStateCreateInfo is ignored. 
+ if ((IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorBlendEnable) == false) || + (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorBlendEquation) == false) || + (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorWriteMask) == false)) + { + useBlendAttachments = true; + } if (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorBlendEquation) || IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorBlendEnable)) @@ -2458,7 +2471,7 @@ static void BuildColorBlendState( VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; - if ((pCb != nullptr) && (i < pCb->attachmentCount)) + if (useBlendAttachments && (i < pCb->attachmentCount)) { const VkPipelineColorBlendAttachmentState& src = pCb->pAttachments[i]; if (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorWriteMask) == false) @@ -3066,6 +3079,12 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( result = BuildPipelineResourceMapping(pDevice, pPipelineLayout, availableStageMask, pCreateInfo); } + + // Primitive Generated Query is only used for traditional shaders, disable it for mesh shader. 
+ if (pCreateInfo->pipelineInfo.mesh.pModuleData != nullptr) + { + pCreateInfo->pipelineInfo.options.enablePrimGeneratedQuery = false; + } } if (result == VK_SUCCESS) @@ -3295,9 +3314,7 @@ void PipelineCompiler::ApplyPipelineOptions( pOptions->enableRelocatableShaderElf = settings.enableRelocatableShaders; pOptions->disableImageResourceCheck = settings.disableImageResourceTypeCheck; -#if VKI_BUILD_GFX11 pOptions->optimizeTessFactor = settings.optimizeTessFactor != OptimizeTessFactorDisable; -#endif pOptions->forceCsThreadIdSwizzling = settings.forceCsThreadIdSwizzling; pOptions->overrideThreadGroupSizeX = settings.overrideThreadGroupSizeX; pOptions->overrideThreadGroupSizeY = settings.overrideThreadGroupSizeY; @@ -4322,9 +4339,7 @@ void PipelineCompiler::SetRayTracingState( bvhInfo.boxSortHeuristic = Pal::BoxSortHeuristic::ClosestFirst; bvhInfo.flags.useZeroOffset = 1; bvhInfo.flags.returnBarycentrics = 1; -#if VKI_BUILD_GFX11 bvhInfo.flags.pointerFlags = settings.rtEnableNodePointerFlags; -#endif // Bypass Mall cache read/write if no alloc policy is set for SRDs. // This global setting applies to every BVH SRD. 
@@ -4409,7 +4424,6 @@ void PipelineCompiler::SetRayTracingState( auto rtCounterMode = pDevice->RayTrace()->TraceRayCounterMode(DefaultDeviceIndex); pRtState->enableRayTracingCounters = (rtCounterMode != GpuRt::TraceRayCounterMode::TraceRayCounterDisable); -#if VKI_BUILD_GFX11 // Enable hardware traversal stack on RTIP 2.0+ if (settings.emulatedRtIpLevel > EmulatedRtIpLevel1_1) { @@ -4427,7 +4441,6 @@ void PipelineCompiler::SetRayTracingState( pRtState->enableRayTracingHwTraversalStack = 1; } } -#endif Pal::RayTracingIpLevel rayTracingIp = pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties().gfxipProperties.rayTracingIp; @@ -4441,11 +4454,9 @@ void PipelineCompiler::SetRayTracingState( case EmulatedRtIpLevel1_1: rayTracingIp = Pal::RayTracingIpLevel::RtIp1_1; break; -#if VKI_BUILD_GFX11 case EmulatedRtIpLevel2_0: rayTracingIp = Pal::RayTracingIpLevel::RtIp2_0; break; -#endif default: VK_ASSERT(false); break; @@ -4460,11 +4471,9 @@ void PipelineCompiler::SetRayTracingState( case Pal::RayTracingIpLevel::RtIp1_1: pRtState->rtIpVersion = Vkgc::RtIpVersion({ 1, 1 }); break; -#if VKI_BUILD_GFX11 case Pal::RayTracingIpLevel::RtIp2_0: pRtState->rtIpVersion = Vkgc::RtIpVersion({ 2, 0 }); break; -#endif default: VK_NEVER_CALLED(); break; @@ -4472,7 +4481,7 @@ void PipelineCompiler::SetRayTracingState( pRtState->gpurtFeatureFlags = GpuRtShaderLibraryFlags(pDevice); - const GpuRt::PipelineShaderCode codePatch = GpuRt::GetShaderLibraryCode(pRtState->gpurtFeatureFlags); + const GpuRt::PipelineShaderCode codePatch = GpuRt::GetShaderLibraryCode(rayTracingIp, pRtState->gpurtFeatureFlags); VK_ASSERT(codePatch.dxilSize > 0); pRtState->gpurtShaderLibrary.pCode = codePatch.pSpvCode; @@ -4809,94 +4818,6 @@ static VkPipelineCreateFlags2KHR GetCacheIdControlFlags( return in & (~CacheIdIgnoreFlags); } -// ===================================================================================================================== -// The pipeline cache ID contains additional inputs outside 
the shader creation information for pipeline executable -// properties as well as options to avoid user error when changing performance tuning, compiler, or any other settings. -static void GetCommonPipelineCacheId( - uint32_t deviceIdx, - VkPipelineCreateFlags2KHR flags, - const PipelineOptimizerKey* pPipelineProfileKey, - PipelineCompilerType compilerType, - uint64_t pipelineHash, - const Util::MetroHash::Hash& settingsHash, - Util::MetroHash128* pHash) -{ - pHash->Update(pipelineHash); - pHash->Update(deviceIdx); - pHash->Update(GetCacheIdControlFlags(flags)); - pHash->Update(compilerType); - pHash->Update(settingsHash); - pHash->Update(pPipelineProfileKey->shaderCount); - - for (uint32_t shaderIdx = 0; shaderIdx < pPipelineProfileKey->shaderCount; ++shaderIdx) - { - pHash->Update(pPipelineProfileKey->pShaders[shaderIdx]); - } -} - -// ===================================================================================================================== -void PipelineCompiler::GetComputePipelineCacheId( - uint32_t deviceIdx, - ComputePipelineBinaryCreateInfo* pCreateInfo, - uint64_t pipelineHash, - const Util::MetroHash::Hash& settingsHash, - Util::MetroHash::Hash* pCacheId) -{ - Util::MetroHash128 hash = {}; - - GetCommonPipelineCacheId( - deviceIdx, - pCreateInfo->flags, - pCreateInfo->pPipelineProfileKey, - pCreateInfo->compilerType, - pipelineHash, - settingsHash, - &hash); - - hash.Update(pCreateInfo->pipelineInfo.cs.options); - hash.Update(pCreateInfo->pipelineInfo.options); - - hash.Finalize(pCacheId->bytes); -} - -// ===================================================================================================================== -void PipelineCompiler::GetGraphicsPipelineCacheId( - uint32_t deviceIdx, - GraphicsPipelineBinaryCreateInfo* pCreateInfo, - uint64_t pipelineHash, - const Util::MetroHash::Hash& settingsHash, - Util::MetroHash::Hash* pCacheId) -{ - Util::MetroHash128 hash = {}; - - GetCommonPipelineCacheId( - deviceIdx, - pCreateInfo->flags, - 
pCreateInfo->pPipelineProfileKey, - pCreateInfo->compilerType, - pipelineHash, - settingsHash, - &hash); - - hash.Update(pCreateInfo->pipelineInfo.task.options); - hash.Update(pCreateInfo->pipelineInfo.vs.options); - hash.Update(pCreateInfo->pipelineInfo.tes.options); - hash.Update(pCreateInfo->pipelineInfo.tcs.options); - hash.Update(pCreateInfo->pipelineInfo.gs.options); - hash.Update(pCreateInfo->pipelineInfo.mesh.options); - hash.Update(pCreateInfo->pipelineInfo.fs.options); - hash.Update(pCreateInfo->pipelineInfo.options); - hash.Update(pCreateInfo->pipelineInfo.nggState); - hash.Update(pCreateInfo->dbFormat); - hash.Update(pCreateInfo->pipelineInfo.dynamicVertexStride); - hash.Update(pCreateInfo->pipelineInfo.enableUberFetchShader); - hash.Update(pCreateInfo->pipelineInfo.rsState); - - hash.Update(pCreateInfo->pBinaryMetadata->pointSizeUsed); - - hash.Finalize(pCacheId->bytes); -} - // ===================================================================================================================== void PipelineCompiler::GetColorExportShaderCacheId( GraphicsPipelineBinaryCreateInfo* pCreateInfo, @@ -4915,34 +4836,6 @@ void PipelineCompiler::GetColorExportShaderCacheId( hash.Finalize(pCacheId->bytes); } -#if VKI_RAY_TRACING -// ===================================================================================================================== -void PipelineCompiler::GetRayTracingPipelineCacheId( - uint32_t deviceIdx, - uint32_t numDevices, - RayTracingPipelineBinaryCreateInfo* pCreateInfo, - uint64_t pipelineHash, - const Util::MetroHash::Hash& settingsHash, - Util::MetroHash::Hash* pCacheId) -{ - Util::MetroHash128 hash = {}; - - GetCommonPipelineCacheId( - deviceIdx, - pCreateInfo->flags, - pCreateInfo->pPipelineProfileKey, - pCreateInfo->compilerType, - pipelineHash, - settingsHash, - &hash); - - hash.Update(numDevices); - hash.Update(pCreateInfo->pipelineInfo.options); - - hash.Finalize(pCacheId->bytes); -} -#endif - // 
===================================================================================================================== void PipelineCompiler::BuildPipelineInternalBufferData( const PipelineLayout* pPipelineLayout, diff --git a/icd/api/raytrace/ray_tracing_device.cpp b/icd/api/raytrace/ray_tracing_device.cpp index a01de905..3a097064 100644 --- a/icd/api/raytrace/ray_tracing_device.cpp +++ b/icd/api/raytrace/ray_tracing_device.cpp @@ -35,6 +35,7 @@ #include "sqtt/sqtt_rgp_annotations.h" #include "palAutoBuffer.h" #include "palVectorImpl.h" +#include "palArchiveFile.h" #include "gpurt/gpurtLib.h" #include "g_gpurtOptions.h" @@ -70,6 +71,7 @@ RayTracingDevice::~RayTracingDevice() VkResult RayTracingDevice::Init() { VkResult result = VK_SUCCESS; + const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); if (InitAccelStructTracker() != VK_SUCCESS) { @@ -106,7 +108,7 @@ VkResult RayTracingDevice::Init() initInfo.accelStructTrackerGpuAddr = GetAccelStructTrackerGpuVa(deviceIdx); initInfo.deviceSettings.emulatedRtIpLevel = Pal::RayTracingIpLevel::None; - switch (m_pDevice->GetRuntimeSettings().emulatedRtIpLevel) + switch (settings.emulatedRtIpLevel) { case EmulatedRtIpLevelNone: break; @@ -114,11 +116,9 @@ VkResult RayTracingDevice::Init() case EmulatedRtIpLevel1_1: initInfo.deviceSettings.emulatedRtIpLevel = Pal::RayTracingIpLevel::RtIp1_1; break; -#if VKI_BUILD_GFX11 case EmulatedRtIpLevel2_0: initInfo.deviceSettings.emulatedRtIpLevel = Pal::RayTracingIpLevel::RtIp2_0; break; -#endif default: break; } @@ -252,7 +252,6 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( pDeviceSettings->accelerationStructureUUID = GetAccelerationStructureUUID( m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties()); pDeviceSettings->enableMergeSort = settings.enableMergeSort; - pDeviceSettings->fastBuildThreshold = settings.fastBuildThreshold; pDeviceSettings->lbvhBuildThreshold = settings.lbvhBuildThreshold; pDeviceSettings->enableBVHBuildDebugCounters = 
settings.enableBvhBuildDebugCounters; pDeviceSettings->enableInsertBarriersInBuildAS = settings.enableInsertBarriersInBuildAs; @@ -278,6 +277,8 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( pDeviceSettings->enableMergedEncodeBuild = settings.enableMergedEncodeBuild; pDeviceSettings->enableMergedEncodeUpdate = settings.enableMergedEncodeUpdate; + pDeviceSettings->checkBufferOverlapsInBatch = settings.rtCheckBufferOverlapsInBatch; + pDeviceSettings->disableCompaction = settings.rtDisableAccelStructCompaction; } // ===================================================================================================================== @@ -511,7 +512,7 @@ VkResult RayTracingDevice::InitAccelStructTracker() // Ensure the SRD size matches with the size reported by PAL VK_ASSERT(sizeof(pTracker->srd) >= - m_pDevice->VkPhysicalDevice(deviceIdx)->PalProperties().gfxipProperties.srdSizes.bufferView); + m_pDevice->VkPhysicalDevice(deviceIdx)->PalProperties().gfxipProperties.srdSizes.untypedBufferView); pPalDevice->CreateUntypedBufferViewSrds(1, &viewInfo, &pTracker->srd); } @@ -545,7 +546,7 @@ VkResult RayTracingDevice::InitAccelStructTracker() // Create null view if tracking is disabled. memcpy(&m_accelStructTrackerResources[deviceIdx].srd[0], props.gfxipProperties.nullSrds.pNullBufferView, - props.gfxipProperties.srdSizes.bufferView); + props.gfxipProperties.srdSizes.untypedBufferView); } } @@ -684,9 +685,7 @@ void RayTracingDevice::SetDispatchInfo( dispatchInfo.stateObjectHash = apiHash; dispatchInfo.boxSortMode = settings.boxSortingHeuristic; -#if VKI_BUILD_GFX11 dispatchInfo.usesNodePtrFlags = settings.rtEnableNodePointerFlags ? 
1 : 0; -#endif if (pipelineType == GpuRt::RtPipelineType::RayTracing) { @@ -821,10 +820,19 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( const void* pPipelineBinary = nullptr; size_t pipelineBinarySize = 0; + Vkgc::BinaryData spvBin = + { + .codeSize = buildInfo.code.spvSize, + .pCode = buildInfo.code.pSpvCode + }; + Vkgc::ResourceMappingRootNode nodes[GpuRt::MaxInternalPipelineNodes] = {}; Vkgc::ResourceMappingNode subNodes[GpuRt::MaxInternalPipelineNodes] = {}; uint32_t subNodeIndex = 0; - const uint32_t bufferSrdSizeDw = pDevice->GetProperties().descriptorSizes.bufferView / sizeof(uint32_t); + const uint32_t typedBufferSrdSizeDw = + pDevice->GetProperties().descriptorSizes.typedBufferView / sizeof(uint32_t); + const uint32_t untypedBufferSrdSizeDw = + pDevice->GetProperties().descriptorSizes.untypedBufferView / sizeof(uint32_t); for (uint32_t nodeIndex = 0; nodeIndex < buildInfo.nodeCount; ++nodeIndex) { @@ -863,8 +871,17 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( } else if (node.type == GpuRt::NodeType::Srv) { - nodes[nodeIndex].node.type = - Vkgc::ResourceMappingNodeType::DescriptorResource; + nodes[nodeIndex].node.type = Vkgc::ResourceMappingNodeType::DescriptorResource; + + if (node.srdStride == 2) + { + nodes[nodeIndex].node.type = Vkgc::ResourceMappingNodeType::DescriptorBufferCompact; + } + else if (node.srdStride == 4) + { + nodes[nodeIndex].node.type = Vkgc::ResourceMappingNodeType::DescriptorBuffer; + } + nodes[nodeIndex].node.sizeInDwords = node.dwSize; nodes[nodeIndex].node.offsetInDwords = node.dwOffset; nodes[nodeIndex].node.srdRange.set = node.descSet; @@ -888,22 +905,28 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( { case GpuRt::NodeType::UavTable: pSubNode->type = Vkgc::ResourceMappingNodeType::DescriptorBuffer; + pSubNode->sizeInDwords = untypedBufferSrdSizeDw; break; case GpuRt::NodeType::TypedUavTable: pSubNode->type = 
Vkgc::ResourceMappingNodeType::DescriptorTexelBuffer; + pSubNode->sizeInDwords = typedBufferSrdSizeDw; break; case GpuRt::NodeType::ConstantBufferTable: pSubNode->type = Vkgc::ResourceMappingNodeType::DescriptorConstBuffer; + pSubNode->sizeInDwords = untypedBufferSrdSizeDw; break; case GpuRt::NodeType::SrvTable: + pSubNode->type = Vkgc::ResourceMappingNodeType::DescriptorResource; + pSubNode->sizeInDwords = untypedBufferSrdSizeDw; + break; case GpuRt::NodeType::TypedSrvTable: pSubNode->type = Vkgc::ResourceMappingNodeType::DescriptorResource; + pSubNode->sizeInDwords = typedBufferSrdSizeDw; break; default: VK_NEVER_CALLED(); } pSubNode->offsetInDwords = 0; - pSubNode->sizeInDwords = bufferSrdSizeDw; pSubNode->srdRange.set = node.descSet; pSubNode->srdRange.binding = node.binding; } @@ -934,8 +957,6 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( compileConstants.pConstants }; - Vkgc::BinaryData spvBin = { buildInfo.code.spvSize, buildInfo.code.pSpvCode }; - bool forceWave64 = false; // Overide wave size for these GpuRT shader types @@ -947,11 +968,11 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( forceWave64 = true; } - result = pDevice->CreateInternalComputePipeline(buildInfo.code.spvSize, - static_cast(buildInfo.code.pSpvCode), + result = pDevice->CreateInternalComputePipeline(spvBin.codeSize, + static_cast(spvBin.pCode), buildInfo.nodeCount, nodes, - VK_INTERNAL_SHADER_FLAGS_RAY_TRACING_INTERNAL_SHADER_BIT, + ShaderModuleInternalRayTracingShader, forceWave64, &specializationInfo, &pDevice->GetInternalRayTracingPipeline()); diff --git a/icd/api/raytrace/ray_tracing_device.h b/icd/api/raytrace/ray_tracing_device.h index 246489eb..75789250 100644 --- a/icd/api/raytrace/ray_tracing_device.h +++ b/icd/api/raytrace/ray_tracing_device.h @@ -132,16 +132,6 @@ class RayTracingDevice void* pConstants); private: - Device* m_pDevice; - - GpuRt::IDevice* m_pGpuRtDevice[MaxPalDevices]; - GpuRt::DeviceSettings m_gpurtDeviceSettings; 
- GpurtOptions m_gpurtOptions; - - uint32_t m_profileRayFlags; // Ray flag override for profiling - uint32_t m_profileMaxIterations; // Max traversal iterations - - CmdContext m_cmdContext[MaxPalDevices]; // GPURT Callback Functions static Pal::Result ClientAllocateGpuMemory( @@ -211,6 +201,17 @@ class RayTracingDevice void CollectGpurtOptions(GpurtOptions* const pGpurtOptions) const; + Device* m_pDevice; + + GpuRt::IDevice* m_pGpuRtDevice[MaxPalDevices]; + GpuRt::DeviceSettings m_gpurtDeviceSettings; + GpurtOptions m_gpurtOptions; + + uint32_t m_profileRayFlags; // Ray flag override for profiling + uint32_t m_profileMaxIterations; // Max traversal iterations + + CmdContext m_cmdContext[MaxPalDevices]; + BvhBatchLayer* m_pBvhBatchLayer; SplitRaytracingLayer* m_pSplitRaytracingLayer; diff --git a/icd/api/raytrace/vk_acceleration_structure.cpp b/icd/api/raytrace/vk_acceleration_structure.cpp index 3295d5c7..bfeb83d4 100644 --- a/icd/api/raytrace/vk_acceleration_structure.cpp +++ b/icd/api/raytrace/vk_acceleration_structure.cpp @@ -556,6 +556,24 @@ GpuRt::Geometry AccelerationStructure::ClientConvertAccelStructBuildGeometryKHR( case VK_FORMAT_R16G16_SNORM: pTriangles->vertexFormat = GpuRt::VertexFormat::R16G16_Snorm; break; + case VK_FORMAT_R16G16_UNORM: + pTriangles->vertexFormat = GpuRt::VertexFormat::R16G16_Unorm; + break; + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + pTriangles->vertexFormat = GpuRt::VertexFormat::R10G10B10A2_Unorm; + break; + case VK_FORMAT_R8G8B8A8_UNORM: + pTriangles->vertexFormat = GpuRt::VertexFormat::R8G8B8A8_Unorm; + break; + case VK_FORMAT_R8G8B8A8_SNORM: + pTriangles->vertexFormat = GpuRt::VertexFormat::R8G8B8A8_Snorm; + break; + case VK_FORMAT_R8G8_UNORM: + pTriangles->vertexFormat = GpuRt::VertexFormat::R8G8_Unorm; + break; + case VK_FORMAT_R8G8_SNORM: + pTriangles->vertexFormat = GpuRt::VertexFormat::R8G8_Snorm; + break; default: VK_NEVER_CALLED(); pTriangles->vertexFormat = GpuRt::VertexFormat::R32G32B32_Float; diff --git 
a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp index d2a65adb..69e2db0a 100644 --- a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp +++ b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp @@ -30,6 +30,7 @@ #include "include/vk_shader.h" #include "include/vk_device.h" #include "include/vk_instance.h" +#include "include/vk_pipeline_binary.h" #include "include/vk_pipeline_cache.h" #include "include/vk_pipeline_layout.h" #include "include/vk_memory.h" @@ -330,6 +331,7 @@ void RayTracingPipeline::Init( uint32_t shaderLibraryCount, Pal::IShaderLibrary** ppPalShaderLibrary, const PipelineLayout* pPipelineLayout, + PipelineBinaryStorage* pBinaryStorage, const ShaderOptimizerKey* pShaderOptKeys, const ImmedInfo& immedInfo, uint64_t staticStateMask, @@ -350,6 +352,7 @@ void RayTracingPipeline::Init( Pipeline::Init( ppPalPipeline, pPipelineLayout, + pBinaryStorage, staticStateMask, dispatchRaysUserDataOffset, cacheHash, @@ -482,6 +485,29 @@ VkResult RayTracingPipeline::CreateImpl( PipelineCompiler::InitPipelineCreationFeedback(pPipelineCreationFeedbackCreateInfo); bool binariesProvided = false; Util::MetroHash::Hash cacheId[MaxPalDevices] = {}; + Vkgc::BinaryData providedBinaries[MaxPalDevices] = {}; + + auto pPipelineBinaryInfoKHR = extStructs.pPipelineBinaryInfoKHR; + + if (pPipelineBinaryInfoKHR != nullptr) + { + if (pPipelineBinaryInfoKHR->binaryCount > 0) + { + VK_ASSERT(pPipelineBinaryInfoKHR->binaryCount == m_pDevice->NumPalDevices()); + binariesProvided = true; + } + + for (uint32_t binaryIndex = 0; + (binaryIndex < pPipelineBinaryInfoKHR->binaryCount) && (result == VK_SUCCESS); + ++binaryIndex) + { + const auto pBinary = PipelineBinary::ObjectFromHandle( + pPipelineBinaryInfoKHR->pPipelineBinaries[binaryIndex]); + + cacheId[binaryIndex] = pBinary->BinaryKey(); + providedBinaries[binaryIndex] = pBinary->BinaryData(); + } + } RayTracingPipelineShaderStageInfo shaderInfo = {}; PipelineOptimizerKey optimizerKey = {}; @@ -693,6 
+719,10 @@ VkResult RayTracingPipeline::CreateImpl( bool storeBinaryToPipeline = false; bool storeBinaryToCache = true; + PipelineBinaryStorage binaryStorage = {}; + + storeBinaryToPipeline = (flags & VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR) != 0; + storeBinaryToCache = (flags & VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR) == 0; for (uint32_t deviceIdx = 0; (result == VK_SUCCESS) && (deviceIdx < m_pDevice->NumPalDevices()); deviceIdx++) { @@ -732,10 +762,40 @@ VkResult RayTracingPipeline::CreateImpl( isInternalCacheHit); } + if (storeBinaryToPipeline) + { + // Store single packed blob of binaries from cache instead of separate binaries. + void* pMemory = pAllocator->pfnAllocation( + pAllocator->pUserData, + cachedBinData.codeSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); // retained in the pipeline object + + if (pMemory != nullptr) + { + memcpy( + pMemory, + cachedBinData.pCode, + cachedBinData.codeSize); + + InsertBinaryData( + &binaryStorage, + deviceIdx, + cacheId[deviceIdx], + cachedBinData.codeSize, + pMemory); + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } } } else { + cachedBinData = providedBinaries[deviceIdx]; + cacheResult = Util::Result::Success; } if (cacheResult == Util::Result::Success) @@ -836,6 +896,35 @@ VkResult RayTracingPipeline::CreateImpl( isInternalCacheHit); } + if (storeBinaryToPipeline) + { + // Store compiled binaries packed into a single blob instead of separately. 
+ void* pMemory = pAllocator->pfnAllocation( + pAllocator->pUserData, + cachedBinData.codeSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); // retained in the pipeline object + + if (pMemory != nullptr) + { + memcpy( + pMemory, + cachedBinData.pCode, + cachedBinData.codeSize); + + InsertBinaryData( + &binaryStorage, + deviceIdx, + cacheId[deviceIdx], + cachedBinData.codeSize, + pMemory); + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + else { m_pDevice->VkInstance()->FreeMem(const_cast(cachedBinData.pCode)); } @@ -925,6 +1014,7 @@ VkResult RayTracingPipeline::CreateImpl( // Create the PAL pipeline object. Pal::IShaderLibrary** ppShaderLibraries = nullptr; ShaderGroupInfo* pShaderGroupInfos = nullptr; + PipelineBinaryStorage* pPermBinaryStorage = nullptr; Pal::IPipeline* pPalPipeline [MaxPalDevices] = {}; ShaderGroupStackSizes* pShaderGroupStackSizes[MaxPalDevices] = {}; gpusize traceRayGpuVas [MaxPalDevices] = {}; @@ -934,6 +1024,7 @@ VkResult RayTracingPipeline::CreateImpl( size_t shaderLibraryPalMemSize = 0; size_t shaderGroupStackSizesMemSize = 0; size_t shaderGroupInfosMemSize = 0; + size_t binaryStorageSize = 0; const size_t shaderOptKeysSize = optimizerKey.shaderCount * sizeof(ShaderOptimizerKey); @@ -966,6 +1057,7 @@ VkResult RayTracingPipeline::CreateImpl( shaderGroupInfosMemSize = sizeof(ShaderGroupInfo) * totalGroupCount; shaderGroupStackSizesMemSize = (((funcCount > 0) || hasLibraries) ? 1 : 0) * sizeof(ShaderGroupStackSizes) * totalGroupCount * m_pDevice->NumPalDevices(); + binaryStorageSize = (storeBinaryToPipeline ? 
1 : 0 ) * sizeof(PipelineBinaryStorage); const size_t totalSize = pipelineMemSize + @@ -973,6 +1065,7 @@ VkResult RayTracingPipeline::CreateImpl( shaderLibraryPalMemSize + shaderGroupStackSizesMemSize + shaderGroupInfosMemSize + + binaryStorageSize + shaderOptKeysSize; pSystemMem = pAllocator->pfnAllocation( @@ -1007,6 +1100,15 @@ VkResult RayTracingPipeline::CreateImpl( PopulateShaderGroupInfos(pCreateInfo, pShaderGroupInfos, totalGroupCount); + if (storeBinaryToPipeline) + { + pPermBinaryStorage = static_cast( + Util::VoidPtrInc(pShaderGroupsStackSizesMem, shaderGroupStackSizesMemSize)); + + // Simply copy the existing allocations to the new struct. + *pPermBinaryStorage = binaryStorage; + } + // Transfer shader optimizer keys to permanent storage. memcpy(pShaderOptKeys, optimizerKey.pShaders, shaderOptKeysSize); optimizerKey.pShaders = static_cast(pShaderOptKeys); @@ -1526,6 +1628,7 @@ VkResult RayTracingPipeline::CreateImpl( funcCount * m_pDevice->NumPalDevices(), ppShaderLibraries, localPipelineInfo.pLayout, + pPermBinaryStorage, optimizerKey.pShaders, localPipelineInfo.immedInfo, localPipelineInfo.staticStateMask, @@ -1542,7 +1645,7 @@ VkResult RayTracingPipeline::CreateImpl( cacheId[DefaultDeviceIndex], apiPsoHash, elfHash); - if (settings.enableDebugPrintf) + if (m_pDevice->GetEnabledFeatures().enableDebugPrintf) { ClearFormatString(); for (uint32_t i = 0; i < pipelineBinaries[DefaultDeviceIndex].pipelineBinCount; ++i) @@ -1557,6 +1660,8 @@ VkResult RayTracingPipeline::CreateImpl( } else { + // Free the binaries only if we failed to create the pipeline. 
+ FreeBinaryStorage(&binaryStorage, pAllocator); for (uint32_t deviceIdx = 0; deviceIdx < m_pDevice->NumPalDevices(); deviceIdx++) { @@ -2262,15 +2367,10 @@ uint32_t RayTracingPipeline::UpdateShaderGroupIndex( } // ===================================================================================================================== -void RayTracingPipeline::GetDispatchSize( - uint32_t* pDispatchSizeX, - uint32_t* pDispatchSizeY, - uint32_t* pDispatchSizeZ, - uint32_t width, - uint32_t height, - uint32_t depth) const +Pal::DispatchDims RayTracingPipeline::GetDispatchSize( + Pal::DispatchDims size) const { - VK_ASSERT((pDispatchSizeX != nullptr) && (pDispatchSizeY != nullptr) && (pDispatchSizeZ != nullptr)); + Pal::DispatchDims dispatchSize = {}; const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); @@ -2280,31 +2380,33 @@ void RayTracingPipeline::GetDispatchSize( if (flattenThreadGroupSize == 0) { - *pDispatchSizeX = Util::RoundUpQuotient(width, settings.rtThreadGroupSizeX); - *pDispatchSizeY = Util::RoundUpQuotient(height, settings.rtThreadGroupSizeY); - *pDispatchSizeZ = Util::RoundUpQuotient(depth, settings.rtThreadGroupSizeZ); + dispatchSize.x = Util::RoundUpQuotient(size.x, settings.rtThreadGroupSizeX); + dispatchSize.y = Util::RoundUpQuotient(size.y, settings.rtThreadGroupSizeY); + dispatchSize.z = Util::RoundUpQuotient(size.z, settings.rtThreadGroupSizeZ); } else { - uint32_t dispatchSize = 0; + uint32_t x = 0; - if ((width > 1) && (height > 1)) + if ((size.x > 1) && (size.y > 1)) { const uint32_t tileHeight = flattenThreadGroupSize / RayTracingTileWidth; - const uint32_t paddedWidth = Util::Pow2Align(width, RayTracingTileWidth); - const uint32_t paddedHeight = Util::Pow2Align(height, tileHeight); + const uint32_t paddedWidth = Util::Pow2Align(size.x, RayTracingTileWidth); + const uint32_t paddedHeight = Util::Pow2Align(size.y, tileHeight); - dispatchSize = Util::RoundUpQuotient(paddedWidth * paddedHeight, flattenThreadGroupSize); + x = 
Util::RoundUpQuotient(paddedWidth * paddedHeight, flattenThreadGroupSize); } else { - dispatchSize = Util::RoundUpQuotient(width * height, flattenThreadGroupSize); + x = Util::RoundUpQuotient(size.x * size.y, flattenThreadGroupSize); } - *pDispatchSizeX = dispatchSize; - *pDispatchSizeY = depth; - *pDispatchSizeZ = 1; + dispatchSize.x = x; + dispatchSize.y = size.z; + dispatchSize.z = 1; } + + return dispatchSize; } // ===================================================================================================================== diff --git a/icd/api/raytrace/vk_ray_tracing_pipeline.h b/icd/api/raytrace/vk_ray_tracing_pipeline.h index 394c7f68..19a4c5e7 100644 --- a/icd/api/raytrace/vk_ray_tracing_pipeline.h +++ b/icd/api/raytrace/vk_ray_tracing_pipeline.h @@ -30,6 +30,7 @@ #include "include/vk_pipeline.h" #include "include/internal_mem_mgr.h" +#include "include/vk_pipeline_binary.h" #include "palPipeline.h" #include "palVector.h" @@ -256,12 +257,7 @@ class RayTracingPipeline final : public Pipeline, public NonDispatchablesrcStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_RESOLVE_BIT_KHR; - pBarrier->implicitSrcCacheMask |= pBarrier->flags.preColorResolveSync ? Pal::CoherColorTarget : - Pal::CoherDepthStencilTarget; - pBarrier->implicitDstCacheMask |= Pal::CoherResolveDst; + // It's possible that color and DS are both included in a single barrier for resolves. As a result of that we + // cannot rely on preColorResolveSync and preDsResolveSync to determine the cache mask here. Instead, include + // both here and then in RPSyncPoint we use excludeAccessMask to filter out unnecessary mask. 
+ pBarrier->implicitSrcCacheMask |= Pal::CoherColorTarget | Pal::CoherDepthStencilTarget; + + // Ideally, we should specify CoherResolveSrc here but since this preResolveSync barrier can specify multiple + // image transitions it's possible that different images in this barrier are used as src and dst for the + // resolve operation. Thus, it's better to specify just CoherResolve here. + pBarrier->implicitDstCacheMask |= Pal::CoherResolve; } // Wait for (non-auto-synced) pre-clear if necessary. No need to augment the pipe point because the prior work falls @@ -1009,13 +1015,14 @@ static void ConvertImplicitSyncs( // Augment the active source pipeline stages for resolves if we need to wait for prior resolves to complete if (pBarrier->flags.postResolveSync) { - // TopOfPipe causes a stall at PFP which is not really needed for images. As an optimization for Acq-Rel - // barriers we instead set dstStage to Blt here. + // Wait until the prior resolves complete pBarrier->srcStageMask |= VK_PIPELINE_STAGE_2_RESOLVE_BIT_KHR; - pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_BLIT_BIT_KHR; + pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT_KHR | + VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT_KHR | + VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT_KHR; - pBarrier->implicitSrcCacheMask |= Pal::CoherResolveSrc; - pBarrier->implicitDstCacheMask |= Pal::CoherResolveDst; + pBarrier->implicitSrcCacheMask |= Pal::CoherResolve; + pBarrier->implicitDstCacheMask |= Pal::CoherColorTarget | Pal::CoherDepthStencilTarget; } if (pBarrier->flags.implicitExternalOutgoing && settings.implicitExternalSynchronization) diff --git a/icd/api/renderpass/renderpass_types.h b/icd/api/renderpass/renderpass_types.h index 6c7d4a92..ac6859d7 100644 --- a/icd/api/renderpass/renderpass_types.h +++ b/icd/api/renderpass/renderpass_types.h @@ -173,8 +173,7 @@ union SubpassStateFlags uint32_t hasExternalIncoming : 1; // True if an explicit VkSubpassDependency exists with src = // 
VK_SUBPASS_EXTERNAL and dst = this. uint32_t hasExternalOutgoing : 1; // Same as above, but src and dst reversed. - uint32_t reserved1 : 2; - uint32_t reserved : 26; + uint32_t reserved : 28; }; uint32_t u32All; }; diff --git a/icd/api/sqtt/sqtt_layer.cpp b/icd/api/sqtt/sqtt_layer.cpp index b9c020ba..ad814674 100644 --- a/icd/api/sqtt/sqtt_layer.cpp +++ b/icd/api/sqtt/sqtt_layer.cpp @@ -1142,12 +1142,12 @@ void SqttCmdBufferState::DebugMarkerInsert( void SqttCmdBufferState::DebugLabelBegin( const VkDebugUtilsLabelEXT* pMarkerInfo) { - DevUserMarkerString userMarkerString; + DevUserMarkerString userMarkerString = {}; userMarkerString.length = static_cast(strlen(pMarkerInfo->pLabelName)) + 1; Util::Strncpy(userMarkerString.string, pMarkerInfo->pLabelName, sizeof(userMarkerString.string)); m_userMarkerStrings.PushBack(userMarkerString); - Pal::Developer::UserMarkerOpInfo opInfo; + Pal::Developer::UserMarkerOpInfo opInfo = {}; opInfo.opType = static_cast(Pal::Developer::UserMarkerOpType::Push); opInfo.strIndex = static_cast(m_userMarkerStrings.size()); m_userMarkerOpHistory.PushBack(opInfo.u32All); @@ -1158,7 +1158,7 @@ void SqttCmdBufferState::DebugLabelBegin( // ===================================================================================================================== void SqttCmdBufferState::DebugLabelEnd() { - Pal::Developer::UserMarkerOpInfo opInfo; + Pal::Developer::UserMarkerOpInfo opInfo = {}; opInfo.opType = static_cast(Pal::Developer::UserMarkerOpType::Pop); m_userMarkerOpHistory.PushBack(opInfo.u32All); @@ -1477,6 +1477,50 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexedIndirectCountKHR( pSqtt->EndEntryPoint(); } +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndirectCount( + VkCommandBuffer cmdBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkBuffer countBuffer, + VkDeviceSize countOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ 
+ SQTT_SETUP(); + + pSqtt->BeginEntryPoint(RgpSqttMarkerGeneralApiType::CmdDrawIndirectCount); + pSqtt->BeginEventMarkers(RgpSqttMarkerEventType::CmdDrawIndirectCount); + + SQTT_CALL_NEXT_LAYER(vkCmdDrawIndirectCount)(cmdBuffer, buffer, offset, countBuffer, countOffset, maxDrawCount, + stride); + + pSqtt->EndEventMarkers(); + pSqtt->EndEntryPoint(); +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexedIndirectCount( + VkCommandBuffer cmdBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkBuffer countBuffer, + VkDeviceSize countOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + SQTT_SETUP(); + + pSqtt->BeginEntryPoint(RgpSqttMarkerGeneralApiType::CmdDrawIndexedIndirectCount); + pSqtt->BeginEventMarkers(RgpSqttMarkerEventType::CmdDrawIndexedIndirectCount); + + SQTT_CALL_NEXT_LAYER(vkCmdDrawIndexedIndirectCount)(cmdBuffer, buffer, offset, countBuffer, countOffset, + maxDrawCount, stride); + + pSqtt->EndEventMarkers(); + pSqtt->EndEntryPoint(); +} + // ===================================================================================================================== VKAPI_ATTR void VKAPI_CALL vkCmdDrawMeshTasksEXT( VkCommandBuffer cmdBuffer, @@ -2924,6 +2968,8 @@ void SqttOverrideDispatchTable( SQTT_OVERRIDE_ENTRY(vkCmdDrawIndexedIndirectCountAMD); SQTT_OVERRIDE_ENTRY(vkCmdDrawIndirectCountKHR); SQTT_OVERRIDE_ENTRY(vkCmdDrawIndexedIndirectCountKHR); + SQTT_OVERRIDE_ENTRY(vkCmdDrawIndirectCount); + SQTT_OVERRIDE_ENTRY(vkCmdDrawIndexedIndirectCount); SQTT_OVERRIDE_ENTRY(vkCmdDrawMeshTasksEXT); SQTT_OVERRIDE_ENTRY(vkCmdDrawMeshTasksIndirectCountEXT); SQTT_OVERRIDE_ENTRY(vkCmdDrawMeshTasksIndirectEXT); diff --git a/icd/api/sqtt/sqtt_rgp_annotations.h b/icd/api/sqtt/sqtt_rgp_annotations.h index dcc1d146..0c3cf3a2 100644 --- a/icd/api/sqtt/sqtt_rgp_annotations.h +++ b/icd/api/sqtt/sqtt_rgp_annotations.h @@ -217,6 +217,8 @@ enum class 
RgpSqttMarkerEventType : uint32_t CmdDrawMeshTasksEXT = 41, // vkCmdDrawMeshTasksEXT CmdDrawMeshTasksIndirectCountEXT = 42, // vkCmdDrawMeshTasksIndirectCountEXT CmdDrawMeshTasksIndirectEXT = 43, // vkCmdDrawMeshTasksIndirectEXT + CmdDrawIndirectCount = 44, // vkCmdDrawIndirectCount + CmdDrawIndexedIndirectCount = 45, // vkCmdDrawIndexedIndirectCount #if VKI_RAY_TRACING ShaderIndirectModeMask = 0x800000, // Used to mark whether the shader is compiled in indirect mode or not // This mask can only be used with CmdTraceRaysKHR and CmdTraceRaysIndirectKHR @@ -509,6 +511,8 @@ enum class RgpSqttMarkerGeneralApiType : uint32_t CmdDrawMeshTasksEXT = 47, CmdDrawMeshTasksIndirectCountEXT = 48, CmdDrawMeshTasksIndirectEXT = 49, + CmdDrawIndirectCount = 50, + CmdDrawIndexedIndirectCount = 51, Invalid = 0xffffffff }; diff --git a/icd/api/strings/entry_points.txt b/icd/api/strings/entry_points.txt index 92260482..75362511 100644 --- a/icd/api/strings/entry_points.txt +++ b/icd/api/strings/entry_points.txt @@ -583,5 +583,11 @@ vkCmdBindDescriptorBufferEmbeddedSamplers2EXT @device @dext(KHR_main vkCmdSetRenderingAttachmentLocationsKHR @device @dext(KHR_dynamic_rendering_local_read) vkCmdSetRenderingInputAttachmentIndicesKHR @device @dext(KHR_dynamic_rendering_local_read) +vkCreatePipelineBinariesKHR @device @dext(KHR_pipeline_binary) +vkDestroyPipelineBinaryKHR @device @dext(KHR_pipeline_binary) +vkGetPipelineKeyKHR @device @dext(KHR_pipeline_binary) +vkGetPipelineBinaryDataKHR @device @dext(KHR_pipeline_binary) +vkReleaseCapturedPipelineDataKHR @device @dext(KHR_pipeline_binary) + vkCmdSetDepthBias2EXT @device @dext(EXT_depth_bias_control) diff --git a/icd/api/strings/extensions.txt b/icd/api/strings/extensions.txt index ef9fdbe4..ed839be3 100644 --- a/icd/api/strings/extensions.txt +++ b/icd/api/strings/extensions.txt @@ -196,6 +196,7 @@ VK_EXT_image_2d_view_of_3d VK_EXT_depth_clamp_zero_one VK_EXT_primitives_generated_query VK_EXT_non_seamless_cube_map +VK_KHR_pipeline_binary 
VK_EXT_image_sliced_view_of_3d VK_KHR_shader_maximal_reconvergence VK_EXT_shader_module_identifier diff --git a/icd/api/vk_buffer.cpp b/icd/api/vk_buffer.cpp index 0d35fa27..5c314096 100644 --- a/icd/api/vk_buffer.cpp +++ b/icd/api/vk_buffer.cpp @@ -127,7 +127,16 @@ VkResult Buffer::Create( VK_ASSERT(pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetPrtFeatures() & Pal::PrtFeatureBuffer); } - if ((pCreateInfo->flags & VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT) != 0) + // Use the descriptor table VA range for descriptor buffers because we need to program descriptors + // with a single (32-bit) user data entry and there is no such guarentee with the default VA range. + if ((Device::GetBufferUsageFlagBits(pCreateInfo) & + (VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_PUSH_DESCRIPTORS_DESCRIPTOR_BUFFER_BIT_EXT)) != 0) + { + gpuMemoryCreateInfo.vaRange = Pal::VaRange::DescriptorTable; + } + else if ((pCreateInfo->flags & VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT) != 0) { gpuMemoryCreateInfo.vaRange = Pal::VaRange::CaptureReplay; } diff --git a/icd/api/vk_buffer_view.cpp b/icd/api/vk_buffer_view.cpp index f1196a96..ab2e2f62 100644 --- a/icd/api/vk_buffer_view.cpp +++ b/icd/api/vk_buffer_view.cpp @@ -46,9 +46,11 @@ VkResult BufferView::Create( // Allocate memory for the buffer view const size_t apiSize = sizeof(BufferView); - const size_t bufferSrdSize = - pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties().gfxipProperties.srdSizes.bufferView; - size_t srdSize = bufferSrdSize; + const size_t srdSize = (pCreateInfo->format == VK_FORMAT_UNDEFINED) ? 
+ pDevice->VkPhysicalDevice(DefaultDeviceIndex)-> + PalProperties().gfxipProperties.srdSizes.untypedBufferView : + pDevice->VkPhysicalDevice(DefaultDeviceIndex)-> + PalProperties().gfxipProperties.srdSizes.typedBufferView; const size_t objSize = apiSize + (srdSize * pDevice->NumPalDevices()); @@ -87,7 +89,6 @@ VkResult BufferView::Create( bufferAddress, pCreateInfo->format, pDevice->NumPalDevices(), - srdSize, pSrdMemory); VK_PLACEMENT_NEW(pMemory) BufferView(pDevice, static_cast(srdSize), pSrdMemory); @@ -105,7 +106,6 @@ void BufferView::BuildSrd( const Pal::gpusize* bufferAddress, const VkFormat format, const uint32_t deviceNum, - const size_t srdSize, void* pSrdMemory) { // Build the SRD @@ -129,19 +129,20 @@ void BufferView::BuildSrd( if (format != VK_FORMAT_UNDEFINED) { + const uint32_t srdSize = + pDevice->VkPhysicalDevice(deviceIdx)->PalProperties().gfxipProperties.srdSizes.typedBufferView; pDevice->PalDevice(deviceIdx)->CreateTypedBufferViewSrds( 1, &info, Util::VoidPtrInc(pSrdMemory, srdSize * deviceIdx)); } else { - info.stride = 0; // Raw buffers have a zero byte stride + const uint32_t srdSize = + pDevice->VkPhysicalDevice(deviceIdx)->PalProperties().gfxipProperties.srdSizes.untypedBufferView; + info.stride = 0; // Raw buffers have a zero byte stride pDevice->PalDevice(deviceIdx)->CreateUntypedBufferViewSrds( 1, &info, Util::VoidPtrInc(pSrdMemory, srdSize * deviceIdx)); } - - VK_ASSERT(srdSize >= - pDevice->VkPhysicalDevice(deviceIdx)->PalProperties().gfxipProperties.srdSizes.bufferView); } } diff --git a/icd/api/vk_cmdbuffer.cpp b/icd/api/vk_cmdbuffer.cpp index 869c08a9..c4b2c71e 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp @@ -620,7 +620,7 @@ CmdBuffer::CmdBuffer( m_optimizeCmdbufMode = settings.optimizeCmdbufMode; m_asyncComputeQueueMaxWavesPerCu = settings.asyncComputeQueueMaxWavesPerCu; -#if VK_ENABLE_DEBUG_BARRIERS +#if VKI_ENABLE_DEBUG_BARRIERS m_dbgBarrierPreCmdMask = settings.dbgBarrierPreCmdEnable; 
m_dbgBarrierPostCmdMask = settings.dbgBarrierPostCmdEnable; #endif @@ -1417,17 +1417,7 @@ VkResult CmdBuffer::Begin( VK_ASSERT(m_flags.is2ndLvl); pInheritanceRenderingInfo = static_cast(pNext); - - inheritedStateParams.colorTargetCount = pInheritanceRenderingInfo->colorAttachmentCount; inheritedStateParams.stateFlags.targetViewState = 1; - - for (uint32_t i = 0; i < inheritedStateParams.colorTargetCount; i++) - { - inheritedStateParams.colorTargetSwizzledFormats[i] = - VkToPalFormat(pInheritanceRenderingInfo->pColorAttachmentFormats[i], settings); - - inheritedStateParams.sampleCount[i] = pInheritanceRenderingInfo->rasterizationSamples; - } } else if (pHeader->sType == VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR) { @@ -1473,15 +1463,7 @@ VkResult CmdBuffer::Begin( { VK_ASSERT(m_flags.is2ndLvl); - inheritedStateParams.colorTargetCount = pRenderPass->GetSubpassColorReferenceCount(currentSubPass); inheritedStateParams.stateFlags.targetViewState = 1; - - for (uint32_t i = 0; i < inheritedStateParams.colorTargetCount; i++) - { - inheritedStateParams.colorTargetSwizzledFormats[i] = - VkToPalFormat(pRenderPass->GetColorAttachmentFormat(currentSubPass, i), settings); - inheritedStateParams.sampleCount[i] = pRenderPass->GetColorAttachmentSamples(currentSubPass, i); - } } Pal::Result result = PalCmdBufferBegin(cmdInfo); @@ -2805,7 +2787,8 @@ PFN_vkCmdBindDescriptorSets CmdBuffer::GetCmdBindDescriptorSetsFunc( // ===================================================================================================================== template VKAPI_ATTR void VKAPI_CALL CmdBuffer::CmdPushDescriptorSetKHR( VkCommandBuffer commandBuffer, @@ -2817,7 +2800,8 @@ VKAPI_ATTR void VKAPI_CALL CmdBuffer::CmdPushDescriptorSetKHR( { CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); - pCmdBuffer->PushDescriptorSetKHR( + pCmdBuffer->PushDescriptorSetKHR + ( pipelineBindPoint, layout, set, @@ -2830,19 +2814,34 @@ template PFN_vkCmdPushDescriptorSetKHR 
CmdBuffer::GetCmdPushDescriptorSetKHRFunc( const Device* pDevice) { - const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; - const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; - const size_t bufferDescSize = pDevice->GetProperties().descriptorSizes.bufferView; + const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; + const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; + const size_t typedBufferDescSize = pDevice->GetProperties().descriptorSizes.typedBufferView; + const size_t untypedBufferDescSize = pDevice->GetProperties().descriptorSizes.untypedBufferView; PFN_vkCmdPushDescriptorSetKHR pFunc = nullptr; - if ((imageDescSize == 32) && - (samplerDescSize == 16) && - (bufferDescSize == 16)) + if ((imageDescSize == 32) && + (samplerDescSize == 16) && + (typedBufferDescSize == 16) && + (untypedBufferDescSize == 16)) + { + pFunc = &CmdPushDescriptorSetKHR< + 32, + 16, + 16, + 16, + numPalDevices>; + } + else if ((imageDescSize == 32) && + (samplerDescSize == 16) && + (typedBufferDescSize == 24) && + (untypedBufferDescSize == 16)) { pFunc = &CmdPushDescriptorSetKHR< 32, 16, + 24, 16, numPalDevices>; } @@ -4659,7 +4658,6 @@ void CmdBuffer::SetEvent2( { ExecuteAcquireRelease(1, &event, - 1, pDependencyInfo, Release, RgpBarrierExternalCmdWaitEvents); @@ -4725,6 +4723,20 @@ void CmdBuffer::LoadOpClearColor( const Pal::Rect* pDeviceGroupRenderArea, const VkRenderingInfo* pRenderingInfo) { + if (m_pSqttState != nullptr) + { + m_pSqttState->BeginRenderPassColorClear(); + } + + const ImageView* pImageViews[Pal::MaxColorTargets] = {}; + Pal::ClearColor clearColors[Pal::MaxColorTargets] = {}; + Pal::ImageLayout imageLayouts[Pal::MaxColorTargets] = {}; + Pal::SubresRange ranges[Pal::MaxColorTargets] = {}; + Pal::SwizzledFormat clearFormats[Pal::MaxColorTargets] = {}; + + uint32_t clearCount = 0; + + // Collect information on the number of clears to decide if we need to 
batch. for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; ++i) { const VkRenderingAttachmentInfo& attachmentInfo = pRenderingInfo->pColorAttachments[i]; @@ -4733,63 +4745,80 @@ void CmdBuffer::LoadOpClearColor( { // Get the image view from the attachment info const ImageView* const pImageView = ImageView::ObjectFromHandle(attachmentInfo.imageView); - if (pImageView != VK_NULL_HANDLE) + if (pImageView != nullptr) { - // Get the attachment image + pImageViews[clearCount] = pImageView; + const Image* pImage = pImageView->GetImage(); // Convert the clear color to the format of the attachment view - Pal::SwizzledFormat clearFormat = VkToPalFormat( + clearFormats[clearCount] = VkToPalFormat( pImageView->GetViewFormat(), m_pDevice->GetRuntimeSettings()); - Pal::ClearColor clearColor = VkToPalClearColor( + + clearColors[clearCount] = VkToPalClearColor( attachmentInfo.clearValue.color, - clearFormat); + clearFormats[clearCount]); // Get subres range from the image view - Pal::SubresRange subresRange = {}; - pImageView->GetFrameBufferAttachmentSubresRange(&subresRange); + pImageView->GetFrameBufferAttachmentSubresRange(&ranges[clearCount]); // Override the number of slices with layerCount from pBeginRendering - subresRange.numSlices = pRenderingInfo->layerCount; - - const auto clearSubresRanges = LoadOpClearSubresRanges( - pRenderingInfo->viewMask, - subresRange); + ranges[clearCount].numSlices = pRenderingInfo->layerCount; // Clear Layout - const Pal::ImageLayout clearLayout = pImage->GetBarrierPolicy().GetAspectLayout( + imageLayouts[clearCount] = pImage->GetBarrierPolicy().GetAspectLayout( attachmentInfo.imageLayout, - subresRange.startSubres.plane, + ranges[clearCount].startSubres.plane, GetQueueFamilyIndex(), pImage->GetFormat()); - utils::IterateMask deviceGroup(GetDeviceMask()); - - do - { - const uint32_t deviceIdx = deviceGroup.Index(); - - // Clear Box - Pal::Box clearBox = BuildClearBox( - pDeviceGroupRenderArea[deviceIdx], - *pImageView); - - 
PalCmdBuffer(deviceIdx)->CmdClearColorImage( - *pImage->PalImage(deviceIdx), - clearLayout, - clearColor, - clearFormat, - clearSubresRanges.NumElements(), - clearSubresRanges.Data(), - 1, - &clearBox, - Pal::ColorClearAutoSync); - } - while (deviceGroup.IterateNext()); + clearCount++; } } } + + if (clearCount > 1) + { + BatchedLoadOpClears(clearCount, + pImageViews, + clearColors, + imageLayouts, + ranges, + clearFormats, + pRenderingInfo->viewMask); + } + else if (clearCount == 1) + { + VK_ASSERT(pImageViews[0] != nullptr); + const auto clearSubresRanges = LoadOpClearSubresRanges(pRenderingInfo->viewMask, ranges[0]); + + utils::IterateMask deviceGroup(GetDeviceMask()); + + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + + // Clear Box + Pal::Box clearBox = BuildClearBox(pDeviceGroupRenderArea[deviceIdx], *(pImageViews[0])); + + PalCmdBuffer(deviceIdx)->CmdClearColorImage( + *(pImageViews[0]->GetImage()->PalImage(deviceIdx)), + imageLayouts[0], + clearColors[0], + clearFormats[0], + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), + 1, + &clearBox, + Pal::ColorClearAutoSync); + } while (deviceGroup.IterateNext()); + } + + if (m_pSqttState != nullptr) + { + m_pSqttState->EndRenderPassColorClear(); + } } // ===================================================================================================================== @@ -4798,6 +4827,11 @@ void CmdBuffer::LoadOpClearDepthStencil( const Pal::Rect* pDeviceGroupRenderArea, const VkRenderingInfo* pRenderingInfo) { + if (m_pSqttState != nullptr) + { + m_pSqttState->BeginRenderPassDepthStencilClear(); + } + // Note that no allocation will be performed, so Util::Vector allocator is nullptr. 
Util::Vector clearSubresRanges{ nullptr }; @@ -4886,6 +4920,11 @@ void CmdBuffer::LoadOpClearDepthStencil( } while (deviceGroup.IterateNext()); } + + if (m_pSqttState != nullptr) + { + m_pSqttState->EndRenderPassDepthStencilClear(); + } } // ===================================================================================================================== @@ -5062,7 +5101,7 @@ void CmdBuffer::BeginRendering( } while (deviceGroup.IterateNext()); - if (!skipClears) + if (skipClears == false) { PalCmdSuspendPredication(true); @@ -5112,6 +5151,11 @@ void CmdBuffer::ResolveImage( VkImageAspectFlags aspectMask, const DynamicRenderingAttachments& dynamicRenderingAttachments) { + if (m_pSqttState != nullptr) + { + m_pSqttState->BeginRenderPassResolve(); + } + Pal::ImageResolveRegion regions[MaxPalDevices] = {}; for (uint32_t idx = 0; idx < m_allGpuState.dynamicRenderingInstance.renderAreaCount; idx++) @@ -5186,27 +5230,58 @@ void CmdBuffer::ResolveImage( m_allGpuState.dynamicRenderingInstance.renderAreaCount, regions, m_curDeviceMask); + + if (m_pSqttState != nullptr) + { + m_pSqttState->EndRenderPassResolve(); + } } // ===================================================================================================================== // For Dynamic Rendering we need to wait for draws to finish before we do resolves. 
void CmdBuffer::PostDrawPreResolveSync() { - Pal::BarrierInfo barrierInfo = {}; - barrierInfo.waitPoint = Pal::HwPipePreCs; + if (m_flags.useReleaseAcquire) + { + Pal::AcquireReleaseInfo barrierInfo = + { + .srcGlobalStageMask = Pal::PipelineStageColorTarget | Pal::PipelineStageDsTarget, + .dstGlobalStageMask = Pal::PipelineStageBlt, + .srcGlobalAccessMask = Pal::CoherColorTarget | Pal::CoherDepthStencilTarget, + .dstGlobalAccessMask = Pal::CoherResolveSrc, + .memoryBarrierCount = 0, + .pMemoryBarriers = nullptr, + .imageBarrierCount = 0, + .pImageBarriers = nullptr, + .reason = RgpBarrierExternalRenderPassSync + }; - const Pal::HwPipePoint pipePoint = Pal::HwPipePostPs; - barrierInfo.pipePointWaitCount = 1; - barrierInfo.pPipePoints = &pipePoint; + PalCmdReleaseThenAcquire( + &barrierInfo, + nullptr, + nullptr, + nullptr, + nullptr, + m_curDeviceMask); + } + else + { + Pal::BarrierInfo barrierInfo = {}; + barrierInfo.waitPoint = Pal::HwPipePreCs; - Pal::BarrierTransition transition = {}; - transition.srcCacheMask = Pal::CoherColorTarget | Pal::CoherDepthStencilTarget; - transition.dstCacheMask = Pal::CoherShader; + const Pal::HwPipePoint pipePoint = Pal::HwPipePostPs; + barrierInfo.pipePointWaitCount = 1; + barrierInfo.pPipePoints = &pipePoint; - barrierInfo.transitionCount = 1; - barrierInfo.pTransitions = &transition; + Pal::BarrierTransition transition = {}; + transition.srcCacheMask = Pal::CoherColorTarget | Pal::CoherDepthStencilTarget; + transition.dstCacheMask = Pal::CoherShader; - PalCmdBarrier(barrierInfo, m_curDeviceMask); + barrierInfo.transitionCount = 1; + barrierInfo.pTransitions = &transition; + + PalCmdBarrier(barrierInfo, m_curDeviceMask); + } } // ===================================================================================================================== @@ -5682,7 +5757,6 @@ void CmdBuffer::WaitEvents2( ExecuteAcquireRelease(eventRangeCount, pEvents + i, - eventRangeCount, pDependencyInfos + i, Acquire, RgpBarrierExternalCmdWaitEvents); 
@@ -5835,13 +5909,14 @@ void CmdBuffer::WaitEventsSync2ToSync1( // ===================================================================================================================== // Based on Dependency Info, execute Acquire or Release according to the mode. void CmdBuffer::ExecuteAcquireRelease( - uint32_t eventCount, - const VkEvent* pEvents, uint32_t dependencyCount, + const VkEvent* pEvents, const VkDependencyInfoKHR* pDependencyInfos, AcquireReleaseMode acquireReleaseMode, uint32_t rgpBarrierReasonType) { + VK_ASSERT((acquireReleaseMode == ReleaseThenAcquire) || (pEvents != nullptr)); + const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); uint32_t barrierCount = 0; @@ -5858,7 +5933,7 @@ void CmdBuffer::ExecuteAcquireRelease( maxImageMemoryBarriers = Util::Max(pDependencyInfos[i].imageMemoryBarrierCount, maxImageMemoryBarriers); } - if ((eventCount > 0) || (barrierCount > 0)) + if ((pEvents != nullptr) || (barrierCount > 0)) { VirtualStackFrame virtStackFrame(m_pStackAllocator); @@ -6139,8 +6214,7 @@ void CmdBuffer::ExecuteAcquireRelease( PalCmdRelease( &acquireReleaseInfo, - eventCount, - pEvents, + pEvents[j], pPalBufferMemoryBarriers, ppBuffers, pPalImageBarriers, @@ -6166,8 +6240,7 @@ void CmdBuffer::ExecuteAcquireRelease( PalCmdAcquire( &acquireReleaseInfo, - eventCount, - pEvents, + pEvents[j], pPalBufferMemoryBarriers, ppBuffers, pPalImageBarriers, @@ -6574,9 +6647,8 @@ void CmdBuffer::PipelineBarrier2( if (m_flags.useReleaseAcquire) { - ExecuteAcquireRelease(0, + ExecuteAcquireRelease(1, nullptr, - 1, pDependencyInfo, ReleaseThenAcquire, RgpBarrierExternalCmdPipelineBarrier); @@ -7286,8 +7358,7 @@ void CmdBuffer::PalCmdReleaseThenAcquire( // ===================================================================================================================== void CmdBuffer::PalCmdAcquire( Pal::AcquireReleaseInfo* pAcquireReleaseInfo, - uint32_t eventCount, - const VkEvent* pEvents, + const VkEvent event, Pal::MemBarrier* const 
pBufferBarriers, const Buffer** const ppBuffers, Pal::ImgBarrier* const pImageBarriers, @@ -7300,7 +7371,7 @@ void CmdBuffer::PalCmdAcquire( // in the header, but temporarily you may use the generic "unknown" reason so as not to block you. VK_ASSERT(pAcquireReleaseInfo->reason != 0); - Event* pEvent = Event::ObjectFromHandle(pEvents[0]); + Event* pEvent = Event::ObjectFromHandle(event); utils::IterateMask deviceGroup(deviceMask); do @@ -7314,53 +7385,23 @@ void CmdBuffer::PalCmdAcquire( pImageBarriers[i].pImage = ppImages[i]->PalImage(deviceIdx); } } - pAcquireReleaseInfo->pImageBarriers = pImageBarriers; + pAcquireReleaseInfo->pImageBarriers = pImageBarriers; pAcquireReleaseInfo->pMemoryBarriers = pBufferBarriers; if (pEvent->IsUseToken()) { - // Allocate space to store sync token values (automatically rewound on unscope) - Pal::ReleaseToken* pSyncTokens = eventCount > 0 ? - pVirtStackFrame->AllocArray(eventCount) : nullptr; + Pal::ReleaseToken syncToken = {}; - if (pSyncTokens != nullptr) - { - for (uint32_t i = 0; i < eventCount; ++i) - { - pSyncTokens[i] = Event::ObjectFromHandle(pEvents[i])->GetSyncToken(); - } - - PalCmdBuffer(deviceIdx)->CmdAcquire(*pAcquireReleaseInfo, eventCount, pSyncTokens); - - pVirtStackFrame->FreeArray(pSyncTokens); - } - else - { - m_recordingResult = VK_ERROR_OUT_OF_HOST_MEMORY; - } + syncToken = pEvent->GetSyncToken(); + PalCmdBuffer(deviceIdx)->CmdAcquire(*pAcquireReleaseInfo, 1u, &syncToken); } else { - // Allocate space to store signaled event pointers (automatically rewound on unscope) - const Pal::IGpuEvent** ppGpuEvents = eventCount > 0 ? 
- pVirtStackFrame->AllocArray(eventCount) : nullptr; - - if (ppGpuEvents != nullptr) - { - for (uint32_t i = 0; i < eventCount; ++i) - { - ppGpuEvents[i] = Event::ObjectFromHandle(pEvents[i])->PalEvent(deviceIdx); - } + const Pal::IGpuEvent* pGpuEvent = {}; - PalCmdBuffer(deviceIdx)->CmdAcquireEvent(*pAcquireReleaseInfo, eventCount, ppGpuEvents); - - pVirtStackFrame->FreeArray(ppGpuEvents); - } - else - { - m_recordingResult = VK_ERROR_OUT_OF_HOST_MEMORY; - } + pGpuEvent = pEvent->PalEvent(deviceIdx); + PalCmdBuffer(deviceIdx)->CmdAcquireEvent(*pAcquireReleaseInfo, 1u, &pGpuEvent); } } while (deviceGroup.IterateNext()); @@ -7369,8 +7410,7 @@ void CmdBuffer::PalCmdAcquire( // ===================================================================================================================== void CmdBuffer::PalCmdRelease( Pal::AcquireReleaseInfo* pAcquireReleaseInfo, - uint32_t eventCount, - const VkEvent* pEvents, + const VkEvent event, Pal::MemBarrier* const pBufferBarriers, const Buffer** const ppBuffers, Pal::ImgBarrier* const pImageBarriers, @@ -7382,9 +7422,7 @@ void CmdBuffer::PalCmdRelease( // in the header, but temporarily you may use the generic "unknown" reason so as not to block you. 
VK_ASSERT(pAcquireReleaseInfo->reason != 0); - VK_ASSERT(eventCount == 1); - - Event* pEvent = Event::ObjectFromHandle(*pEvents); + Event* pEvent = Event::ObjectFromHandle(event); utils::IterateMask deviceGroup(deviceMask); do @@ -7398,8 +7436,8 @@ void CmdBuffer::PalCmdRelease( pImageBarriers[i].pImage = ppImages[i]->PalImage(deviceIdx); } } - pAcquireReleaseInfo->pImageBarriers = pImageBarriers; + pAcquireReleaseInfo->pImageBarriers = pImageBarriers; pAcquireReleaseInfo->pMemoryBarriers = pBufferBarriers; if (pEvent->IsUseToken()) @@ -7814,7 +7852,6 @@ void CmdBuffer::BeginRenderPass( while (deviceGroup.IterateNext()); RPBeginSubpass(); - } else { @@ -7834,7 +7871,6 @@ void CmdBuffer::NextSubPass( if (m_renderPassInstance.subpass != VK_SUBPASS_EXTERNAL) { - // End the previous subpass RPEndSubpass(); @@ -7843,7 +7879,6 @@ void CmdBuffer::NextSubPass( // Begin the next subpass RPBeginSubpass(); - } DbgBarrierPostCmd(DbgBarrierNextSubpass); @@ -8074,7 +8109,6 @@ void CmdBuffer::RPBeginSubpass() // Set view instance mask, on devices in render pass instance's device mask SetViewInstanceMask(GetRpDeviceMask()); - } // ===================================================================================================================== @@ -8296,6 +8330,19 @@ void CmdBuffer::RPSyncPoint( Pal::BarrierTransition imageTransition = { }; + // Remove depth stencil related stage/access masks for color attachment transitions and remove color + // target related stage/access mask for depth stencils. + const uint32_t excludeStageMask = + attachment.pImage->IsColorFormat() ? (~Pal::PipelineStageDsTarget) : + (attachment.pImage->IsDepthStencilFormat() ? + (~Pal::PipelineStageColorTarget) : + (Pal::PipelineStageAllStages)); + const uint32_t excludeAccessMask = + attachment.pImage->IsColorFormat() ? (~Pal::CoherDepthStencilTarget) : + (attachment.pImage->IsDepthStencilFormat() ? 
+ (~Pal::CoherColorTarget) : + (Pal::CoherAllUsages)); + for (uint32_t sr = 0; sr < attachment.subresRangeCount; ++sr) { const uint32_t plane = attachment.subresRange[sr].startSubres.plane; @@ -8325,10 +8372,14 @@ void CmdBuffer::RPSyncPoint( ppImages[acquireReleaseInfo.imageBarrierCount] = attachment.pImage; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcStageMask = srcStageMask; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstStageMask = dstStageMask; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcAccessMask = srcAccessMask; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstAccessMask = dstAccessMask; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcStageMask = srcStageMask & + excludeStageMask; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstStageMask = dstStageMask & + excludeStageMask; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcAccessMask = srcAccessMask & + excludeAccessMask; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstAccessMask = dstAccessMask & + excludeAccessMask; // We set the pImage to nullptr by default here. But, this will be computed correctly later for // each device including DefaultDeviceIndex based on the deviceId. 
pPalTransitions[acquireReleaseInfo.imageBarrierCount].pImage = nullptr; @@ -8336,6 +8387,9 @@ void CmdBuffer::RPSyncPoint( pPalTransitions[acquireReleaseInfo.imageBarrierCount].newLayout = newLayout; pPalTransitions[acquireReleaseInfo.imageBarrierCount].subresRange = attachment.subresRange[sr]; + VK_ASSERT((pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcStageMask != 0) && + (pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstStageMask != 0)); + const Pal::MsaaQuadSamplePattern* pQuadSamplePattern = nullptr; if (attachment.pImage->IsSampleLocationsCompatibleDepth() && @@ -8695,12 +8749,6 @@ void CmdBuffer::RPResolveAttachments( uint32_t count, const RPResolveInfo* pResolves) { - // Notify SQTT annotator that we are doing a render pass resolve operation - if (m_pSqttState != nullptr) - { - m_pSqttState->BeginRenderPassResolve(); - } - for (uint32_t i = 0; i < count; ++i) { const RPResolveInfo& params = pResolves[i]; @@ -8709,11 +8757,6 @@ void CmdBuffer::RPResolveAttachments( RPResolveMsaa(params); } } - - if (m_pSqttState != nullptr) - { - m_pSqttState->EndRenderPassResolve(); - } } // ===================================================================================================================== @@ -8721,6 +8764,11 @@ void CmdBuffer::RPResolveAttachments( void CmdBuffer::RPResolveMsaa( const RPResolveInfo& params) { + if (m_pSqttState != nullptr) + { + m_pSqttState->BeginRenderPassResolve(); + } + const Framebuffer::Attachment& srcAttachment = m_allGpuState.pFramebuffer->GetAttachment(params.src.attachment); const Framebuffer::Attachment& dstAttachment = @@ -8849,6 +8897,11 @@ void CmdBuffer::RPResolveMsaa( regions, GetRpDeviceMask()); } + + if (m_pSqttState != nullptr) + { + m_pSqttState->EndRenderPassResolve(); + } } // ===================================================================================================================== @@ -9307,7 +9360,8 @@ VkDescriptorSet CmdBuffer::InitPushDescriptorSet( // 
===================================================================================================================== template void CmdBuffer::PushDescriptorSetKHR( VkPipelineBindPoint pipelineBindPoint, @@ -9430,7 +9484,7 @@ void CmdBuffer::PushDescriptorSetKHR( break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - DescriptorUpdate::WriteBufferDescriptors( + DescriptorUpdate::WriteBufferDescriptors( params.pTexelBufferView, deviceIdx, pDestAddr, @@ -9439,7 +9493,7 @@ void CmdBuffer::PushDescriptorSetKHR( break; case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - DescriptorUpdate::WriteBufferDescriptors( + DescriptorUpdate::WriteBufferDescriptors( params.pTexelBufferView, deviceIdx, pDestAddr, @@ -9448,7 +9502,7 @@ void CmdBuffer::PushDescriptorSetKHR( break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - DescriptorUpdate::WriteBufferInfoDescriptors( + DescriptorUpdate::WriteBufferInfoDescriptors( m_pDevice, params.pBufferInfo, deviceIdx, @@ -9458,7 +9512,7 @@ void CmdBuffer::PushDescriptorSetKHR( break; case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - DescriptorUpdate::WriteBufferInfoDescriptors( + DescriptorUpdate::WriteBufferInfoDescriptors( m_pDevice, params.pBufferInfo, deviceIdx, @@ -10123,7 +10177,7 @@ void CmdBuffer::SetRenderingInputAttachmentIndices( } -#if VK_ENABLE_DEBUG_BARRIERS +#if VKI_ENABLE_DEBUG_BARRIERS // ===================================================================================================================== // This function inserts a command before or after a particular Vulkan command if the given runtime settings are asking // for it. 
@@ -10949,7 +11003,7 @@ void CmdBuffer::GetRayTracingDispatchArgs( memcpy(pConstants->descriptorTable.accelStructTrackerSrd, m_pDevice->RayTrace()->GetAccelStructTrackerSrd(deviceIdx), - m_pDevice->GetProperties().descriptorSizes.bufferView); + m_pDevice->GetProperties().descriptorSizes.untypedBufferView); if (pPipeline->CheckIsCps()) { @@ -11093,6 +11147,8 @@ void CmdBuffer::TraceRaysIndirect( const VkStridedDeviceAddressRegionKHR& callableShaderBindingTable, VkDeviceAddress indirectDeviceAddress) { + DbgBarrierPreCmd(DbgTraceRays); + utils::IterateMask deviceGroup(m_curDeviceMask); do @@ -11110,6 +11166,8 @@ void CmdBuffer::TraceRaysIndirect( GetUserMarkerContextValue()); } while (deviceGroup.IterateNext()); + + DbgBarrierPostCmd(DbgTraceRays); } // ===================================================================================================================== @@ -11162,12 +11220,8 @@ void CmdBuffer::TraceRaysDispatchPerDevice( uint32_t depth) { const RayTracingPipeline* pPipeline = pCmdBuffer->m_allGpuState.pRayTracingPipeline; - uint32_t dispatchSizeX = 0; - uint32_t dispatchSizeY = 0; - uint32_t dispatchSizeZ = 0; - - pPipeline->GetDispatchSize(&dispatchSizeX, &dispatchSizeY, &dispatchSizeZ, width, height, depth); - pCmdBuffer->PalCmdBuffer(deviceIdx)->CmdDispatch({ dispatchSizeX, dispatchSizeY, dispatchSizeZ }); + const Pal::DispatchDims dispatchSize = pPipeline->GetDispatchSize({ .x = width, .y = height, .z = depth }); + pCmdBuffer->PalCmdBuffer(deviceIdx)->CmdDispatch(dispatchSize); } // ===================================================================================================================== @@ -11181,8 +11235,6 @@ void CmdBuffer::TraceRaysIndirectPerDevice( VkDeviceAddress indirectDeviceAddress, uint64_t userMarkerContext) { - DbgBarrierPreCmd(DbgTraceRays); - const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); const RayTracingPipeline* pPipeline = m_allGpuState.pRayTracingPipeline; @@ -11201,7 +11253,8 @@ void 
CmdBuffer::TraceRaysIndirectPerDevice( // Pre-pass gpusize initConstantsVa = 0; - const gpusize scratchBufferSize = sizeof(VkTraceRaysIndirectCommandKHR); + const gpusize scratchBufferSize = + sizeof(VkTraceRaysIndirectCommandKHR); InternalMemory* pScratchMemory = nullptr; VkResult result = GetRayTracingIndirectMemory(scratchBufferSize, &pScratchMemory); @@ -11213,6 +11266,7 @@ void CmdBuffer::TraceRaysIndirectPerDevice( 2, &initConstantsVa)); + memset(pInitConstants, 0, sizeof(GpuRt::InitExecuteIndirectConstants)); pInitConstants->maxIterations = m_pDevice->RayTrace()->GetProfileMaxIterations(); pInitConstants->profileRayFlags = m_pDevice->RayTrace()->GetProfileRayFlags(); @@ -11240,6 +11294,12 @@ void CmdBuffer::TraceRaysIndirectPerDevice( pInitConstants->rtThreadGroupSizeZ = 1; } + pInitConstants->bindingArgsSize = + 0; + + pInitConstants->inputBytesPerDispatch = 0; + pInitConstants->outputBytesPerDispatch = 0; + GpuRt::InitExecuteIndirectUserData initUserData = {}; initUserData.constantsVa = initConstantsVa; @@ -11299,9 +11359,9 @@ void CmdBuffer::TraceRaysIndirectPerDevice( 1, &constGpuAddrLow); - PalCmdBuffer(deviceIdx)->CmdDispatchIndirect(pScratchMemory->GpuVirtAddr(deviceIdx)); - - DbgBarrierPostCmd(DbgTraceRays); + { + PalCmdBuffer(deviceIdx)->CmdDispatchIndirect(pScratchMemory->GpuVirtAddr(deviceIdx)); + } } // ===================================================================================================================== @@ -11453,7 +11513,7 @@ void CmdBuffer::BindRayQueryConstants( { memcpy(constants.descriptorTable.accelStructTrackerSrd, VkDevice()->RayTrace()->GetAccelStructTrackerSrd(deviceIdx), - VkDevice()->GetProperties().descriptorSizes.bufferView); + VkDevice()->GetProperties().descriptorSizes.untypedBufferView); } if (rtCountersEnabled) @@ -11705,6 +11765,98 @@ void CmdBuffer::BindDescriptorBufferEmbeddedSamplers( } } +// ===================================================================================================================== 
+// Batch LoadOp clears on multiple color attachments instead of using PAL's ColorClearAutoSync, this will reduce the +// amount of barriers from 2 per clear to 2 for the entire batch. This is currently only used for Dynamic Rendering +// as the renderpass code has it's own version of this. +void CmdBuffer::BatchedLoadOpClears( + uint32_t clearCount, + const ImageView** pImageViews, + const Pal::ClearColor* pClearColors, + const Pal::ImageLayout* pClearLayouts, + const Pal::SubresRange* pRanges, + const Pal::SwizzledFormat* pClearFormats, + uint32_t viewMask) +{ + VK_ASSERT_MSG(clearCount > 1, "Pal::ColorClearAutoSync is recommended for single clears"); + + Pal::ImgBarrier imageBarriers[Pal::MaxColorTargets] = {}; + const Image* images[Pal::MaxColorTargets] = {}; + + for (uint32_t i = 0; i < clearCount; i++) + { + Pal::ImgBarrier* pPreSyncBarrier = &imageBarriers[i]; + + pPreSyncBarrier->srcStageMask = Pal::PipelineStageColorTarget; + pPreSyncBarrier->dstStageMask = Pal::PipelineStageBlt; + pPreSyncBarrier->srcAccessMask = Pal::CoherColorTarget; + pPreSyncBarrier->dstAccessMask = Pal::CoherClear; + pPreSyncBarrier->oldLayout = pClearLayouts[i]; + pPreSyncBarrier->newLayout = pClearLayouts[i]; + + pImageViews[i]->GetFrameBufferAttachmentSubresRange(&pPreSyncBarrier->subresRange); + + // This is filled out later in PalCmdReleaseThenAcquire() + pPreSyncBarrier->pImage = nullptr; + + images[i] = pImageViews[i]->GetImage(); + } + + // Issue the pre sync barrier + Pal::AcquireReleaseInfo acqRelInfo = {}; + + acqRelInfo.reason = Pal::Developer::BarrierReason::BarrierReasonPreSyncClear; + acqRelInfo.imageBarrierCount = clearCount; + acqRelInfo.pImageBarriers = imageBarriers; + + PalCmdReleaseThenAcquire(&acqRelInfo, nullptr, nullptr, imageBarriers, images, m_curDeviceMask); + + // Issue the actual clear + for (uint32_t i = 0; i < clearCount; i++) + { + //Modify the barriers for postSync clear + Pal::ImgBarrier* pPostSyncBarrier = &imageBarriers[i]; + + 
pPostSyncBarrier->srcStageMask = Pal::PipelineStageBlt; + pPostSyncBarrier->dstStageMask = Pal::PipelineStageColorTarget; + pPostSyncBarrier->srcAccessMask = Pal::CoherClear; + pPostSyncBarrier->dstAccessMask = Pal::CoherColorTarget; + + const auto clearSubresRanges = LoadOpClearSubresRanges(viewMask, pRanges[i]); + + utils::IterateMask deviceGroup(GetDeviceMask()); + + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + + // Clear Box + Pal::Box clearBox = BuildClearBox( + m_allGpuState.dynamicRenderingInstance.renderArea[deviceIdx], + *(pImageViews[i])); + + PalCmdBuffer(deviceIdx)->CmdClearColorImage( + *(images[i]->PalImage(deviceIdx)), + pClearLayouts[i], + pClearColors[i], + pClearFormats[i], + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), + 1, + &clearBox, + 0); + } + while (deviceGroup.IterateNext()); + } + + //Issue the post sync barrier + acqRelInfo.reason = Pal::Developer::BarrierReason::BarrierReasonPostSyncClear; + acqRelInfo.imageBarrierCount = clearCount; + acqRelInfo.pImageBarriers = imageBarriers; + + PalCmdReleaseThenAcquire(&acqRelInfo, nullptr, nullptr, imageBarriers, images, m_curDeviceMask); +} + // ===================================================================================================================== void CmdBuffer::ValidateGraphicsStates() { @@ -12811,7 +12963,8 @@ void CmdBuffer::PushConstants2KHR( // ===================================================================================================================== template VKAPI_ATTR void VKAPI_CALL CmdBuffer::CmdPushDescriptorSet2KHR( VkCommandBuffer commandBuffer, @@ -12819,21 +12972,24 @@ VKAPI_ATTR void VKAPI_CALL CmdBuffer::CmdPushDescriptorSet2KHR( { CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); - pCmdBuffer->PushDescriptorSet2KHR( + pCmdBuffer->PushDescriptorSet2KHR + ( pPushDescriptorSetInfo); } // ===================================================================================================================== 
template + size_t samplerDescSize, + size_t typedBufferDescSize, + size_t untypedBufferDescSize, + uint32_t numPalDevices> void CmdBuffer::PushDescriptorSet2KHR( const VkPushDescriptorSetInfoKHR* pPushDescriptorSetInfo) { if ((pPushDescriptorSetInfo->stageFlags & ShaderStageAllGraphics) != 0) { - PushDescriptorSetKHR( + PushDescriptorSetKHR + ( VK_PIPELINE_BIND_POINT_GRAPHICS, pPushDescriptorSetInfo->layout, pPushDescriptorSetInfo->set, @@ -12843,7 +12999,8 @@ void CmdBuffer::PushDescriptorSet2KHR( if ((pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) != 0) { - PushDescriptorSetKHR( + PushDescriptorSetKHR + ( VK_PIPELINE_BIND_POINT_COMPUTE, pPushDescriptorSetInfo->layout, pPushDescriptorSetInfo->set, @@ -12853,7 +13010,8 @@ void CmdBuffer::PushDescriptorSet2KHR( #if VKI_RAY_TRACING if ((pPushDescriptorSetInfo->stageFlags & ShaderStageAllRayTracing) != 0) { - PushDescriptorSetKHR( + PushDescriptorSetKHR + ( VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, pPushDescriptorSetInfo->layout, pPushDescriptorSetInfo->set, @@ -13015,19 +13173,34 @@ template PFN_vkCmdPushDescriptorSet2KHR CmdBuffer::GetCmdPushDescriptorSet2KHRFunc( const Device* pDevice) { - const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; - const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; - const size_t bufferDescSize = pDevice->GetProperties().descriptorSizes.bufferView; + const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; + const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; + const size_t typedBufferDescSize = pDevice->GetProperties().descriptorSizes.typedBufferView; + const size_t untypedBufferDescSize = pDevice->GetProperties().descriptorSizes.untypedBufferView; PFN_vkCmdPushDescriptorSet2KHR pFunc = nullptr; - if ((imageDescSize == 32) && - (samplerDescSize == 16) && - (bufferDescSize == 16)) + if ((imageDescSize == 32) && + (samplerDescSize == 16) && + 
(typedBufferDescSize == 16) && + (untypedBufferDescSize == 16)) + { + pFunc = &CmdPushDescriptorSet2KHR< + 32, + 16, + 16, + 16, + numPalDevices>; + } + else if ((imageDescSize == 32) && + (samplerDescSize == 16) && + (typedBufferDescSize == 24) && + (untypedBufferDescSize == 16)) { pFunc = &CmdPushDescriptorSet2KHR< 32, 16, + 24, 16, numPalDevices>; } diff --git a/icd/api/vk_cmdbuffer_transfer.cpp b/icd/api/vk_cmdbuffer_transfer.cpp index 610636f3..9b03c795 100644 --- a/icd/api/vk_cmdbuffer_transfer.cpp +++ b/icd/api/vk_cmdbuffer_transfer.cpp @@ -747,8 +747,8 @@ void CmdBuffer::CopyQueryPoolResults( // =================================================================================================================== // Command to write a timestamp value to a location in a Timestamp query pool void CmdBuffer::QueryCopy( - const QueryPool* pBasePool, - const Buffer* pDestBuffer, + const QueryPool* pBasePool, + const Buffer* pDestBuffer, uint32_t firstQuery, uint32_t queryCount, VkDeviceSize destOffset, @@ -799,19 +799,23 @@ void CmdBuffer::QueryCopy( uint32_t userData[16]; // Figure out which user data registers should contain what compute constants - const uint32_t storageViewSize = m_pDevice->GetProperties().descriptorSizes.bufferView; + const uint32_t untypedViewSize = m_pDevice->GetProperties().descriptorSizes.untypedBufferView; + const uint32_t typedViewSize = m_pDevice->GetProperties().descriptorSizes.typedBufferView; + + const uint32_t storageViewSize = m_pDevice->UseStridedCopyQueryResults() ? 
untypedViewSize : typedViewSize; const uint32_t storageViewDwSize = storageViewSize / sizeof(uint32_t); - const uint32_t viewOffset = 0; - const uint32_t bufferViewOffset = storageViewDwSize; - const uint32_t queryCountOffset = bufferViewOffset + storageViewDwSize; - const uint32_t copyFlagsOffset = queryCountOffset + 1; - const uint32_t copyStrideOffset = copyFlagsOffset + 1; - const uint32_t firstQueryOffset = copyStrideOffset + 1; - const uint32_t ptrQueryOffset = firstQueryOffset + 1; - const uint32_t userDataCount = ptrQueryOffset + 1; + const uint32_t bufferViewDwSize = untypedViewSize / sizeof(uint32_t); + const uint32_t viewOffset = 0; + const uint32_t bufferViewOffset = storageViewDwSize; + const uint32_t queryCountOffset = bufferViewOffset + bufferViewDwSize; + const uint32_t copyFlagsOffset = queryCountOffset + 1; + const uint32_t copyStrideOffset = copyFlagsOffset + 1; + const uint32_t firstQueryOffset = copyStrideOffset + 1; + const uint32_t ptrQueryOffset = firstQueryOffset + 1; + const uint32_t userDataCount = ptrQueryOffset + 1; // Make sure they agree with pipeline mapping - VK_ASSERT(viewOffset == pipeline.userDataNodeOffsets[0]); + VK_ASSERT(viewOffset == pipeline.userDataNodeOffsets[0]); VK_ASSERT(bufferViewOffset == pipeline.userDataNodeOffsets[1]); VK_ASSERT(queryCountOffset == pipeline.userDataNodeOffsets[2]); VK_ASSERT(userDataCount <= VK_ARRAY_SIZE(userData)); diff --git a/icd/api/vk_compute_pipeline.cpp b/icd/api/vk_compute_pipeline.cpp index dca14a6e..0fbe3289 100644 --- a/icd/api/vk_compute_pipeline.cpp +++ b/icd/api/vk_compute_pipeline.cpp @@ -29,6 +29,7 @@ #include "include/vk_shader.h" #include "include/vk_device.h" #include "include/vk_instance.h" +#include "include/vk_pipeline_binary.h" #include "include/vk_pipeline_cache.h" #include "include/vk_pipeline_layout.h" #include "include/vk_memory.h" @@ -113,6 +114,8 @@ VkResult ComputePipeline::CreatePipelineBinaries( PipelineCompiler* pDefaultCompiler = 
pDevice->GetCompiler(DefaultDeviceIndex); bool storeBinaryToCache = true; + storeBinaryToCache = (flags & VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR) == 0; + // Load or create the pipeline binary PipelineBinaryCache* pPipelineBinaryCache = (pPipelineCache != nullptr) ? pPipelineCache->GetPipelineCache() : nullptr; @@ -319,6 +322,7 @@ ComputePipeline::ComputePipeline( Device* const pDevice, Pal::IPipeline** pPalPipeline, const PipelineLayout* pPipelineLayout, + PipelineBinaryStorage* pBinaryStorage, const ImmedInfo& immedInfo, #if VKI_RAY_TRACING bool hasRayTracing, @@ -340,6 +344,7 @@ ComputePipeline::ComputePipeline( Pipeline::Init( pPalPipeline, pPipelineLayout, + pBinaryStorage, staticStateMask, #if VKI_RAY_TRACING dispatchRaysUserDataOffset, @@ -388,6 +393,36 @@ VkResult ComputePipeline::Create( HandleExtensionStructs(pCreateInfo, &extStructs); + auto pPipelineBinaryInfoKHR = extStructs.pPipelineBinaryInfoKHR; + + if (pPipelineBinaryInfoKHR != nullptr) + { + if (pPipelineBinaryInfoKHR->binaryCount > 0) + { + VK_ASSERT(pPipelineBinaryInfoKHR->binaryCount == pDevice->NumPalDevices()); + binariesProvided = true; + } + + for (uint32_t deviceIdx = 0; + (deviceIdx < pPipelineBinaryInfoKHR->binaryCount) && (result == VK_SUCCESS); + ++deviceIdx) + { + const auto pBinary = PipelineBinary::ObjectFromHandle( + pPipelineBinaryInfoKHR->pPipelineBinaries[deviceIdx]); + + cacheId[deviceIdx] = pBinary->BinaryKey(); + pipelineBinaries[deviceIdx] = pBinary->BinaryData(); + + if (deviceIdx == DefaultDeviceIndex) + { + pDefaultCompiler->ReadBinaryMetadata( + pDevice, + pipelineBinaries[deviceIdx], + &binaryMetadata); + } + } + } + ComputePipelineShaderStageInfo shaderInfo = {}; uint64_t apiPsoHash = {}; @@ -395,6 +430,9 @@ VkResult ComputePipeline::Create( PipelineCompiler::InitPipelineCreationFeedback(pPipelineCreationFeedbackCreateInfo); + PipelineBinaryStorage binaryStorage = {}; + bool storeBinaryToPipeline = (flags & VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR) != 0; + if 
((result == VK_SUCCESS) && (binariesProvided == false)) { // 1. Create Cache IDs @@ -429,6 +467,37 @@ VkResult ComputePipeline::Create( &binaryMetadata); } + // 3. Store created binaries for pipeline_binary + if ((result == VK_SUCCESS) && storeBinaryToPipeline) + { + for (uint32_t deviceIdx = 0; (deviceIdx < pDevice->NumPalDevices()) && (result == VK_SUCCESS); ++deviceIdx) + { + void* pMemory = pAllocator->pfnAllocation( + pAllocator->pUserData, + pipelineBinaries[deviceIdx].codeSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); // retained in the pipeline object + + if (pMemory != nullptr) + { + memcpy( + pMemory, + pipelineBinaries[deviceIdx].pCode, + pipelineBinaries[deviceIdx].codeSize); + + InsertBinaryData( + &binaryStorage, + deviceIdx, + cacheId[deviceIdx], + pipelineBinaries[deviceIdx].codeSize, + pMemory); + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + } } CreateInfo localPipelineInfo = {}; @@ -445,6 +514,7 @@ VkResult ComputePipeline::Create( // Get the pipeline and shader size from PAL and allocate memory. size_t pipelineSize = 0; + PipelineBinaryStorage* pPermBinaryStorage = nullptr; void* pSystemMem = nullptr; Pal::Result palResult = Pal::Result::Success; @@ -460,6 +530,10 @@ VkResult ComputePipeline::Create( VK_ASSERT(palResult == Pal::Result::Success); size_t allocationSize = sizeof(ComputePipeline) + (pipelineSize * pDevice->NumPalDevices()); + if (storeBinaryToPipeline) + { + allocationSize += sizeof(PipelineBinaryStorage); + } pSystemMem = pDevice->AllocApiObject( pAllocator, @@ -534,6 +608,15 @@ VkResult ComputePipeline::Create( result = PalToVkResult(palResult); + if ((result == VK_SUCCESS) && storeBinaryToPipeline) + { + size_t pipelineBinaryOffset = sizeof(ComputePipeline) + (pipelineSize * pDevice->NumPalDevices()); + pPermBinaryStorage = static_cast(Util::VoidPtrInc(pSystemMem, + pipelineBinaryOffset)); + + // Simply copy the existing allocations to the new struct. 
+ *pPermBinaryStorage = binaryStorage; + } } if (result == VK_SUCCESS) @@ -552,6 +635,7 @@ VkResult ComputePipeline::Create( VK_PLACEMENT_NEW(pSystemMem) ComputePipeline(pDevice, pPalPipeline, localPipelineInfo.pLayout, + pPermBinaryStorage, localPipelineInfo.immedInfo, #if VKI_RAY_TRACING hasRayTracing, @@ -563,7 +647,7 @@ VkResult ComputePipeline::Create( apiPsoHash); *pPipeline = ComputePipeline::HandleFromVoidPointer(pSystemMem); - if (settings.enableDebugPrintf) + if (pDevice->GetEnabledFeatures().enableDebugPrintf) { ComputePipeline* pComputePipeline = static_cast(pSystemMem); pComputePipeline->ClearFormatString(); @@ -577,6 +661,8 @@ VkResult ComputePipeline::Create( } else { + // Free the binaries only if we failed to create the pipeline. + FreeBinaryStorage(&binaryStorage, pAllocator); for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) { diff --git a/icd/api/vk_conv.cpp b/icd/api/vk_conv.cpp index e89d0650..48f54089 100644 --- a/icd/api/vk_conv.cpp +++ b/icd/api/vk_conv.cpp @@ -1122,20 +1122,24 @@ static uint32_t GetBufferSrdFormatInfo( } else { - VK_ASSERT(pPhysicalDevice->PalProperties().gfxipProperties.srdSizes.bufferView == 4 * sizeof(uint32_t)); + const Pal::DeviceProperties& palProperties = pPhysicalDevice->PalProperties(); + VK_ASSERT(palProperties.gfxipProperties.srdSizes.typedBufferView <= MaxBufferSrdSize * sizeof(uint32_t)); - uint32_t result[4] = {}; - Pal::BufferViewInfo bufferInfo = {}; - bufferInfo.gpuAddr = 0x300000000ull; - bufferInfo.swizzledFormat = swizzledFormat; - bufferInfo.range = UINT32_MAX; - bufferInfo.stride = Pal::Formats::BytesPerPixel(swizzledFormat.format); + uint32_t result[MaxBufferSrdSize] = {}; + Pal::BufferViewInfo bufferInfo = {}; + bufferInfo.gpuAddr = 0x300000000ull; + bufferInfo.swizzledFormat = swizzledFormat; + bufferInfo.range = UINT32_MAX; + bufferInfo.stride = Pal::Formats::BytesPerPixel(swizzledFormat.format); pPhysicalDevice->PalDevice()->CreateTypedBufferViewSrds(1, &bufferInfo, 
result); - // NOTE: Until now, all buffer format info is stored the fourth DWORD of buffer SRD. please modify + // NOTE: Until now, all buffer format info is stored the last DWORD of buffer SRD. please modify // both BilVertexFetchManager::IssueUberFetchInst and UberFetchShaderFormatInfo once it is changed. - return result[3]; + + { + return result[3]; + } } } @@ -1366,7 +1370,7 @@ VkResult InitializeUberFetchShaderFormatTable( // to avoid access the exact bit in buffer SRD, we create untypeded buffer twice with different stride, // and record the modified bits. - VK_ASSERT(pPhysicalDevice->PalProperties().gfxipProperties.srdSizes.bufferView == 4 * sizeof(uint32_t)); + VK_ASSERT(pPhysicalDevice->PalProperties().gfxipProperties.srdSizes.untypedBufferView == 4 * sizeof(uint32_t)); uint32_t defaultSrd[4] = {}; uint32_t zeroStrideSrd[4] = {}; diff --git a/icd/api/vk_descriptor_buffer.cpp b/icd/api/vk_descriptor_buffer.cpp index 705fca54..e54054e7 100644 --- a/icd/api/vk_descriptor_buffer.cpp +++ b/icd/api/vk_descriptor_buffer.cpp @@ -86,10 +86,6 @@ VKAPI_ATTR void VKAPI_CALL vkGetDescriptorEXT( const Device* pDevice = ApiDevice::ObjectFromHandle(device); const Device::Properties& props = pDevice->GetProperties(); - VK_ASSERT((props.descriptorSizes.imageView == 32) && - (props.descriptorSizes.sampler == 16) && - (props.descriptorSizes.bufferView == 16)); - switch (static_cast(pDescriptorInfo->type)) { case VK_DESCRIPTOR_TYPE_SAMPLER: @@ -198,6 +194,7 @@ VKAPI_ATTR void VKAPI_CALL vkGetDescriptorEXT( { if (pDescriptorInfo->data.pUniformTexelBuffer != nullptr) { + VK_ASSERT(pDescriptorInfo->data.pUniformTexelBuffer->format != VK_FORMAT_UNDEFINED); BufferView::BuildSrd( pDevice, 0, @@ -205,12 +202,11 @@ VKAPI_ATTR void VKAPI_CALL vkGetDescriptorEXT( static_cast (&pDescriptorInfo->data.pUniformTexelBuffer->address), pDescriptorInfo->data.pUniformTexelBuffer->format, 1, - props.descriptorSizes.bufferView, pDescriptor); } else { - memset(pDescriptor, 0, 
props.descriptorSizes.bufferView); + memset(pDescriptor, 0, props.descriptorSizes.typedBufferView); } break; } @@ -231,7 +227,7 @@ VKAPI_ATTR void VKAPI_CALL vkGetDescriptorEXT( } else { - memset(pDescriptor, 0, props.descriptorSizes.bufferView); + memset(pDescriptor, 0, props.descriptorSizes.untypedBufferView); } break; @@ -249,12 +245,11 @@ VKAPI_ATTR void VKAPI_CALL vkGetDescriptorEXT( static_cast (&pDescriptorInfo->data.pUniformBuffer->address), VK_FORMAT_UNDEFINED, 1, - props.descriptorSizes.bufferView, pDescriptor); } else { - memset(pDescriptor, 0, props.descriptorSizes.bufferView); + memset(pDescriptor, 0, props.descriptorSizes.untypedBufferView); } break; } diff --git a/icd/api/vk_descriptor_set.cpp b/icd/api/vk_descriptor_set.cpp index 14576653..9386be8e 100644 --- a/icd/api/vk_descriptor_set.cpp +++ b/icd/api/vk_descriptor_set.cpp @@ -519,7 +519,8 @@ void DescriptorUpdate::WriteInlineUniformBlock( template void DescriptorUpdate::WriteDescriptorSets( const Device* pDevice, @@ -639,7 +640,7 @@ void DescriptorUpdate::WriteDescriptorSets( break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - WriteBufferDescriptors( + WriteBufferDescriptors( params.pTexelBufferView, deviceIdx, pDestAddr, @@ -648,7 +649,7 @@ void DescriptorUpdate::WriteDescriptorSets( break; case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - WriteBufferDescriptors( + WriteBufferDescriptors( params.pTexelBufferView, deviceIdx, pDestAddr, @@ -657,7 +658,7 @@ void DescriptorUpdate::WriteDescriptorSets( break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - WriteBufferInfoDescriptors( + WriteBufferInfoDescriptors( pDevice, params.pBufferInfo, deviceIdx, @@ -667,7 +668,7 @@ void DescriptorUpdate::WriteDescriptorSets( break; case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - WriteBufferInfoDescriptors( + WriteBufferInfoDescriptors( pDevice, params.pBufferInfo, deviceIdx, @@ -681,7 +682,7 @@ void DescriptorUpdate::WriteDescriptorSets( pDestAddr = pDestSet->DynamicDescriptorData(deviceIdx) + 
pDestSet->Layout()->GetDstDynOffset(destBinding, params.dstArrayElement); - WriteBufferInfoDescriptors( + WriteBufferInfoDescriptors( pDevice, params.pBufferInfo, deviceIdx, @@ -695,7 +696,7 @@ void DescriptorUpdate::WriteDescriptorSets( pDestAddr = pDestSet->DynamicDescriptorData(deviceIdx) + pDestSet->Layout()->GetDstDynOffset(destBinding, params.dstArrayElement); - WriteBufferInfoDescriptors( + WriteBufferInfoDescriptors( pDevice, params.pBufferInfo, deviceIdx, @@ -923,7 +924,8 @@ void DescriptorUpdate::CopyDescriptorSets( template VKAPI_ATTR void VKAPI_CALL DescriptorUpdate::UpdateDescriptorSets( VkDevice device, @@ -936,7 +938,8 @@ VKAPI_ATTR void VKAPI_CALL DescriptorUpdate::UpdateDescriptorSets( for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) { - WriteDescriptorSets( + WriteDescriptorSets + ( pDevice, deviceIdx, descriptorWriteCount, @@ -988,39 +991,70 @@ template PFN_vkUpdateDescriptorSets DescriptorUpdate::GetUpdateDescriptorSetsFunc( const Device* pDevice) { - const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; - const size_t fmaskDescSize = pDevice->GetProperties().descriptorSizes.fmaskView; - const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; - const size_t bufferDescSize = pDevice->GetProperties().descriptorSizes.bufferView; + const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; + const size_t fmaskDescSize = pDevice->GetRuntimeSettings().enableFmaskBasedMsaaRead ? 
+ pDevice->GetProperties().descriptorSizes.fmaskView : 0; + const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; + const size_t typedBufferDescSize = pDevice->GetProperties().descriptorSizes.typedBufferView; + const size_t untypedBufferDescSize = pDevice->GetProperties().descriptorSizes.untypedBufferView; + PFN_vkUpdateDescriptorSets pFunc = nullptr; - if ((imageDescSize == 32) && - (samplerDescSize == 16) && - (bufferDescSize == 16)) + if ((imageDescSize == 32) && + (fmaskDescSize == 0) && + (samplerDescSize == 16) && + (typedBufferDescSize == 16) && + (untypedBufferDescSize == 16)) { - if ((pDevice->GetRuntimeSettings().enableFmaskBasedMsaaRead == false) || (fmaskDescSize == 0)) - { - pFunc = &UpdateDescriptorSets< - 32, - 0, - 16, - 16, - numPalDevices>; - } - else if (fmaskDescSize == 32) - { - pFunc = &UpdateDescriptorSets< - 32, - 32, - 16, - 16, - numPalDevices>; - } - else - { - VK_NEVER_CALLED(); - pFunc = nullptr; - } + pFunc = &UpdateDescriptorSets< + 32, + 0, + 16, + 16, + 16, + numPalDevices>; + } + else if ((imageDescSize == 32) && + (fmaskDescSize == 32) && + (samplerDescSize == 16) && + (typedBufferDescSize == 16) && + (untypedBufferDescSize == 16)) + { + pFunc = &UpdateDescriptorSets< + 32, + 32, + 16, + 16, + 16, + numPalDevices>; + } + else if ((imageDescSize == 32) && + (fmaskDescSize == 0) && + (samplerDescSize == 16) && + (typedBufferDescSize == 24) && + (untypedBufferDescSize == 16)) + { + pFunc = &UpdateDescriptorSets< + 32, + 0, + 16, + 24, + 16, + numPalDevices>; + } + else if ((imageDescSize == 32) && + (fmaskDescSize == 32) && + (samplerDescSize == 16) && + (typedBufferDescSize == 24) && + (untypedBufferDescSize == 16)) + { + pFunc = &UpdateDescriptorSets< + 32, + 32, + 16, + 24, + 16, + numPalDevices>; } else { @@ -1143,6 +1177,24 @@ void DescriptorUpdate::WriteBufferDescriptors<16, VK_DESCRIPTOR_TYPE_STORAGE_TEX uint32_t dwStride, size_t descriptorStrideInBytes); +template +void 
DescriptorUpdate::WriteBufferDescriptors<24, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER>( + const VkBufferView* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorUpdate::WriteBufferDescriptors<24, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER>( + const VkBufferView* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + template void DescriptorUpdate::WriteBufferInfoDescriptors<16, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER>( const Device* pDevice, diff --git a/icd/api/vk_descriptor_set_layout.cpp b/icd/api/vk_descriptor_set_layout.cpp index 4227d2b3..2fdea471 100644 --- a/icd/api/vk_descriptor_set_layout.cpp +++ b/icd/api/vk_descriptor_set_layout.cpp @@ -171,12 +171,14 @@ uint32_t DescriptorSetLayout::GetSingleDescStaticSize( case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + size = props.descriptorSizes.typedBufferView; + break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: #if VKI_RAY_TRACING case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: #endif - size = props.descriptorSizes.bufferView; + size = props.descriptorSizes.untypedBufferView; break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: @@ -268,7 +270,7 @@ uint32_t DescriptorSetLayout::GetDynamicBufferDescDwSize( } else { - size = pDevice->GetProperties().descriptorSizes.bufferView; + size = pDevice->GetProperties().descriptorSizes.untypedBufferView; } VK_ASSERT(Util::IsPow2Aligned(size, sizeof(uint32_t))); diff --git a/icd/api/vk_descriptor_update_template.cpp b/icd/api/vk_descriptor_update_template.cpp index 87cfba51..28abf320 100644 --- a/icd/api/vk_descriptor_update_template.cpp +++ b/icd/api/vk_descriptor_update_template.cpp @@ -124,7 +124,8 @@ VkResult DescriptorUpdateTemplate::Create( template 
DescriptorUpdateTemplate::PfnUpdateEntry DescriptorUpdateTemplate::GetUpdateEntryFunc( VkDescriptorType descriptorType, @@ -165,22 +166,22 @@ DescriptorUpdateTemplate::PfnUpdateEntry DescriptorUpdateTemplate::GetUpdateEntr pFunc = &UpdateEntrySampledImage; break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - pFunc = &UpdateEntryTexelBuffer; + pFunc = &UpdateEntryTexelBuffer; break; case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - pFunc = &UpdateEntryTexelBuffer; + pFunc = &UpdateEntryTexelBuffer; break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - pFunc = &UpdateEntryBuffer; + pFunc = &UpdateEntryBuffer; break; case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - pFunc = &UpdateEntryBuffer; + pFunc = &UpdateEntryBuffer; break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - pFunc = &UpdateEntryBuffer; + pFunc = &UpdateEntryBuffer; break; case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - pFunc = &UpdateEntryBuffer; + pFunc = &UpdateEntryBuffer; break; case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: pFunc = &UpdateEntryInlineUniformBlock; @@ -205,41 +206,74 @@ DescriptorUpdateTemplate::PfnUpdateEntry DescriptorUpdateTemplate::GetUpdateEntr VkDescriptorType descriptorType, const DescriptorSetLayout::BindingInfo& dstBinding) { - const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; - const size_t fmaskDescSize = pDevice->GetProperties().descriptorSizes.fmaskView; - const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; - const size_t bufferDescSize = pDevice->GetProperties().descriptorSizes.bufferView; + const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; + const size_t fmaskDescSize = pDevice->GetRuntimeSettings().enableFmaskBasedMsaaRead ? 
+ pDevice->GetProperties().descriptorSizes.fmaskView : 0; + const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; + const size_t typedBufferDescSize = pDevice->GetProperties().descriptorSizes.typedBufferView; + const size_t untypedBufferDescSize = pDevice->GetProperties().descriptorSizes.untypedBufferView; DescriptorUpdateTemplate::PfnUpdateEntry pFunc = nullptr; - if ((imageDescSize == 32) && - (samplerDescSize == 16) && - (bufferDescSize == 16)) + if ((imageDescSize == 32) && + (fmaskDescSize == 0) && + (samplerDescSize == 16) && + (typedBufferDescSize == 16) && + (untypedBufferDescSize == 16)) + + { + pFunc = GetUpdateEntryFunc< + 32, + 0, + 16, + 16, + 16, + numPalDevices>(descriptorType, dstBinding); + } + else if ((imageDescSize == 32) && + (fmaskDescSize == 32) && + (samplerDescSize == 16) && + (typedBufferDescSize == 16) && + (untypedBufferDescSize == 16)) + + { + pFunc = GetUpdateEntryFunc< + 32, + 32, + 16, + 16, + 16, + numPalDevices>(descriptorType, dstBinding); + } + else if ((imageDescSize == 32) && + (fmaskDescSize == 0) && + (samplerDescSize == 16) && + (typedBufferDescSize == 24) && + (untypedBufferDescSize == 16)) { - if ((pDevice->GetRuntimeSettings().enableFmaskBasedMsaaRead == false) || (fmaskDescSize == 0)) - { pFunc = GetUpdateEntryFunc< 32, 0, 16, + 24, 16, numPalDevices>(descriptorType, dstBinding); - } - else if (fmaskDescSize == 32) - { + } + else if ((imageDescSize == 32) && + (fmaskDescSize == 32) && + (samplerDescSize == 16) && + (typedBufferDescSize == 24) && + (untypedBufferDescSize == 16)) + + { pFunc = GetUpdateEntryFunc< 32, 32, 16, + 24, 16, numPalDevices>(descriptorType, dstBinding); - } - else - { - VK_NEVER_CALLED(); - pFunc = nullptr; - } } else { diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index 943a80d2..b09fbc8b 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -44,7 +44,7 @@ #include "include/vk_fence.h" #include "include/vk_formats.h" #include 
"include/vk_framebuffer.h" - +#include "include/vk_pipeline_binary.h" #include "include/vk_pipeline_layout.h" #include "include/vk_physical_device.h" #include "include/vk_image.h" @@ -742,6 +742,12 @@ VkResult Device::Create( { deviceFeatures.strictImageSizeRequirements = false; } + + if (pPhysicalDevice->GetRuntimeSettings().enableDebugPrintf + ) + { + deviceFeatures.enableDebugPrintf = true; + } } if (palResult == Pal::Result::Success) @@ -1154,15 +1160,16 @@ VkResult Device::Initialize( } #if VKI_RAY_TRACING - m_properties.rayTracingIpLevel = deviceProps.gfxipProperties.rayTracingIp; + m_properties.rayTracingIpLevel = deviceProps.gfxipProperties.rayTracingIp; #endif - m_properties.virtualMemAllocGranularity = deviceProps.gpuMemoryProperties.virtualMemAllocGranularity; - m_properties.virtualMemPageSize = deviceProps.gpuMemoryProperties.virtualMemPageSize; - m_properties.descriptorSizes.bufferView = deviceProps.gfxipProperties.srdSizes.bufferView; - m_properties.descriptorSizes.imageView = deviceProps.gfxipProperties.srdSizes.imageView; - m_properties.descriptorSizes.fmaskView = deviceProps.gfxipProperties.srdSizes.fmaskView; - m_properties.descriptorSizes.sampler = deviceProps.gfxipProperties.srdSizes.sampler; - m_properties.descriptorSizes.bvh = deviceProps.gfxipProperties.srdSizes.bvh; + m_properties.virtualMemAllocGranularity = deviceProps.gpuMemoryProperties.virtualMemAllocGranularity; + m_properties.virtualMemPageSize = deviceProps.gpuMemoryProperties.virtualMemPageSize; + m_properties.descriptorSizes.typedBufferView = deviceProps.gfxipProperties.srdSizes.typedBufferView; + m_properties.descriptorSizes.untypedBufferView = deviceProps.gfxipProperties.srdSizes.untypedBufferView; + m_properties.descriptorSizes.imageView = deviceProps.gfxipProperties.srdSizes.imageView; + m_properties.descriptorSizes.fmaskView = deviceProps.gfxipProperties.srdSizes.fmaskView; + m_properties.descriptorSizes.sampler = deviceProps.gfxipProperties.srdSizes.sampler; + 
m_properties.descriptorSizes.bvh = deviceProps.gfxipProperties.srdSizes.bvh; // Size of combined image samplers is the sum of the image and sampler SRD sizes (8DW + 4DW) m_properties.descriptorSizes.combinedImageSampler = m_properties.descriptorSizes.imageView + @@ -1849,7 +1856,7 @@ VkResult Device::CreateInternalComputePipeline( const uint8_t* pCode, uint32_t numUserDataNodes, Vkgc::ResourceMappingRootNode* pUserDataNodes, - VkShaderModuleCreateFlags internalShaderFlags, + ShaderModuleFlags flags, bool forceWave64, const VkSpecializationInfo* pSpecializationInfo, InternalPipeline* pInternalPipeline) @@ -1879,10 +1886,9 @@ VkResult Device::CreateInternalComputePipeline( // Build shader module Vkgc::BinaryData spvBin = { codeByteSize, pCode }; - internalShaderFlags |= VK_INTERNAL_SHADER_FLAGS_INTERNAL_SHADER_BIT; + ShaderModuleFlags internalShaderFlags = flags | ShaderModuleInternalShader; result = pCompiler->BuildShaderModule( this, - 0, internalShaderFlags, spvBin, &shaderModule); @@ -2103,34 +2109,38 @@ VkResult Device::CreateInternalPipelines() Vkgc::ResourceMappingRootNode userDataNodes[3] = {}; - const uint32_t uavViewSize = m_properties.descriptorSizes.bufferView / sizeof(uint32_t); + const uint32_t untypedViewDwSize = m_properties.descriptorSizes.untypedBufferView / sizeof(uint32_t); + const uint32_t typedViewDwSize = m_properties.descriptorSizes.typedBufferView / sizeof(uint32_t); + uint32_t offset = 0; // Timestamp counter storage view - userDataNodes[0].node.type = useStridedShader ? + userDataNodes[0].node.type = useStridedShader ? Vkgc::ResourceMappingNodeType::DescriptorBuffer : Vkgc::ResourceMappingNodeType::DescriptorTexelBuffer; - userDataNodes[0].node.offsetInDwords = 0; - userDataNodes[0].node.sizeInDwords = uavViewSize; - userDataNodes[0].node.srdRange.set = 0; - userDataNodes[0].node.srdRange.binding = 0; + userDataNodes[0].node.offsetInDwords = 0; + userDataNodes[0].node.sizeInDwords = useStridedShader ? 
untypedViewDwSize : typedViewDwSize; + userDataNodes[0].node.srdRange.set = 0; + userDataNodes[0].node.srdRange.binding = 0; userDataNodes[0].node.srdRange.strideInDwords = 0; - userDataNodes[0].visibility = Vkgc::ShaderStageComputeBit; + userDataNodes[0].visibility = Vkgc::ShaderStageComputeBit; + offset += userDataNodes[0].node.sizeInDwords; // Copy destination storage view - userDataNodes[1].node.type = Vkgc::ResourceMappingNodeType::DescriptorBuffer; - userDataNodes[1].node.offsetInDwords = uavViewSize; - userDataNodes[1].node.sizeInDwords = uavViewSize; - userDataNodes[1].node.srdRange.set = 0; - userDataNodes[1].node.srdRange.binding = 1; + userDataNodes[1].node.type = Vkgc::ResourceMappingNodeType::DescriptorBuffer; + userDataNodes[1].node.offsetInDwords = offset; + userDataNodes[1].node.sizeInDwords = untypedViewDwSize; + userDataNodes[1].node.srdRange.set = 0; + userDataNodes[1].node.srdRange.binding = 1; userDataNodes[1].node.srdRange.strideInDwords = 0; - userDataNodes[1].visibility = Vkgc::ShaderStageComputeBit; + userDataNodes[1].visibility = Vkgc::ShaderStageComputeBit; + offset += userDataNodes[1].node.sizeInDwords; // Inline constant data - userDataNodes[2].node.type = Vkgc::ResourceMappingNodeType::PushConst; - userDataNodes[2].node.offsetInDwords = 2 * uavViewSize; - userDataNodes[2].node.sizeInDwords = 4; - userDataNodes[2].node.srdRange.set = Vkgc::InternalDescriptorSetId; + userDataNodes[2].node.type = Vkgc::ResourceMappingNodeType::PushConst; + userDataNodes[2].node.offsetInDwords = offset; + userDataNodes[2].node.sizeInDwords = 4; + userDataNodes[2].node.srdRange.set = Vkgc::InternalDescriptorSetId; userDataNodes[2].node.srdRange.strideInDwords = 0; - userDataNodes[2].visibility = Vkgc::ShaderStageComputeBit; + userDataNodes[2].visibility = Vkgc::ShaderStageComputeBit; result = CreateInternalComputePipeline( spvCodeSize, @@ -4176,6 +4186,7 @@ VkResult Device::GetDeviceFaultInfoEXT( { m_retrievedFaultData = true; } + } if 
(m_pageFaultStatus.flags.pageFault == false) @@ -4218,6 +4229,7 @@ VkResult Device::GetDeviceFaultInfoEXT( VK_DEVICE_FAULT_ADDRESS_TYPE_WRITE_INVALID_EXT; pAddressInfo->reportedAddress = static_cast(m_pageFaultStatus.faultAddress); pAddressInfo->addressPrecision = 4096; + } return result; @@ -5404,6 +5416,69 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetDeviceFaultInfoEXT( return pDevice->GetDeviceFaultInfoEXT(pFaultCounts, pFaultInfo); } +// ===================================================================================================================== +VKAPI_ATTR VkResult VKAPI_CALL vkCreatePipelineBinariesKHR( + VkDevice device, + const VkPipelineBinaryCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipelineBinaryHandlesInfoKHR* pBinaries) +{ + const auto pDevice = ApiDevice::ObjectFromHandle(device); + const auto pAllocCB = (pAllocator != nullptr) ? pAllocator : pDevice->VkInstance()->GetAllocCallbacks(); + + return PipelineBinary::CreatePipelineBinaries(pDevice, pCreateInfo, pAllocCB, pBinaries); +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkDestroyPipelineBinaryKHR( + VkDevice device, + VkPipelineBinaryKHR pipelineBinary, + const VkAllocationCallbacks* pAllocator) +{ + const auto pDevice = ApiDevice::ObjectFromHandle(device); + const auto pAllocCB = (pAllocator != nullptr) ? 
pAllocator : pDevice->VkInstance()->GetAllocCallbacks(); + const auto pBinary = PipelineBinary::ObjectFromHandle(pipelineBinary); + + pBinary->DestroyPipelineBinary(pDevice, pAllocCB); +} + +// ===================================================================================================================== +VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineKeyKHR( + VkDevice device, + const VkPipelineCreateInfoKHR* pPipelineCreateInfo, + VkPipelineBinaryKeyKHR* pPipelineBinaryKey) +{ + const auto pDevice = ApiDevice::ObjectFromHandle(device); + + return PipelineBinary::GetPipelineKey(pDevice, pPipelineCreateInfo, pPipelineBinaryKey); +} + +// ===================================================================================================================== +VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineBinaryDataKHR( + VkDevice device, + const VkPipelineBinaryDataInfoKHR* pInfo, + VkPipelineBinaryKeyKHR* pPipelineBinaryKey, + size_t* pPipelineBinaryDataSize, + void* pPipelineBinaryData) +{ + const auto pBinary = PipelineBinary::ObjectFromHandle(pInfo->pipelineBinary); + + return pBinary->GetPipelineBinaryData(pPipelineBinaryKey, pPipelineBinaryDataSize, pPipelineBinaryData); +} + +// ===================================================================================================================== +VKAPI_ATTR VkResult VKAPI_CALL vkReleaseCapturedPipelineDataKHR( + VkDevice device, + const VkReleaseCapturedPipelineDataInfoKHR* pInfo, + const VkAllocationCallbacks* pAllocator) +{ + const auto pDevice = ApiDevice::ObjectFromHandle(device); + const auto pPipeline = Pipeline::BaseObjectFromHandle(pInfo->pipeline); + const auto pAllocCB = (pAllocator != nullptr) ? 
pAllocator : pDevice->VkInstance()->GetAllocCallbacks(); + + return PipelineBinary::ReleaseCapturedPipelineData(pDevice, pPipeline, pAllocCB); +} + // ===================================================================================================================== VKAPI_ATTR void VKAPI_CALL vkGetDeviceImageSubresourceLayoutKHR( VkDevice device, diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index 1e00c8e2..50ad9c55 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -387,6 +387,7 @@ void DispatchTable::Init() INIT_DISPATCH_ENTRY(vkGetQueryPoolResults ); INIT_DISPATCH_ENTRY(vkGetRenderAreaGranularity ); INIT_DISPATCH_ENTRY(vkGetRenderingAreaGranularityKHR ); + INIT_DISPATCH_ENTRY(vkGetPhysicalDeviceSurfaceCapabilitiesKHR ); INIT_DISPATCH_ENTRY(vkGetPhysicalDeviceSurfaceCapabilities2KHR ); INIT_DISPATCH_ENTRY(vkGetPhysicalDeviceSurfaceFormatsKHR ); @@ -625,7 +626,6 @@ void DispatchTable::Init() INIT_DISPATCH_ENTRY(vkCmdSetLineStippleEXT ); INIT_DISPATCH_ALIAS(vkCmdSetLineStippleKHR , vkCmdSetLineStippleEXT ); - INIT_DISPATCH_ENTRY(vkSetDeviceMemoryPriorityEXT ); INIT_DISPATCH_ENTRY(vkGetDeviceFaultInfoEXT ); INIT_DISPATCH_ENTRY(vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ); @@ -771,6 +771,7 @@ void DispatchTable::Init() vkCmdResolveImage2 ); INIT_DISPATCH_ENTRY(vkCmdPushDescriptorSetKHR ); INIT_DISPATCH_ENTRY(vkCmdPushDescriptorSetWithTemplateKHR ); + INIT_DISPATCH_ENTRY(vkGetDeviceBufferMemoryRequirements ); INIT_DISPATCH_ENTRY(vkGetDeviceImageMemoryRequirements ); INIT_DISPATCH_ENTRY(vkGetDeviceImageSparseMemoryRequirements ); @@ -840,12 +841,19 @@ void DispatchTable::Init() INIT_DISPATCH_ENTRY(vkCmdPushConstants2KHR ); INIT_DISPATCH_ENTRY(vkCmdPushDescriptorSet2KHR ); INIT_DISPATCH_ENTRY(vkCmdPushDescriptorSetWithTemplate2KHR ); + INIT_DISPATCH_ENTRY(vkCmdSetDescriptorBufferOffsets2EXT ); INIT_DISPATCH_ENTRY(vkCmdBindDescriptorBufferEmbeddedSamplers2EXT ); INIT_DISPATCH_ENTRY(vkCmdSetRenderingAttachmentLocationsKHR ); 
INIT_DISPATCH_ENTRY(vkCmdSetRenderingInputAttachmentIndicesKHR ); + INIT_DISPATCH_ENTRY(vkCreatePipelineBinariesKHR ); + INIT_DISPATCH_ENTRY(vkDestroyPipelineBinaryKHR ); + INIT_DISPATCH_ENTRY(vkGetPipelineKeyKHR ); + INIT_DISPATCH_ENTRY(vkGetPipelineBinaryDataKHR ); + INIT_DISPATCH_ENTRY(vkReleaseCapturedPipelineDataKHR ); + INIT_DISPATCH_ENTRY(vkCmdSetDepthBias2EXT ); } diff --git a/icd/api/vk_gpa_session.cpp b/icd/api/vk_gpa_session.cpp index ea63e919..17e076b8 100644 --- a/icd/api/vk_gpa_session.cpp +++ b/icd/api/vk_gpa_session.cpp @@ -240,17 +240,13 @@ VkResult GpaSession::CmdBeginSample( sampleConfig.flags.sampleInternalOperations = pGpaSampleBeginInfo->sampleInternalOperations; sampleConfig.flags.cacheFlushOnCounterCollection = pGpaSampleBeginInfo->cacheFlushOnCounterCollection; sampleConfig.flags.sqShaderMask = pGpaSampleBeginInfo->sqShaderMaskEnable; - -#if VKI_BUILD_GFX11 sampleConfig.flags.sqWgpShaderMask = pGpaSampleBeginInfo->sqShaderMaskEnable; -#endif + sampleConfig.sqShaderMask = static_cast( VkToPalPerfExperimentShaderFlags(pGpaSampleBeginInfo->sqShaderMask)); -#if VKI_BUILD_GFX11 sampleConfig.sqWgpShaderMask = static_cast( VkToPalPerfExperimentShaderFlags(pGpaSampleBeginInfo->sqShaderMask)); -#endif VirtualStackFrame virtStackFrame(pCmdbuf->GetStackAllocator()); diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 61a497ea..e4a6f094 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ b/icd/api/vk_graphics_pipeline.cpp @@ -30,6 +30,7 @@ #include "include/vk_graphics_pipeline_library.h" #include "include/vk_instance.h" #include "include/vk_memory.h" +#include "include/vk_pipeline_binary.h" #include "include/vk_pipeline_cache.h" #include "include/vk_pipeline_layout.h" #include "include/vk_render_pass.h" @@ -80,6 +81,8 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); bool storeBinaryToCache = true; + storeBinaryToCache = (flags & 
VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR) == 0; + // Load or create the pipeline binary PipelineBinaryCache* pPipelineBinaryCache = (pPipelineCache != nullptr) ? pPipelineCache->GetPipelineCache() : nullptr; @@ -280,9 +283,6 @@ VkResult GraphicsPipeline::CreatePalPipelineObjects( { size_t palSize = 0; - pObjectCreateInfo->pipeline.pipelineBinarySize = pPipelineBinaries[DefaultDeviceIndex].codeSize; - pObjectCreateInfo->pipeline.pPipelineBinary = pPipelineBinaries[DefaultDeviceIndex].pCode; - Pal::Result palResult = Pal::Result::Success; palSize = pDevice->PalDevice(DefaultDeviceIndex)->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, &palResult); @@ -367,6 +367,7 @@ VkResult GraphicsPipeline::CreatePipelineObjects( PipelineCache* pPipelineCache, const Util::MetroHash::Hash* pCacheIds, uint64_t apiPsoHash, + const PipelineBinaryStorage& binaryStorage, GraphicsPipelineObjectCreateInfo* pObjectCreateInfo, VkPipeline* pPipeline) { @@ -386,12 +387,20 @@ VkResult GraphicsPipeline::CreatePipelineObjects( // Get the pipeline size from PAL and allocate memory. 
void* pSystemMem = nullptr; size_t palSize = 0; + pObjectCreateInfo->pipeline.pipelineBinarySize = pPipelineBinaries[DefaultDeviceIndex].codeSize; + pObjectCreateInfo->pipeline.pPipelineBinary = pPipelineBinaries[DefaultDeviceIndex].pCode; palSize = pDevice->PalDevice(DefaultDeviceIndex)->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, &palResult); VK_ASSERT(palResult == Pal::Result::Success); size_t allocationSize = sizeof(GraphicsPipeline) + (palSize * numPalDevices); + const bool storeBinaryToPipeline = (flags & VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR) != 0; + + if (storeBinaryToPipeline) + { + allocationSize += sizeof(PipelineBinaryStorage); + } pSystemMem = pDevice->AllocApiObject( pAllocator, @@ -415,6 +424,8 @@ VkResult GraphicsPipeline::CreatePipelineObjects( pPalPipeline); } + PipelineBinaryStorage* pPermBinaryStorage = nullptr; + if (result == VK_SUCCESS) { bool sampleShadingEnable = pObjectCreateInfo->flags.sampleShadingEnable; @@ -507,6 +518,15 @@ VkResult GraphicsPipeline::CreatePipelineObjects( result = PalToVkResult(palResult); + if ((result == VK_SUCCESS) && storeBinaryToPipeline) + { + size_t pipelineBinaryOffset = sizeof(GraphicsPipeline) + (palSize * numPalDevices); + pPermBinaryStorage = static_cast(Util::VoidPtrInc(pSystemMem, + pipelineBinaryOffset)); + + // Simply copy the existing allocations to the new struct. + memcpy(pPermBinaryStorage, &binaryStorage, sizeof(PipelineBinaryStorage)); + } } // On success, wrap it up in a Vulkan object. 
@@ -516,6 +536,7 @@ VkResult GraphicsPipeline::CreatePipelineObjects( pDevice, pPalPipeline, pPipelineLayout, + pPermBinaryStorage, pObjectCreateInfo->immedInfo, pObjectCreateInfo->staticStateMask, pObjectCreateInfo->flags, @@ -535,7 +556,7 @@ VkResult GraphicsPipeline::CreatePipelineObjects( &palPipelineHasher); *pPipeline = GraphicsPipeline::HandleFromVoidPointer(pSystemMem); - if (pDevice->GetRuntimeSettings().enableDebugPrintf) + if (pDevice->GetEnabledFeatures().enableDebugPrintf) { GraphicsPipeline* pGraphicsPipeline = static_cast(pSystemMem); pGraphicsPipeline->ClearFormatString(); @@ -759,6 +780,39 @@ VkResult GraphicsPipeline::Create( Util::MetroHash::Hash gplCacheId[GraphicsLibraryCount] = {}; uint32_t numShaderLibraries = 0; + auto pPipelineBinaryInfoKHR = extStructs.pPipelineBinaryInfoKHR; + + if (pPipelineBinaryInfoKHR != nullptr) + { + if (pPipelineBinaryInfoKHR->binaryCount > 0) + { + VK_ASSERT(pPipelineBinaryInfoKHR->binaryCount == pDevice->NumPalDevices()); + binariesProvided = true; + } + + for (uint32_t deviceIdx = 0; + deviceIdx < pPipelineBinaryInfoKHR->binaryCount; + ++deviceIdx) + { + const auto pBinary = PipelineBinary::ObjectFromHandle( + pPipelineBinaryInfoKHR->pPipelineBinaries[deviceIdx]); + + cacheId[deviceIdx] = pBinary->BinaryKey(); + pipelineBinaries[deviceIdx] = pBinary->BinaryData(); + + if (deviceIdx == DefaultDeviceIndex) + { + pDefaultCompiler->ReadBinaryMetadata( + pDevice, + pipelineBinaries[deviceIdx], + &binaryMetadata); + } + } + } + + PipelineBinaryStorage binaryStorage = {}; + const bool storeBinaryToPipeline = (flags & VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR) != 0; + VK_ASSERT(pCreateInfo->layout != VK_NULL_HANDLE); pPipelineLayout = PipelineLayout::ObjectFromHandle(pCreateInfo->layout); @@ -905,6 +959,77 @@ VkResult GraphicsPipeline::Create( } } + // 4. 
Store created binaries for pipeline_binary + if ((result == VK_SUCCESS) && storeBinaryToPipeline) + { + if (gplProvided) + { + for (uint32_t gplType = 0; + (gplType < GraphicsLibraryCount) && + (shaderLibraries[gplType] != nullptr) && + Util::TestAnyFlagSet(gplMask, 1 << gplType) && + (result == VK_SUCCESS); + ++gplType) + { + uint32 codeSize = 0; + + shaderLibraries[gplType]->GetCodeObject(&codeSize, nullptr); + + void* pMemory = pAllocator->pfnAllocation( + pAllocator->pUserData, + codeSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); // retained in the pipeline object + + if (pMemory != nullptr) + { + shaderLibraries[gplType]->GetCodeObject(&codeSize, pMemory); + + InsertBinaryData( + &binaryStorage, + gplType, + gplCacheId[gplType], + codeSize, + pMemory); + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + } + else + { + for (uint32_t deviceIdx = 0; (deviceIdx < pDevice->NumPalDevices()) && (result == VK_SUCCESS); ++deviceIdx) + { + void* pMemory = pAllocator->pfnAllocation( + pAllocator->pUserData, + pipelineBinaries[deviceIdx].codeSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); // retained in the pipeline object + + if (pMemory != nullptr) + { + memcpy( + pMemory, + pipelineBinaries[deviceIdx].pCode, + pipelineBinaries[deviceIdx].codeSize); + + InsertBinaryData( + &binaryStorage, + deviceIdx, + cacheId[deviceIdx], + pipelineBinaries[deviceIdx].codeSize, + pMemory); + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + } + } + if (result == VK_SUCCESS) { // 5. Build pipeline object create info @@ -947,9 +1072,15 @@ VkResult GraphicsPipeline::Create( pPipelineCache, cacheId, apiPsoHash, + binaryStorage, &objectCreateInfo, pPipeline); + if (result != VK_SUCCESS) + { + // Free the binaries only if we failed to create the pipeline objects. 
+ FreeBinaryStorage(&binaryStorage, pAllocator); + } } } @@ -1195,6 +1326,7 @@ GraphicsPipeline::GraphicsPipeline( Device* const pDevice, Pal::IPipeline** pPalPipeline, const PipelineLayout* pLayout, + PipelineBinaryStorage* pBinaryStorage, const GraphicsPipelineObjectImmedInfo& immedInfo, uint64_t staticStateMask, GraphicsPipelineObjectFlags flags, @@ -1228,6 +1360,7 @@ GraphicsPipeline::GraphicsPipeline( Pipeline::Init( pPalPipeline, pLayout, + pBinaryStorage, staticStateMask, #if VKI_RAY_TRACING dispatchRaysUserDataOffset, @@ -1725,6 +1858,7 @@ void GraphicsPipeline::BindToCmdBuffer( pRenderState->msaaCreateInfo.sampleClusters = m_info.msaaCreateInfo.sampleClusters; pRenderState->msaaCreateInfo.alphaToCoverageSamples = m_info.msaaCreateInfo.alphaToCoverageSamples; pRenderState->msaaCreateInfo.occlusionQuerySamples = m_info.msaaCreateInfo.occlusionQuerySamples; + if (m_flags.customSampleLocations) { pRenderState->msaaCreateInfo.flags.enable1xMsaaSampleLocations = @@ -2003,7 +2137,8 @@ void GraphicsPipeline::BindToCmdBuffer( } if (ContainsStaticState(DynamicStatesInternal::SampleLocations) && - ContainsStaticState(DynamicStatesInternal::SampleLocationsEnable)) + ContainsStaticState(DynamicStatesInternal::SampleLocationsEnable) && + ContainsStaticState(DynamicStatesInternal::RasterizationSamples)) { if ((pRenderState->sampleLocationsEnable != m_flags.customSampleLocations) || (memcmp(&pRenderState->samplePattern, &m_info.samplePattern, sizeof(SamplePattern)) != 0)) @@ -2019,13 +2154,15 @@ void GraphicsPipeline::BindToCmdBuffer( { if (ContainsStaticState(DynamicStatesInternal::SampleLocations)) { - if (memcmp(&pRenderState->samplePattern, &m_info.samplePattern, sizeof(SamplePattern)) != 0) + if (memcmp(&pRenderState->samplePattern.locations, + &m_info.samplePattern.locations, + sizeof(Pal::MsaaQuadSamplePattern)) != 0) { - pRenderState->samplePattern = m_info.samplePattern; + pRenderState->samplePattern.locations = m_info.samplePattern.locations; 
pRenderState->dirtyGraphics.samplePattern = 1; } } - else if (ContainsStaticState(DynamicStatesInternal::SampleLocationsEnable)) + if (ContainsStaticState(DynamicStatesInternal::SampleLocationsEnable)) { if (pRenderState->sampleLocationsEnable != m_flags.customSampleLocations) { @@ -2033,6 +2170,14 @@ void GraphicsPipeline::BindToCmdBuffer( pRenderState->dirtyGraphics.samplePattern = 1; } } + if (ContainsStaticState(DynamicStatesInternal::RasterizationSamples)) + { + if (pRenderState->samplePattern.sampleCount != m_info.samplePattern.sampleCount) + { + pRenderState->samplePattern.sampleCount = m_info.samplePattern.sampleCount; + pRenderState->dirtyGraphics.samplePattern = 1; + } + } } // Only set the Fragment Shading Rate if the dynamic state is not set. diff --git a/icd/api/vk_graphics_pipeline_library.cpp b/icd/api/vk_graphics_pipeline_library.cpp index 6a9d4700..30cc392e 100644 --- a/icd/api/vk_graphics_pipeline_library.cpp +++ b/icd/api/vk_graphics_pipeline_library.cpp @@ -24,6 +24,7 @@ **********************************************************************************************************************/ #include "include/vk_graphics_pipeline_library.h" +#include "include/vk_pipeline_binary.h" #include "include/vk_pipeline_layout.h" #include "palVectorImpl.h" @@ -335,6 +336,26 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( &pBinaryCreateInfo->pipelineInfo.fs, }; + auto pPipelineBinaryInfoKHR = extStructs.pPipelineBinaryInfoKHR; + PipelineBinaryInfo providedBinaries[GraphicsLibraryCount] = {}; + + if (pPipelineBinaryInfoKHR != nullptr) + { + for (uint32_t binaryIndex = 0; (binaryIndex < pPipelineBinaryInfoKHR->binaryCount); ++binaryIndex) + { + const auto pBinary = PipelineBinary::ObjectFromHandle( + pPipelineBinaryInfoKHR->pPipelineBinaries[binaryIndex]); + + // Retrieve the GraphicsLibraryType identifier from the binary + GraphicsLibraryType gplType = *static_cast(pBinary->BinaryData().pCode); + + providedBinaries[gplType].binaryHash = 
pBinary->BinaryKey(); + providedBinaries[gplType].pipelineBinary.codeSize = pBinary->BinaryData().codeSize; + providedBinaries[gplType].pipelineBinary.pCode = + Util::VoidPtrInc(pBinary->BinaryData().pCode, sizeof(GraphicsLibraryType)); + } + } + uint32_t gplMask = 0; for (uint32_t i = 0; i < ShaderStage::ShaderStageGfxCount; ++i) { @@ -364,6 +385,8 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( pPipelineCache, gplType, pBinaryCreateInfo, + &providedBinaries[gplType].pipelineBinary, + &providedBinaries[gplType].binaryHash, &pTempModuleStages[i]); gplMask |= (1 << gplType); } @@ -393,6 +416,8 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( pPipelineCache, GraphicsLibraryPreRaster, pBinaryCreateInfo, + nullptr, + nullptr, &pTempModuleStages[TempIdx]); } @@ -409,6 +434,8 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( pPipelineCache, GraphicsLibraryFragment, pBinaryCreateInfo, + nullptr, + nullptr, &pTempModuleStages[TempIdx]); } } @@ -551,6 +578,9 @@ VkResult GraphicsPipelineLibrary::Create( &binaryCreateInfo); } + PipelineBinaryStorage binaryStorage = {}; + const bool storeBinaryToPipeline = (flags & VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR) != 0; + if (result == VK_SUCCESS) { // 5. Create partial pipeline binary for fast-link @@ -565,6 +595,53 @@ VkResult GraphicsPipelineLibrary::Create( pAllocator, tempModuleStates); + // 6. 
Store created binaries for pipeline_binary + if ((result == VK_SUCCESS) && storeBinaryToPipeline) + { + uint32 binaryIndex = 0; + + for (uint32_t gplType = 0; gplType < GraphicsLibraryCount; ++gplType) + { + if ((binaryCreateInfo.earlyElfPackage[gplType].codeSize != 0) && + (binaryCreateInfo.earlyElfPackage[gplType].pCode != nullptr) && + (result == VK_SUCCESS)) + { + const size_t storageSize = sizeof(GraphicsLibraryType) + + binaryCreateInfo.earlyElfPackage[gplType].codeSize; + + void* pMemory = pAllocator->pfnAllocation( + pAllocator->pUserData, + storageSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); // retained in the GPL pipeline object + + if (pMemory != nullptr) + { + // Store the GraphicsLibraryType identifier with the binary + *static_cast(pMemory) = static_cast(gplType); + + memcpy( + Util::VoidPtrInc(pMemory, sizeof(GraphicsLibraryType)), + binaryCreateInfo.earlyElfPackage[gplType].pCode, + binaryCreateInfo.earlyElfPackage[gplType].codeSize); + + InsertBinaryData( + &binaryStorage, + binaryIndex, + binaryCreateInfo.earlyElfPackageHash[gplType], + storageSize, + pMemory); + + ++binaryIndex; + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + } + } + // Clean up temporary storage for (uint32_t stage = 0; stage < ShaderStage::ShaderStageGfxCount; ++stage) { @@ -585,6 +662,7 @@ VkResult GraphicsPipelineLibrary::Create( GraphicsPipelineObjectCreateInfo objectCreateInfo = {}; size_t auxiliarySize = 0; + PipelineBinaryStorage* pPermBinaryStorage = nullptr; if (result == VK_SUCCESS) { @@ -607,6 +685,11 @@ VkResult GraphicsPipelineLibrary::Create( size_t objSize = apiSize + auxiliarySize; + if (storeBinaryToPipeline) + { + objSize += sizeof(PipelineBinaryStorage); + } + // Allocate memory pSysMem = pDevice->AllocApiObject(pAllocator, objSize); @@ -621,6 +704,17 @@ VkResult GraphicsPipelineLibrary::Create( GraphicsPipelineBinaryCreateInfo* pBinInfo = DumpGraphicsPipelineBinaryCreateInfo(&binaryCreateInfo, Util::VoidPtrInc(pSysMem, 
apiSize), nullptr); + if (storeBinaryToPipeline) + { + size_t pipelineBinaryOffset = apiSize + auxiliarySize; + + pPermBinaryStorage = static_cast(Util::VoidPtrInc(pSysMem, + pipelineBinaryOffset)); + + // Simply copy the existing allocations to the new struct. + memcpy(pPermBinaryStorage, &binaryStorage, sizeof(PipelineBinaryStorage)); + } + VK_PLACEMENT_NEW(pSysMem) GraphicsPipelineLibrary( pDevice, objectCreateInfo, @@ -629,6 +723,7 @@ VkResult GraphicsPipelineLibrary::Create( elfHash, apiPsoHash, tempModuleStates, + pPermBinaryStorage, pPipelineLayout); *pPipeline = GraphicsPipelineLibrary::HandleFromVoidPointer(pSysMem); @@ -773,6 +868,7 @@ GraphicsPipelineLibrary::GraphicsPipelineLibrary( const Util::MetroHash::Hash& elfHash, const uint64_t apiHash, const GplModuleState* pGplModuleStates, + PipelineBinaryStorage* pBinaryStorage, const PipelineLayout* pPipelineLayout) : GraphicsPipelineCommon( #if VKI_RAY_TRACING @@ -789,6 +885,7 @@ GraphicsPipelineLibrary::GraphicsPipelineLibrary( Pipeline::Init( nullptr, pPipelineLayout, + pBinaryStorage, objectInfo.staticStateMask, #if VKI_RAY_TRACING 0, diff --git a/icd/api/vk_indirect_commands_layout.cpp b/icd/api/vk_indirect_commands_layout.cpp index a05c1799..1a49ad9e 100644 --- a/icd/api/vk_indirect_commands_layout.cpp +++ b/icd/api/vk_indirect_commands_layout.cpp @@ -235,22 +235,22 @@ void IndirectCommandsLayoutNV::BuildPalCreateInfo( Pal::IndirectParam* pIndirectParams, Pal::IndirectCmdGeneratorCreateInfo* pPalCreateInfo) { - uint32_t paramCount = 0; - uint32_t expectedOffset = 0; - uint32_t bindingArgsSize = 0; + uint32_t paramCount = 0u; + uint32_t expectedOffset = 0u; + uint32_t bindingArgsSize = 0u; bool useNativeIndexType = true; - const bool isDispatch = (pCreateInfo->pTokens[pCreateInfo->tokenCount - 1].tokenType - == VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV); + const bool isDispatch = (pCreateInfo->pTokens[pCreateInfo->tokenCount - 1].tokenType == + VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV); for 
(uint32_t i = 0; i < pCreateInfo->tokenCount; ++i) { const VkIndirectCommandsLayoutTokenNV& token = pCreateInfo->pTokens[i]; -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 889 // Set a padding operation to handle non tightly packed indirect arguments buffers VK_ASSERT(token.offset >= expectedOffset); + if (token.offset > expectedOffset) { pIndirectParams[paramCount].type = Pal::IndirectParamType::Padding; @@ -259,7 +259,6 @@ void IndirectCommandsLayoutNV::BuildPalCreateInfo( bindingArgsSize += pIndirectParams[paramCount].sizeInBytes; paramCount++; } -#endif switch (token.tokenType) { @@ -359,9 +358,12 @@ void IndirectCommandsLayoutNV::BuildPalCreateInfo( constexpr uint32_t DxgiIndexTypeUint16 = 57; constexpr uint32_t DxgiIndexTypeUint32 = 42; - pPalCreateInfo->indexTypeTokens[0] = useNativeIndexType ? VK_INDEX_TYPE_UINT8_KHR : DxgiIndexTypeUint8; - pPalCreateInfo->indexTypeTokens[1] = useNativeIndexType ? VK_INDEX_TYPE_UINT16 : DxgiIndexTypeUint16; - pPalCreateInfo->indexTypeTokens[2] = useNativeIndexType ? VK_INDEX_TYPE_UINT32 : DxgiIndexTypeUint32; + pPalCreateInfo->indexTypeTokens[0] = useNativeIndexType ? + static_cast(VK_INDEX_TYPE_UINT8_KHR) : DxgiIndexTypeUint8; + pPalCreateInfo->indexTypeTokens[1] = useNativeIndexType ? + static_cast(VK_INDEX_TYPE_UINT16) : DxgiIndexTypeUint16; + pPalCreateInfo->indexTypeTokens[2] = useNativeIndexType ? 
+ static_cast(VK_INDEX_TYPE_UINT32) : DxgiIndexTypeUint32; } // ===================================================================================================================== diff --git a/icd/api/vk_memory.cpp b/icd/api/vk_memory.cpp index 171c6504..a016df9b 100644 --- a/icd/api/vk_memory.cpp +++ b/icd/api/vk_memory.cpp @@ -304,7 +304,8 @@ VkResult Memory::Create( pNext = pHeader->pNext; } - // For the descriptor table VA range for descriptor buffers + // Use the descriptor table VA range for descriptor buffers because we need to program descriptors + // with a single (32-bit) user data entry and there is no such guarantee with the default VA range. if (pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetMemoryTypeMaskForDescriptorBuffers() & (1 << pAllocInfo->memoryTypeIndex)) { diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index 01d28850..bf67f464 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -726,7 +726,7 @@ void PhysicalDevice::InitializePlatformKey( // - Be a valid UUID generated using normal means // // Settings: -// - markPipelineCacheWithBuildTimestamp: decides whether to mix in __DATE__ __TIME__ from compiler to UUID +// - markPipelineCacheWithBuildTimestamp: decides whether to mix in current library BuildId from compiler to UUID // - useGlobalCacheId : decides if UUID should be portable between machines // static void GenerateCacheUuid( @@ -1667,14 +1667,7 @@ size_t PhysicalDevice::GetFeatures( pFeatures->shaderInt64 = (PalProperties().gfxipProperties.flags.support64BitInstructions ? 
VK_TRUE : VK_FALSE); - if (Is16BitInstructionsSupported()) - { - pFeatures->shaderInt16 = VK_TRUE; - } - else - { - pFeatures->shaderInt16 = VK_FALSE; - } + pFeatures->shaderInt16 = VK_TRUE; if (settings.optEnablePrt) { @@ -4399,12 +4392,10 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_BORDER_COLOR_SWIZZLE)); } -#if VKI_BUILD_GFX11 if ((pPhysicalDevice == nullptr) || (pPhysicalDevice->PalProperties().gfxLevel >= Pal::GfxIpLevel::GfxIp11_0)) { availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_PRIMITIVES_GENERATED_QUERY)); } -#endif if (IsKhrCooperativeMatrixSupported(pPhysicalDevice)) { @@ -4462,6 +4453,8 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_NON_SEAMLESS_CUBE_MAP)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_MODULE_IDENTIFIER)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_PIPELINE_BINARY)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_MAXIMAL_RECONVERGENCE)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_EXTENDED_DYNAMIC_STATE3)); @@ -5138,14 +5131,12 @@ void PhysicalDevice::GetPhysicalDeviceDotProduct8Properties( *pIntegerDotProductAccumulatingSaturating8BitUnsignedAccelerated = int8DotSupport; *pIntegerDotProductAccumulatingSaturating8BitSignedAccelerated = int8DotSupport; -#if VKI_BUILD_GFX11 if (PalProperties().gfxLevel >= Pal::GfxIpLevel::GfxIp11_0) { *pIntegerDotProduct8BitMixedSignednessAccelerated = VK_TRUE; *pIntegerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = VK_TRUE; } else -#endif { *pIntegerDotProduct8BitMixedSignednessAccelerated = VK_FALSE; *pIntegerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = VK_FALSE; @@ -5170,14 +5161,12 @@ void PhysicalDevice::GetPhysicalDeviceDotProduct4x8Properties( *pIntegerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = 
int8DotSupport; *pIntegerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = int8DotSupport; -#if VKI_BUILD_GFX11 - if (PalProperties().gfxLevel >= Pal::GfxIpLevel::GfxIp11_0) + if (PalProperties().gfxipProperties.flags.supportMixedSignIntDot) { *pIntegerDotProduct4x8BitPackedMixedSignednessAccelerated = VK_TRUE; *pIntegerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = VK_TRUE; } else -#endif { *pIntegerDotProduct4x8BitPackedMixedSignednessAccelerated = VK_FALSE; *pIntegerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = VK_FALSE; @@ -5195,9 +5184,7 @@ void PhysicalDevice::GetPhysicalDeviceDotProduct16Properties( ) const { const VkBool32 int16DotSupport = (Is16BitInstructionsSupported() -#if VKI_BUILD_GFX11 && (PalProperties().gfxLevel < Pal::GfxIpLevel::GfxIp11_0) -#endif #if VKI_BUILD_GFX115 && (PalProperties().gfxLevel < Pal::GfxIpLevel::GfxIp11_5) #endif @@ -5269,6 +5256,54 @@ void PhysicalDevice::GetDevicePropertiesMaxBufferSize( *pMaxBufferSize = 2u * 1024u * 1024u * 1024u; // TODO: replace with actual size } +// ===================================================================================================================== +void PhysicalDevice::GetPhysicalDeviceLineSubPixelPrecisionBits( + uint32_t* pLineSubPixelPrecisionBits +) const +{ + *pLineSubPixelPrecisionBits = Pal::SubPixelBits; +} + +// ===================================================================================================================== +void PhysicalDevice::GetPhysicalDeviceVertexAttributeDivisorProperties( + uint32_t* pMaxVertexAttribDivisor, + VkBool32* pSupportsNonZeroFirstInstance +) const +{ + *pMaxVertexAttribDivisor = UINT32_MAX; + *pSupportsNonZeroFirstInstance = VK_TRUE; +} + +// ===================================================================================================================== +void PhysicalDevice::GetPhysicalDeviceMaintenance5Properties( + VkBool32* 
pEarlyFragmentMultisampleCoverageAfterSampleCounting, + VkBool32* pEarlyFragmentSampleMaskTestBeforeSampleCounting, + VkBool32* pDepthStencilSwizzleOneSupport, + VkBool32* pPolygonModePointSize, + VkBool32* pNonStrictSinglePixelWideLinesUseParallelogram, + VkBool32* pNonStrictWideLinesUseParallelogram +) const +{ + *pEarlyFragmentMultisampleCoverageAfterSampleCounting = VK_TRUE; + *pEarlyFragmentSampleMaskTestBeforeSampleCounting = VK_TRUE; + *pDepthStencilSwizzleOneSupport = VK_TRUE; + *pPolygonModePointSize = VK_TRUE; + *pNonStrictSinglePixelWideLinesUseParallelogram = VK_TRUE; + *pNonStrictWideLinesUseParallelogram = VK_TRUE; +} + +// ===================================================================================================================== +void PhysicalDevice::GetPhysicalDeviceMaintenance6Properties( + VkBool32* pBlockTexelViewCompatibleMultipleLayers, + uint32_t* pMaxCombinedImageSamplerDescriptorCount, + VkBool32* pFragmentShadingRateClampCombinerInputs +) const +{ + *pBlockTexelViewCompatibleMultipleLayers = VK_TRUE; + *pMaxCombinedImageSamplerDescriptorCount = MaxCombinedImageSamplerDescriptorCount; + *pFragmentShadingRateClampCombinerInputs = VK_TRUE; +} + // ===================================================================================================================== void PhysicalDevice::GetPhysicalDeviceDriverProperties( VkDriverId* pDriverID, @@ -5544,14 +5579,7 @@ void PhysicalDevice::GetPhysicalDeviceSamplerYcbcrConversionFeatures( VkBool32* pSamplerYcbcrConversion ) const { - if (IsExtensionSupported(DeviceExtensions::KHR_SAMPLER_YCBCR_CONVERSION)) - { - *pSamplerYcbcrConversion = VK_TRUE; - } - else - { - *pSamplerYcbcrConversion = VK_FALSE; - } + *pSamplerYcbcrConversion = VK_TRUE; } // ===================================================================================================================== @@ -5723,6 +5751,23 @@ void PhysicalDevice::GetPhysicalDeviceVulkanMemoryModelFeatures( } +void 
PhysicalDevice::GetPhysicalDeviceLineRasterizationFeatures( + VkBool32* pRectangularLines, + VkBool32* pBresenhamLines, + VkBool32* pSmoothLines, + VkBool32* pStippledRectangularLines, + VkBool32* pStippledBresenhamLines, + VkBool32* pStippledSmoothLines +) const +{ + *pRectangularLines = VK_FALSE; + *pBresenhamLines = VK_TRUE; + *pSmoothLines = VK_FALSE; + *pStippledRectangularLines = VK_FALSE; + *pStippledBresenhamLines = VK_TRUE; + *pStippledSmoothLines = VK_FALSE; +} + // ===================================================================================================================== // Retrieve device feature support. Called in response to vkGetPhysicalDeviceFeatures2 // NOTE: Don't memset here. Otherwise, VerifyRequestedPhysicalDeviceFeatures needs to compare member by member @@ -6230,13 +6275,12 @@ size_t PhysicalDevice::GetFeatures2( if (updateFeatures) { - pExtInfo->rectangularLines = VK_FALSE; - pExtInfo->bresenhamLines = VK_TRUE; - pExtInfo->smoothLines = VK_FALSE; - - pExtInfo->stippledRectangularLines = VK_FALSE; - pExtInfo->stippledBresenhamLines = VK_TRUE; - pExtInfo->stippledSmoothLines = VK_FALSE; + GetPhysicalDeviceLineRasterizationFeatures(&pExtInfo->rectangularLines, + &pExtInfo->bresenhamLines, + &pExtInfo->smoothLines, + &pExtInfo->stippledRectangularLines, + &pExtInfo->stippledBresenhamLines, + &pExtInfo->stippledSmoothLines); } structSize = sizeof(*pExtInfo); @@ -6851,7 +6895,6 @@ size_t PhysicalDevice::GetFeatures2( structSize = sizeof(*pExtInfo); break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GRAPHICS_PIPELINE_LIBRARY_FEATURES_EXT: { auto* pExtInfo = reinterpret_cast(pHeader); @@ -7470,6 +7513,19 @@ size_t PhysicalDevice::GetFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_BINARY_FEATURES_KHR: + { + auto* pExtInfo = reinterpret_cast(pHeader); + + if (updateFeatures) + { + pExtInfo->pipelineBinaries = VK_TRUE; + } + + structSize = sizeof(*pExtInfo); + break; + } + default: { // skip any unsupported extension 
structures @@ -7973,7 +8029,8 @@ void PhysicalDevice::GetDeviceProperties2( { auto* pProps = static_cast(pNext); - pProps->lineSubPixelPrecisionBits = Pal::SubPixelBits; + GetPhysicalDeviceLineSubPixelPrecisionBits( + &pProps->lineSubPixelPrecisionBits); break; } @@ -8264,25 +8321,26 @@ void PhysicalDevice::GetDeviceProperties2( pProps->samplerCaptureReplayDescriptorDataSize = sizeof(uint32_t); pProps->accelerationStructureCaptureReplayDescriptorDataSize = sizeof(uint32_t); - VK_ASSERT(palProps.gfxipProperties.srdSizes.sampler <= 32); - VK_ASSERT(palProps.gfxipProperties.srdSizes.imageView <= 64); - VK_ASSERT(palProps.gfxipProperties.srdSizes.bufferView <= 64); + VK_ASSERT(palProps.gfxipProperties.srdSizes.sampler <= 32); + VK_ASSERT(palProps.gfxipProperties.srdSizes.imageView <= 64); + VK_ASSERT(palProps.gfxipProperties.srdSizes.typedBufferView <= 64); + VK_ASSERT(palProps.gfxipProperties.srdSizes.untypedBufferView <= 64); pProps->samplerDescriptorSize = palProps.gfxipProperties.srdSizes.sampler; pProps->combinedImageSamplerDescriptorSize = palProps.gfxipProperties.srdSizes.sampler + palProps.gfxipProperties.srdSizes.imageView; pProps->sampledImageDescriptorSize = palProps.gfxipProperties.srdSizes.imageView; pProps->storageImageDescriptorSize = palProps.gfxipProperties.srdSizes.imageView; - pProps->uniformTexelBufferDescriptorSize = palProps.gfxipProperties.srdSizes.bufferView; - pProps->robustUniformTexelBufferDescriptorSize = palProps.gfxipProperties.srdSizes.bufferView; - pProps->storageTexelBufferDescriptorSize = palProps.gfxipProperties.srdSizes.bufferView; - pProps->robustStorageTexelBufferDescriptorSize = palProps.gfxipProperties.srdSizes.bufferView; - pProps->uniformBufferDescriptorSize = palProps.gfxipProperties.srdSizes.bufferView; - pProps->robustUniformBufferDescriptorSize = palProps.gfxipProperties.srdSizes.bufferView; - pProps->storageBufferDescriptorSize = palProps.gfxipProperties.srdSizes.bufferView; - pProps->robustStorageBufferDescriptorSize = 
palProps.gfxipProperties.srdSizes.bufferView; + pProps->uniformTexelBufferDescriptorSize = palProps.gfxipProperties.srdSizes.typedBufferView; + pProps->robustUniformTexelBufferDescriptorSize = palProps.gfxipProperties.srdSizes.typedBufferView; + pProps->storageTexelBufferDescriptorSize = palProps.gfxipProperties.srdSizes.typedBufferView; + pProps->robustStorageTexelBufferDescriptorSize = palProps.gfxipProperties.srdSizes.typedBufferView; + pProps->uniformBufferDescriptorSize = palProps.gfxipProperties.srdSizes.untypedBufferView; + pProps->robustUniformBufferDescriptorSize = palProps.gfxipProperties.srdSizes.untypedBufferView; + pProps->storageBufferDescriptorSize = palProps.gfxipProperties.srdSizes.untypedBufferView; + pProps->robustStorageBufferDescriptorSize = palProps.gfxipProperties.srdSizes.untypedBufferView; pProps->inputAttachmentDescriptorSize = palProps.gfxipProperties.srdSizes.imageView; - pProps->accelerationStructureDescriptorSize = palProps.gfxipProperties.srdSizes.bufferView; + pProps->accelerationStructureDescriptorSize = palProps.gfxipProperties.srdSizes.untypedBufferView; pProps->maxSamplerDescriptorBufferRange = UINT_MAX; pProps->maxResourceDescriptorBufferRange = UINT_MAX; pProps->resourceDescriptorBufferAddressSpaceSize = UINT_MAX; @@ -8334,9 +8392,9 @@ void PhysicalDevice::GetDeviceProperties2( { auto* pProps = static_cast(pNext); - pProps->blockTexelViewCompatibleMultipleLayers = VK_TRUE; - pProps->maxCombinedImageSamplerDescriptorCount = MaxCombinedImageSamplerDescriptorCount; - pProps->fragmentShadingRateClampCombinerInputs = VK_TRUE; + GetPhysicalDeviceMaintenance6Properties(&pProps->blockTexelViewCompatibleMultipleLayers, + &pProps->maxCombinedImageSamplerDescriptorCount, + &pProps->fragmentShadingRateClampCombinerInputs); break; } @@ -8424,13 +8482,11 @@ void PhysicalDevice::GetDeviceProperties2( pProps->maxMeshOutputVertices = 256; pProps->maxMeshOutputPrimitives = 256; - #if VKI_BUILD_GFX11 if (palProps.gfxLevel >= 
Pal::GfxIpLevel::GfxIp11_0) { pProps->maxMeshOutputLayers = m_limits.maxFramebufferLayers; } else -#endif { pProps->maxMeshOutputLayers = 8; } @@ -8487,8 +8543,8 @@ void PhysicalDevice::GetDeviceProperties2( { auto* pProps = static_cast(pNext); - pProps->maxVertexAttribDivisor = UINT32_MAX; - pProps->supportsNonZeroFirstInstance = VK_TRUE; + GetPhysicalDeviceVertexAttributeDivisorProperties(&pProps->maxVertexAttribDivisor, + &pProps->supportsNonZeroFirstInstance); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_3_PROPERTIES_EXT: @@ -8513,6 +8569,27 @@ void PhysicalDevice::GetDeviceProperties2( } #endif + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_BINARY_PROPERTIES_KHR: + { + auto* pProps = static_cast(pNext); + pProps->pipelineBinaryInternalCache = VK_TRUE; + pProps->pipelineBinaryInternalCacheControl = VK_FALSE; + pProps->pipelineBinaryPrefersInternalCache = VK_FALSE; + pProps->pipelineBinaryCompressedData = VK_FALSE; + + if ((getenv(PipelineBinaryCache::EnvVarPath) != nullptr) || + (getenv(PipelineBinaryCache::EnvVarReadOnlyFileName) != nullptr)) + { + pProps->pipelineBinaryPrecompiledInternalCache = VK_TRUE; + } + else + { + pProps->pipelineBinaryPrecompiledInternalCache = VK_FALSE; + } + + break; + } + #if VKI_COPY_MEMORY_INDIRECT case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COPY_MEMORY_INDIRECT_PROPERTIES_KHR: { @@ -8816,21 +8893,9 @@ static void VerifyLimits( VK_ASSERT(limits.maxImageDimension3D >= 256); VK_ASSERT(limits.maxImageDimensionCube >= 4096); VK_ASSERT(limits.maxImageArrayLayers >= 256); - VK_ASSERT(limits.maxTexelBufferElements >= 65536); VK_ASSERT(limits.maxUniformBufferRange >= 16384); - VK_ASSERT(limits.maxStorageBufferRange >= (1UL << 27)); - VK_ASSERT(limits.maxPushConstantsSize >= 128); - VK_ASSERT(limits.maxMemoryAllocationCount >= 4096); - VK_ASSERT(limits.maxSamplerAllocationCount >= 4000); VK_ASSERT(limits.bufferImageGranularity <= 131072); - VK_ASSERT(limits.sparseAddressSpaceSize >= (features.sparseBinding ? 
(1ULL << 31) : 0)); - VK_ASSERT(limits.maxBoundDescriptorSets >= 4); - VK_ASSERT(limits.maxPerStageDescriptorSamplers >= 16); VK_ASSERT(limits.maxPerStageDescriptorUniformBuffers >= 12); - VK_ASSERT(limits.maxPerStageDescriptorStorageBuffers >= 4); - VK_ASSERT(limits.maxPerStageDescriptorSampledImages >= 16); - VK_ASSERT(limits.maxPerStageDescriptorStorageImages >= 4); - VK_ASSERT(limits.maxPerStageDescriptorInputAttachments >= 4); const uint64_t reqMaxPerStageResources = Util::Min( static_cast(limits.maxPerStageDescriptorUniformBuffers) + @@ -8841,14 +8906,59 @@ static void VerifyLimits( static_cast(limits.maxColorAttachments), static_cast(128)); + VK_ASSERT(limits.maxDescriptorSetUniformBuffers >= 72); + VK_ASSERT(limits.maxDescriptorSetStorageBuffers >= 24); + VK_ASSERT(limits.maxDescriptorSetStorageImages >= 24); + VK_ASSERT(limits.maxFragmentCombinedOutputResources >= 4); + VK_ASSERT(limits.maxComputeWorkGroupInvocations >= 128); + VK_ASSERT(limits.maxComputeWorkGroupSize[0] >= 128); + VK_ASSERT(limits.maxComputeWorkGroupSize[1] >= 128); + VK_ASSERT(limits.maxComputeWorkGroupSize[2] >= 64); + VK_ASSERT(limits.subTexelPrecisionBits >= 4); + VK_ASSERT(limits.mipmapPrecisionBits >= 4); + VK_ASSERT(limits.maxSamplerLodBias >= 2); + VK_ASSERT(limits.maxBoundDescriptorSets >= 4); + VK_ASSERT(limits.maxColorAttachments >= 4); + VK_ASSERT(limits.maxPushConstantsSize >= 128); + + if (features.largePoints) + { + VK_ASSERT(limits.pointSizeRange[0] <= 1.0f); + VK_ASSERT(limits.pointSizeRange[1] >= 64.0f - limits.pointSizeGranularity); + + VK_ASSERT(limits.pointSizeGranularity <= 1.0f); + } + else + { + VK_ASSERT(limits.pointSizeRange[0] == 1.0f); + VK_ASSERT(limits.pointSizeRange[1] == 1.0f); + VK_ASSERT(limits.pointSizeGranularity == 0.0f); + } + + if (features.wideLines) + { + VK_ASSERT(limits.lineWidthGranularity <= 1.0f); + } + else + { + VK_ASSERT(limits.lineWidthGranularity == 0.0f); + } + VK_ASSERT(limits.maxTexelBufferElements >= 65536); + 
VK_ASSERT(limits.maxStorageBufferRange >= (1UL << 27)); + VK_ASSERT(limits.maxMemoryAllocationCount >= 4096); + VK_ASSERT(limits.maxSamplerAllocationCount >= 4000); + VK_ASSERT(limits.sparseAddressSpaceSize >= (features.sparseBinding ? (1ULL << 31) : 0)); + VK_ASSERT(limits.maxPerStageDescriptorSamplers >= 16); + VK_ASSERT(limits.maxPerStageDescriptorStorageBuffers >= 4); + VK_ASSERT(limits.maxPerStageDescriptorSampledImages >= 16); + VK_ASSERT(limits.maxPerStageDescriptorStorageImages >= 4); + VK_ASSERT(limits.maxPerStageDescriptorInputAttachments >= 4); + VK_ASSERT(limits.maxPerStageResources >= reqMaxPerStageResources); VK_ASSERT(limits.maxDescriptorSetSamplers >= 96); - VK_ASSERT(limits.maxDescriptorSetUniformBuffers >= 72); VK_ASSERT(limits.maxDescriptorSetUniformBuffersDynamic >= 8); - VK_ASSERT(limits.maxDescriptorSetStorageBuffers >= 24); VK_ASSERT(limits.maxDescriptorSetStorageBuffersDynamic >= 4); VK_ASSERT(limits.maxDescriptorSetSampledImages >= 96); - VK_ASSERT(limits.maxDescriptorSetStorageImages >= 24); VK_ASSERT(limits.maxDescriptorSetInputAttachments >= 4); VK_ASSERT(limits.maxVertexInputAttributes >= 16); VK_ASSERT(limits.maxVertexInputBindings >= 16); @@ -8914,18 +9024,11 @@ static void VerifyLimits( VK_ASSERT(limits.maxFragmentDualSrcAttachments == 0); } - VK_ASSERT(limits.maxFragmentCombinedOutputResources >= 4); VK_ASSERT(limits.maxComputeSharedMemorySize >= 16384); VK_ASSERT(limits.maxComputeWorkGroupCount[0] >= 65535); VK_ASSERT(limits.maxComputeWorkGroupCount[1] >= 65535); VK_ASSERT(limits.maxComputeWorkGroupCount[2] >= 65535); - VK_ASSERT(limits.maxComputeWorkGroupInvocations >= 128); - VK_ASSERT(limits.maxComputeWorkGroupSize[0] >= 128); - VK_ASSERT(limits.maxComputeWorkGroupSize[1] >= 128); - VK_ASSERT(limits.maxComputeWorkGroupSize[2] >= 64); VK_ASSERT(limits.subPixelPrecisionBits >= 4); - VK_ASSERT(limits.subTexelPrecisionBits >= 4); - VK_ASSERT(limits.mipmapPrecisionBits >= 4); VK_ASSERT(features.fullDrawIndexUint32); @@ -8947,8 
+9050,6 @@ static void VerifyLimits( VK_ASSERT(limits.maxDrawIndirectCount == 1); } - VK_ASSERT(limits.maxSamplerLodBias >= 2); - VK_ASSERT(features.samplerAnisotropy); if (features.samplerAnisotropy) @@ -9030,7 +9131,6 @@ static void VerifyLimits( VK_ASSERT(limits.framebufferStencilSampleCounts & VK_SAMPLE_COUNT_4_BIT); VK_ASSERT(limits.framebufferNoAttachmentsSampleCounts & VK_SAMPLE_COUNT_1_BIT); VK_ASSERT(limits.framebufferNoAttachmentsSampleCounts & VK_SAMPLE_COUNT_4_BIT); - VK_ASSERT(limits.maxColorAttachments >= 4); VK_ASSERT(limits.sampledImageColorSampleCounts & VK_SAMPLE_COUNT_1_BIT); VK_ASSERT(limits.sampledImageColorSampleCounts & VK_SAMPLE_COUNT_4_BIT); VK_ASSERT(limits.sampledImageIntegerSampleCounts & VK_SAMPLE_COUNT_1_BIT); @@ -9079,21 +9179,6 @@ static void VerifyLimits( VK_ASSERT(limits.discreteQueuePriorities >= 2); - VK_ASSERT(features.largePoints); - - if (features.largePoints) - { - const float ULP = limits.pointSizeGranularity; - - VK_ASSERT(limits.pointSizeRange[0] <= 1.0f); - VK_ASSERT(limits.pointSizeRange[1] >= 64.0f - limits.pointSizeGranularity); - } - else - { - VK_ASSERT(limits.pointSizeRange[0] == 1.0f); - VK_ASSERT(limits.pointSizeRange[1] == 1.0f); - } - VK_ASSERT(features.wideLines); if (features.wideLines) @@ -9109,24 +9194,6 @@ static void VerifyLimits( VK_ASSERT(limits.lineWidthRange[1] == 1.0f); } - if (features.largePoints) - { - VK_ASSERT(limits.pointSizeGranularity <= 1.0f); - } - else - { - VK_ASSERT(limits.pointSizeGranularity == 0.0f); - } - - if (features.wideLines) - { - VK_ASSERT(limits.lineWidthGranularity <= 1.0f); - } - else - { - VK_ASSERT(limits.lineWidthGranularity == 0.0f); - } - VK_ASSERT(limits.nonCoherentAtomSize >= 128); } @@ -9322,6 +9389,7 @@ static void VerifyExtensions( && dev.IsExtensionSupported(DeviceExtensions::KHR_SYNCHRONIZATION2) && dev.IsExtensionSupported(DeviceExtensions::KHR_ZERO_INITIALIZE_WORKGROUP_MEMORY)); } + } // 
===================================================================================================================== @@ -9482,10 +9550,14 @@ VkResult PhysicalDevice::GetDisplayModeProperties( properties[i].displayMode = reinterpret_cast(pDisplayMode); properties[i].parameters.visibleRegion.width = pScreenMode[i]->extent.width; properties[i].parameters.visibleRegion.height = pScreenMode[i]->extent.height; - // The refresh rate returned by pal is HZ. // Spec requires refresh rate to be "the number of times the display is refreshed each second // multiplied by 1000", in other words, HZ * 1000 +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 894 properties[i].parameters.refreshRate = pScreenMode[i]->refreshRate * 1000; +#else + properties[i].parameters.refreshRate = + pScreenMode[i]->refreshRate.numerator * 1000 / pScreenMode[i]->refreshRate.denominator; +#endif } *pPropertyCount = loopCount; @@ -9549,7 +9621,12 @@ VkResult PhysicalDevice::CreateDisplayMode( // The modes are considered as identical if the dimension as well as the refresh rate are the same. 
if ((pCreateInfo->parameters.visibleRegion.width == pScreenMode[i]->extent.width) && (pCreateInfo->parameters.visibleRegion.height == pScreenMode[i]->extent.height) && +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 894 (pCreateInfo->parameters.refreshRate == pScreenMode[i]->refreshRate * 1000)) +#else + (pCreateInfo->parameters.refreshRate == + pScreenMode[i]->refreshRate.numerator * 1000 / pScreenMode[i]->refreshRate.denominator)) +#endif { isValidMode = true; break; @@ -9580,7 +9657,12 @@ VkResult PhysicalDevice::CreateDisplayMode( { pNewMode->palScreenMode.extent.width = pCreateInfo->parameters.visibleRegion.width; pNewMode->palScreenMode.extent.height = pCreateInfo->parameters.visibleRegion.height; - pNewMode->palScreenMode.refreshRate = pCreateInfo->parameters.refreshRate; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 894 + pNewMode->palScreenMode.refreshRate = pCreateInfo->parameters.refreshRate / 1000; +#else + pNewMode->palScreenMode.refreshRate.numerator = pCreateInfo->parameters.refreshRate; + pNewMode->palScreenMode.refreshRate.denominator = 1000; +#endif pNewMode->palScreenMode.flags.u32All = 0; pNewMode->pScreen = pScreen; *pMode = reinterpret_cast(pNewMode); @@ -9781,8 +9863,8 @@ VkResult PhysicalDevice::GetPhysicalDeviceCooperativeMatrixPropertiesKHR( if (IsKhrCooperativeMatrixSupported(this)) { - constexpr uint32_t totalCount = CooperativeMatrixTypesCount + CooperativeMatrixSaturatingTypesCount; - + const uint32_t basicTypeCount = CooperativeMatrixTypesCount + CooperativeMatrixSaturatingTypesCount; + uint32_t totalCount = basicTypeCount; if (pProperties == nullptr) { *pPropertyCount = totalCount; @@ -9798,17 +9880,24 @@ VkResult PhysicalDevice::GetPhysicalDeviceCooperativeMatrixPropertiesKHR( for (uint32_t i = 0; i < *pPropertyCount; ++i) { - const bool sat = (i >= CooperativeMatrixTypesCount); - const uint32_t n = sat ? i - CooperativeMatrixTypesCount : i; - const CooperativeMatrixType* types = sat ? 
CooperativeMatrixSaturatingTypes : CooperativeMatrixTypes; - + bool sat = false; + const CooperativeMatrixType* pType = nullptr; + if (i < CooperativeMatrixTypesCount) + { + pType = CooperativeMatrixTypes + i; + } + else if (i < basicTypeCount) + { + sat = true; + pType = CooperativeMatrixSaturatingTypes + i - CooperativeMatrixTypesCount; + } pProperties[i].MSize = CooperativeMatrixDimension; pProperties[i].NSize = CooperativeMatrixDimension; pProperties[i].KSize = CooperativeMatrixDimension; - pProperties[i].AType = types[n].a; - pProperties[i].BType = types[n].b; - pProperties[i].CType = types[n].c; - pProperties[i].ResultType = types[n].c; + pProperties[i].AType = pType->a; + pProperties[i].BType = pType->b; + pProperties[i].CType = pType->c; + pProperties[i].ResultType = pType->c; pProperties[i].scope = VK_SCOPE_SUBGROUP_KHR; pProperties[i].saturatingAccumulation = sat ? VK_TRUE : VK_FALSE; } diff --git a/icd/api/vk_pipeline.cpp b/icd/api/vk_pipeline.cpp index 6c250699..f9134898 100644 --- a/icd/api/vk_pipeline.cpp +++ b/icd/api/vk_pipeline.cpp @@ -101,6 +101,69 @@ static_assert(VK_ARRAY_SIZE(HwStageNames) == static_cast(Util::Abi::Ha // the vkGetPipelineExecutableStatisticsKHR function static constexpr uint32_t ExecutableStatisticsCount = 5; +// ===================================================================================================================== +// Add binary data to this storage. +// To avoid redundant copies and memory allocation, it's expected that the calling code will allocate and prepare +// the binary. A Vulkan allocator must be used to allocate the memory at pData pointer. +// PipelineBinaryStorage will take ownership of the pointer and later free it in Free() call. 
+void Pipeline::InsertBinaryData( + PipelineBinaryStorage* pBinaryStorage, + const uint32 binaryIndex, + const Util::MetroHash::Hash& key, + const size_t dataSize, + const void* pData) +{ + VK_ASSERT(pBinaryStorage != nullptr); + VK_ASSERT(binaryIndex < VK_ARRAY_SIZE(pBinaryStorage->binaryInfo)); + // Expect that each entry is added only once + VK_ASSERT((pBinaryStorage->binaryInfo[binaryIndex].binaryHash.qwords[0] == 0) && + (pBinaryStorage->binaryInfo[binaryIndex].binaryHash.qwords[1] == 0)); + + pBinaryStorage->binaryInfo[binaryIndex].binaryHash = key; + pBinaryStorage->binaryInfo[binaryIndex].pipelineBinary.codeSize = dataSize; + pBinaryStorage->binaryInfo[binaryIndex].pipelineBinary.pCode = pData; + + ++pBinaryStorage->binaryCount; +} + +// ===================================================================================================================== +// Frees the previously inserted pipeline binaries. +VkResult Pipeline::FreeBinaryStorage( + const VkAllocationCallbacks* pAllocator) +{ + VkResult result = VK_SUCCESS; + + if (m_pBinaryStorage != nullptr) + { + Pipeline::FreeBinaryStorage(m_pBinaryStorage, pAllocator); + m_pBinaryStorage = nullptr; + } + else + { + result = VK_ERROR_UNKNOWN; + } + + return result; +} + +// ===================================================================================================================== +// Frees the pipeline binaries. 
+void Pipeline::FreeBinaryStorage( + PipelineBinaryStorage* pBinaryStorage, + const VkAllocationCallbacks* pAllocator) +{ + VK_ASSERT(pBinaryStorage != nullptr); + + for (uint32_t binaryIndex = 0; binaryIndex < VK_ARRAY_SIZE(pBinaryStorage->binaryInfo); ++binaryIndex) + { + if (pBinaryStorage->binaryInfo[binaryIndex].pipelineBinary.pCode != nullptr) + { + auto pMemory = const_cast(pBinaryStorage->binaryInfo[binaryIndex].pipelineBinary.pCode); + pAllocator->pfnFree(pAllocator->pUserData, pMemory); + } + } +} + // ===================================================================================================================== // Filter VkPipelineCreateFlags2KHR to only values used for pipeline caching VkPipelineCreateFlags2KHR Pipeline::GetCacheIdControlFlags( @@ -317,7 +380,7 @@ VkResult Pipeline::BuildShaderStageInfo( // creation of pipeline. VK_ASSERT(pTempModules != nullptr); - VkShaderModuleCreateFlags flags = 0; + ShaderModuleFlags flags = 0; Vkgc::BinaryData shaderBinary = {}; Pal::ShaderHash codeHash = {}; PipelineCreationFeedback* pShaderFeedback = (pFeedbacks == nullptr) ? 
nullptr : pFeedbacks + outIdx; @@ -335,7 +398,7 @@ VkResult Pipeline::BuildShaderStageInfo( if (pShaderModuleCreateInfo != nullptr) { - flags = pShaderModuleCreateInfo->flags; + flags = ShaderModule::ConvertVkShaderModuleCreateFlags(pShaderModuleCreateInfo->flags); shaderBinary.codeSize = pShaderModuleCreateInfo->codeSize; shaderBinary.pCode = pShaderModuleCreateInfo->pCode; @@ -347,8 +410,7 @@ VkResult Pipeline::BuildShaderStageInfo( { result = pCompiler->BuildShaderModule( pDevice, - flags, - VK_INTERNAL_SHADER_FLAGS_FORCE_UNCACHED_BIT, + flags | ShaderModuleForceUncached, shaderBinary, &pTempModules[outIdx]); @@ -435,6 +497,11 @@ void Pipeline::HandleExtensionStructs( static_cast(pNext); break; } + case VK_STRUCTURE_TYPE_PIPELINE_BINARY_INFO_KHR: + { + pExtStructs->pPipelineBinaryInfoKHR = static_cast(pNext); + break; + } default: break; } @@ -460,6 +527,7 @@ Pipeline::Pipeline( m_hasRayTracing(hasRayTracing), m_dispatchRaysUserDataOffset(0), #endif + m_pBinaryStorage(nullptr), m_pFormatStrings(nullptr) { memset(m_pPalPipeline, 0, sizeof(m_pPalPipeline)); @@ -468,6 +536,7 @@ Pipeline::Pipeline( void Pipeline::Init( Pal::IPipeline** pPalPipeline, const PipelineLayout* pLayout, + PipelineBinaryStorage* pBinaryStorage, uint64_t staticStateMask, #if VKI_RAY_TRACING uint32_t dispatchRaysUserDataOffset, @@ -475,6 +544,7 @@ void Pipeline::Init( const Util::MetroHash::Hash& cacheHash, uint64_t apiHash) { + m_pBinaryStorage = pBinaryStorage; m_staticStateMask = staticStateMask; m_cacheHash = cacheHash; m_apiHash = apiHash; @@ -523,6 +593,11 @@ VkResult Pipeline::Destroy( m_pPalPipeline[deviceIdx]->Destroy(); } + if (m_pBinaryStorage != nullptr) + { + FreeBinaryStorage(m_pBinaryStorage, pAllocator); + } + if (m_pFormatStrings != nullptr) { Util::Destructor(m_pFormatStrings); diff --git a/icd/api/vk_pipeline_binary.cpp b/icd/api/vk_pipeline_binary.cpp new file mode 100644 index 00000000..699959b0 --- /dev/null +++ b/icd/api/vk_pipeline_binary.cpp @@ -0,0 +1,558 @@ +/* + 
*********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#include "include/vk_pipeline_binary.h" +#include "include/vk_device.h" +#include "include/vk_pipeline.h" +#include "include/vk_compute_pipeline.h" +#include "include/vk_graphics_pipeline.h" +#include "include/graphics_pipeline_common.h" +#if VKI_RAY_TRACING +#include "raytrace/vk_ray_tracing_pipeline.h" +#endif + +#include "palPlatformKey.h" + +namespace vk +{ +// ===================================================================================================================== +PipelineBinary::PipelineBinary( + const Util::MetroHash::Hash& binaryKey, + const Vkgc::BinaryData& binaryData) + : + m_binaryKey(binaryKey), + m_binaryData(binaryData) +{ +} + +// ===================================================================================================================== +// Create a pipeline binary object. +VkResult PipelineBinary::Create( + Device* pDevice, + const Util::MetroHash::Hash& binaryKey, + const Vkgc::BinaryData& binaryData, + const VkAllocationCallbacks* pAllocator, + VkPipelineBinaryKHR* pPipelineBinary) +{ + VK_ASSERT(pPipelineBinary != nullptr); + + VkResult result = VK_SUCCESS; + + PipelineBinary* pObject = nullptr; + uint8_t* pCode = nullptr; + + auto placement = utils::PlacementHelper<2>( + nullptr, + utils::PlacementElement{&pObject}, + utils::PlacementElement {&pCode, binaryData.codeSize}); + + void* pMemory = pDevice->AllocApiObject(pAllocator, placement.SizeOf()); + + if (pMemory == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + *pPipelineBinary = VK_NULL_HANDLE; + } + else + { + placement.FixupPtrs(pMemory); + VK_ASSERT(pObject == pMemory); + + memcpy(pCode, binaryData.pCode, binaryData.codeSize); + + Vkgc::BinaryData objectBinaryData + { + .codeSize = binaryData.codeSize, + .pCode = pCode + }; + + VK_PLACEMENT_NEW(pObject) PipelineBinary(binaryKey, objectBinaryData); + + *pPipelineBinary = 
PipelineBinary::HandleFromVoidPointer(pObject); + } + + return result; +} + +// ===================================================================================================================== +VkResult PipelineBinary::CreatePipelineBinaries( + Device* pDevice, + const VkPipelineBinaryCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipelineBinaryHandlesInfoKHR* pBinaries) +{ + VK_ASSERT((pCreateInfo != nullptr) && (pBinaries != nullptr)); + + VkResult finalResult = VK_SUCCESS; + + if (pCreateInfo->pKeysAndDataInfo != nullptr) + { + if (pBinaries->pPipelineBinaries == nullptr) + { + pBinaries->pipelineBinaryCount = pCreateInfo->pKeysAndDataInfo->binaryCount; + } + else + { + uint32 binariesCopiedCount = 0; + + for (uint32_t binaryIndex = 0; + (binaryIndex < pCreateInfo->pKeysAndDataInfo->binaryCount); + ++binaryIndex) + { + Util::MetroHash::Hash binaryKey = {}; + ReadFromPipelineBinaryKey(pCreateInfo->pKeysAndDataInfo->pPipelineBinaryKeys[binaryIndex], &binaryKey); + + const auto binaryData = Vkgc::BinaryData + { + .codeSize = pCreateInfo->pKeysAndDataInfo->pPipelineBinaryData[binaryIndex].dataSize, + .pCode = pCreateInfo->pKeysAndDataInfo->pPipelineBinaryData[binaryIndex].pData + }; + + VkResult result = PipelineBinary::Create( + pDevice, + binaryKey, + binaryData, + pAllocator, + &pBinaries->pPipelineBinaries[binaryIndex]); + + if (result == VK_SUCCESS) + { + ++binariesCopiedCount; + } + else if (finalResult == VK_SUCCESS) + { + // Keep the first failed result, but attempt to create the remaining pipeline binaries + finalResult = result; + } + } + + pBinaries->pipelineBinaryCount = binariesCopiedCount; + + } + } + else if (pCreateInfo->pipeline != VK_NULL_HANDLE) + { + const auto pBinaryStorage = Pipeline::BaseObjectFromHandle(pCreateInfo->pipeline)->GetBinaryStorage(); + + if (pBinaryStorage != nullptr) + { + if (pBinaries->pPipelineBinaries == nullptr) + { + pBinaries->pipelineBinaryCount = pBinaryStorage->binaryCount; + } + else 
+ { + uint32 binariesCopiedCount = 0; + + for (uint32_t binaryIndex = 0; + (binaryIndex < pBinaries->pipelineBinaryCount); + ++binaryIndex) + { + VkResult result = PipelineBinary::Create( + pDevice, + pBinaryStorage->binaryInfo[binaryIndex].binaryHash, + pBinaryStorage->binaryInfo[binaryIndex].pipelineBinary, + pAllocator, + &pBinaries->pPipelineBinaries[binaryIndex]); + + if (result == VK_SUCCESS) + { + ++binariesCopiedCount; + } + else if (finalResult == VK_SUCCESS) + { + // Keep the first failed result, but attempt to create the remaining pipeline binaries + finalResult = result; + } + } + + pBinaries->pipelineBinaryCount = binariesCopiedCount; + } + } + else + { + // Pipeline didn't enable VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR. + finalResult = VK_ERROR_INITIALIZATION_FAILED; + } + } + else if (pCreateInfo->pPipelineCreateInfo != nullptr) + { + // Generate the key for the provided pipeline create info + VkPipelineBinaryKeyKHR binaryKey = {}; + PipelineBinary::GetPipelineKey(pDevice, pCreateInfo->pPipelineCreateInfo, &binaryKey); + + // Query the pipeline binary cache using the generated key. 
+ bool isUserCacheHit = false; + bool isInternalCacheHit = false; + Util::MetroHash::Hash key = {}; + Vkgc::BinaryData pipelineBinary = {}; + FreeCompilerBinary freeCompilerBinary = FreeWithCompiler; + + ReadFromPipelineBinaryKey(binaryKey, &key); + + Util::Result cacheResult = pDevice->GetCompiler(DefaultDeviceIndex)->GetCachedPipelineBinary( + &key, + nullptr, // pPipelineBinaryCache + &pipelineBinary, + &isUserCacheHit, + &isInternalCacheHit, + &freeCompilerBinary, + nullptr); // pPipelineFeedback + + if (cacheResult == Util::Result::Success) + { + if (pBinaries->pPipelineBinaries == nullptr) + { + // Cached binaries are monolithic, not GPL libraries + pBinaries->pipelineBinaryCount = pDevice->NumPalDevices(); + } + else + { + uint32 binariesCopiedCount = 0; + + for (uint32_t binaryIndex = 0; + (binaryIndex < pBinaries->pipelineBinaryCount); + ++binaryIndex) + { + VkResult result = PipelineBinary::Create( + pDevice, + key, + pipelineBinary, + pAllocator, + &pBinaries->pPipelineBinaries[binaryIndex]); + + if (result == VK_SUCCESS) + { + ++binariesCopiedCount; + } + else if (finalResult == VK_SUCCESS) + { + // Keep the first failed result, but attempt to create the remaining pipeline binaries + finalResult = result; + } + } + + pBinaries->pipelineBinaryCount = binariesCopiedCount; + } + } + else + { + finalResult = VK_PIPELINE_BINARY_MISSING_KHR; + } + } + else + { + finalResult = VK_ERROR_INITIALIZATION_FAILED; + VK_NEVER_CALLED(); + } + + return finalResult; +} + +// ===================================================================================================================== +VkResult PipelineBinary::DestroyPipelineBinary( + Device* pDevice, + const VkAllocationCallbacks* pAllocator) +{ + Util::Destructor(this); + + pDevice->FreeApiObject(pAllocator, this); + + return VK_SUCCESS; +} + +// ===================================================================================================================== +VkResult PipelineBinary::GetPipelineKey( + const 
Device* pDevice, + const VkPipelineCreateInfoKHR* pPipelineCreateInfo, + VkPipelineBinaryKeyKHR* pPipelineBinaryKey) +{ + VkResult result = VK_SUCCESS; + + if (pPipelineCreateInfo == nullptr) + { + // Return a common key that applies to all pipelines. If it's changed, it invalidates all other + // pipeline-specific keys. + const auto pPlatformKey = pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetPlatformKey(); + + // If this fails, then we probably need to compute the key in some other way, or we risk collisions. + VK_ASSERT(pPlatformKey->GetKeySize() <= VK_MAX_PIPELINE_BINARY_KEY_SIZE_KHR); + + WriteToPipelineBinaryKey( + pPlatformKey->GetKey(), + pPlatformKey->GetKeySize(), + pPipelineBinaryKey); + } + else + { + Util::MetroHash::Hash cacheId[MaxPipelineBinaryInfoCount] = {}; + + switch (static_cast(pPipelineCreateInfo->pNext)->sType) + { + case VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO: + { + const auto pComputeCreateInfo = static_cast(pPipelineCreateInfo->pNext); + const auto flags = Device::GetPipelineCreateFlags(pComputeCreateInfo); + + ComputePipelineBinaryCreateInfo binaryCreateInfo = {}; + PipelineOptimizerKey pipelineOptimizerKey = {}; + ShaderOptimizerKey shaderOptimizerKey = {}; + ShaderModuleHandle tempModule = {}; + ComputePipelineShaderStageInfo shaderInfo = {}; + uint64 apiPsoHash = 0; + + result = ComputePipeline::CreateCacheId( + pDevice, + pComputeCreateInfo, + flags, + &shaderInfo, + &binaryCreateInfo, + &shaderOptimizerKey, + &pipelineOptimizerKey, + &apiPsoHash, + &tempModule, + cacheId); + + break; + } + case VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO: + { + const auto pGraphicsCreateInfo = + static_cast(pPipelineCreateInfo->pNext); + + const auto flags = Device::GetPipelineCreateFlags(pGraphicsCreateInfo); + + GraphicsPipelineExtStructs extStructs = {}; + GraphicsPipelineLibraryInfo libInfo = {}; + GraphicsPipelineBinaryCreateInfo binaryCreateInfo = {}; + PipelineOptimizerKey pipelineOptimizerKey = {}; + ShaderOptimizerKey 
shaderOptimizerKeys[ShaderStage::ShaderStageGfxCount] = {}; + ShaderModuleHandle tempModules[ShaderStage::ShaderStageGfxCount] = {}; + GraphicsPipelineShaderStageInfo shaderStageInfo = {}; + uint64 apiPsoHash = 0; + + GraphicsPipelineCommon::HandleExtensionStructs(pGraphicsCreateInfo, &extStructs); + GraphicsPipelineCommon::ExtractLibraryInfo(pDevice, pGraphicsCreateInfo, extStructs, flags, &libInfo); + + result = GraphicsPipelineCommon::CreateCacheId( + pDevice, + pGraphicsCreateInfo, + extStructs, + libInfo, + flags, + &shaderStageInfo, + &binaryCreateInfo, + shaderOptimizerKeys, + &pipelineOptimizerKey, + &apiPsoHash, + tempModules, + cacheId); + + break; + } +#if VKI_RAY_TRACING + case VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR: + { + const auto pRayTracingCreateInfo = + static_cast(pPipelineCreateInfo->pNext); + + const auto flags = Device::GetPipelineCreateFlags(pRayTracingCreateInfo); + + RayTracingPipelineShaderStageInfo shaderInfo = {}; + PipelineOptimizerKey optimizerKey = {}; + ShaderModuleHandle* pTempModules = nullptr; + uint64 apiPsoHash = 0; + Util::MetroHash::Hash elfHash = {}; + + // If rtEnableCompilePipelineLibrary is false, the library shaders are included in pRayTracingCreateInfo. 
+ const bool hasLibraries = + pDevice->GetRuntimeSettings().rtEnableCompilePipelineLibrary && + ((pRayTracingCreateInfo->pLibraryInfo != nullptr) && + (pRayTracingCreateInfo->pLibraryInfo->libraryCount > 0)); + + void* pShaderTempBuffer = nullptr; + const uint32_t nativeShaderCount = pRayTracingCreateInfo->stageCount; + uint32_t totalShaderCount = pRayTracingCreateInfo->stageCount; + + if (hasLibraries) + { + for (uint32_t libraryIdx = 0; + libraryIdx < pRayTracingCreateInfo->pLibraryInfo->libraryCount; + ++libraryIdx) + { + auto pLibrary = RayTracingPipeline::ObjectFromHandle( + pRayTracingCreateInfo->pLibraryInfo->pLibraries[libraryIdx]); + + VK_ASSERT(pLibrary->GetType() == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR); + + totalShaderCount += pLibrary->GetTotalShaderCount(); + } + } + + if (totalShaderCount > 0) + { + auto placement = utils::PlacementHelper<3>( + nullptr, + utils::PlacementElement {&shaderInfo.pStages, nativeShaderCount}, + utils::PlacementElement{&pTempModules, nativeShaderCount}, + utils::PlacementElement{&optimizerKey.pShaders, totalShaderCount}); + + pShaderTempBuffer = pDevice->VkInstance()->AllocMem( + placement.SizeOf(), + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + if (pShaderTempBuffer != nullptr) + { + memset(pShaderTempBuffer, 0, placement.SizeOf()); + placement.FixupPtrs(pShaderTempBuffer); + + shaderInfo.stageCount = nativeShaderCount; + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + if (result == VK_SUCCESS) + { + optimizerKey.shaderCount = totalShaderCount; + + result = RayTracingPipeline::CreateCacheId( + pDevice, + pRayTracingCreateInfo, + flags, + hasLibraries, + &shaderInfo, + &optimizerKey, + &apiPsoHash, + &elfHash, + pTempModules, + cacheId); + + // Free the temporary memory for shader modules + Pipeline::FreeTempModules(pDevice, nativeShaderCount, pTempModules); + + // Free the temporary memory for creating cacheId + if (pShaderTempBuffer != nullptr) + { + 
pDevice->VkInstance()->FreeMem(pShaderTempBuffer); + } + } + } + + break; + } +#endif + default: + // Unexpected header + result = VK_ERROR_UNKNOWN; + + VK_NEVER_CALLED(); + break; + } + + if (result == VK_SUCCESS) + { + WriteToPipelineBinaryKey( + cacheId[0].bytes, + sizeof(cacheId[0].bytes), + pPipelineBinaryKey); + } + } + + return result; +} + +// ===================================================================================================================== +VkResult PipelineBinary::GetPipelineBinaryData( + VkPipelineBinaryKeyKHR* pPipelineBinaryKey, + size_t* pPipelineBinaryDataSize, + void* pPipelineBinaryData) +{ + VK_ASSERT(pPipelineBinaryDataSize != nullptr); + + VkResult result = VK_SUCCESS; + + if (pPipelineBinaryData != nullptr) + { + if (*pPipelineBinaryDataSize < m_binaryData.codeSize) + { + result = VK_ERROR_NOT_ENOUGH_SPACE_KHR; + } + else + { + WriteToPipelineBinaryKey(&m_binaryKey, sizeof(m_binaryKey), pPipelineBinaryKey); + + memcpy(pPipelineBinaryData, m_binaryData.pCode, m_binaryData.codeSize); + } + } + + // Must be written in all cases + *pPipelineBinaryDataSize = m_binaryData.codeSize; + + return result; +} + +// ===================================================================================================================== +VkResult PipelineBinary::ReleaseCapturedPipelineData( + Device* pDevice, + Pipeline* pPipeline, + const VkAllocationCallbacks* pAllocator) +{ + return pPipeline->FreeBinaryStorage(pAllocator); +} + +// ===================================================================================================================== +// A helper to write a pipeline binary key +void PipelineBinary::WriteToPipelineBinaryKey( + const void* pSrcData, + const size_t dataSize, + VkPipelineBinaryKeyKHR* pDstKey) +{ + VK_ASSERT(pDstKey != nullptr); + VK_ASSERT(dataSize <= sizeof(pDstKey->key)); + + pDstKey->keySize = static_cast(dataSize); + memcpy(pDstKey->key, pSrcData, dataSize); + memset(&pDstKey->key[dataSize], 0, 
sizeof(pDstKey->key) - dataSize); +} + +// ===================================================================================================================== +// A helper to convert a pipeline binary key to MetroHash::Hash. +void PipelineBinary::ReadFromPipelineBinaryKey( + const VkPipelineBinaryKeyKHR& inKey, + Util::MetroHash::Hash* pOutKey) +{ + VK_ASSERT(pOutKey != nullptr); + + constexpr auto OutKeySize = static_cast(sizeof(pOutKey->bytes)); + + VK_ASSERT(inKey.keySize >= OutKeySize); + + memcpy(pOutKey->bytes, inKey.key, OutKeySize); +} + +} // namespace vk diff --git a/icd/api/vk_pipeline_layout.cpp b/icd/api/vk_pipeline_layout.cpp index 68f3de3a..3c029508 100644 --- a/icd/api/vk_pipeline_layout.cpp +++ b/icd/api/vk_pipeline_layout.cpp @@ -333,7 +333,7 @@ VkResult PipelineLayout::BuildCompactSchemeInfo( gfxReservedCount++; } - if (pDevice->GetRuntimeSettings().enableDebugPrintf) + if (pDevice->GetEnabledFeatures().enableDebugPrintf) { gfxReservedCount++; } @@ -386,7 +386,7 @@ VkResult PipelineLayout::BuildCompactSchemeInfo( pPipelineInfo->numUserDataNodes += 1; } - if (pDevice->GetRuntimeSettings().enableDebugPrintf) + if (pDevice->GetEnabledFeatures().enableDebugPrintf) { pPipelineInfo->numUserDataNodes += 1; pUserDataLayout->debugPrintfRegBase = pInfo->userDataRegCount; @@ -633,7 +633,7 @@ VkResult PipelineLayout::BuildIndirectSchemeInfo( pInfo->userDataRegCount += 1; } - if (settings.enableDebugPrintf) + if (pDevice->GetEnabledFeatures().enableDebugPrintf) { pPipelineInfo->numUserDataNodes += 1; pUserDataLayout->debugPrintfRegBase = pInfo->userDataRegCount; @@ -886,6 +886,54 @@ VkResult PipelineLayout::Create( return result; } +// ===================================================================================================================== +// Extract user data layout based on createInfo with no pipeline layout object being actually created +VkResult PipelineLayout::GenerateUserDataLayout( + const Device* pDevice, + const 
VkPipelineLayoutCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + UserDataLayout* pUserDataLayout) +{ + VkResult result = VK_SUCCESS; + + Info info = {}; + PipelineInfo pipelineInfo = {}; + SetUserDataLayout* pSetUserDataLayout = nullptr; + + const size_t setUserDataLayoutSize = + Util::Pow2Align((pCreateInfo->setLayoutCount * sizeof(SetUserDataLayout)), ExtraDataAlignment()); + + void* pMemory = pAllocator->pfnAllocation(pAllocator->pUserData, + setUserDataLayoutSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + if (pMemory != nullptr) + { + pSetUserDataLayout = static_cast(pMemory); + + result = ConvertCreateInfo( + pDevice, + pCreateInfo, + &info, + &pipelineInfo, + pSetUserDataLayout); + + if (result == VK_SUCCESS) + { + *pUserDataLayout = info.userDataLayout; + } + + pAllocator->pfnFree(pAllocator->pUserData, pMemory); + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + return result; +} + // ===================================================================================================================== // Translates VkDescriptorType to VKGC ResourceMappingNodeType Vkgc::ResourceMappingNodeType PipelineLayout::MapLlpcResourceNodeType( @@ -1037,7 +1085,7 @@ void PipelineLayout::BuildLlpcVertexBufferTableMapping( if (pVbInfo != nullptr) { // Build the table description itself - const uint32_t srdDwSize = m_pDevice->GetProperties().descriptorSizes.bufferView / sizeof(uint32_t); + const uint32_t srdDwSize = m_pDevice->GetProperties().descriptorSizes.untypedBufferView / sizeof(uint32_t); const uint32_t vbTableSize = pVbInfo->bindingTableSize * srdDwSize; // Add the set pointer node pointing to this table @@ -1346,7 +1394,7 @@ VkResult PipelineLayout::BuildCompactSchemeLlpcPipelineMapping( &userDataNodeCount); } - if (m_pDevice->GetRuntimeSettings().enableDebugPrintf) + if (m_pDevice->GetEnabledFeatures().enableDebugPrintf) { BuildLlpcDebugPrintfMapping( stageMask, @@ -1640,7 +1688,7 @@ void 
PipelineLayout::BuildIndirectSchemeLlpcPipelineMapping( } #endif - if (m_pDevice->GetRuntimeSettings().enableDebugPrintf) + if (m_pDevice->GetEnabledFeatures().enableDebugPrintf) { BuildLlpcDebugPrintfMapping( stageMask, diff --git a/icd/api/vk_query.cpp b/icd/api/vk_query.cpp index d16685b6..af16c0fe 100644 --- a/icd/api/vk_query.cpp +++ b/icd/api/vk_query.cpp @@ -445,7 +445,9 @@ VkResult TimestampQueryPool::Create( // Allocate system memory size_t apiSize = sizeof(TimestampQueryPool); - size_t viewSize = pDevice->GetProperties().descriptorSizes.bufferView; + size_t viewSize = pDevice->UseStridedCopyQueryResults() ? + pDevice->GetProperties().descriptorSizes.untypedBufferView : + pDevice->GetProperties().descriptorSizes.typedBufferView; size_t totalSize = apiSize + (viewSize * pDevice->NumPalDevices()); void* pMemory = nullptr; const uint32_t slotSize = pDevice->GetProperties().timestampQueryPoolSlotSize; @@ -642,7 +644,7 @@ VkResult QueryPoolWithStorageView::Initialize( } else { - memset(pViewMem, 0, m_pDevice->GetProperties().descriptorSizes.bufferView); + memset(pViewMem, 0, viewSize * m_pDevice->NumPalDevices()); } } @@ -822,7 +824,9 @@ VkResult AccelerationStructureQueryPool::Create( // Allocate system memory size_t apiSize = sizeof(AccelerationStructureQueryPool); - size_t viewSize = pDevice->GetProperties().descriptorSizes.bufferView; + size_t viewSize = pDevice->UseStridedCopyQueryResults() ? 
+ pDevice->GetProperties().descriptorSizes.untypedBufferView : + pDevice->GetProperties().descriptorSizes.typedBufferView; size_t totalSize = apiSize + (viewSize * pDevice->NumPalDevices()); void* pMemory = nullptr; diff --git a/icd/api/vk_shader.cpp b/icd/api/vk_shader.cpp index c6378aef..9fd7e4dd 100644 --- a/icd/api/vk_shader.cpp +++ b/icd/api/vk_shader.cpp @@ -158,7 +158,7 @@ ShaderModule::ShaderModule( : m_codeSize(codeSize), m_pCode(pCode), - m_flags(flags) + m_flags(ConvertVkShaderModuleCreateFlags(flags)) { m_codeHash = BuildCodeHash(pCode, codeSize); @@ -211,7 +211,6 @@ VkResult ShaderModule::Init( VkResult result = pCompiler->BuildShaderModule( pDevice, m_flags, - 0, shaderBinary, &m_handle); diff --git a/icd/api/vk_utils.cpp b/icd/api/vk_utils.cpp index 097adbe2..94cbf36c 100644 --- a/icd/api/vk_utils.cpp +++ b/icd/api/vk_utils.cpp @@ -41,7 +41,10 @@ namespace utils // Get driver build time hash uint32_t GetBuildTimeHash() { - return Util::HashLiteralString(__DATE__ __TIME__); + Util::BuildId buildId; + Util::GetCurrentLibraryBuildId(&buildId); + + return Util::HashString((const char*)(buildId.data), sizeof(buildId.data)); } // ===================================================================================================================== diff --git a/icd/imported/gputexdecoder/gpuTexDecoder.cpp b/icd/imported/gputexdecoder/gpuTexDecoder.cpp index 8da4cc88..0c4d95c5 100755 --- a/icd/imported/gputexdecoder/gpuTexDecoder.cpp +++ b/icd/imported/gputexdecoder/gpuTexDecoder.cpp @@ -389,7 +389,7 @@ void Device::Init( { m_info = info; m_imageViewSizeInDwords = m_info.pDeviceProperties->gfxipProperties.srdSizes.imageView / sizeof(uint32); - m_bufferViewSizeInDwords = m_info.pDeviceProperties->gfxipProperties.srdSizes.bufferView / sizeof(uint32); + m_bufferViewSizeInDwords = m_info.pDeviceProperties->gfxipProperties.srdSizes.typedBufferView / sizeof(uint32); // 3 Table and 1 TexBuffer, and 2 Image resource 
m_srdDwords[static_cast(InternalTexConvertCsType::ConvertASTCToRGBA8)] diff --git a/icd/make/amdicd.so.def b/icd/make/amdicd.so.def index 769208a3..76fce96f 100644 --- a/icd/make/amdicd.so.def +++ b/icd/make/amdicd.so.def @@ -37,6 +37,7 @@ global: vkEnumerateInstanceExtensionProperties; vkEnumerateInstanceLayerProperties; vkEnumerateInstanceVersion; + GetSettingsBlobsAll; local: *; }; diff --git a/icd/res/ver.h b/icd/res/ver.h index dd4a4df1..46a9acb7 100644 --- a/icd/res/ver.h +++ b/icd/res/ver.h @@ -36,7 +36,7 @@ #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0" // Bump up after each promotion to mainline -#define VULKAN_ICD_BUILD_VERSION 318 +#define VULKAN_ICD_BUILD_VERSION 321 // String version is needed with leading zeros and extra termination (unicode) #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION @@ -45,7 +45,7 @@ // These values specify the driver ID and driver info string #define VULKAN_DRIVER_ID VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR // "AMDOPEN" #define VULKAN_DRIVER_NAME_STR "AMD open-source driver" -#define VULKAN_DRIVER_INFO_STR "2024.Q3.2" +#define VULKAN_DRIVER_INFO_STR "2024.Q3.3" #define VULKAN_DRIVER_INFO_STR_LLPC "(LLPC)" // These values tell which version of the conformance test the driver is compliant against diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp index 04cd3b66..6133823d 100644 --- a/icd/settings/settings.cpp +++ b/icd/settings/settings.cpp @@ -203,7 +203,7 @@ void VulkanSettingsLoader::OverrideSettingsBySystemInfo() pRootPath, m_settings.appProfileDumpDir); MakeAbsolutePath(m_settings.pipelineProfileDumpFile, sizeof(m_settings.pipelineProfileDumpFile), pRootPath, m_settings.pipelineProfileDumpFile); -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE MakeAbsolutePath(m_settings.pipelineProfileRuntimeFile, sizeof(m_settings.pipelineProfileRuntimeFile), pRootPath, m_settings.pipelineProfileRuntimeFile); #endif @@ -225,6 +225,8 @@ void 
VulkanSettingsLoader::OverrideDefaultsExperimentInfo() VK_SET_VAL_IF_EXPERIMENT_ENABLED(RayTracingSupport, enableRaytracingSupport, false); #endif + VK_SET_VAL_IF_EXPERIMENT_ENABLED(VariableRateShadingSupport, enableVariableRateShading, false); + VK_SET_VAL_IF_EXPERIMENT_ENABLED(Native16BitTypesSupport, enableNative16BitTypes, false); VK_SET_VAL_IF_EXPERIMENT_ENABLED(AmdVendorExtensions, disableAmdVendorExtensions, true); @@ -247,7 +249,6 @@ void VulkanSettingsLoader::OverrideDefaultsExperimentInfo() { m_settings.rtEnableTreeRebraid = RebraidTypeOff; m_settings.rtEnableTriangleSplitting = false; - m_settings.rtEnableTopDownBuild = false; m_settings.rtBvhBuildModeFastBuild = BvhBuildModeLinear; m_settings.enablePairCompressionCostCheck = true; } @@ -455,7 +456,7 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( { m_settings.maxUnifiedNonRayGenShaders = static_cast(atoi(pMaxInlinedShadersEnvVar)); } -#if VKI_BUILD_GFX11 + // Default optimized RT settings for Navi31 / 32, // which has physical VGPR 1536 per SIMD if (pInfo->gfxipProperties.shaderCore.vgprsPerSimd == 1536) @@ -467,7 +468,10 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.indirectCallTargetOccupancyPerSimd = 0.75; } #endif -#endif + + { + m_settings.disableImplicitInvariantExports = false; + } if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp10_3) { @@ -479,13 +483,7 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.nggCompactVertex = false; } - - { - m_settings.disableImplicitInvariantExports = false; - } - -#if VKI_BUILD_GFX11 - if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) + else if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { // Enable NGG compactionless mode for Navi3x m_settings.nggCompactVertex = false; @@ -494,18 +492,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.csWaveSize = 64; m_settings.fsWaveSize = 64; } -#endif - switch (pInfo->revision) - { -#if VKI_BUILD_STRIX1 - case Pal::AsicRevision::Strix1: - // 
Remove this when displayDcc corruption issue is fixed on Strix. - m_settings.disableDisplayDcc = DisplayableDcc::DisplayableDccDisabled; - break; -#endif - default: - break; - } // Put command buffers in local for large/resizable BAR systems with > 7 GBs of local heap constexpr gpusize _1GB = 1024ull * 1024ull * 1024ull; @@ -608,11 +594,8 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( { } } -#if VKI_BUILD_GFX11 else if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { - -#if VKI_BUILD_NAVI31 if (pInfo->revision == Pal::AsicRevision::Navi31) { { @@ -620,9 +603,7 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.mallNoAllocSsrPolicy = MallNoAllocSsrAsSnsr; } } -#endif } -#endif } if ((appProfile == AppProfile::WolfensteinII) || @@ -722,12 +703,10 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( } } -#if VKI_BUILD_GFX11 if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { m_settings.resourceBarrierOptions &= ~ResourceBarrierOptions::SkipDstCacheInv; } -#endif m_settings.implicitExternalSynchronization = false; } @@ -800,7 +779,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( } } -#if VKI_BUILD_GFX11 if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { m_settings.mallNoAllocSsrPolicy = MallNoAllocSsrPolicy::MallNoAllocSsrAsSnsr; @@ -812,28 +790,21 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( RpmViewBypassMall::RpmViewBypassMallOnRead; } -#if VKI_BUILD_NAVI31 if (pInfo->revision == Pal::AsicRevision::Navi31) { m_settings.mallNoAllocCtPolicy = MallNoAllocCtPolicy::MallNoAllocCtAsSnsr; } -#endif -#if VKI_BUILD_NAVI32 if (pInfo->revision == Pal::AsicRevision::Navi32) { m_settings.mallNoAllocCtPolicy = MallNoAllocCtPolicy::MallNoAllocCtAsSnsr; } -#endif -#if VKI_BUILD_NAVI33 if (pInfo->revision == Pal::AsicRevision::Navi33) { m_settings.mallNoAllocCtSsrPolicy = MallNoAllocCtSsrPolicy::MallNoAllocCtSsrAsSnsr; } -#endif } -#endif m_settings.enableUberFetchShader = true; } @@ -859,6 +830,8 @@ VkResult 
VulkanSettingsLoader::OverrideProfiledSettings( m_settings.forceDepthClampBasedOnZExport = true; m_settings.clampMaxImageSize = 16384u; + + m_settings.ac01WaNotNeeded = true; } if (appProfile == AppProfile::SeriousSamFusion) @@ -868,6 +841,8 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.anisoThreshold = 1.0f; m_settings.clampMaxImageSize = 16384u; + + m_settings.ac01WaNotNeeded = true; } if ((appProfile == AppProfile::TalosVR) || @@ -943,13 +918,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( // Originally applied to QuakeRemastered - this setting applies to QuakeEnhanced now since it's an update // to the same game. m_settings.disableDisplayDcc = DisplayableDcc::DisplayableDccDisabled; - -#if VKI_BUILD_GFX11 - if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) - { - m_settings.forcePwsMode = PwsMode::NoLateAcquirePoint; - } -#endif } if (appProfile == AppProfile::SedpEngine) @@ -989,18 +957,15 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( ForceDccForColorAttachments | ForceDccFor32BppShaderStorage); } -#if VKI_BUILD_GFX11 + else if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { -#if VKI_BUILD_NAVI31 if (pInfo->revision == Pal::AsicRevision::Navi31) { m_settings.mallNoAllocDsPolicy = MallNoAllocDsAsSnsr; m_settings.pipelineBinningMode = PipelineBinningModeEnable; } -#endif } -#endif } if (appProfile == AppProfile::ZombieArmy4) @@ -1108,19 +1073,34 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( if (appProfile == AppProfile::RainbowSixExtraction) { -#if VKI_BUILD_GFX11 if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { - m_settings.forceEnableDcc = (ForceDccFor2DShaderStorage | - ForceDccForColorAttachments | - ForceDccForNonColorAttachmentShaderStorage | - ForceDccFor32BppShaderStorage | - ForceDccFor64BppShaderStorage); + if (pInfo->revision == Pal::AsicRevision::Navi31) + { + m_settings.forceEnableDcc = (ForceDccFor2DShaderStorage | + ForceDccForColorAttachments | + 
ForceDccForNonColorAttachmentShaderStorage | + ForceDccFor32BppShaderStorage | + ForceDccFor64BppShaderStorage); + + m_settings.disableLoopUnrolls = true; + m_settings.forceCsThreadIdSwizzling = true; + } + else if (pInfo->revision == Pal::AsicRevision::Navi33) + { + m_settings.forceEnableDcc = (ForceDccFor2DShaderStorage | + ForceDccFor3DShaderStorage | + ForceDccForColorAttachments | + ForceDccForNonColorAttachmentShaderStorage | + ForceDccFor64BppShaderStorage); - m_settings.disableLoopUnrolls = true; - m_settings.forceCsThreadIdSwizzling = true; + m_settings.pipelineBinningMode = PipelineBinningModeDisable; + m_settings.mallNoAllocDsPolicy = MallNoAllocDsAsSnsr; + m_settings.mallNoAllocCtPolicy = MallNoAllocCtAsSnsr; + m_settings.mallNoAllocCtSsrPolicy = MallNoAllocCtSsrAsSnsr; + m_settings.mallNoAllocSsrPolicy = MallNoAllocSsrAsSnsr; + } } -#endif } if (appProfile == AppProfile::Rage2) @@ -1169,8 +1149,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.fsWaveSize = 64; } } - -#if VKI_BUILD_GFX11 else if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { m_settings.pipelineBinningMode = PipelineBinningModeDisable; @@ -1183,7 +1161,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( ForceDccFor32BppShaderStorage | ForceDccFor64BppShaderStorage); } -#endif } if (appProfile == AppProfile::RedDeadRedemption2) @@ -1212,12 +1189,11 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( ForceDccForColorAttachments | ForceDccFor64BppShaderStorage); -#if VKI_BUILD_GFX11 if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { m_settings.forceEnableDcc |= ForceDccForNonColorAttachmentShaderStorage; } -#endif + } m_settings.ac01WaNotNeeded = true; @@ -1284,12 +1260,9 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( } } -#if VKI_BUILD_GFX11 if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { - m_settings.forcePwsMode = PwsMode::NoLateAcquirePoint; } -#endif } #if VKI_RAY_TRACING @@ -1314,13 +1287,11 @@ VkResult 
VulkanSettingsLoader::OverrideProfiledSettings( } -#if VKI_BUILD_GFX11 if (pInfo->gfxLevel >= Pal::GfxIpLevel::GfxIp11_0) { // Gives ~0.5% gain at 4k m_settings.enableAceShaderPrefetch = false; } -#endif } if (appProfile == AppProfile::ControlDX12) @@ -1330,17 +1301,17 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.rtEnableCompilePipelineLibrary = false; } -#if VKI_BUILD_GFX11 if (pInfo->gfxLevel >= Pal::GfxIpLevel::GfxIp11_0) { // Gives ~2.22% gain at 1080p m_settings.enableAceShaderPrefetch = false; } -#endif } if (appProfile == AppProfile::RayTracingWeekends) { + m_settings.rtEnableTopDownBuild = true; + if (pInfo->gfxipProperties.shaderCore.vgprsPerSimd == 1024) { { @@ -1388,7 +1359,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( { #if VKI_RAY_TRACING m_settings.rtBvhBuildModeFastTrace = BvhBuildModeLinear; - m_settings.rtEnableTopDownBuild = false; m_settings.plocRadius = 4; // 13% Gain @ 4k - Allows overlapping builds @@ -1425,15 +1395,10 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.csWaveSize = 64; } -#if VKI_BUILD_GFX11 else if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { // Navi31 Mall and Tiling Settings - if ((pInfo->revision == Pal::AsicRevision::Navi31) -#if VKI_BUILD_NAVI32 - || (pInfo->revision == Pal::AsicRevision::Navi32) -#endif - ) + if ((pInfo->revision == Pal::AsicRevision::Navi31) || (pInfo->revision == Pal::AsicRevision::Navi32)) { // Mall no alloc settings give a ~1% gain m_settings.mallNoAllocCtPolicy = MallNoAllocCtAsSnsr; @@ -1444,7 +1409,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.imageTilingPreference3dGpuWritable = Pal::ImageTilingPattern::YMajor; } } -#endif } if (appProfile == AppProfile::IdTechLauncher) @@ -1512,13 +1476,11 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( // A larger minImageCount can get a performance gain for game Metro Exodus. 
m_settings.forceMinImageCount = 3; -#if VKI_BUILD_GFX11 if (pInfo->gfxLevel >= Pal::GfxIpLevel::GfxIp11_0) { // Gives ~0.9% gain at 1080p m_settings.enableAceShaderPrefetch = false; } -#endif } if (appProfile == AppProfile::X4Foundations) @@ -1594,27 +1556,22 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.mallNoAllocSsrPolicy = MallNoAllocSsrAsSnsr; } } -#if VKI_BUILD_GFX11 else if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { -#if VKI_BUILD_NAVI31 if (pInfo->revision == Pal::AsicRevision::Navi31) { // This provides ~4.2% gain at 4k m_settings.imageTilingPreference3dGpuWritable = Pal::ImageTilingPattern::YMajor; m_settings.mallNoAllocCtPolicy = MallNoAllocCtAsSnsr; } -#endif -#if VKI_BUILD_NAVI33 + if (pInfo->revision == Pal::AsicRevision::Navi33) { { m_settings.forceCsThreadIdSwizzling = true; } } -#endif } -#endif } if (appProfile == AppProfile::MetalGearSolid5) @@ -1660,27 +1617,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( } - if ((appProfile == AppProfile::HalfLifeAlyx) || - (appProfile == AppProfile::Satisfactory)) - { -#if VKI_BUILD_GFX11 - if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) - { - m_settings.forcePwsMode = PwsMode::NoLateAcquirePoint; - } -#endif - } - - if (appProfile == AppProfile::RomeRemastered) - { -#if VKI_BUILD_GFX11 - if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) - { - m_settings.forcePwsMode = PwsMode::NoLateAcquirePoint; - } -#endif - } - if (appProfile == AppProfile::SpidermanRemastered) { m_settings.supportMutableDescriptors = false; @@ -1695,7 +1631,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( #if VKI_RAY_TRACING m_settings.plocRadius = 4; m_settings.rtBvhBuildModeFastTrace = BvhBuildModeLinear; - m_settings.rtEnableTopDownBuild = false; if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp10_3) { @@ -1715,7 +1650,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( { OverrideVkd3dCommonSettings(&m_settings); -#if VKI_BUILD_GFX11 if ((pInfo->gfxLevel == 
Pal::GfxIpLevel::GfxIp11_0) #if VKI_BUILD_GFX115 || (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_5) @@ -1724,7 +1658,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( { m_settings.fsWaveSize = 32; } -#endif } if (appProfile == AppProfile::Vkd3dEngine) @@ -1969,11 +1902,9 @@ void VulkanSettingsLoader::ValidateSettings() m_settings.enableRaytracingSupport = false; } -#if VKI_BUILD_GFX11 // RTIP 2.0+ is always expected to support hardware traversal stack VK_ASSERT((rayTracingIpLevel <= Pal::RayTracingIpLevel::RtIp1_1) || (deviceProps.gfxipProperties.flags.supportRayTraversalStack == 1)); -#endif // Clamp target occupancy to [0.0, 1.0] m_settings.indirectCallTargetOccupancyPerSimd = Util::Clamp(m_settings.indirectCallTargetOccupancyPerSimd, 0.0f, 1.0f); diff --git a/icd/settings/settings_xgl.json b/icd/settings/settings_xgl.json index acc08788..2ca1ff94 100644 --- a/icd/settings/settings_xgl.json +++ b/icd/settings/settings_xgl.json @@ -1074,7 +1074,7 @@ }, { "Name": "PipelineProfileRuntimeFile", - "Description": "Relative Path to a JSON file that describes a shader app profile that is parsed at runtime. This setting only triggers on debug builds or builds made with the ICD_RUNTIME_APP_PROFILE=1 option. This file has the same format as the JSON files used to build production shader app profiles. Root directory is determined by AMD_DEBUG_DIR environment variable", + "Description": "Relative Path to a JSON file that describes a shader app profile that is parsed at runtime. This setting only triggers on debug builds or builds made with the VKI_RUNTIME_APP_PROFILE=1 option. This file has the same format as the JSON files used to build production shader app profiles. 
Root directory is determined by AMD_DEBUG_DIR environment variable", "Tags": [ "Pipeline Options" ], @@ -1129,9 +1129,6 @@ "Tags": [ "Pipeline Options" ], - "BuildTypes": [ - "VKI_BUILD_GFX11" - ], "Defaults": { "Default": false }, @@ -3184,7 +3181,7 @@ "VKI_RAY_TRACING" ], "Defaults": { - "Default": true + "Default": false }, "Type": "bool", "Name": "RtEnableTopDownBuild", @@ -3331,6 +3328,36 @@ "Type": "bool", "Scope": "Driver" }, + { + "Name": "RtCheckBufferOverlapsInBatch", + "Description": "Check for scratch and result buffer overlaps in a batch of BVH builds/updates.", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Scope": "Driver" + }, + { + "Name": "RtDisableAccelStructCompaction", + "Description": "Disables compaction of the Acceleration Structure build and performs a copy instead.", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Scope": "Driver" + }, { "Name": "RtEnableMortonCode30", "Description": "Enable Morton Code 30 bits", @@ -3341,7 +3368,7 @@ "VKI_RAY_TRACING" ], "Defaults": { - "Default": true + "Default": false }, "Type": "bool", "Scope": "Driver" @@ -4625,7 +4652,7 @@ }, { "Name": "DbgBarrierPostCmdEnable", - "Description": "Triggers a CmdBarrier call after any command in the given mask. The barrier behavior is controlled by the other DbgBarrierPost* settings in this category. Requires VK_ENABLE_DEBUG_BARRIERS=1 to take effect. 0x8FFFFFFF: All commands (heavyweight option)", + "Description": "Triggers a CmdBarrier call after any command in the given mask. The barrier behavior is controlled by the other DbgBarrierPost* settings in this category. Requires VKI_ENABLE_DEBUG_BARRIERS=1 to take effect. 
0x8FFFFFFF: All commands (heavyweight option)", "Tags": [ "Debugging" ], @@ -4905,7 +4932,7 @@ "Name": "DbgBarrierPostCacheDstMask" }, { - "Description": "Triggers a CmdBarrier call before any command in the given mask. The barrier behavior is controlled by the other DbgBarrierPre* settings in this category. Requires VK_ENABLE_DEBUG_BARRIERS=1 to take effect. For further documentation, consult the corresponding DbgBarrierPostCmdEnable command.", + "Description": "Triggers a CmdBarrier call before any command in the given mask. The barrier behavior is controlled by the other DbgBarrierPre* settings in this category. Requires VKI_ENABLE_DEBUG_BARRIERS=1 to take effect. For further documentation, consult the corresponding DbgBarrierPostCmdEnable command.", "Tags": [ "Debugging" ], @@ -5371,21 +5398,6 @@ "Name": "RebraidQualityHeuristicType", "Scope": "Driver" }, - { - "Description": "Fast BVH Build with no Morton Code sorting. Applies to BVHs with up to a wave size number of primitives.", - "Tags": [ - "Ray Tracing" - ], - "BuildTypes": [ - "VKI_RAY_TRACING" - ], - "Defaults": { - "Default": 0 - }, - "Type": "uint32", - "Name": "FastBuildThreshold", - "Scope": "Driver" - }, { "Description": "Enable pair compression in early build stage, i.e., During Encode phase.", "Tags": [ @@ -5640,7 +5652,7 @@ "General" ], "Defaults": { - "Default": false + "Default": true }, "Scope": "Driver", "Type": "bool" diff --git a/icd/tools/generate/shaderProfileTemplate.py b/icd/tools/generate/shaderProfileTemplate.py index 9f3b449c..2218bb76 100644 --- a/icd/tools/generate/shaderProfileTemplate.py +++ b/icd/tools/generate/shaderProfileTemplate.py @@ -71,7 +71,7 @@ #include \"utils/json_writer.h\" #include \"palJsonWriter.h\" -#if ICD_RUNTIME_APP_PROFILE +#if VKI_RUNTIME_APP_PROFILE #include \"utils/json_reader.h\" #endif """ @@ -2762,16 +2762,7 @@ def json_enum_reader_template(values, prefix=""): BuildTypesTemplate = { "llpc": "ICD_BUILD_LLPC", -#if VKI_BUILD_NAVI31 - "Navi31": 
"VKI_BUILD_NAVI31", -#endif -#if VKI_BUILD_NAVI33 - "Navi33": "VKI_BUILD_NAVI33", -#endif -#if VKI_BUILD_GFX11 - "gfxIp11_0": "VKI_BUILD_GFX11", -#endif - "icdRuntimeAppProfile": "ICD_RUNTIME_APP_PROFILE" + "icdRuntimeAppProfile": "VKI_RUNTIME_APP_PROFILE" } ###################################################################################################################