From ca39d4df4ca0264024343992fcbe21ccd312846c Mon Sep 17 00:00:00 2001 From: makslevental Date: Mon, 23 Sep 2024 03:51:22 -0400 Subject: [PATCH 1/8] [WIP] numerics fixes --- .../core/driver/xdna/amd_xdna_driver.cpp | 10 ++++- .../hsa-runtime/core/inc/amd_aie_aql_queue.h | 5 ++- .../hsa-runtime/core/inc/amd_xdna_driver.h | 4 +- .../core/runtime/amd_aie_aql_queue.cpp | 42 +++++++++++-------- 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 12f140e5c..5564657a2 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -189,6 +189,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, } vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem); + handle_size_map.emplace(create_bo_args.handle, size); vmem_handle_mappings_reverse.emplace(mapped_mem, create_bo_args.handle); return HSA_STATUS_SUCCESS; @@ -353,11 +354,18 @@ hsa_status_t XdnaDriver::InitDeviceHeap() { return HSA_STATUS_SUCCESS; } -hsa_status_t XdnaDriver::GetHandleMappings(std::unordered_map &vmem_handle_mappings) { +hsa_status_t XdnaDriver::GetHandleMappings( + std::unordered_map &vmem_handle_mappings) { vmem_handle_mappings = this->vmem_handle_mappings; return HSA_STATUS_SUCCESS; } +hsa_status_t XdnaDriver::GetHandleSizeMap( + std::unordered_map &handle_size_map) { + handle_size_map = this->handle_size_map; + return HSA_STATUS_SUCCESS; +} + hsa_status_t XdnaDriver::GetFd(int &fd) { fd = fd_; return HSA_STATUS_SUCCESS; diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index 224b85d7c..f47a93cea 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -165,7 +165,8 @@ class AieAqlQueue : public core::Queue, static hsa_status_t SubmitCmd( uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id, uint64_t write_dispatch_id, - std::unordered_map &vmem_handle_mappings); + std::unordered_map &vmem_handle_mappings, + std::unordered_map &handle_size_map); /// @brief Creates a command BO and returns a pointer to the memory and // the corresponding handle @@ -190,7 +191,7 @@ class AieAqlQueue : public core::Queue, /// @brief Syncs all BOs referenced in bo_args /// /// @param bo_args vector containing handles of BOs to sync - static hsa_status_t SyncBos(std::vector &bo_args, int fd); + static hsa_status_t SyncBo(int fd, uint32_t bo_arg, uint32_t direction, uint32_t size); /// @brief Executes a command and waits for its completion /// diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 79cbaa710..0d86ad3dc 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -71,6 +71,7 @@ class XdnaDriver : public core::Driver { hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; hsa_status_t GetHandleMappings(std::unordered_map &vmem_handle_mappings); + hsa_status_t GetHandleSizeMap(std::unordered_map &handle_size_map); hsa_status_t GetFd(int &fd); hsa_status_t GetAgentProperties(core::Agent &agent) const override; @@ -118,6 +119,7 @@ class XdnaDriver : public core::Driver { // TODO: Remove this once we move to the vmem API std::unordered_map vmem_handle_mappings_reverse; + std::unordered_map handle_size_map; /// @brief Virtual address range allocated for the device heap. /// @@ -128,7 +130,7 @@ class XdnaDriver : public core::Driver { /// @brief The aligned device heap. void *dev_heap_aligned = nullptr; - static constexpr size_t dev_heap_size = 48 * 1024 * 1024; + static constexpr size_t dev_heap_size = 64 * 1024 * 1024; static constexpr size_t dev_heap_align = 64 * 1024 * 1024; }; diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 6f796441a..5639a02b3 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -82,7 +82,7 @@ constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2; // Environment variable to define job submission timeout constexpr const char *TIMEOUT_ENV_VAR = "ROCR_AIE_TIMEOUT"; -constexpr int DEFAULT_TIMEOUT_VAL = 50; +constexpr int DEFAULT_TIMEOUT_VAL = 0; char *timeout_env_var_ptr = getenv(TIMEOUT_ENV_VAR); int timeout_val = timeout_env_var_ptr == nullptr ? DEFAULT_TIMEOUT_VAL : atoi(timeout_env_var_ptr); @@ -219,12 +219,16 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) { void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { std::unordered_map vmem_handle_mappings; + std::unordered_map handle_size_map; auto &driver = static_cast( core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)); if (driver.GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS) { return; } + if (driver.GetHandleSizeMap(handle_size_map) != HSA_STATUS_SUCCESS) { + return; + } int fd = 0; if (driver.GetFd(fd) != HSA_STATUS_SUCCESS) { @@ -233,17 +237,17 @@ void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { SubmitCmd(hw_ctx_handle_, fd, amd_queue_.hsa_queue.base_address, amd_queue_.read_dispatch_id, amd_queue_.write_dispatch_id, - vmem_handle_mappings); + vmem_handle_mappings, handle_size_map); } -hsa_status_t AieAqlQueue::SyncBos(std::vector &bo_args, int fd) { - for (unsigned int bo_arg : bo_args) { - amdxdna_drm_sync_bo sync_params = {}; - sync_params.handle = bo_arg; - if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params)) - return HSA_STATUS_ERROR; +hsa_status_t AieAqlQueue::SyncBo(int fd, uint32_t bo_arg, uint32_t direction, uint32_t size) { + amdxdna_drm_sync_bo sync_params = {}; + sync_params.handle = bo_arg; + sync_params.direction = direction; + sync_params.size = size; + if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params)) { + return HSA_STATUS_ERROR; } - return HSA_STATUS_SUCCESS; } @@ -330,7 +334,8 @@ hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle, hsa_status_t AieAqlQueue::SubmitCmd( uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id, uint64_t write_dispatch_id, - std::unordered_map &vmem_handle_mappings) { + std::unordered_map &vmem_handle_mappings, + std::unordered_map &handle_size_map) { uint64_t cur_id = read_dispatch_id; while (cur_id < write_dispatch_id) { hsa_amd_aie_ert_packet_t *pkt = @@ -351,9 +356,6 @@ hsa_status_t AieAqlQueue::SubmitCmd( // packets there are. All can be combined into a single chain. int num_cont_start_cu_pkts = 1; for (int peak_pkt_id = cur_id + 1; peak_pkt_id < write_dispatch_id; peak_pkt_id++) { - if (pkt->opcode != HSA_AMD_AIE_ERT_START_CU) { - break; - } num_cont_start_cu_pkts++; } @@ -416,8 +418,10 @@ hsa_status_t AieAqlQueue::SubmitCmd( } // Syncing BOs before we execute the command - if (SyncBos(bo_args, fd)) - return HSA_STATUS_ERROR; + for (auto bo_arg : bo_args) { + if (SyncBo(fd, bo_arg, SYNC_DIRECT_TO_DEVICE, handle_size_map[bo_arg])) + return HSA_STATUS_ERROR; + } // Removing duplicates in the bo container. The driver will report // an error if we provide the same BO handle multiple times. @@ -440,8 +444,12 @@ hsa_status_t AieAqlQueue::SubmitCmd( ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle, fd); // Syncing BOs after we execute the command - if (SyncBos(bo_args, fd)) - return HSA_STATUS_ERROR; + for (auto bo_arg : bo_args) { + if (SyncBo(fd, bo_arg, SYNC_DIRECT_FROM_DEVICE, + handle_size_map[bo_arg])) { + return HSA_STATUS_ERROR; + } + } cur_id += num_cont_start_cu_pkts; break; From ec867aa8080d9c12f87a251938aa950f27e27aab Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Tue, 24 Sep 2024 20:16:24 -0600 Subject: [PATCH 2/8] Something seems to be wrong with this test just when running on the CI --- rocrtst/suites/aie/aie_hsa_dispatch_test.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc index 5d54d35ac..8d11ce6c1 100644 --- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -145,11 +145,13 @@ int main(int argc, char **argv) { hsa_amd_memory_pool_t global_dev_mem_pool{0}; // System memory pool. Used for allocating kernel argument data. hsa_amd_memory_pool_t global_kernarg_mem_pool{0}; + std::cout << __LINE__ << std::endl; const std::string instr_inst_file_name(sourcePath / "add_one_insts.txt"); const std::string pdi_file_name(sourcePath / "add_one.pdi"); uint32_t *instr_inst_buf(nullptr); uint64_t *pdi_buf(nullptr); + std::cout << __LINE__ << std::endl; assert(aie_agents.empty()); assert(global_dev_mem_pool.handle == 0); assert(global_kernarg_mem_pool.handle == 0); @@ -158,6 +160,7 @@ int main(int argc, char **argv) { auto r = hsa_init(); assert(r == HSA_STATUS_SUCCESS); + std::cout << __LINE__ << std::endl; assert(sizeof(hsa_kernel_dispatch_packet_s) == sizeof(hsa_amd_aie_ert_packet_s)); @@ -167,6 +170,7 @@ int main(int argc, char **argv) { assert(r == HSA_STATUS_SUCCESS); assert(aie_agents.size() == 1); + std::cout << __LINE__ << std::endl; const auto &aie_agent = aie_agents.front(); // Create a queue on the first agent. @@ -176,12 +180,14 @@ int main(int argc, char **argv) { assert(aie_queue); assert(aie_queue->base_address); + std::cout << __LINE__ << std::endl; // Find a pool for DEV BOs. This is a global system memory pool that is // mapped to the device. Will be used for PDIs and DPU instructions. r = hsa_amd_agent_iterate_memory_pools( aie_agent, get_coarse_global_dev_mem_pool, &global_dev_mem_pool); assert(r == HSA_STATUS_SUCCESS); + std::cout << __LINE__ << std::endl; // Find a pool that supports kernel args. This is just normal system memory. // It will be used for commands and input data. r = hsa_amd_agent_iterate_memory_pools( @@ -189,6 +195,7 @@ int main(int argc, char **argv) { assert(r == HSA_STATUS_SUCCESS); assert(global_kernarg_mem_pool.handle); + std::cout << __LINE__ << std::endl; // Getting the maximum size of the queue so we can submit that many consecutive // packets. uint32_t aie_max_queue_size; @@ -196,6 +203,7 @@ int main(int argc, char **argv) { assert(r == HSA_STATUS_SUCCESS); int num_pkts = aie_max_queue_size; + std::cout << __LINE__ << std::endl; // Load the DPU and PDI files into a global pool that doesn't support kernel // args (DEV BO). uint32_t num_instr; @@ -206,6 +214,7 @@ int main(int argc, char **argv) { assert(r == HSA_STATUS_SUCCESS); assert(instr_handle != 0); + std::cout << __LINE__ << std::endl; load_pdi_file(global_dev_mem_pool, pdi_file_name, reinterpret_cast(&pdi_buf)); uint32_t pdi_handle = 0; @@ -213,6 +222,7 @@ int main(int argc, char **argv) { assert(r == HSA_STATUS_SUCCESS); assert(pdi_handle != 0); + std::cout << __LINE__ << std::endl; hsa_amd_aie_ert_hw_ctx_cu_config_t cu_config{.cu_config_bo = pdi_handle, .cu_func = 0}; @@ -224,6 +234,7 @@ int main(int argc, char **argv) { aie_queue, HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU, &config_cu_args); assert(r == HSA_STATUS_SUCCESS); + std::cout << __LINE__ << std::endl; // create inputs / outputs constexpr std::size_t num_data_elements = 1024; constexpr std::size_t data_buffer_size = @@ -238,6 +249,7 @@ int main(int argc, char **argv) { uint64_t wr_idx = 0; uint64_t packet_id = 0; + std::cout << __LINE__ << std::endl; for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) { r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, reinterpret_cast(&input[pkt_iter])); @@ -300,6 +312,7 @@ int main(int argc, char **argv) { // Ringing the doorbell to dispatch each packet we added to // the queue hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx); + std::cout << __LINE__ << std::endl; for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) { for (std::size_t i = 0; i < num_data_elements; i++) { @@ -319,6 +332,7 @@ int main(int argc, char **argv) { r = hsa_queue_destroy(aie_queue); assert(r == HSA_STATUS_SUCCESS); + std::cout << __LINE__ << std::endl; r = hsa_amd_memory_pool_free(pdi_buf); assert(r == HSA_STATUS_SUCCESS); r = hsa_amd_memory_pool_free(instr_inst_buf); From d0ffc0115b81d2d7109c8d744ac3e5806deceaee Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Tue, 24 Sep 2024 20:26:09 -0600 Subject: [PATCH 3/8] Seems to be breaking in loading instr file --- rocrtst/suites/aie/aie_hsa_dispatch_test.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc index 8d11ce6c1..b00ba3025 100644 --- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -113,23 +113,34 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, void **buf, uint32_t &num_instr) { + + + std::cout << __LINE__ << std::endl; std::ifstream bin_file(file_name, std::ios::binary | std::ios::ate | std::ios::in); assert(bin_file.fail() == false); + std::cout << __LINE__ << std::endl; + auto size(bin_file.tellg()); bin_file.seekg(0, std::ios::beg); std::vector pdi_vec; std::string val; + + std::cout << __LINE__ << std::endl; while (bin_file >> val) { pdi_vec.push_back(std::stoul(val, nullptr, 16)); } + + std::cout << __LINE__ << std::endl; auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf); assert(r == HSA_STATUS_SUCCESS); std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t)); num_instr = pdi_vec.size(); + + std::cout << __LINE__ << std::endl; } } // namespace From 93b58382e8246f68e912f2a7c589e22aa5e76223 Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Tue, 24 Sep 2024 20:35:20 -0600 Subject: [PATCH 4/8] Diving deeper --- rocrtst/suites/aie/aie_hsa_dispatch_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc index b00ba3025..71acbdf17 100644 --- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -135,9 +135,12 @@ void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_nam } std::cout << __LINE__ << std::endl; + std::cout << "Buff is at " << buf << std::endl; auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf); + std::cout << __LINE__ << std::endl; assert(r == HSA_STATUS_SUCCESS); std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t)); + std::cout << __LINE__ << std::endl; num_instr = pdi_vec.size(); std::cout << __LINE__ << std::endl; From 13e171c22aa6d7887160ef3f082725c5751fbd09 Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Tue, 24 Sep 2024 20:43:32 -0600 Subject: [PATCH 5/8] Diving deeper --- rocrtst/suites/aie/aie_hsa_dispatch_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc index 71acbdf17..08434bd21 100644 --- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -139,6 +139,7 @@ void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_nam auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf); std::cout << __LINE__ << std::endl; assert(r == HSA_STATUS_SUCCESS); + std::cout << "*buf is at " << *buf << std::endl; std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t)); std::cout << __LINE__ << std::endl; num_instr = pdi_vec.size(); @@ -221,6 +222,7 @@ int main(int argc, char **argv) { // Load the DPU and PDI files into a global pool that doesn't support kernel // args (DEV BO). uint32_t num_instr; + std::cout << "instr_inst_buf: " << instr_inst_buf << std::endl; load_instr_file(global_dev_mem_pool, instr_inst_file_name, reinterpret_cast(&instr_inst_buf), num_instr); uint32_t instr_handle = 0; From c2b53d303343c6c5763ec53f9ac0692438386776 Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Tue, 24 Sep 2024 20:56:52 -0600 Subject: [PATCH 6/8] Diving deeper --- runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 5564657a2..4b102aea0 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -136,6 +136,8 @@ hsa_status_t XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, core::MemoryRegion::AllocateFlags alloc_flags, void **mem, size_t size, uint32_t node_id) { + + std::cout << __func__ << __LINE__ << std::endl; const auto ®ion = static_cast(mem_region); amdxdna_drm_create_bo create_bo_args{.size = size}; amdxdna_drm_get_bo_info get_bo_info_args{0}; From c8913eb7805031cf1644dc6617c259b4d4cbed4a Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Tue, 24 Sep 2024 21:06:59 -0600 Subject: [PATCH 7/8] Diving deeper --- .../hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 4b102aea0..fe97431be 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -145,16 +145,20 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, void *mapped_mem(nullptr); if (!region.IsSystem()) { + std::cout << __func__ << __LINE__ << std::endl; return HSA_STATUS_ERROR_INVALID_REGION; } if (region.kernarg()) { + std::cout << __func__ << __LINE__ << std::endl; create_bo_args.type = AMDXDNA_BO_SHMEM; } else { + std::cout << __func__ << __LINE__ << std::endl; create_bo_args.type = AMDXDNA_BO_DEV; } if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo_args) < 0) { + std::cout << __func__ << __LINE__ << std::endl; return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } @@ -162,8 +166,10 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, // In case we need to close this BO to avoid leaks due to some error after // creation. close_bo_args.handle = create_bo_args.handle; + std::cout << __func__ << __LINE__ << std::endl; if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info_args) < 0) { + std::cout << __func__ << __LINE__ << std::endl; // Close the BO in the case we can't get info about it. ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args); return HSA_STATUS_ERROR; @@ -173,6 +179,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, /// to VA memory addresses. Once we can support the separate VMEM call to /// map handles we can fix this. if (region.kernarg()) { + std::cout << __func__ << __LINE__ << std::endl; mapped_mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, get_bo_info_args.map_offset); if (mapped_mem == MAP_FAILED) { @@ -181,15 +188,19 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } } else { + std::cout << __func__ << __LINE__ << std::endl; mapped_mem = reinterpret_cast(get_bo_info_args.vaddr); } if (alloc_flags & core::MemoryRegion::AllocateMemoryOnly) { + std::cout << __func__ << __LINE__ << std::endl; *mem = reinterpret_cast(create_bo_args.handle); } else { + std::cout << __func__ << __LINE__ << std::endl; *mem = mapped_mem; } + std::cout << __func__ << __LINE__ << std::endl; vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem); handle_size_map.emplace(create_bo_args.handle, size); vmem_handle_mappings_reverse.emplace(mapped_mem, create_bo_args.handle); From 8e47aa30a9dd50efde3b944caf1b40fdb15dbb48 Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Tue, 24 Sep 2024 21:14:59 -0600 Subject: [PATCH 8/8] Diving deeper --- runtime/hsa-runtime/core/inc/amd_xdna_driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 0d86ad3dc..623f7f204 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -130,7 +130,7 @@ class XdnaDriver : public core::Driver { /// @brief The aligned device heap. void *dev_heap_aligned = nullptr; - static constexpr size_t dev_heap_size = 64 * 1024 * 1024; + static constexpr size_t dev_heap_size = 48 * 1024 * 1024; static constexpr size_t dev_heap_align = 64 * 1024 * 1024; };