Skip to content
This repository has been archived by the owner on Dec 24, 2024. It is now read-only.

[WIP] CI test #33

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions rocrtst/suites/aie/aie_hsa_dispatch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -113,23 +113,38 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,

/// @brief Parses a text file of whitespace-separated hexadecimal words and
/// copies them into a buffer allocated from @p mem_pool.
///
/// @param mem_pool   memory pool the instruction buffer is allocated from
/// @param file_name  path of the instruction file to read
/// @param buf        receives the address of the newly allocated buffer
/// @param num_instr  receives the number of 32-bit words parsed from the file
///
/// Failures (unreadable file, no instructions, allocation error) are reported
/// through assert, like the rest of this test.
void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
                     void **buf, uint32_t &num_instr) {
  // NOTE(review): the bare __LINE__ traces look like temporary CI diagnostics;
  // they are kept (with the flushing std::endl) so output survives a crash.
  std::cout << __LINE__ << std::endl;
  std::ifstream instr_file(file_name, std::ios::binary | std::ios::in);

  assert(instr_file.fail() == false);

  std::cout << __LINE__ << std::endl;

  std::vector<uint32_t> instr_words;
  std::string token;

  std::cout << __LINE__ << std::endl;

  // Each token is one instruction word written in base 16.
  while (instr_file >> token) {
    instr_words.push_back(static_cast<uint32_t>(std::stoul(token, nullptr, 16)));
  }
  assert(!instr_words.empty());

  // Size the allocation from the parsed payload, not from the length of the
  // text file: the two are unrelated, and a file made of short tokens would
  // otherwise yield a buffer smaller than the bytes copied below.
  const std::size_t payload_bytes = instr_words.size() * sizeof(uint32_t);

  std::cout << __LINE__ << std::endl;
  std::cout << "Buff is at " << buf << std::endl;
  auto r = hsa_amd_memory_pool_allocate(mem_pool, payload_bytes, 0, buf);
  std::cout << __LINE__ << std::endl;
  assert(r == HSA_STATUS_SUCCESS);
  std::cout << "*buf is at " << *buf << std::endl;
  std::memcpy(*buf, instr_words.data(), payload_bytes);
  std::cout << __LINE__ << std::endl;
  num_instr = static_cast<uint32_t>(instr_words.size());

  std::cout << __LINE__ << std::endl;
}

} // namespace
Expand All @@ -145,11 +160,13 @@ int main(int argc, char **argv) {
hsa_amd_memory_pool_t global_dev_mem_pool{0};
// System memory pool. Used for allocating kernel argument data.
hsa_amd_memory_pool_t global_kernarg_mem_pool{0};
std::cout << __LINE__ << std::endl;
const std::string instr_inst_file_name(sourcePath / "add_one_insts.txt");
const std::string pdi_file_name(sourcePath / "add_one.pdi");
uint32_t *instr_inst_buf(nullptr);
uint64_t *pdi_buf(nullptr);

std::cout << __LINE__ << std::endl;
assert(aie_agents.empty());
assert(global_dev_mem_pool.handle == 0);
assert(global_kernarg_mem_pool.handle == 0);
Expand All @@ -158,6 +175,7 @@ int main(int argc, char **argv) {
auto r = hsa_init();
assert(r == HSA_STATUS_SUCCESS);

std::cout << __LINE__ << std::endl;
assert(sizeof(hsa_kernel_dispatch_packet_s) ==
sizeof(hsa_amd_aie_ert_packet_s));

Expand All @@ -167,6 +185,7 @@ int main(int argc, char **argv) {
assert(r == HSA_STATUS_SUCCESS);
assert(aie_agents.size() == 1);

std::cout << __LINE__ << std::endl;
const auto &aie_agent = aie_agents.front();

// Create a queue on the first agent.
Expand All @@ -176,43 +195,50 @@ int main(int argc, char **argv) {
assert(aie_queue);
assert(aie_queue->base_address);

std::cout << __LINE__ << std::endl;
// Find a pool for DEV BOs. This is a global system memory pool that is
// mapped to the device. Will be used for PDIs and DPU instructions.
r = hsa_amd_agent_iterate_memory_pools(
aie_agent, get_coarse_global_dev_mem_pool, &global_dev_mem_pool);
assert(r == HSA_STATUS_SUCCESS);

std::cout << __LINE__ << std::endl;
// Find a pool that supports kernel args. This is just normal system memory.
// It will be used for commands and input data.
r = hsa_amd_agent_iterate_memory_pools(
aie_agent, get_coarse_global_kernarg_mem_pool, &global_kernarg_mem_pool);
assert(r == HSA_STATUS_SUCCESS);
assert(global_kernarg_mem_pool.handle);

std::cout << __LINE__ << std::endl;
// Getting the maximum size of the queue so we can submit that many consecutive
// packets.
uint32_t aie_max_queue_size;
r = hsa_agent_get_info(aie_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &aie_max_queue_size);
assert(r == HSA_STATUS_SUCCESS);
int num_pkts = aie_max_queue_size;

std::cout << __LINE__ << std::endl;
// Load the DPU and PDI files into a global pool that doesn't support kernel
// args (DEV BO).
uint32_t num_instr;
std::cout << "instr_inst_buf: " << instr_inst_buf << std::endl;
load_instr_file(global_dev_mem_pool, instr_inst_file_name,
reinterpret_cast<void **>(&instr_inst_buf), num_instr);
uint32_t instr_handle = 0;
r = hsa_amd_get_handle_from_vaddr(instr_inst_buf, &instr_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(instr_handle != 0);

std::cout << __LINE__ << std::endl;
load_pdi_file(global_dev_mem_pool, pdi_file_name,
reinterpret_cast<void **>(&pdi_buf));
uint32_t pdi_handle = 0;
r = hsa_amd_get_handle_from_vaddr(pdi_buf, &pdi_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(pdi_handle != 0);

std::cout << __LINE__ << std::endl;
hsa_amd_aie_ert_hw_ctx_cu_config_t cu_config{.cu_config_bo = pdi_handle,
.cu_func = 0};

Expand All @@ -224,6 +250,7 @@ int main(int argc, char **argv) {
aie_queue, HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU, &config_cu_args);
assert(r == HSA_STATUS_SUCCESS);

std::cout << __LINE__ << std::endl;
// create inputs / outputs
constexpr std::size_t num_data_elements = 1024;
constexpr std::size_t data_buffer_size =
Expand All @@ -238,6 +265,7 @@ int main(int argc, char **argv) {
uint64_t wr_idx = 0;
uint64_t packet_id = 0;

std::cout << __LINE__ << std::endl;
for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&input[pkt_iter]));
Expand Down Expand Up @@ -300,6 +328,7 @@ int main(int argc, char **argv) {
// Ringing the doorbell to dispatch each packet we added to
// the queue
hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx);
std::cout << __LINE__ << std::endl;

for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
for (std::size_t i = 0; i < num_data_elements; i++) {
Expand All @@ -319,6 +348,7 @@ int main(int argc, char **argv) {
r = hsa_queue_destroy(aie_queue);
assert(r == HSA_STATUS_SUCCESS);

std::cout << __LINE__ << std::endl;
r = hsa_amd_memory_pool_free(pdi_buf);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(instr_inst_buf);
Expand Down
23 changes: 22 additions & 1 deletion runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,32 +136,40 @@ hsa_status_t
XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
core::MemoryRegion::AllocateFlags alloc_flags,
void **mem, size_t size, uint32_t node_id) {

std::cout << __func__ << __LINE__ << std::endl;
const auto &region = static_cast<const MemoryRegion &>(mem_region);
amdxdna_drm_create_bo create_bo_args{.size = size};
amdxdna_drm_get_bo_info get_bo_info_args{0};
drm_gem_close close_bo_args{0};
void *mapped_mem(nullptr);

if (!region.IsSystem()) {
std::cout << __func__ << __LINE__ << std::endl;
return HSA_STATUS_ERROR_INVALID_REGION;
}

if (region.kernarg()) {
std::cout << __func__ << __LINE__ << std::endl;
create_bo_args.type = AMDXDNA_BO_SHMEM;
} else {
std::cout << __func__ << __LINE__ << std::endl;
create_bo_args.type = AMDXDNA_BO_DEV;
}

if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo_args) < 0) {
std::cout << __func__ << __LINE__ << std::endl;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}

get_bo_info_args.handle = create_bo_args.handle;
// In case we need to close this BO to avoid leaks due to some error after
// creation.
close_bo_args.handle = create_bo_args.handle;
std::cout << __func__ << __LINE__ << std::endl;

if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info_args) < 0) {
std::cout << __func__ << __LINE__ << std::endl;
// Close the BO in the case we can't get info about it.
ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
return HSA_STATUS_ERROR;
Expand All @@ -171,6 +179,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
/// to VA memory addresses. Once we can support the separate VMEM call to
/// map handles we can fix this.
if (region.kernarg()) {
std::cout << __func__ << __LINE__ << std::endl;
mapped_mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
get_bo_info_args.map_offset);
if (mapped_mem == MAP_FAILED) {
Expand All @@ -179,16 +188,21 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
} else {
std::cout << __func__ << __LINE__ << std::endl;
mapped_mem = reinterpret_cast<void *>(get_bo_info_args.vaddr);
}

if (alloc_flags & core::MemoryRegion::AllocateMemoryOnly) {
std::cout << __func__ << __LINE__ << std::endl;
*mem = reinterpret_cast<void *>(create_bo_args.handle);
} else {
std::cout << __func__ << __LINE__ << std::endl;
*mem = mapped_mem;
}

std::cout << __func__ << __LINE__ << std::endl;
vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem);
handle_size_map.emplace(create_bo_args.handle, size);
vmem_handle_mappings_reverse.emplace(mapped_mem, create_bo_args.handle);

return HSA_STATUS_SUCCESS;
Expand Down Expand Up @@ -353,11 +367,18 @@ hsa_status_t XdnaDriver::InitDeviceHeap() {
return HSA_STATUS_SUCCESS;
}

hsa_status_t XdnaDriver::GetHandleMappings(std::unordered_map<uint32_t, void*> &vmem_handle_mappings) {
/// @brief Copies the driver's BO-handle-to-address table into the caller's map.
///
/// @param vmem_handle_mappings overwritten with a copy of the driver's table
/// @return always HSA_STATUS_SUCCESS
hsa_status_t XdnaDriver::GetHandleMappings(
    std::unordered_map<uint32_t, void *> &vmem_handle_mappings) {
  // The parameter shadows the member of the same name, hence the explicit
  // this-> on the source side.
  const auto &driver_table = this->vmem_handle_mappings;
  vmem_handle_mappings = driver_table;
  return HSA_STATUS_SUCCESS;
}

/// @brief Copies the driver's BO-handle-to-size table into the caller's map.
///
/// @param handle_size_map overwritten with a copy of the driver's table
/// @return always HSA_STATUS_SUCCESS
hsa_status_t XdnaDriver::GetHandleSizeMap(
    std::unordered_map<uint32_t, uint32_t> &handle_size_map) {
  // The parameter shadows the member of the same name, hence the explicit
  // this-> on the source side.
  const auto &driver_table = this->handle_size_map;
  handle_size_map = driver_table;
  return HSA_STATUS_SUCCESS;
}

hsa_status_t XdnaDriver::GetFd(int &fd) {
fd = fd_;
return HSA_STATUS_SUCCESS;
Expand Down
5 changes: 3 additions & 2 deletions runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,8 @@ class AieAqlQueue : public core::Queue,
static hsa_status_t SubmitCmd(
uint32_t hw_ctx_handle, int fd, void *queue_base,
uint64_t read_dispatch_id, uint64_t write_dispatch_id,
std::unordered_map<uint32_t, void *> &vmem_handle_mappings);
std::unordered_map<uint32_t, void *> &vmem_handle_mappings,
std::unordered_map<uint32_t, uint32_t> &handle_size_map);

/// @brief Creates a command BO and returns a pointer to the memory and
// the corresponding handle
Expand All @@ -190,7 +191,7 @@ class AieAqlQueue : public core::Queue,
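/// @brief Syncs a single BO
///
/// @param fd file descriptor the sync ioctl is issued on
/// @param bo_arg handle of the BO to sync
/// @param direction sync direction passed to the driver
/// @param size size passed to the driver for the sync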
static hsa_status_t SyncBos(std::vector<uint32_t> &bo_args, int fd);
static hsa_status_t SyncBo(int fd, uint32_t bo_arg, uint32_t direction, uint32_t size);

/// @brief Executes a command and waits for its completion
///
Expand Down
2 changes: 2 additions & 0 deletions runtime/hsa-runtime/core/inc/amd_xdna_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class XdnaDriver : public core::Driver {
hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;

hsa_status_t GetHandleMappings(std::unordered_map<uint32_t, void*> &vmem_handle_mappings);
hsa_status_t GetHandleSizeMap(std::unordered_map<uint32_t, uint32_t> &handle_size_map);
hsa_status_t GetFd(int &fd);

hsa_status_t GetAgentProperties(core::Agent &agent) const override;
Expand Down Expand Up @@ -118,6 +119,7 @@ class XdnaDriver : public core::Driver {

// TODO: Remove this once we move to the vmem API
std::unordered_map<void*, uint32_t> vmem_handle_mappings_reverse;
std::unordered_map<uint32_t, uint32_t> handle_size_map;

/// @brief Virtual address range allocated for the device heap.
///
Expand Down
42 changes: 25 additions & 17 deletions runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2;

// Environment variable to define job submission timeout
constexpr const char *TIMEOUT_ENV_VAR = "ROCR_AIE_TIMEOUT";
constexpr int DEFAULT_TIMEOUT_VAL = 50;
constexpr int DEFAULT_TIMEOUT_VAL = 0;
char *timeout_env_var_ptr = getenv(TIMEOUT_ENV_VAR);
int timeout_val = timeout_env_var_ptr == nullptr ? DEFAULT_TIMEOUT_VAL : atoi(timeout_env_var_ptr);

Expand Down Expand Up @@ -219,12 +219,16 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) {

void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) {
std::unordered_map<uint32_t, void*> vmem_handle_mappings;
std::unordered_map<uint32_t, uint32_t> handle_size_map;

auto &driver = static_cast<XdnaDriver &>(
core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type));
if (driver.GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS) {
return;
}
if (driver.GetHandleSizeMap(handle_size_map) != HSA_STATUS_SUCCESS) {
return;
}

int fd = 0;
if (driver.GetFd(fd) != HSA_STATUS_SUCCESS) {
Expand All @@ -233,17 +237,17 @@ void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) {

SubmitCmd(hw_ctx_handle_, fd, amd_queue_.hsa_queue.base_address,
amd_queue_.read_dispatch_id, amd_queue_.write_dispatch_id,
vmem_handle_mappings);
vmem_handle_mappings, handle_size_map);
}

hsa_status_t AieAqlQueue::SyncBos(std::vector<uint32_t> &bo_args, int fd) {
for (unsigned int bo_arg : bo_args) {
amdxdna_drm_sync_bo sync_params = {};
sync_params.handle = bo_arg;
if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params))
return HSA_STATUS_ERROR;
/// @brief Issues a DRM_IOCTL_AMDXDNA_SYNC_BO request for one BO.
///
/// @param fd file descriptor the ioctl is issued on
/// @param bo_arg handle of the BO to sync
/// @param direction value stored in the request's direction field
/// @param size value stored in the request's size field
/// @return HSA_STATUS_ERROR if the ioctl returns non-zero, otherwise
///         HSA_STATUS_SUCCESS
hsa_status_t AieAqlQueue::SyncBo(int fd, uint32_t bo_arg, uint32_t direction, uint32_t size) {
  // Zero-initialise so any field not set below is passed to the driver as 0.
  amdxdna_drm_sync_bo request = {};
  request.handle = bo_arg;
  request.direction = direction;
  request.size = size;

  const int rc = ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &request);
  return rc != 0 ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
}

Expand Down Expand Up @@ -330,7 +334,8 @@ hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle,
hsa_status_t AieAqlQueue::SubmitCmd(
uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id,
uint64_t write_dispatch_id,
std::unordered_map<uint32_t, void *> &vmem_handle_mappings) {
std::unordered_map<uint32_t, void *> &vmem_handle_mappings,
std::unordered_map<uint32_t, uint32_t> &handle_size_map) {
uint64_t cur_id = read_dispatch_id;
while (cur_id < write_dispatch_id) {
hsa_amd_aie_ert_packet_t *pkt =
Expand All @@ -351,9 +356,6 @@ hsa_status_t AieAqlQueue::SubmitCmd(
// packets there are. All can be combined into a single chain.
int num_cont_start_cu_pkts = 1;
for (int peak_pkt_id = cur_id + 1; peak_pkt_id < write_dispatch_id; peak_pkt_id++) {
if (pkt->opcode != HSA_AMD_AIE_ERT_START_CU) {
break;
}
num_cont_start_cu_pkts++;
}

Expand Down Expand Up @@ -416,8 +418,10 @@ hsa_status_t AieAqlQueue::SubmitCmd(
}

// Syncing BOs before we execute the command
if (SyncBos(bo_args, fd))
return HSA_STATUS_ERROR;
for (auto bo_arg : bo_args) {
if (SyncBo(fd, bo_arg, SYNC_DIRECT_TO_DEVICE, handle_size_map[bo_arg]))
return HSA_STATUS_ERROR;
}

// Removing duplicates in the bo container. The driver will report
// an error if we provide the same BO handle multiple times.
Expand All @@ -440,8 +444,12 @@ hsa_status_t AieAqlQueue::SubmitCmd(
ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle, fd);

// Syncing BOs after we execute the command
if (SyncBos(bo_args, fd))
return HSA_STATUS_ERROR;
for (auto bo_arg : bo_args) {
if (SyncBo(fd, bo_arg, SYNC_DIRECT_FROM_DEVICE,
handle_size_map[bo_arg])) {
return HSA_STATUS_ERROR;
}
}

cur_id += num_cont_start_cu_pkts;
break;
Expand Down
Loading