Skip to content
This repository has been archived by the owner on Dec 24, 2024. It is now read-only.

Commit

Permalink
Fixing command sizing and changing the test to issue the maximum numb…
Browse files Browse the repository at this point in the history
…er of packets the queue supports. (#23)

* Fixed workarounds in the AIE soft queue regarding the size of the command chain and the individual commands.

* Added the functionality to aie_hsa_dispatch_test.cc to query the size of the AIE queue and issue the maximum number of packets it supports.

* Some additional small fixes to the aie_hsa_dispatch_test.cc test.

* Adding links to driver source
  • Loading branch information
eddierichter-amd authored Sep 12, 2024
1 parent c817704 commit 0757f63
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 101 deletions.
177 changes: 99 additions & 78 deletions rocrtst/suites/aie/aie_hsa_dispatch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
bin_file.read(reinterpret_cast<char *>(*buf), size);
}

void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
void **buf) {
void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
void **buf, uint32_t &num_instr) {
std::ifstream bin_file(file_name,
std::ios::binary | std::ios::ate | std::ios::in);

Expand All @@ -129,6 +129,7 @@ void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf);
assert(r == HSA_STATUS_SUCCESS);
std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t));
num_instr = pdi_vec.size();
}

} // namespace
Expand All @@ -144,9 +145,9 @@ int main(int argc, char **argv) {
hsa_amd_memory_pool_t global_dev_mem_pool{0};
// System memory pool. Used for allocating kernel argument data.
hsa_amd_memory_pool_t global_kernarg_mem_pool{0};
const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt");
const std::string instr_inst_file_name(sourcePath / "add_one_insts.txt");
const std::string pdi_file_name(sourcePath / "add_one.pdi");
uint32_t *dpu_inst_buf(nullptr);
uint32_t *instr_inst_buf(nullptr);
uint64_t *pdi_buf(nullptr);

assert(aie_agents.empty());
Expand All @@ -164,8 +165,6 @@ int main(int argc, char **argv) {
// Find the AIE agents in the system.
r = hsa_iterate_agents(get_aie_agents, &aie_agents);
assert(r == HSA_STATUS_SUCCESS);
// assert(hsa_iterate_agents(get_cpu_agents, &aie_agents) ==
// HSA_STATUS_SUCCESS);
assert(aie_agents.size() == 1);

const auto &aie_agent = aie_agents.front();
Expand All @@ -190,14 +189,22 @@ int main(int argc, char **argv) {
assert(r == HSA_STATUS_SUCCESS);
assert(global_kernarg_mem_pool.handle);

// Getting the maximum size of the queue so we can submit that many consecutive
// packets.
uint32_t aie_max_queue_size;
r = hsa_agent_get_info(aie_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &aie_max_queue_size);
assert(r == HSA_STATUS_SUCCESS);
int num_pkts = aie_max_queue_size;

// Load the DPU instruction and PDI files into a global pool that doesn't
// support kernel args (DEV BO).
load_dpu_file(global_dev_mem_pool, dpu_inst_file_name,
reinterpret_cast<void **>(&dpu_inst_buf));
uint32_t dpu_handle = 0;
r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle);
uint32_t num_instr;
load_instr_file(global_dev_mem_pool, instr_inst_file_name,
reinterpret_cast<void **>(&instr_inst_buf), num_instr);
uint32_t instr_handle = 0;
r = hsa_amd_get_handle_from_vaddr(instr_inst_buf, &instr_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(dpu_handle != 0);
assert(instr_handle != 0);

load_pdi_file(global_dev_mem_pool, pdi_file_name,
reinterpret_cast<void **>(&pdi_buf));
Expand All @@ -222,85 +229,99 @@ int main(int argc, char **argv) {
constexpr std::size_t data_buffer_size =
num_data_elements * sizeof(std::uint32_t);

std::uint32_t *input = {};
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&input));
assert(r == HSA_STATUS_SUCCESS);
std::uint32_t input_handle = {};
r = hsa_amd_get_handle_from_vaddr(input, &input_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(input_handle != 0);

std::uint32_t *output = {};
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&output));
assert(r == HSA_STATUS_SUCCESS);
std::uint32_t output_handle = {};
r = hsa_amd_get_handle_from_vaddr(output, &output_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(output_handle != 0);
std::vector<uint32_t *> input(num_pkts);
std::vector<uint32_t *> output(num_pkts);
std::vector<hsa_amd_aie_ert_start_kernel_data_t *> cmd_payloads(num_pkts);
std::vector<uint32_t> input_handle(num_pkts);
std::vector<uint32_t> output_handle(num_pkts);

uint64_t wr_idx = 0;
uint64_t packet_id = 0;

for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&input[pkt_iter]));
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_get_handle_from_vaddr(input[pkt_iter], &input_handle[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
assert(input_handle[pkt_iter] != 0);

r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&output[pkt_iter]));
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_get_handle_from_vaddr(output[pkt_iter], &output_handle[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
assert(output_handle[pkt_iter] != 0);

for (std::size_t i = 0; i < num_data_elements; i++) {
*(input[pkt_iter] + i) = i * (pkt_iter + 1);
*(output[pkt_iter] + i) = 0xDEFACE;
}

for (std::size_t i = 0; i < num_data_elements; i++) {
*(input + i) = i;
*(output + i) = 0xDEFACE;
// Getting a slot in the queue
wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
packet_id = wr_idx % aie_queue->size;

// Creating a packet to store the command
hsa_amd_aie_ert_packet_t *cmd_pkt = static_cast<hsa_amd_aie_ert_packet_t *>(
aie_queue->base_address) + packet_id;
assert(r == HSA_STATUS_SUCCESS);
cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
cmd_pkt->count = 0xA; // # of arguments to put in command
cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;

// Creating the payload for the packet
hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_payload));
assert(r == HSA_STATUS_SUCCESS);
// Selecting the PDI to use with this command
cmd_payload->cu_mask = 0x1;
// Transaction opcode
cmd_payload->data[0] = 0x3;
cmd_payload->data[1] = 0x0;
cmd_payload->data[2] = instr_handle;
cmd_payload->data[3] = 0x0;
cmd_payload->data[4] = num_instr;
cmd_payload->data[5] = input_handle[pkt_iter];
cmd_payload->data[6] = 0;
cmd_payload->data[7] = output_handle[pkt_iter];
cmd_payload->data[8] = 0;
cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);

// Keeping track of payloads so we can free them at the end
cmd_payloads[pkt_iter] = cmd_payload;
}

///////////////////////////////////// Creating the cmd packet
// Creating a packet to store the command
hsa_amd_aie_ert_packet_t *cmd_pkt = NULL;
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_pkt));
assert(r == HSA_STATUS_SUCCESS);
cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
cmd_pkt->count = 0xA; // # of arguments to put in command
cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;

// Creating the payload for the packet
hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
uint32_t cmd_handle;
r = hsa_amd_get_handle_from_vaddr(reinterpret_cast<void *>(cmd_pkt),
&cmd_handle);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_payload));
assert(r == HSA_STATUS_SUCCESS);
cmd_payload->cu_mask = 0x1; // Selecting the PDI to use with this command
cmd_payload->data[0] = 0x3; // Transaction opcode
cmd_payload->data[1] = 0x0;
cmd_payload->data[2] = dpu_handle;
cmd_payload->data[3] = 0x0;
cmd_payload->data[4] = 0x44; // Size of DPU instruction
cmd_payload->data[5] = input_handle;
cmd_payload->data[6] = 0;
cmd_payload->data[7] = output_handle;
cmd_payload->data[8] = 0;
cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);

uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
uint64_t packet_id = wr_idx % aie_queue->size;
reinterpret_cast<hsa_amd_aie_ert_packet_t *>(
aie_queue->base_address)[packet_id] = *cmd_pkt;
// Ringing the doorbell to dispatch each packet we added to
// the queue
hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx);

for (std::size_t i = 0; i < num_data_elements; i++) {
const auto expected = *(input + i) + 1;
const auto result = *(output + i);
assert(result == expected);
for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
for (std::size_t i = 0; i < num_data_elements; i++) {
const auto expected = *(input[pkt_iter] + i) + 1;
const auto result = *(output[pkt_iter] + i);
assert(result == expected);
}

r = hsa_amd_memory_pool_free(output[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(input[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(cmd_payloads[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
}

r = hsa_queue_destroy(aie_queue);
assert(r == HSA_STATUS_SUCCESS);

r = hsa_amd_memory_pool_free(output);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(input);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(pdi_buf);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(dpu_inst_buf);
r = hsa_amd_memory_pool_free(instr_inst_buf);
assert(r == HSA_STATUS_SUCCESS);

r = hsa_shut_down();
Expand Down
37 changes: 14 additions & 23 deletions runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,13 @@ constexpr int NON_OPERAND_COUNT = 6;
constexpr int DEV_ADDR_BASE = 0x04000000;
constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF;

// BO size allocated for commands
constexpr int CMD_SIZE = 64;

// This is a temporary workaround. For some reason the first command count in
// a chain needs to be larger than it actually is, presumably because there is
// some other data structure at the beginning.
// TODO: Look more into this
constexpr int FIRST_CMD_COUNT_SIZE_INCREASE = 5;
// The driver places a structure before each command in a command chain, so
// the command size must be increased by the size of that structure.
// The following xdna driver source shows where this is implemented:
// Commit hash: eddd92c0f61592c576a500f16efa24eb23667c23
// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_msg_priv.h#L387-L391
// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637
constexpr int CMD_COUNT_SIZE_INCREASE = 3;

// Index of command payload where the instruction sequence
// address is located
Expand Down Expand Up @@ -311,7 +310,7 @@ hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle,
// Creating the command
amdxdna_drm_create_bo create_cmd_bo = {};
create_cmd_bo.type = AMDXDNA_BO_CMD,
create_cmd_bo.size = CMD_SIZE;
create_cmd_bo.size = size;
if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo))
return HSA_STATUS_ERROR;

Expand Down Expand Up @@ -345,7 +344,6 @@ hsa_status_t AieAqlQueue::SubmitCmd(
// Get the payload information
switch (pkt->opcode) {
case HSA_AMD_AIE_ERT_START_CU: {

std::vector<uint32_t> bo_args;
std::vector<uint32_t> cmd_handles;

Expand Down Expand Up @@ -376,23 +374,17 @@ hsa_status_t AieAqlQueue::SubmitCmd(
// Creating a packet that contains the command to execute the kernel
uint32_t cmd_bo_handle = 0;
amdxdna_cmd *cmd = nullptr;
if (CreateCmd(64, &cmd_bo_handle, &cmd, fd))
uint32_t cmd_size = sizeof(amdxdna_cmd) + pkt->count * sizeof(uint32_t);
if (CreateCmd(cmd_size, &cmd_bo_handle, &cmd, fd))
return HSA_STATUS_ERROR;

// Filling in the fields of the command
cmd->state = pkt->state;
cmd->extra_cu_masks = 0;

// For some reason the first count needs to be a little larger than
// it actually is, assuming there is some other data structure at the
// beginning
// TODO: Look more into this
if (pkt_iter == cur_id) {
cmd->count = pkt->count + FIRST_CMD_COUNT_SIZE_INCREASE;
}
else {
cmd->count = pkt->count;
}
// The driver places a structure before each command in a command chain.
// Need to increase the size of the command by the size of this structure.
cmd->count = pkt->count + CMD_COUNT_SIZE_INCREASE;
cmd->opcode = pkt->opcode;
cmd->data[0] = cmd_pkt_payload->cu_mask;
memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * pkt->count);
Expand All @@ -414,8 +406,7 @@ hsa_status_t AieAqlQueue::SubmitCmd(
// Creating a command chain
cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW;
cmd_chain->extra_cu_masks = 0;
// TODO: Figure out why this is the value
cmd_chain->count = 0xA;
cmd_chain->count = sizeof(amdxdna_cmd_chain) + cmd_handles.size() * sizeof(uint64_t);
cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN;
cmd_chain_payload->command_count = cmd_handles.size();
cmd_chain_payload->submit_index = 0;
Expand Down

0 comments on commit 0757f63

Please sign in to comment.