Skip to content
This repository has been archived by the owner on Dec 24, 2024. It is now read-only.

Commit

Permalink
Merge branch 'iree-aie' into ypapadop-amd/fix-driver-cast
Browse files Browse the repository at this point in the history
  • Loading branch information
ypapadop-amd authored Sep 12, 2024
2 parents 6c1af4c + 0757f63 commit 3c59b0a
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 101 deletions.
177 changes: 99 additions & 78 deletions rocrtst/suites/aie/aie_hsa_dispatch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
bin_file.read(reinterpret_cast<char *>(*buf), size);
}

void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
void **buf) {
void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
void **buf, uint32_t &num_instr) {
std::ifstream bin_file(file_name,
std::ios::binary | std::ios::ate | std::ios::in);

Expand All @@ -129,6 +129,7 @@ void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf);
assert(r == HSA_STATUS_SUCCESS);
std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t));
num_instr = pdi_vec.size();
}

} // namespace
Expand All @@ -144,9 +145,9 @@ int main(int argc, char **argv) {
hsa_amd_memory_pool_t global_dev_mem_pool{0};
// System memory pool. Used for allocating kernel argument data.
hsa_amd_memory_pool_t global_kernarg_mem_pool{0};
const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt");
const std::string instr_inst_file_name(sourcePath / "add_one_insts.txt");
const std::string pdi_file_name(sourcePath / "add_one.pdi");
uint32_t *dpu_inst_buf(nullptr);
uint32_t *instr_inst_buf(nullptr);
uint64_t *pdi_buf(nullptr);

assert(aie_agents.empty());
Expand All @@ -164,8 +165,6 @@ int main(int argc, char **argv) {
// Find the AIE agents in the system.
r = hsa_iterate_agents(get_aie_agents, &aie_agents);
assert(r == HSA_STATUS_SUCCESS);
// assert(hsa_iterate_agents(get_cpu_agents, &aie_agents) ==
// HSA_STATUS_SUCCESS);
assert(aie_agents.size() == 1);

const auto &aie_agent = aie_agents.front();
Expand All @@ -190,14 +189,22 @@ int main(int argc, char **argv) {
assert(r == HSA_STATUS_SUCCESS);
assert(global_kernarg_mem_pool.handle);

// Getting the maximum size of the queue so we can submit that many consecutive
// packets.
uint32_t aie_max_queue_size;
r = hsa_agent_get_info(aie_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &aie_max_queue_size);
assert(r == HSA_STATUS_SUCCESS);
int num_pkts = aie_max_queue_size;

// Load the DPU and PDI files into a global pool that doesn't support kernel
// args (DEV BO).
load_dpu_file(global_dev_mem_pool, dpu_inst_file_name,
reinterpret_cast<void **>(&dpu_inst_buf));
uint32_t dpu_handle = 0;
r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle);
uint32_t num_instr;
load_instr_file(global_dev_mem_pool, instr_inst_file_name,
reinterpret_cast<void **>(&instr_inst_buf), num_instr);
uint32_t instr_handle = 0;
r = hsa_amd_get_handle_from_vaddr(instr_inst_buf, &instr_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(dpu_handle != 0);
assert(instr_handle != 0);

load_pdi_file(global_dev_mem_pool, pdi_file_name,
reinterpret_cast<void **>(&pdi_buf));
Expand All @@ -222,85 +229,99 @@ int main(int argc, char **argv) {
constexpr std::size_t data_buffer_size =
num_data_elements * sizeof(std::uint32_t);

std::uint32_t *input = {};
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&input));
assert(r == HSA_STATUS_SUCCESS);
std::uint32_t input_handle = {};
r = hsa_amd_get_handle_from_vaddr(input, &input_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(input_handle != 0);

std::uint32_t *output = {};
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&output));
assert(r == HSA_STATUS_SUCCESS);
std::uint32_t output_handle = {};
r = hsa_amd_get_handle_from_vaddr(output, &output_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(output_handle != 0);
std::vector<uint32_t *> input(num_pkts);
std::vector<uint32_t *> output(num_pkts);
std::vector<hsa_amd_aie_ert_start_kernel_data_t *> cmd_payloads(num_pkts);
std::vector<uint32_t> input_handle(num_pkts);
std::vector<uint32_t> output_handle(num_pkts);

uint64_t wr_idx = 0;
uint64_t packet_id = 0;

for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&input[pkt_iter]));
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_get_handle_from_vaddr(input[pkt_iter], &input_handle[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
assert(input_handle[pkt_iter] != 0);

r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&output[pkt_iter]));
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_get_handle_from_vaddr(output[pkt_iter], &output_handle[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
assert(output_handle[pkt_iter] != 0);

for (std::size_t i = 0; i < num_data_elements; i++) {
*(input[pkt_iter] + i) = i * (pkt_iter + 1);
*(output[pkt_iter] + i) = 0xDEFACE;
}

for (std::size_t i = 0; i < num_data_elements; i++) {
*(input + i) = i;
*(output + i) = 0xDEFACE;
// Getting a slot in the queue
wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
packet_id = wr_idx % aie_queue->size;

// Creating a packet to store the command
hsa_amd_aie_ert_packet_t *cmd_pkt = static_cast<hsa_amd_aie_ert_packet_t *>(
aie_queue->base_address) + packet_id;
assert(r == HSA_STATUS_SUCCESS);
cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
cmd_pkt->count = 0xA; // # of arguments to put in command
cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;

// Creating the payload for the packet
hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_payload));
assert(r == HSA_STATUS_SUCCESS);
// Selecting the PDI to use with this command
cmd_payload->cu_mask = 0x1;
// Transaction opcode
cmd_payload->data[0] = 0x3;
cmd_payload->data[1] = 0x0;
cmd_payload->data[2] = instr_handle;
cmd_payload->data[3] = 0x0;
cmd_payload->data[4] = num_instr;
cmd_payload->data[5] = input_handle[pkt_iter];
cmd_payload->data[6] = 0;
cmd_payload->data[7] = output_handle[pkt_iter];
cmd_payload->data[8] = 0;
cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);

// Keeping track of payloads so we can free them at the end
cmd_payloads[pkt_iter] = cmd_payload;
}

///////////////////////////////////// Creating the cmd packet
// Creating a packet to store the command
hsa_amd_aie_ert_packet_t *cmd_pkt = NULL;
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_pkt));
assert(r == HSA_STATUS_SUCCESS);
cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
cmd_pkt->count = 0xA; // # of arguments to put in command
cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;

// Creating the payload for the packet
hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
uint32_t cmd_handle;
r = hsa_amd_get_handle_from_vaddr(reinterpret_cast<void *>(cmd_pkt),
&cmd_handle);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_payload));
assert(r == HSA_STATUS_SUCCESS);
cmd_payload->cu_mask = 0x1; // Selecting the PDI to use with this command
cmd_payload->data[0] = 0x3; // Transaction opcode
cmd_payload->data[1] = 0x0;
cmd_payload->data[2] = dpu_handle;
cmd_payload->data[3] = 0x0;
cmd_payload->data[4] = 0x44; // Size of DPU instruction
cmd_payload->data[5] = input_handle;
cmd_payload->data[6] = 0;
cmd_payload->data[7] = output_handle;
cmd_payload->data[8] = 0;
cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);

uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
uint64_t packet_id = wr_idx % aie_queue->size;
reinterpret_cast<hsa_amd_aie_ert_packet_t *>(
aie_queue->base_address)[packet_id] = *cmd_pkt;
// Ringing the doorbell to dispatch each packet we added to
// the queue
hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx);

for (std::size_t i = 0; i < num_data_elements; i++) {
const auto expected = *(input + i) + 1;
const auto result = *(output + i);
assert(result == expected);
for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
for (std::size_t i = 0; i < num_data_elements; i++) {
const auto expected = *(input[pkt_iter] + i) + 1;
const auto result = *(output[pkt_iter] + i);
assert(result == expected);
}

r = hsa_amd_memory_pool_free(output[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(input[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(cmd_payloads[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
}

r = hsa_queue_destroy(aie_queue);
assert(r == HSA_STATUS_SUCCESS);

r = hsa_amd_memory_pool_free(output);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(input);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(pdi_buf);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(dpu_inst_buf);
r = hsa_amd_memory_pool_free(instr_inst_buf);
assert(r == HSA_STATUS_SUCCESS);

r = hsa_shut_down();
Expand Down
37 changes: 14 additions & 23 deletions runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,13 @@ constexpr int NON_OPERAND_COUNT = 6;
constexpr int DEV_ADDR_BASE = 0x04000000;
constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF;

// BO size allocated for commands
constexpr int CMD_SIZE = 64;

// This is a temp workaround. For some reason the first command count in a chain
// needs to be a larger than it actually is, assuming there is some other data
// structure at the beginning
// TODO: Look more into this
constexpr int FIRST_CMD_COUNT_SIZE_INCREASE = 5;
// The driver places a structure before each command in a command chain.
// Need to increase the size of the command by the size of this structure.
// In the following xdna driver source can see where this is implemented:
// Commit hash: eddd92c0f61592c576a500f16efa24eb23667c23
// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_msg_priv.h#L387-L391
// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637
constexpr int CMD_COUNT_SIZE_INCREASE = 3;

// Index of command payload where the instruction sequence
// address is located
Expand Down Expand Up @@ -311,7 +310,7 @@ hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle,
// Creating the command
amdxdna_drm_create_bo create_cmd_bo = {};
create_cmd_bo.type = AMDXDNA_BO_CMD,
create_cmd_bo.size = CMD_SIZE;
create_cmd_bo.size = size;
if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo))
return HSA_STATUS_ERROR;

Expand Down Expand Up @@ -345,7 +344,6 @@ hsa_status_t AieAqlQueue::SubmitCmd(
// Get the payload information
switch (pkt->opcode) {
case HSA_AMD_AIE_ERT_START_CU: {

std::vector<uint32_t> bo_args;
std::vector<uint32_t> cmd_handles;

Expand Down Expand Up @@ -376,23 +374,17 @@ hsa_status_t AieAqlQueue::SubmitCmd(
// Creating a packet that contains the command to execute the kernel
uint32_t cmd_bo_handle = 0;
amdxdna_cmd *cmd = nullptr;
if (CreateCmd(64, &cmd_bo_handle, &cmd, fd))
uint32_t cmd_size = sizeof(amdxdna_cmd) + pkt->count * sizeof(uint32_t);
if (CreateCmd(cmd_size, &cmd_bo_handle, &cmd, fd))
return HSA_STATUS_ERROR;

// Filling in the fields of the command
cmd->state = pkt->state;
cmd->extra_cu_masks = 0;

// For some reason the first count needs to be a little larger than
// it actually is, assuming there is some other data structure at the
// beginning
// TODO: Look more into this
if (pkt_iter == cur_id) {
cmd->count = pkt->count + FIRST_CMD_COUNT_SIZE_INCREASE;
}
else {
cmd->count = pkt->count;
}
// The driver places a structure before each command in a command chain.
// Need to increase the size of the command by the size of this structure.
cmd->count = pkt->count + CMD_COUNT_SIZE_INCREASE;
cmd->opcode = pkt->opcode;
cmd->data[0] = cmd_pkt_payload->cu_mask;
memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * pkt->count);
Expand All @@ -414,8 +406,7 @@ hsa_status_t AieAqlQueue::SubmitCmd(
// Creating a command chain
cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW;
cmd_chain->extra_cu_masks = 0;
// TODO: Figure out why this is the value
cmd_chain->count = 0xA;
cmd_chain->count = sizeof(amdxdna_cmd_chain) + cmd_handles.size() * sizeof(uint64_t);
cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN;
cmd_chain_payload->command_count = cmd_handles.size();
cmd_chain_payload->submit_index = 0;
Expand Down

0 comments on commit 3c59b0a

Please sign in to comment.