Fixing command sizing and changing the test to issue the maximum numb…

…er of packets the queue supports. (#23) * Fixed workarounds in the AIE soft queue regarding the size of the command chain and the individual commands. * Added the functionality to aie_hsa_dispatch_test.cc to query the size of the AIE queue and issue the maximum number of packets it supports. * Some additional small fixes on aie_hsa_dispatch_test.cc test. * Adding links to driver source
nod-ai · Sep 12, 2024 · 0757f63 · 0757f63
1 parent c817704
commit 0757f63
Show file tree

Hide file tree

Showing 2 changed files with 113 additions and 101 deletions.
diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc
@@ -111,8 +111,8 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
   bin_file.read(reinterpret_cast<char *>(*buf), size);
 }
 
-void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
-                   void **buf) {
+void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
+                   void **buf, uint32_t &num_instr) {
   std::ifstream bin_file(file_name,
                          std::ios::binary | std::ios::ate | std::ios::in);
 
@@ -129,6 +129,7 @@ void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
   auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf);
   assert(r == HSA_STATUS_SUCCESS);
   std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t));
+  num_instr = pdi_vec.size();
 }
 
 }  // namespace
@@ -144,9 +145,9 @@ int main(int argc, char **argv) {
   hsa_amd_memory_pool_t global_dev_mem_pool{0};
   // System memory pool. Used for allocating kernel argument data.
   hsa_amd_memory_pool_t global_kernarg_mem_pool{0};
-  const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt");
+  const std::string instr_inst_file_name(sourcePath / "add_one_insts.txt");
   const std::string pdi_file_name(sourcePath / "add_one.pdi");
-  uint32_t *dpu_inst_buf(nullptr);
+  uint32_t *instr_inst_buf(nullptr);
   uint64_t *pdi_buf(nullptr);
 
   assert(aie_agents.empty());
@@ -164,8 +165,6 @@ int main(int argc, char **argv) {
   // Find the AIE agents in the system.
   r = hsa_iterate_agents(get_aie_agents, &aie_agents);
   assert(r == HSA_STATUS_SUCCESS);
-  //    assert(hsa_iterate_agents(get_cpu_agents, &aie_agents) ==
-  //    HSA_STATUS_SUCCESS);
   assert(aie_agents.size() == 1);
 
   const auto &aie_agent = aie_agents.front();
@@ -190,14 +189,22 @@ int main(int argc, char **argv) {
   assert(r == HSA_STATUS_SUCCESS);
   assert(global_kernarg_mem_pool.handle);
 
+  // Getting the maximum size of the queue so we can submit that many consecutive
+  // packets.
+  uint32_t aie_max_queue_size;
+  r = hsa_agent_get_info(aie_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &aie_max_queue_size);
+  assert(r == HSA_STATUS_SUCCESS);
+  int num_pkts = aie_max_queue_size;
+
   // Load the DPU and PDI files into a global pool that doesn't support kernel
   // args (DEV BO).
-  load_dpu_file(global_dev_mem_pool, dpu_inst_file_name,
-                reinterpret_cast<void **>(&dpu_inst_buf));
-  uint32_t dpu_handle = 0;
-  r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle);
+  uint32_t num_instr;
+  load_instr_file(global_dev_mem_pool, instr_inst_file_name,
+                reinterpret_cast<void **>(&instr_inst_buf), num_instr);
+  uint32_t instr_handle = 0;
+  r = hsa_amd_get_handle_from_vaddr(instr_inst_buf, &instr_handle);
   assert(r == HSA_STATUS_SUCCESS);
-  assert(dpu_handle != 0);
+  assert(instr_handle != 0);
 
   load_pdi_file(global_dev_mem_pool, pdi_file_name,
                 reinterpret_cast<void **>(&pdi_buf));
@@ -222,85 +229,99 @@ int main(int argc, char **argv) {
   constexpr std::size_t data_buffer_size =
       num_data_elements * sizeof(std::uint32_t);
 
-  std::uint32_t *input = {};
-  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
-                                   reinterpret_cast<void **>(&input));
-  assert(r == HSA_STATUS_SUCCESS);
-  std::uint32_t input_handle = {};
-  r = hsa_amd_get_handle_from_vaddr(input, &input_handle);
-  assert(r == HSA_STATUS_SUCCESS);
-  assert(input_handle != 0);
-
-  std::uint32_t *output = {};
-  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
-                                   reinterpret_cast<void **>(&output));
-  assert(r == HSA_STATUS_SUCCESS);
-  std::uint32_t output_handle = {};
-  r = hsa_amd_get_handle_from_vaddr(output, &output_handle);
-  assert(r == HSA_STATUS_SUCCESS);
-  assert(output_handle != 0);
+  std::vector<uint32_t *> input(num_pkts);
+  std::vector<uint32_t *> output(num_pkts);
+  std::vector<hsa_amd_aie_ert_start_kernel_data_t *> cmd_payloads(num_pkts);
+  std::vector<uint32_t> input_handle(num_pkts);
+  std::vector<uint32_t> output_handle(num_pkts);
+
+  uint64_t wr_idx = 0;
+  uint64_t packet_id = 0;
+
+  for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
+    r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
+                                     reinterpret_cast<void **>(&input[pkt_iter]));
+    assert(r == HSA_STATUS_SUCCESS);
+    r = hsa_amd_get_handle_from_vaddr(input[pkt_iter], &input_handle[pkt_iter]);
+    assert(r == HSA_STATUS_SUCCESS);
+    assert(input_handle[pkt_iter] != 0);
+
+    r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
+                                     reinterpret_cast<void **>(&output[pkt_iter]));
+    assert(r == HSA_STATUS_SUCCESS);
+    r = hsa_amd_get_handle_from_vaddr(output[pkt_iter], &output_handle[pkt_iter]);
+    assert(r == HSA_STATUS_SUCCESS);
+    assert(output_handle[pkt_iter] != 0);
+
+    for (std::size_t i = 0; i < num_data_elements; i++) {
+      *(input[pkt_iter] + i) = i * (pkt_iter + 1);
+      *(output[pkt_iter] + i) = 0xDEFACE;
+    }
 
-  for (std::size_t i = 0; i < num_data_elements; i++) {
-    *(input + i) = i;
-    *(output + i) = 0xDEFACE;
+    // Getting a slot in the queue
+    wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
+    packet_id = wr_idx % aie_queue->size;
+
+    // Creating a packet to store the command
+    hsa_amd_aie_ert_packet_t *cmd_pkt = static_cast<hsa_amd_aie_ert_packet_t *>(
+        aie_queue->base_address) + packet_id;
+    assert(r == HSA_STATUS_SUCCESS);
+    cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
+    cmd_pkt->count = 0xA;  // # of arguments to put in command
+    cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
+    cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
+    cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
+                             << HSA_PACKET_HEADER_TYPE;
+
+    // Creating the payload for the packet
+    hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
+    assert(r == HSA_STATUS_SUCCESS);
+    r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
+                                     reinterpret_cast<void **>(&cmd_payload));
+    assert(r == HSA_STATUS_SUCCESS);
+    // Selecting the PDI to use with this command
+    cmd_payload->cu_mask = 0x1;
+    // Transaction opcode
+    cmd_payload->data[0] = 0x3;
+    cmd_payload->data[1] = 0x0;
+    cmd_payload->data[2] = instr_handle;
+    cmd_payload->data[3] = 0x0;
+    cmd_payload->data[4] = num_instr;
+    cmd_payload->data[5] = input_handle[pkt_iter];
+    cmd_payload->data[6] = 0;
+    cmd_payload->data[7] = output_handle[pkt_iter];
+    cmd_payload->data[8] = 0;
+    cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);
+
+    // Keeping track of payloads so we can free them at the end
+    cmd_payloads[pkt_iter] = cmd_payload;
   }
 
-  ///////////////////////////////////// Creating the cmd packet
-  // Creating a packet to store the command
-  hsa_amd_aie_ert_packet_t *cmd_pkt = NULL;
-  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
-                                   reinterpret_cast<void **>(&cmd_pkt));
-  assert(r == HSA_STATUS_SUCCESS);
-  cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
-  cmd_pkt->count = 0xA;  // # of arguments to put in command
-  cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
-  cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
-  cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
-                           << HSA_PACKET_HEADER_TYPE;
-
-  // Creating the payload for the packet
-  hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
-  uint32_t cmd_handle;
-  r = hsa_amd_get_handle_from_vaddr(reinterpret_cast<void *>(cmd_pkt),
-                                    &cmd_handle);
-  assert(r == HSA_STATUS_SUCCESS);
-  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
-                                   reinterpret_cast<void **>(&cmd_payload));
-  assert(r == HSA_STATUS_SUCCESS);
-  cmd_payload->cu_mask = 0x1;  // Selecting the PDI to use with this command
-  cmd_payload->data[0] = 0x3;  // Transaction opcode
-  cmd_payload->data[1] = 0x0;
-  cmd_payload->data[2] = dpu_handle;
-  cmd_payload->data[3] = 0x0;
-  cmd_payload->data[4] = 0x44;  // Size of DPU instruction
-  cmd_payload->data[5] = input_handle;
-  cmd_payload->data[6] = 0;
-  cmd_payload->data[7] = output_handle;
-  cmd_payload->data[8] = 0;
-  cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);
-
-  uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
-  uint64_t packet_id = wr_idx % aie_queue->size;
-  reinterpret_cast<hsa_amd_aie_ert_packet_t *>(
-      aie_queue->base_address)[packet_id] = *cmd_pkt;
+  // Ringing the doorbell to dispatch each packet we added to
+  // the queue
   hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx);
 
-  for (std::size_t i = 0; i < num_data_elements; i++) {
-    const auto expected = *(input + i) + 1;
-    const auto result = *(output + i);
-    assert(result == expected);
+  for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
+    for (std::size_t i = 0; i < num_data_elements; i++) {
+      const auto expected = *(input[pkt_iter] + i) + 1;
+      const auto result = *(output[pkt_iter] + i);
+      assert(result == expected);
+    }
+
+    r = hsa_amd_memory_pool_free(output[pkt_iter]);
+    assert(r == HSA_STATUS_SUCCESS);
+    r = hsa_amd_memory_pool_free(input[pkt_iter]);
+    assert(r == HSA_STATUS_SUCCESS);
+    r = hsa_amd_memory_pool_free(cmd_payloads[pkt_iter]);
+    assert(r == HSA_STATUS_SUCCESS);
   }
 
   r = hsa_queue_destroy(aie_queue);
   assert(r == HSA_STATUS_SUCCESS);
 
-  r = hsa_amd_memory_pool_free(output);
-  assert(r == HSA_STATUS_SUCCESS);
-  r = hsa_amd_memory_pool_free(input);
-  assert(r == HSA_STATUS_SUCCESS);
   r = hsa_amd_memory_pool_free(pdi_buf);
   assert(r == HSA_STATUS_SUCCESS);
-  r = hsa_amd_memory_pool_free(dpu_inst_buf);
+  r = hsa_amd_memory_pool_free(instr_inst_buf);
   assert(r == HSA_STATUS_SUCCESS);
 
   r = hsa_shut_down();

diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
@@ -68,14 +68,13 @@ constexpr int NON_OPERAND_COUNT = 6;
 constexpr int DEV_ADDR_BASE = 0x04000000;
 constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF;
 
-// BO size allocated for commands
-constexpr int CMD_SIZE = 64;
-
-// This is a temp workaround. For some reason the first command count in a chain
-// needs to be a larger than it actually is, assuming there is some other data 
-// structure at the beginning
-// TODO: Look more into this
-constexpr int FIRST_CMD_COUNT_SIZE_INCREASE = 5;
+// The driver places a structure before each command in a command chain.
+// Need to increase the size of the command by the size of this structure.
+// In the following xdna driver source can see where this is implemented:
+// Commit hash: eddd92c0f61592c576a500f16efa24eb23667c23
+// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_msg_priv.h#L387-L391
+// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637
+constexpr int CMD_COUNT_SIZE_INCREASE = 3;
 
 // Index of command payload where the instruction sequence 
 // address is located 
@@ -311,7 +310,7 @@ hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle,
   // Creating the command
   amdxdna_drm_create_bo create_cmd_bo = {};
   create_cmd_bo.type = AMDXDNA_BO_CMD,
-  create_cmd_bo.size = CMD_SIZE;
+  create_cmd_bo.size = size;
   if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo))
     return HSA_STATUS_ERROR;
 
@@ -345,7 +344,6 @@ hsa_status_t AieAqlQueue::SubmitCmd(
     // Get the payload information
     switch (pkt->opcode) {
       case HSA_AMD_AIE_ERT_START_CU: {
-
         std::vector<uint32_t> bo_args;
         std::vector<uint32_t> cmd_handles;
 
@@ -376,23 +374,17 @@ hsa_status_t AieAqlQueue::SubmitCmd(
           // Creating a packet that contains the command to execute the kernel
           uint32_t cmd_bo_handle = 0;
           amdxdna_cmd *cmd = nullptr;
-          if (CreateCmd(64, &cmd_bo_handle, &cmd, fd))
+          uint32_t cmd_size = sizeof(amdxdna_cmd) + pkt->count * sizeof(uint32_t);
+          if (CreateCmd(cmd_size, &cmd_bo_handle, &cmd, fd))
             return HSA_STATUS_ERROR;
 
           // Filling in the fields of the command
           cmd->state = pkt->state;
           cmd->extra_cu_masks = 0;
 
-          // For some reason the first count needs to be a little larger than
-          // it actually is, assuming there is some other data structure at the
-          // beginning
-          // TODO: Look more into this
-          if (pkt_iter == cur_id) {
-            cmd->count = pkt->count + FIRST_CMD_COUNT_SIZE_INCREASE;
-          }
-          else {
-            cmd->count = pkt->count;
-          }
+          // The driver places a structure before each command in a command chain.
+          // Need to increase the size of the command by the size of this structure.
+          cmd->count = pkt->count + CMD_COUNT_SIZE_INCREASE;
           cmd->opcode = pkt->opcode;
           cmd->data[0] = cmd_pkt_payload->cu_mask;
           memcpy((cmd->data + 1),  cmd_pkt_payload->data, 4 * pkt->count);
@@ -414,8 +406,7 @@ hsa_status_t AieAqlQueue::SubmitCmd(
         // Creating a command chain
         cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW;
         cmd_chain->extra_cu_masks = 0;
-        // TODO: Figure out why this is the value
-        cmd_chain->count = 0xA;
+        cmd_chain->count = sizeof(amdxdna_cmd_chain) + cmd_handles.size() * sizeof(uint64_t);
         cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN;
         cmd_chain_payload->command_count = cmd_handles.size();
         cmd_chain_payload->submit_index = 0;