From 824017296657eb27cfd78eb28878fdee5636c74c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20K=2E=20Guti=C3=A9rrez?= Date: Wed, 20 Mar 2024 12:01:34 -0600 Subject: [PATCH] Checkpoint restructuring of hardware resource types. (#98) Signed-off-by: Samuel K. Gutierrez --- src/CMakeLists.txt | 3 +- src/qvi-context.cc | 2 +- src/qvi-devinfo.h | 98 ---------------------------------------------- src/qvi-hwloc.cc | 38 +++++++++--------- src/qvi-hwpool.cc | 31 +++++++++++---- src/qvi-hwpool.h | 57 +++++++++++++++++++++++++-- src/qvi-macros.h | 2 +- src/qvi-rmi.cc | 16 ++++---- src/qvi-scope.cc | 9 +++-- 9 files changed, 112 insertions(+), 144 deletions(-) delete mode 100644 src/qvi-devinfo.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9d487e19..77f855e9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023 Triad National Security, LLC +# Copyright (c) 2020-2024 Triad National Security, LLC # All rights reserved. # # Copyright (c) 2020-2021 Lawrence Livermore National Security, LLC @@ -29,7 +29,6 @@ add_library( qvi-bbuff-rmi.h qvi-context.h qvi-task.h - qvi-devinfo.h qvi-group.h qvi-map.h qvi-scope.h diff --git a/src/qvi-context.cc b/src/qvi-context.cc index 1a02f21f..9f036e34 100644 --- a/src/qvi-context.cc +++ b/src/qvi-context.cc @@ -15,8 +15,8 @@ */ #include "qvi-common.h" // IWYU pragma: keep - #include "qvi-context.h" +#include "qvi-utils.h" int qvi_context_new( diff --git a/src/qvi-devinfo.h b/src/qvi-devinfo.h deleted file mode 100644 index d0d3e2d9..00000000 --- a/src/qvi-devinfo.h +++ /dev/null @@ -1,98 +0,0 @@ -/* -*- Mode: C++; c-basic-offset:4; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2022-2024 Triad National Security, LLC - * All rights reserved. - * - * This file is part of the quo-vadis project. See the LICENSE file at the - * top-level directory of this distribution. - */ - -/** - * @file qvi-devinfo.cc - * - * Device information. - */ - -#ifndef QVI_DEVINFO_H -#define QVI_DEVINFO_H - -#include "qvi-common.h" -#include "qvi-hwloc.h" -#include "qvi-utils.h" - -/** Device information. */ -struct qvi_devinfo_t { - int qvim_rc = QV_ERR_INTERNAL; - /** Device type. */ - qv_hw_obj_type_t type = QV_HW_OBJ_LAST; - /** Device ID. */ - int id = 0; - /** The PCI bus ID. */ - char *pci_bus_id = nullptr; - /** UUID */ - char *uuid = nullptr; - /** The bitmap encoding CPU affinity. */ - hwloc_bitmap_t affinity = nullptr; - /** Constructor */ - qvi_devinfo_t( - qv_hw_obj_type_t t, - int i, - cstr_t pci_bus_id, - cstr_t uuid, - hwloc_const_cpuset_t c - ) : type(t) - , id(i) - { - int nw = asprintf(&this->pci_bus_id, "%s", pci_bus_id); - if (nw == -1) { - qvim_rc = QV_ERR_OOR; - return; - } - - nw = asprintf(&this->uuid, "%s", uuid); - if (nw == -1) { - qvim_rc = QV_ERR_OOR; - return; - } - - qvim_rc = qvi_hwloc_bitmap_dup(c, &affinity); - } - /** Destructor */ - ~qvi_devinfo_t(void) - { - qvi_hwloc_bitmap_free(&affinity); - free(pci_bus_id); - free(uuid); - } - /** Equality operator. */ - bool - operator==(const qvi_devinfo_t &x) const - { - return id == x.id && type == x.type; - } -}; - -/** - * Extend namespace std so we can easily add qvi_devinfo_ts to - * unordered_sets. - */ -namespace std { - template <> - struct hash - { - size_t - operator()(const qvi_devinfo_t &x) const - { - const int a = x.id; - const int b = (int)x.type; - const int64_t c = qvi_cantor_pairing(a, b); - return hash()(c); - } - }; -} - -#endif - -/* - * vim: ft=cpp ts=4 sts=4 sw=4 expandtab - */ diff --git a/src/qvi-hwloc.cc b/src/qvi-hwloc.cc index be535f9b..626bf35e 100644 --- a/src/qvi-hwloc.cc +++ b/src/qvi-hwloc.cc @@ -21,9 +21,9 @@ #include "qvi-nvml.h" #include "qvi-rsmi.h" -static constexpr int pci_bus_id_buff_size = 16; -static constexpr int dev_name_buff_size = 32; -static constexpr int uuid_buff_size = 64; +static constexpr int PCI_BUS_ID_BUFF_SIZE = 16; +static constexpr int DEV_NAME_BUFF_SIZE = 32; +static constexpr int UUID_BUFF_SIZE = 64; /** Device list type. */ using qvi_hwloc_dev_list_t = std::vector< @@ -70,11 +70,11 @@ typedef struct qvi_hwloc_device_s { /** CUDA/ROCm visible devices ID */ int visdev_id = QVI_HWLOC_DEVICE_INVISIBLE_ID; /** Device name */ - char name[dev_name_buff_size] = {'\0'}; + char name[DEV_NAME_BUFF_SIZE] = {'\0'}; /** PCI bus ID */ - char pci_bus_id[pci_bus_id_buff_size] = {'\0'}; + char pci_bus_id[PCI_BUS_ID_BUFF_SIZE] = {'\0'}; /** UUID */ - char uuid[uuid_buff_size] = {'\0'}; + char uuid[UUID_BUFF_SIZE] = {'\0'}; /** Constructor */ qvi_hwloc_device_s(void) { @@ -377,19 +377,19 @@ set_general_device_info( // Save device name. int nw = snprintf( device->name, - dev_name_buff_size, + DEV_NAME_BUFF_SIZE, "%s", obj->name ); - if (nw >= dev_name_buff_size) { + if (nw >= DEV_NAME_BUFF_SIZE) { return QV_ERR_INTERNAL; } // Set the PCI bus ID. nw = snprintf( device->pci_bus_id, - pci_bus_id_buff_size, + PCI_BUS_ID_BUFF_SIZE, "%s", pci_bus_id ); - if (nw >= pci_bus_id_buff_size) { + if (nw >= PCI_BUS_ID_BUFF_SIZE) { return QV_ERR_INTERNAL; } // Set visible device ID, if applicable. @@ -407,10 +407,10 @@ set_gpu_device_info( if (sscanf(obj->name, "rsmi%d", &id) == 1) { device->smi = id; int nw = snprintf( - device->uuid, uuid_buff_size, "%s", + device->uuid, UUID_BUFF_SIZE, "%s", hwloc_obj_get_info_by_name(obj, "AMDUUID") ); - if (nw >= uuid_buff_size) { + if (nw >= UUID_BUFF_SIZE) { return QV_ERR_INTERNAL; } return qvi_hwloc_rsmi_get_device_cpuset_by_device_id( @@ -423,10 +423,10 @@ set_gpu_device_info( if (sscanf(obj->name, "nvml%d", &id) == 1) { device->smi = id; int nw = snprintf( - device->uuid, uuid_buff_size, "%s", + device->uuid, UUID_BUFF_SIZE, "%s", hwloc_obj_get_info_by_name(obj, "NVIDIAUUID") ); - if (nw >= uuid_buff_size) { + if (nw >= UUID_BUFF_SIZE) { return QV_ERR_INTERNAL; } return qvi_hwloc_nvml_get_device_cpuset_by_pci_bus_id( @@ -446,11 +446,11 @@ set_of_device_info( ) { // TODO(skg) Get cpuset, if available. int nw = snprintf( - device->uuid, uuid_buff_size, "%s", + device->uuid, UUID_BUFF_SIZE, "%s", hwloc_obj_get_info_by_name(obj, "NodeGUID") ); // Internal error because our buffer is too small. - if (nw >= uuid_buff_size) return QV_ERR_INTERNAL; + if (nw >= UUID_BUFF_SIZE) return QV_ERR_INTERNAL; return QV_SUCCESS; } @@ -471,7 +471,7 @@ discover_all_devices( continue; } // Try to get the PCI object. - char busid[pci_bus_id_buff_size] = {'\0'}; + char busid[PCI_BUS_ID_BUFF_SIZE] = {'\0'}; hwloc_obj_t pci_obj = get_pci_busid(obj, busid, sizeof(busid)); if (!pci_obj) continue; // Have we seen this device already? For example, opencl0d0 and cuda0 @@ -509,7 +509,7 @@ discover_gpu_devices( continue; } // Try to get the PCI object. - char busid[pci_bus_id_buff_size] = {'\0'}; + char busid[PCI_BUS_ID_BUFF_SIZE] = {'\0'}; hwloc_obj_t pci_obj = get_pci_busid(obj, busid, sizeof(busid)); if (!pci_obj) continue; @@ -573,7 +573,7 @@ discover_nic_devices( while ((obj = hwloc_get_next_osdev(hwl->topo, obj)) != nullptr) { if (obj->attr->osdev.type != HWLOC_OBJ_OSDEV_OPENFABRICS) continue; // Try to get the PCI object. - char busid[pci_bus_id_buff_size] = {'\0'}; + char busid[PCI_BUS_ID_BUFF_SIZE] = {'\0'}; hwloc_obj_t pci_obj = get_pci_busid(obj, busid, sizeof(busid)); if (!pci_obj) continue; diff --git a/src/qvi-hwpool.cc b/src/qvi-hwpool.cc index c42bc668..e4ffe971 100644 --- a/src/qvi-hwpool.cc +++ b/src/qvi-hwpool.cc @@ -42,8 +42,6 @@ // is zero, then the resource is not in use. For devices, we can take a similar // approach using the device IDs instead of the bit positions. -#include "qvi-common.h" - #include "qvi-hwpool.h" #include "qvi-hwloc.h" #include "qvi-utils.h" @@ -93,14 +91,13 @@ struct qvi_hwpool_cpus_s : qvi_hwpool_resource_s { int qvim_rc = QV_ERR_INTERNAL; /** The cpuset of the maintained CPUs. */ qvi_hwloc_bitmap_t cpuset; - + /** Constructor */ qvi_hwpool_cpus_s(void) { qvim_rc = qvi_construct_rc(cpuset); } - - virtual - ~qvi_hwpool_cpus_s(void) = default; + /** Destructor */ + virtual ~qvi_hwpool_cpus_s(void) = default; }; struct qvi_hwpool_s { @@ -109,7 +106,6 @@ struct qvi_hwpool_s { qvi_hwpool_cpus_s cpus; /** Device information. */ qvi_hwpool_devinfos_t devinfos; - // TODO(skg) Add owner to structure? /** The obtained cpuset of this resource pool. */ hwloc_bitmap_t obcpuset = nullptr; @@ -248,7 +244,7 @@ qvi_hwpool_add_device( cstr_t uuid, hwloc_const_cpuset_t affinity ) { - auto dinfo = std::make_shared( + auto dinfo = std::make_shared( type, id, pcibid, uuid, affinity ); const int rc = qvi_construct_rc(dinfo); @@ -485,6 +481,25 @@ qvi_hwpool_unpack( return rc; } +/** + * Extend namespace std so we can easily add qvi_devinfo_ts to + * unordered_sets. + */ +namespace std { + template <> + struct hash + { + size_t + operator()(const qvi_hwpool_devinfo_s &x) const + { + const int a = x.id; + const int b = (int)x.type; + const int64_t c = qvi_cantor_pairing(a, b); + return hash()(c); + } + }; +} + /* * vim: ft=cpp ts=4 sts=4 sw=4 expandtab */ diff --git a/src/qvi-hwpool.h b/src/qvi-hwpool.h index 1edfa1ad..c76540c2 100644 --- a/src/qvi-hwpool.h +++ b/src/qvi-hwpool.h @@ -1,6 +1,6 @@ /* -*- Mode: C++; c-basic-offset:4; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2022 Triad National Security, LLC + * Copyright (c) 2022-2024 Triad National Security, LLC * All rights reserved. * * This file is part of the quo-vadis project. See the LICENSE file at the @@ -17,11 +17,62 @@ #define QVI_HWPOOL_H #include "qvi-common.h" -#include "qvi-devinfo.h" #include "qvi-line.h" +/** Device information. */ +struct qvi_hwpool_devinfo_s { + int qvim_rc = QV_ERR_INTERNAL; + /** Device type. */ + qv_hw_obj_type_t type = QV_HW_OBJ_LAST; + /** Device ID. */ + int id = 0; + /** The PCI bus ID. */ + char *pci_bus_id = nullptr; + /** UUID */ + char *uuid = nullptr; + /** The bitmap encoding CPU affinity. */ + hwloc_bitmap_t affinity = nullptr; + /** Constructor */ + qvi_hwpool_devinfo_s( + qv_hw_obj_type_t t, + int i, + cstr_t pci_bus_id, + cstr_t uuid, + hwloc_const_cpuset_t c + ) : type(t) + , id(i) + { + int nw = asprintf(&this->pci_bus_id, "%s", pci_bus_id); + if (nw == -1) { + qvim_rc = QV_ERR_OOR; + return; + } + + nw = asprintf(&this->uuid, "%s", uuid); + if (nw == -1) { + qvim_rc = QV_ERR_OOR; + return; + } + + qvim_rc = qvi_hwloc_bitmap_dup(c, &affinity); + } + /** Destructor */ + ~qvi_hwpool_devinfo_s(void) + { + qvi_hwloc_bitmap_free(&affinity); + free(pci_bus_id); + free(uuid); + } + /** Equality operator. */ + bool + operator==(const qvi_hwpool_devinfo_s &x) const + { + return id == x.id && type == x.type; + } +}; + using qvi_hwpool_devinfos_t = std::multimap< - int, std::shared_ptr + int, std::shared_ptr >; struct qvi_hwpool_s; diff --git a/src/qvi-macros.h b/src/qvi-macros.h index 501e8d98..b520cb3f 100644 --- a/src/qvi-macros.h +++ b/src/qvi-macros.h @@ -31,7 +31,7 @@ do { \ } while (0) /** - * Convenience wrapper around new(std::nothrow). + * Convenience macro for new(std::nothrow). */ #define qvi_new new(std::nothrow) diff --git a/src/qvi-rmi.cc b/src/qvi-rmi.cc index bd3b9b3c..3cfaf607 100644 --- a/src/qvi-rmi.cc +++ b/src/qvi-rmi.cc @@ -33,7 +33,7 @@ #include "zmq.h" -static const cstr_t zinproc_addr = "inproc://qvi-rmi-workers"; +static const cstr_t ZINPROC_ADDR = "inproc://qvi-rmi-workers"; struct qvi_rmi_server_s { /** Server configuration */ @@ -117,7 +117,7 @@ zctx_destroy( ) { void *ictx = *ctx; if (!ictx) return; - int rc = zmq_ctx_destroy(ictx); + const int rc = zmq_ctx_destroy(ictx); if (rc != 0) qvi_zwrn_msg("zmq_ctx_destroy() failed", errno); *ctx = nullptr; } @@ -128,7 +128,7 @@ zsocket_close( ) { void *isock = *sock; if (!isock) return; - int rc = zmq_close(isock); + const int rc = zmq_close(isock); if (rc != 0) qvi_zwrn_msg("zmq_close() failed", errno); *sock = nullptr; } @@ -142,7 +142,7 @@ zsocket_create_and_connect( void *zsock = zmq_socket(zctx, sock_type); if (!zsock) qvi_zerr_msg("zmq_socket() failed", errno); - int rc = zmq_connect(zsock, addr); + const int rc = zmq_connect(zsock, addr); if (rc != 0) { qvi_zerr_msg("zmq_connect() failed", errno); zsocket_close(&zsock); @@ -160,7 +160,7 @@ zsocket_create_and_bind( void *zsock = zmq_socket(zctx, sock_type); if (!zsock) qvi_zerr_msg("zmq_socket() failed", errno); - int rc = zmq_bind(zsock, addr); + const int rc = zmq_bind(zsock, addr); if (rc != 0) { qvi_zerr_msg("zmq_bind() failed", errno); zsocket_close(&zsock); @@ -230,7 +230,7 @@ zmsg_init_from_bbuff( zmq_msg_t *zmsg ) { const size_t buffer_size = qvi_bbuff_size(bbuff); - int zrc = zmq_msg_init_data( + const int zrc = zmq_msg_init_data( zmsg, qvi_bbuff_data(bbuff), buffer_size, @@ -731,7 +731,7 @@ server_go( qvi_rmi_server_t *server = (qvi_rmi_server_t *)data; void *zworksock = zsocket_create_and_connect( - server->zctx, ZMQ_REP, zinproc_addr + server->zctx, ZMQ_REP, ZINPROC_ADDR ); if (!zworksock) return nullptr; @@ -878,7 +878,7 @@ server_start_threads( } void *workers = zsocket_create_and_bind( - server->zctx, ZMQ_DEALER, zinproc_addr + server->zctx, ZMQ_DEALER, ZINPROC_ADDR ); if (!workers) { cstr_t ers = "zsocket_create_and_bind() failed"; diff --git a/src/qvi-scope.cc b/src/qvi-scope.cc index 3db5b74e..311a3191 100644 --- a/src/qvi-scope.cc +++ b/src/qvi-scope.cc @@ -24,7 +24,7 @@ #include "qvi-map.h" /** Maintains a mapping between IDs to device information. */ -using id_devinfo_multimap_t = std::multimap; +using id_devinfo_multimap_t = std::multimap; /** Scope type definition. */ struct qv_scope_s { @@ -642,6 +642,7 @@ global_split_devices_user_defined( // Cache all device infos associated with the parent hardware pool. auto dinfos = qvi_hwpool_devinfos_get(parent_scope->hwpool); // Iterate over the supported device types and split them up round-robin. + // TODO(skg) Should this be a mapping operation in qvi-map? const qv_hw_obj_type_t *devts = qvi_hwloc_supported_devices(); for (int i = 0; devts[i] != QV_HW_OBJ_LAST; ++i) { // The current device type. @@ -649,7 +650,7 @@ global_split_devices_user_defined( // Get the number of devices. const int ndevs = dinfos->count(devt); // Store device infos. - std::vector devs; + std::vector devs; for (const auto &dinfo : *dinfos) { // Not the type we are currently dealing with. if (devt != dinfo.first) continue; @@ -706,7 +707,7 @@ qvi_global_split_devices_affinity_preserving( // The current device type. const qv_hw_obj_type_t devt = devts[i]; // Store device infos. - std::vector devs; + std::vector devs; for (const auto &dinfo : *dinfos) { // Not the type we are currently dealing with. if (devt != dinfo.first) continue; @@ -1165,7 +1166,7 @@ qvi_scope_get_device_id( ) { int rc = QV_SUCCESS, id = 0, nw = 0; - qvi_devinfo_t *finfo = nullptr; + qvi_hwpool_devinfo_s *finfo = nullptr; for (const auto &dinfo : *qvi_hwpool_devinfos_get(scope->hwpool)) { if (dev_obj != dinfo.first) continue; if (id++ == i) {