From 210a8534c0b43d0608d53918e3a56caacb253397 Mon Sep 17 00:00:00 2001 From: Mike Uttormark Date: Mon, 11 Dec 2023 15:48:35 -0600 Subject: [PATCH] prov/util: Integrate kdreg2 into libfabric kdreg2 is a Linux kernel module used to enable the libfabric MR cache for FI_HMEM_SYSTEM. Signed-off-by: Mike Uttormark Signed-off-by: Ian Ziemba --- Makefile.am | 3 +- configure.ac | 51 +++- include/ofi_mr.h | 38 ++- prov/util/src/kdreg2_mem_monitor.c | 367 +++++++++++++++++++++++++++++ prov/util/src/util_mem_monitor.c | 17 +- 5 files changed, 469 insertions(+), 7 deletions(-) create mode 100644 prov/util/src/kdreg2_mem_monitor.c diff --git a/Makefile.am b/Makefile.am index 0eb9c67a44c..04238385b45 100644 --- a/Makefile.am +++ b/Makefile.am @@ -100,7 +100,8 @@ common_srcs = \ prov/coll/src/coll_eq.c \ prov/coll/src/coll_fabric.c \ prov/coll/src/coll_init.c \ - prov/coll/src/coll.h + prov/coll/src/coll.h \ + prov/util/src/kdreg2_mem_monitor.c if MACOS common_srcs += src/osx/osd.c diff --git a/configure.ac b/configure.ac index 676fd365bae..f78ec5d5a3b 100644 --- a/configure.ac +++ b/configure.ac @@ -598,6 +598,53 @@ AC_ARG_ENABLE([restricted_dl], AC_DEFINE_UNQUOTED([HAVE_RESTRICTED_DL], [$restricted_dl], [Define to 1 to only look for dl providers under default location if FI_PROVIDER_PATH is not set]) +dnl Check kdreg2 support +kdreg2_enabled=1 +have_kdreg2=0 +have_kdreg2_include_path=0 + +AC_ARG_ENABLE([kdreg2], + [AC_HELP_STRING([--disable-kdreg2], + [Determine whether kdreg2 memory monitor is disabled.])], + [AS_IF([test "$enable_kdreg2" = "no"], [kdreg2_enabled=0])], + []) + +AS_IF([test $kdreg2_enabled -ne 0 ], + [AC_CHECK_HEADER([linux/kdreg2.h], [have_kdreg2=1], [], []) + + AC_ARG_WITH([kdreg2], + [AS_HELP_STRING([--with-kdreg2=DIR], + [Enable KDREG2 memory monitor. 
+ Optional=.])], + [AS_CASE(["$with_kdreg2"], + ["no"], [kdreg2_enabled=0], + ["yes"], [], + [""], [], + [CPPFLAGS="$CPPFLAGS -I$with_kdreg2" + AC_CHECK_HEADER([kdreg2.h], + [have_kdreg2=1 + have_kdreg2_include_path=1], + [have_kdreg2=0], + [])]) + AS_IF([test $have_kdreg2 -eq 0 ], + [AC_MSG_ERROR([KDREG2 header not found in $with_kdreg2. Cannot enable KDREG2 memory monitor.])]) + ]) + ]) + +AS_IF([test $kdreg2_enabled -eq 0], + [AC_MSG_NOTICE([kdreg2 monitor disabled])], + [AS_IF([test $have_kdreg2 -ne 0], + [AC_MSG_NOTICE([kdreg2 present and enabled])])]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2, [$have_kdreg2], + [Define to 1 if kdreg2.h is available.]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2_INCLUDE_PATH, [$have_kdreg2_include_path], + [Define to 1 if kdreg2.h path is not .]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2_MONITOR, [$have_kdreg2], + [Define to 1 to enable kdreg2 memory monitor]) + dnl Check support to intercept syscalls AC_CHECK_HEADERS_ONCE(elf.h sys/auxv.h) @@ -888,16 +935,18 @@ AC_DEFINE_UNQUOTED(ENABLE_UFFD_MONITOR, [$enable_uffd], default_monitor="" bad_default="0" AC_ARG_WITH([default-monitor], - [AS_HELP_STRING([--with-default-monitor=], + [AS_HELP_STRING([--with-default-monitor=], [Select the default memory monitor.])], [AS_CASE([$with_default_monitor], [memhooks],[default_monitor=memhooks], [uffd],[default_monitor=uffd], + [kdreg2],[default_monitor=kdreg2], [disabled], [default_monitor=disabled], [AC_MSG_ERROR([Unknown monitor specified: $with_default_monitor. 
Choices are memhooks, uffd, or disabled.])]) AS_CASE([$default_monitor], [memhooks], [AS_IF([test "$enable_memhooks" != "1"], [bad_default=1])], [uffd], [AS_IF([test "$enable_uffd" != "1"], [bad_default=1])], + [kdreg2], [AS_IF([test "$kdreg2_enabled" != "1"], [bad_default=1])], []) AS_IF([test "$bad_default" != "0"], [AC_MSG_ERROR(["Default memory monitor is not available: $default_monitor."])]) diff --git a/include/ofi_mr.h b/include/ofi_mr.h index 324aa34313c..c009eaf556b 100644 --- a/include/ofi_mr.h +++ b/include/ofi_mr.h @@ -2,7 +2,7 @@ * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved. * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates. * All rights reserved. - * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -40,6 +40,8 @@ # include #endif /* HAVE_CONFIG_H */ +struct ofi_mr; + #include #include @@ -48,6 +50,15 @@ #include #include #include +#include + +#if HAVE_KDREG2_MONITOR +#if HAVE_KDREG2_INCLUDE_PATH +#include "kdreg2.h" +#else +#include "linux/kdreg2.h" +#endif +#endif int ofi_open_mr_cache(uint32_t version, void *attr, size_t attr_len, uint64_t flags, struct fid **fid, void *context); @@ -128,6 +139,12 @@ struct ofi_mr_cache; union ofi_mr_hmem_info { uint64_t cuda_id; uint64_t ze_id; +#if HAVE_KDREG2_MONITOR + struct { + kdreg2_cookie_t cookie; + struct kdreg2_monitoring_params monitoring_params; + } kdreg2; +#endif }; struct ofi_mr_entry { @@ -229,6 +246,23 @@ struct ofi_memhooks { extern struct ofi_mem_monitor *memhooks_monitor; +/* + * Kdreg2 monitor + */ + +struct kdreg2_status_data; + +struct ofi_kdreg2 { + struct ofi_mem_monitor monitor; + pthread_t thread; + int fd; + int exit_pipe[2]; + const struct kdreg2_status_data *status_data; + ofi_atomic64_t next_cookie; +}; + +extern struct 
ofi_mem_monitor *kdreg2_monitor; + extern struct ofi_mem_monitor *cuda_monitor; extern struct ofi_mem_monitor *cuda_ipc_monitor; extern struct ofi_mem_monitor *rocr_monitor; @@ -368,7 +402,7 @@ struct ofi_mr_cache { struct ofi_rbmap tree; struct dlist_entry lru_list; struct dlist_entry dead_region_list; - pthread_mutex_t lock; + pthread_mutex_t lock; size_t cached_cnt; size_t cached_size; diff --git a/prov/util/src/kdreg2_mem_monitor.c b/prov/util/src/kdreg2_mem_monitor.c new file mode 100644 index 00000000000..ba7c2a21d31 --- /dev/null +++ b/prov/util/src/kdreg2_mem_monitor.c @@ -0,0 +1,367 @@ +/* + * (C) Copyright 2022-2023 Hewlett Packard Enterprise Development LP + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ofi_mr.h" + +#if HAVE_KDREG2_MONITOR + +#include "ofi_hmem.h" + +#define EVICTOR_THREAD_ATTR NULL +#define INFINITE_TIMEOUT -1 + +static int kdreg2_monitor_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + uint64_t cookie = ofi_atomic_inc64(&kdreg2->next_cookie); + struct kdreg2_ioctl_monitor ioctl_monitor = { + .addr = addr, + .length = len, + .cookie = (kdreg2_cookie_t) cookie, + }; + int ret; + + ret = ioctl(kdreg2->fd, KDREG2_IOCTL_MONITOR, &ioctl_monitor); + if (ret) + return -errno; + + hmem_info->kdreg2.cookie = ioctl_monitor.cookie; + hmem_info->kdreg2.monitoring_params = ioctl_monitor.monitoring_params; + + return 0; +} + +static void kdreg2_monitor_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + struct kdreg2_ioctl_unmonitor ioctl_unmonitor = { + .cookie = hmem_info->kdreg2.cookie, + .monitoring_params = hmem_info->kdreg2.monitoring_params, + }; + + ioctl(kdreg2->fd, KDREG2_IOCTL_UNMONITOR, &ioctl_unmonitor); +} + +static bool kdreg2_monitor_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + struct kdreg2_monitoring_params *params = + &entry->hmem_info.kdreg2.monitoring_params; + + return !kdreg2_mapping_changed(kdreg2->status_data, params); +} + +static int kdreg2_read_evictions(struct ofi_kdreg2 *kdreg2) +{ + struct kdreg2_event event; + ssize_t bytes; + int err; + + while (kdreg2_read_counter(&kdreg2->status_data->pending_events) > 0) { + + /* The read should return a multiple of sizeof(event) or + * an error. There should be no partial reads. 
+ */ + + bytes = read(kdreg2->fd, &event, sizeof(event)); + if (bytes < 0) { + err = errno; + + /* EINTR means we caught a signal. */ + if (err == EINTR) + continue; + + /* Nothing left */ + if ((err == EAGAIN) || + (err == EWOULDBLOCK)) + return 0; + + /* All other errors */ + return -err; + } + + switch (event.type) { + case KDREG2_EVENT_MAPPING_CHANGE: + + pthread_rwlock_rdlock(&mm_list_rwlock); + pthread_mutex_lock(&mm_lock); + + ofi_monitor_notify(&kdreg2->monitor, + event.u.mapping_change.addr, + event.u.mapping_change.len); + + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); + + break; + + default: + + return -ENOMSG; + } + } + + return 0; +} + +static void kdreg2_close_pipe(struct ofi_kdreg2 *kdreg2) +{ + close(kdreg2->exit_pipe[0]); + close(kdreg2->exit_pipe[1]); + kdreg2->exit_pipe[0] = -1; + kdreg2->exit_pipe[1] = -1; +} + +static void kdreg2_close_fd(struct ofi_kdreg2 *kdreg2) +{ + close(kdreg2->fd); + kdreg2->fd = -1; + kdreg2->status_data = NULL; +} + +static void *kdreg2_evictor(void *arg) +{ + struct ofi_kdreg2 *kdreg2 = (struct ofi_kdreg2 *) arg; + int ret; + struct pollfd pollfd[2] = { + { + .fd = kdreg2->fd, + .events = POLLIN, + }, + { .fd = kdreg2->exit_pipe[0], + .events = POLLIN, + }, + }; + int n; + + while (1) { + + /* wait until there are events to read */ + n = poll(pollfd, 2, INFINITE_TIMEOUT); + if (n == 0) /* timeout(?) 
*/ + continue; + + if (n < 0) { + switch (errno) { + case EINTR: /* interrupted */ + continue; + default: + ret = -errno; + goto error_ret; + } + } + + /* look for exit message on second fd */ + if (pollfd[1].revents) { + ret = 0; + goto error_ret; + } + + ret = kdreg2_read_evictions(kdreg2); + if (ret) + goto error_ret; + } + +error_ret: + + return (void *) (intptr_t) ret; +} + + +static int kdreg2_monitor_start(struct ofi_mem_monitor *monitor) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + int ret = 0; + struct kdreg2_config_data config_data; + + /* see if already started */ + if (kdreg2->fd >= 0) + return 0; + + ofi_atomic_initialize64(&kdreg2->next_cookie, 1); + + ret = pipe(kdreg2->exit_pipe); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to create pipe for kdreg2: %s\n", + strerror(errno)); + return -errno; + } + + kdreg2->fd = open(KDREG2_DEVICE_NAME, O_RDWR); + if (kdreg2->fd < 0) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to open %s for monitor kdreg2: %s.\n", + KDREG2_DEVICE_NAME, strerror(errno)); + ret = -errno; + goto close_pipe; + } + + /* configure the monitor with the maximum number of entries */ + + config_data.max_regions = cache_params.max_cnt; + if (!config_data.max_regions) { + ret = -FI_ENOSPC; + goto close_fd; + } + + ret = ioctl(kdreg2->fd, KDREG2_IOCTL_CONFIG_DATA, &config_data); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to get module config data for kdreg2 monitor: %d.\n", + errno); + ret = -errno; + goto close_fd; + } + + /* Configuring the monitor allocates the status data. Save the address. 
*/ + + kdreg2->status_data = config_data.status_data; + + ret = pthread_create(&kdreg2->thread, EVICTOR_THREAD_ATTR, + kdreg2_evictor, kdreg2); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to start thread for kdreg2 monitor: %d.\n", + ret); + goto close_fd; + } + + FI_INFO(&core_prov, FI_LOG_MR, "Kdreg2 memory monitor started.\n"); + + return 0; + +close_fd: + + kdreg2_close_fd(kdreg2); + +close_pipe: + + kdreg2_close_pipe(kdreg2); + + FI_WARN(&core_prov, FI_LOG_MR, + "Kdreg2 memory monitor failed to start: %i.\n", ret); + + return ret; +} + +static void kdreg2_monitor_stop(struct ofi_mem_monitor *monitor) +{ + ssize_t num_written; + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + + /* see if it's really running */ + if (kdreg2->fd < 0) + return; + + num_written = write(kdreg2->exit_pipe[1], "X", 1); + if (num_written != 1) { + FI_WARN(&core_prov, FI_LOG_MR, + "Unable to write to kdreg2 exit pipe: %s\n", + strerror(errno)); + /* We could call pthread cancel here. The thread + * has probably already exited. Cancelling would be + * benign. But calling join on an exited thread is + * also legal. 
+ */ + } + + pthread_join(kdreg2->thread, NULL); + + kdreg2_close_fd(kdreg2); + kdreg2_close_pipe(kdreg2); + + FI_INFO(&core_prov, FI_LOG_MR, "Kdreg2 memory monitor stopped.\n"); +} + +#else /* !HAVE_KDREG2_MONITOR */ + +static int kdreg2_monitor_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, + size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + return -FI_ENOSYS; +} + +static void kdreg2_monitor_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ +} + +static bool kdreg2_monitor_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + return false; +} + +static int kdreg2_monitor_start(struct ofi_mem_monitor *monitor) +{ + return -FI_ENOSYS; +} + +static void kdreg2_monitor_stop(struct ofi_mem_monitor *monitor) +{ + /* no-op */ +} + +#endif /* HAVE_KDREG2_MONITOR */ + +static struct ofi_kdreg2 kdreg2_mm = { + .monitor.iface = FI_HMEM_SYSTEM, + .monitor.init = ofi_monitor_init, + .monitor.cleanup = ofi_monitor_cleanup, + .monitor.start = kdreg2_monitor_start, + .monitor.stop = kdreg2_monitor_stop, + .monitor.subscribe = kdreg2_monitor_subscribe, + .monitor.unsubscribe = kdreg2_monitor_unsubscribe, + .monitor.valid = kdreg2_monitor_valid, + .monitor.name = "kdreg2", + .fd = -1, + .exit_pipe = { -1, -1 }, + .status_data = NULL, +}; + +struct ofi_mem_monitor *kdreg2_monitor = &kdreg2_mm.monitor; diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index 527d5ee2867..1848a42af9b 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -195,6 +195,7 @@ static void initialize_monitor_list() xpmem_monitor, ze_monitor, import_monitor, + kdreg2_monitor, }; monitor_list_size = ARRAY_SIZE(monitors); @@ -230,6 +231,13 @@ static void set_default_monitor(const char *monitor) #else FI_WARN(&core_prov, FI_LOG_MR, "memhooks monitor not available\n"); default_monitor = NULL; +#endif + } else if 
(!strcmp(monitor, "kdreg2")) { +#if HAVE_KDREG2_MONITOR + default_monitor = kdreg2_monitor; +#else + FI_WARN(&core_prov, FI_LOG_MR, "kdreg2 monitor not available\n"); + default_monitor = NULL; #endif } else if (!strcmp(monitor, "disabled")) { default_monitor = NULL; @@ -270,9 +278,10 @@ void ofi_monitors_init(void) "Define a default memory registration monitor." " The monitor checks for virtual to physical memory" " address changes. Options are: userfaultfd, memhooks" - " and disabled. Userfaultfd is a Linux kernel feature." - " Memhooks operates by intercepting memory allocation" - " and free calls." + ", kdreg2, and disabled. Userfaultfd is a Linux kernel" + " feature. Memhooks operates by intercepting memory" + " allocation and free calls. kdreg2 is supplied as a" + " loadable Linux kernel module." #if defined(HAVE_MR_CACHE_MONITOR_DEFAULT) " " HAVE_MR_CACHE_MONITOR_DEFAULT #else @@ -314,6 +323,8 @@ void ofi_monitors_init(void) default_monitor = memhooks_monitor; #elif HAVE_UFFD_MONITOR default_monitor = uffd_monitor; +#elif HAVE_KDREG2_MONITOR + default_monitor = kdreg2_monitor; #else default_monitor = NULL; #endif