diff --git a/src/ucs/Makefile.am b/src/ucs/Makefile.am
index 9367835a9cb..a0e901a291d 100644
--- a/src/ucs/Makefile.am
+++ b/src/ucs/Makefile.am
@@ -75,6 +75,7 @@ nobase_dist_libucs_la_HEADERS = \
         type/param.h \
         type/init_once.h \
         type/spinlock.h \
+        type/rwlock.h \
         type/status.h \
         type/thread_mode.h \
         type/cpu_set.h \
diff --git a/src/ucs/memory/rcache.c b/src/ucs/memory/rcache.c
index 94d19f9bf0c..66a1e277d28 100644
--- a/src/ucs/memory/rcache.c
+++ b/src/ucs/memory/rcache.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <ucs/type/rwlock.h>
 #include
 #include
 
@@ -424,7 +425,7 @@ void ucs_mem_region_destroy_internal(ucs_rcache_t *rcache,
     UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_DEREGS, 1);
 
     if (drop_lock) {
-        pthread_rwlock_unlock(&rcache->pgt_lock);
+        ucs_rwlock_write_unlock(&rcache->pgt_lock);
     }
 
     UCS_PROFILE_NAMED_CALL_VOID_ALWAYS("mem_dereg",
@@ -433,7 +434,7 @@ void ucs_mem_region_destroy_internal(ucs_rcache_t *rcache,
                                        region);
 
     if (drop_lock) {
-        pthread_rwlock_wrlock(&rcache->pgt_lock);
+        ucs_rwlock_write_lock(&rcache->pgt_lock);
     }
 }
@@ -493,14 +494,14 @@ static inline void ucs_rcache_region_put_internal(ucs_rcache_t *rcache,
 
     /* Destroy region and de-register memory */
     if (flags & UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK) {
-        pthread_rwlock_wrlock(&rcache->pgt_lock);
+        ucs_rwlock_write_lock(&rcache->pgt_lock);
     }
 
     ucs_mem_region_destroy_internal(rcache, region,
                                     flags & UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK);
     if (flags & UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK) {
-        pthread_rwlock_unlock(&rcache->pgt_lock);
+        ucs_rwlock_write_unlock(&rcache->pgt_lock);
     }
 }
@@ -649,7 +650,7 @@ static void ucs_rcache_unmapped_callback(ucm_event_type_t event_type,
      * no rcache operations are performed to clean it.
      */
     if (!(rcache->params.flags & UCS_RCACHE_FLAG_SYNC_EVENTS) &&
-        !pthread_rwlock_trywrlock(&rcache->pgt_lock)) {
+        !ucs_rwlock_write_trylock(&rcache->pgt_lock)) {
         /* coverity[double_lock] */
         ucs_rcache_invalidate_range(rcache, start, end,
                                     UCS_RCACHE_REGION_PUT_FLAG_ADD_TO_GC);
@@ -657,7 +658,7 @@ static void ucs_rcache_unmapped_callback(ucm_event_type_t event_type,
         /* coverity[double_lock] */
         ucs_rcache_check_inv_queue(rcache, UCS_RCACHE_REGION_PUT_FLAG_ADD_TO_GC);
         /* coverity[double_unlock] */
-        pthread_rwlock_unlock(&rcache->pgt_lock);
+        ucs_rwlock_write_unlock(&rcache->pgt_lock);
         return;
     }
@@ -703,11 +704,11 @@ static void ucs_rcache_purge(ucs_rcache_t *rcache)
 /* Lock must be held in write mode */
 static void ucs_rcache_clean(ucs_rcache_t *rcache)
 {
-    pthread_rwlock_wrlock(&rcache->pgt_lock);
+    ucs_rwlock_write_lock(&rcache->pgt_lock);
     /* coverity[double_lock]*/
     ucs_rcache_check_inv_queue(rcache, 0);
     ucs_rcache_check_gc_list(rcache, 1);
-    pthread_rwlock_unlock(&rcache->pgt_lock);
+    ucs_rwlock_write_unlock(&rcache->pgt_lock);
 }
 
 /* Lock must be held in write mode */
@@ -940,7 +941,7 @@ ucs_status_t ucs_rcache_create_region(ucs_rcache_t *rcache, void *address,
     ucs_trace_func("rcache=%s, address=%p, length=%zu", rcache->name, address,
                    length);
 
-    pthread_rwlock_wrlock(&rcache->pgt_lock);
+    ucs_rwlock_write_lock(&rcache->pgt_lock);
 
 retry:
     /* Align to page size */
@@ -1061,7 +1062,7 @@ ucs_status_t ucs_rcache_create_region(ucs_rcache_t *rcache, void *address,
     *region_p = region;
 out_unlock:
     /* coverity[double_unlock]*/
-    pthread_rwlock_unlock(&rcache->pgt_lock);
+    ucs_rwlock_write_unlock(&rcache->pgt_lock);
     return status;
 }
 
@@ -1082,7 +1083,7 @@ ucs_status_t ucs_rcache_get(ucs_rcache_t *rcache, void *address, size_t length,
     ucs_trace_func("rcache=%s, address=%p, length=%zu", rcache->name, address,
                    length);
 
-    pthread_rwlock_rdlock(&rcache->pgt_lock);
+    ucs_rwlock_read_lock(&rcache->pgt_lock);
 
     UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_GETS, 1);
     if (ucs_queue_is_empty(&rcache->inv_q)) {
         pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, &rcache->pgtable,
@@ -1096,12 +1097,12 @@ ucs_status_t ucs_rcache_get(ucs_rcache_t *rcache, void *address, size_t length,
                 ucs_rcache_region_lru_get(rcache, region);
                 *region_p = region;
                 UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_HITS_FAST, 1);
-                pthread_rwlock_unlock(&rcache->pgt_lock);
+                ucs_rwlock_read_unlock(&rcache->pgt_lock);
                 return UCS_OK;
             }
         }
     }
-    pthread_rwlock_unlock(&rcache->pgt_lock);
+    ucs_rwlock_read_unlock(&rcache->pgt_lock);
 
     /* Fall back to slow version (with rw lock) in following cases:
      * - invalidation list not empty
@@ -1132,7 +1133,7 @@ void ucs_rcache_region_invalidate(ucs_rcache_t *rcache,
     comp = ucs_mpool_get(&rcache->mp);
     ucs_spin_unlock(&rcache->lock);
 
-    pthread_rwlock_wrlock(&rcache->pgt_lock);
+    ucs_rwlock_write_lock(&rcache->pgt_lock);
     if (comp != NULL) {
         comp->func = cb;
         comp->arg  = arg;
@@ -1145,7 +1146,7 @@ void ucs_rcache_region_invalidate(ucs_rcache_t *rcache,
     /* coverity[double_lock] */
     ucs_rcache_region_invalidate_internal(rcache, region, 0);
     /* coverity[double_unlock] */
-    pthread_rwlock_unlock(&rcache->pgt_lock);
+    ucs_rwlock_write_unlock(&rcache->pgt_lock);
     UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_PUTS, 1);
 }
 
@@ -1170,10 +1171,10 @@ static void ucs_rcache_before_fork(void)
              * again on-demand.
              * - Other use cases shouldn't be affected
              */
-            pthread_rwlock_wrlock(&rcache->pgt_lock);
+            ucs_rwlock_write_lock(&rcache->pgt_lock);
             /* coverity[double_lock] */
             ucs_rcache_invalidate_range(rcache, 0, UCS_PGT_ADDR_MAX, 0);
-            pthread_rwlock_unlock(&rcache->pgt_lock);
+            ucs_rwlock_write_unlock(&rcache->pgt_lock);
         }
     }
     pthread_mutex_unlock(&ucs_rcache_global_context.lock);
@@ -1272,7 +1273,6 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params,
 {
     ucs_status_t status;
     size_t mp_obj_size, mp_align;
-    int ret;
     ucs_mpool_params_t mp_params;
 
     if (params->region_struct_size < sizeof(ucs_rcache_region_t)) {
@@ -1294,16 +1294,10 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params,
 
     self->params = *params;
 
-    ret = pthread_rwlock_init(&self->pgt_lock, NULL);
-    if (ret) {
-        ucs_error("pthread_rwlock_init() failed: %m");
-        status = UCS_ERR_INVALID_PARAM;
-        goto err_destroy_stats;
-    }
-
+    ucs_rwlock_init(&self->pgt_lock);
     status = ucs_spinlock_init(&self->lock, 0);
     if (status != UCS_OK) {
-        goto err_destroy_rwlock;
+        goto err_destroy_stats;
     }
 
     status = ucs_pgtable_init(&self->pgtable, ucs_rcache_pgt_dir_alloc,
@@ -1376,8 +1370,6 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params,
     ucs_pgtable_cleanup(&self->pgtable);
 err_destroy_inv_q_lock:
     ucs_spinlock_destroy(&self->lock);
-err_destroy_rwlock:
-    pthread_rwlock_destroy(&self->pgt_lock);
 err_destroy_stats:
     UCS_STATS_NODE_FREE(self->stats);
 err_free_name:
@@ -1408,7 +1400,6 @@ static UCS_CLASS_CLEANUP_FUNC(ucs_rcache_t)
     ucs_mpool_cleanup(&self->mp, 1);
     ucs_pgtable_cleanup(&self->pgtable);
     ucs_spinlock_destroy(&self->lock);
-    pthread_rwlock_destroy(&self->pgt_lock);
     UCS_STATS_NODE_FREE(self->stats);
     ucs_free(self->name);
     ucs_free(self->distribution);
diff --git a/src/ucs/memory/rcache_int.h b/src/ucs/memory/rcache_int.h
index bb9d4f1bd95..ff3009f2541 100644
--- a/src/ucs/memory/rcache_int.h
+++ b/src/ucs/memory/rcache_int.h
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include <ucs/type/rwlock.h>
 
 
 #define ucs_rcache_region_log_lvl(_level, _message, ...) \
@@ -66,7 +67,7 @@ typedef struct ucs_rcache_distribution {
 struct ucs_rcache {
     ucs_rcache_params_t params;   /**< rcache parameters (immutable) */
 
-    pthread_rwlock_t    pgt_lock; /**< Protects the page table and all
+    ucs_rwlock_t        pgt_lock; /**< Protects the page table and all
                                        regions whose refcount is 0 */
 
     ucs_pgtable_t       pgtable;  /**< page table to hold the regions */
diff --git a/src/ucs/memory/rcache_vfs.c b/src/ucs/memory/rcache_vfs.c
index c168eed1530..43dcf35c9f0 100644
--- a/src/ucs/memory/rcache_vfs.c
+++ b/src/ucs/memory/rcache_vfs.c
@@ -54,9 +54,9 @@ static void ucs_rcache_vfs_show_primitive(void *obj, ucs_string_buffer_t *strb,
 {
     ucs_rcache_t *rcache = obj;
 
-    pthread_rwlock_rdlock(&rcache->pgt_lock);
+    ucs_rwlock_read_lock(&rcache->pgt_lock);
     ucs_vfs_show_primitive(obj, strb, arg_ptr, arg_u64);
-    pthread_rwlock_unlock(&rcache->pgt_lock);
+    ucs_rwlock_read_unlock(&rcache->pgt_lock);
 }
 
 static void ucs_rcache_vfs_init_regions_distribution(ucs_rcache_t *rcache)
diff --git a/src/ucs/type/rwlock.h b/src/ucs/type/rwlock.h
new file mode 100644
index 00000000000..2e825394d54
--- /dev/null
+++ b/src/ucs/type/rwlock.h
@@ -0,0 +1,108 @@
+/*
+* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2024. ALL RIGHTS RESERVED.
+*
+* See file LICENSE for terms.
+*/
+
+#ifndef UCS_RWLOCK_H
+#define UCS_RWLOCK_H
+
+#include <errno.h>
+#include <sched.h>
+
+/**
+ * The ucs_rwlock_t type.
+ *
+ * Readers increment the counter by UCS_RWLOCK_READ (4).
+ * Writers set the UCS_RWLOCK_WRITE bit when the lock is held,
+ * and set the UCS_RWLOCK_WAIT bit while waiting.
+ *
+ * 31                  2 1 0
+ * +-------------------+-+-+
+ * | readers           | | |
+ * +-------------------+-+-+
+ *                      ^ ^
+ *                      | |
+ * WRITE: lock held ----/ |
+ * WAIT: writer pending --/
+ */
+
+#define UCS_RWLOCK_WAIT  0x1 /* Writer is waiting */
+#define UCS_RWLOCK_WRITE 0x2 /* Writer has the lock */
+#define UCS_RWLOCK_MASK  (UCS_RWLOCK_WAIT | UCS_RWLOCK_WRITE)
+                             /* Writer is waiting or has the lock */
+#define UCS_RWLOCK_READ  0x4 /* Reader increment */
+
+
+/**
+ * Read-write lock.
+ */
+typedef struct {
+    volatile int l;
+} ucs_rwlock_t;
+
+
+static inline void ucs_rwlock_read_lock(ucs_rwlock_t *lock) {
+    int x;
+
+    while (1) {
+        while (lock->l & UCS_RWLOCK_MASK) {
+            sched_yield();
+        }
+
+        x = __sync_fetch_and_add(&lock->l, UCS_RWLOCK_READ);
+        if (!(x & UCS_RWLOCK_MASK)) {
+            return;
+        }
+
+        __sync_fetch_and_sub(&lock->l, UCS_RWLOCK_READ);
+    }
+}
+
+
+static inline void ucs_rwlock_read_unlock(ucs_rwlock_t *lock) {
+    __sync_fetch_and_sub(&lock->l, UCS_RWLOCK_READ);
+}
+
+
+static inline void ucs_rwlock_write_lock(ucs_rwlock_t *lock) {
+    int x;
+
+    while (1) {
+        x = lock->l;
+        if ((x < UCS_RWLOCK_WRITE) &&
+            (__sync_val_compare_and_swap(&lock->l, x, UCS_RWLOCK_WRITE) == x)) {
+            return;
+        }
+
+        __sync_fetch_and_or(&lock->l, UCS_RWLOCK_WAIT);
+        while (lock->l > UCS_RWLOCK_WAIT) {
+            sched_yield();
+        }
+    }
+}
+
+
+static inline int ucs_rwlock_write_trylock(ucs_rwlock_t *lock) {
+    int x;
+
+    x = lock->l;
+    if ((x < UCS_RWLOCK_WRITE) &&
+        (__sync_val_compare_and_swap(&lock->l, x, UCS_RWLOCK_WRITE) == x)) {
+        return 0;
+    }
+
+    return -EBUSY;
+}
+
+
+static inline void ucs_rwlock_write_unlock(ucs_rwlock_t *lock) {
+    __sync_fetch_and_sub(&lock->l, UCS_RWLOCK_WRITE);
+}
+
+
+static inline void ucs_rwlock_init(ucs_rwlock_t *lock) {
+    lock->l = 0;
+}
+
+#endif
diff --git a/test/gtest/ucs/test_rcache.cc b/test/gtest/ucs/test_rcache.cc
index 2286951a0d7..76128be2325 100644
--- a/test/gtest/ucs/test_rcache.cc
+++ b/test/gtest/ucs/test_rcache.cc
@@ -972,9 +972,9 @@ UCS_TEST_F(test_rcache_stats, unmap_dereg_with_lock) {
      * We can have more unmap events if releasing the region structure triggers
      * releasing memory back to the OS.
      */
-    pthread_rwlock_wrlock(&m_rcache->pgt_lock);
+    ucs_rwlock_write_lock(&m_rcache->pgt_lock);
     munmap(mem, size1);
-    pthread_rwlock_unlock(&m_rcache->pgt_lock);
+    ucs_rwlock_write_unlock(&m_rcache->pgt_lock);
 
     EXPECT_GE(get_counter(UCS_RCACHE_UNMAPS), 1);
     EXPECT_EQ(0, get_counter(UCS_RCACHE_UNMAP_INVALIDATES));
@@ -1026,9 +1026,9 @@ UCS_TEST_F(test_rcache_stats, hits_slow) {
     r1 = get(mem2, size1);
 
     /* generate unmap event under lock, to force using invalidation queue */
-    pthread_rwlock_rdlock(&m_rcache->pgt_lock);
+    ucs_rwlock_read_lock(&m_rcache->pgt_lock);
     munmap(mem1, size1);
-    pthread_rwlock_unlock(&m_rcache->pgt_lock);
+    ucs_rwlock_read_unlock(&m_rcache->pgt_lock);
 
     EXPECT_EQ(1, get_counter(UCS_RCACHE_UNMAPS));
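
Usage note for reviewers: below is a minimal, self-contained sketch of how the new `ucs_rwlock_t` API is meant to be used. It is illustrative only and not part of this patch; the file name `demo.c`, the shared `counter`, and the thread counts are made up. It exercises the same calls the rcache code now uses: `ucs_rwlock_read_lock()` for concurrent lookups, `ucs_rwlock_write_lock()` for exclusive updates, and `ucs_rwlock_write_trylock()`, which returns 0 on success and `-EBUSY` otherwise (note the sign: `pthread_rwlock_trywrlock()` returns a positive `EBUSY`).

```c
/* demo.c -- illustrative usage sketch, not part of this patch.
 * Build from a UCX checkout so <ucs/type/rwlock.h> resolves, e.g.:
 *   gcc -I<ucx>/src demo.c -o demo -lpthread
 */
#include <ucs/type/rwlock.h>
#include <pthread.h>
#include <stdio.h>

static ucs_rwlock_t lock;
static long         counter; /* shared state protected by 'lock' */

static void *reader_fn(void *arg)
{
    long snapshot;

    (void)arg;
    /* Multiple readers may hold the lock at the same time */
    ucs_rwlock_read_lock(&lock);
    snapshot = counter;
    ucs_rwlock_read_unlock(&lock);

    printf("read %ld\n", snapshot);
    return NULL;
}

static void *writer_fn(void *arg)
{
    (void)arg;
    /* A writer is exclusive against readers and other writers */
    ucs_rwlock_write_lock(&lock);
    counter++;
    ucs_rwlock_write_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t threads[4];
    int i;

    ucs_rwlock_init(&lock);

    for (i = 0; i < 4; i++) {
        pthread_create(&threads[i], NULL, (i % 2) ? reader_fn : writer_fn,
                       NULL);
    }
    for (i = 0; i < 4; i++) {
        pthread_join(threads[i], NULL);
    }

    /* Returns 0 on success, -EBUSY if a reader or writer holds the lock */
    if (ucs_rwlock_write_trylock(&lock) == 0) {
        printf("final %ld\n", counter);
        ucs_rwlock_write_unlock(&lock);
    }
    return 0;
}
```

One behavioral difference worth noting: `ucs_rwlock_read_lock()` spins while either writer bit is set, so a pending writer blocks new readers. This writer preference keeps invalidations from starving behind a steady stream of `ucs_rcache_get()` readers, at the cost of busy-waiting (`sched_yield()`) instead of sleeping as `pthread_rwlock_t` would.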