diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c
index 28c7fbc4449..70c36ca56e4 100644
--- a/src/ucp/core/ucp_mm.c
+++ b/src/ucp/core/ucp_mm.c
@@ -1187,6 +1187,44 @@ ucs_status_t ucp_mem_unmap(ucp_context_h context, ucp_mem_h memh)
     return UCS_OK;
 }
 
+static ucs_status_t
+ucp_memh_uct_reg_one(ucp_mem_h memh, ucp_md_index_t md_index)
+{
+    uct_md_mem_reg_params_t reg_params = {
+        .field_mask = UCT_MD_MEM_REG_FIELD_FLAGS,
+        .flags      = UCT_MD_MEM_ACCESS_ALL,
+    };
+    ucp_context_h context      = memh->context;
+    ucs_memory_type_t mem_type = memh->mem_type;
+    void *address              = ucp_memh_address(memh);
+    size_t length              = ucp_memh_length(memh);
+    ucs_status_t status;
+
+    if (((context->cache_md_map[mem_type] & UCS_BIT(md_index)) == 0) ||
+        (context->rcache == NULL)) {
+        status = uct_md_mem_reg_v2(context->tl_mds[md_index].md, address, length,
+                                   &reg_params, &memh->uct[md_index]);
+        if (status != UCS_OK) {
+            return status;
+        }
+
+        memh->parent = memh;
+    } else {
+        status = ucp_memh_get(context, address, length, mem_type,
+                              UCS_BIT(md_index), UCT_MD_MEM_ACCESS_ALL,
+                              "mem_type", &memh->parent);
+
+        if (status != UCS_OK) {
+            return status;
+        }
+
+        memh->uct[md_index] = memh->parent->uct[md_index];
+    }
+
+    memh->md_map = UCS_BIT(md_index);
+    return UCS_OK;
+}
+
 ucs_status_t ucp_mem_type_reg_buffers(ucp_worker_h worker, void *remote_addr,
                                       size_t length, ucs_memory_type_t mem_type,
                                       ucp_md_index_t md_index, ucp_mem_h memh,
@@ -1214,8 +1252,7 @@ ucs_status_t ucp_mem_type_reg_buffers(ucp_worker_h worker, void *remote_addr,
     tl_md  = &context->tl_mds[md_index];
     cmpt   = context->tl_cmpts[tl_md->cmpt_index].cmpt;
 
-    status = ucp_memh_init_uct_reg(context, memh, UCS_BIT(md_index),
-                                   UCT_MD_MEM_ACCESS_ALL, "mem_type");
+    status = ucp_memh_uct_reg_one(memh, md_index);
     if (status != UCS_OK) {
         goto out;
     }
diff --git a/src/ucs/Makefile.am b/src/ucs/Makefile.am
index 9367835a9cb..a0e901a291d 100644
--- a/src/ucs/Makefile.am
+++ b/src/ucs/Makefile.am
@@ -75,6 +75,7 @@ nobase_dist_libucs_la_HEADERS = \
     type/param.h \
     type/init_once.h \
     type/spinlock.h \
+    type/rwlock.h \
     type/status.h \
     type/thread_mode.h \
     type/cpu_set.h \
diff --git a/src/ucs/memory/rcache.c b/src/ucs/memory/rcache.c
index 94d19f9bf0c..66a1e277d28 100644
--- a/src/ucs/memory/rcache.c
+++ b/src/ucs/memory/rcache.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <ucs/type/rwlock.h>
 #include
 #include
 
@@ -424,7 +425,7 @@ void ucs_mem_region_destroy_internal(ucs_rcache_t *rcache,
     UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_DEREGS, 1);
 
     if (drop_lock) {
-        pthread_rwlock_unlock(&rcache->pgt_lock);
+        ucs_rwlock_write_unlock(&rcache->pgt_lock);
     }
 
     UCS_PROFILE_NAMED_CALL_VOID_ALWAYS("mem_dereg",
@@ -433,7 +434,7 @@ void ucs_mem_region_destroy_internal(ucs_rcache_t *rcache,
                                        region);
 
     if (drop_lock) {
-        pthread_rwlock_wrlock(&rcache->pgt_lock);
+        ucs_rwlock_write_lock(&rcache->pgt_lock);
     }
 }
 
@@ -493,14 +494,14 @@ static inline void ucs_rcache_region_put_internal(ucs_rcache_t *rcache,
 
     /* Destroy region and de-register memory */
     if (flags & UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK) {
-        pthread_rwlock_wrlock(&rcache->pgt_lock);
+        ucs_rwlock_write_lock(&rcache->pgt_lock);
     }
 
     ucs_mem_region_destroy_internal(rcache, region,
                                     flags & UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK);
 
     if (flags & UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK) {
-        pthread_rwlock_unlock(&rcache->pgt_lock);
+        ucs_rwlock_write_unlock(&rcache->pgt_lock);
     }
 }
 
@@ -649,7 +650,7 @@ static void ucs_rcache_unmapped_callback(ucm_event_type_t event_type,
     * no rcache operations are performed to clean it.
     */
    if (!(rcache->params.flags & UCS_RCACHE_FLAG_SYNC_EVENTS) &&
-       !pthread_rwlock_trywrlock(&rcache->pgt_lock)) {
+       !ucs_rwlock_write_trylock(&rcache->pgt_lock)) {
        /* coverity[double_lock] */
        ucs_rcache_invalidate_range(rcache, start, end,
                                    UCS_RCACHE_REGION_PUT_FLAG_ADD_TO_GC);
@@ -657,7 +658,7 @@ static void ucs_rcache_unmapped_callback(ucm_event_type_t event_type,
        /* coverity[double_lock] */
        ucs_rcache_check_inv_queue(rcache,
                                   UCS_RCACHE_REGION_PUT_FLAG_ADD_TO_GC);
        /* coverity[double_unlock] */
-       pthread_rwlock_unlock(&rcache->pgt_lock);
+       ucs_rwlock_write_unlock(&rcache->pgt_lock);
        return;
    }
 
@@ -703,11 +704,11 @@ static void ucs_rcache_purge(ucs_rcache_t *rcache)
 /* Lock must be held in write mode */
 static void ucs_rcache_clean(ucs_rcache_t *rcache)
 {
-    pthread_rwlock_wrlock(&rcache->pgt_lock);
+    ucs_rwlock_write_lock(&rcache->pgt_lock);
     /* coverity[double_lock]*/
     ucs_rcache_check_inv_queue(rcache, 0);
     ucs_rcache_check_gc_list(rcache, 1);
-    pthread_rwlock_unlock(&rcache->pgt_lock);
+    ucs_rwlock_write_unlock(&rcache->pgt_lock);
 }
 
 /* Lock must be held in write mode */
@@ -940,7 +941,7 @@ ucs_status_t ucs_rcache_create_region(ucs_rcache_t *rcache, void *address,
     ucs_trace_func("rcache=%s, address=%p, length=%zu", rcache->name, address,
                    length);
 
-    pthread_rwlock_wrlock(&rcache->pgt_lock);
+    ucs_rwlock_write_lock(&rcache->pgt_lock);
 
 retry:
     /* Align to page size */
@@ -1061,7 +1062,7 @@ ucs_status_t ucs_rcache_create_region(ucs_rcache_t *rcache, void *address,
     *region_p = region;
 
 out_unlock:
     /* coverity[double_unlock]*/
-    pthread_rwlock_unlock(&rcache->pgt_lock);
+    ucs_rwlock_write_unlock(&rcache->pgt_lock);
     return status;
 }
 
@@ -1082,7 +1083,7 @@ ucs_status_t ucs_rcache_get(ucs_rcache_t *rcache, void *address, size_t length,
     ucs_trace_func("rcache=%s, address=%p, length=%zu", rcache->name, address,
                    length);
 
-    pthread_rwlock_rdlock(&rcache->pgt_lock);
+    ucs_rwlock_read_lock(&rcache->pgt_lock);
     UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_GETS, 1);
     if (ucs_queue_is_empty(&rcache->inv_q)) {
         pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, &rcache->pgtable,
@@ -1096,12 +1097,12 @@ ucs_status_t ucs_rcache_get(ucs_rcache_t *rcache, void *address, size_t length,
                 ucs_rcache_region_lru_get(rcache, region);
                 *region_p = region;
                 UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_HITS_FAST, 1);
-                pthread_rwlock_unlock(&rcache->pgt_lock);
+                ucs_rwlock_read_unlock(&rcache->pgt_lock);
                 return UCS_OK;
             }
         }
     }
-    pthread_rwlock_unlock(&rcache->pgt_lock);
+    ucs_rwlock_read_unlock(&rcache->pgt_lock);
 
     /* Fall back to slow version (with rw lock) in following cases:
      * - invalidation list not empty
@@ -1132,7 +1133,7 @@ void ucs_rcache_region_invalidate(ucs_rcache_t *rcache,
     comp = ucs_mpool_get(&rcache->mp);
     ucs_spin_unlock(&rcache->lock);
 
-    pthread_rwlock_wrlock(&rcache->pgt_lock);
+    ucs_rwlock_write_lock(&rcache->pgt_lock);
     if (comp != NULL) {
         comp->func = cb;
         comp->arg  = arg;
@@ -1145,7 +1146,7 @@ void ucs_rcache_region_invalidate(ucs_rcache_t *rcache,
     /* coverity[double_lock] */
     ucs_rcache_region_invalidate_internal(rcache, region, 0);
     /* coverity[double_unlock] */
-    pthread_rwlock_unlock(&rcache->pgt_lock);
+    ucs_rwlock_write_unlock(&rcache->pgt_lock);
 
     UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_PUTS, 1);
 }
@@ -1170,10 +1171,10 @@ static void ucs_rcache_before_fork(void)
             * again on-demand.
            * - Other use cases shouldn't be affected
            */
-           pthread_rwlock_wrlock(&rcache->pgt_lock);
+           ucs_rwlock_write_lock(&rcache->pgt_lock);
            /* coverity[double_lock] */
            ucs_rcache_invalidate_range(rcache, 0, UCS_PGT_ADDR_MAX, 0);
-           pthread_rwlock_unlock(&rcache->pgt_lock);
+           ucs_rwlock_write_unlock(&rcache->pgt_lock);
        }
    }
    pthread_mutex_unlock(&ucs_rcache_global_context.lock);
@@ -1272,7 +1273,6 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params,
 {
     ucs_status_t status;
     size_t mp_obj_size, mp_align;
-    int ret;
     ucs_mpool_params_t mp_params;
 
     if (params->region_struct_size < sizeof(ucs_rcache_region_t)) {
@@ -1294,16 +1294,10 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params,
 
     self->params = *params;
 
-    ret = pthread_rwlock_init(&self->pgt_lock, NULL);
-    if (ret) {
-        ucs_error("pthread_rwlock_init() failed: %m");
-        status = UCS_ERR_INVALID_PARAM;
-        goto err_destroy_stats;
-    }
-
+    ucs_rwlock_init(&self->pgt_lock);
     status = ucs_spinlock_init(&self->lock, 0);
     if (status != UCS_OK) {
-        goto err_destroy_rwlock;
+        goto err_destroy_stats;
     }
 
     status = ucs_pgtable_init(&self->pgtable, ucs_rcache_pgt_dir_alloc,
@@ -1376,8 +1370,6 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params,
     ucs_pgtable_cleanup(&self->pgtable);
 err_destroy_inv_q_lock:
     ucs_spinlock_destroy(&self->lock);
-err_destroy_rwlock:
-    pthread_rwlock_destroy(&self->pgt_lock);
 err_destroy_stats:
     UCS_STATS_NODE_FREE(self->stats);
 err_free_name:
@@ -1408,7 +1400,6 @@ static UCS_CLASS_CLEANUP_FUNC(ucs_rcache_t)
     ucs_mpool_cleanup(&self->mp, 1);
     ucs_pgtable_cleanup(&self->pgtable);
     ucs_spinlock_destroy(&self->lock);
-    pthread_rwlock_destroy(&self->pgt_lock);
     UCS_STATS_NODE_FREE(self->stats);
     ucs_free(self->name);
     ucs_free(self->distribution);
diff --git a/src/ucs/memory/rcache.inl b/src/ucs/memory/rcache.inl
index 16d30da2ca4..25aac2e9e0e 100644
--- a/src/ucs/memory/rcache.inl
+++ b/src/ucs/memory/rcache.inl
@@ -8,6 +8,7 @@
 #define UCS_RCACHE_INL_
 
 #include "rcache_int.h"
+#include <ucs/type/rwlock.h>
 
 static UCS_F_ALWAYS_INLINE int
 ucs_rcache_region_test(ucs_rcache_region_t *region, int prot, size_t alignment)
@@ -80,6 +81,17 @@ ucs_rcache_lookup_unsafe(ucs_rcache_t *rcache, void *address, size_t length,
     return region;
 }
 
+static UCS_F_ALWAYS_INLINE ucs_rcache_region_t *
+ucs_rcache_lookup(ucs_rcache_t *rcache, void *address, size_t length,
+                  size_t alignment, int prot)
+{
+    ucs_rcache_region_t *region;
+
+    ucs_rwlock_read_lock(&rcache->pgt_lock);
+    region = ucs_rcache_lookup_unsafe(rcache, address, length, alignment, prot);
+    ucs_rwlock_read_unlock(&rcache->pgt_lock);
+    return region;
+}
 
 static UCS_F_ALWAYS_INLINE void
 ucs_rcache_region_put_unsafe(ucs_rcache_t *rcache, ucs_rcache_region_t *region)
diff --git a/src/ucs/memory/rcache_int.h b/src/ucs/memory/rcache_int.h
index bb9d4f1bd95..ff3009f2541 100644
--- a/src/ucs/memory/rcache_int.h
+++ b/src/ucs/memory/rcache_int.h
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include <ucs/type/rwlock.h>
 
 
 #define ucs_rcache_region_log_lvl(_level, _message, ...) \
@@ -66,7 +67,7 @@ typedef struct ucs_rcache_distribution {
 struct ucs_rcache {
     ucs_rcache_params_t params;       /**< rcache parameters (immutable) */
 
-    pthread_rwlock_t    pgt_lock;     /**< Protects the page table and all
+    ucs_rwlock_t        pgt_lock;     /**< Protects the page table and all
                                            regions whose refcount is 0 */
     ucs_pgtable_t       pgtable;      /**< page table to hold the regions */
diff --git a/src/ucs/memory/rcache_vfs.c b/src/ucs/memory/rcache_vfs.c
index c168eed1530..43dcf35c9f0 100644
--- a/src/ucs/memory/rcache_vfs.c
+++ b/src/ucs/memory/rcache_vfs.c
@@ -54,9 +54,9 @@ static void ucs_rcache_vfs_show_primitive(void *obj, ucs_string_buffer_t *strb,
 {
     ucs_rcache_t *rcache = obj;
 
-    pthread_rwlock_rdlock(&rcache->pgt_lock);
+    ucs_rwlock_read_lock(&rcache->pgt_lock);
     ucs_vfs_show_primitive(obj, strb, arg_ptr, arg_u64);
-    pthread_rwlock_unlock(&rcache->pgt_lock);
+    ucs_rwlock_read_unlock(&rcache->pgt_lock);
 }
 
 static void ucs_rcache_vfs_init_regions_distribution(ucs_rcache_t *rcache)
diff --git a/src/ucs/type/rwlock.h b/src/ucs/type/rwlock.h
new file mode 100644
index 00000000000..45ad8c1a294
--- /dev/null
+++ b/src/ucs/type/rwlock.h
@@ -0,0 +1,108 @@
+/*
+* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2024. ALL RIGHTS RESERVED.
+*
+* See file LICENSE for terms.
+*/
+
+#ifndef UCS_RWLOCK_H
+#define UCS_RWLOCK_H
+
+#include <sched.h>
+#include <errno.h>
+
+/**
+ * The ucs_rwlock_t type.
+ *
+ * Readers increment the counter by UCS_RWLOCK_READ (4)
+ * Writers set the UCS_RWLOCK_WRITE bit when lock is held
+ * and set the UCS_RWLOCK_WAIT bit while waiting.
+ *
+ * 31                 2 1 0
+ * +-------------------+-+-+
+ * |  readers          | | |
+ * +-------------------+-+-+
+ *                      ^ ^
+ *                      | |
+ * WRITE: lock held ----/ |
+ * WAIT: writer pending --/
+ */
+
+#define UCS_RWLOCK_WAIT  0x1 /* Writer is waiting */
+#define UCS_RWLOCK_WRITE 0x2 /* Writer has the lock */
+#define UCS_RWLOCK_MASK  (UCS_RWLOCK_WAIT | UCS_RWLOCK_WRITE)
+                             /* Writer is waiting or has lock */
+#define UCS_RWLOCK_READ  0x4 /* Reader increment */
+
+
+/**
+ * Read-write lock.
+ */
+typedef struct {
+    volatile int l;
+} ucs_rwlock_t;
+
+
+static inline void ucs_rwlock_read_lock(ucs_rwlock_t *lock) {
+    int x;
+
+    while (1) {
+        while (lock->l & UCS_RWLOCK_MASK) {
+            sched_yield();
+        }
+
+        x = __sync_fetch_and_add(&lock->l, UCS_RWLOCK_READ);
+        if (!(x & UCS_RWLOCK_MASK)) {
+            return;
+        }
+
+        __sync_fetch_and_sub(&lock->l, UCS_RWLOCK_READ);
+    }
+}
+
+
+static inline void ucs_rwlock_read_unlock(ucs_rwlock_t *lock) {
+    __sync_fetch_and_sub(&lock->l, UCS_RWLOCK_READ);
+}
+
+
+static inline void ucs_rwlock_write_lock(ucs_rwlock_t *lock) {
+    int x;
+
+    while (1) {
+        x = lock->l;
+        if ((x < UCS_RWLOCK_WRITE) &&
+            (__sync_val_compare_and_swap(&lock->l, x, UCS_RWLOCK_WRITE) == x)) {
+            return;
+        }
+
+        __sync_fetch_and_or(&lock->l, UCS_RWLOCK_WAIT);
+        while (lock->l > UCS_RWLOCK_WAIT) {
+            sched_yield();
+        }
+    }
+}
+
+
+static inline int ucs_rwlock_write_trylock(ucs_rwlock_t *lock) {
+    int x;
+
+    x = lock->l;
+    if ((x < UCS_RWLOCK_WRITE) &&
+        (__sync_val_compare_and_swap(&lock->l, x, UCS_RWLOCK_WRITE) == x)) {
+        return 0;
+    }
+
+    return -EBUSY;
+}
+
+
+static inline void ucs_rwlock_write_unlock(ucs_rwlock_t *lock) {
+    __sync_fetch_and_sub(&lock->l, UCS_RWLOCK_WRITE);
+}
+
+
+static inline void ucs_rwlock_init(ucs_rwlock_t *lock) {
+    lock->l = 0;
+}
+
+#endif
diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.c b/src/uct/cuda/gdr_copy/gdr_copy_md.c
index f613dc4d4df..313bcd592f1 100644
--- a/src/uct/cuda/gdr_copy/gdr_copy_md.c
+++ b/src/uct/cuda/gdr_copy/gdr_copy_md.c
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include <ucs/memory/rcache.inl>
 #include
 #include
 #include
@@ -342,12 +343,19 @@ uct_gdr_copy_mem_rcache_reg(uct_md_h uct_md, void *address, size_t length,
     ucs_status_t status;
     uct_gdr_copy_mem_t *memh;
 
+    rregion = ucs_rcache_lookup(md->rcache, address, length, GPU_PAGE_SIZE,
+                                PROT_READ | PROT_WRITE);
+    if (rregion != NULL) {
+        goto out;
+    }
+
     status = ucs_rcache_get(md->rcache, address, length, GPU_PAGE_SIZE,
                             PROT_READ | PROT_WRITE, &flags, &rregion);
     if (status != UCS_OK) {
         return status;
     }
 
+out:
     ucs_assert(rregion->refcount > 0);
     memh = &ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t)->memh;
     *memh_p = memh;
diff --git a/test/gtest/ucs/test_rcache.cc b/test/gtest/ucs/test_rcache.cc
index 2286951a0d7..76128be2325 100644
--- a/test/gtest/ucs/test_rcache.cc
+++ b/test/gtest/ucs/test_rcache.cc
@@ -972,9 +972,9 @@ UCS_TEST_F(test_rcache_stats, unmap_dereg_with_lock) {
      * We can have more unmap events if releasing the region structure triggers
      * releasing memory back to the OS.
      */
-    pthread_rwlock_wrlock(&m_rcache->pgt_lock);
+    ucs_rwlock_write_lock(&m_rcache->pgt_lock);
     munmap(mem, size1);
-    pthread_rwlock_unlock(&m_rcache->pgt_lock);
+    ucs_rwlock_write_unlock(&m_rcache->pgt_lock);
 
     EXPECT_GE(get_counter(UCS_RCACHE_UNMAPS), 1);
     EXPECT_EQ(0, get_counter(UCS_RCACHE_UNMAP_INVALIDATES));
@@ -1026,9 +1026,9 @@ UCS_TEST_F(test_rcache_stats, hits_slow) {
     r1 = get(mem2, size1);
 
     /* generate unmap event under lock, to force using invalidation queue */
-    pthread_rwlock_rdlock(&m_rcache->pgt_lock);
+    ucs_rwlock_read_lock(&m_rcache->pgt_lock);
     munmap(mem1, size1);
-    pthread_rwlock_unlock(&m_rcache->pgt_lock);
+    ucs_rwlock_read_unlock(&m_rcache->pgt_lock);
 
     EXPECT_EQ(1, get_counter(UCS_RCACHE_UNMAPS));
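
Note for reviewers: the following is a minimal, self-contained sketch of how the new ucs_rwlock_t API from src/ucs/type/rwlock.h is meant to be used, mirroring the shared-read / exclusive-write / opportunistic-trylock pattern that rcache.c adopts in this patch. It is illustrative only and not part of the patch: the lookup_table/update_table/try_update_table helpers and the pthread driver are invented for the example, and it assumes the UCS headers are on the include path.

#include <ucs/type/rwlock.h> /* header added by this patch */
#include <pthread.h>
#include <stdio.h>

/* Shared state guarded by the rwlock, playing the role of the rcache
 * page table guarded by rcache->pgt_lock. */
static ucs_rwlock_t table_lock;
static int          table_value;

/* Read path: multiple readers may hold the lock concurrently. */
static int lookup_table(void)
{
    int value;

    ucs_rwlock_read_lock(&table_lock);
    value = table_value;
    ucs_rwlock_read_unlock(&table_lock);
    return value;
}

/* Write path: exclusive access; readers yield until the writer is done. */
static void update_table(int value)
{
    ucs_rwlock_write_lock(&table_lock);
    table_value = value;
    ucs_rwlock_write_unlock(&table_lock);
}

/* Opportunistic write, as in ucs_rcache_unmapped_callback(): skip the work
 * if the lock cannot be taken immediately (trylock returns 0 on success,
 * -EBUSY otherwise). */
static void try_update_table(int value)
{
    if (ucs_rwlock_write_trylock(&table_lock) == 0) {
        table_value = value;
        ucs_rwlock_write_unlock(&table_lock);
    }
}

static void *reader_thread(void *arg)
{
    int i;

    (void)arg;
    for (i = 0; i < 1000; ++i) {
        (void)lookup_table();
    }
    return NULL;
}

int main(void)
{
    pthread_t reader;

    ucs_rwlock_init(&table_lock);

    pthread_create(&reader, NULL, reader_thread, NULL);
    update_table(42);
    try_update_table(43);
    pthread_join(reader, NULL);

    printf("final value: %d\n", lookup_table());
    return 0;
}

The uncontended read path is one atomic fetch-and-add plus a bit test, and the trylock never blocks, which is presumably what makes the fast-path lookup in ucs_rcache_get() and the unmap callback cheaper than the previous pthread_rwlock_rdlock()/trywrlock() calls.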