diff --git a/src/components/mc/cuda/mc_cuda.c b/src/components/mc/cuda/mc_cuda.c index 7f2a5e094c..a4b0a51330 100644 --- a/src/components/mc/cuda/mc_cuda.c +++ b/src/components/mc/cuda/mc_cuda.c @@ -159,10 +159,17 @@ static ucc_status_t ucc_mc_cuda_mem_pool_alloc(ucc_mc_buffer_header_t **h_ptr, ucc_memory_type_t mt) { ucc_mc_buffer_header_t *h = NULL; - if (size <= MC_CUDA_CONFIG->mpool_elem_size && - mt != UCC_MEMORY_TYPE_CUDA_MANAGED) { - h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool); + + if (size <= MC_CUDA_CONFIG->mpool_elem_size) { + if (mt == UCC_MEMORY_TYPE_CUDA) { + h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool); + } else if (mt == UCC_MEMORY_TYPE_CUDA_MANAGED) { + h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool_managed); + } else { + return UCC_ERR_INVALID_PARAM; + } } + if (!h) { // Slow path return ucc_mc_cuda_mem_alloc(h_ptr, size, mt); @@ -196,6 +203,7 @@ static void ucc_mc_cuda_chunk_init(ucc_mpool_t *mp, //NOLINT if (st != cudaSuccess) { // h->addr will be 0 so ucc_mc_cuda_mem_alloc_pool function will // return UCC_ERR_NO_MEMORY. As such mc_error message is suffice. + h->addr = NULL; cudaGetLastError(); mc_error(&ucc_mc_cuda.super, "failed to allocate %zd bytes, " @@ -225,11 +233,39 @@ static void ucc_mc_cuda_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp i } } +static void ucc_mc_cuda_chunk_init_managed(ucc_mpool_t *mp, //NOLINT + void *obj, void *chunk) //NOLINT +{ + ucc_mc_buffer_header_t *h = (ucc_mc_buffer_header_t *)obj; + cudaError_t st; + + st = cudaMallocManaged(&h->addr, MC_CUDA_CONFIG->mpool_elem_size, + cudaMemAttachGlobal); + if (st != cudaSuccess) { + // h->addr will be 0 so ucc_mc_cuda_mem_alloc_pool function will + // return UCC_ERR_NO_MEMORY. As such mc_error message is suffice. + h->addr = NULL; + cudaGetLastError(); + mc_error(&ucc_mc_cuda.super, + "failed to allocate %zd bytes, " + "cuda error %d(%s)", + MC_CUDA_CONFIG->mpool_elem_size, st, cudaGetErrorString(st)); + } + h->from_pool = 1; + h->mt = UCC_MEMORY_TYPE_CUDA_MANAGED; +} + + static ucc_mpool_ops_t ucc_mc_ops = {.chunk_alloc = ucc_mc_cuda_chunk_alloc, .chunk_release = ucc_mc_cuda_chunk_release, .obj_init = ucc_mc_cuda_chunk_init, .obj_cleanup = ucc_mc_cuda_chunk_cleanup}; +static ucc_mpool_ops_t ucc_mc_managed_ops = {.chunk_alloc = ucc_mc_cuda_chunk_alloc, + .chunk_release = ucc_mc_cuda_chunk_release, + .obj_init = ucc_mc_cuda_chunk_init_managed, + .obj_cleanup = ucc_mc_cuda_chunk_cleanup}; + static ucc_status_t ucc_mc_cuda_mem_free(ucc_mc_buffer_header_t *h_ptr) { cudaError_t st; @@ -260,6 +296,8 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr, size_t size, ucc_memory_type_t mt) { + ucc_status_t status; + // lock assures single mpool initiation when multiple threads concurrently execute // different collective operations thus concurrently entering init function. ucc_spin_lock(&ucc_mc_cuda.init_spinlock); @@ -272,7 +310,7 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr, } if (!ucc_mc_cuda.mpool_init_flag) { - ucc_status_t status = ucc_mpool_init( + status = ucc_mpool_init( &ucc_mc_cuda.mpool, 0, sizeof(ucc_mc_buffer_header_t), 0, UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems, &ucc_mc_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers"); @@ -280,6 +318,17 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr, ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); return status; } + + status = ucc_mpool_init( + &ucc_mc_cuda.mpool_managed, 0, sizeof(ucc_mc_buffer_header_t), 0, + UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems, + &ucc_mc_managed_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers"); + if (status != UCC_OK) { + ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); + return status; + } + + ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc; ucc_mc_cuda.mpool_init_flag = 1; } @@ -422,6 +471,7 @@ static ucc_status_t ucc_mc_cuda_finalize() } if (ucc_mc_cuda.mpool_init_flag) { ucc_mpool_cleanup(&ucc_mc_cuda.mpool, 1); + ucc_mpool_cleanup(&ucc_mc_cuda.mpool_managed, 1); ucc_mc_cuda.mpool_init_flag = 0; ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc_with_init; } diff --git a/src/components/mc/cuda/mc_cuda.h b/src/components/mc/cuda/mc_cuda.h index abc82312c2..63c730fa73 100644 --- a/src/components/mc/cuda/mc_cuda.h +++ b/src/components/mc/cuda/mc_cuda.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -27,6 +27,7 @@ typedef struct ucc_mc_cuda { ucc_mpool_t events; ucc_mpool_t strm_reqs; ucc_mpool_t mpool; + ucc_mpool_t mpool_managed; int mpool_init_flag; ucc_spinlock_t init_spinlock; ucc_thread_mode_t thread_mode; diff --git a/src/components/mc/ucc_mc.c b/src/components/mc/ucc_mc.c index 26f1647249..ad3de3a94b 100644 --- a/src/components/mc/ucc_mc.c +++ b/src/components/mc/ucc_mc.c @@ -145,8 +145,11 @@ UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_alloc, (h_ptr, size, mem_type), ucc_status_t ucc_mc_free(ucc_mc_buffer_header_t *h_ptr) { - UCC_CHECK_MC_AVAILABLE(h_ptr->mt); - return mc_ops[h_ptr->mt]->mem_free(h_ptr); + ucc_memory_type_t mt = (h_ptr->mt == UCC_MEMORY_TYPE_CUDA_MANAGED) ? + UCC_MEMORY_TYPE_CUDA : h_ptr->mt; + + UCC_CHECK_MC_AVAILABLE(mt); + return mc_ops[mt]->mem_free(h_ptr); } UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_memcpy,