From 0e50dda646f43f74adbd4854d3c2cd5081f3319b Mon Sep 17 00:00:00 2001
From: Ryan Hankins
Date: Thu, 14 Nov 2024 19:48:00 +0000
Subject: [PATCH] prov/cxi: Support FI_OPT_CUDA_API_PERMITTED

If GDRCopy is required by the application (i.e., it has set
FI_OPT_CUDA_API_PERMITTED to false) and GDRCopy is not available, return
-FI_EOPNOTSUPP (not supported). This eliminates deadlocks caused by the
provider calling cudaMemcpy while interacting with CUDA applications.

Signed-off-by: Ian Ziemba
---
 prov/cxi/include/cxip.h         | 50 ++++++++++++++++++++++------
 prov/cxi/src/cxip_atomic.c      | 15 +++++----
 prov/cxi/src/cxip_coll.c        |  4 +--
 prov/cxi/src/cxip_ep.c          | 58 +++++++++++++++++++++++++++++++++
 prov/cxi/src/cxip_mr.c          | 13 ++++++++
 prov/cxi/src/cxip_msg.c         | 12 +++----
 prov/cxi/src/cxip_msg_hpc.c     |  5 +--
 prov/cxi/src/cxip_msg_rnr.c     |  6 ++--
 prov/cxi/src/cxip_ptelist_buf.c |  4 +--
 prov/cxi/src/cxip_rma.c         |  3 +-
 prov/cxi/src/cxip_txc.c         |  4 +--
 11 files changed, 140 insertions(+), 34 deletions(-)

diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h
index 92c4427a48c..0f1386f3cde 100644
--- a/prov/cxi/include/cxip.h
+++ b/prov/cxi/include/cxip.h
@@ -1953,13 +1953,16 @@ struct cxip_rxc_rnr {
 };
 
 static inline void cxip_copy_to_md(struct cxip_md *md, void *dest,
-				   const void *src, size_t size)
+				   const void *src, size_t size,
+				   bool require_dev_reg_copy)
 {
 	ssize_t ret __attribute__((unused));
 	struct iovec iov;
+	bool dev_reg_copy = require_dev_reg_copy ||
+		(md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold);
 
-	/* Favor CPU store access instead of relying on HMEM copy functions. */
-	if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) {
+	/* Favor dev reg access instead of relying on HMEM copy functions. */
+	if (dev_reg_copy) {
 		ret = ofi_hmem_dev_reg_copy_to_hmem(md->info.iface, md->handle,
 						    dest, src, size);
 		assert(ret == FI_SUCCESS);
@@ -1975,13 +1978,16 @@ static inline void cxip_copy_to_md(struct cxip_md *md, void *dest,
 }
 
 static inline void cxip_copy_from_md(struct cxip_md *md, void *dest,
-				     const void *src, size_t size)
+				     const void *src, size_t size,
+				     bool require_dev_reg_copy)
 {
 	ssize_t ret __attribute__((unused));
 	struct iovec iov;
+	bool dev_reg_copy = require_dev_reg_copy ||
+		(md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold);
 
-	/* Favor CPU store access instead of relying on HMEM copy functions. */
-	if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) {
+	/* Favor dev reg access instead of relying on HMEM copy functions. */
+	if (dev_reg_copy) {
 		ret = ofi_hmem_dev_reg_copy_from_hmem(md->info.iface,
 						      md->handle,
 						      dest, src, size);
@@ -2426,6 +2432,9 @@ struct cxip_ep_obj {
 	struct fi_tx_attr tx_attr;
 	struct fi_rx_attr rx_attr;
 
+	/* Require memcpy's via the dev reg APIs. */
+	bool require_dev_reg_copy[OFI_HMEM_MAX];
+
 	/* Collectives support */
 	struct cxip_ep_coll_obj coll;
 	struct cxip_ep_zbcoll_obj zbcoll;
@@ -2436,6 +2445,25 @@ struct cxip_ep_obj {
 	struct cxip_portals_table *ptable;
 };
 
+int cxip_ep_obj_map(struct cxip_ep_obj *ep, const void *buf, unsigned long len,
+		    uint64_t flags, struct cxip_md **md);
+
+static inline void
+cxip_ep_obj_copy_to_md(struct cxip_ep_obj *ep, struct cxip_md *md, void *dest,
+		       const void *src, size_t size)
+{
+	cxip_copy_to_md(md, dest, src, size,
+			ep->require_dev_reg_copy[md->info.iface]);
+}
+
+static inline void
+cxip_ep_obj_copy_from_md(struct cxip_ep_obj *ep, struct cxip_md *md, void *dest,
+			 const void *src, size_t size)
+{
+	cxip_copy_from_md(md, dest, src, size,
+			  ep->require_dev_reg_copy[md->info.iface]);
+}
+
 static inline void cxip_txc_otx_reqs_inc(struct cxip_txc *txc)
 {
 	assert(ofi_genlock_held(&txc->ep_obj->lock) == 1);
@@ -3617,17 +3645,19 @@ cxip_txc_copy_from_hmem(struct cxip_txc *txc, struct cxip_md *hmem_md,
 	 */
 	if (!cxip_env.fork_safe_requested) {
 		if (!hmem_md) {
-			ret = cxip_map(domain, hmem_src, size, 0, &hmem_md);
+			ret = cxip_ep_obj_map(txc->ep_obj, hmem_src, size, 0,
+					      &hmem_md);
 			if (ret) {
-				TXC_WARN(txc, "cxip_map failed: %d:%s\n", ret,
-					 fi_strerror(-ret));
+				TXC_WARN(txc, "cxip_ep_obj_map failed: %d:%s\n",
+					 ret, fi_strerror(-ret));
 				return ret;
 			}
 
 			unmap_hmem_md = true;
 		}
 
-		cxip_copy_from_md(hmem_md, dest, hmem_src, size);
+		cxip_ep_obj_copy_from_md(txc->ep_obj, hmem_md, dest, hmem_src,
+					 size);
 
 		if (unmap_hmem_md)
 			cxip_unmap(hmem_md);
diff --git a/prov/cxi/src/cxip_atomic.c b/prov/cxi/src/cxip_atomic.c
index 49218a324a3..0b8f0d4867b 100644
--- a/prov/cxi/src/cxip_atomic.c
+++ b/prov/cxi/src/cxip_atomic.c
@@ -612,8 +612,9 @@ static int cxip_amo_emit_idc(struct cxip_txc *txc,
 	if (result_mr) {
 		result_md = result_mr->md;
 	} else {
-		ret = cxip_map(dom, result, atomic_type_len, 0,
-			       &req->amo.result_md);
+		ret = cxip_ep_obj_map(txc->ep_obj, result,
+				      atomic_type_len, 0,
+				      &req->amo.result_md);
 		if (ret) {
 			TXC_WARN_RET(txc, ret,
 				     "Failed to map result buffer\n");
@@ -930,8 +931,9 @@ static int cxip_amo_emit_dma(struct cxip_txc *txc,
 	/* Optionally register result MR. */
 	if (result) {
 		if (!result_mr) {
-			ret = cxip_map(dom, result, atomic_type_len, 0,
-				       &req->amo.result_md);
+			ret = cxip_ep_obj_map(txc->ep_obj, result,
+					      atomic_type_len, 0,
+					      &req->amo.result_md);
 			if (ret) {
 				TXC_WARN(txc,
 					 "Failed to map result buffer: %d:%s\n",
@@ -1017,8 +1019,9 @@ static int cxip_amo_emit_dma(struct cxip_txc *txc,
 		buf_md = buf_mr->md;
 	} else {
 		/* Map user operand buffer for DMA command. */
-		ret = cxip_map(dom, buf, atomic_type_len, 0,
-			       &req->amo.oper1_md);
+		ret = cxip_ep_obj_map(txc->ep_obj, buf,
+				      atomic_type_len, 0,
+				      &req->amo.oper1_md);
 		if (ret) {
 			TXC_WARN(txc,
 				 "Failed to map operand buffer: %d:%s\n",
diff --git a/prov/cxi/src/cxip_coll.c b/prov/cxi/src/cxip_coll.c
index 0b121496c33..2e54a746407 100644
--- a/prov/cxi/src/cxip_coll.c
+++ b/prov/cxi/src/cxip_coll.c
@@ -1246,8 +1246,8 @@ static int _coll_add_buffers(struct cxip_coll_pte *coll_pte, size_t size,
 			ret = -FI_ENOMEM;
 			goto out;
 		}
-		ret = cxip_map(coll_pte->ep_obj->domain, (void *)buf->buffer,
-			       size, 0, &buf->cxi_md);
+		ret = cxip_ep_obj_map(coll_pte->ep_obj, (void *)buf->buffer,
+				      size, 0, &buf->cxi_md);
 		if (ret)
 			goto del_msg;
 		buf->bufsiz = size;
diff --git a/prov/cxi/src/cxip_ep.c b/prov/cxi/src/cxip_ep.c
index 50ff7b9bd96..d91f4c61379 100644
--- a/prov/cxi/src/cxip_ep.c
+++ b/prov/cxi/src/cxip_ep.c
@@ -1114,6 +1114,15 @@ int cxip_ep_getopt_priv(struct cxip_ep *ep, int level, int optname,
 		*optlen = sizeof(size_t);
 		break;
 
+	case FI_OPT_CUDA_API_PERMITTED:
+		if (!optval || !optlen)
+			return -FI_EINVAL;
+		if (*optlen < sizeof(bool))
+			return -FI_ETOOSMALL;
+
+		*(bool *)optval =
+			!ep->ep_obj->require_dev_reg_copy[FI_HMEM_CUDA];
+		break;
 	default:
 		return -FI_ENOPROTOOPT;
 	}
@@ -1136,6 +1145,7 @@ int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname,
 			const void *optval, size_t optlen)
 {
 	size_t min_multi_recv;
+	bool cuda_api_permitted;
 
 	if (level != FI_OPT_ENDPOINT)
 		return -FI_ENOPROTOOPT;
@@ -1154,6 +1164,28 @@ int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname,
 		}
 		ep->ep_obj->rxc->min_multi_recv = min_multi_recv;
 		break;
+	/*
+	 * If GDRCopy is required by the application (i.e., it has set
+	 * FI_OPT_CUDA_API_PERMITTED to false), and is not available, return
+	 * not supported.
+	 */
+	case FI_OPT_CUDA_API_PERMITTED:
+		if (optlen != sizeof(bool))
+			return -FI_EINVAL;
+
+		if (!hmem_ops[FI_HMEM_CUDA].initialized) {
+			CXIP_WARN("FI_OPT_CUDA_API_PERMITTED cannot be set when CUDA library or CUDA device is not available\n");
+			return -FI_EOPNOTSUPP;
+		}
+
+		cuda_api_permitted = *(bool *)optval;
+
+		if (!cuda_api_permitted && !cuda_is_gdrcopy_enabled())
+			return -FI_EOPNOTSUPP;
+
+		ep->ep_obj->require_dev_reg_copy[FI_HMEM_CUDA] =
+			!cuda_api_permitted;
+		break;
 
 	default:
 		return -FI_ENOPROTOOPT;
@@ -1256,6 +1288,12 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints,
 	ep_obj->src_addr.pid = pid;
 	ep_obj->fi_addr = FI_ADDR_NOTAVAIL;
 
+	/* Default to allowing non-dev reg copy APIs unless the caller
+	 * disables it.
+	 */
+	for (i = 0; i < OFI_HMEM_MAX; i++)
+		ep_obj->require_dev_reg_copy[i] = false;
+
 	ofi_atomic_initialize32(&ep_obj->txq_ref, 0);
 	ofi_atomic_initialize32(&ep_obj->tgq_ref, 0);
 
@@ -1328,6 +1366,26 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints,
 	return ret;
 }
 
+int cxip_ep_obj_map(struct cxip_ep_obj *ep, const void *buf, unsigned long len,
+		    uint64_t flags, struct cxip_md **md)
+{
+	struct cxip_domain *dom = ep->domain;
+	int ret;
+
+	ret = cxip_map(dom, buf, len, flags, md);
+	if (ret != FI_SUCCESS)
+		return ret;
+
+	if (ep->require_dev_reg_copy[(*md)->info.iface] &&
+	    !((*md)->handle_valid)) {
+		CXIP_WARN("Required dev registration copy failed\n");
+		cxip_unmap(*md);
+		return -FI_EOPNOTSUPP;
+	}
+
+	return FI_SUCCESS;
+}
+
 /*
  * cxip_endpoint() - Provider fi_endpoint() implementation.
  */
diff --git a/prov/cxi/src/cxip_mr.c b/prov/cxi/src/cxip_mr.c
index 6a53ea87af5..2329164e466 100644
--- a/prov/cxi/src/cxip_mr.c
+++ b/prov/cxi/src/cxip_mr.c
@@ -1283,6 +1283,15 @@ static int cxip_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
 			break;
 		}
 
+		/* Zero length MRs do not have MD. */
+		if (mr->md &&
+		    ep->ep_obj->require_dev_reg_copy[mr->md->info.iface] &&
+		    !mr->md->handle_valid) {
+			CXIP_WARN("Cannot bind to endpoint without required dev reg support\n");
+			ret = -FI_EOPNOTSUPP;
+			break;
+		}
+
 		mr->ep = ep;
 		ofi_atomic_inc32(&ep->ep_obj->ref);
 		break;
@@ -1439,6 +1448,10 @@ static int cxip_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 	_mr->mr_fid.key = _mr->key;
 
 	if (_mr->len) {
+		/* Do not check whether cuda_api_permitted is set at this point,
+		 * because the mr is not bound to an endpoint. Check instead in
+		 * cxip_mr_bind().
+		 */
 		ret = cxip_map(_mr->domain, (void *)_mr->buf, _mr->len, 0,
 			       &_mr->md);
 		if (ret) {
diff --git a/prov/cxi/src/cxip_msg.c b/prov/cxi/src/cxip_msg.c
index 29c9589baa0..abfa4a42490 100644
--- a/prov/cxi/src/cxip_msg.c
+++ b/prov/cxi/src/cxip_msg.c
@@ -61,7 +61,6 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len,
 			int (*recv_cb)(struct cxip_req *req,
 				       const union c_event *event))
 {
-	struct cxip_domain *dom = rxc->domain;
 	struct cxip_req *req;
 	struct cxip_md *recv_md = NULL;
 	int ret;
@@ -80,7 +79,8 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len,
 	if (len) {
 		/* If hybrid descriptor not passed, map for dma */
 		if (!md) {
-			ret = cxip_map(dom, (void *)buf, len, 0, &recv_md);
+			ret = cxip_ep_obj_map(rxc->ep_obj, (void *)buf, len, 0,
+					      &recv_md);
 			if (ret) {
 				RXC_WARN(rxc,
 					 "Map of recv buffer failed: %d, %s\n",
@@ -714,8 +714,8 @@ int cxip_send_buf_init(struct cxip_req *req)
 
 	/* Triggered operation always requires memory registration. */
 	if (req->triggered)
-		return cxip_map(txc->domain, req->send.buf, req->send.len, 0,
-				&req->send.send_md);
+		return cxip_ep_obj_map(txc->ep_obj, req->send.buf,
+				       req->send.len, 0, &req->send.send_md);
 
 	/* FI_INJECT operations always require an internal bounce buffer. This
 	 * is needed to replay FI_INJECT operations which may experience flow
@@ -773,8 +773,8 @@ int cxip_send_buf_init(struct cxip_req *req)
 	}
 
 	/* Everything else requires memory registeration. */
-	return cxip_map(txc->domain, req->send.buf, req->send.len, 0,
-			&req->send.send_md);
+	return cxip_ep_obj_map(txc->ep_obj, req->send.buf, req->send.len, 0,
+			       &req->send.send_md);
 
 err_buf_fini:
 	cxip_send_buf_fini(req);
diff --git a/prov/cxi/src/cxip_msg_hpc.c b/prov/cxi/src/cxip_msg_hpc.c
index 89de6e4f01b..d6e79e95b1e 100644
--- a/prov/cxi/src/cxip_msg_hpc.c
+++ b/prov/cxi/src/cxip_msg_hpc.c
@@ -629,8 +629,9 @@ static int cxip_ux_send(struct cxip_req *match_req, struct cxip_req *oflow_req,
 	/* Copy data out of overflow buffer. */
 	oflow_bytes = MIN(put_event->tgt_long.mlength,
 			  match_req->data_len);
-	cxip_copy_to_md(match_req->recv.recv_md, match_req->recv.recv_buf,
-			oflow_va, oflow_bytes);
+	cxip_ep_obj_copy_to_md(match_req->recv.rxc->ep_obj,
+			       match_req->recv.recv_md,
+			       match_req->recv.recv_buf, oflow_va, oflow_bytes);
 
 	if (oflow_req->type == CXIP_REQ_OFLOW)
 		oflow_req_put_bytes(oflow_req, put_event->tgt_long.mlength);
diff --git a/prov/cxi/src/cxip_msg_rnr.c b/prov/cxi/src/cxip_msg_rnr.c
index ec5064a4fe5..7b4415ea1e8 100644
--- a/prov/cxi/src/cxip_msg_rnr.c
+++ b/prov/cxi/src/cxip_msg_rnr.c
@@ -1174,9 +1174,9 @@ cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf,
 
 	if (send_req->send.len && !idc) {
 		if (!mr) {
-			ret = cxip_map(txc->domain, send_req->send.buf,
-				       send_req->send.len, 0,
-				       &send_req->send.send_md);
+			ret = cxip_ep_obj_map(txc->ep_obj, send_req->send.buf,
+					      send_req->send.len, 0,
+					      &send_req->send.send_md);
 			if (ret) {
 				TXC_WARN(txc,
 					 "Local buffer map failed: %d %s\n",
diff --git a/prov/cxi/src/cxip_ptelist_buf.c b/prov/cxi/src/cxip_ptelist_buf.c
index b8ee08a3733..a313ccf0be4 100644
--- a/prov/cxi/src/cxip_ptelist_buf.c
+++ b/prov/cxi/src/cxip_ptelist_buf.c
@@ -132,8 +132,8 @@ cxip_ptelist_buf_alloc(struct cxip_ptelist_bufpool *pool)
 		}
 	}
 
-	ret = cxip_map(rxc->base.domain, buf->data, pool->attr.buf_size,
-		       OFI_MR_NOCACHE, &buf->md);
+	ret = cxip_ep_obj_map(rxc->base.ep_obj, buf->data, pool->attr.buf_size,
+			      OFI_MR_NOCACHE, &buf->md);
 	if (ret)
 		goto err_unreg_buf;
 
diff --git a/prov/cxi/src/cxip_rma.c b/prov/cxi/src/cxip_rma.c
index 7b691c17e20..98a4165326a 100644
--- a/prov/cxi/src/cxip_rma.c
+++ b/prov/cxi/src/cxip_rma.c
@@ -269,7 +269,8 @@ static int cxip_rma_emit_dma(struct cxip_txc *txc, const void *buf, size_t len,
 	} else {
 		assert(req != NULL);
 
-		ret = cxip_map(dom, buf, len, 0, &req->rma.local_md);
+		ret = cxip_ep_obj_map(txc->ep_obj, buf, len, 0,
+				      &req->rma.local_md);
 		if (ret) {
 			TXC_WARN(txc, "Failed to map buffer: %d:%s\n", ret,
 				 fi_strerror(-ret));
diff --git a/prov/cxi/src/cxip_txc.c b/prov/cxi/src/cxip_txc.c
index 94bc470ba68..fdbd64af604 100644
--- a/prov/cxi/src/cxip_txc.c
+++ b/prov/cxi/src/cxip_txc.c
@@ -63,8 +63,8 @@ int cxip_ibuf_chunk_init(struct ofi_bufpool_region *region)
 	struct cxip_md *md;
 	int ret;
 
-	ret = cxip_map(txc->domain, region->mem_region,
-		       region->pool->region_size, OFI_MR_NOCACHE, &md);
+	ret = cxip_ep_obj_map(txc->ep_obj, region->mem_region,
+			      region->pool->region_size, OFI_MR_NOCACHE, &md);
 	if (ret != FI_SUCCESS) {
 		CXIP_WARN("Failed to map inject buffer chunk\n");
 		return ret;
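
Reviewer note (illustrative, not part of the patch): the sketch below shows
how an application would typically drive this code path through the standard
libfabric fi_setopt()/FI_OPT_CUDA_API_PERMITTED interface. The endpoint
handle "ep", the helper names, and the fallback policy are assumptions made
for the example; only the option name, option level, and the -FI_EOPNOTSUPP
behavior come from the patch and the public libfabric API.

#include <stdbool.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_errno.h>

/*
 * Forbid the provider from calling CUDA APIs (e.g. cudaMemcpy) on this
 * endpoint. With this patch, the cxi provider then requires dev reg
 * (GDRCopy) copies and fails the setopt with -FI_EOPNOTSUPP when GDRCopy
 * or CUDA support is unavailable, instead of deadlocking later.
 */
static int forbid_cuda_api_calls(struct fid_ep *ep)
{
	bool cuda_api_permitted = false;

	return fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_CUDA_API_PERMITTED,
			 &cuda_api_permitted, sizeof(cuda_api_permitted));
}

/* Example usage: treat -FI_EOPNOTSUPP as "GDRCopy not available". */
static int setup_gpu_path(struct fid_ep *ep)
{
	int ret = forbid_cuda_api_calls(ep);

	if (ret == -FI_EOPNOTSUPP) {
		/* GDRCopy is not usable on this endpoint; either permit
		 * CUDA API calls or keep GPU buffers off this endpoint.
		 */
		return ret;
	}

	return ret;
}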