Skip to content

Commit

Permalink
prov/cxi: Support FI_OPT_CUDA_API_PERMITTED
Browse files Browse the repository at this point in the history
If GDRCopy is required by the application (ie. it has set
FI_OPT_CUDA_API_PERMITTED), and is not available, return not supported,
eliminating deadlocks due to calls to cudaMemcpy interacting with CUDA
applications.

Signed-off-by: Ian Ziemba <ian.ziemba@hpe.com>
  • Loading branch information
ryanhankins authored and iziemba committed Nov 21, 2024
1 parent 7623bc5 commit 0e50dda
Show file tree
Hide file tree
Showing 11 changed files with 140 additions and 34 deletions.
50 changes: 40 additions & 10 deletions prov/cxi/include/cxip.h
Original file line number Diff line number Diff line change
Expand Up @@ -1953,13 +1953,16 @@ struct cxip_rxc_rnr {
};

static inline void cxip_copy_to_md(struct cxip_md *md, void *dest,
const void *src, size_t size)
const void *src, size_t size,
bool require_dev_reg_copy)
{
ssize_t ret __attribute__((unused));
struct iovec iov;
bool dev_reg_copy = require_dev_reg_copy ||
(md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold);

/* Favor CPU store access instead of relying on HMEM copy functions. */
if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) {
/* Favor dev reg access instead of relying on HMEM copy functions. */
if (dev_reg_copy) {
ret = ofi_hmem_dev_reg_copy_to_hmem(md->info.iface, md->handle,
dest, src, size);
assert(ret == FI_SUCCESS);
Expand All @@ -1975,13 +1978,16 @@ static inline void cxip_copy_to_md(struct cxip_md *md, void *dest,
}

static inline void cxip_copy_from_md(struct cxip_md *md, void *dest,
const void *src, size_t size)
const void *src, size_t size,
bool require_dev_reg_copy)
{
ssize_t ret __attribute__((unused));
struct iovec iov;
bool dev_reg_copy = require_dev_reg_copy ||
(md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold);

/* Favor CPU store access instead of relying on HMEM copy functions. */
if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) {
/* Favor dev reg access instead of relying on HMEM copy functions. */
if (dev_reg_copy) {
ret = ofi_hmem_dev_reg_copy_from_hmem(md->info.iface,
md->handle,
dest, src, size);
Expand Down Expand Up @@ -2426,6 +2432,9 @@ struct cxip_ep_obj {
struct fi_tx_attr tx_attr;
struct fi_rx_attr rx_attr;

/* Require memcpy's via the dev reg APIs. */
bool require_dev_reg_copy[OFI_HMEM_MAX];

/* Collectives support */
struct cxip_ep_coll_obj coll;
struct cxip_ep_zbcoll_obj zbcoll;
Expand All @@ -2436,6 +2445,25 @@ struct cxip_ep_obj {
struct cxip_portals_table *ptable;
};

int cxip_ep_obj_map(struct cxip_ep_obj *ep, const void *buf, unsigned long len,
uint64_t flags, struct cxip_md **md);

static inline void
cxip_ep_obj_copy_to_md(struct cxip_ep_obj *ep, struct cxip_md *md, void *dest,
const void *src, size_t size)
{
cxip_copy_to_md(md, dest, src, size,
ep->require_dev_reg_copy[md->info.iface]);
}

static inline void
cxip_ep_obj_copy_from_md(struct cxip_ep_obj *ep, struct cxip_md *md, void *dest,
const void *src, size_t size)
{
cxip_copy_from_md(md, dest, src, size,
ep->require_dev_reg_copy[md->info.iface]);
}

static inline void cxip_txc_otx_reqs_inc(struct cxip_txc *txc)
{
assert(ofi_genlock_held(&txc->ep_obj->lock) == 1);
Expand Down Expand Up @@ -3617,17 +3645,19 @@ cxip_txc_copy_from_hmem(struct cxip_txc *txc, struct cxip_md *hmem_md,
*/
if (!cxip_env.fork_safe_requested) {
if (!hmem_md) {
ret = cxip_map(domain, hmem_src, size, 0, &hmem_md);
ret = cxip_ep_obj_map(txc->ep_obj, hmem_src, size, 0,
&hmem_md);
if (ret) {
TXC_WARN(txc, "cxip_map failed: %d:%s\n", ret,
fi_strerror(-ret));
TXC_WARN(txc, "cxip_ep_obj_map failed: %d:%s\n",
ret, fi_strerror(-ret));
return ret;
}

unmap_hmem_md = true;
}

cxip_copy_from_md(hmem_md, dest, hmem_src, size);
cxip_ep_obj_copy_from_md(txc->ep_obj, hmem_md, dest, hmem_src,
size);
if (unmap_hmem_md)
cxip_unmap(hmem_md);

Expand Down
15 changes: 9 additions & 6 deletions prov/cxi/src/cxip_atomic.c
Original file line number Diff line number Diff line change
Expand Up @@ -612,8 +612,9 @@ static int cxip_amo_emit_idc(struct cxip_txc *txc,
if (result_mr) {
result_md = result_mr->md;
} else {
ret = cxip_map(dom, result, atomic_type_len, 0,
&req->amo.result_md);
ret = cxip_ep_obj_map(txc->ep_obj, result,
atomic_type_len, 0,
&req->amo.result_md);
if (ret) {
TXC_WARN_RET(txc, ret,
"Failed to map result buffer\n");
Expand Down Expand Up @@ -930,8 +931,9 @@ static int cxip_amo_emit_dma(struct cxip_txc *txc,
/* Optionally register result MR. */
if (result) {
if (!result_mr) {
ret = cxip_map(dom, result, atomic_type_len, 0,
&req->amo.result_md);
ret = cxip_ep_obj_map(txc->ep_obj, result,
atomic_type_len, 0,
&req->amo.result_md);
if (ret) {
TXC_WARN(txc,
"Failed to map result buffer: %d:%s\n",
Expand Down Expand Up @@ -1017,8 +1019,9 @@ static int cxip_amo_emit_dma(struct cxip_txc *txc,
buf_md = buf_mr->md;
} else {
/* Map user operand buffer for DMA command. */
ret = cxip_map(dom, buf, atomic_type_len, 0,
&req->amo.oper1_md);
ret = cxip_ep_obj_map(txc->ep_obj, buf,
atomic_type_len, 0,
&req->amo.oper1_md);
if (ret) {
TXC_WARN(txc,
"Failed to map operand buffer: %d:%s\n",
Expand Down
4 changes: 2 additions & 2 deletions prov/cxi/src/cxip_coll.c
Original file line number Diff line number Diff line change
Expand Up @@ -1246,8 +1246,8 @@ static int _coll_add_buffers(struct cxip_coll_pte *coll_pte, size_t size,
ret = -FI_ENOMEM;
goto out;
}
ret = cxip_map(coll_pte->ep_obj->domain, (void *)buf->buffer,
size, 0, &buf->cxi_md);
ret = cxip_ep_obj_map(coll_pte->ep_obj, (void *)buf->buffer,
size, 0, &buf->cxi_md);
if (ret)
goto del_msg;
buf->bufsiz = size;
Expand Down
58 changes: 58 additions & 0 deletions prov/cxi/src/cxip_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -1114,6 +1114,15 @@ int cxip_ep_getopt_priv(struct cxip_ep *ep, int level, int optname,
*optlen = sizeof(size_t);
break;

case FI_OPT_CUDA_API_PERMITTED:
if (!optval || !optlen)
return -FI_EINVAL;
if (*optlen < sizeof(bool))
return -FI_ETOOSMALL;

*(bool *)optval =
!ep->ep_obj->require_dev_reg_copy[FI_HMEM_CUDA];
break;
default:
return -FI_ENOPROTOOPT;
}
Expand All @@ -1136,6 +1145,7 @@ int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname,
const void *optval, size_t optlen)
{
size_t min_multi_recv;
bool cuda_api_permitted;

if (level != FI_OPT_ENDPOINT)
return -FI_ENOPROTOOPT;
Expand All @@ -1154,6 +1164,28 @@ int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname,
}
ep->ep_obj->rxc->min_multi_recv = min_multi_recv;
break;
/*
* If GDRCopy is required by the application (ie. it has set
* FI_OPT_CUDA_API_PERMITTED), and is not available, return not
* supported.
*/
case FI_OPT_CUDA_API_PERMITTED:
if (optlen != sizeof(bool))
return -FI_EINVAL;

if (!hmem_ops[FI_HMEM_CUDA].initialized) {
CXIP_WARN("FI_OPT_CUDA_API_PERMITTED cannot be set when CUDA library or CUDA device is not available\n");
return -FI_EOPNOTSUPP;
}

cuda_api_permitted = *(bool *)optval;

if (!cuda_api_permitted && !cuda_is_gdrcopy_enabled())
return -FI_EOPNOTSUPP;

ep->ep_obj->require_dev_reg_copy[FI_HMEM_CUDA] =
!cuda_api_permitted;
break;

default:
return -FI_ENOPROTOOPT;
Expand Down Expand Up @@ -1256,6 +1288,12 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints,
ep_obj->src_addr.pid = pid;
ep_obj->fi_addr = FI_ADDR_NOTAVAIL;

/* Default to allowing non-dev reg copy APIs unless the caller
* disables it.
*/
for (i = 0; i < OFI_HMEM_MAX; i++)
ep_obj->require_dev_reg_copy[i] = false;

ofi_atomic_initialize32(&ep_obj->txq_ref, 0);
ofi_atomic_initialize32(&ep_obj->tgq_ref, 0);

Expand Down Expand Up @@ -1328,6 +1366,26 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints,
return ret;
}

int cxip_ep_obj_map(struct cxip_ep_obj *ep, const void *buf, unsigned long len,
uint64_t flags, struct cxip_md **md)
{
struct cxip_domain *dom = ep->domain;
int ret;

ret = cxip_map(dom, buf, len, flags, md);
if (ret != FI_SUCCESS)
return ret;

if (ep->require_dev_reg_copy[(*md)->info.iface] &&
!((*md)->handle_valid)) {
CXIP_WARN("Required dev registration copy failed\n");
cxip_unmap(*md);
return -FI_EOPNOTSUPP;
}

return FI_SUCCESS;
}

/*
* cxip_endpoint() - Provider fi_endpoint() implementation.
*/
Expand Down
13 changes: 13 additions & 0 deletions prov/cxi/src/cxip_mr.c
Original file line number Diff line number Diff line change
Expand Up @@ -1283,6 +1283,15 @@ static int cxip_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
break;
}

/* Zero length MRs do not have MD. */
if (mr->md &&
ep->ep_obj->require_dev_reg_copy[mr->md->info.iface] &&
!mr->md->handle_valid) {
CXIP_WARN("Cannot bind to endpoint without required dev reg support\n");
ret = -FI_EOPNOTSUPP;
break;
}

mr->ep = ep;
ofi_atomic_inc32(&ep->ep_obj->ref);
break;
Expand Down Expand Up @@ -1439,6 +1448,10 @@ static int cxip_regattr(struct fid *fid, const struct fi_mr_attr *attr,
_mr->mr_fid.key = _mr->key;

if (_mr->len) {
/* Do not check whether cuda_api_permitted is set at this point,
* because the mr is not bound to an endpoint. Check instead in
* cxip_mr_bind().
*/
ret = cxip_map(_mr->domain, (void *)_mr->buf, _mr->len, 0,
&_mr->md);
if (ret) {
Expand Down
12 changes: 6 additions & 6 deletions prov/cxi/src/cxip_msg.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len,
int (*recv_cb)(struct cxip_req *req,
const union c_event *event))
{
struct cxip_domain *dom = rxc->domain;
struct cxip_req *req;
struct cxip_md *recv_md = NULL;
int ret;
Expand All @@ -80,7 +79,8 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len,
if (len) {
/* If hybrid descriptor not passed, map for dma */
if (!md) {
ret = cxip_map(dom, (void *)buf, len, 0, &recv_md);
ret = cxip_ep_obj_map(rxc->ep_obj, (void *)buf, len, 0,
&recv_md);
if (ret) {
RXC_WARN(rxc,
"Map of recv buffer failed: %d, %s\n",
Expand Down Expand Up @@ -714,8 +714,8 @@ int cxip_send_buf_init(struct cxip_req *req)

/* Triggered operation always requires memory registration. */
if (req->triggered)
return cxip_map(txc->domain, req->send.buf, req->send.len, 0,
&req->send.send_md);
return cxip_ep_obj_map(txc->ep_obj, req->send.buf,
req->send.len, 0, &req->send.send_md);

/* FI_INJECT operations always require an internal bounce buffer. This
* is needed to replay FI_INJECT operations which may experience flow
Expand Down Expand Up @@ -773,8 +773,8 @@ int cxip_send_buf_init(struct cxip_req *req)
}

/* Everything else requires memory registeration. */
return cxip_map(txc->domain, req->send.buf, req->send.len, 0,
&req->send.send_md);
return cxip_ep_obj_map(txc->ep_obj, req->send.buf, req->send.len, 0,
&req->send.send_md);

err_buf_fini:
cxip_send_buf_fini(req);
Expand Down
5 changes: 3 additions & 2 deletions prov/cxi/src/cxip_msg_hpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -629,8 +629,9 @@ static int cxip_ux_send(struct cxip_req *match_req, struct cxip_req *oflow_req,

/* Copy data out of overflow buffer. */
oflow_bytes = MIN(put_event->tgt_long.mlength, match_req->data_len);
cxip_copy_to_md(match_req->recv.recv_md, match_req->recv.recv_buf,
oflow_va, oflow_bytes);
cxip_ep_obj_copy_to_md(match_req->recv.rxc->ep_obj,
match_req->recv.recv_md,
match_req->recv.recv_buf, oflow_va, oflow_bytes);

if (oflow_req->type == CXIP_REQ_OFLOW)
oflow_req_put_bytes(oflow_req, put_event->tgt_long.mlength);
Expand Down
6 changes: 3 additions & 3 deletions prov/cxi/src/cxip_msg_rnr.c
Original file line number Diff line number Diff line change
Expand Up @@ -1174,9 +1174,9 @@ cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf,

if (send_req->send.len && !idc) {
if (!mr) {
ret = cxip_map(txc->domain, send_req->send.buf,
send_req->send.len, 0,
&send_req->send.send_md);
ret = cxip_ep_obj_map(txc->ep_obj, send_req->send.buf,
send_req->send.len, 0,
&send_req->send.send_md);
if (ret) {
TXC_WARN(txc,
"Local buffer map failed: %d %s\n",
Expand Down
4 changes: 2 additions & 2 deletions prov/cxi/src/cxip_ptelist_buf.c
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,8 @@ cxip_ptelist_buf_alloc(struct cxip_ptelist_bufpool *pool)
}
}

ret = cxip_map(rxc->base.domain, buf->data, pool->attr.buf_size,
OFI_MR_NOCACHE, &buf->md);
ret = cxip_ep_obj_map(rxc->base.ep_obj, buf->data, pool->attr.buf_size,
OFI_MR_NOCACHE, &buf->md);
if (ret)
goto err_unreg_buf;

Expand Down
3 changes: 2 additions & 1 deletion prov/cxi/src/cxip_rma.c
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,8 @@ static int cxip_rma_emit_dma(struct cxip_txc *txc, const void *buf, size_t len,
} else {
assert(req != NULL);

ret = cxip_map(dom, buf, len, 0, &req->rma.local_md);
ret = cxip_ep_obj_map(txc->ep_obj, buf, len, 0,
&req->rma.local_md);
if (ret) {
TXC_WARN(txc, "Failed to map buffer: %d:%s\n",
ret, fi_strerror(-ret));
Expand Down
4 changes: 2 additions & 2 deletions prov/cxi/src/cxip_txc.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ int cxip_ibuf_chunk_init(struct ofi_bufpool_region *region)
struct cxip_md *md;
int ret;

ret = cxip_map(txc->domain, region->mem_region,
region->pool->region_size, OFI_MR_NOCACHE, &md);
ret = cxip_ep_obj_map(txc->ep_obj, region->mem_region,
region->pool->region_size, OFI_MR_NOCACHE, &md);
if (ret != FI_SUCCESS) {
CXIP_WARN("Failed to map inject buffer chunk\n");
return ret;
Expand Down

0 comments on commit 0e50dda

Please sign in to comment.