Skip to content

Commit

Permalink
prov/efa: Distinguish unresponsive receiver errors
Browse files Browse the repository at this point in the history
This adds a new EFA provider error code to give users more context
when EFA encounters an unresponsive receiver error. For now, it only
captures the reachability/connection health of an RDM endpoint's peer:
a previously successful handshake likely rules out EC2 cluster
misconfiguration, e.g. bad security group settings.

Signed-off-by: Darryl Abbate <drl@amazon.com>
(cherry picked from commit fc99843)
  • Loading branch information
darrylabbate authored and jswaro committed Feb 1, 2024
1 parent 76ea6d5 commit 4d0a335
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 106 deletions.
5 changes: 3 additions & 2 deletions prov/efa/src/efa_errno.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
_(10, REMOTE_ERROR_RNR, Receiver not ready) \
_(11, REMOTE_ERROR_BAD_LENGTH, Receiver scatter-gather list (SGL) too short) \
_(12, REMOTE_ERROR_BAD_STATUS, Unexpected status received from remote) \
_(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive receiver)
_(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive receiver (connection never established or unknown))

/**
* @brief EFA provider proprietary error codes
Expand Down Expand Up @@ -100,7 +100,8 @@
_(4120, WRITE_RECV_COMP, Failure to write receive completion) \
_(4121, DGRAM_CQ_READ, Error reading from DGRAM CQ) \
_(4122, SHM_INTERNAL_ERROR, SHM internal error) \
_(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation)
_(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \
_(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established))

/** @} */

Expand Down
20 changes: 14 additions & 6 deletions prov/efa/src/efa_strerror.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,20 @@ void efa_show_help(enum efa_errno err) {
"typically encountered when the peer process is no longer present";
break;
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
help = "This error is detected locally; "
"typically caused by a peer hardware failure or "
"incorrect inbound/outbound rules in the security group - "
"EFA requires \"All traffic\" type allowlisting. "
"Please also verify the peer application has not "
"terminated unexpectedly.";
help = "This error is detected locally. "
"The connection status is unknown or was never established via "
"handshake. This typically indicates one or more misconfigured "
"EC2 instances; most often due to incorrect inbound/outbound "
"security group rules and/or instances placed in different "
"subnets. Refer to the public AWS documentation for EFA for "
"up-to-date configuration requirements. This error can also be "
"encountered when a peer process is no longer present.";
break;
case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP:
help = "This error is detected locally. "
"The connection was previously established via handshake, "
"which indicates the error is likely due to the peer process no "
"longer being present.";
break;
default:
return;
Expand Down
36 changes: 4 additions & 32 deletions prov/efa/src/rdm/efa_rdm_ep.h
Original file line number Diff line number Diff line change
@@ -1,35 +1,5 @@
/*
* Copyright (c) 2019-2022 Amazon.com, Inc. or its affiliates.
* All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */

#ifndef _EFA_RDM_EP_H
#define _EFA_RDM_EP_H
Expand Down Expand Up @@ -303,6 +273,8 @@ ssize_t efa_rdm_ep_post_queued_pkts(struct efa_rdm_ep *ep,

size_t efa_rdm_ep_get_memory_alignment(struct efa_rdm_ep *ep, enum fi_hmem_iface iface);

/*
 * Read the vendor error reported on the endpoint's ibv CQ and, where
 * possible, promote it to an EFA provider error code.
 */
int efa_rdm_ep_get_prov_errno(struct efa_rdm_ep *ep);

static inline
struct efa_domain *efa_rdm_ep_domain(struct efa_rdm_ep *ep)
{
Expand Down
38 changes: 4 additions & 34 deletions prov/efa/src/rdm/efa_rdm_ep_progress.c
Original file line number Diff line number Diff line change
@@ -1,35 +1,5 @@
/*
* Copyright (c) Amazon.com, Inc. or its affiliates.
* All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */

#include "efa.h"
#include "efa_av.h"
Expand Down Expand Up @@ -468,7 +438,7 @@ static inline void efa_rdm_ep_poll_ibv_cq(struct efa_rdm_ep *ep, size_t cqe_to_p
efa_rdm_tracepoint(poll_cq, (size_t) ep->ibv_cq_ex->wr_id);
opcode = ibv_wc_read_opcode(ep->ibv_cq_ex);
if (ep->ibv_cq_ex->status) {
prov_errno = ibv_wc_read_vendor_err(ep->ibv_cq_ex);
prov_errno = efa_rdm_ep_get_prov_errno(ep);
switch (opcode) {
case IBV_WC_SEND: /* fall through */
case IBV_WC_RDMA_WRITE: /* fall through */
Expand Down Expand Up @@ -538,7 +508,7 @@ static inline void efa_rdm_ep_poll_ibv_cq(struct efa_rdm_ep *ep, size_t cqe_to_p

if (err && err != ENOENT) {
err = err > 0 ? err : -err;
prov_errno = ibv_wc_read_vendor_err(ep->ibv_cq_ex);
prov_errno = efa_rdm_ep_get_prov_errno(ep);
efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno);
}

Expand Down
74 changes: 42 additions & 32 deletions prov/efa/src/rdm/efa_rdm_ep_utils.c
Original file line number Diff line number Diff line change
@@ -1,35 +1,5 @@
/*
* Copyright (c) Amazon.com, Inc. or its affiliates.
* All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */

#include <inttypes.h>
#include <stdlib.h>
Expand Down Expand Up @@ -697,3 +667,43 @@ size_t efa_rdm_ep_get_memory_alignment(struct efa_rdm_ep *ep, enum fi_hmem_iface

return memory_alignment;
}

/**
 * @brief Get the vendor error code for an endpoint's CQ
 *
 * This function is essentially a wrapper for `ibv_wc_read_vendor_err()`; making
 * a best-effort attempt to promote the error code to a proprietary EFA
 * provider error code.
 *
 * The raw vendor error is returned unchanged whenever promotion is not
 * possible: the completion's `wr_id` carries no packet entry, the packet
 * entry has no address, or no peer can be resolved for that address.
 *
 * @param[in]	ep	EFA RDM endpoint
 * @return	EFA-specific error code
 * @sa #EFA_PROV_ERRNOS
 *
 * @todo Currently, this only checks for unresponsive receiver
 * (#EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE) and attempts to promote it to
 * #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP. This should be expanded to handle other
 * RDMA Core error codes (#EFA_IO_COMP_STATUSES) for the sake of more accurate
 * error reporting
 */
int efa_rdm_ep_get_prov_errno(struct efa_rdm_ep *ep) {
	uint32_t vendor_err = ibv_wc_read_vendor_err(ep->ibv_cq_ex);
	/* The work request ID is interpreted as a pointer to the packet entry */
	struct efa_rdm_pke *pkt_entry = (void *) (uintptr_t) ep->ibv_cq_ex->wr_id;
	struct efa_rdm_peer *peer;

	/*
	 * NOTE(review): this truthiness test also rejects an address value of
	 * 0. If 0 is a valid peer address here, the check should compare
	 * against the "address not available" sentinel instead - confirm.
	 */
	if (!OFI_LIKELY(pkt_entry && pkt_entry->addr))
		return vendor_err;

	/*
	 * The lookup is not guaranteed to yield a peer (presumably it returns
	 * NULL for an unknown/unavailable address - verify against
	 * efa_rdm_ep_get_peer). Without a peer there is no handshake state to
	 * consult, so report the raw vendor error rather than dereferencing
	 * NULL while handling an error completion.
	 */
	peer = efa_rdm_ep_get_peer(ep, pkt_entry->addr);
	if (!peer)
		return vendor_err;

	switch (vendor_err) {
	case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
		/*
		 * A handshake was received from this peer earlier, so the
		 * connection did work at some point; report the more specific
		 * "previously established" error.
		 */
		if (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)
			vendor_err = FI_EFA_ERR_ESTABLISHED_RECV_UNRESP;
		break;
	default:
		break;
	}

	return vendor_err;
}

0 comments on commit 4d0a335

Please sign in to comment.