From 4d0a335d34d5fb5b752e523f00dec91c0f8456e9 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Thu, 21 Dec 2023 15:49:32 -0800 Subject: [PATCH] prov/efa: Distinguish unresponsive receiver errors This adds a new EFA provider error code to provide a little more context for users when EFA encounters an unresponsive receiver error. For now, this simply captures some context wrt the reachability/connection health of an RDM EP's peer - a successful handshake likely rules out EC2 cluster misconfiguration, e.g. bad security group settings. Signed-off-by: Darryl Abbate (cherry picked from commit fc99843c1be0a1148b23f836e483e61cbb5fc449) --- prov/efa/src/efa_errno.h | 5 +- prov/efa/src/efa_strerror.c | 20 ++++--- prov/efa/src/rdm/efa_rdm_ep.h | 36 ++----------- prov/efa/src/rdm/efa_rdm_ep_progress.c | 38 ++----------- prov/efa/src/rdm/efa_rdm_ep_utils.c | 74 +++++++++++++++----------- 5 files changed, 67 insertions(+), 106 deletions(-) diff --git a/prov/efa/src/efa_errno.h b/prov/efa/src/efa_errno.h index b10683f6847..0c3447b7821 100644 --- a/prov/efa/src/efa_errno.h +++ b/prov/efa/src/efa_errno.h @@ -67,7 +67,7 @@ _(10, REMOTE_ERROR_RNR, Receiver not ready) \ _(11, REMOTE_ERROR_BAD_LENGTH, Receiver scatter-gather list (SGL) too short) \ _(12, REMOTE_ERROR_BAD_STATUS, Unexpected status received from remote) \ - _(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive receiver) + _(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive receiver (connection never established or unknown)) /** * @brief EFA provider proprietary error codes @@ -100,7 +100,8 @@ _(4120, WRITE_RECV_COMP, Failure to write receive completion) \ _(4121, DGRAM_CQ_READ, Error reading from DGRAM CQ) \ _(4122, SHM_INTERNAL_ERROR, SHM internal error) \ - _(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) + _(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \ + _(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) /** @} */ diff --git a/prov/efa/src/efa_strerror.c b/prov/efa/src/efa_strerror.c index cf40bb92ef0..2b0c7d682a4 100644 --- a/prov/efa/src/efa_strerror.c +++ b/prov/efa/src/efa_strerror.c @@ -69,12 +69,20 @@ void efa_show_help(enum efa_errno err) { "typically encountered when the peer process is no longer present"; break; case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: - help = "This error is detected locally; " - "typically caused by a peer hardware failure or " - "incorrect inbound/outbound rules in the security group - " - "EFA requires \"All traffic\" type allowlisting. " - "Please also verify the peer application has not " - "terminated unexpectedly."; + help = "This error is detected locally. " + "The connection status is unknown or was never established via " + "handshake. This typically indicates one or more misconfigured " + "EC2 instances; most often due to incorrect inbound/outbound " + "security group rules and/or instances placed in different " + "subnets. Refer to the public AWS documentation for EFA for " + "up-to-date configuration requirements. This error can also be " + "encountered when a peer process is no longer present."; + break; + case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP: + help = "This error is detected locally. " + "The connection was previously established via handshake, " + "which indicates the error is likely due to the peer process no " + "longer being present."; break; default: return; diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 1492090846b..23f68276cd3 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) 2019-2022 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_RDM_EP_H #define _EFA_RDM_EP_H @@ -303,6 +273,8 @@ ssize_t efa_rdm_ep_post_queued_pkts(struct efa_rdm_ep *ep, size_t efa_rdm_ep_get_memory_alignment(struct efa_rdm_ep *ep, enum fi_hmem_iface iface); +int efa_rdm_ep_get_prov_errno(struct efa_rdm_ep *ep); + static inline struct efa_domain *efa_rdm_ep_domain(struct efa_rdm_ep *ep) { diff --git a/prov/efa/src/rdm/efa_rdm_ep_progress.c b/prov/efa/src/rdm/efa_rdm_ep_progress.c index 40208e81911..fef63ea87e1 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_progress.c +++ b/prov/efa/src/rdm/efa_rdm_ep_progress.c @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa.h" #include "efa_av.h" @@ -468,7 +438,7 @@ static inline void efa_rdm_ep_poll_ibv_cq(struct efa_rdm_ep *ep, size_t cqe_to_p efa_rdm_tracepoint(poll_cq, (size_t) ep->ibv_cq_ex->wr_id); opcode = ibv_wc_read_opcode(ep->ibv_cq_ex); if (ep->ibv_cq_ex->status) { - prov_errno = ibv_wc_read_vendor_err(ep->ibv_cq_ex); + prov_errno = efa_rdm_ep_get_prov_errno(ep); switch (opcode) { case IBV_WC_SEND: /* fall through */ case IBV_WC_RDMA_WRITE: /* fall through */ @@ -538,7 +508,7 @@ static inline void efa_rdm_ep_poll_ibv_cq(struct efa_rdm_ep *ep, size_t cqe_to_p if (err && err != ENOENT) { err = err > 0 ? err : -err; - prov_errno = ibv_wc_read_vendor_err(ep->ibv_cq_ex); + prov_errno = efa_rdm_ep_get_prov_errno(ep); efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); } diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 44de0a970eb..fa50a76d550 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include #include @@ -697,3 +667,43 @@ size_t efa_rdm_ep_get_memory_alignment(struct efa_rdm_ep *ep, enum fi_hmem_iface return memory_alignment; } + +/** + * @brief Get the vendor error code for an endpoint's CQ + * + * This function is essentially a wrapper for `ibv_wc_read_vendor_err()`; making + * a best-effort attempt to promote the error code to a proprietary EFA + * provider error code. + * + * @param[in] ep EFA RDM endpoint + * @return EFA-specific error code + * @sa #EFA_PROV_ERRNOS + * + * @todo Currently, this only checks for unresponsive receiver + * (#EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE) and attempts to promote it to + * #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP. This should be expanded to handle other + * RDMA Core error codes (#EFA_IO_COMP_STATUSES) for the sake of more accurate + * error reporting + */ +int efa_rdm_ep_get_prov_errno(struct efa_rdm_ep *ep) { + uint32_t vendor_err = ibv_wc_read_vendor_err(ep->ibv_cq_ex); + struct efa_rdm_pke *pkt_entry = (void *) (uintptr_t) ep->ibv_cq_ex->wr_id; + struct efa_rdm_peer *peer; + + if (OFI_LIKELY(pkt_entry && pkt_entry->addr)) + peer = efa_rdm_ep_get_peer(ep, pkt_entry->addr); + else + return vendor_err; + + switch (vendor_err) { + case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: { + if (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) + vendor_err = FI_EFA_ERR_ESTABLISHED_RECV_UNRESP; + break; + } + default: + break; + } + + return vendor_err; +}