From 13b32c772c27b080951119191d69bfabec5d99b5 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 24 Sep 2024 11:12:01 -0700 Subject: [PATCH] prov/efa: differentiate unresponsive receiver errors following rdma-core Add a new vendor error code EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE from rdma-core to indicate the remote is unreachable. Add a new EFA provider error code UNESTABLISHED_RECV_UNRESP to distinguish the unresponsive receiver error that occurs when the peer is reachable by the EFA device but libfabric failed to complete a handshake. Add a unit test for EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE. Signed-off-by: Jessie Yang (cherry picked from commit 5573b3f3f4ebc6d1857c52dd05d03e71352b8fcc) --- prov/efa/src/efa_errno.h | 12 ++++++++---- prov/efa/src/efa_strerror.c | 16 +++++++++++----- prov/efa/src/rdm/efa_rdm_cq.c | 9 ++++++--- prov/efa/test/efa_unit_test_cq.c | 17 +++++++++++++++++ prov/efa/test/efa_unit_tests.c | 1 + prov/efa/test/efa_unit_tests.h | 1 + 6 files changed, 44 insertions(+), 12 deletions(-) diff --git a/prov/efa/src/efa_errno.h b/prov/efa/src/efa_errno.h index 1a147f0fbdf..4a68fe2488e 100644 --- a/prov/efa/src/efa_errno.h +++ b/prov/efa/src/efa_errno.h @@ -69,8 +69,9 @@ _(10, REMOTE_ERROR_RNR, Destination resource not ready (no work queue entries posted on receive queue)) \ _(11, REMOTE_ERROR_BAD_LENGTH, Remote scatter-gather list too short) \ _(12, REMOTE_ERROR_BAD_STATUS, Unexpected status returned by responder) \ - _(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (detected locally)) \ - _(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side (required for RDMA operations)) + _(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (was previously responsive)) \ + _(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side (required for RDMA operations)) \ + _(15, LOCAL_ERROR_UNREACH_REMOTE, Unreachable remote (never received a response)) /** * @brief EFA provider proprietary error codes @@ -105,7 +106,8 @@ _(4122, 
SHM_INTERNAL_ERROR, SHM internal error) \ _(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \ _(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) \ - _(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON) + _(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON) \ + _(4126, UNESTABLISHED_RECV_UNRESP, Unresponsive receiver (reachable by EFA device but handshake failed)) /** @} */ @@ -156,13 +158,15 @@ static inline int to_fi_errno(enum efa_errno err) { case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNSUPPORTED_OP: case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS: return FI_EINVAL; - case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE: return FI_EHOSTUNREACH; case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH: case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH: return FI_EMSGSIZE; case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP: + case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP: return FI_ECONNABORTED; case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN: case EFA_IO_COMP_STATUS_REMOTE_ERROR_UNKNOWN_PEER: diff --git a/prov/efa/src/efa_strerror.c b/prov/efa/src/efa_strerror.c index 35710501d0e..895ebfd83e7 100644 --- a/prov/efa/src/efa_strerror.c +++ b/prov/efa/src/efa_strerror.c @@ -67,10 +67,10 @@ void efa_show_help(enum efa_errno err) { help = "This error is detected remotely; " "typically encountered when the peer process is no longer present"; break; - case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE: help = "This error is detected locally. " - "The connection status is unknown or was never established via " - "handshake. This typically indicates one or more misconfigured " + "The peer is not reachable by the EFA device. 
" + "This typically indicates one or more misconfigured " "EC2 instances; most often due to incorrect inbound/outbound " "security group rules and/or instances placed in different " "subnets. Refer to the public AWS documentation for EFA for " @@ -80,8 +80,14 @@ void efa_show_help(enum efa_errno err) { case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP: help = "This error is detected locally. " "The connection was previously established via handshake, " - "which indicates the error is likely due to the peer process no " - "longer being present."; + "which indicates the error is likely due to a hardware failure " + "on the remote peer, or the peer process no longer being present."; + break; + case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP: + help = "This error is detected locally. " + "The peer is reachable by the EFA device but libfabric failed " + "to complete a handshake, which indicates the error is likely " + "due to the peer process no longer being present."; break; case FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX: help = "This error is detected locally. " diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index d7380623cd9..4b3bcd74d1d 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -399,7 +399,9 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct * * @todo Currently, this only checks for unresponsive receiver * (#EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE) and attempts to promote it to - * #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP. This should be expanded to handle other + * #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP if a handshake was made, or + * #FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP if the handshake failed. 
+ * This should be expanded to handle other * RDMA Core error codes (#EFA_IO_COMP_STATUSES) for the sake of more accurate * error reporting */ @@ -418,8 +420,9 @@ static int efa_rdm_cq_get_prov_errno(struct ibv_cq_ex *ibv_cq_ex) { switch (vendor_err) { case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: { - if (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) - vendor_err = FI_EFA_ERR_ESTABLISHED_RECV_UNRESP; + vendor_err = (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) ? + FI_EFA_ERR_ESTABLISHED_RECV_UNRESP : + FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP; break; } default: diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index f3fcc624a6b..75e32b39773 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -227,6 +227,23 @@ void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); } +/** + * @brief test that RDM CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns + * an unreachable remote error for send. + * + * When a send operation fails, fi_cq_read() should return -FI_EAVAIL, which means an error is available. + * The user should then call fi_cq_readerr() to get an error CQ entry that contains the error code. + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_rdm_cq_read_bad_send_status_unreachable_receiver(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + test_rdm_cq_read_bad_send_status(resource, + 0x1234567812345678, 0x8765432187654321, + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE); +} + /** * @brief test that RDM CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns * invalid qpn error for send. 
diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 14611ceb335..4fe6552503d 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -120,6 +120,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_rdm_cq_create_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unreachable_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_invalid_qpn, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_message_too_long, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_bad_recv_status, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index f1366d25739..312efccd444 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -134,6 +134,7 @@ void test_ibv_cq_ex_read_failed_poll(); void test_rdm_cq_create_error_handling(); void test_rdm_cq_read_bad_send_status_unresponsive_receiver(); void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id(); +void test_rdm_cq_read_bad_send_status_unreachable_receiver(); void test_rdm_cq_read_bad_send_status_invalid_qpn(); void test_rdm_cq_read_bad_send_status_message_too_long(); void test_ibv_cq_ex_read_bad_recv_status();