diff --git a/prov/efa/docs/efa_rdm_protocol_v4.md b/prov/efa/docs/efa_rdm_protocol_v4.md index 9016ec00958..1877156779b 100644 --- a/prov/efa/docs/efa_rdm_protocol_v4.md +++ b/prov/efa/docs/efa_rdm_protocol_v4.md @@ -68,6 +68,12 @@ Chapter 4 "extra features/requests" describes the extra features/requests define * Section 4.6 describe the extra feature: RDMA-Write based message transfer. + * Section 4.7 describe the extra feature: Long read and runting read nack protocol. + + * Section 4.8 describe the extra feature: User receive QP. + + * Section 4.9 describe the extra feature: Unsolicited write recv. + Chapter 5 "What's not covered?" describes the contents that are intentionally left out of this document because they are considered "implementation details". @@ -323,6 +329,7 @@ Table: 2.1 a list of extra features/requests | 5 | RDMA-Write based data transfer | extra feature | libfabric 1.18.0 | Section 4.6 | | 6 | Read nack packets | extra feature | libfabric 1.20.0 | Section 4.7 | | 7 | User recv QP | extra feature & request| libfabric 1.22.0 | Section 4.8 | +| 8 | Unsolicited write recv | extra feature | libfabric 1.22.0 | Section 4.9 | How does protocol v4 maintain backward compatibility when extra features/requests are introduced? @@ -1611,6 +1618,17 @@ zero-copy receive mode. If a receiver gets RTM packets delivered to its default QP, it raises an error because it requests all RTM packets must be delivered to its user recv QP. +### 4.9 Unsolicited write recv + +The "Unsolicited write recv" is an extra feature that was +introduced with the libfabric 1.22.0. When this feature is on, rdma-write +with immediate data will not consume an rx buffer on the responder side. It is +defined as an extra feature because there is a set of requirements (firmware, +EFA kernel module and rdma-core) to be met before an endpoint can use the unsolicited +write recv capability, therefore an endpoint cannot assume the other party supports +unsolicited write recv. The rdma-write with immediate data cannot be issued if there +is a discrepancy on this feature between local and peer. + ## 5. What's not covered? The purpose of this document is to define the communication protocol. Therefore, it is intentionally written diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 2fb407f6525..964ad4a2e02 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -448,4 +448,10 @@ static inline int efa_rdm_attempt_to_sync_memops_ioc(struct efa_rdm_ep *ep, stru return err; } +static inline +bool efa_rdm_ep_support_unsolicited_write_recv(struct efa_rdm_ep *ep) +{ + return ep->extra_info[0] & EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; +} + #endif diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 5adcec05dff..fe7885c9217 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1022,6 +1022,9 @@ void efa_rdm_ep_set_extra_info(struct efa_rdm_ep *ep) ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE; + if (efa_rdm_use_unsolicited_write_recv()) + ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; + if (ep->use_zcpy_rx) { /* * When zcpy rx is enabled, an extra QP is created to diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h index 8c2703fc140..fe2f79ead61 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -109,6 +109,23 @@ bool efa_rdm_peer_support_rdma_write(struct efa_rdm_peer *peer) (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_RDMA_WRITE); } +/** + * @brief check for peer's unsolicited write support, assuming HANDSHAKE has already occurred + * + * @param[in] peer A peer which we have already received a HANDSHAKE from + * @return bool The peer's unsolicited write recv support + */ +static inline +bool efa_rdm_peer_support_unsolicited_write_recv(struct efa_rdm_peer *peer) +{ + /* Unsolicited write recv is an extra feature defined in version 4 (the base version). + * Because it is an extra feature, an EP will assume the peer does not support + * it before a handshake packet was received. + */ + return (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) && + (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV); +} + static inline bool efa_rdm_peer_support_delivery_complete(struct efa_rdm_peer *peer) { diff --git a/prov/efa/src/rdm/efa_rdm_protocol.h b/prov/efa/src/rdm/efa_rdm_protocol.h index 1b94b5338d1..8840ce5f401 100644 --- a/prov/efa/src/rdm/efa_rdm_protocol.h +++ b/prov/efa/src/rdm/efa_rdm_protocol.h @@ -40,7 +40,8 @@ struct efa_ep_addr { #define EFA_RDM_EXTRA_FEATURE_RDMA_WRITE BIT_ULL(5) #define EFA_RDM_EXTRA_FEATURE_READ_NACK BIT_ULL(6) #define EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP BIT_ULL(7) -#define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 8 +#define EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV BIT_ULL(8) +#define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 9 /* * The length of 64-bit extra_info array used in efa_rdm_ep * and efa_rdm_peer diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index ae04af66e1e..902f51d0768 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -366,6 +366,24 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) return efa_rdm_ep_enforce_handshake_for_txe(ep, txe); if (efa_rdm_rma_should_write_using_rdma(ep, txe, txe->peer)) { + /** + * Unsolicited write recv is a feature that makes rdma-write with + * imm not consume an rx buffer on the responder side, and this + * feature requires consistent support status on both sides. + */ + if ((txe->fi_flags & FI_REMOTE_CQ_DATA) && + (efa_rdm_ep_support_unsolicited_write_recv(ep) != efa_rdm_peer_support_unsolicited_write_recv(txe->peer))) { + (void) efa_rdm_construct_msg_with_local_and_peer_information(ep, txe->addr, ep->err_msg, "", EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + EFA_WARN(FI_LOG_EP_DATA, + "Inconsistent support status detected on unsolicited write recv.\n" + "My support status: %d, peer support status: %d. %s.\n" + "This is usually caused by inconsistent efa driver, libfabric, or rdma-core versions.\n" + "Please use consistent software versions on both hosts, or disable the unsolicited write " + "recv feature by setting environment variable FI_EFA_USE_UNSOLICITED_WRITE_RECV=0\n", + efa_rdm_use_unsolicited_write_recv(), efa_rdm_peer_support_unsolicited_write_recv(txe->peer), + ep->err_msg); + return -FI_EOPNOTSUPP; + } efa_rdm_ope_prepare_to_post_write(txe); return efa_rdm_ope_post_remote_write(txe); } diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c index 02880c09dfd..0175b3884a9 100644 --- a/prov/efa/src/rdm/efa_rdm_util.c +++ b/prov/efa/src/rdm/efa_rdm_util.c @@ -97,6 +97,53 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc) } } +/** + * @brief Construct a message that contains the local and peer information, + * including the efa address and the host id. + * + * @param ep EFA RDM endpoint + * @param addr Remote peer fi_addr_t + * @param msg the ptr of the msg to be constructed (needs to be allocated already!) + * @param base_msg ptr to the base msg that will show at the beginning of msg + * @param msg_len the length of the message + * @return int 0 on success, negative integer on failure + */ +int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len) +{ + char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; + char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + char local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + size_t len = 0; + int ret; + struct efa_rdm_peer *peer = efa_rdm_ep_get_peer(ep, addr); + + len = sizeof(ep_addr_str); + efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &len); + len = sizeof(peer_addr_str); + efa_rdm_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); + + if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { + strcpy(local_host_id_str, "N/A"); + } + + if (!peer->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(peer_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", peer->host_id)) { + strcpy(peer_host_id_str, "N/A"); + } + + ret = snprintf(msg, msg_len, "%s My EFA addr: %s My host id: %s Peer EFA addr: %s Peer host id: %s", + base_msg, ep_addr_str, local_host_id_str, peer_addr_str, peer_host_id_str); + + if (ret < 0 || ret > msg_len - 1) { + return -FI_EINVAL; + } + + if (strlen(msg) >= msg_len) { + return -FI_ENOBUFS; + } + + return FI_SUCCESS; +} + /** * @brief Write the error message and return its byte length * @param[in] ep EFA RDM endpoint @@ -108,42 +155,18 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc) */ int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen) { - char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; - char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; - char local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; - const char *base_msg = efa_strerror(prov_errno); - size_t len = 0; - struct efa_rdm_peer *peer = efa_rdm_ep_get_peer(ep, addr); - - *buf = NULL; - *buflen = 0; - - len = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &len); - len = sizeof(peer_addr_str); - efa_rdm_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); - - if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { - strcpy(local_host_id_str, "N/A"); - } - - if (!peer->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(peer_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", peer->host_id)) { - strcpy(peer_host_id_str, "N/A"); - } - - int ret = snprintf(ep->err_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH, "%s My EFA addr: %s My host id: %s Peer EFA addr: %s Peer host id: %s", - base_msg, ep_addr_str, local_host_id_str, peer_addr_str, peer_host_id_str); + const char *base_msg = efa_strerror(prov_errno); + int ret; - if (ret < 0 || ret > EFA_RDM_ERROR_MSG_BUFFER_LENGTH - 1) { - return -FI_EINVAL; - } + *buf = NULL; + *buflen = 0; - if (strlen(ep->err_msg) >= EFA_RDM_ERROR_MSG_BUFFER_LENGTH) { - return -FI_ENOBUFS; - } + ret = efa_rdm_construct_msg_with_local_and_peer_information(ep, addr, ep->err_msg, base_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + if (ret) + return ret; - *buf = ep->err_msg; - *buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; + *buf = ep->err_msg; + *buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; - return 0; + return 0; } diff --git a/prov/efa/src/rdm/efa_rdm_util.h b/prov/efa/src/rdm/efa_rdm_util.h index a2ba0083295..b79bafb4e85 100644 --- a/prov/efa/src/rdm/efa_rdm_util.h +++ b/prov/efa/src/rdm/efa_rdm_util.h @@ -19,6 +19,8 @@ bool efa_rdm_get_use_device_rdma(uint32_t fabric_api_version); void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc); +int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len); + int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen); #ifdef ENABLE_EFA_POISONING diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 7f4834b847b..a62b28b0685 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -659,6 +659,79 @@ void test_efa_rdm_ep_read_queue_before_handshake(struct efa_resource **state) test_efa_rdm_ep_rma_queue_before_handshake(state, ofi_op_read_req); } +/** + * @brief When local support unsolicited write, but the peer doesn't, fi_writedata + * (use rdma-write with imm) should fail as FI_EINVAL + * + * @param state struct efa_resource that is managed by the framework + */ +void test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t peer_addr; + int num_addr; + const int buf_len = 8; + char buf[8] = {0}; + int err; + uint64_t rma_addr, rma_key; + struct efa_rdm_peer *peer; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints->caps |= FI_MSG | FI_TAGGED | FI_RMA; + resource->hints->domain_attr->mr_mode |= FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL; + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 22), + resource->hints, true, true); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /** + * TODO: It's better to mock this function + * so we can test on platform that doesn't + * support rdma-write. + */ + if (!(efa_rdm_ep_support_rdma_write(efa_rdm_ep))) + skip(); + + /* Make local ep support unsolicited write recv */ + efa_rdm_ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; + + /* create a fake peer */ + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + num_addr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL); + assert_int_equal(num_addr, 1); + + /* create a fake rma_key and address. fi_read should return before + * they are needed. */ + rma_key = 0x1234; + rma_addr = (uint64_t) &buf; + + /* + * Fake a peer that has made handshake and + * does not support unsolicited write recv + */ + peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + peer->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_RDMA_WRITE; + peer->extra_info[0] &= ~EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; + /* make sure shm is not used */ + peer->is_local = false; + + err = fi_writedata(resource->ep, buf, buf_len, + NULL, /* desc, not required */ + 0x1234, + peer_addr, + rma_addr, + rma_key, + NULL); /* context */ + assert_int_equal(err, -FI_EOPNOTSUPP); +} + /** * @brief verify that when shm was used to send a small message (<4k), no copy was performed. * @@ -1267,3 +1340,24 @@ void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(struct efa_resource { test_efa_rdm_ep_rx_refill_impl(state, 128, 64); } + +/** + * @brief when unsolicited write recv is supported (by device + env), + * efa_rdm_ep_support_unsolicited_write_recv + * should return true, otherwise it should return false + * + * @param[in] state struct efa_resource that is managed by the framework + * @param[in] is_supported support status + */ +void test_efa_rdm_ep_support_unsolicited_write_recv(struct efa_resource **state) +{ + struct efa_rdm_ep *efa_rdm_ep; + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct(resource, FI_EP_RDM); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + assert_int_equal(efa_rdm_use_unsolicited_write_recv(), + efa_rdm_ep_support_unsolicited_write_recv(efa_rdm_ep)); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index fa81bd93198..14611ceb335 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -112,6 +112,8 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_support_unsolicited_write_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_failed_poll, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index a100f30ab11..f1366d25739 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -126,6 +126,8 @@ void test_efa_rdm_ep_zcpy_recv_cancel(); void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(); void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size(); void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(); +void test_efa_rdm_ep_support_unsolicited_write_recv(); +void test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq(); void test_ibv_cq_ex_read_failed_poll();