From c3f17a8494739092256cb14cd8eb4644b94f02ce Mon Sep 17 00:00:00 2001 From: Ivan Betsis Date: Wed, 29 Jan 2020 15:49:59 +0200 Subject: [PATCH] nvmf/rdma: Add WR batch rdma parameter With x86 and QD < 64 there is a benefit from disabling batching when we have a randread IO pattern, initiators on many cores and BS = 4K. With QD = 4 we also see a benefit from disabling batching for small BS (BS < 2K). Batching is configurable with the optional parameter WRBatching in the configuration file (default True). Signed-off-by: Ivan Betsis Signed-off-by: Evgeniy Kochetov Signed-off-by: Sasha Kotchubievsky --- doc/jsonrpc.md | 1 + etc/spdk/nvmf.conf.in | 3 ++ include/spdk/nvmf.h | 1 + lib/nvmf/nvmf_rpc.c | 5 ++++ lib/nvmf/rdma.c | 45 ++++++++++++++++++++++------- module/event/subsystems/nvmf/conf.c | 2 ++ scripts/rpc.py | 4 ++- scripts/rpc/nvmf.py | 7 +++-- 8 files changed, 54 insertions(+), 14 deletions(-) diff --git a/doc/jsonrpc.md b/doc/jsonrpc.md index 7aff53c501a..07c1f9c1049 100644 --- a/doc/jsonrpc.md +++ b/doc/jsonrpc.md @@ -3844,6 +3844,7 @@ no_srq | Optional | boolean | Disable shared receive queue c2h_success | Optional | boolean | Disable C2H success optimization (TCP only) dif_insert_or_strip | Optional | boolean | Enable DIF insert for write I/O and DIF strip for read I/O DIF (TCP only) sock_priority | Optional | number | The socket priority of the connection owned by this transport (TCP only) +wr_batching | Optional | boolean | Enable work requests batching; set to false to disable (RDMA only, default: true) ### Example: diff --git a/etc/spdk/nvmf.conf.in b/etc/spdk/nvmf.conf.in index 5799f65cfbd..a44197bb1e6 100644 --- a/etc/spdk/nvmf.conf.in +++ b/etc/spdk/nvmf.conf.in @@ -111,6 +111,9 @@ # Set the maximum number outstanding I/O per shared receive queue. Relevant only for RDMA transport #MaxSRQDepth 4096 + # Enable (True) or disable (False) batching of work requests. Relevant only for RDMA transport + #WRBatching True + [Transport] # Set TCP transport type.
Type TCP diff --git a/include/spdk/nvmf.h b/include/spdk/nvmf.h index 4b9133da452..31978df0531 100644 --- a/include/spdk/nvmf.h +++ b/include/spdk/nvmf.h @@ -83,6 +83,7 @@ struct spdk_nvmf_transport_opts { bool no_srq; bool c2h_success; bool dif_insert_or_strip; + bool wr_batching; uint32_t sock_priority; }; diff --git a/lib/nvmf/nvmf_rpc.c b/lib/nvmf/nvmf_rpc.c index 9855c942a0a..617b7cc2859 100644 --- a/lib/nvmf/nvmf_rpc.c +++ b/lib/nvmf/nvmf_rpc.c @@ -1602,6 +1602,10 @@ static const struct spdk_json_object_decoder nvmf_rpc_create_transport_decoder[] "tgt_name", offsetof(struct nvmf_rpc_create_transport_ctx, tgt_name), spdk_json_decode_string, true }, + { + "wr_batching", offsetof(struct nvmf_rpc_create_transport_ctx, opts.wr_batching), + spdk_json_decode_bool, true + }, }; static void @@ -1745,6 +1749,7 @@ dump_nvmf_transport(struct spdk_json_write_ctx *w, struct spdk_nvmf_transport *t if (type == SPDK_NVME_TRANSPORT_RDMA) { spdk_json_write_named_uint32(w, "max_srq_depth", opts->max_srq_depth); spdk_json_write_named_bool(w, "no_srq", opts->no_srq); + spdk_json_write_named_bool(w, "wr_batching", opts->wr_batching); } else if (type == SPDK_NVME_TRANSPORT_TCP) { spdk_json_write_named_bool(w, "c2h_success", opts->c2h_success); spdk_json_write_named_uint32(w, "sock_priority", opts->sock_priority); diff --git a/lib/nvmf/rdma.c b/lib/nvmf/rdma.c index ad3b4a631c2..feedaf344f7 100644 --- a/lib/nvmf/rdma.c +++ b/lib/nvmf/rdma.c @@ -522,6 +522,14 @@ struct spdk_nvmf_rdma_transport { static inline void spdk_nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair); +static void +_poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_poller *rpoller); + +static void +_poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_poller *rpoller); + static inline int spdk_nvmf_rdma_check_ibv_state(enum ibv_qp_state state) { @@ -1102,7 +1110,8 @@ spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair) 
/* Append the given recv wr structure to the resource structs outstanding recvs list. */ /* This function accepts either a single wr or the first wr in a linked list. */ static void -nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first) +nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first, + struct spdk_nvmf_rdma_transport *rtransport) { struct ibv_recv_wr *last; @@ -1121,12 +1130,17 @@ nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_r rqpair->resources->recvs_to_post.last->next = first; rqpair->resources->recvs_to_post.last = last; } + + if (!rtransport->transport.opts.wr_batching) { + _poller_submit_recvs(rtransport, rqpair->poller); + } } /* Append the given send wr structure to the qpair's outstanding sends list. */ /* This function accepts either a single wr or the first wr in a linked list. */ static void -nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *first) +nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *first, + struct spdk_nvmf_rdma_transport *rtransport) { struct ibv_send_wr *last; @@ -1143,10 +1157,14 @@ nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_s rqpair->sends_to_post.last->next = first; rqpair->sends_to_post.last = last; } + + if (!rtransport->transport.opts.wr_batching) { + _poller_submit_sends(rtransport, rqpair->poller); + } } static int -request_transfer_in(struct spdk_nvmf_request *req) +request_transfer_in(struct spdk_nvmf_request *req, struct spdk_nvmf_rdma_transport *rtransport) { struct spdk_nvmf_rdma_request *rdma_req; struct spdk_nvmf_qpair *qpair; @@ -1159,14 +1177,15 @@ request_transfer_in(struct spdk_nvmf_request *req) assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER); assert(rdma_req != NULL); - nvmf_rdma_qpair_queue_send_wrs(rqpair, &rdma_req->data.wr); + nvmf_rdma_qpair_queue_send_wrs(rqpair, 
&rdma_req->data.wr, rtransport); rqpair->current_read_depth += rdma_req->num_outstanding_data_wr; rqpair->current_send_depth += rdma_req->num_outstanding_data_wr; return 0; } static int -request_transfer_out(struct spdk_nvmf_request *req, int *data_posted) +request_transfer_out(struct spdk_nvmf_request *req, int *data_posted, + struct spdk_nvmf_rdma_transport *rtransport) { int num_outstanding_data_wr = 0; struct spdk_nvmf_rdma_request *rdma_req; @@ -1192,7 +1211,7 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted) /* queue the capsule for the recv buffer */ assert(rdma_req->recv != NULL); - nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr); + nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr, rtransport); rdma_req->recv = NULL; assert(rqpair->current_recv_depth > 0); @@ -1210,7 +1229,7 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted) *data_posted = 1; num_outstanding_data_wr = rdma_req->num_outstanding_data_wr; } - nvmf_rdma_qpair_queue_send_wrs(rqpair, first); + nvmf_rdma_qpair_queue_send_wrs(rqpair, first, rtransport); /* +1 for the rsp wr */ rqpair->current_send_depth += num_outstanding_data_wr + 1; @@ -2132,7 +2151,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, /* We have already verified that this request is the head of the queue. 
*/ STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link); - rc = request_transfer_in(&rdma_req->req); + rc = request_transfer_in(&rdma_req->req, rtransport); if (!rc) { rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER; } else { @@ -2240,7 +2259,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, case RDMA_REQUEST_STATE_READY_TO_COMPLETE: spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); - rc = request_transfer_out(&rdma_req->req, &data_posted); + rc = request_transfer_out(&rdma_req->req, &data_posted, rtransport); assert(rc == 0); /* No good way to handle this currently */ if (rc) { rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; @@ -2295,6 +2314,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, #define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32 #define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false #define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false +#define SPDK_NVMF_RDMA_WR_BATCHING true static void spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) @@ -2310,6 +2330,7 @@ spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH; opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ; opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP; + opts->wr_batching = SPDK_NVMF_RDMA_WR_BATCHING; } const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = { @@ -2370,7 +2391,8 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" " max_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" " in_capsule_data_size=%d, max_aq_depth=%d,\n" - " num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d\n", + " num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d,\n" + " wr_batching=%d\n", opts->max_queue_depth, opts->max_io_size, opts->max_qpairs_per_ctrlr, @@ -2379,7 +2401,8 @@ spdk_nvmf_rdma_create(struct 
spdk_nvmf_transport_opts *opts) opts->max_aq_depth, opts->num_shared_buffers, opts->max_srq_depth, - opts->no_srq); + opts->no_srq, + opts->wr_batching); /* I/O unit size cannot be larger than max I/O size */ if (opts->io_unit_size > opts->max_io_size) { diff --git a/module/event/subsystems/nvmf/conf.c b/module/event/subsystems/nvmf/conf.c index 278ede8efc4..43e7de69cc0 100644 --- a/module/event/subsystems/nvmf/conf.c +++ b/module/event/subsystems/nvmf/conf.c @@ -642,6 +642,8 @@ spdk_nvmf_parse_transport(struct spdk_nvmf_parse_transport_ctx *ctx) } bval = spdk_conf_section_get_boolval(ctx->sp, "NoSRQ", false); opts.no_srq = bval; + bval = spdk_conf_section_get_boolval(ctx->sp, "WRBatching", true); + opts.wr_batching = bval; } if (trtype == SPDK_NVME_TRANSPORT_TCP) { diff --git a/scripts/rpc.py b/scripts/rpc.py index 2660d3a72a1..04472717bf3 100755 --- a/scripts/rpc.py +++ b/scripts/rpc.py @@ -1684,7 +1684,8 @@ def nvmf_create_transport(args): no_srq=args.no_srq, c2h_success=args.c2h_success, dif_insert_or_strip=args.dif_insert_or_strip, - sock_priority=args.sock_priority) + sock_priority=args.sock_priority, + wr_batching=args.wr_batching) p = subparsers.add_parser('nvmf_create_transport', help='Create NVMf transport') p.add_argument('-t', '--trtype', help='Transport type (ex. RDMA)', type=str, required=True) @@ -1702,6 +1703,7 @@ def nvmf_create_transport(args): p.add_argument('-o', '--c2h-success', action='store_false', help='Disable C2H success optimization. Relevant only for TCP transport') p.add_argument('-f', '--dif-insert-or-strip', action='store_true', help='Enable DIF insert/strip. Relevant only for TCP transport') p.add_argument('-y', '--sock-priority', help='The sock priority of the tcp connection. Relevant only for TCP transport', type=int) + p.add_argument('-b', '--wr-batching', action='store_false', help='Disable work requests batching.
Relevant only for RDMA transport') p.set_defaults(func=nvmf_create_transport) def nvmf_get_transports(args): diff --git a/scripts/rpc/nvmf.py b/scripts/rpc/nvmf.py index c471f63373f..b42e46450c9 100644 --- a/scripts/rpc/nvmf.py +++ b/scripts/rpc/nvmf.py @@ -106,7 +106,8 @@ def nvmf_create_transport(client, no_srq=False, c2h_success=True, dif_insert_or_strip=None, - sock_priority=None): + sock_priority=None, + wr_batching=True): """NVMf Transport Create options. Args: @@ -123,7 +124,7 @@ def nvmf_create_transport(client, no_srq: Boolean flag to disable SRQ even for devices that support it - RDMA specific (optional) c2h_success: Boolean flag to disable the C2H success optimization - TCP specific (optional) dif_insert_or_strip: Boolean flag to enable DIF insert/strip for I/O - TCP specific (optional) - + wr_batching: Boolean flag to enable work requests batching; pass False to disable - RDMA specific (optional) Returns: True or False """ @@ -158,6 +159,8 @@ def nvmf_create_transport(client, params['dif_insert_or_strip'] = dif_insert_or_strip if sock_priority: params['sock_priority'] = sock_priority + if wr_batching is not None: + params['wr_batching'] = wr_batching return client.call('nvmf_create_transport', params)