Skip to content

Commit

Permalink
nvmf/rdma: Add NUMA aware policy
Browse files Browse the repository at this point in the history
Increase the performance of the SPDK NVMe-oF RDMA target when running on a
system with multiple NUMA nodes and multiple network adapters.
Add the NumaPolicy parameter to the RDMA transport configuration.

Signed-off-by: Ivan Betsis <c_ivanb@mellanox.com>
Change-Id: Ib64033f3cfeefda02f886c31a3331cd9b219875e
  • Loading branch information
Ivan Betsis committed Apr 1, 2020
1 parent 2f585d3 commit 2afc751
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 1 deletion.
2 changes: 2 additions & 0 deletions etc/spdk/nvmf.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@
# Set the maximum number outstanding I/O per shared receive queue. Relevant only for RDMA transport
#MaxSRQDepth 4096

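# Enable the NUMA-aware connection scheduler: I/O queue pairs are assigned to a poll group
# on the same NUMA node as the RDMA device. Relevant only for RDMA transport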
#NumaPolicy False
[Transport]
# Set TCP transport type.
Type TCP
Expand Down
1 change: 1 addition & 0 deletions include/spdk/nvmf.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ struct spdk_nvmf_transport_opts {
bool no_srq;
bool c2h_success;
bool dif_insert_or_strip;
bool numa_policy;
uint32_t sock_priority;
};

Expand Down
90 changes: 89 additions & 1 deletion lib/nvmf/rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>
#include <numa.h>
#include <numaif.h>

#include "spdk/config.h"
#include "spdk/thread.h"
Expand Down Expand Up @@ -463,16 +465,21 @@ struct spdk_nvmf_rdma_poll_group {
struct spdk_nvmf_rdma_poll_group_stat stat;
TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers;
TAILQ_ENTRY(spdk_nvmf_rdma_poll_group) link;

int numa_node;
/*
* buffers which are split across multiple RDMA
* memory regions cannot be used by this transport.
*/
STAILQ_HEAD(, spdk_nvmf_transport_pg_cache_buf) retired_bufs;
};

/* Number of per-NUMA-node scheduling slots kept in spdk_nvmf_rdma_conn_sched. */
#define MAX_NUMA 8

/*
 * Cursors used to pick the poll group for a newly accepted queue pair.
 * Each cursor points at the poll group that will be handed out next.
 */
struct spdk_nvmf_rdma_conn_sched {
/* Next poll group to use for an admin queue pair (qid == 0). */
struct spdk_nvmf_rdma_poll_group *next_admin_pg;
/* Next poll group for an I/O queue pair; presumably used when the NUMA policy is off. */
struct spdk_nvmf_rdma_poll_group *next_io_pg;
/*
 * Next poll group for an I/O queue pair, indexed by the NUMA node of the
 * RDMA device. A slot stays NULL until a poll group is created on that node.
 * NOTE(review): the index is not range-checked against MAX_NUMA where it is
 * written at poll group creation - confirm that nodes are always in range.
 */
struct spdk_nvmf_rdma_poll_group *next_io_pgs[MAX_NUMA];
};

/* Assuming rdma_cm uses just one protection domain per ibv_context. */
Expand All @@ -484,6 +491,7 @@ struct spdk_nvmf_rdma_device {
struct ibv_pd *pd;

int num_srq;
int numa_node;

TAILQ_ENTRY(spdk_nvmf_rdma_device) link;
};
Expand Down Expand Up @@ -2315,6 +2323,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
#define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32
#define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false
#define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false
/* NUMA-aware connection scheduling is disabled by default. */
#define SPDK_NVMF_RDMA_NUMA_POLICY false

static void
spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
Expand All @@ -2330,6 +2339,7 @@ spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH;
opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ;
opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP;
opts->numa_policy = SPDK_NVMF_RDMA_NUMA_POLICY;
}

const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = {
Expand All @@ -2339,6 +2349,32 @@ const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = {

static int spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport);

/*
 * Look up the NUMA node an RDMA device is attached to.
 *
 * On Linux the value is read from
 * /sys/class/infiniband/<device_name>/device/numa_node.
 *
 * \param device_name Name of the InfiniBand device as it appears under
 * /sys/class/infiniband. May be NULL.
 *
 * \return The non-negative NUMA node number of the device, or 0 when it cannot
 * be determined: non-Linux build, NULL name, path too long, sysfs file missing
 * or unparsable, or the kernel reporting a negative node (no NUMA affinity).
 */
static int
spdk_nvmf_rdma_get_numa_node_for_device(const char *device_name)
{
	int node = 0;
#if defined(__linux__)
	FILE *numa_file;
	char path[PATH_MAX];
	int res;

	if (device_name == NULL) {
		SPDK_ERRLOG("device name is NULL, assuming numa node 0\n");
		return 0;
	}

	res = snprintf(path, sizeof(path), "/sys/class/infiniband/%s/device/numa_node", device_name);
	if (res < 0 || (size_t)res >= sizeof(path)) {
		SPDK_ERRLOG("numa node path for device %s is too long\n", device_name);
		return 0;
	}

	numa_file = fopen(path, "r");
	if (!numa_file) {
		SPDK_ERRLOG("file with numa node for device does not exist: %s\n", path);
		return 0;
	}

	res = fscanf(numa_file, "%d", &node);
	fclose(numa_file);

	if (res != 1) {
		SPDK_ERRLOG("file with numa node for device is empty or invalid: %s\n", path);
		node = 0;
	} else if (node < 0) {
		/*
		 * The kernel reports -1 when the device has no NUMA affinity.
		 * The result is used as an array index by the connection
		 * scheduler, so never hand out a negative node.
		 */
		node = 0;
	}
#else
	(void)device_name;
#endif
	return node;
}

static struct spdk_nvmf_transport *
spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
{
Expand Down Expand Up @@ -2399,7 +2435,8 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
opts->max_aq_depth,
opts->num_shared_buffers,
opts->max_srq_depth,
opts->no_srq);
opts->no_srq,
opts->numa_policy);

/* I/O unit size cannot be larger than max I/O size */
if (opts->io_unit_size > opts->max_io_size) {
Expand Down Expand Up @@ -2539,6 +2576,8 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)

assert(device->map != NULL);
assert(device->pd != NULL);

device->numa_node = spdk_nvmf_rdma_get_numa_node_for_device(ibv_get_device_name(device->context->device));
}
rdma_free_devices(contexts);

Expand Down Expand Up @@ -3329,6 +3368,8 @@ spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport)
return NULL;
}

rgroup->numa_node = numa_node_of_cpu(sched_getcpu());

TAILQ_INIT(&rgroup->pollers);
STAILQ_INIT(&rgroup->retired_bufs);

Expand Down Expand Up @@ -3404,15 +3445,57 @@ spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport)
}

TAILQ_INSERT_TAIL(&rtransport->poll_groups, rgroup, link);

if (rtransport->conn_sched.next_admin_pg == NULL) {
rtransport->conn_sched.next_admin_pg = rgroup;
rtransport->conn_sched.next_io_pg = rgroup;
}
rtransport->conn_sched.next_io_pgs[rgroup->numa_node] = rgroup;

pthread_mutex_unlock(&rtransport->lock);

return &rgroup->group;
}

/*
 * Pick the poll group for a new queue pair, preferring poll groups that run on
 * the same NUMA node as the RDMA device the queue pair was accepted on.
 *
 * Admin queue pairs (qid == 0) take the group at the admin cursor. I/O queue
 * pairs take the group at the cursor of the device's NUMA node; when the node
 * is out of range or no poll group exists on it, they fall back to the
 * NUMA-agnostic I/O cursor. The cursor that was used is then advanced to the
 * next poll group on the device's node, or by one position when no poll group
 * lives on that node.
 *
 * \param qpair The new queue pair.
 *
 * \return The selected poll group, or NULL if the transport has no poll groups.
 */
static struct spdk_nvmf_transport_poll_group *
spdk_nvmf_rdma_get_optimal_numa_poll_group(struct spdk_nvmf_qpair *qpair)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_poll_group **pg;
	struct spdk_nvmf_rdma_poll_group *start, *next;
	struct spdk_nvmf_transport_poll_group *result;
	struct spdk_nvmf_rdma_qpair *rqpair;
	size_t num_slots;
	int node;

	rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	pthread_mutex_lock(&rtransport->lock);

	if (TAILQ_EMPTY(&rtransport->poll_groups)) {
		pthread_mutex_unlock(&rtransport->lock);
		return NULL;
	}

	node = rqpair->device->numa_node;
	num_slots = sizeof(rtransport->conn_sched.next_io_pgs) /
		    sizeof(rtransport->conn_sched.next_io_pgs[0]);

	if (qpair->qid == 0) {
		pg = &rtransport->conn_sched.next_admin_pg;
	} else if (node >= 0 && (size_t)node < num_slots &&
		   rtransport->conn_sched.next_io_pgs[node] != NULL) {
		pg = &rtransport->conn_sched.next_io_pgs[node];
	} else {
		/*
		 * The device's node is outside the per-node table or has no
		 * poll group on it: use the NUMA-agnostic cursor instead of
		 * indexing out of bounds or dereferencing an empty slot.
		 */
		pg = &rtransport->conn_sched.next_io_pg;
	}
	assert(*pg != NULL);

	result = &(*pg)->group;

	/*
	 * Advance the cursor to the next poll group on the device's node,
	 * wrapping around the list. Stop after one full lap so that the search
	 * terminates even when no poll group runs on that node.
	 */
	start = *pg;
	next = start;
	do {
		next = TAILQ_NEXT(next, link);
		if (next == NULL) {
			next = TAILQ_FIRST(&rtransport->poll_groups);
		}
	} while (next != start && next->numa_node != node);

	if (next->numa_node != node) {
		/* Nothing on the device's node: plain round-robin step. */
		next = TAILQ_NEXT(start, link);
		if (next == NULL) {
			next = TAILQ_FIRST(&rtransport->poll_groups);
		}
	}
	*pg = next;

	pthread_mutex_unlock(&rtransport->lock);

	return result;
}

static struct spdk_nvmf_transport_poll_group *
spdk_nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair)
{
Expand All @@ -3422,6 +3505,10 @@ spdk_nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair)

rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);

if (rtransport->transport.opts.numa_policy) {
return spdk_nvmf_rdma_get_optimal_numa_poll_group(qpair);
}

pthread_mutex_lock(&rtransport->lock);

if (TAILQ_EMPTY(&rtransport->poll_groups)) {
Expand All @@ -3440,6 +3527,7 @@ spdk_nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair)
result = &(*pg)->group;

*pg = TAILQ_NEXT(*pg, link);

if (*pg == NULL) {
*pg = TAILQ_FIRST(&rtransport->poll_groups);
}
Expand Down
2 changes: 2 additions & 0 deletions module/event/subsystems/nvmf/conf.c
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,8 @@ spdk_nvmf_parse_transport(struct spdk_nvmf_parse_transport_ctx *ctx)
}
bval = spdk_conf_section_get_boolval(ctx->sp, "NoSRQ", false);
opts.no_srq = bval;
bval = spdk_conf_section_get_boolval(ctx->sp, "NumaPolicy", false);
opts.numa_policy = bval;
}

if (trtype == SPDK_NVME_TRANSPORT_TCP) {
Expand Down

0 comments on commit 2afc751

Please sign in to comment.