Skip to content

Commit

Permalink
TL/MLX5: complete the mcast init
Browse files Browse the repository at this point in the history
  • Loading branch information
MamziB committed Feb 15, 2024
1 parent 53a808e commit b52f83c
Show file tree
Hide file tree
Showing 9 changed files with 647 additions and 65 deletions.
19 changes: 19 additions & 0 deletions src/components/tl/mlx5/mcast/tl_mlx5_mcast.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "components/tl/ucc_tl.h"
#include "components/tl/ucc_tl_log.h"
#include "utils/ucc_rcache.h"
#include "core/ucc_service_coll.h"

#define POLL_PACKED 16
#define REL_DONE ((void*)-1)
Expand Down Expand Up @@ -119,6 +120,7 @@ typedef struct ucc_tl_mlx5_mcast_rcache_region {
} ucc_tl_mlx5_mcast_rcache_region_t;

typedef struct ucc_tl_mlx5_mcast_ctx_params {
int mcast_enabled;
char *ib_dev_name;
int print_nack_stats;
int timeout;
Expand All @@ -142,11 +144,19 @@ typedef struct ucc_tl_mlx5_mcast_coll_context {
ucc_base_lib_t *lib;
} ucc_tl_mlx5_mcast_coll_context_t;

/*
 * Multicast group join information.
 * NOTE(review): presumably produced by the rank that joins/creates the group
 * and then distributed to the other ranks (see group_setup_info in the comm
 * struct) - the producer is not visible here, confirm against team init.
 */
typedef struct ucc_tl_mlx5_mcast_join_info_t {
/* status associated with the join (who sets it is not shown here) */
ucc_status_t status;
/* presumably the destination LID of the multicast group - TODO confirm */
uint16_t dlid;
/* presumably the destination GID of the multicast group - TODO confirm */
union ibv_gid dgid;
} ucc_tl_mlx5_mcast_join_info_t;

/* Per-context state of the TL/MLX5 multicast component. */
typedef struct ucc_tl_mlx5_mcast_context {
ucc_thread_mode_t tm;
/* zeroed and populated by ucc_tl_mlx5_mcast_context_init() */
ucc_tl_mlx5_mcast_coll_context_t mcast_context;
ucc_tl_mlx5_mcast_context_config_t cfg;
ucc_mpool_t req_mp;
/* copy of the mcast_enabled value passed in the context init parameters */
int mcast_enabled;
/* NOTE(review): presumably set once mcast setup has completed - confirm */
int mcast_ready;
ucc_tl_mlx5_mcast_oob_ctx_t oob_ctx;
} ucc_tl_mlx5_mcast_context_t;

Expand Down Expand Up @@ -225,6 +235,11 @@ typedef struct ucc_tl_mlx5_mcast_coll_comm {
int n_prep_reliable;
int n_mcast_reliable;
int wsize;
ucc_tl_mlx5_mcast_join_info_t *group_setup_info;
ucc_service_coll_req_t *group_setup_info_req;
ucc_status_t (*bcast_post) (void*, void*, size_t, ucc_rank_t, ucc_service_coll_req_t**);
ucc_status_t (*bcast_test) (ucc_service_coll_req_t*);
struct rdma_cm_event *event;
struct pp_packet *r_window[1]; // do not add any new variable after here
} ucc_tl_mlx5_mcast_coll_comm_t;

Expand Down Expand Up @@ -352,11 +367,15 @@ ucc_status_t ucc_tl_mlx5_mcast_team_init(ucc_base_context_t *tl_context,
const ucc_base_team_params_t *params,
ucc_tl_mlx5_mcast_coll_comm_init_spec_t *mcast_conf);

/*
 * NOTE(review): presumably polls progress of the team setup started by
 * ucc_tl_mlx5_mcast_team_init() - definition not visible here, confirm.
 */
ucc_status_t ucc_tl_mlx5_mcast_team_test(ucc_base_team_t *team);

/*
 * NOTE(review): presumably creates the collective task for coll_args on the
 * given team and returns it through task_h - definition not visible, confirm.
 */
ucc_status_t ucc_tl_mlx5_mcast_coll_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *team,
ucc_coll_task_t **task_h);

/*
 * Initialize the multicast context from mcast_ctx_conf.
 * Records mcast_ctx_conf->mcast_enabled in the context and returns
 * UCC_ERR_NO_RESOURCE when multicast is disabled by the user.
 */
ucc_status_t ucc_tl_mlx5_mcast_context_init(ucc_tl_mlx5_mcast_context_t *mcast_ctx,
ucc_tl_mlx5_mcast_ctx_params_t *mcast_ctx_conf);


/*
 * Release the resources held by ctx: registration cache, protection domain,
 * RDMA CM id and event channel. The ctx structure itself is not freed.
 * Returns UCC_ERR_NO_RESOURCE if ibv_dealloc_pd() or rdma_destroy_id() fails,
 * UCC_OK otherwise.
 */
ucc_status_t ucc_tl_mlx5_mcast_clean_ctx(ucc_tl_mlx5_mcast_coll_context_t *ctx);
#endif
54 changes: 52 additions & 2 deletions src/components/tl/mlx5/mcast/tl_mlx5_mcast_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,20 @@ ucc_status_t ucc_tl_mlx5_mcast_context_init(ucc_tl_mlx5_mcast_context_t *cont
int ib_valid;
const char *dst;

mlx5_ctx = ucc_container_of(context, ucc_tl_mlx5_context_t, mcast);
lib = mlx5_ctx->super.super.lib;

context->mcast_enabled = mcast_ctx_conf->mcast_enabled;

if (!mcast_ctx_conf->mcast_enabled) {
tl_debug(lib, "Mcast is disabled by the user");
return UCC_ERR_NO_RESOURCE;
}

ctx = &(context->mcast_context);
memset(ctx, 0, sizeof(ucc_tl_mlx5_mcast_coll_context_t));
memcpy(&ctx->params, mcast_ctx_conf, sizeof(ucc_tl_mlx5_mcast_ctx_params_t));

mlx5_ctx = ucc_container_of(context, ucc_tl_mlx5_context_t, mcast);
lib = mlx5_ctx->super.super.lib;
ctx->lib = lib;

/* TODO unify all the contexts under TL mlx5 */
Expand Down Expand Up @@ -239,13 +247,55 @@ ucc_status_t ucc_tl_mlx5_mcast_context_init(ucc_tl_mlx5_mcast_context_t *cont
error:
if (ctx->pd) {
ibv_dealloc_pd(ctx->pd);
ctx->pd = NULL;
}
if (ctx->id) {
rdma_destroy_id(ctx->id);
ctx->id = NULL;
}
if (ctx->channel) {
rdma_destroy_event_channel(ctx->channel);
ctx->channel = NULL;
}

return status;
}

/**
 * Release the resources held by a multicast collective context.
 *
 * Destroys the registration cache, deallocates the protection domain,
 * destroys the RDMA CM id and its event channel, and frees devname when
 * params.ib_dev_name is the empty string. Every released handle is reset to
 * NULL so that a repeated call skips it. The context structure itself is
 * not freed.
 *
 * @param ctx  context to clean; NULL is accepted and treated as a no-op.
 *
 * @return UCC_OK on success or when ctx is NULL; UCC_ERR_NO_RESOURCE when
 *         ibv_dealloc_pd() or rdma_destroy_id() fails, in which case the
 *         resources that follow in the teardown order are left untouched.
 */
ucc_status_t ucc_tl_mlx5_mcast_clean_ctx(ucc_tl_mlx5_mcast_coll_context_t *ctx)
{
    /* Must precede the debug print below, which reads ctx->lib. */
    if (ctx == NULL) {
        return UCC_OK;
    }

    tl_debug(ctx->lib, "cleaning mcast ctx: %p", (void *)ctx);

    if (ctx->rcache) {
        ucc_rcache_destroy(ctx->rcache);
        ctx->rcache = NULL;
    }

    if (ctx->pd) {
        if (ibv_dealloc_pd(ctx->pd)) {
            tl_error(ctx->lib, "ibv_dealloc_pd failed errno %d", errno);
            return UCC_ERR_NO_RESOURCE;
        }
        ctx->pd = NULL;
    }

    if (ctx->id) {
        if (rdma_destroy_id(ctx->id)) {
            tl_error(ctx->lib, "rdma_destroy_id failed errno %d", errno);
            return UCC_ERR_NO_RESOURCE;
        }
        ctx->id = NULL;
    }

    /* The event channel is destroyed only after the id that uses it. */
    if (ctx->channel) {
        rdma_destroy_event_channel(ctx->channel);
        ctx->channel = NULL;
    }

    /* devname is freed only when no device name was supplied in params. */
    if (ctx->devname && !strcmp(ctx->params.ib_dev_name, "")) {
        ucc_free(ctx->devname);
        ctx->devname = NULL;
    }

    return UCC_OK;
}
30 changes: 0 additions & 30 deletions src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c
Original file line number Diff line number Diff line change
Expand Up @@ -529,33 +529,3 @@ ucc_status_t ucc_tl_mlx5_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm)
return UCC_OK;
}

/*
 * Tear down a multicast collective context and free the context itself.
 *
 * The CM id and the event channel are released unconditionally and ctx is
 * dereferenced immediately, so ctx, ctx->id and ctx->channel must be valid
 * on entry. Returns UCC_ERR_NO_RESOURCE if ibv_dealloc_pd() or
 * rdma_destroy_id() fails; on those early returns ctx is NOT freed.
 */
ucc_status_t ucc_tl_mlx5_clean_mcast_ctx(ucc_tl_mlx5_mcast_coll_context_t *ctx)
{
tl_debug(ctx->lib, "cleaning mcast ctx: %p", ctx);

if (ctx->rcache) {
ucc_rcache_destroy(ctx->rcache);
}

if (ctx->pd) {
if (ibv_dealloc_pd(ctx->pd)) {
tl_error(ctx->lib, "ibv_dealloc_pd failed errno %d", errno);
return UCC_ERR_NO_RESOURCE;
}
}

/* no NULL check on the id here */
if (rdma_destroy_id(ctx->id)) {
tl_error(ctx->lib, "rdma_destroy_id failed errno %d", errno);
return UCC_ERR_NO_RESOURCE;
}

rdma_destroy_event_channel(ctx->channel);

/* devname is freed only when the configured device name is empty */
if (!strcmp(ctx->params.ib_dev_name, "")) {
ucc_free(ctx->devname);
}

/* the context structure itself is released by this function */
ucc_free(ctx);

return UCC_OK;
}
8 changes: 8 additions & 0 deletions src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -365,4 +365,12 @@ ucc_status_t ucc_tl_mlx5_mcast_setup_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx,

ucc_status_t ucc_tl_mlx5_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm);

/*
 * NOTE(review): judging by the post/test naming, this presumably starts a
 * non-blocking join of the multicast group addressed by net_addr, with
 * is_root selecting the root rank's role - definition not visible, confirm.
 */
ucc_status_t ucc_tl_mlx5_mcast_join_mcast_post(ucc_tl_mlx5_mcast_coll_context_t *ctx,
struct sockaddr_in6 *net_addr,
int is_root);

/*
 * NOTE(review): presumably checks completion of a join started by
 * ucc_tl_mlx5_mcast_join_mcast_post() and hands the resulting RDMA CM event
 * back through *event - definition not visible, confirm.
 */
ucc_status_t ucc_tl_mlx5_mcast_join_mcast_test(ucc_tl_mlx5_mcast_coll_context_t *ctx,
struct rdma_cm_event **event,
int is_root);

#endif /* TL_MLX5_MCAST_HELPER_H_ */
Loading

0 comments on commit b52f83c

Please sign in to comment.