From 1c2c964c90fdd4cc826e205197c3c8f0d1bc7125 Mon Sep 17 00:00:00 2001 From: Mamzi Bayatpour Date: Thu, 18 Apr 2024 10:20:16 -0700 Subject: [PATCH] TL/MLX5: fix warning and var names --- src/components/tl/mlx5/mcast/tl_mlx5_mcast.h | 2 +- .../tl/mlx5/mcast/tl_mlx5_mcast_team.c | 2 +- src/components/tl/mlx5/tl_mlx5.h | 2 +- src/components/tl/mlx5/tl_mlx5_context.c | 6 ++-- src/components/tl/mlx5/tl_mlx5_team.c | 31 +++++++++---------- 5 files changed, 20 insertions(+), 23 deletions(-) diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast.h b/src/components/tl/mlx5/mcast/tl_mlx5_mcast.h index 8c261d830c..711053f1b2 100644 --- a/src/components/tl/mlx5/mcast/tl_mlx5_mcast.h +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast.h @@ -157,7 +157,7 @@ typedef struct ucc_tl_mlx5_mcast_context { ucc_tl_mlx5_mcast_context_config_t cfg; ucc_mpool_t req_mp; int mcast_enabled; - int mcast_ready; + int mcast_ctx_ready; ucc_tl_mlx5_mcast_oob_ctx_t oob_ctx; } ucc_tl_mlx5_mcast_context_t; diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c index 6823abaa08..61f6865669 100644 --- a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c @@ -62,7 +62,7 @@ ucc_status_t ucc_tl_mlx5_mcast_team_init(ucc_base_context_t *base_context, ucc_tl_mlx5_mcast_coll_comm_t *comm; int i; - if (!ctx->mcast_enabled || !ctx->mcast_ready || NULL == mcast_context) { + if (!ctx->mcast_ctx_ready) { tl_debug(base_context->lib, "mcast context not available, base_context = %p", base_context ); diff --git a/src/components/tl/mlx5/tl_mlx5.h b/src/components/tl/mlx5/tl_mlx5.h index e07838b3a4..1b6404e6bd 100644 --- a/src/components/tl/mlx5/tl_mlx5.h +++ b/src/components/tl/mlx5/tl_mlx5.h @@ -145,7 +145,7 @@ typedef struct ucc_tl_mlx5_team { ucc_tl_mlx5_alltoall_t *a2a; ucc_topo_t *topo; ucc_ep_map_t ctx_map; - int local_mcast_ctx_ready; + int local_mcast_team_ready; ucc_tl_mlx5_mcast_team_t *mcast; ucc_status_t local_status_array[UCC_TL_MLX5_FEATURES_COUNT]; ucc_status_t global_status_array[UCC_TL_MLX5_FEATURES_COUNT]; diff --git a/src/components/tl/mlx5/tl_mlx5_context.c b/src/components/tl/mlx5/tl_mlx5_context.c index 0afe7e6cc0..7eaffaef4c 100644 --- a/src/components/tl/mlx5/tl_mlx5_context.c +++ b/src/components/tl/mlx5/tl_mlx5_context.c @@ -49,13 +49,13 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t, goto err_rcache; } - self->mcast.mcast_ready = 0; + self->mcast.mcast_ctx_ready = 0; if (params->thread_mode == UCC_THREAD_SINGLE) { status = ucc_tl_mlx5_mcast_context_init(&(self->mcast), &(self->cfg.mcast_ctx_conf)); if (UCC_OK != status) { tl_debug(self->super.super.lib, "failed to initialize mcast context"); } else { - self->mcast.mcast_ready = 1; + self->mcast.mcast_ctx_ready = 1; } } return UCC_OK; @@ -82,7 +82,7 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_mlx5_context_t) ucc_mpool_cleanup(&self->req_mp, 1); - if (self->mcast.mcast_ready) { + if (self->mcast.mcast_ctx_ready) { ucc_tl_mlx5_mcast_clean_ctx(&self->mcast.mcast_context); } } diff --git a/src/components/tl/mlx5/tl_mlx5_team.c b/src/components/tl/mlx5/tl_mlx5_team.c index 5bcf7c7280..614ead348b 100644 --- a/src/components/tl/mlx5/tl_mlx5_team.c +++ b/src/components/tl/mlx5/tl_mlx5_team.c @@ -75,13 +75,16 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_team_t, ucc_base_context_t *tl_context, } self->mcast = NULL; - status = ucc_tl_mlx5_mcast_team_init(tl_context, &(self->mcast), &(ctx->mcast), - params, &(UCC_TL_MLX5_TEAM_LIB(self)->cfg.mcast_conf)); - if (UCC_OK != status) { - tl_warn(tl_context->lib, "mcast team init failed"); - self->local_mcast_ctx_ready = 0; - } else { - self->local_mcast_ctx_ready = 1; + + self->local_mcast_team_ready = 0; + if (ctx->mcast.mcast_ctx_ready) { + status = ucc_tl_mlx5_mcast_team_init(tl_context, &(self->mcast), &(ctx->mcast), + params, &(UCC_TL_MLX5_TEAM_LIB(self)->cfg.mcast_conf)); + if (UCC_OK != status) { + tl_warn(tl_context->lib, "mcast team init failed"); + } else { + self->local_mcast_team_ready = 1; + } } self->mcast_state = TL_MLX5_TEAM_STATE_MCAST_CTX_CHECK; @@ -186,9 +189,9 @@ ucc_status_t ucc_tl_mlx5_team_create_test(ucc_base_team_t *team) /* mcast context is not available for some of the team members so we cannot create * mcast team */ tl_debug(UCC_TL_TEAM_LIB(tl_team), - "failure during mcast ctx create, no mcast team support"); + "some of the ranks do not have mcast context available so no mcast team is created"); - if (tl_team->local_mcast_ctx_ready) { + if (tl_team->local_mcast_team_ready) { comm = tl_team->mcast->mcast_comm; /* release the resources */ if (ibv_dereg_mr(comm->grh_mr)) { @@ -230,17 +233,11 @@ ucc_status_t ucc_tl_mlx5_team_create_test(ucc_base_team_t *team) tl_team->mcast_state = TL_MLX5_TEAM_STATE_MCAST_NOT_AVAILABLE; } - tl_debug(team->context->lib, "attempted to initialize tl team: %p: MCAST component is %s ALLTOALL component is %s", + tl_debug(team->context->lib, "team %p: MCAST component is %s ALLTOALL component is %s", team, (tl_team->mcast_state == TL_MLX5_TEAM_STATE_MCAST_READY)?"ENABLED":"DISABLED", (tl_team->a2a_state == TL_MLX5_TEAM_STATE_ALLTOALL_READY)?"ENABLED":"DISABLED"); } - if (tl_team->mcast_state == TL_MLX5_TEAM_STATE_MCAST_NOT_AVAILABLE && - tl_team->a2a_state == TL_MLX5_TEAM_STATE_ALLTOALL_NOT_AVAILABLE) { - tl_warn(team->context->lib, "unable to initialize tl team as both ALLTOALL and MCAST are not available: %p", team); - return UCC_ERR_NO_RESOURCE; - } - return UCC_OK; } @@ -252,7 +249,7 @@ ucc_status_t ucc_tl_mlx5_team_create_test(ucc_base_team_t *team) tl_team->local_status_array[UCC_TL_MLX5_A2A_STATUS_INDEX] = tl_team->a2a_status.local; tl_team->local_status_array[UCC_TL_MLX5_MCAST_STATUS_INDEX] = - (tl_team->local_mcast_ctx_ready) ? UCC_OK : UCC_ERR_NO_RESOURCE; + (tl_team->local_mcast_team_ready) ? UCC_OK : UCC_ERR_NO_RESOURCE; goto initial_sync_post; }