Skip to content

Commit

Permalink
CORE: fix coll trace for service team (openucx#1046)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sergei-Lebedev authored and fuxu committed Dec 26, 2024
1 parent e39c99c commit 2b0a0e7
Show file tree
Hide file tree
Showing 10 changed files with 21 additions and 20 deletions.
3 changes: 0 additions & 3 deletions src/components/tl/mlx5/tl_mlx5.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,6 @@ typedef struct ucc_tl_mlx5_rcache_region {
/* Expands to ucc_derived_of() applied to the lib pointer found at
 * (_ctx)->super.super.lib, with ucc_tl_mlx5_lib_t as the target type.
 * NOTE(review): presumably _ctx is a ucc_tl_mlx5_context_t pointer - confirm. */
#define UCC_TL_CTX_LIB(_ctx) \
(ucc_derived_of((_ctx)->super.super.lib, ucc_tl_mlx5_lib_t))

/* Non-zero when the scope stored at (_team)->super.super.params.scope equals
 * UCC_CL_LAST + 1, the scope value used for the service team. */
#define IS_SERVICE_TEAM(_team) \
((_team)->super.super.params.scope == UCC_CL_LAST + 1)

/* Expands to (_num) * (_num). The argument appears twice in the expansion,
 * so an argument with side effects is evaluated twice. */
#define SQUARED(_num) ((_num) * (_num))

ucc_status_t tl_mlx5_create_rcache(ucc_tl_mlx5_context_t *ctx);
Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/mlx5/tl_mlx5_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context)
while (UCC_INPROGRESS == (status = ucc_collective_test(&req->super))) {
ucc_context_progress(core_ctx);
}
ucc_collective_finalize(&req->super);
ucc_collective_finalize_internal(req);

if (UCC_OK != status) {
tl_debug(context->lib, "failure during mlx5 ctx bcast");
Expand Down
7 changes: 4 additions & 3 deletions src/components/tl/sharp/tl_sharp_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <inttypes.h>
#include "tl_sharp.h"
#include "utils/arch/cpu.h"
#include "core/ucc_service_coll.h"

static int ucc_tl_sharp_oob_barrier(void *arg)
{
Expand Down Expand Up @@ -141,7 +142,7 @@ static int ucc_tl_sharp_service_barrier(void *arg)
ucc_context_progress(ctx->super.super.ucc_context);
status = ucc_collective_test(&req->super);
} while (status == UCC_INPROGRESS);
ucc_collective_finalize(&req->super);
ucc_collective_finalize_internal(req);

return status;
}
Expand Down Expand Up @@ -179,7 +180,7 @@ static int ucc_tl_sharp_service_gather(void *arg, int root, void *sbuf,
ucc_context_progress(ctx->super.super.ucc_context);
status = ucc_collective_test(&req->super);
} while (status == UCC_INPROGRESS);
ucc_collective_finalize(&req->super);
ucc_collective_finalize_internal(req);

if (subset.myrank != root) {
ucc_free(rbuf);
Expand Down Expand Up @@ -208,7 +209,7 @@ static int ucc_tl_sharp_service_bcast(void *arg, void *buf, int size, int root)
status = ucc_collective_test(&req->super);
} while (status == UCC_INPROGRESS);

ucc_collective_finalize(&req->super);
ucc_collective_finalize_internal(req);
return status;
}

Expand Down
6 changes: 5 additions & 1 deletion src/components/tl/ucc_tl.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -165,4 +165,8 @@ typedef struct ucc_tl_lib_attr {
/* Accesses the map member of the team parameters, i.e.
 * (_tl_team)->super.super.params.map. */
#define UCC_TL_TEAM_MAP(_tl_team) (_tl_team)->super.super.params.map

/* Accesses the oob member of the nested params structure, i.e.
 * (_tl_team)->super.super.params.params.oob. */
#define UCC_TL_TEAM_OOB(_tl_team) (_tl_team)->super.super.params.params.oob

/* Non-zero when the scope stored at (_tl_team)->super.super.params.scope
 * equals UCC_CL_LAST + 1, the scope value used for the service team. */
#define UCC_TL_IS_SERVICE_TEAM(_tl_team) \
((_tl_team)->super.super.params.scope == UCC_CL_LAST + 1)

#endif
5 changes: 1 addition & 4 deletions src/components/tl/ucp/tl_ucp.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,8 @@ extern ucc_config_field_t ucc_tl_ucp_lib_config_table[];
/* Expands to ucc_derived_of() applied to the context pointer found at
 * (_team)->super.super.context, with ucc_tl_ucp_context_t as the target type. */
#define UCC_TL_UCP_TEAM_CTX(_team) \
(ucc_derived_of((_team)->super.super.context, ucc_tl_ucp_context_t))

/* Non-zero when the scope stored at (_team)->super.super.params.scope equals
 * UCC_CL_LAST + 1, the scope value used for the service team. */
#define IS_SERVICE_TEAM(_team) \
((_team)->super.super.params.scope == UCC_CL_LAST + 1)

#define USE_SERVICE_WORKER(_team) \
(IS_SERVICE_TEAM(_team) && UCC_TL_UCP_TEAM_CTX(_team)->cfg.service_worker)
(UCC_TL_IS_SERVICE_TEAM(_team) && UCC_TL_UCP_TEAM_CTX(_team)->cfg.service_worker)

/* Expands to ucc_derived_of() applied to the team pointer found at
 * (_task)->super.team, with ucc_tl_ucp_team_t as the target type. */
#define UCC_TL_UCP_TASK_TEAM(_task) \
(ucc_derived_of((_task)->super.team, ucc_tl_ucp_team_t))
Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/ucp/tl_ucp_ep.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/ucp/tl_ucp_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ static inline ucc_status_t ucc_tl_ucp_get_ep(ucc_tl_ucp_team_t *team,
ucc_team_t *core_team = UCC_TL_CORE_TEAM(team);
/* Core super.super.team ptr is NULL for service_team
which has scope == UCC_CL_LAST + 1*/
ucc_assert((NULL != core_team) || IS_SERVICE_TEAM(team));
ucc_assert((NULL != core_team) || UCC_TL_IS_SERVICE_TEAM(team));
ctx_rank = core_team ? ucc_get_ctx_rank(core_team, core_rank)
: core_rank;
*ep = team->worker->eps[ctx_rank];
Expand Down
5 changes: 3 additions & 2 deletions src/components/tl/ucp/tl_ucp_team.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_ucp_team_t, ucc_base_context_t *tl_context,
}
}

if (ucc_global_config.file_cfg && !IS_SERVICE_TEAM(self) &&
if (ucc_global_config.file_cfg && !UCC_TL_IS_SERVICE_TEAM(self) &&
ctx->topo_required && tl_context->lib->use_tuning) {
status = ucc_add_team_sections(&self->cfg, ucc_tl_ucp_lib_config_table,
self->topo, &self->tuning_str,
Expand All @@ -91,7 +91,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_ucp_team_t, ucc_base_context_t *tl_context,
self->cfg.use_reordering = 0;
}

if (self->topo && !IS_SERVICE_TEAM(self) && self->topo->topo->sock_bound) {
if (self->topo && !UCC_TL_IS_SERVICE_TEAM(self) &&
self->topo->topo->sock_bound) {
tsize = UCC_TL_TEAM_SIZE(self);
max_radix = (ucc_topo_max_ppn(self->topo) == 1) ? tsize :
ucc_min(tsize, ucc_topo_min_socket_size(self->topo));
Expand Down
4 changes: 1 addition & 3 deletions src/core/ucc_service_coll.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -139,8 +139,6 @@ ucc_status_t ucc_service_coll_test(ucc_service_coll_req_t *req)
return status;
}

ucc_status_t ucc_collective_finalize_internal(ucc_coll_task_t *task);

ucc_status_t ucc_service_coll_finalize(ucc_service_coll_req_t *req)
{
ucc_status_t status;
Expand Down
5 changes: 4 additions & 1 deletion src/core/ucc_service_coll.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* See file LICENSE for terms.
*/

Expand Down Expand Up @@ -37,4 +37,7 @@ ucc_status_t ucc_internal_oob_init(ucc_team_t *team, ucc_subset_t subset,
ucc_team_oob_coll_t *oob);

void ucc_internal_oob_finalize(ucc_team_oob_coll_t *oob);

ucc_status_t ucc_collective_finalize_internal(ucc_coll_task_t *task);

#endif

0 comments on commit 2b0a0e7

Please sign in to comment.