From 53350c7cc5280935b6fd974263de96ac717d962e Mon Sep 17 00:00:00 2001
From: Sergey Lebedev
Date: Thu, 22 Feb 2024 21:59:05 +0100
Subject: [PATCH] CL/HIER: change score only for supported colls (#923)

---
 src/coll_score/ucc_coll_score.c | 12 +++++++++++-
 src/coll_score/ucc_coll_score.h | 14 ++++++++------
 src/coll_score/ucc_coll_score_map.c | 19 +++++++++++--------
 src/components/cl/hier/cl_hier.h | 10 +++++++---
 src/components/cl/hier/cl_hier_team.c | 2 +-
 5 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/src/coll_score/ucc_coll_score.c b/src/coll_score/ucc_coll_score.c
index 7cc4f90af3..c99d33f9dc 100644
--- a/src/coll_score/ucc_coll_score.c
+++ b/src/coll_score/ucc_coll_score.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
@@ -9,6 +9,16 @@
 #include "utils/ucc_log.h"
 #include "utils/ucc_coll_utils.h"
 
+char *ucc_score_to_str(ucc_score_t score, char *buf, size_t max) {
+    if (score == UCC_SCORE_MAX) {
+        ucc_strncpy_safe(buf, "inf", max);
+    } else {
+        ucc_snprintf_safe(buf, max, "%d", score);
+    }
+
+    return buf;
+}
+
 ucc_status_t ucc_coll_score_alloc(ucc_coll_score_t **score)
 {
     ucc_coll_score_t *s = ucc_malloc(sizeof(*s), "ucc_coll_score");
diff --git a/src/coll_score/ucc_coll_score.h b/src/coll_score/ucc_coll_score.h
index 16f0ba0b74..fa95e6a76a 100644
--- a/src/coll_score/ucc_coll_score.h
+++ b/src/coll_score/ucc_coll_score.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
@@ -63,6 +63,8 @@ typedef struct ucc_coll_score {
 
 typedef struct ucc_score_map ucc_score_map_t;
 
+char *ucc_score_to_str(ucc_score_t score, char *buf, size_t max);
+
 /* Allocates empty score data structure */
 ucc_status_t ucc_coll_score_alloc(ucc_coll_score_t **score);
 
@@ -77,7 +79,7 @@ ucc_status_t ucc_coll_score_add_range(ucc_coll_score_t *score,
 
 /* Releases the score data structure and all the score ranges stored
    there */
-void ucc_coll_score_free(ucc_coll_score_t *score);
+void ucc_coll_score_free(ucc_coll_score_t *score);
 
 /* Merges 2 scores score1 and score2 into the new score "rst" selecting
    larger score. Ie.: rst will contain a range from score1 if either
@@ -87,9 +89,9 @@ void ucc_coll_score_free(ucc_coll_score_t *score);
    This fn is used by CL to merge scores from multiple TLs and produce
    a score map. As a result the produced score map will select TL with
    higher score.*/
-ucc_status_t ucc_coll_score_merge(ucc_coll_score_t * score1,
-                                  ucc_coll_score_t * score2,
-                                  ucc_coll_score_t **rst, int free_inputs);
+ucc_status_t ucc_coll_score_merge(ucc_coll_score_t * score1,
+                                  ucc_coll_score_t * score2,
+                                  ucc_coll_score_t **rst, int free_inputs);
 
 
 /* Parses SCORE string (see ucc_base_iface.c for pattern description)
@@ -147,7 +149,7 @@ ucc_status_t ucc_coll_score_build_default(ucc_base_team_t *team,
 ucc_status_t ucc_coll_score_build_map(ucc_coll_score_t *score,
                                       ucc_score_map_t **map);
 
-void ucc_coll_score_free_map(ucc_score_map_t *map);
+void ucc_coll_score_free_map(ucc_score_map_t *map);
 
 /* Initializes task based on args selection and score map.
    Checks fallbacks if necessary. */
diff --git a/src/coll_score/ucc_coll_score_map.c b/src/coll_score/ucc_coll_score_map.c
index 5b67260bd8..9267a77478 100644
--- a/src/coll_score/ucc_coll_score_map.c
+++ b/src/coll_score/ucc_coll_score_map.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
@@ -160,11 +160,12 @@ ucc_status_t ucc_coll_init(ucc_score_map_t *map,
 
 void ucc_coll_score_map_print_info(const ucc_score_map_t *map)
 {
-    size_t left;
-    ucc_msg_range_t *range;
-    int i, j, all_empty;
-    char range_str[128];
-    char coll_str[1024];
+    size_t left;
+    ucc_msg_range_t *range;
+    int i, j, all_empty;
+    char score_str[32];
+    char range_str[128];
+    char coll_str[1024];
 
     for (i = 0; i < UCC_COLL_TYPE_NUM; i++) {
         all_empty = 1;
@@ -191,10 +192,12 @@ void ucc_coll_score_map_print_info(const ucc_score_map_t *map)
                                   super.list_elem) {
                     ucc_memunits_range_str(range->start, range->end, range_str,
                                            sizeof(range_str));
-                    STR_APPEND(coll_str, left, 256, "{%s}:%s:%u ",
+                    ucc_score_to_str(range->super.score, score_str,
+                                     sizeof(score_str));
+                    STR_APPEND(coll_str, left, 256, "{%s}:%s:%s ",
                                range_str,
                                range->super.team->context->lib->log_component.name,
-                               range->super.score);
+                               score_str);
                 }
                 STR_APPEND(coll_str, left, 4, "\n");
             }
diff --git a/src/components/cl/hier/cl_hier.h b/src/components/cl/hier/cl_hier.h
index 8f538c1d7b..c2fcf5e245 100644
--- a/src/components/cl/hier/cl_hier.h
+++ b/src/components/cl/hier/cl_hier.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
  *
  * See file LICENSE for terms.
@@ -109,8 +109,12 @@ typedef struct ucc_cl_hier_team {
 UCC_CLASS_DECLARE(ucc_cl_hier_team_t, ucc_base_context_t *,
                   const ucc_base_team_params_t *);
 
-#define UCC_CL_HIER_SUPPORTED_COLLS \
-    (UCC_COLL_TYPE_ALLTOALL | UCC_COLL_TYPE_ALLTOALLV)
+#define UCC_CL_HIER_SUPPORTED_COLLS \
+    (UCC_COLL_TYPE_ALLTOALL | \
+     UCC_COLL_TYPE_ALLTOALLV | \
+     UCC_COLL_TYPE_ALLREDUCE | \
+     UCC_COLL_TYPE_BARRIER | \
+     UCC_COLL_TYPE_BCAST)
 
 ucc_status_t ucc_cl_hier_coll_init(ucc_base_coll_args_t *coll_args,
                                    ucc_base_team_t *team,
diff --git a/src/components/cl/hier/cl_hier_team.c b/src/components/cl/hier/cl_hier_team.c
index 8457e3db83..32ef7e2f93 100644
--- a/src/components/cl/hier/cl_hier_team.c
+++ b/src/components/cl/hier/cl_hier_team.c
@@ -363,7 +363,7 @@ ucc_status_t ucc_cl_hier_team_get_scores(ucc_base_team_t *cl_team,
     team_info.init = ucc_cl_hier_coll_init;
     team_info.num_mem_types = 0;
     team_info.supported_mem_types = NULL; /* all memory types supported*/
-    team_info.supported_colls = UCC_COLL_TYPE_ALL;
+    team_info.supported_colls = UCC_CL_HIER_SUPPORTED_COLLS;
     team_info.size = UCC_CL_TEAM_SIZE(team);
 
     status = ucc_coll_score_alloc(&score);