diff --git a/src/components/cl/hier/Makefile.am b/src/components/cl/hier/Makefile.am
index 243f5811e8..5377ef4528 100644
--- a/src/components/cl/hier/Makefile.am
+++ b/src/components/cl/hier/Makefile.am
@@ -2,6 +2,10 @@
 # Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 
+allgatherv =                   \
+	allgatherv/allgatherv.h    \
+	allgatherv/allgatherv.c
+
 allreduce =                    \
 	allreduce/allreduce.h      \
 	allreduce/allreduce.c      \
@@ -38,6 +42,7 @@ sources =             \
 	cl_hier_team.c             \
 	cl_hier_coll.c             \
 	cl_hier_coll.h             \
+	$(allgatherv)              \
 	$(allreduce)               \
 	$(alltoallv)               \
 	$(alltoall)                \
diff --git a/src/components/cl/hier/allgatherv/allgatherv.c b/src/components/cl/hier/allgatherv/allgatherv.c
new file mode 100644
index 0000000000..769ac7175c
--- /dev/null
+++ b/src/components/cl/hier/allgatherv/allgatherv.c
@@ -0,0 +1,440 @@
+/**
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "allgatherv.h"
+#include "../cl_hier_coll.h"
+#include "core/ucc_team.h"
+
+/* Upper bound on sub-collectives per schedule: gatherv, allgatherv, bcast */
+#define MAX_ALLGATHERV_TASKS 3
+
+ucc_base_coll_alg_info_t
+    ucc_cl_hier_allgatherv_algs[UCC_CL_HIER_ALLGATHERV_ALG_LAST + 1] = {
+        [UCC_CL_HIER_ALLGATHERV_ALG_NODE_SPLIT] =
+            {.id   = UCC_CL_HIER_ALLGATHERV_ALG_NODE_SPLIT,
+             .name = "node_split",
+             .desc = "splitting allgatherv into three consecutive calls, first "
+                     "a gatherv"
+                     " inside the node, then an allgatherv between the leaders "
+                     "and then a bcast."},
+        [UCC_CL_HIER_ALLGATHERV_ALG_LAST] = {
+            .id = 0, .name = NULL, .desc = NULL}};
+
+/* Post callback of the schedule: records a profiling event and starts it. */
+static ucc_status_t ucc_cl_hier_allgatherv_start(ucc_coll_task_t *task)
+{
+    UCC_CL_HIER_PROFILE_REQUEST_EVENT(task, "cl_hier_allgatherv_start", 0);
+    return ucc_schedule_start(task);
+}
+
+/*
+ * Finalize callback of the schedule: releases the scratch buffer (if any),
+ * finalizes the schedule and returns it through ucc_cl_hier_put_schedule().
+ * Returns the status of ucc_schedule_finalize().
+ */
+static ucc_status_t ucc_cl_hier_allgatherv_finalize(ucc_coll_task_t *task)
+{
+    ucc_cl_hier_schedule_t *cl_schedule =
+        ucc_derived_of(task, ucc_cl_hier_schedule_t);
+    ucc_status_t status;
+
+    UCC_CL_HIER_PROFILE_REQUEST_EVENT(task, "cl_hier_allgatherv_finalize", 0);
+
+    ucc_assert(cl_schedule->super.super.n_tasks <= MAX_ALLGATHERV_TASKS);
+
+    if (cl_schedule->scratch) {
+        ucc_mc_free(cl_schedule->scratch);
+        cl_schedule->scratch = NULL;
+    }
+
+    status = ucc_schedule_finalize(task);
+    ucc_cl_hier_put_schedule(&cl_schedule->super.super);
+    return status;
+}
+
+// Question: Do i need this function ?
+/*
+ * Copies the execution engine of the schedule to every sub-task that has a
+ * triggered_post_setup callback and invokes that callback.
+ * NOTE(review): the statuses returned by the sub-task callbacks are ignored,
+ * UCC_OK is always returned. The function is not installed on the schedule
+ * yet (the assignment in the init function below is commented out).
+ */
+ucc_status_t ucc_cl_hier_allgatherv_triggered_post_setup(ucc_coll_task_t *task)
+{
+    ucc_cl_hier_schedule_t *schedule =
+        ucc_derived_of(task, ucc_cl_hier_schedule_t);
+    ucc_status_t status  = UCC_OK;
+    int          n_tasks = schedule->super.super.n_tasks;
+    int          i       = 0;
+
+    for (i = 0; i < n_tasks; ++i) {
+        ucc_coll_task_t *sub_task = schedule->super.super.tasks[i];
+        if (sub_task->triggered_post_setup != NULL) {
+            sub_task->ee = task->ee;
+            sub_task->triggered_post_setup(sub_task);
+        }
+    }
+    return status;
+}
+
+/*
+ * Builds the node_split schedule out of up to three sub-collectives:
+ *   1. gatherv inside the node (only when the node has more than one rank),
+ *   2. allgatherv between the node leaders (only on the node root),
+ *   3. bcast inside the node (only when the node has more than one rank).
+ * The first stage is started by the schedule, every later stage by the
+ * completion of the stage before it.
+ *
+ * coll_args - arguments of the user allgatherv
+ * team      - CL/HIER team the collective runs on
+ * sched_p   - [out] the initialized schedule
+ * n_frags   - number of fragments of the enclosing pipeline (1 if none)
+ *
+ * Returns UCC_OK on success, UCC_ERR_NOT_SUPPORTED for 64 bit counts or
+ * displacements, UCC_ERR_NO_MEMORY when no schedule is available, otherwise
+ * the status of the first call that failed.
+ */
+static ucc_status_t ucc_cl_hier_allgatherv_node_split_init_schedule(
+    ucc_base_coll_args_t *coll_args, ucc_base_team_t *team,
+    ucc_schedule_t **sched_p, int n_frags)
+{
+    ucc_cl_hier_team_t     *cl_team = ucc_derived_of(team, ucc_cl_hier_team_t);
+    ucc_cl_hier_schedule_t *cl_schedule;
+    ucc_schedule_t         *schedule;
+    ucc_status_t            status;
+    ucc_base_coll_args_t    args;
+    ucc_coll_task_t        *tasks[MAX_ALLGATHERV_TASKS] = {NULL};
+    int                     n_tasks                     = 0;
+    void                   *gv_rc, *gv_displ;   // Gatherv args buffers
+    void                   *agv_rc, *agv_displ; // Allgatherv args buffers
+    int                     i, c64, d64, nrank, is_root;
+    ucc_rank_t              full_size, node_size, leaders_size;
+    size_t                  elem_size, scratch_size;
+    ucc_rank_t              node_root = 0;
+
+    //int rank = cl_team->sbgps[UCC_HIER_SBGP_FULL].sbgp->group_rank;
+    nrank   = cl_team->sbgps[UCC_HIER_SBGP_NODE].sbgp->group_rank;
+    is_root = nrank == node_root;
+
+    c64 = UCC_COLL_ARGS_COUNT64(&coll_args->args);
+    d64 = UCC_COLL_ARGS_DISPL64(&coll_args->args);
+
+    /* Every count/displacement array below is accessed as uint32_t */
+    if (c64 || d64) {
+        cl_debug(team->context->lib,
+                 "64 bit count/displ mode is not supported\n");
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+
+    cl_schedule = ucc_cl_hier_get_schedule(cl_team);
+    if (ucc_unlikely(!cl_schedule)) {
+        return UCC_ERR_NO_MEMORY;
+    }
+    /* The error path and finalize test scratch, so it must never be stale */
+    cl_schedule->scratch = NULL;
+    schedule             = &cl_schedule->super.super;
+    memcpy(&args, coll_args, sizeof(args));
+
+    UCC_CHECK_GOTO(ucc_schedule_init(schedule, &args, team), out, status);
+
+    // Question: What is this ?
+    if (n_frags > 1) {
+        args.max_frag_count =
+            ucc_buffer_block_count(args.args.src.info.count, n_frags, 0);
+        args.mask |= UCC_BASE_CARGS_MAX_FRAG_COUNT;
+    }
+
+    full_size    = cl_team->sbgps[UCC_HIER_SBGP_FULL].sbgp->group_size;
+    node_size    = cl_team->sbgps[UCC_HIER_SBGP_NODE].sbgp->group_size;
+    leaders_size = cl_team->sbgps[UCC_HIER_SBGP_NODE_LEADERS].sbgp->group_size;
+    elem_size    = sizeof(uint32_t);
+
+    // if (!cl_team->sbgps[UCC_HIER_SBGP_NODE].sbgp->preserves_order){
+    //     printf("Reordering node ranking is not supported");
+    //     return UCC_ERR_NOT_SUPPORTED;
+    // }
+
+    // Init buffers for collectives arguments
+    scratch_size = elem_size * (node_size * 2 + leaders_size * 2);
+    status       = ucc_mc_alloc(&cl_schedule->scratch, scratch_size,
+                                UCC_MEMORY_TYPE_HOST);
+    if (ucc_unlikely(UCC_OK != status)) {
+        cl_error(team->context->lib,
+                 "failed to allocate %zu bytes for full counts",
+                 scratch_size);
+        cl_schedule->scratch = NULL;
+        goto out;
+    }
+    gv_rc     = cl_schedule->scratch->addr;                   /* +node_size */
+    gv_displ  = PTR_OFFSET(gv_rc, node_size * elem_size);     /* +node_size */
+    agv_rc    = PTR_OFFSET(gv_displ, node_size * elem_size);  /* +leaders_size */
+    agv_displ = PTR_OFFSET(agv_rc, leaders_size * elem_size); /* +leaders_size */
+
+    // Gatherv in the node
+    // src.info.buffer -> dst.info_v.buffer
+    if (node_size > 1) {
+        do { // This section need to be rewritten to support both uint32_t and uint64_t once the logic is approved
+            int      _i;
+            uint32_t _scount, _displ;
+
+            _displ = 0;
+
+            /* For every rank in the node, add his count as is and the displacement
+             * to be the running sum of the counts
+             */
+            for (_i = 0; is_root && _i < node_size; _i++) {
+                ucc_rank_t r = ucc_ep_map_eval(
+                    (*cl_team->sbgps[UCC_HIER_SBGP_NODE].sbgp).map, _i);
+                _scount = ((uint32_t *)coll_args->args.dst.info_v.counts)[r];
+                ((uint32_t *)gv_rc)[_i]    = _scount;
+                ((uint32_t *)gv_displ)[_i] = _displ;
+
+                _displ += _scount;
+            }
+        } while (0);
+
+        args.args.coll_type                = UCC_COLL_TYPE_GATHERV;
+        args.args.root                     = node_root;
+        args.args.dst.info_v.counts        = (ucc_count_t *)gv_rc;
+        args.args.dst.info_v.displacements = (ucc_aint_t *)gv_displ;
+
+        UCC_CHECK_GOTO(ucc_coll_init(SCORE_MAP(cl_team, NODE), &args,
+                                     &tasks[n_tasks]),
+                       out, status);
+        UCC_CHECK_GOTO(ucc_event_manager_subscribe(
+                           &schedule->super, UCC_EVENT_SCHEDULE_STARTED,
+                           tasks[n_tasks], ucc_task_start_handler),
+                       out, status);
+        UCC_CHECK_GOTO(ucc_schedule_add_task(schedule, tasks[n_tasks]), out,
+                       status);
+
+        // The result of this collective is in dst.info_v.buffer, this should be the source buffer of the next collective
+        args.args.src.info.buffer = args.args.dst.info_v.buffer;
+        n_tasks++;
+    }
+
+    // Allgatherv in the full net
+    // src.info.buffer -> dst.info_v.buffer
+    if (is_root) {
+        do { // This section need to be rewritten to support both uint32_t and uint64_t once the logic is approved
+            int _i;
+            for (_i = 0; _i < leaders_size; _i++) {
+                ((uint32_t *)agv_rc)[_i] = 0;
+            }
+
+            int _hid, _count, _displ;
+            /* For every rank in the communicator,
+             * - find his host id (which is also the index of the leader -- Sergey ?)
+             * - Add his count to the leader count
+             */
+            for (_i = 0; _i < full_size; _i++) {
+                _hid   = ucc_team_rank_host_id(_i, coll_args->team);
+                _count = ((uint32_t *)coll_args->args.dst.info_v.counts)[_i];
+                // Add the count of this rank to the count of the leader
+                ((uint32_t *)agv_rc)[_hid] += _count;
+            }
+            /*
+              For every leader, add the sum of the previous counts to his displacements
+            */
+            _displ = 0;
+            for (_i = 0; _i < leaders_size; _i++) {
+                ((uint32_t *)agv_displ)[_i] = _displ;
+                _displ += ((uint32_t *)agv_rc)[_i];
+            }
+        } while (0);
+
+        args.args.coll_type                = UCC_COLL_TYPE_ALLGATHERV;
+        args.args.dst.info_v.counts        = (ucc_count_t *)agv_rc;
+        args.args.dst.info_v.displacements = (ucc_aint_t *)agv_displ;
+
+        UCC_CHECK_GOTO(ucc_coll_init(SCORE_MAP(cl_team, NODE_LEADERS), &args,
+                                     &tasks[n_tasks]),
+                       out, status);
+
+        if (n_tasks > 0) {
+            /* Must not start before the node gatherv has completed */
+            UCC_CHECK_GOTO(ucc_event_manager_subscribe(
+                               tasks[n_tasks - 1], UCC_EVENT_COMPLETED,
+                               tasks[n_tasks], ucc_task_start_handler),
+                           out, status);
+        } else {
+            UCC_CHECK_GOTO(ucc_event_manager_subscribe(
+                               &schedule->super, UCC_EVENT_SCHEDULE_STARTED,
+                               tasks[n_tasks], ucc_task_start_handler),
+                           out, status);
+        }
+        UCC_CHECK_GOTO(ucc_schedule_add_task(schedule, tasks[n_tasks]), out,
+                       status);
+
+        // The result of this collective is in dst.info_v.buffer, this should be the source buffer of the next collective
+        args.args.src.info.buffer = args.args.dst.info_v.buffer;
+        n_tasks++;
+    }
+
+    // BCAST in the node
+    // src.info.buffer -> src.info.buffer
+    if (node_size > 1) {
+        args.args.coll_type      = UCC_COLL_TYPE_BCAST;
+        args.args.root           = node_root;
+        args.args.src.info.count = 0;
+
+        do { // This section need to be rewritten to support both uint32_t and uint64_t once the logic is approved
+            int _i;
+            for (_i = 0; _i < full_size; _i++) {
+                args.args.src.info.count +=
+                    ((uint32_t *)coll_args->args.dst.info_v.counts)[_i];
+            }
+        } while (0);
+
+        UCC_CHECK_GOTO(
+            ucc_coll_init(SCORE_MAP(cl_team, NODE), &args, &tasks[n_tasks]),
+            out, status);
+        UCC_CHECK_GOTO(ucc_schedule_add_task(schedule, tasks[n_tasks]), out,
+                       status);
+
+        if (n_tasks > 0) {
+            /* Must not start before the previous stage has completed */
+            UCC_CHECK_GOTO(ucc_event_manager_subscribe(
+                               tasks[n_tasks - 1], UCC_EVENT_COMPLETED,
+                               tasks[n_tasks], ucc_task_start_handler),
+                           out, status);
+        } else {
+            UCC_CHECK_GOTO(ucc_event_manager_subscribe(
+                               &schedule->super, UCC_EVENT_SCHEDULE_STARTED,
+                               tasks[n_tasks], ucc_task_start_handler),
+                           out, status);
+        }
+
+        // This collective writes to src.info.buffer, this should be the output buffer (dst.info_v.buffer)
+        args.args.dst.info_v.buffer = args.args.src.info.buffer;
+        n_tasks++;
+    }
+
+    schedule->super.post     = ucc_cl_hier_allgatherv_start;
+    schedule->super.finalize = ucc_cl_hier_allgatherv_finalize;
+    //schedule->super.triggered_post_setup = ucc_cl_hier_allgatherv_triggered_post_setup;
+    *sched_p = schedule;
+
+    return UCC_OK;
+
+out:
+    for (i = 0; i < n_tasks; i++) {
+        tasks[i]->finalize(tasks[i]);
+    }
+    /* scratch is NULL when the failure happened before/at its allocation */
+    if (cl_schedule->scratch) {
+        ucc_mc_free(cl_schedule->scratch);
+        cl_schedule->scratch = NULL;
+    }
+    ucc_cl_hier_put_schedule(schedule);
+    return status;
+}
+
+/*
+ * Fragment-init callback handed to ucc_schedule_pipelined_init(): builds one
+ * node_split schedule, using the task count of the pipelined schedule as the
+ * number of fragments.
+ */
+static ucc_status_t ucc_cl_hier_allgatherv_node_split_frag_init(
+    ucc_base_coll_args_t *coll_args, ucc_schedule_pipelined_t *sp,
+    ucc_base_team_t *team, ucc_schedule_t **frag_p)
+{
+    int n_frags = sp->super.n_tasks;
+
+    return ucc_cl_hier_allgatherv_node_split_init_schedule(coll_args, team,
+                                                           frag_p, n_frags);
+}
+
+/*
+ * Fragment-setup callback handed to ucc_schedule_pipelined_init(): points
+ * every task of fragment frag_num at its block of the source buffer by
+ * rewriting src.info.count and src.info.buffer. Always returns UCC_OK.
+ */
+static ucc_status_t ucc_cl_hier_allgatherv_node_split_frag_setup(
+    ucc_schedule_pipelined_t *schedule_p, ucc_schedule_t *frag, int frag_num)
+{
+    ucc_coll_args_t *args    = &schedule_p->super.super.bargs.args;
+    size_t           dt_size = ucc_dt_size(args->src.info.datatype);
+    int              n_frags = schedule_p->super.n_tasks;
+    size_t           frag_count, frag_offset;
+    ucc_coll_task_t *task;
+    int              i;
+
+    frag_count =
+        ucc_buffer_block_count(args->src.info.count, n_frags, frag_num);
+    frag_offset =
+        ucc_buffer_block_offset(args->src.info.count, n_frags, frag_num);
+
+    for (i = 0; i < frag->n_tasks; i++) {
+        task                             = frag->tasks[i];
+        task->bargs.args.src.info.count  = frag_count;
+        task->bargs.args.src.info.buffer =
+            PTR_OFFSET(args->src.info.buffer, frag_offset * dt_size);
+    }
+    return UCC_OK;
+}
+
+/*
+ * Entry point registered in the CL/HIER score table for allgatherv.
+ * Persistent collectives are rejected with UCC_ERR_NOT_SUPPORTED.
+ * Pipelining is disabled for now (n_frags is fixed to 1), so the pipelined
+ * branch below is currently never taken.
+ */
+UCC_CL_HIER_PROFILE_FUNC(ucc_status_t, ucc_cl_hier_allgatherv_init,
+                         (coll_args, team, task),
+                         ucc_base_coll_args_t *coll_args, ucc_base_team_t *team,
+                         ucc_coll_task_t **task)
+{
+    ucc_cl_hier_team_t       *cl_team = ucc_derived_of(team, ucc_cl_hier_team_t);
+    ucc_cl_hier_lib_config_t *cfg     = &UCC_CL_HIER_TEAM_LIB(cl_team)->cfg;
+    ucc_cl_hier_schedule_t   *schedule;
+    int                       n_frags, pipeline_depth;
+    ucc_status_t              status;
+
+    if (UCC_IS_PERSISTENT(coll_args->args)) {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+    n_frags        = 1;
+    pipeline_depth = 1;
+
+    // Question: How to use this ?
+    // ucc_pipeline_nfrags_pdepth(&cfg->allgatherv_node_split_pipeline,
+    //                            coll_args->args.src.info.count *
+    //                            ucc_dt_size(coll_args->args.src.info.datatype),
+    //                            &n_frags, &pipeline_depth);
+
+    if (n_frags == 1) {
+        return ucc_cl_hier_allgatherv_node_split_init_schedule(
+            coll_args, team, (ucc_schedule_t **)task, n_frags);
+    }
+
+    schedule = ucc_cl_hier_get_schedule(cl_team);
+    if (ucc_unlikely(!schedule)) {
+        return UCC_ERR_NO_MEMORY;
+    }
+
+    status = ucc_schedule_pipelined_init(
+        coll_args, team, ucc_cl_hier_allgatherv_node_split_frag_init,
+        ucc_cl_hier_allgatherv_node_split_frag_setup, pipeline_depth, n_frags,
+        cfg->allgatherv_node_split_pipeline.order, &schedule->super);
+
+    if (ucc_unlikely(status != UCC_OK)) {
+        cl_error(team->context->lib,
+                 "failed to init pipelined node split allgatherv schedule");
+        goto err_pipe_init;
+    }
+
+    schedule->super.super.super.post     = ucc_cl_hier_allgatherv_start;
+    schedule->super.super.super.finalize = ucc_cl_hier_allgatherv_finalize;
+    *task                                = &schedule->super.super.super;
+    return UCC_OK;
+
+err_pipe_init:
+    ucc_cl_hier_put_schedule(&schedule->super.super);
+    return status;
+}
diff --git a/src/components/cl/hier/allgatherv/allgatherv.h b/src/components/cl/hier/allgatherv/allgatherv.h
new file mode 100644
index 0000000000..dd3d5737aa
--- /dev/null
+++ b/src/components/cl/hier/allgatherv/allgatherv.h
@@ -0,0 +1,41 @@
+/**
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef ALLGATHERV_H_
+#define ALLGATHERV_H_
+#include "../cl_hier.h"
+
+/* Allgatherv algorithms of CL/HIER; _LAST doubles as the "not found" index */
+enum
+{
+    UCC_CL_HIER_ALLGATHERV_ALG_NODE_SPLIT,
+    UCC_CL_HIER_ALLGATHERV_ALG_LAST,
+};
+
+extern ucc_base_coll_alg_info_t
+    ucc_cl_hier_allgatherv_algs[UCC_CL_HIER_ALLGATHERV_ALG_LAST + 1];
+
+/* Initializes an allgatherv collective task on a CL/HIER team. */
+ucc_status_t ucc_cl_hier_allgatherv_init(ucc_base_coll_args_t *coll_args,
+                                         ucc_base_team_t      *team,
+                                         ucc_coll_task_t     **task);
+
+/*
+ * Case-insensitive lookup of an algorithm by name. Returns its index in
+ * ucc_cl_hier_allgatherv_algs, or UCC_CL_HIER_ALLGATHERV_ALG_LAST if no
+ * algorithm is called str.
+ */
+static inline int ucc_cl_hier_allgatherv_alg_from_str(const char *str)
+{
+    int i;
+    for (i = 0; i < UCC_CL_HIER_ALLGATHERV_ALG_LAST; i++) {
+        if (0 == strcasecmp(str, ucc_cl_hier_allgatherv_algs[i].name)) {
+            break;
+        }
+    }
+    return i;
+}
+
+#endif
diff --git a/src/components/cl/hier/cl_hier.c b/src/components/cl/hier/cl_hier.c
index edbb469d78..1e67522c6d 100644
--- a/src/components/cl/hier/cl_hier.c
+++ b/src/components/cl/hier/cl_hier.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
@@ -117,4 +117,6 @@ __attribute__((constructor)) static void cl_hier_iface_init(void)
         ucc_cl_hier_alltoallv_algs;
     ucc_cl_hier.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_BCAST)] =
         ucc_cl_hier_bcast_algs;
+    ucc_cl_hier.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_ALLGATHERV)] =
+        ucc_cl_hier_allgatherv_algs;
 }
diff --git a/src/components/cl/hier/cl_hier.h b/src/components/cl/hier/cl_hier.h
index ef40f33118..b18bb23b26 100644
--- a/src/components/cl/hier/cl_hier.h
+++ b/src/components/cl/hier/cl_hier.h
@@ -54,6 +54,7 @@ typedef struct ucc_cl_hier_lib_config {
     ucc_pipeline_params_t allreduce_rab_pipeline;
     ucc_pipeline_params_t bcast_2step_pipeline;
     ucc_pipeline_params_t reduce_2step_pipeline;
+    ucc_pipeline_params_t allgatherv_node_split_pipeline;
 } ucc_cl_hier_lib_config_t;
 
 typedef struct ucc_cl_hier_context_config {
@@ -115,6 +116,7 @@ UCC_CLASS_DECLARE(ucc_cl_hier_team_t, ucc_base_context_t *,
                                    UCC_COLL_TYPE_ALLTOALLV |  \
                                    UCC_COLL_TYPE_ALLREDUCE |  \
                                    UCC_COLL_TYPE_BARRIER |    \
+                                   UCC_COLL_TYPE_ALLGATHERV | \
                                    UCC_COLL_TYPE_BCAST)
 
 ucc_status_t ucc_cl_hier_coll_init(ucc_base_coll_args_t *coll_args,
diff --git a/src/components/cl/hier/cl_hier_coll.h b/src/components/cl/hier/cl_hier_coll.h
index 5a1e294afe..33c5e3377b 100644
--- a/src/components/cl/hier/cl_hier_coll.h
+++ b/src/components/cl/hier/cl_hier_coll.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
@@ -15,6 +15,7 @@
 #include "barrier/barrier.h"
 #include "bcast/bcast.h"
 #include "reduce/reduce.h"
+#include "allgatherv/allgatherv.h"
 
 #define UCC_CL_HIER_N_DEFAULT_ALG_SELECT_STR 3
 
diff --git a/src/components/cl/hier/cl_hier_team.c b/src/components/cl/hier/cl_hier_team.c
index 32ef7e2f93..dd671e14f5 100644
--- a/src/components/cl/hier/cl_hier_team.c
+++ b/src/components/cl/hier/cl_hier_team.c
@@ -402,6 +402,15 @@ ucc_status_t ucc_cl_hier_team_get_scores(ucc_base_team_t *cl_team,
 
     }
 
+    status = ucc_coll_score_add_range(
+        score, UCC_COLL_TYPE_ALLGATHERV, UCC_MEMORY_TYPE_HOST,
+        0, UCC_MSG_MAX, UCC_CL_HIER_DEFAULT_SCORE,
+        ucc_cl_hier_allgatherv_init, cl_team);
+    if (UCC_OK != status) {
+        cl_error(lib, "failed to add range to score_t");
+        return status;
+    }
+
     for (i = 0; i < UCC_CL_HIER_N_DEFAULT_ALG_SELECT_STR; i++) {
         status = ucc_coll_score_update_from_str(
             ucc_cl_hier_default_alg_select_str[i], &team_info,
diff --git a/src/components/topo/ucc_sbgp.c b/src/components/topo/ucc_sbgp.c
index e0264ee9e2..07462aedc9 100644
--- a/src/components/topo/ucc_sbgp.c
+++ b/src/components/topo/ucc_sbgp.c
@@ -141,6 +141,7 @@ static inline ucc_status_t sbgp_create_node(ucc_topo_t *topo, ucc_sbgp_t *sbgp)
         ucc_free(local_ranks);
         return UCC_ERR_NO_MESSAGE;
     }
+    sbgp->preserves_order = local_ranks[0] == ctx_nlr;
     sbgp->group_size = node_size;
     sbgp->group_rank = node_rank;
     sbgp->rank_map   = local_ranks;
diff --git a/src/components/topo/ucc_sbgp.h b/src/components/topo/ucc_sbgp.h
index 63697fd02f..cad8014d89 100644
--- a/src/components/topo/ucc_sbgp.h
+++ b/src/components/topo/ucc_sbgp.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * See file LICENSE for terms.
  */
 #ifndef UCC_SBGP_H_
@@ -55,6 +55,7 @@ typedef struct ucc_sbgp_t {
     ucc_rank_t       group_rank;
     ucc_rank_t      *rank_map;
     ucc_ep_map_t     map;
+    int              preserves_order; // True if the order of the ranks of the subgroup is the same as in parent group
 } ucc_sbgp_t;
 
 const char* ucc_sbgp_str(ucc_sbgp_type_t type);