diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 91cb926b4..407e0b817 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -22,17 +22,16 @@ usage() {
     echo "  --evaluate           Whether to test evaluation rather than training."
     echo "  -s, --steps          Number of steps to run, defaults to 500."
     echo "  --multiprocess       Enable the multiprocess GPU mode."
+    echo "  --ici                ICI mesh shape."
+    echo "  --dcn                DCN mesh shape."
+    echo "  --enable-pipeline-parallel  Whether to use pipeline parallelism."
     echo "  -o, --output NAME    Name for the output folder, a temporary folder will be created if none specified."
-    echo "  --data-parallel      Data parallelism to use. Defaults to 1."
-    echo "  --fsdp               Fully-sharded data parallelism to use. Defaults to 1."
-    echo "  --tensor-parallel    Tensor parallelism to use. Defaults to 1."
-    echo "  --pipeline-parallel  Pipeline parallelism to use. Defaults to 1 for no pipelining."
     echo "  -n, --nodes          Number of nodes."
     echo "  -h, --help           Print usage."
     exit $1
 }
 
-args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,evaluate,steps:,help,multiprocess,output:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
+args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,enable-pipeline-parallel,model-type:,evaluate,steps:,help,multiprocess,output:,ici:,dcn:,nodes: -- "$@")
 if [[ $? -ne 0 ]]; then
     exit $1
 fi
@@ -44,14 +43,13 @@ OUTPUT=$(mktemp -d)
 BATCH_PER_GPU=4
 DTYPE="bfloat16"
 STEPS=500
-DP=1
-FSDP=1
-TP=1
-PP=1
+ICI="[1,1,1]"
+DCN="[1,1,1]"
 NODES=1
 ENABLE_TE=0
 MODEL_TYPE=126M
 NVTE_FUSED_ATTN=0
+ENABLE_PP=0
 DROPOUT=0
 EVALUATE=0
 ADDITIONAL_ARGS=""
@@ -83,6 +81,10 @@ while [ : ]; do
             NVTE_FUSED_ATTN=1
             shift 1
             ;;
+        --enable-pipeline-parallel)
+            ENABLE_PP=1
+            shift 1
+            ;;
         --model-type)
             MODEL_TYPE=$2
             shift 2
             ;;
@@ -103,20 +105,12 @@ while [ : ]; do
             OUTPUT=$2
             shift 2
             ;;
-        --data-parallel)
-            DP="$2"
+        --ici)
+            ICI=$2
             shift 2
             ;;
-        --fsdp)
-            FSDP="$2"
-            shift 2
-            ;;
-        --tensor-parallel)
-            TP="$2"
-            shift 2
-            ;;
-        --pipeline-parallel)
-            PP="$2"
+        --dcn)
+            DCN=$2
             shift 2
             ;;
         -n | --nodes)
@@ -152,10 +146,8 @@ print_var ENABLE_TE
 print_var NVTE_FUSED_ATTN
 print_var EVALUATE
 print_var DROPOUT
-print_var DP
-print_var FSDP
-print_var TP
-print_var PP
+print_var ICI
+print_var DCN
 
 PAXML_DIR=$(dirname `python -c 'import paxml; print(*paxml.__path__)'`)
 pushd ${PAXML_DIR}
@@ -163,6 +155,7 @@ pushd ${PAXML_DIR}
 ## Create configs file
 cat > ci_configs.py <<EOF
[...]
-if dcn_factor > 1:
-    if dp % dcn_factor == 0:
-        dcn_dp = dcn_factor
-        dp = int(dp / dcn_factor)
-    elif fsdp % dcn_factor == 0:
-        dcn_fsdp = dcn_factor
-        fsdp = int(fsdp / dcn_factor)
-    elif pp % dcn_factor == 0:
-        dcn_pp = dcn_factor
-        pp = int(pp / dcn_factor)
+assert num_gpus == np.prod(${ICI}) * np.prod(${DCN}), f'product of parallel strategies should equal number of available gpus. Have {num_gpus} gpus, but product of parallel strategies is {np.prod(${ICI}) * np.prod(${DCN})}'
 
 WeightInit = base_layer.WeightInit
@@ -301,11 +272,6 @@ class LLaMA70BSyntheticSmall(BaseLLaMA, SyntheticDataset):
     USE_MQA = True
     NUM_KV_HEADS = 8
 
-    PERCORE_BATCH_SIZE = 4
-
-    ICI_MESH_SHAPE = [1, 8, 1]
-    DCN_MESH_SHAPE = [1, 1, 1]
-
     def task(self):
         task_p = super().task()
         task_p.train.always_use_train_for_model_init=False
@@ -313,15 +279,12 @@ class LLaMA70BSyntheticSmall(BaseLLaMA, SyntheticDataset):
         return task_p
 
 
-if pp > 1:
+if pp == 1:
     @experiment_registry.register
     class Synthetic126MCI(GPT126MPP, SyntheticDataset):
-        ICI_MESH_SHAPE = [pp, dp, fsdp, tp]
-        DCN_MESH_SHAPE = [dcn_pp, dcn_dp, dcn_fsdp, 1]
         MICROBATCH_SIZE = 2
-        NUM_STAGES = pp
-        PERCORE_BATCH_SIZE = percore_batch_size
+        NUM_STAGES = ${ICI}[0]
         FRPOP_DTYPE = dtype
 
         def task(self):
@@ -334,9 +297,6 @@ else:
 
     @experiment_registry.register
     class Synthetic126MCI(Synthetic126M):
-        ICI_MESH_SHAPE = [dp, fsdp, tp]
-        DCN_MESH_SHAPE = [dcn_dp, dcn_fsdp, 1]
-        PERCORE_BATCH_SIZE = percore_batch_size
         FRPOP_DTYPE = dtype
 
         DROPOUT_PROB = dropout
@@ -378,48 +338,42 @@ if [[ ${MODEL_TYPE} == "126M" ]]; then
     CONFIG=ci_configs.Synthetic126MCI
 elif [[ ${MODEL_TYPE} == "5B" ]]; then
     CONFIG=paxml.contrib.gpu.scripts_gpu.configs.Synthetic5B
-    ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1] --fdl.ICI_MESH_SHAPE=[${DP},${FSDP},${TP}] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU}"
 elif [[ ${MODEL_TYPE} == "LLaMA70BProxy" ]]; then
     CONFIG=ci_configs.LLaMA70BSyntheticSmall
-    ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1] --fdl.ICI_MESH_SHAPE=[${DP},${FSDP},${TP}] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU}"
+## hard-code the Grok proxy to a single layer
+elif [[ ${MODEL_TYPE} == "GrokProxy" ]]; then
+    CONFIG=paxml.tasks.lm.params.nvidia.Grok_Proxy
+    ADDITIONAL_ARGS+=" --fdl.NUM_LAYERS=1"
 else
     echo "Unsupported model ${MODEL_TYPE}"
     exit 1
 fi
 
+CMD_LINE_FLAGS="--fdl_config=${CONFIG} \
+    --job_log_dir=${OUTPUT} \
+    --alsologtostderr \
+    --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} \
+    --fdl.ICI_MESH_SHAPE=${ICI} \
+    --fdl.DCN_MESH_SHAPE=${DCN} \
+    $ADDITIONAL_ARGS"
+if [[ $MULTIPROCESS != 0 ]]; then
+    CMD_LINE_FLAGS+=" --multiprocess_gpu"
+fi
+
 if [[ ${EVALUATE} -ne 0 ]]; then
     trap "rm -rf ${OUTPUT}/checkpoints" ERR INT HUP TERM EXIT
 
     ## train for 0 steps to generate an initial checkpoint
-    python -m paxml.main \
-        --fdl_config=${CONFIG} \
-        --fdl.MAX_STEPS=0 \
-        --job_log_dir=${OUTPUT} \
-        --alsologtostderr \
-        $ADDITIONAL_ARGS \
-        $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)
+    python -m paxml.main ${CMD_LINE_FLAGS} --fdl.MAX_STEPS=0
 
     ## restore from initial checkpoint for eval
-    python -m paxml.main \
-        --fdl_config=${CONFIG} \
-        --job_log_dir=${OUTPUT} \
-        --mode='eval' \
-        --fdl.MAX_STEPS=0 \
-        --alsologtostderr \
-        --enable_checkpoint_saving=False \
-        $ADDITIONAL_ARGS \
-        $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)
+    python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False --fdl.MAX_STEPS=0 --mode='eval'
 else
-    python -m paxml.main \
-        --fdl_config=${CONFIG} \
-        --job_log_dir=${OUTPUT} \
-        --alsologtostderr \
-        --fdl.MAX_STEPS=${STEPS} \
-        --enable_checkpoint_saving=False \
-        $ADDITIONAL_ARGS \
-        $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)
+    python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False --fdl.MAX_STEPS=${STEPS}
 fi
 
 set +x
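Note: with this change, test-pax.sh expresses parallelism as two mesh shapes
(--ici within a node, --dcn across nodes) whose products must cover every
GPU, instead of four per-axis flags. A minimal sketch of a direct invocation
under the new interface; the values are borrowed from the 2DP1FSDP2TP4PP
matrix entry in the workflows below, and the invocation itself is
illustrative, not part of this diff:

    # ICI [2,2,1,2] x DCN [2,1,1,1]: 4 pipeline stages (2 ici x 2 dcn),
    # 2-way DP, 2-way TP; prod([2,2,1,2]) * prod([2,1,1,1]) = 16 GPUs,
    # i.e. 2 nodes of 8 GPUs.
    bash .github/container/test-pax.sh \
        --batch-per-gpu 4 \
        --steps 300 \
        --ici "[2,2,1,2]" \
        --dcn "[2,1,1,1]" \
        --enable-pipeline-parallel \
        --nodes 2 \
        --multiprocess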
diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 7b80bfa60..fef938315 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -33,9 +33,13 @@ jobs:
   single-process-multi-device-te:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-          - [1, 8, 1, 1]
-          - [1, 1, 2, 4]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP_single_process_TE
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
+          - TEST_NAME: 1DP2FSDP4TP1PP_single_process_TE
+            ICI: "[1,2,4]"
+            DCN: "[1,1,1]"
       fail-fast: false
     runs-on: ubuntu-22.04
@@ -70,7 +74,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process_TE
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
           MAX_GPUS_PER_NODE=8
           GPUS_PER_NODE=8
           NODES=1
@@ -118,10 +122,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-te
           EOF
@@ -226,36 +228,52 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 1DP1FSDP1TP1PP_TE
-            PARALLEL_CONFIG: [1, 1, 1, 1]
+            ICI: "[1,1,1]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 4
+            TASKS: 1
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 8DP1FSDP1TP1PP_TE
-            PARALLEL_CONFIG: [1, 8, 1, 1]
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
             ADDITIONAL_ARGS: ""
             BATCH_SIZE: 4
+            TASKS: 8
           - TEST_NAME: 1DP8FSDP1TP1PP_TE
-            PARALLEL_CONFIG: [1, 1, 8, 1]
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 4
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP2TP1PP_TE
-            PARALLEL_CONFIG: [1, 4, 1, 2]
+            ICI: "[4,1,2]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 4
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 16DP1FSDP1TP1PP_TE
-            PARALLEL_CONFIG: [1, 16, 1, 1]
+            ICI: "[8,1,1]"
+            DCN: "[2,1,1]"
             BATCH_SIZE: 4
+            TASKS: 16
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 5B_fused_attn_1
-            PARALLEL_CONFIG: [1, 1, 8, 1]
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 2
+            TASKS: 8
             ADDITIONAL_ARGS: "--model-type 5B --enable-fused-attn"
           - TEST_NAME: 5B_fused_attn_0
-            PARALLEL_CONFIG: [1, 1, 8, 1]
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 2
+            TASKS: 8
             ADDITIONAL_ARGS: "--model-type 5B"
           - TEST_NAME: LLaMA_eval_TE
-            PARALLEL_CONFIG: [1, 1, 8, 1]
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 4
+            TASKS: 8
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
       fail-fast: false
@@ -274,7 +292,6 @@ jobs:
       - name: Check out the repository under ${GITHUB_WORKSPACE}
         uses: actions/checkout@v4
 
-
       - name: Setup SSH known hosts
         id: ssh-known-hosts
         run: |
@@ -292,7 +309,7 @@ jobs:
           cd $GITHUB_WORKSPACE
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
           TEST_CASE_NAME=${{ matrix.TEST_NAME }}
-          TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }}))
+          TOTAL_TASKS=${{ matrix.TASKS }}
           MAX_GPUS_PER_NODE=8
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
           GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
@@ -340,10 +357,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu ${{ matrix.BATCH_SIZE }} \
               --steps 300 \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-te \
               --additional-args "--fdl.PACKED_INPUT=False" \
@@ -451,11 +466,27 @@ jobs:
   rosetta-pax-multi-node:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-          - [1, 8, 1, 1]
-          - [1, 4, 1, 2]
-          - [4, 2, 1, 1]
-          - [4, 2, 1, 2]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
+            TASKS: 8
+            ADDITIONAL_ARGS: ""
+          - TEST_NAME: 4DP1FSDP2TP1PP
+            ICI: "[4,1,2]"
+            DCN: "[1,1,1]"
+            TASKS: 8
+            ADDITIONAL_ARGS: ""
+          - TEST_NAME: 2DP1FSDP1TP4PP
+            ICI: "[4,2,1,1]"
+            DCN: "[1,1,1,1]"
+            TASKS: 8
+            ADDITIONAL_ARGS: "--enable-pipeline-parallel"
+          - TEST_NAME: 2DP1FSDP2TP4PP
+            ICI: "[2,2,1,2]"
+            DCN: "[2,1,1,1]"
+            TASKS: 16
+            ADDITIONAL_ARGS: "--enable-pipeline-parallel"
       fail-fast: false
     runs-on: ubuntu-22.04
@@ -488,8 +519,8 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP
-          TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }}))
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
+          TOTAL_TASKS=${{ matrix.TASKS }}
           MAX_GPUS_PER_NODE=8
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
           GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
@@ -538,11 +569,10 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 300 \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
+              ${{ matrix.ADDITIONAL_ARGS }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess)
           EOF
           )
@@ -646,8 +676,11 @@ jobs:
   rosetta-pax-single-node-dropout-te:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-          - [1, 8, 1, 1]
+        include:
+          - TEST_NAME: 8DP_TE_dropout
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
+            TASKS: 8
       fail-fast: false
     runs-on: ubuntu-22.04
@@ -681,8 +714,8 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP_TE_dropout
-          TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }}))
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
+          TOTAL_TASKS=${{ matrix.TASKS }}
           MAX_GPUS_PER_NODE=8
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
           GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
@@ -730,10 +763,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 300 \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-dropout \
               --enable-te \
@@ -840,8 +871,10 @@ jobs:
   single-process-evaluation-te:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-          - [1, 8, 1, 1]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP_eval_TE
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
       fail-fast: false
     runs-on: ubuntu-22.04
@@ -874,7 +907,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_eval_TE
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
           MAX_GPUS_PER_NODE=8
           GPUS_PER_NODE=8
@@ -923,10 +956,8 @@ jobs:
               --batch-per-gpu 4 \
               --steps 500 \
               --evaluate \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes 1 \
               --enable-te
           EOF
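Note: both workflow files now read the process count straight from the matrix
(TASKS) instead of multiplying PARALLEL_CONFIG entries, and the run steps
derive the node layout from it. A sketch of that arithmetic for a 16-task
entry; the two formulas are the ones used in the run steps, only the echo is
illustrative:

    TOTAL_TASKS=16        # from matrix.TASKS
    MAX_GPUS_PER_NODE=8
    NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))  # ceil(16/8) = 2
    GPUS_PER_NODE=$((TOTAL_TASKS/NODES))                            # 16/2 = 8
    echo "${NODES} nodes x ${GPUS_PER_NODE} GPUs per node"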
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index c07749b12..7e0f0e1c4 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -33,9 +33,13 @@ jobs:
   single-process-multi-device:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-          - [1, 8, 1, 1]
-          - [1, 1, 2, 4]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP_single_process
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
+          - TEST_NAME: 1DP2FSDP4TP1PP_single_process
+            ICI: "[1,2,4]"
+            DCN: "[1,1,1]"
       fail-fast: false
     runs-on: ubuntu-22.04
@@ -67,7 +71,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
           MAX_GPUS_PER_NODE=8
           NODES=1
           GPUS_PER_NODE=8
@@ -114,10 +118,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }}
           EOF
           )
@@ -179,37 +181,52 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 1DP1FSDP1TP1PP
-            PARALLEL_CONFIG: [1, 1, 1, 1]
-            BATCH_SIZE: 4
+            ICI: "[1,1,1]"
+            DCN: "[1,1,1]"
+            TASKS: 1
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 8DP1FSDP1TP1PP
-            PARALLEL_CONFIG: [1, 8, 1, 1]
-            BATCH_SIZE: 4
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 1DP8FSDP1TP1PP
-            PARALLEL_CONFIG: [1, 1, 8, 1]
-            BATCH_SIZE: 4
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 2DP1FSDP1TP4PP
-            PARALLEL_CONFIG: [4, 2, 1, 1]
-            BATCH_SIZE: 4
-            ADDITIONAL_ARGS: ""
+            ICI: "[4,2,1,1]"
+            DCN: "[1,1,1,1]"
+            TASKS: 8
+            ADDITIONAL_ARGS: "--enable-pipeline-parallel"
           - TEST_NAME: 4DP1FSDP2TP1PP
-            PARALLEL_CONFIG: [1, 4, 1, 2]
-            BATCH_SIZE: 4
+            ICI: "[4,1,2]"
+            DCN: "[1,1,1]"
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 16DP1FSDP1TP1PP
-            PARALLEL_CONFIG: [1, 16, 1, 1]
-            BATCH_SIZE: 4
+            ICI: "[8,1,1]"
+            DCN: "[2,1,1]"
+            TASKS: 16
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 2DP1FSDP2TP4PP
-            PARALLEL_CONFIG: [4, 2, 1, 2]
-            BATCH_SIZE: 4
+            ICI: "[2,2,1,2]"
+            DCN: "[2,1,1,1]"
+            TASKS: 16
+            ADDITIONAL_ARGS: "--enable-pipeline-parallel"
           - TEST_NAME: LLaMA_eval
-            PARALLEL_CONFIG: [1, 1, 8, 1]
-            BATCH_SIZE: 4
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
+            TASKS: 8
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
+          - TEST_NAME: Grok
+            ICI: "[1,1,8,1]"
+            DCN: "[1,2,1,1]"
+            TASKS: 16
+            EVALUATE: true
+            ADDITIONAL_ARGS: "--model-type GrokProxy"
       fail-fast: false
     runs-on: ubuntu-22.04
@@ -242,7 +259,7 @@ jobs:
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
           TEST_CASE_NAME=${{ matrix.TEST_NAME }}
-          TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }}))
+          TOTAL_TASKS=${{ matrix.TASKS }}
           MAX_GPUS_PER_NODE=8
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
           GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
@@ -290,10 +307,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) \
               ${{ matrix.ADDITIONAL_ARGS }}
@@ -357,8 +372,10 @@ jobs:
   single-process-evaluation:
     strategy:
      matrix:
-        PARALLEL_CONFIG:
-          - [1, 8, 1, 1]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP_eval
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
       fail-fast: false
     runs-on: ubuntu-22.04
@@ -390,7 +407,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_eval
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
           TOTAL_TASKS=1
           NODES=1
           GPUS_PER_NODE=8
@@ -439,10 +456,8 @@ jobs:
               --batch-per-gpu 4 \
               --steps 500 \
               --evaluate \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess)
           EOF
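Note: every matrix entry has to keep prod(ICI) * prod(DCN) equal to TASKS, or
the assert added to ci_configs.py fires at start-up. A hedged one-liner for
checking an entry locally; it assumes numpy is installed (ci_configs.py
already relies on np.prod) and is not part of this diff:

    # values from the 2DP1FSDP2TP4PP entry: (2*2*1*2) * (2*1*1*1) = 16
    ICI="[2,2,1,2]"; DCN="[2,1,1,1]"; TASKS=16
    python -c "import numpy as np; total = np.prod(${ICI}) * np.prod(${DCN}); assert total == ${TASKS}, f'{total} != ${TASKS}'; print('ok:', total, 'GPUs')"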