From a9a828be646336861e364863a95e4b73bd7143f2 Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Wed, 15 May 2024 15:35:45 -0700 Subject: [PATCH 01/17] add fmha related changes --- .github/container/test-pax.sh | 54 ++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index d5b6c2d8c..1b7dff2be 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -23,6 +23,8 @@ usage() { echo " -s, --steps Number of steps to run, defaults to 500." echo " --multiprocess Enable the multiprocess GPU mode." echo " -o, --output NAME Name for the output folder, a temporary folder will be created if none specified." + echo " --save-hlo {0, 1} 1 to save the dumped hlo, 0 to remove the hlo dumped folder" + echo " --enable-fmha {0, 1} 1 to enable fmha testing, 0 to run test without fmha; default is 0" echo " --data-parallel Data parallelism to use. Defaults to 1." echo " --fsdp Fully-sharded data parallelism to use. Defaults to 1." echo " --tensor-parallel Tensor parallelism to use. Defaults to 1." @@ -32,7 +34,7 @@ usage() { exit $1 } -args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,evaluate,steps:,help,multiprocess,output:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@") +args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,enable-fmha:,evaluate,steps:,help,multiprocess,output:,save-hlo:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@") if [[ $? -ne 0 ]]; then exit $1 fi @@ -55,6 +57,8 @@ NVTE_FUSED_ATTN=0 DROPOUT=0 EVALUATE=0 ADDITIONAL_ARGS="" +ENABLE_FMHA=${ENABLE_FMHA:-0} +SAVE_HLO=${SAVE_HLO:-1} eval set -- "$args" while [ : ]; do @@ -75,6 +79,10 @@ while [ : ]; do ENABLE_TE=1 shift 1 ;; + --enable-fmha) + ENABLE_FMHA="$2" + shift 2 + ;; --enable-dropout) DROPOUT='0.1' shift 1 @@ -103,6 +111,10 @@ while [ : ]; do OUTPUT=$2 shift 2 ;; + --save-hlo) + SAVE_HLO="$2" + shift 2 + ;; --data-parallel) DP="$2" shift 2 @@ -136,6 +148,21 @@ while [ : ]; do esac done +# Set hlo dump folder after output folder is set. +HLO_DIR=${OUTPUT}/hlo +export BASE_XLA_FLAGS="${BASE_XLA_FLAGS:---xla_dump_hlo_as_text --xla_dump_to=${HLO_DIR}}" +export XLA_FLAGS="${BASE_XLA_FLAGS} ${XLA_FLAGS:-}" +echo "HLO will be dumped in ${HLO_DIR} dir." + +## Setting the env variables for FMHA +if [[ "$ENABLE_FMHA" -eq "1" ]]; then + echo "Setting XLA FMHA Flags"; + export BASE_XLA_FLAGS_FMHA="${BASE_XLA_FLAGS_FMHA:---xla_gpu_fused_attention_use_cudnn_rng=true --xla_gpu_enable_cudnn_fmha=true}" + export XLA_FLAGS="${BASE_XLA_FLAGS_FMHA} ${XLA_FLAGS:-}" +fi + +echo "XLA FLAGS: $XLA_FLAGS" + # # Set derived variables GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') @@ -149,8 +176,10 @@ print_var NGPUS print_var OUTPUT print_var MULTIPROCESS print_var ENABLE_TE +print_var ENABLE_FMHA print_var NVTE_FUSED_ATTN print_var EVALUATE +print_var SAVE_HLO print_var DROPOUT print_var DP print_var FSDP @@ -421,3 +450,26 @@ fi set +x echo "Output at ${OUTPUT}" + +if [[ "$ENABLE_FMHA" -eq "1" ]]; then + ## Check if fmha instructions are present in the HLO dumped file or not. + fmha_regex="fmha[-bmm]?[-scale]?[-bias]?[-mask]?[-softmax]?[-dropout]?[-bmm]?[-backward]?*" + result=$(grep -irlnE "$fmha_regex" "${HLO_DIR}/"*.txt) + + if [[ $SAVE_HLO -eq 0 ]]; then + rm -rf $HLO_DIR + echo "Removed dumped HLO directory!" + fi + + if [ -z "$result" ]; then + echo "E: No FMHA instructions were found in the hlo files!" + exit 1 + else + echo -e "Found FMHA instructions in the following HLO files: \n $result" + fi +else + if [[ $SAVE_HLO -eq 0 ]]; then + rm -rf $HLO_DIR + echo "Removed dumped HLO directory!" + fi +fi From 69f9db9f1ef8937ba2f3633b9c5ecd7e7146677d Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Wed, 15 May 2024 17:44:31 -0700 Subject: [PATCH 02/17] Update _test_upstream_pax.yaml --- .github/workflows/_test_upstream_pax.yaml | 33 ++++++++++++++++++----- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index 2ff593630..145dfd161 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -30,12 +30,22 @@ on: jobs: - single-process-multi-device: + pax-single-process-multi-device: strategy: matrix: - PARALLEL_CONFIG: - - [1, 8, 1, 1] - - [1, 1, 2, 4] + include: + - TEST_NAME: 8DP1FSDP1TP1PP + PARALLEL_CONFIG: [1, 8, 1, 1] + BATCH_SIZE: 4 + ADDITIONAL_ARGS: "" + - TEST_NAME: 8DP2FSDP4TP1PP + PARALLEL_CONFIG: [1, 1, 2, 4] + BATCH_SIZE: 4 + ADDITIONAL_ARGS: "" + - TEST_NAME: 8DP1FSDP1TP1PP_fmha + PARALLEL_CONFIG: [1, 8, 1, 1] + BATCH_SIZE: 4 + ADDITIONAL_ARGS: "--enable-fmha 1" fail-fast: false runs-on: ubuntu-22.04 @@ -67,7 +77,7 @@ jobs: shell: bash -x -e {0} run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process + TEST_CASE_NAME=${{ matrix.TEST_NAME }}_single_process MAX_GPUS_PER_NODE=8 NODES=1 GPUS_PER_NODE=8 @@ -112,13 +122,14 @@ jobs: test-pax.sh \ --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ --dtype bfloat16 \ - --batch-per-gpu 4 \ + --batch-per-gpu ${{ matrix.BATCH_SIZE }} \ --steps 500 \ --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \ --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \ --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \ --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \ - --nodes ${{ steps.meta.outputs.NODES }} + --nodes ${{ steps.meta.outputs.NODES }} \ + ${{ matrix.ADDITIONAL_ARGS }} EOF ) @@ -210,6 +221,14 @@ jobs: BATCH_SIZE: 4 EVALUATE: true ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate" + - TEST_NAME: 2DP1FSDP1TP4PP_fmha + PARALLEL_CONFIG: [4, 2, 1, 1] + BATCH_SIZE: 4 + ADDITIONAL_ARGS: "--enable-fmha 1" + - TEST_NAME: 16DP1FSDP1TP1PP_fmha + PARALLEL_CONFIG: [1, 16, 1, 1] + BATCH_SIZE: 4 + ADDITIONAL_ARGS: "--enable-fmha 1" fail-fast: false runs-on: ubuntu-22.04 From 35565ce62c962ae6d9c26bc8180e5dfa6ede7e3e Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Wed, 15 May 2024 17:49:51 -0700 Subject: [PATCH 03/17] Update _sandbox.yaml --- .github/workflows/_sandbox.yaml | 116 +++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 32 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 7b90b72ca..8ab69d0d8 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -2,40 +2,92 @@ name: "~Sandbox" on: workflow_dispatch: + inputs: + ARCHITECTURE: + type: string + required: false + default: "amd64" + BUILD_DATE: + type: string + description: Build date in YYYY-MM-DD format + required: false + default: NOT SPECIFIED + MANIFEST_ARTIFACT_NAME: + type: string + description: Artifact name in current run w/ manifest/patches. Leaving empty uses manifest/patches in current branch + default: '' + required: false + +permissions: + contents: read # to fetch code + actions: write # to cancel previous workflows + packages: write # to upload container jobs: - sandbox: + + build-base: + uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} + secrets: inherit + + build-jax: + needs: build-base + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-jax-build + BADGE_FILENAME: badge-jax-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + CONTAINER_NAME: jax + DOCKERFILE: .github/container/Dockerfile.jax + RUNNER_SIZE: large + secrets: inherit + + build-upstream-pax: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-pax-build + BADGE_FILENAME: badge-pax-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-pax + DOCKERFILE: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} + secrets: inherit + + test-distribution: runs-on: ubuntu-22.04 + strategy: + matrix: + TEST_SCRIPT: + - extra-only-distribution.sh + - mirror-only-distribution.sh + - upstream-only-distribution.sh + - local-patch-distribution.sh + fail-fast: false steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Print usage + - name: Print environment variables + run: env + - name: Set git login for tests run: | - cat << EOF - This is an empty workflow file located in the main branch of your - repository. It serves as a testing ground for new GitHub Actions on - development branches before merging them to the main branch. By - defining and overloading this workflow on your development branch, - you can test new actions without affecting your main branch, ensuring - a smooth integration process once the changes are ready to be merged. - - Usage: - - 1. In your development branch, modify the sandbox.yml workflow file - to include the new actions you want to test. Make sure to commit - the changes to the development branch. - 2. Navigate to the 'Actions' tab in your repository, select the - '~Sandbox' workflow, and choose your development branch from the - branch dropdown menu. Click on 'Run workflow' to trigger the - workflow on your development branch. - 3. Once you have tested and verified the new actions in the Sandbox - workflow, you can incorporate them into your main workflow(s) and - merge the development branch into the main branch. Remember to - revert the changes to the sandbox.yml file in the main branch to - keep it empty for future testing. - EOF + git config --global user.email "jax@nvidia.com" + git config --global user.name "JAX-Toolbox CI" + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + - name: Run integration test ${{ matrix.TEST_SCRIPT }} + run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + test-upstream-pax: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_upstream_pax.yaml + with: + PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + secrets: inherit From 42566a60408b35819f4a3230d962787612bae287 Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Wed, 15 May 2024 17:51:39 -0700 Subject: [PATCH 04/17] Update _test_upstream_pax.yaml --- .github/workflows/_test_upstream_pax.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index 145dfd161..5d882c85a 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -520,7 +520,7 @@ jobs: path: output/* metrics: - needs: [single-process-multi-device, pax-multi-node, single-process-evaluation] + needs: [pax-single-process-multi-device, pax-multi-node, single-process-evaluation] runs-on: ubuntu-22.04 steps: @@ -564,7 +564,7 @@ jobs: summary: runs-on: ubuntu-22.04 - needs: [single-process-multi-device, pax-multi-node, single-process-evaluation] + needs: [pax-single-process-multi-device, pax-multi-node, single-process-evaluation] if: "!cancelled()" steps: - name: Generate TensorBoard query URL From 495ae326b09da727e7ec3e7af75f301a868ac6f7 Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Wed, 15 May 2024 17:53:50 -0700 Subject: [PATCH 05/17] Update _sandbox.yaml --- .github/workflows/_sandbox.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 8ab69d0d8..3d4a27a9f 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -89,5 +89,3 @@ jobs: with: PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} secrets: inherit - - secrets: inherit From 23a19d874e7519bf02cfee240abf5a5c334007c6 Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Wed, 15 May 2024 23:27:41 -0700 Subject: [PATCH 06/17] Update test-pax.sh --- .github/container/test-pax.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index 1b7dff2be..0bedfc636 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -448,10 +448,10 @@ else $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu) fi -set +x -echo "Output at ${OUTPUT}" +echo "Checking for FMHA instructions in HLO!" if [[ "$ENABLE_FMHA" -eq "1" ]]; then + echo "Inside if Statement!" ## Check if fmha instructions are present in the HLO dumped file or not. fmha_regex="fmha[-bmm]?[-scale]?[-bias]?[-mask]?[-softmax]?[-dropout]?[-bmm]?[-backward]?*" result=$(grep -irlnE "$fmha_regex" "${HLO_DIR}/"*.txt) @@ -473,3 +473,6 @@ else echo "Removed dumped HLO directory!" fi fi + +set +x +echo "Output at ${OUTPUT}" From 4be1f40cc2b3bce2ff553f6dd1d0b4a2ecc25ee8 Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Thu, 16 May 2024 14:12:29 -0700 Subject: [PATCH 07/17] Update _sandbox.yaml --- .github/workflows/_sandbox.yaml | 114 +++++++++----------------------- 1 file changed, 32 insertions(+), 82 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 3d4a27a9f..7b90b72ca 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -2,90 +2,40 @@ name: "~Sandbox" on: workflow_dispatch: - inputs: - ARCHITECTURE: - type: string - required: false - default: "amd64" - BUILD_DATE: - type: string - description: Build date in YYYY-MM-DD format - required: false - default: NOT SPECIFIED - MANIFEST_ARTIFACT_NAME: - type: string - description: Artifact name in current run w/ manifest/patches. Leaving empty uses manifest/patches in current branch - default: '' - required: false - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container jobs: - - build-base: - uses: ./.github/workflows/_build_base.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} - secrets: inherit - - build-jax: - needs: build-base - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-jax-build - BADGE_FILENAME: badge-jax-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - CONTAINER_NAME: jax - DOCKERFILE: .github/container/Dockerfile.jax - RUNNER_SIZE: large - secrets: inherit - - build-upstream-pax: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-pax-build - BADGE_FILENAME: badge-pax-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-pax - DOCKERFILE: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} - secrets: inherit - - test-distribution: + sandbox: runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false steps: - - name: Print environment variables - run: env - - name: Set git login for tests + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Print usage run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - test-upstream-pax: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_upstream_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + cat << EOF + This is an empty workflow file located in the main branch of your + repository. It serves as a testing ground for new GitHub Actions on + development branches before merging them to the main branch. By + defining and overloading this workflow on your development branch, + you can test new actions without affecting your main branch, ensuring + a smooth integration process once the changes are ready to be merged. + + Usage: + + 1. In your development branch, modify the sandbox.yml workflow file + to include the new actions you want to test. Make sure to commit + the changes to the development branch. + 2. Navigate to the 'Actions' tab in your repository, select the + '~Sandbox' workflow, and choose your development branch from the + branch dropdown menu. Click on 'Run workflow' to trigger the + workflow on your development branch. + 3. Once you have tested and verified the new actions in the Sandbox + workflow, you can incorporate them into your main workflow(s) and + merge the development branch into the main branch. Remember to + revert the changes to the sandbox.yml file in the main branch to + keep it empty for future testing. + EOF From f44cdef645698a4f0d28ad34b7d100f48ee7da11 Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Thu, 16 May 2024 14:22:37 -0700 Subject: [PATCH 08/17] removing hlo dir for llama test. --- .github/workflows/_test_upstream_pax.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index 1e8088aa2..9cdcc9d3b 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -220,7 +220,7 @@ jobs: PARALLEL_CONFIG: [1, 1, 8, 1] BATCH_SIZE: 4 EVALUATE: true - ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate" + ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate --save-hlo 0" - TEST_NAME: 2DP1FSDP1TP4PP_fmha PARALLEL_CONFIG: [4, 2, 1, 1] BATCH_SIZE: 4 From da69dbd17a793864c415a9c85274d39be20a1ccd Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Mon, 20 May 2024 12:06:25 -0700 Subject: [PATCH 09/17] Update _test_upstream_pax.yaml Disabled saving hlo by default as suggested by terry --- .github/workflows/_test_upstream_pax.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index 9cdcc9d3b..1e8088aa2 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -220,7 +220,7 @@ jobs: PARALLEL_CONFIG: [1, 1, 8, 1] BATCH_SIZE: 4 EVALUATE: true - ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate --save-hlo 0" + ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate" - TEST_NAME: 2DP1FSDP1TP4PP_fmha PARALLEL_CONFIG: [4, 2, 1, 1] BATCH_SIZE: 4 From a6622c8720711ac564964354ce9a427b59e3fae9 Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Mon, 20 May 2024 12:10:02 -0700 Subject: [PATCH 10/17] Update _test_upstream_pax.yaml --- .github/workflows/_test_upstream_pax.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index 1e8088aa2..9f61ea962 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -45,7 +45,7 @@ jobs: - TEST_NAME: 8DP1FSDP1TP1PP_fmha PARALLEL_CONFIG: [1, 8, 1, 1] BATCH_SIZE: 4 - ADDITIONAL_ARGS: "--enable-fmha 1" + ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1" fail-fast: false runs-on: ubuntu-22.04 @@ -224,11 +224,11 @@ jobs: - TEST_NAME: 2DP1FSDP1TP4PP_fmha PARALLEL_CONFIG: [4, 2, 1, 1] BATCH_SIZE: 4 - ADDITIONAL_ARGS: "--enable-fmha 1" + ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1" - TEST_NAME: 16DP1FSDP1TP1PP_fmha PARALLEL_CONFIG: [1, 16, 1, 1] BATCH_SIZE: 4 - ADDITIONAL_ARGS: "--enable-fmha 1" + ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1" fail-fast: false runs-on: ubuntu-22.04 From b67229b45e60dc046d5a6f88aba3c047128d5a4c Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Mon, 20 May 2024 12:11:01 -0700 Subject: [PATCH 11/17] Update test-pax.sh Incorporated review comments, disabled saving hlo by default as suggested by terry. --- .github/container/test-pax.sh | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index 5c88e6ead..464816950 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -58,7 +58,7 @@ DROPOUT=0 EVALUATE=0 ADDITIONAL_ARGS="" ENABLE_FMHA=${ENABLE_FMHA:-0} -SAVE_HLO=${SAVE_HLO:-1} +SAVE_HLO=${SAVE_HLO:-0} eval set -- "$args" while [ : ]; do @@ -454,27 +454,21 @@ fi echo "Checking for FMHA instructions in HLO!" if [[ "$ENABLE_FMHA" -eq "1" ]]; then - echo "Inside if Statement!" ## Check if fmha instructions are present in the HLO dumped file or not. fmha_regex="fmha[-bmm]?[-scale]?[-bias]?[-mask]?[-softmax]?[-dropout]?[-bmm]?[-backward]?*" result=$(grep -irlnE "$fmha_regex" "${HLO_DIR}/"*.txt) - if [[ $SAVE_HLO -eq 0 ]]; then - rm -rf $HLO_DIR - echo "Removed dumped HLO directory!" - fi - if [ -z "$result" ]; then echo "E: No FMHA instructions were found in the hlo files!" exit 1 else echo -e "Found FMHA instructions in the following HLO files: \n $result" fi -else - if [[ $SAVE_HLO -eq 0 ]]; then - rm -rf $HLO_DIR - echo "Removed dumped HLO directory!" - fi +fi + +if [[ $SAVE_HLO -eq 0 ]]; then + rm -rf $HLO_DIR + echo "Removed dumped HLO directory!" fi set +x From f7618bfcd0cd4667174eefe8ed489f25ead79709 Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Mon, 20 May 2024 14:34:27 -0700 Subject: [PATCH 12/17] Update test-pax.sh merge enable-fmha and enable-fused-attn flags --- .github/container/test-pax.sh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index 464816950..002811c5d 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -17,7 +17,6 @@ usage() { echo " --dtype Batch size, defaults to bfloat16." echo " --enable-te If set, will run with env var ENABLE_TE=1." echo " --enable-dropout If set, will set DROPOUT_PROB to 0.1." - echo " --enable-fused-attn Whether to test fused attention through TE." echo " --model-type One of 126M, 5B, LLaMA70BProxy. Defaults to 126M" echo " --evaluate Whether to test evaluation rather than training." echo " -s, --steps Number of steps to run, defaults to 500." @@ -34,7 +33,7 @@ usage() { exit $1 } -args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,enable-fmha:,evaluate,steps:,help,multiprocess,output:,save-hlo:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@") +args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,model-type:,enable-fmha:,evaluate,steps:,help,multiprocess,output:,save-hlo:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@") if [[ $? -ne 0 ]]; then exit $1 fi @@ -81,16 +80,13 @@ while [ : ]; do ;; --enable-fmha) ENABLE_FMHA="$2" + NVTE_FUSED_ATTN=1 shift 2 ;; --enable-dropout) DROPOUT='0.1' shift 1 ;; - --enable-fused-attn) - NVTE_FUSED_ATTN=1 - shift 1 - ;; --model-type) MODEL_TYPE=$2 shift 2 From dbb999d195eceb1dba785de06135a4d75abf16fe Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Mon, 10 Jun 2024 11:54:46 -0700 Subject: [PATCH 13/17] Update test-pax.sh Incorporated review comments --- .github/container/test-pax.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index 002811c5d..01d05e562 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -80,7 +80,7 @@ while [ : ]; do ;; --enable-fmha) ENABLE_FMHA="$2" - NVTE_FUSED_ATTN=1 + NVTE_FUSED_ATTN="$2" shift 2 ;; --enable-dropout) From c1ff8ae4f0f0abf925cd25befd832a143c350529 Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Tue, 11 Jun 2024 11:51:30 -0700 Subject: [PATCH 14/17] Update _test_pax_rosetta.yaml --- .github/workflows/_test_pax_rosetta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index 264777e15..72ce4b29c 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -252,7 +252,7 @@ jobs: - TEST_NAME: 5B_fused_attn_0 PARALLEL_CONFIG: [1, 1, 8, 1] BATCH_SIZE: 2 - ADDITIONAL_ARGS: "--model-type 5B --disable-fused-attn" + ADDITIONAL_ARGS: "--model-type 5B --enable-fmha 0" - TEST_NAME: LLaMA_eval_TE PARALLEL_CONFIG: [1, 1, 8, 1] BATCH_SIZE: 4 From 0ef811a02c5441db4e6a9932c44e192b2abe16e9 Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Tue, 11 Jun 2024 11:53:22 -0700 Subject: [PATCH 15/17] Update _test_upstream_pax.yaml --- .github/workflows/_test_upstream_pax.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index 9f61ea962..045f8e014 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -45,7 +45,7 @@ jobs: - TEST_NAME: 8DP1FSDP1TP1PP_fmha PARALLEL_CONFIG: [1, 8, 1, 1] BATCH_SIZE: 4 - ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1" + ADDITIONAL_ARGS: "--save-hlo 1" fail-fast: false runs-on: ubuntu-22.04 @@ -224,11 +224,11 @@ jobs: - TEST_NAME: 2DP1FSDP1TP4PP_fmha PARALLEL_CONFIG: [4, 2, 1, 1] BATCH_SIZE: 4 - ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1" + ADDITIONAL_ARGS: "--save-hlo 1" - TEST_NAME: 16DP1FSDP1TP1PP_fmha PARALLEL_CONFIG: [1, 16, 1, 1] BATCH_SIZE: 4 - ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1" + ADDITIONAL_ARGS: "--save-hlo 1" fail-fast: false runs-on: ubuntu-22.04 From ca6e2e9062aabda722b3b7675ce1a2fefe2562ad Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Tue, 11 Jun 2024 11:53:27 -0700 Subject: [PATCH 16/17] Update test-pax.sh --- .github/container/test-pax.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index 6651364f7..0dc3ef9e3 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -57,7 +57,7 @@ NVTE_FUSED_ATTN=1 DROPOUT=0 EVALUATE=0 ADDITIONAL_ARGS="" -ENABLE_FMHA=${ENABLE_FMHA:-0} +ENABLE_FMHA=${ENABLE_FMHA:-1} SAVE_HLO=${SAVE_HLO:-0} eval set -- "$args" From 7084812ca869fea46d329362959f60c222b8a4ea Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Tue, 11 Jun 2024 12:24:28 -0700 Subject: [PATCH 17/17] Update _test_upstream_pax.yaml --- .github/workflows/_test_upstream_pax.yaml | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index 045f8e014..fe0ebbaba 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -37,15 +37,11 @@ jobs: - TEST_NAME: 8DP1FSDP1TP1PP PARALLEL_CONFIG: [1, 8, 1, 1] BATCH_SIZE: 4 - ADDITIONAL_ARGS: "" + ADDITIONAL_ARGS: "--save-hlo 1" - TEST_NAME: 8DP2FSDP4TP1PP PARALLEL_CONFIG: [1, 1, 2, 4] BATCH_SIZE: 4 ADDITIONAL_ARGS: "" - - TEST_NAME: 8DP1FSDP1TP1PP_fmha - PARALLEL_CONFIG: [1, 8, 1, 1] - BATCH_SIZE: 4 - ADDITIONAL_ARGS: "--save-hlo 1" fail-fast: false runs-on: ubuntu-22.04 @@ -204,7 +200,7 @@ jobs: - TEST_NAME: 2DP1FSDP1TP4PP PARALLEL_CONFIG: [4, 2, 1, 1] BATCH_SIZE: 4 - ADDITIONAL_ARGS: "" + ADDITIONAL_ARGS: "--save-hlo 1" - TEST_NAME: 4DP1FSDP2TP1PP PARALLEL_CONFIG: [1, 4, 1, 2] BATCH_SIZE: 4 @@ -212,7 +208,7 @@ jobs: - TEST_NAME: 16DP1FSDP1TP1PP PARALLEL_CONFIG: [1, 16, 1, 1] BATCH_SIZE: 4 - ADDITIONAL_ARGS: "" + ADDITIONAL_ARGS: "--save-hlo 1" - TEST_NAME: 2DP1FSDP2TP4PP PARALLEL_CONFIG: [4, 2, 1, 2] BATCH_SIZE: 4 @@ -220,15 +216,7 @@ jobs: PARALLEL_CONFIG: [1, 1, 8, 1] BATCH_SIZE: 4 EVALUATE: true - ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate" - - TEST_NAME: 2DP1FSDP1TP4PP_fmha - PARALLEL_CONFIG: [4, 2, 1, 1] - BATCH_SIZE: 4 - ADDITIONAL_ARGS: "--save-hlo 1" - - TEST_NAME: 16DP1FSDP1TP1PP_fmha - PARALLEL_CONFIG: [1, 16, 1, 1] - BATCH_SIZE: 4 - ADDITIONAL_ARGS: "--save-hlo 1" + ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate" fail-fast: false runs-on: ubuntu-22.04