From a9a828be646336861e364863a95e4b73bd7143f2 Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Wed, 15 May 2024 15:35:45 -0700
Subject: [PATCH 01/17] add fmha related changes

---
 .github/container/test-pax.sh | 54 ++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index d5b6c2d8c..1b7dff2be 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -23,6 +23,8 @@ usage() {
     echo "  -s, --steps                Number of steps to run, defaults to 500."
     echo "  --multiprocess             Enable the multiprocess GPU mode."
     echo "  -o, --output NAME          Name for the output folder, a temporary folder will be created if none specified."
+    echo "  --save-hlo {0, 1}          1 to save the dumped hlo, 0 to remove the hlo dumped folder"
+    echo "  --enable-fmha {0, 1}       1 to enable fmha testing, 0 to run test without fmha; default is 0"
     echo "  --data-parallel            Data parallelism to use. Defaults to 1."
     echo "  --fsdp                     Fully-sharded data parallelism to use. Defaults to 1."
     echo "  --tensor-parallel          Tensor parallelism to use. Defaults to 1."
@@ -32,7 +34,7 @@ usage() {
     exit $1
 }
 
-args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,evaluate,steps:,help,multiprocess,output:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
+args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,enable-fmha:,evaluate,steps:,help,multiprocess,output:,save-hlo:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
 if [[ $? -ne 0 ]]; then
     exit $1
 fi
@@ -55,6 +57,8 @@ NVTE_FUSED_ATTN=0
 DROPOUT=0
 EVALUATE=0
 ADDITIONAL_ARGS=""
+ENABLE_FMHA=${ENABLE_FMHA:-0}
+SAVE_HLO=${SAVE_HLO:-1}
 
 eval set -- "$args"
 while [ : ]; do
@@ -75,6 +79,10 @@ while [ : ]; do
             ENABLE_TE=1
             shift 1
             ;;
+        --enable-fmha)
+            ENABLE_FMHA="$2"
+            shift 2
+            ;;
         --enable-dropout)
             DROPOUT='0.1'
             shift 1
@@ -103,6 +111,10 @@ while [ : ]; do
             OUTPUT=$2
             shift 2
             ;;
+        --save-hlo)
+            SAVE_HLO="$2"
+            shift 2
+            ;;
         --data-parallel)
             DP="$2"
             shift 2
@@ -136,6 +148,21 @@ while [ : ]; do
     esac
 done
 
+# Set hlo dump folder after output folder is set.
+HLO_DIR=${OUTPUT}/hlo
+export BASE_XLA_FLAGS="${BASE_XLA_FLAGS:---xla_dump_hlo_as_text --xla_dump_to=${HLO_DIR}}"
+export XLA_FLAGS="${BASE_XLA_FLAGS} ${XLA_FLAGS:-}"
+echo "HLO will be dumped in ${HLO_DIR} dir."
+
+## Setting the env variables for FMHA
+if [[ "$ENABLE_FMHA" -eq "1" ]]; then  
+    echo "Setting XLA FMHA Flags";
+    export BASE_XLA_FLAGS_FMHA="${BASE_XLA_FLAGS_FMHA:---xla_gpu_fused_attention_use_cudnn_rng=true --xla_gpu_enable_cudnn_fmha=true}"
+    export XLA_FLAGS="${BASE_XLA_FLAGS_FMHA} ${XLA_FLAGS:-}"
+fi
+
+echo "XLA FLAGS: $XLA_FLAGS"
+
 # # Set derived variables
 
 GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
@@ -149,8 +176,10 @@ print_var NGPUS
 print_var OUTPUT
 print_var MULTIPROCESS
 print_var ENABLE_TE
+print_var ENABLE_FMHA
 print_var NVTE_FUSED_ATTN
 print_var EVALUATE
+print_var SAVE_HLO
 print_var DROPOUT
 print_var DP
 print_var FSDP
@@ -421,3 +450,26 @@ fi
 
 set +x
 echo "Output at ${OUTPUT}"
+
+if [[ "$ENABLE_FMHA" -eq "1" ]]; then 
+    ## Check if fmha instructions are present in the HLO dumped file or not.
+    fmha_regex="fmha[-bmm]?[-scale]?[-bias]?[-mask]?[-softmax]?[-dropout]?[-bmm]?[-backward]?*"
+    result=$(grep -irlnE "$fmha_regex" "${HLO_DIR}/"*.txt)
+
+    if [[ $SAVE_HLO -eq 0 ]]; then
+        rm -rf $HLO_DIR
+        echo "Removed dumped HLO directory!"
+    fi
+
+    if [ -z "$result" ]; then
+        echo "E: No FMHA instructions were found in the hlo files!"
+	exit 1
+    else
+        echo -e "Found FMHA instructions in the following HLO files: \n $result"
+    fi
+else
+    if [[ $SAVE_HLO -eq 0 ]]; then
+        rm -rf $HLO_DIR
+ 	echo "Removed dumped HLO directory!"
+    fi
+fi

From 69f9db9f1ef8937ba2f3633b9c5ecd7e7146677d Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Wed, 15 May 2024 17:44:31 -0700
Subject: [PATCH 02/17] Update _test_upstream_pax.yaml

---
 .github/workflows/_test_upstream_pax.yaml | 33 ++++++++++++++++++-----
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index 2ff593630..145dfd161 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -30,12 +30,22 @@ on:
 
 jobs:
 
-  single-process-multi-device:
+  pax-single-process-multi-device:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-        - [1, 8, 1, 1]
-        - [1, 1, 2, 4]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP
+            PARALLEL_CONFIG: [1, 8, 1, 1]
+            BATCH_SIZE: 4
+            ADDITIONAL_ARGS: ""
+          - TEST_NAME: 8DP2FSDP4TP1PP
+            PARALLEL_CONFIG: [1, 1, 2, 4]
+            BATCH_SIZE: 4
+            ADDITIONAL_ARGS: ""
+          - TEST_NAME: 8DP1FSDP1TP1PP_fmha
+            PARALLEL_CONFIG: [1, 8, 1, 1]
+            BATCH_SIZE: 4
+            ADDITIONAL_ARGS: "--enable-fmha 1"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -67,7 +77,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}_single_process
           MAX_GPUS_PER_NODE=8
           NODES=1
           GPUS_PER_NODE=8
@@ -112,13 +122,14 @@ jobs:
             test-pax.sh \
               --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \
               --dtype bfloat16 \
-              --batch-per-gpu 4 \
+              --batch-per-gpu ${{ matrix.BATCH_SIZE }} \
               --steps 500 \
               --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
               --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
               --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
               --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
-              --nodes ${{ steps.meta.outputs.NODES }}
+              --nodes ${{ steps.meta.outputs.NODES }} \
+              ${{ matrix.ADDITIONAL_ARGS }}
           EOF
           )
 
@@ -210,6 +221,14 @@ jobs:
             BATCH_SIZE: 4
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
+          - TEST_NAME: 2DP1FSDP1TP4PP_fmha
+            PARALLEL_CONFIG: [4, 2, 1, 1]
+            BATCH_SIZE: 4
+            ADDITIONAL_ARGS: "--enable-fmha 1"
+          - TEST_NAME: 16DP1FSDP1TP1PP_fmha
+            PARALLEL_CONFIG: [1, 16, 1, 1]
+            BATCH_SIZE: 4
+            ADDITIONAL_ARGS: "--enable-fmha 1"          
       fail-fast: false
 
     runs-on: ubuntu-22.04

From 35565ce62c962ae6d9c26bc8180e5dfa6ede7e3e Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Wed, 15 May 2024 17:49:51 -0700
Subject: [PATCH 03/17] Update _sandbox.yaml

---
 .github/workflows/_sandbox.yaml | 116 +++++++++++++++++++++++---------
 1 file changed, 84 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
index 7b90b72ca..8ab69d0d8 100644
--- a/.github/workflows/_sandbox.yaml
+++ b/.github/workflows/_sandbox.yaml
@@ -2,40 +2,92 @@ name: "~Sandbox"
 
 on:
   workflow_dispatch:
+    inputs:
+      ARCHITECTURE:
+        type: string
+        required: false
+        default: "amd64"
+      BUILD_DATE:
+        type: string
+        description: Build date in YYYY-MM-DD format
+        required: false
+        default: NOT SPECIFIED
+      MANIFEST_ARTIFACT_NAME:
+        type: string
+        description: Artifact name in current run w/ manifest/patches. Leaving empty uses manifest/patches in current branch
+        default: ''
+        required: false
+
+permissions:
+  contents: read  # to fetch code
+  actions:  write # to cancel previous workflows
+  packages: write # to upload container
 
 jobs:
-  sandbox:
+
+  build-base:
+    uses: ./.github/workflows/_build_base.yaml
+    with:
+      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+      BUILD_DATE: ${{ inputs.BUILD_DATE }}
+      MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }}
+    secrets: inherit
+
+  build-jax:
+    needs: build-base
+    uses: ./.github/workflows/_build.yaml
+    with:
+      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+      ARTIFACT_NAME: artifact-jax-build
+      BADGE_FILENAME: badge-jax-build
+      BUILD_DATE: ${{ inputs.BUILD_DATE }}
+      BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }}
+      CONTAINER_NAME: jax
+      DOCKERFILE: .github/container/Dockerfile.jax
+      RUNNER_SIZE: large
+    secrets: inherit
+
+  build-upstream-pax:
+    needs: build-jax
+    uses: ./.github/workflows/_build.yaml
+    with:
+      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+      ARTIFACT_NAME: artifact-pax-build
+      BADGE_FILENAME: badge-pax-build
+      BUILD_DATE: ${{ inputs.BUILD_DATE }}
+      BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+      CONTAINER_NAME: upstream-pax
+      DOCKERFILE: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }}
+    secrets: inherit
+
+  test-distribution:
     runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        TEST_SCRIPT:
+          - extra-only-distribution.sh
+          - mirror-only-distribution.sh
+          - upstream-only-distribution.sh
+          - local-patch-distribution.sh
+      fail-fast: false
     steps:
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Print usage
+      - name: Print environment variables
+        run: env
+      - name: Set git login for tests
         run: |
-          cat << EOF
-          This is an empty workflow file located in the main branch of your
-          repository. It serves as a testing ground for new GitHub Actions on
-          development branches before merging them to the main branch. By
-          defining and overloading this workflow on your development branch,
-          you can test new actions without affecting your main branch, ensuring
-          a smooth integration process once the changes are ready to be merged.
-
-          Usage:
-          
-          1. In your development branch, modify the sandbox.yml workflow file
-             to include the new actions you want to test. Make sure to commit
-             the changes to the development branch.
-          2. Navigate to the 'Actions' tab in your repository, select the
-             '~Sandbox' workflow, and choose your development branch from the
-             branch dropdown menu. Click on 'Run workflow' to trigger the
-             workflow on your development branch.
-          3. Once you have tested and verified the new actions in the Sandbox
-             workflow, you can incorporate them into your main workflow(s) and
-             merge the development branch into the main branch. Remember to
-             revert the changes to the sandbox.yml file in the main branch to
-             keep it empty for future testing.
-          EOF
+          git config --global user.email "jax@nvidia.com"
+          git config --global user.name "JAX-Toolbox CI"
+      - name: Check out the repository under ${GITHUB_WORKSPACE}
+        uses: actions/checkout@v4
+      - name: Run integration test ${{ matrix.TEST_SCRIPT }}
+        run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }}
+
+  test-upstream-pax:
+    needs: build-upstream-pax
+    if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
+    uses: ./.github/workflows/_test_upstream_pax.yaml
+    with:
+      PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}
+    secrets: inherit
+
+    secrets: inherit

From 42566a60408b35819f4a3230d962787612bae287 Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Wed, 15 May 2024 17:51:39 -0700
Subject: [PATCH 04/17] Update _test_upstream_pax.yaml

---
 .github/workflows/_test_upstream_pax.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index 145dfd161..5d882c85a 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -520,7 +520,7 @@ jobs:
           path: output/*
 
   metrics:
-    needs: [single-process-multi-device, pax-multi-node, single-process-evaluation]
+    needs: [pax-single-process-multi-device, pax-multi-node, single-process-evaluation]
     runs-on: ubuntu-22.04
 
     steps:
@@ -564,7 +564,7 @@ jobs:
 
   summary:
     runs-on: ubuntu-22.04
-    needs: [single-process-multi-device, pax-multi-node, single-process-evaluation]
+    needs: [pax-single-process-multi-device, pax-multi-node, single-process-evaluation]
     if: "!cancelled()"
     steps:
       - name: Generate TensorBoard query URL

From 495ae326b09da727e7ec3e7af75f301a868ac6f7 Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Wed, 15 May 2024 17:53:50 -0700
Subject: [PATCH 05/17] Update _sandbox.yaml

---
 .github/workflows/_sandbox.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
index 8ab69d0d8..3d4a27a9f 100644
--- a/.github/workflows/_sandbox.yaml
+++ b/.github/workflows/_sandbox.yaml
@@ -89,5 +89,3 @@ jobs:
     with:
       PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}
     secrets: inherit
-
-    secrets: inherit

From 23a19d874e7519bf02cfee240abf5a5c334007c6 Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Wed, 15 May 2024 23:27:41 -0700
Subject: [PATCH 06/17] Update test-pax.sh

---
 .github/container/test-pax.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 1b7dff2be..0bedfc636 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -448,10 +448,10 @@ else
     $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)
 fi
 
-set +x
-echo "Output at ${OUTPUT}"
+echo "Checking for FMHA instructions in HLO!"
 
 if [[ "$ENABLE_FMHA" -eq "1" ]]; then 
+    echo "Inside if Statement!"
     ## Check if fmha instructions are present in the HLO dumped file or not.
     fmha_regex="fmha[-bmm]?[-scale]?[-bias]?[-mask]?[-softmax]?[-dropout]?[-bmm]?[-backward]?*"
     result=$(grep -irlnE "$fmha_regex" "${HLO_DIR}/"*.txt)
@@ -473,3 +473,6 @@ else
  	echo "Removed dumped HLO directory!"
     fi
 fi
+
+set +x
+echo "Output at ${OUTPUT}"

From 4be1f40cc2b3bce2ff553f6dd1d0b4a2ecc25ee8 Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Thu, 16 May 2024 14:12:29 -0700
Subject: [PATCH 07/17] Update _sandbox.yaml

---
 .github/workflows/_sandbox.yaml | 114 +++++++++-----------------------
 1 file changed, 32 insertions(+), 82 deletions(-)

diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
index 3d4a27a9f..7b90b72ca 100644
--- a/.github/workflows/_sandbox.yaml
+++ b/.github/workflows/_sandbox.yaml
@@ -2,90 +2,40 @@ name: "~Sandbox"
 
 on:
   workflow_dispatch:
-    inputs:
-      ARCHITECTURE:
-        type: string
-        required: false
-        default: "amd64"
-      BUILD_DATE:
-        type: string
-        description: Build date in YYYY-MM-DD format
-        required: false
-        default: NOT SPECIFIED
-      MANIFEST_ARTIFACT_NAME:
-        type: string
-        description: Artifact name in current run w/ manifest/patches. Leaving empty uses manifest/patches in current branch
-        default: ''
-        required: false
-
-permissions:
-  contents: read  # to fetch code
-  actions:  write # to cancel previous workflows
-  packages: write # to upload container
 
 jobs:
-
-  build-base:
-    uses: ./.github/workflows/_build_base.yaml
-    with:
-      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-      BUILD_DATE: ${{ inputs.BUILD_DATE }}
-      MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }}
-    secrets: inherit
-
-  build-jax:
-    needs: build-base
-    uses: ./.github/workflows/_build.yaml
-    with:
-      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-      ARTIFACT_NAME: artifact-jax-build
-      BADGE_FILENAME: badge-jax-build
-      BUILD_DATE: ${{ inputs.BUILD_DATE }}
-      BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }}
-      CONTAINER_NAME: jax
-      DOCKERFILE: .github/container/Dockerfile.jax
-      RUNNER_SIZE: large
-    secrets: inherit
-
-  build-upstream-pax:
-    needs: build-jax
-    uses: ./.github/workflows/_build.yaml
-    with:
-      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-      ARTIFACT_NAME: artifact-pax-build
-      BADGE_FILENAME: badge-pax-build
-      BUILD_DATE: ${{ inputs.BUILD_DATE }}
-      BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-      CONTAINER_NAME: upstream-pax
-      DOCKERFILE: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }}
-    secrets: inherit
-
-  test-distribution:
+  sandbox:
     runs-on: ubuntu-22.04
-    strategy:
-      matrix:
-        TEST_SCRIPT:
-          - extra-only-distribution.sh
-          - mirror-only-distribution.sh
-          - upstream-only-distribution.sh
-          - local-patch-distribution.sh
-      fail-fast: false
     steps:
-      - name: Print environment variables
-        run: env
-      - name: Set git login for tests
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Print usage
         run: |
-          git config --global user.email "jax@nvidia.com"
-          git config --global user.name "JAX-Toolbox CI"
-      - name: Check out the repository under ${GITHUB_WORKSPACE}
-        uses: actions/checkout@v4
-      - name: Run integration test ${{ matrix.TEST_SCRIPT }}
-        run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }}
-
-  test-upstream-pax:
-    needs: build-upstream-pax
-    if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
-    uses: ./.github/workflows/_test_upstream_pax.yaml
-    with:
-      PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
+          cat << EOF
+          This is an empty workflow file located in the main branch of your
+          repository. It serves as a testing ground for new GitHub Actions on
+          development branches before merging them to the main branch. By
+          defining and overloading this workflow on your development branch,
+          you can test new actions without affecting your main branch, ensuring
+          a smooth integration process once the changes are ready to be merged.
+
+          Usage:
+          
+          1. In your development branch, modify the sandbox.yml workflow file
+             to include the new actions you want to test. Make sure to commit
+             the changes to the development branch.
+          2. Navigate to the 'Actions' tab in your repository, select the
+             '~Sandbox' workflow, and choose your development branch from the
+             branch dropdown menu. Click on 'Run workflow' to trigger the
+             workflow on your development branch.
+          3. Once you have tested and verified the new actions in the Sandbox
+             workflow, you can incorporate them into your main workflow(s) and
+             merge the development branch into the main branch. Remember to
+             revert the changes to the sandbox.yml file in the main branch to
+             keep it empty for future testing.
+          EOF

From f44cdef645698a4f0d28ad34b7d100f48ee7da11 Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Thu, 16 May 2024 14:22:37 -0700
Subject: [PATCH 08/17] removing hlo dir for llama test.

---
 .github/workflows/_test_upstream_pax.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index 1e8088aa2..9cdcc9d3b 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -220,7 +220,7 @@ jobs:
             PARALLEL_CONFIG: [1, 1, 8, 1]
             BATCH_SIZE: 4
             EVALUATE: true
-            ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
+            ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate --save-hlo 0"
           - TEST_NAME: 2DP1FSDP1TP4PP_fmha
             PARALLEL_CONFIG: [4, 2, 1, 1]
             BATCH_SIZE: 4

From da69dbd17a793864c415a9c85274d39be20a1ccd Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Mon, 20 May 2024 12:06:25 -0700
Subject: [PATCH 09/17] Update _test_upstream_pax.yaml

Remove the explicit `--save-hlo 0` from the LLaMA70BProxy evaluation test;
HLO saving is to be disabled by default instead, as suggested by Terry.
---
 .github/workflows/_test_upstream_pax.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index 9cdcc9d3b..1e8088aa2 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -220,7 +220,7 @@ jobs:
             PARALLEL_CONFIG: [1, 1, 8, 1]
             BATCH_SIZE: 4
             EVALUATE: true
-            ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate --save-hlo 0"
+            ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
           - TEST_NAME: 2DP1FSDP1TP4PP_fmha
             PARALLEL_CONFIG: [4, 2, 1, 1]
             BATCH_SIZE: 4

From a6622c8720711ac564964354ce9a427b59e3fae9 Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Mon, 20 May 2024 12:10:02 -0700
Subject: [PATCH 10/17] Update _test_upstream_pax.yaml

---
 .github/workflows/_test_upstream_pax.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index 1e8088aa2..9f61ea962 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -45,7 +45,7 @@ jobs:
           - TEST_NAME: 8DP1FSDP1TP1PP_fmha
             PARALLEL_CONFIG: [1, 8, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: "--enable-fmha 1"
+            ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -224,11 +224,11 @@ jobs:
           - TEST_NAME: 2DP1FSDP1TP4PP_fmha
             PARALLEL_CONFIG: [4, 2, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: "--enable-fmha 1"
+            ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1"
           - TEST_NAME: 16DP1FSDP1TP1PP_fmha
             PARALLEL_CONFIG: [1, 16, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: "--enable-fmha 1"          
+            ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1"          
       fail-fast: false
 
     runs-on: ubuntu-22.04

From b67229b45e60dc046d5a6f88aba3c047128d5a4c Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Mon, 20 May 2024 12:11:01 -0700
Subject: [PATCH 11/17] Update test-pax.sh

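Incorporated review comments: saving HLO is now disabled by default
(SAVE_HLO defaults to 0), as suggested by Terry. The duplicated
HLO-directory cleanup is merged into a single block after the FMHA check,
and the leftover debug echo is removed.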
---
 .github/container/test-pax.sh | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 5c88e6ead..464816950 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -58,7 +58,7 @@ DROPOUT=0
 EVALUATE=0
 ADDITIONAL_ARGS=""
 ENABLE_FMHA=${ENABLE_FMHA:-0}
-SAVE_HLO=${SAVE_HLO:-1}
+SAVE_HLO=${SAVE_HLO:-0}
 
 eval set -- "$args"
 while [ : ]; do
@@ -454,27 +454,21 @@ fi
 echo "Checking for FMHA instructions in HLO!"
 
 if [[ "$ENABLE_FMHA" -eq "1" ]]; then 
-    echo "Inside if Statement!"
     ## Check if fmha instructions are present in the HLO dumped file or not.
     fmha_regex="fmha[-bmm]?[-scale]?[-bias]?[-mask]?[-softmax]?[-dropout]?[-bmm]?[-backward]?*"
     result=$(grep -irlnE "$fmha_regex" "${HLO_DIR}/"*.txt)
 
-    if [[ $SAVE_HLO -eq 0 ]]; then
-        rm -rf $HLO_DIR
-        echo "Removed dumped HLO directory!"
-    fi
-
     if [ -z "$result" ]; then
         echo "E: No FMHA instructions were found in the hlo files!"
 	exit 1
     else
         echo -e "Found FMHA instructions in the following HLO files: \n $result"
     fi
-else
-    if [[ $SAVE_HLO -eq 0 ]]; then
-        rm -rf $HLO_DIR
- 	echo "Removed dumped HLO directory!"
-    fi
+fi
+
+if [[ $SAVE_HLO -eq 0 ]]; then  # --save-hlo 0: discard the HLO dump at the end of the run
+    rm -rf -- "${HLO_DIR:?}"  # quoted so an --output path with spaces/globs cannot split into several rm targets
+    echo "Removed dumped HLO directory!"
 fi
 
 set +x

From f7618bfcd0cd4667174eefe8ed489f25ead79709 Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Mon, 20 May 2024 14:34:27 -0700
Subject: [PATCH 12/17] Update test-pax.sh

Merge the --enable-fused-attn flag into --enable-fmha: the separate option
is removed and --enable-fmha now also sets NVTE_FUSED_ATTN=1.
---
 .github/container/test-pax.sh | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 464816950..002811c5d 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -17,7 +17,6 @@ usage() {
     echo "  --dtype                    Batch size, defaults to bfloat16."
     echo "  --enable-te                If set, will run with env var ENABLE_TE=1." 
     echo "  --enable-dropout           If set, will set DROPOUT_PROB to 0.1."
-    echo "  --enable-fused-attn        Whether to test fused attention through TE."
     echo "  --model-type               One of 126M, 5B, LLaMA70BProxy. Defaults to 126M"
     echo "  --evaluate                 Whether to test evaluation rather than training."
     echo "  -s, --steps                Number of steps to run, defaults to 500."
@@ -34,7 +33,7 @@ usage() {
     exit $1
 }
 
-args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,enable-fmha:,evaluate,steps:,help,multiprocess,output:,save-hlo:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
+args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,model-type:,enable-fmha:,evaluate,steps:,help,multiprocess,output:,save-hlo:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
 if [[ $? -ne 0 ]]; then
     exit $1
 fi
@@ -81,16 +80,13 @@ while [ : ]; do
             ;;
         --enable-fmha)
             ENABLE_FMHA="$2"
+	    NVTE_FUSED_ATTN=1
             shift 2
             ;;
         --enable-dropout)
             DROPOUT='0.1'
             shift 1
             ;;
-        --enable-fused-attn)
-            NVTE_FUSED_ATTN=1
-            shift 1
-            ;;
         --model-type)
             MODEL_TYPE=$2
             shift 2

From dbb999d195eceb1dba785de06135a4d75abf16fe Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Mon, 10 Jun 2024 11:54:46 -0700
Subject: [PATCH 13/17] Update test-pax.sh

Incorporated review comments: NVTE_FUSED_ATTN now takes the value passed to
--enable-fmha instead of always being set to 1.
---
 .github/container/test-pax.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 002811c5d..01d05e562 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -80,7 +80,7 @@ while [ : ]; do
             ;;
         --enable-fmha)
             ENABLE_FMHA="$2"
-	    NVTE_FUSED_ATTN=1
+	    NVTE_FUSED_ATTN="$2"
             shift 2
             ;;
         --enable-dropout)

From c1ff8ae4f0f0abf925cd25befd832a143c350529 Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Tue, 11 Jun 2024 11:51:30 -0700
Subject: [PATCH 14/17] Update _test_pax_rosetta.yaml

---
 .github/workflows/_test_pax_rosetta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 264777e15..72ce4b29c 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -252,7 +252,7 @@ jobs:
           - TEST_NAME: 5B_fused_attn_0
             PARALLEL_CONFIG: [1, 1, 8, 1]
             BATCH_SIZE: 2
-            ADDITIONAL_ARGS: "--model-type 5B --disable-fused-attn"
+            ADDITIONAL_ARGS: "--model-type 5B --enable-fmha 0"
           - TEST_NAME: LLaMA_eval_TE
             PARALLEL_CONFIG: [1, 1, 8, 1]
             BATCH_SIZE: 4

From 0ef811a02c5441db4e6a9932c44e192b2abe16e9 Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Tue, 11 Jun 2024 11:53:22 -0700
Subject: [PATCH 15/17] Update _test_upstream_pax.yaml

---
 .github/workflows/_test_upstream_pax.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index 9f61ea962..045f8e014 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -45,7 +45,7 @@ jobs:
           - TEST_NAME: 8DP1FSDP1TP1PP_fmha
             PARALLEL_CONFIG: [1, 8, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1"
+            ADDITIONAL_ARGS: "--save-hlo 1"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -224,11 +224,11 @@ jobs:
           - TEST_NAME: 2DP1FSDP1TP4PP_fmha
             PARALLEL_CONFIG: [4, 2, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1"
+            ADDITIONAL_ARGS: "--save-hlo 1"
           - TEST_NAME: 16DP1FSDP1TP1PP_fmha
             PARALLEL_CONFIG: [1, 16, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: "--enable-fmha 1 --save-hlo 1"          
+            ADDITIONAL_ARGS: "--save-hlo 1"          
       fail-fast: false
 
     runs-on: ubuntu-22.04

From ca6e2e9062aabda722b3b7675ce1a2fefe2562ad Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Tue, 11 Jun 2024 11:53:27 -0700
Subject: [PATCH 16/17] Update test-pax.sh

---
 .github/container/test-pax.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 6651364f7..0dc3ef9e3 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -57,7 +57,7 @@ NVTE_FUSED_ATTN=1
 DROPOUT=0
 EVALUATE=0
 ADDITIONAL_ARGS=""
-ENABLE_FMHA=${ENABLE_FMHA:-0}
+ENABLE_FMHA=${ENABLE_FMHA:-1}
 SAVE_HLO=${SAVE_HLO:-0}
 
 eval set -- "$args"

From 7084812ca869fea46d329362959f60c222b8a4ea Mon Sep 17 00:00:00 2001
From: Harshit Monish <143435143+hmonishN@users.noreply.github.com>
Date: Tue, 11 Jun 2024 12:24:28 -0700
Subject: [PATCH 17/17] Update _test_upstream_pax.yaml

---
 .github/workflows/_test_upstream_pax.yaml | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index 045f8e014..fe0ebbaba 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -37,15 +37,11 @@ jobs:
           - TEST_NAME: 8DP1FSDP1TP1PP
             PARALLEL_CONFIG: [1, 8, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: ""
+            ADDITIONAL_ARGS: "--save-hlo 1"
           - TEST_NAME: 8DP2FSDP4TP1PP
             PARALLEL_CONFIG: [1, 1, 2, 4]
             BATCH_SIZE: 4
             ADDITIONAL_ARGS: ""
-          - TEST_NAME: 8DP1FSDP1TP1PP_fmha
-            PARALLEL_CONFIG: [1, 8, 1, 1]
-            BATCH_SIZE: 4
-            ADDITIONAL_ARGS: "--save-hlo 1"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -204,7 +200,7 @@ jobs:
           - TEST_NAME: 2DP1FSDP1TP4PP
             PARALLEL_CONFIG: [4, 2, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: ""
+            ADDITIONAL_ARGS: "--save-hlo 1"
           - TEST_NAME: 4DP1FSDP2TP1PP
             PARALLEL_CONFIG: [1, 4, 1, 2]
             BATCH_SIZE: 4
@@ -212,7 +208,7 @@ jobs:
           - TEST_NAME: 16DP1FSDP1TP1PP
             PARALLEL_CONFIG: [1, 16, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: ""
+            ADDITIONAL_ARGS: "--save-hlo 1"   
           - TEST_NAME: 2DP1FSDP2TP4PP
             PARALLEL_CONFIG: [4, 2, 1, 2]
             BATCH_SIZE: 4
@@ -220,15 +216,7 @@ jobs:
             PARALLEL_CONFIG: [1, 1, 8, 1]
             BATCH_SIZE: 4
             EVALUATE: true
-            ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
-          - TEST_NAME: 2DP1FSDP1TP4PP_fmha
-            PARALLEL_CONFIG: [4, 2, 1, 1]
-            BATCH_SIZE: 4
-            ADDITIONAL_ARGS: "--save-hlo 1"
-          - TEST_NAME: 16DP1FSDP1TP1PP_fmha
-            PARALLEL_CONFIG: [1, 16, 1, 1]
-            BATCH_SIZE: 4
-            ADDITIONAL_ARGS: "--save-hlo 1"          
+            ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"       
       fail-fast: false
 
     runs-on: ubuntu-22.04