From 1cae7e97811ea60247f197d69b7778da96fde9fe Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Mon, 22 May 2023 00:31:29 +0100 Subject: [PATCH 1/3] use nvcr.io for staging internal CI/CD containers --- .github/workflows/_build_base.yaml | 8 ++++---- .github/workflows/_build_jax.yaml | 8 ++++---- .github/workflows/_build_pax.yaml | 8 ++++---- .github/workflows/_build_rosetta.yaml | 8 ++++---- .github/workflows/_build_t5x.yaml | 8 ++++---- .github/workflows/_build_te.yaml | 8 ++++---- .github/workflows/_publish_nightly.yaml | 12 ++++++++++-- .github/workflows/_sandbox.yaml | 2 +- .github/workflows/_test_jax.yaml | 6 +++--- .github/workflows/_test_rosetta.yaml | 6 +++--- .github/workflows/_test_te.yaml | 6 +++--- 11 files changed, 44 insertions(+), 36 deletions(-) diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml index 8806a2f13..114c94942 100644 --- a/.github/workflows/_build_base.yaml +++ b/.github/workflows/_build_base.yaml @@ -19,7 +19,7 @@ on: value: ${{ jobs.build.outputs.DOCKER_TAGS }} env: - UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal + UPLD_IMAGE: nvcr.io/nvidian/jax-toolbox-internal permissions: contents: read # to fetch code @@ -42,9 +42,9 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v2 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} - name: Set docker metadata id: meta diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index e5374e9bc..a17b43b1d 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -39,7 +39,7 @@ on: value: ${{ jobs.build.outputs.DOCKER_TAGS }} env: - UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal + UPLD_IMAGE: nvcr.io/nvidian/jax-toolbox-internal permissions: contents: read # to fetch code @@ -62,9 +62,9 @@ jobs: - name: Login to GitHub Container
Registry uses: docker/login-action@v2 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} - name: Set docker metadata id: meta diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index cd4acd671..f0a180b7e 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -39,7 +39,7 @@ on: value: ${{ jobs.build.outputs.DOCKER_TAGS }} env: - UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal + UPLD_IMAGE: nvcr.io/nvidian/jax-toolbox-internal permissions: contents: read # to fetch code @@ -62,9 +62,9 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v2 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} - name: Set docker metadata id: meta diff --git a/.github/workflows/_build_rosetta.yaml b/.github/workflows/_build_rosetta.yaml index db4c0c71f..7b6f44763 100644 --- a/.github/workflows/_build_rosetta.yaml +++ b/.github/workflows/_build_rosetta.yaml @@ -23,7 +23,7 @@ on: value: ${{ jobs.build.outputs.DOCKER_TAGS }} env: - UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal + UPLD_IMAGE: nvcr.io/nvidian/jax-toolbox-internal DOCKER_REGISTRY: ghcr.io/nvidia permissions: @@ -55,9 +55,9 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v2 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} - name: Set docker metadata id: meta diff --git a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index 925eec78b..3839405e2 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -29,7 +29,7 @@ on: value: ${{ 
jobs.build.outputs.DOCKER_TAGS }} env: - UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal + UPLD_IMAGE: nvcr.io/nvidian/jax-toolbox-internal permissions: contents: read # to fetch code @@ -52,9 +52,9 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v2 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} - name: Set docker metadata id: meta diff --git a/.github/workflows/_build_te.yaml b/.github/workflows/_build_te.yaml index 1919be104..2d1e37fea 100644 --- a/.github/workflows/_build_te.yaml +++ b/.github/workflows/_build_te.yaml @@ -29,7 +29,7 @@ on: value: ${{ jobs.build.outputs.DOCKER_TAGS }} env: - UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal + UPLD_IMAGE: nvcr.io/nvidian/jax-toolbox-internal permissions: contents: read # to fetch code @@ -52,9 +52,9 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v2 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} - name: Set docker metadata id: meta diff --git a/.github/workflows/_publish_nightly.yaml b/.github/workflows/_publish_nightly.yaml index 3c497cebc..bb2084606 100644 --- a/.github/workflows/_publish_nightly.yaml +++ b/.github/workflows/_publish_nightly.yaml @@ -36,6 +36,13 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} + - name: Set docker metadata id: meta uses: docker/metadata-action@v4 @@ -48,8 +55,9 @@ jobs: shell: bash -x -e {0} run: | for tag in $(echo "${{ steps.meta.outputs.tags }}"); do - docker manifest create $tag ${{ inputs.SOURCE_IMAGE }} - docker manifest push 
$tag + docker pull ${{ inputs.SOURCE_IMAGE }} + docker tag ${{ inputs.SOURCE_IMAGE }} $tag + docker push $tag done - name: Generate outputs and artifacts diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 37fa6ca68..fef911e88 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -10,7 +10,7 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v2 with: - registry: ghcr.io + registry: nvcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/_test_jax.yaml b/.github/workflows/_test_jax.yaml index d98d4cf15..bc4de028c 100644 --- a/.github/workflows/_test_jax.yaml +++ b/.github/workflows/_test_jax.yaml @@ -23,9 +23,9 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v2 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} - name: Pull JAX image shell: bash -x -e {0} diff --git a/.github/workflows/_test_rosetta.yaml b/.github/workflows/_test_rosetta.yaml index ddf4700f1..4a220f98f 100644 --- a/.github/workflows/_test_rosetta.yaml +++ b/.github/workflows/_test_rosetta.yaml @@ -27,9 +27,9 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v2 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} - name: Pull Rosetta image shell: bash -x -e {0} diff --git a/.github/workflows/_test_te.yaml b/.github/workflows/_test_te.yaml index ce795ed03..b0f5a99e9 100644 --- a/.github/workflows/_test_te.yaml +++ b/.github/workflows/_test_te.yaml @@ -23,9 +23,9 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v2 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - 
password: ${{ secrets.GITHUB_TOKEN }} + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} - name: Pull JAX-TE image shell: bash -x -e {0} From dab0e7d3996dcf6c9d4185e8fe0e2556155b9472 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Mon, 22 May 2023 18:31:04 +0100 Subject: [PATCH 2/3] fix job step names --- .github/workflows/_build_base.yaml | 2 +- .github/workflows/_build_jax.yaml | 2 +- .github/workflows/_build_pax.yaml | 2 +- .github/workflows/_build_rosetta.yaml | 2 +- .github/workflows/_build_t5x.yaml | 2 +- .github/workflows/_build_te.yaml | 2 +- .github/workflows/_publish_nightly.yaml | 2 +- .github/workflows/_sandbox.yaml | 9 ++++++++- .github/workflows/_test_jax.yaml | 2 +- .github/workflows/_test_te.yaml | 2 +- 10 files changed, 17 insertions(+), 10 deletions(-) diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml index 114c94942..b7270d611 100644 --- a/.github/workflows/_build_base.yaml +++ b/.github/workflows/_build_base.yaml @@ -39,7 +39,7 @@ jobs: - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v3 - - name: Login to GitHub Container Registry + - name: Login to NVIDIA Container Registry uses: docker/login-action@v2 with: registry: nvcr.io diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index a17b43b1d..8cd2bf60b 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -59,7 +59,7 @@ jobs: - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v3 - - name: Login to GitHub Container Registry + - name: Login to NVIDIA Container Registry uses: docker/login-action@v2 with: registry: nvcr.io diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index f0a180b7e..55d7dcbe2 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -59,7 +59,7 @@ jobs: - name: Check out the repository under 
${GITHUB_WORKSPACE} uses: actions/checkout@v3 - - name: Login to GitHub Container Registry + - name: Login to NVIDIA Container Registry uses: docker/login-action@v2 with: registry: nvcr.io diff --git a/.github/workflows/_build_rosetta.yaml b/.github/workflows/_build_rosetta.yaml index 7b6f44763..69ddcec53 100644 --- a/.github/workflows/_build_rosetta.yaml +++ b/.github/workflows/_build_rosetta.yaml @@ -52,7 +52,7 @@ jobs: - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v3 - - name: Login to GitHub Container Registry + - name: Login to NVIDIA Container Registry uses: docker/login-action@v2 with: registry: nvcr.io diff --git a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index 3839405e2..baca203fe 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -49,7 +49,7 @@ jobs: - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v3 - - name: Login to GitHub Container Registry + - name: Login to NVIDIA Container Registry uses: docker/login-action@v2 with: registry: nvcr.io diff --git a/.github/workflows/_build_te.yaml b/.github/workflows/_build_te.yaml index 2d1e37fea..035a3aa33 100644 --- a/.github/workflows/_build_te.yaml +++ b/.github/workflows/_build_te.yaml @@ -49,7 +49,7 @@ jobs: - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v3 - - name: Login to GitHub Container Registry + - name: Login to NVIDIA Container Registry uses: docker/login-action@v2 with: registry: nvcr.io diff --git a/.github/workflows/_publish_nightly.yaml b/.github/workflows/_publish_nightly.yaml index bb2084606..6506e0d2f 100644 --- a/.github/workflows/_publish_nightly.yaml +++ b/.github/workflows/_publish_nightly.yaml @@ -36,7 +36,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Login to GitHub Container Registry + - name: Login to NVIDIA Container Registry uses: docker/login-action@v2 with: 
registry: nvcr.io diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index fef911e88..dde7f2357 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -10,10 +10,17 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v2 with: - registry: nvcr.io + registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Login to NVIDIA Container Registry + uses: docker/login-action@v2 + with: + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} + - name: Print usage run: | cat << EOF diff --git a/.github/workflows/_test_jax.yaml b/.github/workflows/_test_jax.yaml index bc4de028c..12ba58cf6 100644 --- a/.github/workflows/_test_jax.yaml +++ b/.github/workflows/_test_jax.yaml @@ -20,7 +20,7 @@ jobs: - name: Print GPU information run: nvidia-smi - - name: Login to GitHub Container Registry + - name: Login to NVIDIA Container Registry uses: docker/login-action@v2 with: registry: nvcr.io diff --git a/.github/workflows/_test_te.yaml b/.github/workflows/_test_te.yaml index b0f5a99e9..ba28f6333 100644 --- a/.github/workflows/_test_te.yaml +++ b/.github/workflows/_test_te.yaml @@ -20,7 +20,7 @@ jobs: - name: Print GPU information run: nvidia-smi - - name: Login to GitHub Container Registry + - name: Login to NVIDIA Container Registry uses: docker/login-action@v2 with: registry: nvcr.io From ad785001e2d9a49dd9e5e810d226f99f210c9948 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Mon, 22 May 2023 22:17:03 +0100 Subject: [PATCH 3/3] pass nvcr.io token to SLURM job --- .github/workflows/_test_pax.yaml | 2 +- .github/workflows/_test_t5x.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index 964d17ffb..974dc55f6 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -75,7 +75,7 @@ jobs: #SBATCH
--tasks-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} #SBATCH --time=00:30:00 #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - #SBATCH --export="VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model,ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" + #SBATCH --export="VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model,ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }},NVCR_TOKEN=${{ secrets.NVCR_TOKEN }}" time srun \ --container-image=${{ steps.meta.outputs.IMAGE }} \ --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ diff --git a/.github/workflows/_test_t5x.yaml b/.github/workflows/_test_t5x.yaml index 89e58acdf..6c1aac609 100644 --- a/.github/workflows/_test_t5x.yaml +++ b/.github/workflows/_test_t5x.yaml @@ -69,7 +69,7 @@ jobs: #SBATCH --gpus-per-node=${{ matrix.N_GPU }} #SBATCH --time=00:30:00 #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" + #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }},NVCR_TOKEN=${{ secrets.NVCR_TOKEN }}" time srun \ --container-image=${{ steps.meta.outputs.IMAGE }} \ --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ @@ -167,7 +167,7 @@ jobs: #SBATCH --tasks-per-node=${{ matrix.N_GPU }} #SBATCH --time=00:30:00 #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" + #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }},NVCR_TOKEN=${{ secrets.NVCR_TOKEN }}" time srun \ --container-image=${{ steps.meta.outputs.IMAGE }} \ --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \