Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

WIP switch GPU workers to image that uses multi-engine generic worker #700

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions pipeline/translate/requirements/translate-ctranslate2.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
ctranslate2==4.3.1
sentencepiece==0.2.0
gpustat==1.1.1
requests==2.32.3
362 changes: 242 additions & 120 deletions pipeline/translate/requirements/translate-ctranslate2.txt

Large diffs are not rendered by default.

28 changes: 14 additions & 14 deletions taskcluster/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,39 +93,39 @@ workers:
worker-type: 'b-linux-large-gcp-1tb-64-512-std-d2g'
b-linux-v100-gpu:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4'
b-linux-v100-gpu-4:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4'
b-linux-v100-gpu-4-300gb:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-300gb-standard:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-1tb:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-2tb:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-1tb-standard:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
images:
provisioner: '{trust-domain}-{level}'
implementation: docker-worker
Expand Down
3 changes: 2 additions & 1 deletion taskcluster/docker/train/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ RUN apt-get update -qq \
wget \
pkg-config \
libicu-dev \
software-properties-common \
&& apt-get clean


VOLUME /builds/worker/checkouts
VOLUME /builds/worker/.cache
4 changes: 3 additions & 1 deletion taskcluster/kinds/bicleaner/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,10 @@ tasks:

worker-type: b-largegpu-largedisk
worker:
docker-image: {"in-tree": "train"}
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory
# 7 days. yes, it can take a while to clean a huge dataset
max-run-time: 604800
Expand All @@ -88,6 +89,7 @@ tasks:

run:
using: run-task
cache-dotcache: true
command:
- bash
- -c
Expand Down
8 changes: 6 additions & 2 deletions taskcluster/kinds/evaluate-quantized/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ tasks:

worker-type: b-gpu
worker:
docker-image: {"in-tree": "train"}
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory
max-run-time: 2592000
env:
Expand Down Expand Up @@ -84,12 +85,14 @@ tasks:

run:
using: run-task
cache-dotcache: true
command:
- bash
- -c
- >-
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
Expand All @@ -102,7 +105,7 @@ tasks:
--vocab "$MOZ_FETCHES_DIR/vocab.spm"
--shortlist "$MOZ_FETCHES_DIR/lex.s2t.pruned"
--artifacts_prefix "$TASK_WORKDIR/artifacts/{dataset_sanitized}"
--marian_config "$TASK_WORKDIR/$VCS_PATH/pipeline/quantize/decoder.yml"
--marian_config "$VCS_PATH/pipeline/quantize/decoder.yml"
--marian "$BMT_MARIAN"
--gpus "$GPUS"
--model_variant quantized
Expand All @@ -129,3 +132,4 @@ tasks:
# Quantized models are only supported via the browsermt fork of Marian.
# https://github.com/browsermt/marian-dev
- browsermt-marian
- cuda-toolkit
6 changes: 5 additions & 1 deletion taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ tasks:

worker-type: b-gpu
worker:
docker-image: {"in-tree": "train"}
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory
max-run-time: 2592000
env:
Expand Down Expand Up @@ -84,6 +85,7 @@ tasks:

run:
using: run-task
cache-dotcache: true
# The two sed commands here are the unfortunate result of us consuming
# a marian config that was produced by an earlier step. These configs
# have hardcoded absolute paths to the models they were trained on,
Expand All @@ -96,6 +98,7 @@ tasks:
- >-
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
Expand Down Expand Up @@ -140,3 +143,4 @@ tasks:
extract: false
toolchain:
- marian
- cuda-toolkit
9 changes: 8 additions & 1 deletion taskcluster/kinds/evaluate/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,10 @@ task-defaults:
owner: owner
worker-type: b-gpu
worker:
docker-image: {"in-tree": "train"}
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory
max-run-time: 2592000
env:
Expand Down Expand Up @@ -82,6 +83,7 @@ task-defaults:

run:
using: run-task
cache-dotcache: true
# The two sed commands here are the unfortunate result of us consuming
# a marian config that was produced by an earlier step. These configs
# have hardcoded absolute paths to the models they were trained on,
Expand All @@ -94,6 +96,7 @@ task-defaults:
- >-
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
Expand Down Expand Up @@ -145,6 +148,7 @@ tasks:
extract: false
toolchain:
- marian
- cuda-toolkit

teacher-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}-{this_chunk}:
description: teacher evaluation for {dataset} {src_locale}-{trg_locale} {this_chunk}
Expand Down Expand Up @@ -190,6 +194,7 @@ tasks:
extract: false
toolchain:
- marian
- cuda-toolkit

student-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}:
description: student evaluation for {dataset_sanitized} {src_locale}-{trg_locale}
Expand Down Expand Up @@ -224,6 +229,7 @@ tasks:
extract: false
toolchain:
- marian
- cuda-toolkit

finetuned-student-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}:
description: finetuned-student evaluation for {dataset_sanitized} {src_locale}-{trg_locale}
Expand Down Expand Up @@ -258,3 +264,4 @@ tasks:
extract: false
toolchain:
- marian
- cuda-toolkit
8 changes: 4 additions & 4 deletions taskcluster/kinds/fetch/toolchains.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ extract-lex:
revision: 42fa605b53f32eaf6c6e0b5677255c21c91b3d49

cuda:
description: CUDA 12.1.0 Source
description: CUDA 12.6.3 Source
fetch:
type: static-url
url: https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run # yamllint disable-line rule:line-length
sha256: 68699036c12d71adb9ad2799dce2ff070270fab4488b90920b9756ab3f52c41c
size: 4245586997
url: https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run # yamllint disable-line rule:line-length
sha256: 81d60e48044796d7883aa8a049afe6501b843f2c45639b3703b2378de30d55d3
size: 4446722669
artifact-name: cuda-source.run


Expand Down
6 changes: 5 additions & 1 deletion taskcluster/kinds/finetune-student/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ tasks:

worker-type: b-largegpu-xxlargedisk
worker:
docker-image: {"in-tree": "train"}
max-run-time: 2592000
# train_taskcluster.py exits with 17 if a request to Taskcluster fails
# 128 happens when cloning this repository fails
Expand All @@ -75,7 +76,7 @@ tasks:
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory

# Taskcluster proxy is required to read secrets
Expand All @@ -89,6 +90,7 @@ tasks:
from-parameters: training_config.marian-args.training-student-finetuned
run:
using: run-task
cache-dotcache: true
command:
- bash
- -cx
Expand All @@ -99,6 +101,7 @@ tasks:
export PATH="$HOME/.local/bin:$PATH" &&
export MARIAN=$MOZ_FETCHES_DIR &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
$VCS_PATH/taskcluster/scripts/pipeline/train_taskcluster.py
student
finetune
Expand Down Expand Up @@ -127,6 +130,7 @@ tasks:
fetches:
toolchain:
- marian
- cuda-toolkit
train-vocab:
- artifact: vocab.spm
extract: false
Expand Down
6 changes: 5 additions & 1 deletion taskcluster/kinds/score/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ tasks:

worker-type: b-largegpu-largedisk
worker:
docker-image: {"in-tree": "train"}
max-run-time: 2592000
env:
# TODO: what should we _actually_ use for the workspace value?
Expand All @@ -59,7 +60,7 @@ tasks:
TRG: "{trg_locale}"
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory
# 128 happens when cloning this repository fails
retry-exit-status: [128]
Expand All @@ -69,11 +70,13 @@ tasks:

run:
using: run-task
cache-dotcache: true
command:
- bash
- -c
- >-
export MARIAN=$MOZ_FETCHES_DIR &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
find fetches &&
$VCS_PATH/pipeline/cefilter/score.sh
$TASK_WORKDIR/fetches/final.model.npz.best-{best_model}.npz
Expand All @@ -88,6 +91,7 @@ tasks:
fetches:
toolchain:
- marian
- cuda-toolkit
train-backwards:
- artifact: final.model.npz.best-{best_model}.npz
extract: false
Expand Down
6 changes: 5 additions & 1 deletion taskcluster/kinds/train-backwards/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ tasks:
github-pull-request: b-largegpu
default: b-largegpu-largedisk
worker:
docker-image: {"in-tree": "train"}
max-run-time: 2592000
# train_taskcluster.py exits with 17 if a request to Taskcluster fails
retry-exit-status: [17]
Expand All @@ -74,7 +75,7 @@ tasks:
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory

# Taskcluster proxy is required to read secrets
Expand All @@ -94,6 +95,7 @@ tasks:

run:
using: run-task
cache-dotcache: true
command:
- bash
- -cx
Expand All @@ -104,6 +106,7 @@ tasks:
export PATH="$HOME/.local/bin:$PATH" &&
export MARIAN=$MOZ_FETCHES_DIR &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
$VCS_PATH/taskcluster/scripts/pipeline/train_taskcluster.py
backward
train
Expand All @@ -129,6 +132,7 @@ tasks:
fetches:
toolchain:
- marian
- cuda-toolkit
merge-corpus:
- artifact: corpus.{src_locale}.zst
extract: false
Expand Down
6 changes: 5 additions & 1 deletion taskcluster/kinds/train-student/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ tasks:
github-pull-request: b-largegpu-largedisk
default: b-largegpu-xxlargedisk
worker:
docker-image: {"in-tree": "train"}
max-run-time: 2592000
# train_taskcluster.py exits with 17 if a request to Taskcluster fails
# 128 happens when cloning this repository fails
Expand All @@ -73,7 +74,7 @@ tasks:
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory

# Taskcluster proxy is required to read secrets
Expand All @@ -90,6 +91,7 @@ tasks:
from-parameters: training_config.marian-args.training-student
run:
using: run-task
cache-dotcache: true
command:
- bash
- -cx
Expand All @@ -100,6 +102,7 @@ tasks:
export PATH="$HOME/.local/bin:$PATH" &&
export MARIAN=$MOZ_FETCHES_DIR &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
$VCS_PATH/taskcluster/scripts/pipeline/train_taskcluster.py
student
train
Expand All @@ -125,6 +128,7 @@ tasks:
fetches:
toolchain:
- marian
- cuda-toolkit
train-vocab:
- artifact: vocab.spm
extract: false
Expand Down
Loading