Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

WIP switch GPU workers to image that uses multi-engine generic worker #700

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions pipeline/translate/requirements/translate-ctranslate2.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
ctranslate2==4.3.1
sentencepiece==0.2.0
gpustat==1.1.1
requests==2.32.3
362 changes: 242 additions & 120 deletions pipeline/translate/requirements/translate-ctranslate2.txt

Large diffs are not rendered by default.

28 changes: 14 additions & 14 deletions taskcluster/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,39 +93,39 @@ workers:
worker-type: 'b-linux-large-gcp-1tb-64-512-std-d2g'
b-linux-v100-gpu:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4'
b-linux-v100-gpu-4:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4'
b-linux-v100-gpu-4-300gb:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-300gb-standard:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-1tb:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-2tb:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-1tb-standard:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
images:
provisioner: '{trust-domain}-{level}'
implementation: docker-worker
Expand Down
3 changes: 2 additions & 1 deletion taskcluster/docker/train/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ RUN apt-get update -qq \
wget \
pkg-config \
libicu-dev \
software-properties-common \
&& apt-get clean


VOLUME /builds/worker/checkouts
VOLUME /builds/worker/.cache
4 changes: 3 additions & 1 deletion taskcluster/kinds/bicleaner/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,10 @@ tasks:

worker-type: b-largegpu-largedisk
worker:
docker-image: {"in-tree": "train"}
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory
# 7 days. yes, it can take a while to clean a huge dataset
max-run-time: 604800
Expand All @@ -88,6 +89,7 @@ tasks:

run:
using: run-task
cache-dotcache: true
command:
- bash
- -c
Expand Down
8 changes: 6 additions & 2 deletions taskcluster/kinds/evaluate-quantized/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ tasks:

worker-type: b-gpu
worker:
docker-image: {"in-tree": "train"}
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory
max-run-time: 2592000
env:
Expand Down Expand Up @@ -84,12 +85,14 @@ tasks:

run:
using: run-task
cache-dotcache: true
command:
- bash
- -c
- >-
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
Expand All @@ -102,7 +105,7 @@ tasks:
--vocab "$MOZ_FETCHES_DIR/vocab.spm"
--shortlist "$MOZ_FETCHES_DIR/lex.s2t.pruned"
--artifacts_prefix "$TASK_WORKDIR/artifacts/{dataset_sanitized}"
--marian_config "$TASK_WORKDIR/$VCS_PATH/pipeline/quantize/decoder.yml"
--marian_config "$VCS_PATH/pipeline/quantize/decoder.yml"
--marian "$BMT_MARIAN"
--gpus "$GPUS"
--model_variant quantized
Expand All @@ -129,3 +132,4 @@ tasks:
# Quantized models are only supported via the browsermt fork of Marian.
# https://github.com/browsermt/marian-dev
- browsermt-marian
- cuda-toolkit
6 changes: 5 additions & 1 deletion taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ tasks:

worker-type: b-gpu
worker:
docker-image: {"in-tree": "train"}
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory
max-run-time: 2592000
env:
Expand Down Expand Up @@ -84,6 +85,7 @@ tasks:

run:
using: run-task
cache-dotcache: true
# The two sed commands here are the unfortunate result of us consuming
# a marian config that was produced by an earlier step. These configs
# have hardcoded absolute paths to the models they were trained on,
Expand All @@ -96,6 +98,7 @@ tasks:
- >-
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
Expand Down Expand Up @@ -140,3 +143,4 @@ tasks:
extract: false
toolchain:
- marian
- cuda-toolkit
9 changes: 8 additions & 1 deletion taskcluster/kinds/evaluate/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,10 @@ task-defaults:
owner: owner
worker-type: b-gpu
worker:
docker-image: {"in-tree": "train"}
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory
max-run-time: 2592000
env:
Expand Down Expand Up @@ -82,6 +83,7 @@ task-defaults:

run:
using: run-task
cache-dotcache: true
# The two sed commands here are the unfortunate result of us consuming
# a marian config that was produced by an earlier step. These configs
# have hardcoded absolute paths to the models they were trained on,
Expand All @@ -94,6 +96,7 @@ task-defaults:
- >-
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
Expand Down Expand Up @@ -145,6 +148,7 @@ tasks:
extract: false
toolchain:
- marian
- cuda-toolkit

teacher-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}-{this_chunk}:
description: teacher evaluation for {dataset} {src_locale}-{trg_locale} {this_chunk}
Expand Down Expand Up @@ -190,6 +194,7 @@ tasks:
extract: false
toolchain:
- marian
- cuda-toolkit

student-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}:
description: student evaluation for {dataset_sanitized} {src_locale}-{trg_locale}
Expand Down Expand Up @@ -224,6 +229,7 @@ tasks:
extract: false
toolchain:
- marian
- cuda-toolkit

finetuned-student-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}:
description: finetuned-student evaluation for {dataset_sanitized} {src_locale}-{trg_locale}
Expand Down Expand Up @@ -258,3 +264,4 @@ tasks:
extract: false
toolchain:
- marian
- cuda-toolkit
8 changes: 4 additions & 4 deletions taskcluster/kinds/fetch/toolchains.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ extract-lex:
revision: 42fa605b53f32eaf6c6e0b5677255c21c91b3d49

cuda:
description: CUDA 12.1.0 Source
description: CUDA 12.6.3 Source
fetch:
type: static-url
url: https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run # yamllint disable-line rule:line-length
sha256: 68699036c12d71adb9ad2799dce2ff070270fab4488b90920b9756ab3f52c41c
size: 4245586997
url: https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run # yamllint disable-line rule:line-length
sha256: 81d60e48044796d7883aa8a049afe6501b843f2c45639b3703b2378de30d55d3
size: 4446722669
artifact-name: cuda-source.run


Expand Down
6 changes: 5 additions & 1 deletion taskcluster/kinds/finetune-student/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ tasks:

worker-type: b-largegpu-xxlargedisk
worker:
docker-image: {"in-tree": "train"}
max-run-time: 2592000
# train_taskcluster.py exits with 17 if a request to Taskcluster fails
# 128 happens when cloning this repository fails
Expand All @@ -75,7 +76,7 @@ tasks:
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory

# Taskcluster proxy is required to read secrets
Expand All @@ -89,6 +90,7 @@ tasks:
from-parameters: training_config.marian-args.training-student-finetuned
run:
using: run-task
cache-dotcache: true
command:
- bash
- -cx
Expand All @@ -99,6 +101,7 @@ tasks:
export PATH="$HOME/.local/bin:$PATH" &&
export MARIAN=$MOZ_FETCHES_DIR &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
$VCS_PATH/taskcluster/scripts/pipeline/train_taskcluster.py
student
finetune
Expand Down Expand Up @@ -127,6 +130,7 @@ tasks:
fetches:
toolchain:
- marian
- cuda-toolkit
train-vocab:
- artifact: vocab.spm
extract: false
Expand Down
6 changes: 5 additions & 1 deletion taskcluster/kinds/score/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ tasks:

worker-type: b-largegpu-largedisk
worker:
docker-image: {"in-tree": "train"}
max-run-time: 2592000
env:
# TODO: what should we _actually_ use for the workspace value?
Expand All @@ -59,7 +60,7 @@ tasks:
TRG: "{trg_locale}"
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory
# 128 happens when cloning this repository fails
retry-exit-status: [128]
Expand All @@ -69,11 +70,13 @@ tasks:

run:
using: run-task
cache-dotcache: true
command:
- bash
- -c
- >-
export MARIAN=$MOZ_FETCHES_DIR &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
find fetches &&
$VCS_PATH/pipeline/cefilter/score.sh
$TASK_WORKDIR/fetches/final.model.npz.best-{best_model}.npz
Expand All @@ -88,6 +91,7 @@ tasks:
fetches:
toolchain:
- marian
- cuda-toolkit
train-backwards:
- artifact: final.model.npz.best-{best_model}.npz
extract: false
Expand Down
6 changes: 5 additions & 1 deletion taskcluster/kinds/train-backwards/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ tasks:
github-pull-request: b-largegpu
default: b-largegpu-largedisk
worker:
docker-image: {"in-tree": "train"}
max-run-time: 2592000
# train_taskcluster.py exits with 17 if a request to Taskcluster fails
retry-exit-status: [17]
Expand All @@ -74,7 +75,7 @@ tasks:
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory

# Taskcluster proxy is required to read secrets
Expand All @@ -94,6 +95,7 @@ tasks:

run:
using: run-task
cache-dotcache: true
command:
- bash
- -cx
Expand All @@ -104,6 +106,7 @@ tasks:
export PATH="$HOME/.local/bin:$PATH" &&
export MARIAN=$MOZ_FETCHES_DIR &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
$VCS_PATH/taskcluster/scripts/pipeline/train_taskcluster.py
backward
train
Expand All @@ -129,6 +132,7 @@ tasks:
fetches:
toolchain:
- marian
- cuda-toolkit
merge-corpus:
- artifact: corpus.{src_locale}.zst
extract: false
Expand Down
6 changes: 5 additions & 1 deletion taskcluster/kinds/train-student/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ tasks:
github-pull-request: b-largegpu-largedisk
default: b-largegpu-xxlargedisk
worker:
docker-image: {"in-tree": "train"}
max-run-time: 2592000
# train_taskcluster.py exits with 17 if a request to Taskcluster fails
# 128 happens when cloning this repository fails
Expand All @@ -73,7 +74,7 @@ tasks:
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory

# Taskcluster proxy is required to read secrets
Expand All @@ -90,6 +91,7 @@ tasks:
from-parameters: training_config.marian-args.training-student
run:
using: run-task
cache-dotcache: true
command:
- bash
- -cx
Expand All @@ -100,6 +102,7 @@ tasks:
export PATH="$HOME/.local/bin:$PATH" &&
export MARIAN=$MOZ_FETCHES_DIR &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MOZ_FETCHES_DIR/cuda-toolkit/lib64" &&
$VCS_PATH/taskcluster/scripts/pipeline/train_taskcluster.py
student
train
Expand All @@ -125,6 +128,7 @@ tasks:
fetches:
toolchain:
- marian
- cuda-toolkit
train-vocab:
- artifact: vocab.spm
extract: false
Expand Down
Loading