From 1eb5fb1ca3387cd174d0d11eda4ed77668353f64 Mon Sep 17 00:00:00 2001 From: ted chang Date: Mon, 8 Apr 2024 16:25:59 -0700 Subject: [PATCH] Update fms-hf-tuning e2e test Signed-off-by: ted chang --- test/e2e/config.json | 4 ++++ test/e2e/kfto_kueue_sft_test.go | 6 +++--- test/e2e/setup.sh | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test/e2e/config.json b/test/e2e/config.json index 1c436666ba..2470b68a38 100644 --- a/test/e2e/config.json +++ b/test/e2e/config.json @@ -1,4 +1,8 @@ { + "accelerate_launch_args": { + "num_processes": 2, + "cpu": true + }, "model_name_or_path": "bigscience/bloom-560m", "training_data_path": "/etc/config/twitter_complaints_small.json", "output_dir": "/tmp/out", diff --git a/test/e2e/kfto_kueue_sft_test.go b/test/e2e/kfto_kueue_sft_test.go index b7c363b61c..684aec94b0 100644 --- a/test/e2e/kfto_kueue_sft_test.go +++ b/test/e2e/kfto_kueue_sft_test.go @@ -1,4 +1,4 @@ -//go:build ignore +//go:build tuning_e2e /* Copyright 2023. @@ -94,9 +94,9 @@ func TestPytorchjobWithSFTtrainer(t *testing.T) { Containers: []corev1.Container{ { Name: "pytorch", - Image: "quay.io/tedchang/sft-trainer:dev", + Image: "quay.io/modh/fms-hf-tuning:5d8789723ec58ac1bc9c2df704395f162fed974a", ImagePullPolicy: corev1.PullIfNotPresent, - Command: []string{"python", "/app/launch_training.py"}, + Command: []string{"python", "/app/accelerate_launch.py"}, Env: []corev1.EnvVar{ { Name: "SFT_TRAINER_CONFIG_JSON_PATH", diff --git a/test/e2e/setup.sh b/test/e2e/setup.sh index 2a6ced2d3a..8ed9286466 100755 --- a/test/e2e/setup.sh +++ b/test/e2e/setup.sh @@ -19,4 +19,4 @@ docker build -t ${KFTO_IMG} -f ${BASE_DIR}/build/images/training-operator/Docker echo "Load training operator image into cluster" kind load --name training-operator-cluster docker-image training-operator:dev -KFTO_IMG=training-operator:dev make deploy \ No newline at end of file +IMG=training-operator:dev make deploy \ No newline at end of file