From 414d29e40e75492ee37ab1fc2a6c9ee8ef76f68b Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Mon, 24 Jul 2023 11:31:33 -0700 Subject: [PATCH] update slurm logging on all slurm workflows to tail the log file and print partition and state info --- .github/workflows/_test_pax.yaml | 18 ++++++++++++++---- .github/workflows/_test_t5x.yaml | 30 ++++++++++++++++++++++++++---- .github/workflows/_test_te.yaml | 15 +++++++++++++-- 3 files changed, 53 insertions(+), 10 deletions(-) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index f84d1a494..adee918cc 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -96,14 +96,24 @@ jobs: $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) EOF ) + echo "SLURM Partition Status" + sshx sinfo set +x - while sshx squeue -j $JOB | grep -q $JOB; do - echo "SLURM Job $JOB is still running." + while true; do + sshx 'tail -f ${{ steps.meta.outputs.LOG_FILE }} 2>/dev/null || echo "[WARNING]: sbatch_output=${{ steps.meta.outputs.LOG_FILE }} not created yet"' sleep 15 - done + done & + tail_pid=$! - echo "SLURM Job $JOB finished." + STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null") + while [[ -n "$STATE" ]]; do + STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null") + echo "$JOB is ${STATE:-n/a}" + sleep 15 + done + echo "SLURM Job $JOB finished." + kill $tail_pid CKPT_PATH=${{ steps.meta.outputs.MODEL_PATH }}/${{ steps.meta.outputs.TEST_CASE_NAME }}/checkpoints if sshx -q [[ -d $CKPT_PATH ]]; then sshx rm -r $CKPT_PATH; fi diff --git a/.github/workflows/_test_t5x.yaml b/.github/workflows/_test_t5x.yaml index 411cdd927..4a66a3a31 100644 --- a/.github/workflows/_test_t5x.yaml +++ b/.github/workflows/_test_t5x.yaml @@ -86,13 +86,24 @@ jobs: --steps-per-epoch 100 EOF ) + echo "SLURM Partition Status" + sshx sinfo set +x - while sshx squeue -j $JOB | grep -q $JOB; do - echo "SLURM Job $JOB is still running."
+ while true; do + sshx 'tail -f ${{ steps.meta.outputs.LOG_FILE }} 2>/dev/null || echo "[WARNING]: sbatch_output=${{ steps.meta.outputs.LOG_FILE }} not created yet"' + sleep 15 + done & + tail_pid=$! + + STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null") + while [[ -n "$STATE" ]]; do + STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null") + echo "$JOB is ${STATE:-n/a}" sleep 15 done echo "SLRUM Job $JOB finished." + kill $tail_pid set -x - name: Retrieve training logs and upload to TensorBoard server @@ -189,13 +200,24 @@ jobs: --multiprocess EOF ) + echo "SLURM Partition Status" + sshx sinfo set +x - while sshx squeue -j $JOB | grep -q $JOB; do - echo "SLURM Job $JOB is still running." + while true; do + sshx 'tail -f ${{ steps.meta.outputs.LOG_FILE }} 2>/dev/null || echo "[WARNING]: sbatch_output=${{ steps.meta.outputs.LOG_FILE }} not created yet"' + sleep 15 + done & + tail_pid=$! + + STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null") + while [[ -n "$STATE" ]]; do + STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null") + echo "$JOB is ${STATE:-n/a}" sleep 15 done echo "SLRUM Job $JOB finished." + kill $tail_pid set -x - name: Retrieve training logs and upload to TensorBoard server diff --git a/.github/workflows/_test_te.yaml b/.github/workflows/_test_te.yaml index 399631c3f..ab7e5f7df 100644 --- a/.github/workflows/_test_te.yaml +++ b/.github/workflows/_test_te.yaml @@ -133,13 +133,24 @@ jobs: test_model_parallel_encoder.py' EOF ) + echo "SLURM Partition Status" + sshx sinfo set +x - while sshx squeue -j $JOB | grep -q $JOB; do - echo "SLURM Job $JOB is still running." + while true; do + sshx 'tail -f ${{ steps.meta.outputs.SLURM_LOG_FILE }} 2>/dev/null || echo "[WARNING]: sbatch_output=${{ steps.meta.outputs.SLURM_LOG_FILE }} not created yet"' + sleep 15 + done & + tail_pid=$! 
+ + STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null") + while [[ -n "$STATE" ]]; do + STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null") + echo "$JOB is ${STATE:-n/a}" sleep 15 done echo "SLRUM Job $JOB finished." + kill $tail_pid set -x - name: Retrieve training logs