Stream SLURM job logs into the CI output while waiting for the job.

Each patched workflow step now:
  1. prints the partition status with `sshx sinfo`;
  2. starts a background loop that follows the sbatch log file with
     `tail -f` over `sshx`, retrying every 15 s while the file does not
     exist yet;
  3. polls `squeue` every 15 s until it reports no state for the job;
  4. stops the background log follower.

Review notes:
  * `kill $tail_pid` alone only signals the backgrounded loop subshell; the
    `sshx tail -f` child it is blocked on would be left running. The
    follower's children are now signalled in the same `kill`.
    NOTE(review): this assumes `sshx` is a shell function or thin wrapper
    whose ssh process is a direct child of the loop, and that `pgrep` is
    available on the runner -- confirm both.
  * The completion message read "SLRUM"; it now reads "SLURM" in all three
    workflows.
  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy of this patch; re-check indentation against the target files before
    applying. The post-image blob ids on the `index` lines are carried over
    from the previous revision of the patch and no longer match.

diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml
index b70e3c87a..e4f6bbfa7 100644
--- a/.github/workflows/_test_pax.yaml
+++ b/.github/workflows/_test_pax.yaml
@@ -108,14 +108,26 @@ jobs:
               ${{ inputs.EXTRA_TEST_ARGS }}
           EOF
           )
+          echo "SLURM Partition Status"
+          sshx sinfo
 
           set +x
-          while sshx squeue -j $JOB | grep -q $JOB; do
-            echo "SLURM Job $JOB is still running."
+          while true; do
+            sshx 'tail -f ${{ steps.meta.outputs.LOG_FILE }} 2>/dev/null || echo "[WARNING]: sbatch_output=${{ steps.meta.outputs.LOG_FILE }} not created yet"'
             sleep 15
-          done
+          done &
+          tail_pid=$!
 
-          echo "SLURM Job $JOB finished."
+          STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null")
+          while [[ -n "$STATE" ]]; do
+            STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null")
+            echo "$JOB is ${STATE:-n/a}"
+            sleep 15
+          done
+          echo "SLURM Job $JOB finished."
+          # Stop the log follower. Killing only the loop would orphan the child it
+          # is blocked on (sshx tail -f / sleep), so signal its children as well.
+          kill "$tail_pid" $(pgrep -P "$tail_pid" || true) 2>/dev/null || true
 
           # Gather job info
           SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1)
diff --git a/.github/workflows/_test_t5x.yaml b/.github/workflows/_test_t5x.yaml
index 4a4f9ab21..7e745b8d5 100644
--- a/.github/workflows/_test_t5x.yaml
+++ b/.github/workflows/_test_t5x.yaml
@@ -96,13 +96,26 @@ jobs:
               ${{ inputs.EXTRA_GIN_ARGS != '' && format('--additional-args "{0}"', inputs.EXTRA_GIN_ARGS) || '' }}
           EOF
           )
+          echo "SLURM Partition Status"
+          sshx sinfo
 
           set +x
-          while sshx squeue -j $JOB | grep -q $JOB; do
-            echo "SLURM Job $JOB is still running."
+          while true; do
+            sshx 'tail -f ${{ steps.meta.outputs.LOG_FILE }} 2>/dev/null || echo "[WARNING]: sbatch_output=${{ steps.meta.outputs.LOG_FILE }} not created yet"'
+            sleep 15
+          done &
+          tail_pid=$!
+
+          STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null")
+          while [[ -n "$STATE" ]]; do
+            STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null")
+            echo "$JOB is ${STATE:-n/a}"
             sleep 15
           done
-          echo "SLRUM Job $JOB finished."
+          echo "SLURM Job $JOB finished."
+          # Stop the log follower. Killing only the loop would orphan the child it
+          # is blocked on (sshx tail -f / sleep), so signal its children as well.
+          kill "$tail_pid" $(pgrep -P "$tail_pid" || true) 2>/dev/null || true
 
           # Gather job info
           SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1)
@@ -219,13 +232,26 @@ jobs:
               ${{ inputs.EXTRA_GIN_ARGS != '' && format('--additional-args "{0}"', inputs.EXTRA_GIN_ARGS) || '' }}
           EOF
           )
+          echo "SLURM Partition Status"
+          sshx sinfo
 
           set +x
-          while sshx squeue -j $JOB | grep -q $JOB; do
-            echo "SLURM Job $JOB is still running."
+          while true; do
+            sshx 'tail -f ${{ steps.meta.outputs.LOG_FILE }} 2>/dev/null || echo "[WARNING]: sbatch_output=${{ steps.meta.outputs.LOG_FILE }} not created yet"'
+            sleep 15
+          done &
+          tail_pid=$!
+
+          STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null")
+          while [[ -n "$STATE" ]]; do
+            STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null")
+            echo "$JOB is ${STATE:-n/a}"
             sleep 15
           done
-          echo "SLRUM Job $JOB finished."
+          echo "SLURM Job $JOB finished."
+          # Stop the log follower. Killing only the loop would orphan the child it
+          # is blocked on (sshx tail -f / sleep), so signal its children as well.
+          kill "$tail_pid" $(pgrep -P "$tail_pid" || true) 2>/dev/null || true
 
           # Gather job info
           SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1)
diff --git a/.github/workflows/_test_te.yaml b/.github/workflows/_test_te.yaml
index 3f7d571b9..dfda1d0b9 100644
--- a/.github/workflows/_test_te.yaml
+++ b/.github/workflows/_test_te.yaml
@@ -152,13 +152,26 @@ jobs:
               test_model_parallel_encoder.py'
           EOF
           )
+          echo "SLURM Partition Status"
+          sshx sinfo
 
           set +x
-          while sshx squeue -j $JOB | grep -q $JOB; do
-            echo "SLURM Job $JOB is still running."
+          while true; do
+            sshx 'tail -f ${{ steps.meta.outputs.SLURM_LOG_FILE }} 2>/dev/null || echo "[WARNING]: sbatch_output=${{ steps.meta.outputs.SLURM_LOG_FILE }} not created yet"'
+            sleep 15
+          done &
+          tail_pid=$!
+
+          STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null")
+          while [[ -n "$STATE" ]]; do
+            STATE=$(sshx "squeue -j $JOB -o '%T' --noheader 2>/dev/null")
+            echo "$JOB is ${STATE:-n/a}"
             sleep 15
           done
-          echo "SLRUM Job $JOB finished."
+          echo "SLURM Job $JOB finished."
+          # Stop the log follower. Killing only the loop would orphan the child it
+          # is blocked on (sshx tail -f / sleep), so signal its children as well.
+          kill "$tail_pid" $(pgrep -P "$tail_pid" || true) 2>/dev/null || true
           set -x
 
       - name: Retrieve training logs