Skip to content

Commit

Permalink
[PENG-2342] Jobbergate Agent continually resubmits the same job if the job status update fails (#607)
Browse files Browse the repository at this point in the history

* feat(agent): cache slurm id after submissions
  • Loading branch information
fschuch authored Sep 5, 2024
1 parent 1223c89 commit 119c973
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 4 deletions.
1 change: 1 addition & 0 deletions jobbergate-agent/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ This file keeps track of all notable changes to jobbergate-agent

## Unreleased
- Fixed an issue when downloading job script files for job submission with long names
- Cache the Slurm job ID after each submission to avoid resubmitting the same job if the job status update fails [PENG-2342]

## 5.3.0a5 -- 2024-08-30
## 5.3.0a4 -- 2024-08-23
Expand Down
10 changes: 9 additions & 1 deletion jobbergate-agent/jobbergate_agent/jobbergate/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,9 +281,17 @@ async def submit_pending_jobs() -> None:
do_else=lambda: logger.debug(f"Finished submitting pending job_submission {pending_job_submission.id}"),
re_raise=False,
):
slurm_job_id = await submit_job_script(pending_job_submission, user_mapper)
cache_file = SETTINGS.CACHE_DIR / f"{pending_job_submission.id}.slurm_job_id"
if cache_file.exists():
logger.debug(f"Found cache file for job submission {pending_job_submission.id}")
slurm_job_id = int(cache_file.read_text())
else:
slurm_job_id = await submit_job_script(pending_job_submission, user_mapper)
cache_file.write_text(str(slurm_job_id))

slurm_job_data: SlurmJobData = await fetch_job_data(slurm_job_id, info_handler)

await mark_as_submitted(pending_job_submission.id, slurm_job_id, slurm_job_data)
cache_file.unlink(missing_ok=True)

logger.debug("...Finished submitting pending jobs")
24 changes: 21 additions & 3 deletions jobbergate-agent/tests/jobbergate/test_submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,8 @@ async def test_submit_job_script__raises_exception_if_sbatch_fails(
@pytest.mark.asyncio
@pytest.mark.usefixtures("mock_access_token")
async def test_submit_pending_jobs(
tweak_settings,
tmp_path,
dummy_job_script_files,
dummy_template_source,
mocker,
Expand Down Expand Up @@ -666,6 +668,12 @@ async def test_submit_pending_jobs(
owner_email="email3@dummy.com",
job_script={"files": dummy_job_script_files},
),
PendingJobSubmission(
id=4,
name="sub4",
owner_email="email4@dummy.com",
job_script={"files": dummy_job_script_files},
),
]

mocker.patch(
Expand All @@ -678,7 +686,7 @@ def _mocked_submit_job_script(pending_job_submission: PendingJobSubmission, user
raise Exception("BOOM!")
return pending_job_submission.id * 11

def _mocked_mark_as_submitted(job_submission_id: int, slurm_job_id: int):
def _mocked_mark_as_submitted(job_submission_id: int, slurm_job_id: int, slurm_job_data: SlurmJobData):
if job_submission_id == 2:
raise Exception("BANG!")

Expand All @@ -701,7 +709,10 @@ def _mocked_mark_as_submitted(job_submission_id: int, slurm_job_id: int):

test_mapper = manufacture()

await submit_pending_jobs()
with tweak_settings(CACHE_DIR=tmp_path):
cached_submissions = {sub.id: tmp_path / f"{sub.id}.slurm_job_id" for sub in pending_submissions}
cached_submissions[4].write_text("44")
await submit_pending_jobs()

mock_submit.assert_has_calls(
[
Expand All @@ -716,6 +727,13 @@ def _mocked_mark_as_submitted(job_submission_id: int, slurm_job_id: int):
[
mocker.call(1, 11, SlurmJobData(job_state="RUNNING", job_info="{}")),
mocker.call(2, 22, SlurmJobData(job_state="RUNNING", job_info="{}")),
mocker.call(4, 44, SlurmJobData(job_state="RUNNING", job_info="{}")),
]
)
assert mock_mark.call_count == 2
assert mock_mark.call_count == 3

assert not cached_submissions[1].exists()
assert cached_submissions[2].exists()
assert cached_submissions[2].read_text() == "22"
assert not cached_submissions[3].exists()
assert not cached_submissions[4].exists()

0 comments on commit 119c973

Please sign in to comment.