diff --git a/ATLASExperiment.py b/ATLASExperiment.py index 9542de6c..10ed2dea 100644 --- a/ATLASExperiment.py +++ b/ATLASExperiment.py @@ -1439,6 +1439,13 @@ def interpretPayloadStdout(self, job, res, getstatusoutput_was_interrupted, curr else: job.pilotErrorDiag = "Payload failed due to unknown reason (check payload stdout)" job.result[2] = error.ERR_UNKNOWN + + # Any errors due to signals can be ignored if the job was killed because of out of memory + if os.path.exists(os.path.join(job.workdir, "MEMORYEXCEEDED")): + tolog("Ignoring any previously detected errors (like signals) since MEMORYEXCEEDED file was found") + job.pilotErrorDiag = "Payload exceeded maximum allowed memory" + job.result[2] = error.ERR_PAYLOADEXCEEDMAXMEM + tolog("!!FAILED!!3000!! %s" % (job.pilotErrorDiag)) # set the trf diag error diff --git a/CHANGES b/CHANGES index 69199e1d..5404aff7 100644 --- a/CHANGES +++ b/CHANGES @@ -113,6 +113,10 @@ Log tailing (requested by R. Walker) list_replicas() - Specifying --pfn in rucio download, stageIn(), which will prevent list_replicas() from being used on server side (rucio_sitemover) +Overwritten "Payload exceeded max allowed memory" fix +- Now setting ERR_PAYLOADEXCEEDMAXMEM if a MEMORYEXCEEDED file is detected at the end of interpretPayloadStdout(), to prevent + a signal error from being set instead. Requested by R. Walker (ATLASExperiment) + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// TODO: