From cc1436f623f512b2d42759ddfe5a559ec7adf699 Mon Sep 17 00:00:00 2001 From: Andrew Thelen Date: Tue, 16 Jan 2024 11:22:30 -0500 Subject: [PATCH 1/3] add time_estimate_buffer for when parallel remote components have very different evaluation times --- mphys/network/remote_component.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mphys/network/remote_component.py b/mphys/network/remote_component.py index 2800a05c..6de888bc 100644 --- a/mphys/network/remote_component.py +++ b/mphys/network/remote_component.py @@ -18,6 +18,9 @@ def stop_server(self): def initialize(self): self.options.declare('run_server_filename', default="mphys_server.py", desc="python file that will launch the Server class") self.options.declare('time_estimate_multiplier', default=2.0, desc="when determining whether to reboot the server, estimate model run time as this times max prior run time") + self.options.declare('time_estimate_buffer', default=0.0, types=float, desc="constant time in seconds to add to model evaluation estimate. " + +"When using parallel remote components with very different evaluation times, setting to slowest component's " + +"estimated evaluation time avoids having the faster component's job expire while the slower one is being evaluated") self.options.declare('reboot_only_on_function_call', default=True, desc="only allows server reboot before function call, not gradient call. 
" +"Avoids having to rerun forward solution on next job, but shortens current job time") self.options.declare('dump_json', default=False, desc="dump input/output json file in client") @@ -31,6 +34,7 @@ def setup(self): if self.comm.size>1: raise SystemError('Using Remote Component on more than 1 rank is not supported') self.time_estimate_multiplier = self.options['time_estimate_multiplier'] + self.time_estimate_buffer = self.options['time_estimate_buffer'] self.reboot_only_on_function_call = self.options['reboot_only_on_function_call'] self.dump_json = self.options['dump_json'] self.dump_separate_json = self.options['dump_separate_json'] @@ -150,16 +154,16 @@ def _need_to_restart_server(self, command: str): if self._is_first_gradient_evaluation() or self.reboot_only_on_function_call: return False else: - estimated_model_time = self.time_estimate_multiplier*max(self.times_gradient) + estimated_model_time = self.time_estimate_multiplier*max(self.times_gradient) + self.time_estimate_buffer else: if self._is_first_function_evaluation(): return False else: if self.reboot_only_on_function_call and not self._is_first_gradient_evaluation(): - estimated_model_time = self.time_estimate_multiplier*(max(self.times_function)+max(self.times_gradient)) + estimated_model_time = self.time_estimate_multiplier*(max(self.times_function)+max(self.times_gradient)) + self.time_estimate_buffer else: - estimated_model_time = self.time_estimate_multiplier*max(self.times_function) + estimated_model_time = self.time_estimate_multiplier*max(self.times_function) + self.time_estimate_buffer return not self.server_manager.enough_time_is_remaining(estimated_model_time) def _dump_json(self, remote_dict: dict, command: str): From 4a262cdc2befb7b5f9d455cf9fa9021c136c3f29 Mon Sep 17 00:00:00 2001 From: Tim Brooks <41971846+timryanb@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:58:02 -0500 Subject: [PATCH 2/3] pin tacs to >= 3.6.0 --- .github/workflows/unit_tests_and_docs.yml | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests_and_docs.yml b/.github/workflows/unit_tests_and_docs.yml index 1ca2fc27..fb26ea64 100644 --- a/.github/workflows/unit_tests_and_docs.yml +++ b/.github/workflows/unit_tests_and_docs.yml @@ -60,7 +60,7 @@ jobs: echo "============================================================="; echo "Install optional dependencies"; echo "============================================================="; - mamba install -c "smdogroup/label/complex" -c smdogroup -c conda-forge tacs funtofem -q -y; + mamba install -c "smdogroup/label/complex" -c smdogroup -c conda-forge tacs>=3.6.0 funtofem -q -y; pip install openaerostruct; echo "============================================================="; echo "List installed packages/versions"; From ec525f501cd7f725cb1d9275a1fed164112b1b30 Mon Sep 17 00:00:00 2001 From: Tim Brooks <41971846+timryanb@users.noreply.github.com> Date: Fri, 19 Jan 2024 09:35:24 -0500 Subject: [PATCH 3/3] fixing tacs pin --- .github/workflows/unit_tests_and_docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests_and_docs.yml b/.github/workflows/unit_tests_and_docs.yml index bf1d9363..31b823d5 100644 --- a/.github/workflows/unit_tests_and_docs.yml +++ b/.github/workflows/unit_tests_and_docs.yml @@ -60,7 +60,7 @@ jobs: echo "============================================================="; echo "Install optional dependencies"; echo "============================================================="; - mamba install -c "smdogroup/label/complex" -c smdogroup -c conda-forge tacs>=3.6.0 funtofem -q -y; + mamba install -c "smdogroup/label/complex" -c smdogroup -c conda-forge "tacs>=3.6.0" funtofem -q -y; pip install openaerostruct; echo "============================================================="; echo "List installed packages/versions";