Skip to content

Commit

Permalink
Merge pull request #169 from Asthelen/remote_time_buffer
Browse files (browse the repository at this point in the history)
time_estimate_buffer for parallel remote component robustness
  • Loading branch information
kejacobson authored Jan 23, 2024
2 parents 32a9dc9 + 651a73b commit c6ccda0
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unit_tests_and_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ jobs:
echo "=============================================================";
echo "Install optional dependencies";
echo "=============================================================";
mamba install -c "smdogroup/label/complex" -c smdogroup -c conda-forge tacs funtofem -q -y;
mamba install -c "smdogroup/label/complex" -c smdogroup -c conda-forge "tacs>=3.6.0" funtofem -q -y;
pip install openaerostruct;
echo "=============================================================";
echo "List installed packages/versions";
Expand Down
10 changes: 7 additions & 3 deletions mphys/network/remote_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def stop_server(self):
def initialize(self):
self.options.declare('run_server_filename', default="mphys_server.py", desc="python file that will launch the Server class")
self.options.declare('time_estimate_multiplier', default=2.0, desc="when determining whether to reboot the server, estimate model run time as this times max prior run time")
self.options.declare('time_estimate_buffer', default=0.0, types=float, desc="constant time in seconds to add to model evaluation estimate. "
+"When using parallel remote components with very different evaluation times, setting to slowest component's "
+"estimated evaluation time avoids having the faster component's job expire while the slower one is being evaluated")
self.options.declare('reboot_only_on_function_call', default=True, desc="only allows server reboot before function call, not gradient call. "
+"Avoids having to rerun forward solution on next job, but shortens current job time")
self.options.declare('dump_json', default=False, desc="dump input/output json file in client")
Expand All @@ -31,6 +34,7 @@ def setup(self):
if self.comm.size>1:
raise SystemError('Using Remote Component on more than 1 rank is not supported')
self.time_estimate_multiplier = self.options['time_estimate_multiplier']
self.time_estimate_buffer = self.options['time_estimate_buffer']
self.reboot_only_on_function_call = self.options['reboot_only_on_function_call']
self.dump_json = self.options['dump_json']
self.dump_separate_json = self.options['dump_separate_json']
Expand Down Expand Up @@ -150,16 +154,16 @@ def _need_to_restart_server(self, command: str):
if self._is_first_gradient_evaluation() or self.reboot_only_on_function_call:
return False
else:
estimated_model_time = self.time_estimate_multiplier*max(self.times_gradient)
estimated_model_time = self.time_estimate_multiplier*max(self.times_gradient) + self.time_estimate_buffer

else:
if self._is_first_function_evaluation():
return False
else:
if self.reboot_only_on_function_call and not self._is_first_gradient_evaluation():
estimated_model_time = self.time_estimate_multiplier*(max(self.times_function)+max(self.times_gradient))
estimated_model_time = self.time_estimate_multiplier*(max(self.times_function)+max(self.times_gradient)) + self.time_estimate_buffer
else:
estimated_model_time = self.time_estimate_multiplier*max(self.times_function)
estimated_model_time = self.time_estimate_multiplier*max(self.times_function) + self.time_estimate_buffer
return not self.server_manager.enough_time_is_remaining(estimated_model_time)

def _dump_json(self, remote_dict: dict, command: str):
Expand Down

0 comments on commit c6ccda0

Please sign in to comment.