diff --git a/README.rst b/README.rst
index fb9cd39..57322a7 100644
--- a/README.rst
+++ b/README.rst
@@ -295,6 +295,14 @@ over_warning_limit_command and to create a warning file.
 
 Shell command to execute in case the node has deemed itself in need of promotion
 
+``failover_priorities`` (default ``{}``)
+
+Defines the priority of nodes for promotion when there are multiple candidates
+with the same replication position. This ensures that all pglookout instances
+elect the same standby for promotion, while still allowing topologies with
+e.g. less preferred standbys in secondary network locations. By default,
+pglookout uses the remote connection ids for this selection.
+
 ``known_gone_nodes`` (default ``[]``)
 
 Lists nodes that are explicitly known to have left the cluster. If the old
diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py
index 42fbf3a..794e326 100755
--- a/pglookout/pglookout.py
+++ b/pglookout/pglookout.py
@@ -643,19 +643,36 @@ def do_failover_decision(self, standby_nodes):
         if not known_replication_positions:
             self.log.warning("No known replication positions, canceling failover consideration")
             return
-        # If there are multiple nodes with the same replication positions pick the one with the "highest" name
-        # to make sure pglookouts running on all standbys make the same decision. The rationale for picking
-        # the "highest" node is that there's no obvious way for pglookout to decide which of the nodes is
-        # "best" beyond looking at replication positions, but picking the highest id supports environments
-        # where nodes are assigned identifiers from an incrementing sequence identifiers and where we want to
-        # promote the latest and greatest node. In static environments node identifiers can be priority
-        # numbers, with the highest number being the one that should be preferred.
-        furthest_along_instance = max(known_replication_positions[max(known_replication_positions)])
+
+        # Find the instance that is furthest along.
+        # If there are multiple nodes with the same replication positions, try to identify one to promote either
+        # via explicit failover priority configuration or pick the one with the "highest" name by sort order.
+        # The rationale of this logic is to ensure that the pglookout instances running on all standbys make
+        # the same decision. The "highest" name works well in environments where nodes are assigned identifiers
+        # from an incrementing sequence and where we want to promote the latest and greatest node.
+
+        # First, find the list of instances that share the most recent replication position
+        furthest_along_instances = known_replication_positions[max(known_replication_positions)]
+        # Second, sort them by "instance name"
+        furthest_along_instances = sorted(furthest_along_instances, reverse=True)
+        # Third, if we have explicit failover priorities, use those to select the instance to be promoted
+        if "failover_priorities" in self.config:
+            highest_priority = max([
+                self.config["failover_priorities"].get(instance, 0)
+                for instance in furthest_along_instances
+            ])
+            furthest_along_instances = [
+                instance
+                for instance in furthest_along_instances
+                if self.config["failover_priorities"].get(instance, 0) == highest_priority
+            ]
+        furthest_along_instance = furthest_along_instances[0]
         self.log.warning(
             "Node that is furthest along is: %r, all replication positions were: %r",
             furthest_along_instance,
             sorted(known_replication_positions),
         )
+
         total_observers = len(self.connected_observer_nodes) + len(self.disconnected_observer_nodes)
         # +1 in the calculation comes from the master node
         total_amount_of_nodes = len(standby_nodes) + 1 - len(self.never_promote_these_nodes) + total_observers
diff --git a/test/test_lookout.py b/test/test_lookout.py
index 6e9f7bf..5d47c5f 100644
--- a/test/test_lookout.py
+++ b/test/test_lookout.py
@@ -931,6 +931,90 @@ def test_standbys_failover_equal_replication_positions(pgl):
     assert pgl.execute_external_command.call_count == 1
 
 
+def test_standbys_failover_equal_replication_positions_with_priorities(pgl):
+    now = datetime.datetime.utcnow()
+    set_instance_cluster_state(
+        pgl,
+        instance="192.168.54.183",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=400.435871,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+    set_instance_cluster_state(
+        pgl,
+        instance="192.168.57.180",
+        pg_last_xlog_receive_location=None,
+        pg_is_in_recovery=False,
+        connection=False,
+        replication_time_lag=0.0,
+        fetch_time=now - datetime.timedelta(seconds=3600),
+        db_time=now - datetime.timedelta(seconds=3600),
+        conn_info="foobar",
+    )
+    set_instance_cluster_state(
+        pgl,
+        instance="192.168.63.4",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=401.104655,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+    set_instance_cluster_state(
+        pgl,
+        instance="192.168.62.4",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=401.104655,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+    set_instance_cluster_state(
+        pgl,
+        instance="192.168.52.183",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=401.104655,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+
+    pgl.current_master = "192.168.57.180"
+
+    pgl.config["failover_priorities"] = {
+        "192.168.54.183": 1000,
+        "192.168.52.183": 1000,
+        "192.168.63.4": 0,
+    }
+
+    # This is highest by instance, but lower in priority
+    pgl.own_db = "192.168.63.4"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 0
+    # This is second highest by instance, but no priority set - it's counted at 0
+    pgl.own_db = "192.168.62.4"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 0
+    # This node shares highest priority == 1000, but is lower by instance
+    pgl.own_db = "192.168.52.183"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 0
+    # Second lowest by instance, but with priority == 1000
+    pgl.own_db = "192.168.54.183"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 1
+
+
 def test_node_map_when_only_observer_sees_master(pgl):
     cluster_state = {
         "10.255.255.10": {
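
For readers who want to experiment with the tie-breaking rule outside of pglookout, the following is a minimal standalone sketch of the selection logic introduced above. It is not pglookout's API; the function name pick_promotion_candidate and the sample addresses are illustrative. Among the standbys that share the furthest replication position, the highest configured failover priority wins, and any remaining tie is broken by the highest instance name.

# Standalone sketch of the promotion tie-breaking rule; not pglookout's API.
def pick_promotion_candidate(candidates, failover_priorities):
    """Pick one instance out of those sharing the furthest replication position."""
    # Sort by "instance name", highest first, so every pglookout reaches the same result.
    candidates = sorted(candidates, reverse=True)
    if failover_priorities:
        # Keep only the instances with the highest configured priority (unlisted nodes count as 0).
        highest = max(failover_priorities.get(instance, 0) for instance in candidates)
        candidates = [
            instance for instance in candidates
            if failover_priorities.get(instance, 0) == highest
        ]
    return candidates[0]

# Mirrors the configuration used in the test above: 192.168.54.183 wins because it
# shares the top priority (1000) and sorts higher than 192.168.52.183.
print(pick_promotion_candidate(
    ["192.168.63.4", "192.168.62.4", "192.168.54.183", "192.168.52.183"],
    {"192.168.54.183": 1000, "192.168.52.183": 1000, "192.168.63.4": 0},
))

In the pglookout configuration file the option is a mapping from instance name to numeric priority, e.g. "failover_priorities": {"10.0.0.1": 1000, "10.0.1.1": 0} (addresses illustrative); leaving it at the default {} keeps the previous behaviour of picking the highest instance name.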