From dbfad8063d5cde2288c074cc62f30805f7ca9e0f Mon Sep 17 00:00:00 2001
From: Heikki Nousiainen
Date: Mon, 4 Nov 2024 12:16:32 +0200
Subject: [PATCH] pglookout: support explicit failover priorities

Support explicit prioritization between instances. This can be
configured via the ``failover_priorities`` key, and is consulted when
picking the standby that should perform the promotion in cases where
multiple nodes have a matching replication position.

Previously, and also as the current default, the selection was based on
the sort order of the remote node names. The configuration option adds
flexibility and supports e.g. topologies where we have more favorable
and less desirable standbys in multiple different network locations.
---
 README.rst             | 20 ++++++++
 pglookout/pglookout.py | 36 ++++++++++++-----
 test/test_lookout.py   | 84 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 132 insertions(+), 8 deletions(-)

diff --git a/README.rst b/README.rst
index fb9cd39..57322a7 100644
--- a/README.rst
+++ b/README.rst
@@ -295,6 +295,26 @@ over_warning_limit_command and to create a warning file.
 
 Shell command to execute in case the node has deemed itself in need of promotion
 
+``failover_priorities`` (default ``{}``)
+
+Defines the priority of nodes for promotion when there are multiple candidates
+with the same replication position. This ensures that all pglookout instances
+elect the same standby for promotion, while still allowing for topologies with
+e.g. less preferred standbys in secondary network locations. By default,
+pglookout uses remote connection ids for the same selection purpose.
+
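+For example, with two standbys at the hypothetical addresses ``10.0.1.10``
+and ``10.0.2.10`` sharing the same replication position, the following
+configuration would make every pglookout instance prefer ``10.0.1.10`` for
+promotion::
+
+    {
+        "failover_priorities": {
+            "10.0.1.10": 1000,
+            "10.0.2.10": 0
+        }
+    }
+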
 ``known_gone_nodes`` (default ``[]``)
 
 Lists nodes that are explicitly known to have left the cluster. If the old
diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py
index 42fbf3a..794e326 100755
--- a/pglookout/pglookout.py
+++ b/pglookout/pglookout.py
@@ -643,19 +643,39 @@ def do_failover_decision(self, standby_nodes):
         if not known_replication_positions:
             self.log.warning("No known replication positions, canceling failover consideration")
             return
-        # If there are multiple nodes with the same replication positions pick the one with the "highest" name
-        # to make sure pglookouts running on all standbys make the same decision. The rationale for picking
-        # the "highest" node is that there's no obvious way for pglookout to decide which of the nodes is
-        # "best" beyond looking at replication positions, but picking the highest id supports environments
-        # where nodes are assigned identifiers from an incrementing sequence identifiers and where we want to
-        # promote the latest and greatest node. In static environments node identifiers can be priority
-        # numbers, with the highest number being the one that should be preferred.
-        furthest_along_instance = max(known_replication_positions[max(known_replication_positions)])
+
+        # Find the instance that is furthest along.
+        # If there are multiple nodes with the same replication position, try to identify one to promote,
+        # either via explicit failover priority configuration or by picking the one with the "highest" name
+        # by sort order. The rationale of this logic is to ensure that pglookouts running on all standbys
+        # make the same decision. The "highest" name works well in environments where nodes are assigned
+        # identifiers from an incrementing sequence and where we want to promote the latest and greatest node.
+
+        # First, find the list of instances that share the most recent replication position
+        furthest_along_instances = known_replication_positions[max(known_replication_positions)]
+        # Second, sort them by "instance name"
+        furthest_along_instances = sorted(furthest_along_instances, reverse=True)
+        # Third, if explicit failover priorities are configured, use them to select the instance to promote
+        if "failover_priorities" in self.config:
+            highest_priority = max([
+                self.config["failover_priorities"].get(instance, 0)
+                for instance in furthest_along_instances
+            ])
+            furthest_along_instances = [
+                instance
+                for instance in furthest_along_instances
+                if self.config["failover_priorities"].get(instance, 0) == highest_priority
+            ]
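+            # Illustration with hypothetical values: given failover_priorities {"a": 1000, "b": 1000, "c": 0}
+            # and a reverse-sorted candidate list ["c", "b", "a"], this filter keeps ["b", "a"] and the pick
+            # below promotes "b": priority trumps name order, and name order breaks priority ties.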
+        furthest_along_instance = furthest_along_instances[0]
         self.log.warning(
             "Node that is furthest along is: %r, all replication positions were: %r",
             furthest_along_instance,
             sorted(known_replication_positions),
         )
+
         total_observers = len(self.connected_observer_nodes) + len(self.disconnected_observer_nodes)
         # +1 in the calculation comes from the master node
         total_amount_of_nodes = len(standby_nodes) + 1 - len(self.never_promote_these_nodes) + total_observers
diff --git a/test/test_lookout.py b/test/test_lookout.py
index 6e9f7bf..5d47c5f 100644
--- a/test/test_lookout.py
+++ b/test/test_lookout.py
@@ -931,6 +931,90 @@ def test_standbys_failover_equal_replication_positions(pgl):
     assert pgl.execute_external_command.call_count == 1
 
 
+def test_standbys_failover_equal_replication_positions_with_priorities(pgl):
+    now = datetime.datetime.utcnow()
+    set_instance_cluster_state(
+        pgl,
+        instance="192.168.54.183",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=400.435871,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+    set_instance_cluster_state(
+        pgl,
+        instance="192.168.57.180",
+        pg_last_xlog_receive_location=None,
+        pg_is_in_recovery=False,
+        connection=False,
+        replication_time_lag=0.0,
+        fetch_time=now - datetime.timedelta(seconds=3600),
+        db_time=now - datetime.timedelta(seconds=3600),
+        conn_info="foobar",
+    )
+    set_instance_cluster_state(
+        pgl,
+        instance="192.168.63.4",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=401.104655,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+    set_instance_cluster_state(
+        pgl,
+        instance="192.168.62.4",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=401.104655,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+    set_instance_cluster_state(
+        pgl,
+        instance="192.168.52.183",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=401.104655,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+
+    pgl.current_master = "192.168.57.180"
+
+    pgl.config["failover_priorities"] = {
+        "192.168.54.183": 1000,
+        "192.168.52.183": 1000,
+        "192.168.63.4": 0,
+    }
+
+    # Highest by instance name, but lower in priority
+    pgl.own_db = "192.168.63.4"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 0
+    # Second highest by instance name; no priority configured, so it counts as 0
+    pgl.own_db = "192.168.62.4"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 0
+    # Shares the highest priority == 1000, but is lower by instance name
+    pgl.own_db = "192.168.52.183"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 0
+    # Second lowest by instance name, but with priority == 1000
+    pgl.own_db = "192.168.54.183"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 1
+
+
 def test_node_map_when_only_observer_sees_master(pgl):
     cluster_state = {
         "10.255.255.10": {