Prevent replica failover during manual takeover test (#10499)

During 11-manual-takeover.tcl, if killing the instances happens too slowly, one of the replicas might be able to promote itself. I'm not sure why the killing was slow, but it was observed taking 6 seconds, which is enough time to complete an election. I was able to reproduce the error locally by adding a small delay (1 second) during ASAN CI. The fix is to disable automated failover until all of the primaries being killed are confirmed dead.
2025-01-22 16:18:28 -05:00 · 2022-03-30 22:15:00 -07:00 · 2022-03-30 22:15:00 -07:00 · e81bd15e99
commit e81bd15e99
parent a3075ca4fe
1 changed files with 18 additions and 6 deletions
--- a/tests/cluster/tests/11-manual-takeover.tcl
+++ b/tests/cluster/tests/11-manual-takeover.tcl
@ -14,20 +14,32 @@ test "Cluster is writable" {
    cluster_write_test 0
 }

+# For this test, disable automatic replica failover until all of the
+# primaries are confirmed killed. Otherwise, if the kills are slow, a
+# replica may have enough time to win an election and promote itself.
+set replica_ids { 5 6 7 }
+foreach id $replica_ids {
+    R $id config set cluster-replica-no-failover yes
+}
+
 test "Killing majority of master nodes" {
    kill_instance redis 0
    kill_instance redis 1
    kill_instance redis 2
 }

+foreach id $replica_ids {
+    R $id config set cluster-replica-no-failover no
+}
+
 test "Cluster should eventually be down" {
    assert_cluster_state fail
 }

 test "Use takeover to bring slaves back" {
-    R 5 cluster failover takeover
-    R 6 cluster failover takeover
-    R 7 cluster failover takeover
+    foreach id $replica_ids {
+        R $id cluster failover takeover
+    }
 }

 test "Cluster should eventually be up again" {
@ -39,9 +51,9 @@ test "Cluster is writable" {
 }

 test "Instance #5, #6, #7 are now masters" {
-    assert {[RI 5 role] eq {master}}
-    assert {[RI 6 role] eq {master}}
-    assert {[RI 7 role] eq {master}}
+    foreach id $replica_ids {
+        assert {[RI $id role] eq {master}}
+    }
 }

 test "Restarting the previously killed master nodes" {