Prevent replica failover during manual takeover test (#10499)

During 11-manual-takeover.tcl, if the killing of the instances happens
too slowly, one of the replicas might be able to promote itself.
I'm not sure why it was slow, but it was observed taking 6 seconds
which is enough time to do an election.
I was able to verify the error locally by adding a small delay (1 second)
during ASAN CI. The fix is to disable automatic failover on the replicas
until all of the killed primaries are confirmed dead.
This commit is contained in:
Madelyn Olson 2022-03-30 22:15:00 -07:00 committed by GitHub
parent a3075ca4fe
commit e81bd15e99
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -14,20 +14,32 @@ test "Cluster is writable" {
cluster_write_test 0
}
# Replicas 5, 6 and 7 must not fail over on their own while the
# primaries are still being killed: if the kills are slow, a replica
# could otherwise win an election before the last primary is down.
# Automatic failover is therefore switched off on each of them first.
set replica_ids [list 5 6 7]
foreach replica $replica_ids {
    R $replica config set cluster-replica-no-failover yes
}
test "Killing majority of master nodes" {
    # Kill instances 0, 1 and 2, in that order.
    foreach master_id {0 1 2} {
        kill_instance redis $master_id
    }
}
# The primaries have been killed, so the replicas may fail over again.
foreach replica $replica_ids { R $replica config set cluster-replica-no-failover no }
test "Cluster should eventually be down" {
# Three primaries were killed above, so the cluster is expected to
# reach the "fail" state.
# NOTE(review): assert_cluster_state is defined elsewhere; presumably it
# waits/retries until the nodes report the given state -- confirm.
assert_cluster_state fail
}
test "Use takeover to bring slaves back" {
    # Force-promote every replica listed in replica_ids (set earlier in
    # this file) with CLUSTER FAILOVER TAKEOVER.
    #
    # Each replica is sent the command exactly once. CLUSTER FAILOVER is
    # only accepted by a replica, so sending it a second time to a node
    # that has already taken over would be rejected with an error.
    foreach id $replica_ids {
        R $id cluster failover takeover
    }
}
test "Cluster should eventually be up again" {
@ -39,9 +51,9 @@ test "Cluster is writable" {
}
test "Instance #5, #6, #7 are now masters" {
    # After the takeover, every replica in replica_ids (set earlier in
    # this file) must report the master role. The check is driven by
    # the list so it stays in sync with the instances that were promoted.
    foreach id $replica_ids {
        assert {[RI $id role] eq {master}}
    }
}
test "Restarting the previously killed master nodes" {