Clean up and stabilize cluster migration tests. (#8745)

This is work in progress, focusing on two main areas:
* Avoiding race conditions with cluster configuration propagation.
* Ignoring limitations of redis-cli --cluster fix, which make it hard
  to distinguish real errors (e.g. failure to fix) from expected
  conditions in this test (e.g. nodes not agreeing on configuration).
This commit is contained in:
Yossi Gottlieb 2021-04-06 11:57:57 +03:00 committed by GitHub
parent 3b74b55084
commit 4724dd439e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 66 additions and 25 deletions

View File

@ -4,6 +4,10 @@
# This software is released under the BSD License. See the COPYING file for
# more information.
# Track cluster configuration as created by create_cluster below.
# Both counters start at 0; create_cluster overwrites them with the number
# of masters and replicas it was asked to set up.
set ::cluster_master_nodes 0
set ::cluster_replica_nodes 0
# Returns a parsed CLUSTER NODES output as a list of dictionaries.
proc get_cluster_nodes id {
set lines [split [R $id cluster nodes] "\r\n"]
@ -120,6 +124,9 @@ proc create_cluster {masters slaves} {
cluster_allocate_slaves $masters $slaves
}
assert_cluster_state ok
set ::cluster_master_nodes $masters
set ::cluster_replica_nodes $slaves
}
# Set the cluster node-timeout to all the reachable nodes.
@ -143,3 +150,28 @@ proc cluster_write_test {id} {
}
$cluster close
}
# Check if cluster configuration is consistent.
#
# Queries CLUSTER SLOTS on every node index from 0 up to (but excluding)
# ::cluster_master_nodes + ::cluster_replica_nodes and compares each reply
# against the reply of node 0.
#
# Returns 1 when all replies match (or when there are no nodes to query),
# and 0 as soon as one node reports a different slots layout.
proc cluster_config_consistent {} {
    set total [expr {$::cluster_master_nodes + $::cluster_replica_nodes}]
    for {set idx 0} {$idx < $total} {incr idx} {
        set slots [R $idx cluster slots]
        if {$idx == 0} {
            # Node 0 provides the reference everything else is compared to.
            set reference $slots
            continue
        }
        if {$slots != $reference} {
            return 0
        }
    }
    return 1
}
# Wait for cluster configuration to propagate and be consistent across nodes.
#
# Repeatedly evaluates cluster_config_consistent through wait_for_condition,
# passing 50 and 100 as its first two arguments (presumably the retry count
# and the delay between retries -- confirm against wait_for_condition).
# The else branch calls fail with a descriptive message.
proc wait_for_cluster_propagation {} {
    set attempts 50
    set pause 100
    wait_for_condition $attempts $pause {
        [cluster_config_consistent] eq 1
    } else {
        fail "cluster config did not reach a consistent state"
    }
}

View File

@ -32,55 +32,58 @@ reset_cluster
$cluster set aga xyz
test "Half init migration in 'migrating' is fixable" {
$nodefrom(link) cluster setslot 609 migrating $nodeto(id)
assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)]
fix_cluster $nodefrom(addr)
assert {[$cluster get aga] eq "xyz"}
assert_equal "xyz" [$cluster get aga]
}
test "Half init migration in 'importing' is fixable" {
$nodeto(link) cluster setslot 609 importing $nodefrom(id)
assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)]
fix_cluster $nodefrom(addr)
assert {[$cluster get aga] eq "xyz"}
assert_equal "xyz" [$cluster get aga]
}
test "Init migration and move key" {
$nodefrom(link) cluster setslot 609 migrating $nodeto(id)
$nodeto(link) cluster setslot 609 importing $nodefrom(id)
$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000
assert {[$cluster get aga] eq "xyz"}
assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)]
assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)]
assert_equal {OK} [$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000]
wait_for_cluster_propagation
assert_equal "xyz" [$cluster get aga]
fix_cluster $nodefrom(addr)
assert {[$cluster get aga] eq "xyz"}
assert_equal "xyz" [$cluster get aga]
}
reset_cluster
test "Move key again" {
$nodefrom(link) cluster setslot 609 migrating $nodeto(id)
$nodeto(link) cluster setslot 609 importing $nodefrom(id)
$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000
assert {[$cluster get aga] eq "xyz"}
wait_for_cluster_propagation
assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)]
assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)]
assert_equal {OK} [$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000]
wait_for_cluster_propagation
assert_equal "xyz" [$cluster get aga]
}
test "Half-finish migration" {
# half finish migration on 'migrating' node
$nodefrom(link) cluster setslot 609 node $nodeto(id)
assert_equal {OK} [$nodefrom(link) cluster setslot 609 node $nodeto(id)]
fix_cluster $nodefrom(addr)
assert {[$cluster get aga] eq "xyz"}
assert_equal "xyz" [$cluster get aga]
}
reset_cluster
test "Move key back" {
# 'aga' key is in 609 slot
$nodefrom(link) cluster setslot 609 migrating $nodeto(id)
$nodeto(link) cluster setslot 609 importing $nodefrom(id)
$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000
assert {[$cluster get aga] eq "xyz"}
assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)]
assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)]
assert_equal {OK} [$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000]
assert_equal "xyz" [$cluster get aga]
}
test "Half-finish importing" {
# Now we half finish 'importing' node
$nodeto(link) cluster setslot 609 node $nodeto(id)
assert_equal {OK} [$nodeto(link) cluster setslot 609 node $nodeto(id)]
fix_cluster $nodefrom(addr)
assert {[$cluster get aga] eq "xyz"}
assert_equal "xyz" [$cluster get aga]
}

View File

@ -3,8 +3,12 @@
source "../tests/includes/init-tests.tcl"
source "../tests/includes/utils.tcl"
# TODO: This test currently runs without replicas, as failovers (which may
# happen on lower-end CI platforms) are still not handled properly by the
# cluster during slot migration (related to #6339).
test "Create a 10 nodes cluster" {
create_cluster 10 10
create_cluster 10 0
}
test "Cluster is up" {
@ -40,6 +44,7 @@ test "Init migration of many slots" {
}
test "Fix cluster" {
wait_for_cluster_propagation
fix_cluster $nodefrom(addr)
}

View File

@ -5,14 +5,15 @@ proc fix_cluster {addr} {
exec ../../../src/redis-cli {*}[rediscli_tls_config "../../../tests"] --cluster fix $addr << yes
} result]
if {$code != 0} {
puts $result
puts "redis-cli --cluster fix returns non-zero exit code, output below:\n$result"
}
assert {$code == 0}
# Note: redis-cli --cluster fix may return a non-zero exit code if nodes don't agree,
# but we can ignore that and rely on the check below.
assert_cluster_state ok
wait_for_condition 100 100 {
[catch {exec ../../../src/redis-cli {*}[rediscli_tls_config "../../../tests"] --cluster check $addr} result] == 0
} else {
puts $result
puts "redis-cli --cluster check returns non-zero exit code, output below:\n$result"
fail "Cluster could not settle with configuration"
}
}