From 4724dd439e1a33310008ae244f82f70835d96a2e Mon Sep 17 00:00:00 2001 From: Yossi Gottlieb Date: Tue, 6 Apr 2021 11:57:57 +0300 Subject: [PATCH] Clean up and stabilize cluster migration tests. (#8745) This is work in progress, focusing on two main areas: * Avoiding race conditions with cluster configuration propagation. * Ignoring limitations with redis-cli --cluster fix which makes it hard to distinguish real errors (e.g. failure to fix) from expected conditions in this test (e.g. nodes not agreeing on configuration). --- tests/cluster/cluster.tcl | 32 +++++++++++++ tests/cluster/tests/20-half-migrated-slot.tcl | 45 ++++++++++--------- .../cluster/tests/21-many-slot-migration.tcl | 7 ++- tests/cluster/tests/includes/utils.tcl | 7 +-- 4 files changed, 66 insertions(+), 25 deletions(-) diff --git a/tests/cluster/cluster.tcl b/tests/cluster/cluster.tcl index ffb268561..fb10fffcc 100644 --- a/tests/cluster/cluster.tcl +++ b/tests/cluster/cluster.tcl @@ -4,6 +4,10 @@ # This software is released under the BSD License. See the COPYING file for # more information. +# Track cluster configuration as created by create_cluster below +set ::cluster_master_nodes 0 +set ::cluster_replica_nodes 0 + # Returns a parsed CLUSTER NODES output as a list of dictionaries. proc get_cluster_nodes id { set lines [split [R $id cluster nodes] "\r\n"] @@ -120,6 +124,9 @@ proc create_cluster {masters slaves} { cluster_allocate_slaves $masters $slaves } assert_cluster_state ok + + set ::cluster_master_nodes $masters + set ::cluster_replica_nodes $slaves } # Set the cluster node-timeout to all the reachalbe nodes. @@ -143,3 +150,28 @@ proc cluster_write_test {id} { } $cluster close } + +# Check if cluster configuration is consistent. +proc cluster_config_consistent {} { + for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { + if {$j == 0} { + set base_cfg [R $j cluster slots] + } else { + set cfg [R $j cluster slots] + if {$cfg != $base_cfg} { + return 0 + } + } + } + + return 1 +} + +# Wait for cluster configuration to propagate and be consistent across nodes. +proc wait_for_cluster_propagation {} { + wait_for_condition 50 100 { + [cluster_config_consistent] eq 1 + } else { + fail "cluster config did not reach a consistent state" + } +} \ No newline at end of file diff --git a/tests/cluster/tests/20-half-migrated-slot.tcl b/tests/cluster/tests/20-half-migrated-slot.tcl index 6b5a8c516..69a90818c 100644 --- a/tests/cluster/tests/20-half-migrated-slot.tcl +++ b/tests/cluster/tests/20-half-migrated-slot.tcl @@ -32,55 +32,58 @@ reset_cluster $cluster set aga xyz test "Half init migration in 'migrating' is fixable" { - $nodefrom(link) cluster setslot 609 migrating $nodeto(id) + assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)] fix_cluster $nodefrom(addr) - assert {[$cluster get aga] eq "xyz"} + assert_equal "xyz" [$cluster get aga] } test "Half init migration in 'importing' is fixable" { - $nodeto(link) cluster setslot 609 importing $nodefrom(id) + assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)] fix_cluster $nodefrom(addr) - assert {[$cluster get aga] eq "xyz"} + assert_equal "xyz" [$cluster get aga] } test "Init migration and move key" { - $nodefrom(link) cluster setslot 609 migrating $nodeto(id) - $nodeto(link) cluster setslot 609 importing $nodefrom(id) - $nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000 - assert {[$cluster get aga] eq "xyz"} + assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)] + assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)] + assert_equal {OK} [$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000] + wait_for_cluster_propagation + assert_equal "xyz" [$cluster get aga] fix_cluster $nodefrom(addr) - assert {[$cluster get aga] eq "xyz"} + assert_equal "xyz" [$cluster get aga] } reset_cluster test "Move key again" { - $nodefrom(link) cluster setslot 609 migrating $nodeto(id) - $nodeto(link) cluster setslot 609 importing $nodefrom(id) - $nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000 - assert {[$cluster get aga] eq "xyz"} + wait_for_cluster_propagation + assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)] + assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)] + assert_equal {OK} [$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000] + wait_for_cluster_propagation + assert_equal "xyz" [$cluster get aga] } test "Half-finish migration" { # half finish migration on 'migrating' node - $nodefrom(link) cluster setslot 609 node $nodeto(id) + assert_equal {OK} [$nodefrom(link) cluster setslot 609 node $nodeto(id)] fix_cluster $nodefrom(addr) - assert {[$cluster get aga] eq "xyz"} + assert_equal "xyz" [$cluster get aga] } reset_cluster test "Move key back" { # 'aga' key is in 609 slot - $nodefrom(link) cluster setslot 609 migrating $nodeto(id) - $nodeto(link) cluster setslot 609 importing $nodefrom(id) - $nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000 - assert {[$cluster get aga] eq "xyz"} + assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)] + assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)] + assert_equal {OK} [$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000] + assert_equal "xyz" [$cluster get aga] } test "Half-finish importing" { # Now we half finish 'importing' node - $nodeto(link) cluster setslot 609 node $nodeto(id) + assert_equal {OK} [$nodeto(link) cluster setslot 609 node $nodeto(id)] fix_cluster $nodefrom(addr) - assert {[$cluster get aga] eq "xyz"} + assert_equal "xyz" [$cluster get aga] } diff --git a/tests/cluster/tests/21-many-slot-migration.tcl b/tests/cluster/tests/21-many-slot-migration.tcl index 62df0ce79..18daff1d4 100644 --- a/tests/cluster/tests/21-many-slot-migration.tcl +++ b/tests/cluster/tests/21-many-slot-migration.tcl @@ -3,8 +3,12 @@ source "../tests/includes/init-tests.tcl" source "../tests/includes/utils.tcl" +# TODO: This test currently runs without replicas, as failovers (which may +# happen on lower-end CI platforms) are still not handled properly by the +# cluster during slot migration (related to #6339). + test "Create a 10 nodes cluster" { - create_cluster 10 10 + create_cluster 10 0 } test "Cluster is up" { @@ -40,6 +44,7 @@ test "Init migration of many slots" { } test "Fix cluster" { + wait_for_cluster_propagation fix_cluster $nodefrom(addr) } diff --git a/tests/cluster/tests/includes/utils.tcl b/tests/cluster/tests/includes/utils.tcl index 53849f87d..43a2f2803 100644 --- a/tests/cluster/tests/includes/utils.tcl +++ b/tests/cluster/tests/includes/utils.tcl @@ -5,14 +5,15 @@ proc fix_cluster {addr} { exec ../../../src/redis-cli {*}[rediscli_tls_config "../../../tests"] --cluster fix $addr << yes } result] if {$code != 0} { - puts $result + puts "redis-cli --cluster fix returns non-zero exit code, output below:\n$result" } - assert {$code == 0} + # Note: redis-cli --cluster fix may return a non-zero exit code if nodes don't agree, + # but we can ignore that and rely on the check below. assert_cluster_state ok wait_for_condition 100 100 { [catch {exec ../../../src/redis-cli {*}[rediscli_tls_config "../../../tests"] --cluster check $addr} result] == 0 } else { - puts $result + puts "redis-cli --cluster check returns non-zero exit code, output below:\n$result" fail "Cluster could not settle with configuration" } }