Fix cluster consistency-check test (#7754)

This test was failing from time to time see discussion at the bottom of #7635
This was probably due to timing, the DEBUG SLEEP executed by redis-cli
didn't sleep for enough time.

This commit changes:
1) use SET-ACTIVE-EXPIRE instead of DEBUG SLEEP
2) reduce many `after` sleeps with retry loops to speed up the test.
3) add many comment explaining the different steps of the test and
   it's purpose.
4) config appendonly before populating the volatile keys, so that they'll
   be part of the AOF command stream rather than the preamble RDB portion.

other complications: recently kill_instance switched from SIGKILL to
SIGTERM, and this would sometimes fail since there was an AOFRW running
in the background. now we wait for it to end before attempting the kill.
This commit is contained in:
Oran Agra 2020-09-07 18:06:25 +03:00 committed by GitHub
parent 2df4cb93ac
commit b491d477c3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -39,53 +39,79 @@ proc cluster_write_keys_with_expire {id ttl} {
$cluster close
}
# make sure that replica who restarts from persistence will load keys
# that have already expired, critical for correct execution of commands
# that arrive from the master
proc test_slave_load_expired_keys {aof} {
test "Slave expired keys is loaded when restarted: appendonly=$aof" {
set master_id [find_non_empty_master]
set replica_id [get_one_of_my_replica $master_id]
set master_dbsize [R $master_id dbsize]
set slave_dbsize [R $replica_id dbsize]
assert_equal $master_dbsize $slave_dbsize
set data_ttl 5
cluster_write_keys_with_expire $master_id $data_ttl
after 100
set replica_dbsize_1 [R $replica_id dbsize]
assert {$replica_dbsize_1 > $slave_dbsize}
set master_dbsize_0 [R $master_id dbsize]
set replica_dbsize_0 [R $replica_id dbsize]
assert_equal $master_dbsize_0 $replica_dbsize_0
# config the replica persistency and rewrite the config file to survive restart
# note that this needs to be done before populating the volatile keys since
# that triggers and AOFRW, and we rather the AOF file to have SETEX commands
# rather than an RDB with volatile keys
R $replica_id config set appendonly $aof
R $replica_id config rewrite
set start_time [clock seconds]
set end_time [expr $start_time+$data_ttl+2]
R $replica_id save
set replica_dbsize_2 [R $replica_id dbsize]
assert {$replica_dbsize_2 > $slave_dbsize}
# fill with 100 keys with 3 second TTL
set data_ttl 3
cluster_write_keys_with_expire $master_id $data_ttl
# wait for replica to be in sync with master
wait_for_condition 500 10 {
[R $replica_id dbsize] eq [R $master_id dbsize]
} else {
fail "replica didn't sync"
}
set replica_dbsize_1 [R $replica_id dbsize]
assert {$replica_dbsize_1 > $replica_dbsize_0}
# make replica create persistence file
if {$aof == "yes"} {
# we need to wait for the initial AOFRW to be done, otherwise
# kill_instance (which now uses SIGTERM will fail ("Writing initial AOF, can't exit")
wait_for_condition 100 10 {
[RI $replica_id aof_rewrite_in_progress] eq 0
} else {
fail "keys didn't expire"
}
} else {
R $replica_id save
}
# kill the replica (would stay down until re-started)
kill_instance redis $replica_id
set master_port [get_instance_attrib redis $master_id port]
exec ../../../src/redis-cli \
-h 127.0.0.1 -p $master_port \
{*}[rediscli_tls_config "../../../tests"] \
debug sleep [expr $data_ttl+3] > /dev/null &
# Make sure the master doesn't do active expire (sending DELs to the replica)
R $master_id DEBUG SET-ACTIVE-EXPIRE 0
while {[clock seconds] <= $end_time} {
#wait for $data_ttl seconds
}
# wait for all the keys to get logically expired
after [expr $data_ttl*1000]
# start the replica again (loading an RDB or AOF file)
restart_instance redis $replica_id
wait_for_condition 200 50 {
[R $replica_id ping] eq {PONG}
} else {
fail "replica #$replica_id not started"
}
# make sure the keys are still there
set replica_dbsize_3 [R $replica_id dbsize]
assert {$replica_dbsize_3 > $slave_dbsize}
assert {$replica_dbsize_3 > $replica_dbsize_0}
# restore settings
R $master_id DEBUG SET-ACTIVE-EXPIRE 1
# wait for the master to expire all keys and replica to get the DELs
wait_for_condition 500 10 {
[R $replica_id dbsize] eq $master_dbsize_0
} else {
fail "keys didn't expire"
}
}
}
test_slave_load_expired_keys no
after 5000
test_slave_load_expired_keys yes