redict/tests/integration/replication-4.tcl
Oran Agra ae89958972
Set repl-diskless-sync to yes by default, add repl-diskless-sync-max-replicas (#10092)
1. enable diskless replication by default
2. add a new config named repl-diskless-sync-max-replicas that enables
   replication to start before the full repl-diskless-sync-delay was
   reached.
3. put replica online sooner on the master (see below)
4. test suite uses repl-diskless-sync-delay of 0 to be faster
5. a few tests that use multiple replica on a pre-populated master, are
   now using the new repl-diskless-sync-max-replicas
6. fix possible timing issues in a few cluster tests (see below)

put replica online sooner on the master 
----------------------------------------------------
there were two tests that failed because they needed for the master to
realize that the replica is online, but the test code was actually only
waiting for the replica to realize it's online, and in diskless it could
have been before the master realized it.

changes include two things:
1. the tests wait on the right thing
2. issues in the master, putting the replica online in two steps.

the master used to put the replica as online in 2 steps. the first
step was to mark it as online, and the second step was to enable the
write event (only after getting ACK), but in fact the first step didn't
contains some of the tasks to put it online (like updating good slave
count, and sending the module event). this meant that if a test was
waiting to see that the replica is online form the point of view of the
master, and then confirm that the module got an event, or that the
master has enough good replicas, it could fail due to timing issues.

so now the full effect of putting the replica online, happens at once,
and only the part about enabling the writes is delayed till the ACK.

fix cluster tests 
--------------------
I added some code to wait for the replica to sync and avoid race
conditions.
later realized the sentinel and cluster tests where using the original 5
seconds delay, so changed it to 0.

this means the other changes are probably not needed, but i suppose
they're still better (avoid race conditions)
2022-01-17 14:11:11 +02:00

174 lines
6.0 KiB
Tcl

start_server {tags {"repl network external:skip"}} {
start_server {} {
set master [srv -1 client]
set master_host [srv -1 host]
set master_port [srv -1 port]
set slave [srv 0 client]
set load_handle0 [start_bg_complex_data $master_host $master_port 9 100000]
set load_handle1 [start_bg_complex_data $master_host $master_port 11 100000]
set load_handle2 [start_bg_complex_data $master_host $master_port 12 100000]
test {First server should have role slave after SLAVEOF} {
$slave slaveof $master_host $master_port
after 1000
s 0 role
} {slave}
test {Test replication with parallel clients writing in different DBs} {
after 5000
stop_bg_complex_data $load_handle0
stop_bg_complex_data $load_handle1
stop_bg_complex_data $load_handle2
wait_for_condition 100 100 {
[$master debug digest] == [$slave debug digest]
} else {
set csv1 [csvdump r]
set csv2 [csvdump {r -1}]
set fd [open /tmp/repldump1.txt w]
puts -nonewline $fd $csv1
close $fd
set fd [open /tmp/repldump2.txt w]
puts -nonewline $fd $csv2
close $fd
fail "Master - Replica inconsistency, Run diff -u against /tmp/repldump*.txt for more info"
}
assert {[$master dbsize] > 0}
}
}
}
start_server {tags {"repl external:skip"}} {
start_server {} {
set master [srv -1 client]
set master_host [srv -1 host]
set master_port [srv -1 port]
set slave [srv 0 client]
test {First server should have role slave after SLAVEOF} {
$slave slaveof $master_host $master_port
wait_replica_online $master
}
test {With min-slaves-to-write (1,3): master should be writable} {
$master config set min-slaves-max-lag 3
$master config set min-slaves-to-write 1
$master set foo bar
} {OK}
test {With min-slaves-to-write (2,3): master should not be writable} {
$master config set min-slaves-max-lag 3
$master config set min-slaves-to-write 2
catch {$master set foo bar} e
set e
} {NOREPLICAS*}
test {With min-slaves-to-write: master not writable with lagged slave} {
$master config set min-slaves-max-lag 2
$master config set min-slaves-to-write 1
assert {[$master set foo bar] eq {OK}}
exec kill -SIGSTOP [srv 0 pid]
wait_for_condition 100 100 {
[catch {$master set foo bar}] != 0
} else {
fail "Master didn't become readonly"
}
catch {$master set foo bar} err
assert_match {NOREPLICAS*} $err
exec kill -SIGCONT [srv 0 pid]
}
}
}
start_server {tags {"repl external:skip"}} {
start_server {} {
set master [srv -1 client]
set master_host [srv -1 host]
set master_port [srv -1 port]
set slave [srv 0 client]
test {First server should have role slave after SLAVEOF} {
$slave slaveof $master_host $master_port
wait_for_condition 50 100 {
[s 0 master_link_status] eq {up}
} else {
fail "Replication not started."
}
}
test {Replication of an expired key does not delete the expired key} {
$master debug set-active-expire 0
$master set k 1 ex 1
wait_for_ofs_sync $master $slave
exec kill -SIGSTOP [srv 0 pid]
$master incr k
after 1000
# Stopping the replica for one second to makes sure the INCR arrives
# to the replica after the key is logically expired.
exec kill -SIGCONT [srv 0 pid]
wait_for_ofs_sync $master $slave
# Check that k is locigally expired but is present in the replica.
assert_equal 0 [$slave exists k]
$slave debug object k ; # Raises exception if k is gone.
}
}
}
start_server {tags {"repl external:skip"}} {
start_server {} {
set master [srv -1 client]
set master_host [srv -1 host]
set master_port [srv -1 port]
set slave [srv 0 client]
test {First server should have role slave after SLAVEOF} {
$slave slaveof $master_host $master_port
wait_for_condition 50 100 {
[s 0 role] eq {slave}
} else {
fail "Replication not started."
}
}
test {Replication: commands with many arguments (issue #1221)} {
# We now issue large MSET commands, that may trigger a specific
# class of bugs, see issue #1221.
for {set j 0} {$j < 100} {incr j} {
set cmd [list mset]
for {set x 0} {$x < 1000} {incr x} {
lappend cmd [randomKey] [randomValue]
}
$master {*}$cmd
}
set retry 10
while {$retry && ([$master debug digest] ne [$slave debug digest])}\
{
after 1000
incr retry -1
}
assert {[$master dbsize] > 0}
}
test {Replication of SPOP command -- alsoPropagate() API} {
$master del myset
set size [expr 1+[randomInt 100]]
set content {}
for {set j 0} {$j < $size} {incr j} {
lappend content [randomValue]
}
$master sadd myset {*}$content
set count [randomInt 100]
set result [$master spop myset $count]
wait_for_condition 50 100 {
[$master debug digest] eq [$slave debug digest]
} else {
fail "SPOP replication inconsistency"
}
}
}
}