2013-05-09 12:52:04 +02:00
|
|
|
# Creates a master-slave pair and breaks the link continuously to force
|
|
|
|
# partial resync attempts, all this while flooding the master with
|
|
|
|
# write queries.
|
|
|
|
#
|
2018-07-01 13:24:50 +08:00
|
|
|
# You can specify backlog size, ttl, delay before reconnection, test duration
|
2013-05-09 12:52:04 +02:00
|
|
|
# in seconds, and an additional condition to verify at the end.
|
2015-08-05 09:18:54 +02:00
|
|
|
#
|
|
|
|
# If reconnect is > 0, the test actually tries to break the connection and
|
|
|
|
# reconnect with the master, otherwise just the initial synchronization is
|
|
|
|
# checked for consistency.
|
2019-07-01 15:22:29 +03:00
|
|
|
# test_psync -- run one replication / partial-resync scenario.
#
# Arguments:
#   descr        - label included in the name of the main test.
#   duration     - length of the link-breaking phase, in seconds (the loop
#                  runs duration*10 iterations of 100 ms each).
#   backlog_size - value applied to the master's repl-backlog-size.
#   backlog_ttl  - value applied to the master's repl-backlog-ttl.
#   delay        - seconds the replica is blocked with DEBUG SLEEP right
#                  after its link to the master is killed; 0 means the link
#                  is killed without sleeping.
#   cond         - script evaluated at the very end for scenario-specific
#                  assertions.
#   mdl          - value applied to the master's repl-diskless-sync.
#   sdl          - value applied to the replica's repl-diskless-load.
#   reconnect    - when true the link is broken repeatedly while the load
#                  is running; otherwise only the initial synchronization
#                  is checked for consistency.
proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reconnect} {
    # `save` is disabled on both servers: this test does heavy writes and
    # does not need RDB snapshots, which could create large files (#12013).
    start_server {tags {"repl"} overrides {save {}}} {
        start_server {overrides {save {}}} {

            set master [srv -1 client]
            set master_host [srv -1 host]
            set master_port [srv -1 port]
            set slave [srv 0 client]

            $master config set repl-backlog-size $backlog_size
            $master config set repl-backlog-ttl $backlog_ttl
            $master config set repl-diskless-sync $mdl
            $master config set repl-diskless-sync-delay 1
            $slave config set repl-diskless-load $sdl

            # Three background writers flooding the master.
            # NOTE(review): 9/11/12 are presumably distinct DB indexes and
            # 100000 an operation count -- confirm against
            # start_bg_complex_data.
            set load_handle0 [start_bg_complex_data $master_host $master_port 9 100000]
            set load_handle1 [start_bg_complex_data $master_host $master_port 11 100000]
            set load_handle2 [start_bg_complex_data $master_host $master_port 12 100000]

            test {Slave should be able to synchronize with the master} {
                $slave slaveof $master_host $master_port
                wait_for_condition 50 100 {
                    [lindex [r role] 0] eq {slave} &&
                    [lindex [r role] 3] eq {connected}
                } else {
                    fail "Replication not started."
                }
            }

            # Check that the background clients are actually writing.
            test {Detect write load to master} {
                wait_for_condition 50 1000 {
                    [$master dbsize] > 100
                } else {
                    fail "Can't detect write load from background clients."
                }
            }

            test "Test replication partial resync: $descr (diskless: $mdl, $sdl, reconnect: $reconnect)" {
                # Now while the clients are writing data, break the
                # master-slave link multiple times.
                if {$reconnect} {
                    for {set j 0} {$j < $duration*10} {incr j} {
                        after 100
                        # catch {puts "MASTER [$master dbsize] keys, REPLICA [$slave dbsize] keys"}

                        # Every 20 iterations (about every 2 seconds) kill
                        # the replica's connection to the master.
                        if {($j % 20) == 0} {
                            # Best effort: errors while killing the link
                            # are deliberately ignored.
                            catch {
                                if {$delay} {
                                    # MULTI/EXEC so that the kill and the
                                    # sleep are executed back to back.
                                    $slave multi
                                    $slave client kill $master_host:$master_port
                                    $slave debug sleep $delay
                                    $slave exec
                                } else {
                                    $slave client kill $master_host:$master_port
                                }
                            }
                        }
                    }
                }
                stop_bg_complex_data $load_handle0
                stop_bg_complex_data $load_handle1
                stop_bg_complex_data $load_handle2

                # Wait for the slave to reach the "online"
                # state from the POV of the master.
                set retry 5000
                while {$retry} {
                    set info [$master info]
                    if {[string match {*slave0:*state=online*} $info]} {
                        break
                    } else {
                        incr retry -1
                        after 100
                    }
                }
                if {$retry == 0} {
                    error "assertion:Slave not correctly synchronized"
                }

                # Wait for the slave to acknowledge it is online so
                # we are sure that DBSIZE and DEBUG DIGEST will not
                # fail because of timing issues. (-LOADING error)
                wait_for_condition 5000 100 {
                    [lindex [$slave role] 3] eq {connected}
                } else {
                    fail "Slave still not connected after some time"
                }

                # Datasets are compared by digest rather than by key count:
                # per #8967 a dbsize check would fail when the last key the
                # workload wrote was an overridden key (not a new one).
                # `eq` forces a string comparison of the two digests, so
                # they are never interpreted as numbers.
                wait_for_condition 100 100 {
                    [$master debug digest] eq [$slave debug digest]
                } else {
                    # Dump both datasets to disk to help debugging.
                    set csv1 [csvdump r]
                    set csv2 [csvdump {r -1}]
                    set fd [open /tmp/repldump1.txt w]
                    puts -nonewline $fd $csv1
                    close $fd
                    set fd [open /tmp/repldump2.txt w]
                    puts -nonewline $fd $csv2
                    close $fd
                    fail "Master - Replica inconsistency, Run diff -u against /tmp/repldump*.txt for more info"
                }
                # Guard against trivially matching digests of empty datasets.
                assert {[$master dbsize] > 0}
                # Scenario-specific verification supplied by the caller.
                eval $cond
            }
        }
    }
}
|
|
|
|
|
2021-06-09 15:13:24 +03:00
|
|
|
# Run every PSYNC scenario for each combination of master diskless sync
# (mdl) and replica diskless load (sdl). The whole group is tagged
# external:skip.
tags {"external:skip"} {
    # Flat scenario table, seven fields per scenario, in this order:
    #   descr duration backlog_size backlog_ttl delay cond reconnect
    # (No comments inside the braces below: it is list data, not a script.)
    set psync_scenarios {
        {no reconnection, just sync} 6 1000000   3600 0   {}                                   0
        {ok psync}                   6 100000000 3600 0   {assert {[s -1 sync_partial_ok] > 0}}  1
        {no backlog}                 6 100       3600 0.5 {assert {[s -1 sync_partial_err] > 0}} 1
        {ok after delay}             3 100000000 3600 3   {assert {[s -1 sync_partial_ok] > 0}}  1
        {backlog expired}            3 100000000 1    3   {assert {[s -1 sync_partial_err] > 0}} 1
    }

    foreach mdl {no yes} {
        foreach sdl {disabled swapdb} {
            foreach {descr duration backlog_size backlog_ttl delay cond reconnect} $psync_scenarios {
                test_psync $descr $duration $backlog_size $backlog_ttl $delay $cond $mdl $sdl $reconnect
            }
        }
    }
}
|