Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
start_server {tags {"failover external:skip"} overrides {save {}}} {
|
|
|
|
start_server {overrides {save {}}} {
|
|
|
|
start_server {overrides {save {}}} {
|
2021-01-28 13:18:05 -08:00
|
|
|
set node_0 [srv 0 client]
|
|
|
|
set node_0_host [srv 0 host]
|
|
|
|
set node_0_port [srv 0 port]
|
|
|
|
set node_0_pid [srv 0 pid]
|
|
|
|
|
|
|
|
set node_1 [srv -1 client]
|
|
|
|
set node_1_host [srv -1 host]
|
|
|
|
set node_1_port [srv -1 port]
|
|
|
|
set node_1_pid [srv -1 pid]
|
|
|
|
|
|
|
|
set node_2 [srv -2 client]
|
|
|
|
set node_2_host [srv -2 host]
|
|
|
|
set node_2_port [srv -2 port]
|
|
|
|
set node_2_pid [srv -2 pid]
|
|
|
|
|
|
|
|
# Assert that three nodes hold the same dataset by comparing the result of
# `debug digest` on each of the given client handles: n1 is compared with n2,
# then n2 with n3. Fails via assert_equal on the first mismatch.
proc assert_digests_match {n1 n2 n3} {
|
|
|
|
assert_equal [$n1 debug digest] [$n2 debug digest]
|
|
|
|
assert_equal [$n2 debug digest] [$n3 debug digest]
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failover command fails without connected replica} {
|
|
|
|
catch { $node_0 failover to $node_1_host $node_1_port } err
|
|
|
|
if {! [string match "ERR*" $err]} {
|
|
|
|
fail "failover command succeeded when replica not connected"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
test {setup replication for following tests} {
|
|
|
|
$node_1 replicaof $node_0_host $node_0_port
|
|
|
|
$node_2 replicaof $node_0_host $node_0_port
|
|
|
|
wait_for_sync $node_1
|
|
|
|
wait_for_sync $node_2
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failover command fails with invalid host} {
|
|
|
|
catch { $node_0 failover to invalidhost $node_1_port } err
|
|
|
|
assert_match "ERR*" $err
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failover command fails with invalid port} {
|
|
|
|
catch { $node_0 failover to $node_1_host invalidport } err
|
|
|
|
assert_match "ERR*" $err
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failover command fails with just force and timeout} {
|
|
|
|
catch { $node_0 FAILOVER FORCE TIMEOUT 100} err
|
|
|
|
assert_match "ERR*" $err
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failover command fails when sent to a replica} {
|
|
|
|
catch { $node_1 failover to $node_1_host $node_1_port } err
|
|
|
|
assert_match "ERR*" $err
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failover command fails with force without timeout} {
|
|
|
|
catch { $node_0 failover to $node_1_host $node_1_port FORCE } err
|
|
|
|
assert_match "ERR*" $err
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failover command to specific replica works} {
|
|
|
|
set initial_psyncs [s -1 sync_partial_ok]
|
|
|
|
set initial_syncs [s -1 sync_full]
|
|
|
|
|
|
|
|
# Generate a delta between primary and replica
|
|
|
|
set load_handler [start_write_load $node_0_host $node_0_port 5]
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
pause_process [srv -1 pid]
|
2021-01-28 13:18:05 -08:00
|
|
|
wait_for_condition 50 100 {
|
|
|
|
[s 0 total_commands_processed] > 100
|
|
|
|
} else {
|
|
|
|
fail "Node 0 did not accept writes"
|
|
|
|
}
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
resume_process [srv -1 pid]
|
2021-01-28 13:18:05 -08:00
|
|
|
|
|
|
|
# Execute the failover
|
|
|
|
$node_0 failover to $node_1_host $node_1_port
|
|
|
|
|
|
|
|
# Wait for failover to end
|
|
|
|
wait_for_condition 50 100 {
|
|
|
|
[s 0 master_failover_state] == "no-failover"
|
|
|
|
} else {
|
|
|
|
fail "Failover from node 0 to node 1 did not finish"
|
|
|
|
}
|
stabilize tests that involved with load handlers (#8967)
When a test stops a 'load handler' by killing the process that generates the load,
some commands that are already in the input buffer might still be processed by the server.
This may cause instability in tests that count on no more commands being
processed after we stop the 'load handler'.
In this commit, a new proc 'wait_load_handlers_disconnected' is added to verify that no more
commands from any 'load handler' are processed, by checking that the clients that
generate the load are disconnected.
Also, replacing check of dbsize with wait_for_ofs_sync before comparing debug digest, as
it would fail in case the last key the workload wrote was an overridden key (not a new one).
Affected tests
Race fix:
- failover command to specific replica works
- Connect multiple replicas at the same time (issue #141), master diskless=$mdl, replica diskless=$sdl
- AOF rewrite during write load: RDB preamble=$rdbpre
Cleanup and speedup:
- Test replication with blocking lists and sorted sets operations
- Test replication with parallel clients writing in different DBs
- Test replication partial resync: $descr (diskless: $mdl, $sdl, reconnect: $reconnect
2021-05-20 15:29:43 +03:00
|
|
|
|
|
|
|
# stop the write load and make sure no more commands are processed
|
2021-01-28 13:18:05 -08:00
|
|
|
stop_write_load $load_handler
|
stabilize tests that involved with load handlers (#8967)
When a test stops a 'load handler' by killing the process that generates the load,
some commands that are already in the input buffer might still be processed by the server.
This may cause instability in tests that count on no more commands being
processed after we stop the 'load handler'.
In this commit, a new proc 'wait_load_handlers_disconnected' is added to verify that no more
commands from any 'load handler' are processed, by checking that the clients that
generate the load are disconnected.
Also, replacing check of dbsize with wait_for_ofs_sync before comparing debug digest, as
it would fail in case the last key the workload wrote was an overridden key (not a new one).
Affected tests
Race fix:
- failover command to specific replica works
- Connect multiple replicas at the same time (issue #141), master diskless=$mdl, replica diskless=$sdl
- AOF rewrite during write load: RDB preamble=$rdbpre
Cleanup and speedup:
- Test replication with blocking lists and sorted sets operations
- Test replication with parallel clients writing in different DBs
- Test replication partial resync: $descr (diskless: $mdl, $sdl, reconnect: $reconnect
2021-05-20 15:29:43 +03:00
|
|
|
wait_load_handlers_disconnected
|
|
|
|
|
2021-01-28 13:18:05 -08:00
|
|
|
$node_2 replicaof $node_1_host $node_1_port
|
|
|
|
wait_for_sync $node_0
|
|
|
|
wait_for_sync $node_2
|
|
|
|
|
|
|
|
assert_match *slave* [$node_0 role]
|
|
|
|
assert_match *master* [$node_1 role]
|
|
|
|
assert_match *slave* [$node_2 role]
|
|
|
|
|
|
|
|
# We should accept psyncs from both nodes
|
|
|
|
assert_equal [expr [s -1 sync_partial_ok] - $initial_psyncs] 2
|
|
|
|
# No full resync may have happened on the new primary (node 1): compare
# sync_full against its own baseline, not the partial-sync baseline.
assert_equal [expr [s -1 sync_full] - $initial_syncs] 0
|
|
|
|
assert_digests_match $node_0 $node_1 $node_2
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failover command to any replica works} {
|
|
|
|
set initial_psyncs [s -2 sync_partial_ok]
|
|
|
|
set initial_syncs [s -2 sync_full]
|
|
|
|
|
|
|
|
wait_for_ofs_sync $node_1 $node_2
|
|
|
|
# We stop node 0 to make sure node 2 is selected
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
pause_process $node_0_pid
|
2021-01-28 13:18:05 -08:00
|
|
|
$node_1 set CASE 1
|
|
|
|
$node_1 FAILOVER
|
|
|
|
|
|
|
|
# Wait for failover to end
|
|
|
|
wait_for_condition 50 100 {
|
|
|
|
[s -1 master_failover_state] == "no-failover"
|
|
|
|
} else {
|
|
|
|
fail "Failover from node 1 to node 2 did not finish"
|
|
|
|
}
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
resume_process $node_0_pid
|
2021-01-28 13:18:05 -08:00
|
|
|
$node_0 replicaof $node_2_host $node_2_port
|
|
|
|
|
|
|
|
wait_for_sync $node_0
|
|
|
|
wait_for_sync $node_1
|
|
|
|
|
|
|
|
assert_match *slave* [$node_0 role]
|
|
|
|
assert_match *slave* [$node_1 role]
|
|
|
|
assert_match *master* [$node_2 role]
|
|
|
|
|
|
|
|
# We should accept Psyncs from both nodes
|
|
|
|
assert_equal [expr [s -2 sync_partial_ok] - $initial_psyncs] 2
|
|
|
|
# Node 2 is the new primary and both baselines were taken from it (s -2):
# its sync_full counter must not have moved, i.e. no full resync was served.
assert_equal [expr [s -2 sync_full] - $initial_syncs] 0
|
|
|
|
assert_digests_match $node_0 $node_1 $node_2
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failover to a replica with force works} {
|
|
|
|
set initial_psyncs [s 0 sync_partial_ok]
|
|
|
|
set initial_syncs [s 0 sync_full]
|
|
|
|
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
pause_process $node_0_pid
|
2021-01-28 13:18:05 -08:00
|
|
|
# node 0 will never acknowledge this write
|
|
|
|
$node_2 set case 2
|
|
|
|
$node_2 failover to $node_0_host $node_0_port TIMEOUT 100 FORCE
|
|
|
|
|
|
|
|
# Wait for node 2 to give up waiting for node 0 to sync and force the failover
|
|
|
|
wait_for_condition 50 100 {
|
|
|
|
[s -2 master_failover_state] == "failover-in-progress"
|
|
|
|
} else {
|
|
|
|
fail "Failover from node 2 to node 0 did not timeout"
|
|
|
|
}
|
|
|
|
|
|
|
|
# Quick check that everyone is a replica, we never want a
|
|
|
|
# state where there are two masters.
|
|
|
|
assert_match *slave* [$node_1 role]
|
|
|
|
assert_match *slave* [$node_2 role]
|
|
|
|
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
resume_process $node_0_pid
|
2021-01-28 13:18:05 -08:00
|
|
|
|
|
|
|
# Wait for failover to end
|
|
|
|
wait_for_condition 50 100 {
|
|
|
|
[s -2 master_failover_state] == "no-failover"
|
|
|
|
} else {
|
|
|
|
fail "Failover from node 2 to node 0 did not finish"
|
|
|
|
}
|
|
|
|
$node_1 replicaof $node_0_host $node_0_port
|
|
|
|
|
|
|
|
wait_for_sync $node_1
|
|
|
|
wait_for_sync $node_2
|
|
|
|
|
|
|
|
assert_match *master* [$node_0 role]
|
|
|
|
assert_match *slave* [$node_1 role]
|
|
|
|
assert_match *slave* [$node_2 role]
|
|
|
|
|
|
|
|
assert_equal [count_log_message -2 "time out exceeded, failing over."] 1
|
|
|
|
|
|
|
|
# We should accept both psyncs, although this is the condition we might not
|
|
|
|
# since we didn't catch up.
|
|
|
|
assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 2
|
|
|
|
assert_equal [expr [s 0 sync_full] - $initial_syncs] 0
|
|
|
|
assert_digests_match $node_0 $node_1 $node_2
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failover with timeout aborts if replica never catches up} {
|
|
|
|
set initial_psyncs [s 0 sync_partial_ok]
|
|
|
|
set initial_syncs [s 0 sync_full]
|
|
|
|
|
|
|
|
# Stop replica so it never catches up
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
pause_process [srv -1 pid]
|
2021-01-28 13:18:05 -08:00
|
|
|
$node_0 SET CASE 1
|
|
|
|
|
|
|
|
$node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 500
|
|
|
|
# Wait for failover to end
|
|
|
|
wait_for_condition 50 20 {
|
|
|
|
[s 0 master_failover_state] == "no-failover"
|
|
|
|
} else {
|
|
|
|
fail "Failover from node_0 to replica did not finish"
|
|
|
|
}
|
|
|
|
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
resume_process [srv -1 pid]
|
2021-01-28 13:18:05 -08:00
|
|
|
|
|
|
|
# We need to make sure the nodes actually sync back up
|
|
|
|
wait_for_ofs_sync $node_0 $node_1
|
|
|
|
wait_for_ofs_sync $node_0 $node_2
|
|
|
|
|
|
|
|
assert_match *master* [$node_0 role]
|
|
|
|
assert_match *slave* [$node_1 role]
|
|
|
|
assert_match *slave* [$node_2 role]
|
|
|
|
|
|
|
|
# Since we never caught up, there should be no syncs
|
|
|
|
assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 0
|
|
|
|
assert_equal [expr [s 0 sync_full] - $initial_syncs] 0
|
|
|
|
assert_digests_match $node_0 $node_1 $node_2
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failovers can be aborted} {
|
|
|
|
set initial_psyncs [s 0 sync_partial_ok]
|
|
|
|
set initial_syncs [s 0 sync_full]
|
|
|
|
|
|
|
|
# Stop replica so it never catches up
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
pause_process [srv -1 pid]
|
2021-01-28 13:18:05 -08:00
|
|
|
$node_0 SET CASE 2
|
|
|
|
|
|
|
|
$node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 60000
|
|
|
|
# Pattern goes first, actual value second (as in every other assert_match
# in this file), so a failure reports the real state correctly.
assert_match "waiting-for-sync" [s 0 master_failover_state]
|
|
|
|
|
|
|
|
# Sanity check that read commands are still accepted
|
|
|
|
$node_0 GET CASE
|
|
|
|
|
|
|
|
$node_0 failover abort
|
|
|
|
# After FAILOVER ABORT the primary must be back in the idle state.
# Pattern goes first, actual value second.
assert_match "no-failover" [s 0 master_failover_state]
|
|
|
|
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
resume_process [srv -1 pid]
|
2021-01-28 13:18:05 -08:00
|
|
|
|
|
|
|
# Just make sure everything is still synced
|
|
|
|
wait_for_ofs_sync $node_0 $node_1
|
|
|
|
wait_for_ofs_sync $node_0 $node_2
|
|
|
|
|
|
|
|
assert_match *master* [$node_0 role]
|
|
|
|
assert_match *slave* [$node_1 role]
|
|
|
|
assert_match *slave* [$node_2 role]
|
|
|
|
|
|
|
|
# Since we never caught up, there should be no syncs
|
|
|
|
assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 0
|
|
|
|
assert_equal [expr [s 0 sync_full] - $initial_syncs] 0
|
|
|
|
assert_digests_match $node_0 $node_1 $node_2
|
|
|
|
}
|
|
|
|
|
|
|
|
test {failover aborts if target rejects sync request} {
|
|
|
|
set initial_psyncs [s 0 sync_partial_ok]
|
|
|
|
set initial_syncs [s 0 sync_full]
|
|
|
|
|
|
|
|
# We block psync, so the failover will fail
|
|
|
|
$node_1 acl setuser default -psync
|
|
|
|
|
|
|
|
# We pause the target long enough to send a write command
|
|
|
|
# during the pause. This write will not be interrupted.
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
pause_process [srv -1 pid]
|
2021-01-28 13:18:05 -08:00
|
|
|
set rd [redis_deferring_client]
|
|
|
|
$rd SET FOO BAR
|
|
|
|
$node_0 failover to $node_1_host $node_1_port
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 09:19:21 +03:00
|
|
|
resume_process [srv -1 pid]
|
2021-01-28 13:18:05 -08:00
|
|
|
|
|
|
|
# Wait for failover to end
|
|
|
|
wait_for_condition 50 100 {
|
|
|
|
[s 0 master_failover_state] == "no-failover"
|
|
|
|
} else {
|
|
|
|
fail "Failover from node_0 to replica did not finish"
|
|
|
|
}
|
|
|
|
|
|
|
|
assert_equal [$rd read] "OK"
|
|
|
|
$rd close
|
|
|
|
|
|
|
|
# restore access to psync
|
|
|
|
$node_1 acl setuser default +psync
|
|
|
|
|
|
|
|
# We need to make sure the nodes actually sync back up
|
|
|
|
wait_for_sync $node_1
|
|
|
|
wait_for_sync $node_2
|
|
|
|
|
|
|
|
assert_match *master* [$node_0 role]
|
|
|
|
assert_match *slave* [$node_1 role]
|
|
|
|
assert_match *slave* [$node_2 role]
|
|
|
|
|
|
|
|
# We will cycle all of our replicas here and force a psync.
|
|
|
|
assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 2
|
|
|
|
assert_equal [expr [s 0 sync_full] - $initial_syncs] 0
|
|
|
|
|
|
|
|
assert_equal [count_log_message 0 "Failover target rejected psync request"] 1
|
|
|
|
assert_digests_match $node_0 $node_1 $node_2
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|