2022-01-02 08:50:15 +01:00
|
|
|
# This test suite tests shutdown when there are lagging replicas connected.
|
|
|
|
|
|
|
|
# Fill up the OS socket send buffer for the replica connection of server
# $idx, about 1M per round. Once the server reports more than 2M of
# replication buffer memory (often after writing 4M or so) we assume the OS
# socket send buffer cannot swallow any more data and stop writing.
#
# idx - server index passed on to `populate` and `s`.
proc fill_up_os_socket_send_buffer_for_repl {idx} {
    set threshold [expr {2*1024*1024}]
    for {set round 1} {1} {incr round} {
        # A distinct key prefix per round so every round adds new keys.
        populate 1024 junk${round}: 1024 $idx
        # Short pause before sampling the replication buffer size.
        after 10
        if {[s $idx mem_total_replication_buffers] > $threshold} {
            return
        }
    }
}
|
|
|
|
|
|
|
|
# Run the same scenario once per shutdown trigger: a SIGTERM sent to the
# master process, and a SHUTDOWN command sent by a client. In both cases the
# test expects the master to delay its exit until the paused replica has
# received the last write.
foreach how {sigterm shutdown} {
    test "Shutting down master waits for replica to catch up ($how)" {
        start_server {overrides {save ""}} {
            start_server {overrides {save ""}} {
                # Server -1 acts as the master, server 0 as its replica.
                set master [srv -1 client]
                set master_host [srv -1 host]
                set master_port [srv -1 port]
                set master_pid [srv -1 pid]
                set replica [srv 0 client]
                set replica_pid [srv 0 pid]

                # Config master.
                $master config set shutdown-timeout 300; # 5min for slow CI
                $master config set repl-backlog-size 1; # small as possible
                $master config set hz 100; # cron runs every 10ms

                # Config replica.
                $replica replicaof $master_host $master_port
                wait_for_sync $replica

                # Preparation: Set k to 1 on both master and replica.
                $master set k 1
                wait_for_ofs_sync $master $replica

                # Pause the replica.
                pause_process $replica_pid

                # Fill up the OS socket send buffer for the replica connection
                # to prevent the following INCR from reaching the replica via
                # the OS.
                fill_up_os_socket_send_buffer_for_repl -1

                # Incr k and immediately shutdown master.
                $master incr k
                switch $how {
                    sigterm {
                        exec kill -SIGTERM $master_pid
                    }
                    shutdown {
                        # Deferring client: SHUTDOWN is sent without waiting
                        # for a reply, so the test can keep going.
                        set rd [redis_deferring_client -1]
                        $rd shutdown
                    }
                }
                wait_for_condition 50 100 {
                    [s -1 shutdown_in_milliseconds] > 0
                } else {
                    fail "Master not indicating ongoing shutdown."
                }

                # Wake up replica and check if master has waited for it.
                after 20; # 2 cron intervals
                resume_process $replica_pid
                wait_for_condition 300 1000 {
                    [$replica get k] eq 2
                } else {
                    fail "Master exited before replica could catch up."
                }

                # Check shutdown log messages on master
                wait_for_log_messages -1 {"*ready to exit, bye bye*"} 0 100 500
                assert_equal 0 [count_log_message -1 "*Lagging replica*"]
                verify_log_message -1 "*1 of 1 replicas are in sync*" 0

                # Release the deferring client that sent SHUTDOWN so its
                # connection is not leaked; the other tests in this file
                # close their deferring clients too. It only exists in the
                # "shutdown" variant.
                if {$how eq "shutdown"} {
                    $rd close
                }
            }
        }
    } {} {repl external:skip}
}
|
|
|
|
|
|
|
|
# The replica stays paused for longer than the master's one second
# shutdown-timeout. The test expects the master to log the lagging replica and
# exit, leaving the replica with the old value of k.
test {Shutting down master waits for replica timeout} {
    start_server {overrides {save ""}} {
        start_server {overrides {save ""}} {
            # Server -1 acts as the master, server 0 as its replica.
            set replica_pid [srv 0 pid]
            set replica [srv 0 client]
            set master_pid [srv -1 pid]
            set master [srv -1 client]

            # Master side: shutdown-timeout of 1 (second).
            $master config set shutdown-timeout 1

            # Replica side: attach to the master and finish the initial sync.
            $replica replicaof [srv -1 host] [srv -1 port]
            wait_for_sync $replica

            # Starting point: k is 1 on both master and replica.
            $master set k 1
            wait_for_ofs_sync $master $replica

            # Freeze the replica, then fill the OS socket send buffer of its
            # connection so the INCR below cannot reach it via the OS.
            pause_process $replica_pid
            fill_up_os_socket_send_buffer_for_repl -1

            # One more write, immediately followed by the shutdown signal.
            $master incr k
            exec kill -SIGTERM $master_pid
            wait_for_condition 50 100 {
                [s -1 shutdown_in_milliseconds] > 0
            } else {
                fail "Master not indicating ongoing shutdown."
            }

            # Let the master finish shutting down, then inspect its log.
            wait_for_log_messages -1 {"*ready to exit, bye bye*"} 0 100 100
            foreach expected {"*Lagging replica*" "*0 of 1 replicas are in sync*"} {
                verify_log_message -1 $expected 0
            }

            # Wake up the replica; it is expected to still see k == 1.
            resume_process $replica_pid
            assert_equal 1 [$replica get k]
        }
    }
} {} {repl external:skip}
|
|
|
|
|
|
|
|
# Two clients block on SHUTDOWN while the replica is paused. A very slow
# initial AOF rewrite is then started on the master, so that once the replica
# is resumed the shutdown attempt fails and both clients get an error reply.
test "Shutting down master waits for replica then fails" {
    start_server {overrides {save ""}} {
        start_server {overrides {save ""}} {
            # Server -1 acts as the master, server 0 as its replica.
            set replica_pid [srv 0 pid]
            set replica [srv 0 client]
            set master_pid [srv -1 pid]
            set master [srv -1 client]

            # Attach the replica and finish the initial sync.
            $replica replicaof [srv -1 host] [srv -1 port]
            wait_for_sync $replica

            # Freeze the replica, then write a key on the master.
            pause_process $replica_pid
            $master incr k

            # Two deferring clients call blocking SHUTDOWN in parallel.
            set shutdown_clients [list \
                [redis_deferring_client -1] \
                [redis_deferring_client -1]]
            foreach rd $shutdown_clients {
                $rd shutdown
            }
            set clients_info [$master info clients]
            foreach expected {"*connected_clients:3*" "*blocked_clients:2*"} {
                assert_match $expected $clients_info
            }

            # Start a very slow initial AOFRW, which will prevent shutdown.
            $master config set rdb-key-save-delay 30000000; # 30 seconds
            $master config set appendonly yes

            # Wake up replica, causing master to continue shutting down.
            resume_process $replica_pid

            # Both clients blocked on SHUTDOWN are expected to get an error.
            set replies {}
            foreach rd $shutdown_clients {
                catch { $rd read } reply
                lappend replies $reply
            }
            foreach reply $replies {
                assert_match "*Errors trying to SHUTDOWN. Check logs*" $reply
            }
            foreach rd $shutdown_clients {
                $rd close
            }

            # Check shutdown log messages on master.
            foreach expected {
                "*1 of 1 replicas are in sync*"
                "*Writing initial AOF, can't exit*"
                "*Errors trying to shut down*"
            } {
                verify_log_message -1 $expected 0
            }

            # Let master exit fast, without waiting for the very slow AOFRW.
            catch {$master shutdown nosave force}
        }
    }
} {} {repl external:skip}
|
|
|
|
|
|
|
|
# Two clients block on SHUTDOWN while the replica is paused, then a third
# connection sends SHUTDOWN ABORT. Both blocked clients are expected to get an
# error reply and the master log should mention the manual abort.
test "Shutting down master waits for replica then aborted" {
    start_server {overrides {save ""}} {
        start_server {overrides {save ""}} {
            # Server -1 acts as the master, server 0 as its replica.
            set replica_pid [srv 0 pid]
            set replica [srv 0 client]
            set master_pid [srv -1 pid]
            set master [srv -1 client]

            # Attach the replica and finish the initial sync.
            $replica replicaof [srv -1 host] [srv -1 port]
            wait_for_sync $replica

            # Freeze the replica, then write a key on the master.
            pause_process $replica_pid
            $master incr k

            # Two deferring clients call blocking SHUTDOWN in parallel.
            set shutdown_clients [list \
                [redis_deferring_client -1] \
                [redis_deferring_client -1]]
            foreach rd $shutdown_clients {
                $rd shutdown
            }
            set clients_info [$master info clients]
            foreach expected {"*connected_clients:3*" "*blocked_clients:2*"} {
                assert_match $expected $clients_info
            }

            # Abort the shutdown
            $master shutdown abort

            # Wake up the replica. The shutdown was aborted above, so this
            # only lets the replica run again.
            resume_process $replica_pid

            # Both clients blocked on SHUTDOWN are expected to get an error.
            set replies {}
            foreach rd $shutdown_clients {
                catch { $rd read } reply
                lappend replies $reply
            }
            foreach reply $replies {
                assert_match "*Errors trying to SHUTDOWN. Check logs*" $reply
            }
            foreach rd $shutdown_clients {
                $rd close
            }

            # Check shutdown log messages on master.
            verify_log_message -1 "*Shutdown manually aborted*" 0
        }
    }
} {} {repl external:skip}
|