set testmodule [file normalize tests/modules/testrdb.so]
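
# The tests below exercise the testrdb module's RDB hooks: a custom type
# (testrdb.set.key / testrdb.get.key), module globals saved through RDB aux
# fields (testrdb.set.before / testrdb.set.after), and loading behavior on
# short reads during diskless replication.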
tags "modules" {
    test {modules are able to persist types} {
        start_server [list overrides [list loadmodule "$testmodule"]] {
            r testrdb.set.key key1 value1
            assert_equal "value1" [r testrdb.get.key key1]
            r debug reload
            assert_equal "value1" [r testrdb.get.key key1]
        }
    }
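
    # With no module argument, globals set via testrdb.set.before are not saved
    # in the RDB (no aux callbacks), so they come back empty after a restart.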
    test {modules global are lost without aux} {
        set server_path [tmpdir "server.module-testrdb"]
        start_server [list overrides [list loadmodule "$testmodule" "dir" $server_path] keep_persistence true] {
            r testrdb.set.before global1
            assert_equal "global1" [r testrdb.get.before]
        }
        start_server [list overrides [list loadmodule "$testmodule" "dir" $server_path]] {
            assert_equal "" [r testrdb.get.before]
        }
    }
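
    # Here the module is loaded with an extra argument ("2"), which presumably
    # makes testrdb register aux-save callbacks both before and after the
    # keyspace, so both globals survive a full server restart.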
    test {modules are able to persist globals before and after} {
        set server_path [tmpdir "server.module-testrdb"]
        start_server [list overrides [list loadmodule "$testmodule 2" "dir" $server_path] keep_persistence true] {
            r testrdb.set.before global1
            r testrdb.set.after global2
            assert_equal "global1" [r testrdb.get.before]
            assert_equal "global2" [r testrdb.get.after]
        }
        start_server [list overrides [list loadmodule "$testmodule 2" "dir" $server_path]] {
            assert_equal "global1" [r testrdb.get.before]
            assert_equal "global2" [r testrdb.get.after]
        }
    }
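
    # With argument "1" the module presumably registers only the after-keyspace
    # aux callback, so only the "after" global needs to survive the restart.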
    test {modules are able to persist globals just after} {
        set server_path [tmpdir "server.module-testrdb"]
        start_server [list overrides [list loadmodule "$testmodule 1" "dir" $server_path] keep_persistence true] {
            r testrdb.set.after global2
            assert_equal "global2" [r testrdb.get.after]
        }
        start_server [list overrides [list loadmodule "$testmodule 1" "dir" $server_path]] {
            assert_equal "global2" [r testrdb.get.after]
        }
    }
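
    # The next test repeatedly kills the replication link while the replica is
    # diskless-loading the RDB, so the module's rdb_load callback hits a short
    # read; the replica is expected to abort the load and recover cleanly.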
    tags {repl} {
        test {diskless loading short read with module} {
            start_server [list overrides [list loadmodule "$testmodule"]] {
                set replica [srv 0 client]
                set replica_host [srv 0 host]
                set replica_port [srv 0 port]
                start_server [list overrides [list loadmodule "$testmodule"]] {
                    set master [srv 0 client]
                    set master_host [srv 0 host]
                    set master_port [srv 0 port]

                    # Set master and replica to use diskless replication
                    $master config set repl-diskless-sync yes
                    $master config set rdbcompression no
                    $replica config set repl-diskless-load swapdb
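                    # (with swapdb loading the replica parses the RDB straight
                    # from the socket while keeping the old dataset aside in memory)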
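
                    # A high fixed hz keeps cron-driven replication housekeeping
                    # (e.g. noticing that the fork ended) fast, so each
                    # kill/retry iteration below completes quickly.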
                    $master config set hz 500
                    $replica config set hz 500
                    $master config set dynamic-hz no
                    $replica config set dynamic-hz no
                    set start [clock clicks -milliseconds]
                    for {set k 0} {$k < 30} {incr k} {
                        r testrdb.set.key key$k [string repeat A [expr {int(rand()*1000000)}]]
                    }

                    if {$::verbose} {
                        set end [clock clicks -milliseconds]
                        set duration [expr $end - $start]
                        puts "filling took $duration ms (TODO: use pipeline)"
                        set start [clock clicks -milliseconds]
                    }

                    # Start the replication process...
                    set loglines [count_log_lines -1]
                    $master config set repl-diskless-sync-delay 0
                    $replica replicaof $master_host $master_port

                    # kill the replication at various points
                    set attempts 100
                    if {$::accurate} { set attempts 500 }
                    for {set i 0} {$i < $attempts} {incr i} {
                        # wait for the replica to start reading the rdb
                        # (use the log file, since while loading the replica only responds to INFO once per 2mb read)
                        set res [wait_for_log_messages -1 {"*Loading DB in memory*"} $loglines 2000 1]
                        set loglines [lindex $res 1]

                        # add some additional random sleep so that we kill the link at a different point each time
                        after [expr {int(rand()*50)}]

                        # kill the replica connection on the master
                        set killed [$master client kill type replica]

                        set res [wait_for_log_messages -1 {"*Internal error in RDB*" "*Finished with success*" "*Successful partial resynchronization*"} $loglines 1000 1]
                        if {$::verbose} { puts $res }
                        set log_text [lindex $res 0]
                        set loglines [lindex $res 1]
                        if {![string match "*Internal error in RDB*" $log_text]} {
                            # force the replica to try another full sync
                            $master multi
                            $master client kill type replica
                            $master set asdf asdf
                            # the side effect of resizing the backlog is that it is flushed (16k is the min size)
                            $master config set repl-backlog-size [expr {16384 + $i}]
                            $master exec
                        }
                        # wait for loading to stop (fail)
                        wait_for_condition 1000 1 {
                            [s -1 loading] eq 0
                        } else {
                            fail "Replica didn't disconnect"
                        }
                    }

                    if {$::verbose} {
                        set end [clock clicks -milliseconds]
                        set duration [expr $end - $start]
                        puts "test took $duration ms"
                    }

                    # enable fast shutdown
                    $master config set rdb-key-save-delay 0
                }
            }
        }
    }
}