redict/tests/integration/psync2.tcl
2020-05-17 18:24:34 +02:00

410 lines
16 KiB
Tcl

start_server {tags {"psync2"}} {
start_server {} {
start_server {} {
start_server {} {
start_server {} {
set master_id 0 ; # Current master
set start_time [clock seconds] ; # Test start time
set counter_value 0 ; # Current value of the Redis counter "x"
# Config
set debug_msg 0 ; # Enable additional debug messages
set no_exit 0 ; # Do not exit at end of the test
set duration 40 ; # Total test seconds
set genload 1 ; # Load master with writes at every cycle
set genload_time 5000 ; # Writes duration time in ms
set disconnect 1 ; # Break replication link between random
# master and slave instances while the
# master is loaded with writes.
set disconnect_period 1000 ; # Disconnect repl link every N ms.
for {set j 0} {$j < 5} {incr j} {
set R($j) [srv [expr 0-$j] client]
set R_host($j) [srv [expr 0-$j] host]
set R_port($j) [srv [expr 0-$j] port]
if {$debug_msg} {puts "Log file: [srv [expr 0-$j] stdout]"}
}
set cycle 1
while {([clock seconds]-$start_time) < $duration} {
test "PSYNC2: --- CYCLE $cycle ---" {}
incr cycle
# Create a random replication layout.
# Start with switching master (this simulates a failover).
# 1) Select the new master.
set master_id [randomInt 5]
set used [list $master_id]
test "PSYNC2: \[NEW LAYOUT\] Set #$master_id as master" {
$R($master_id) slaveof no one
$R($master_id) config set repl-ping-replica-period 1 ;# increse the chance that random ping will cause issues
if {$counter_value == 0} {
$R($master_id) set x $counter_value
}
}
# 2) Attach all the slaves to a random instance
while {[llength $used] != 5} {
while 1 {
set slave_id [randomInt 5]
if {[lsearch -exact $used $slave_id] == -1} break
}
set rand [randomInt [llength $used]]
set mid [lindex $used $rand]
set master_host $R_host($mid)
set master_port $R_port($mid)
test "PSYNC2: Set #$slave_id to replicate from #$mid" {
$R($slave_id) slaveof $master_host $master_port
}
lappend used $slave_id
}
# Wait for replicas to sync. so next loop won't get -LOADING error
wait_for_condition 50 1000 {
[status $R([expr {($master_id+1)%5}]) master_link_status] == "up" &&
[status $R([expr {($master_id+2)%5}]) master_link_status] == "up" &&
[status $R([expr {($master_id+3)%5}]) master_link_status] == "up" &&
[status $R([expr {($master_id+4)%5}]) master_link_status] == "up"
} else {
fail "Replica not reconnecting"
}
# 3) Increment the counter and wait for all the instances
# to converge.
test "PSYNC2: cluster is consistent after failover" {
$R($master_id) incr x; incr counter_value
for {set j 0} {$j < 5} {incr j} {
wait_for_condition 50 1000 {
[$R($j) get x] == $counter_value
} else {
fail "Instance #$j x variable is inconsistent"
}
}
}
# 4) Generate load while breaking the connection of random
# slave-master pairs.
test "PSYNC2: generate load while killing replication links" {
set t [clock milliseconds]
set next_break [expr {$t+$disconnect_period}]
while {[clock milliseconds]-$t < $genload_time} {
if {$genload} {
$R($master_id) incr x; incr counter_value
}
if {[clock milliseconds] == $next_break} {
set next_break \
[expr {[clock milliseconds]+$disconnect_period}]
set slave_id [randomInt 5]
if {$disconnect} {
$R($slave_id) client kill type master
if {$debug_msg} {
puts "+++ Breaking link for replica #$slave_id"
}
}
}
}
}
# 5) Increment the counter and wait for all the instances
set x [$R($master_id) get x]
test "PSYNC2: cluster is consistent after load (x = $x)" {
for {set j 0} {$j < 5} {incr j} {
wait_for_condition 50 1000 {
[$R($j) get x] == $counter_value
} else {
fail "Instance #$j x variable is inconsistent"
}
}
}
# wait for all the slaves to be in sync.
set masteroff [status $R($master_id) master_repl_offset]
wait_for_condition 500 100 {
[status $R(0) master_repl_offset] >= $masteroff &&
[status $R(1) master_repl_offset] >= $masteroff &&
[status $R(2) master_repl_offset] >= $masteroff &&
[status $R(3) master_repl_offset] >= $masteroff &&
[status $R(4) master_repl_offset] >= $masteroff
} else {
puts "Master ID is $master_id"
for {set j 0} {$j < 5} {incr j} {
puts "$j: sync_full: [status $R($j) sync_full]"
puts "$j: id1 : [status $R($j) master_replid]:[status $R($j) master_repl_offset]"
puts "$j: id2 : [status $R($j) master_replid2]:[status $R($j) second_repl_offset]"
puts "$j: backlog : firstbyte=[status $R($j) repl_backlog_first_byte_offset] len=[status $R($j) repl_backlog_histlen]"
puts "---"
}
fail "Replicas offsets didn't catch up with the master after too long time."
}
if {$debug_msg} {
puts "Master ID is $master_id"
for {set j 0} {$j < 5} {incr j} {
puts "$j: sync_full: [status $R($j) sync_full]"
puts "$j: id1 : [status $R($j) master_replid]:[status $R($j) master_repl_offset]"
puts "$j: id2 : [status $R($j) master_replid2]:[status $R($j) second_repl_offset]"
puts "$j: backlog : firstbyte=[status $R($j) repl_backlog_first_byte_offset] len=[status $R($j) repl_backlog_histlen]"
puts "---"
}
}
test "PSYNC2: total sum of full synchronizations is exactly 4" {
set sum 0
for {set j 0} {$j < 5} {incr j} {
incr sum [status $R($j) sync_full]
}
assert {$sum == 4}
}
# In absence of pings, are the instances really able to have
# the exact same offset?
$R($master_id) config set repl-ping-replica-period 3600
wait_for_condition 500 100 {
[status $R($master_id) master_repl_offset] == [status $R(0) master_repl_offset] &&
[status $R($master_id) master_repl_offset] == [status $R(1) master_repl_offset] &&
[status $R($master_id) master_repl_offset] == [status $R(2) master_repl_offset] &&
[status $R($master_id) master_repl_offset] == [status $R(3) master_repl_offset] &&
[status $R($master_id) master_repl_offset] == [status $R(4) master_repl_offset]
} else {
puts "Master ID is $master_id"
for {set j 0} {$j < 5} {incr j} {
puts "$j: sync_full: [status $R($j) sync_full]"
puts "$j: id1 : [status $R($j) master_replid]:[status $R($j) master_repl_offset]"
puts "$j: id2 : [status $R($j) master_replid2]:[status $R($j) second_repl_offset]"
puts "$j: backlog : firstbyte=[status $R($j) repl_backlog_first_byte_offset] len=[status $R($j) repl_backlog_histlen]"
puts "---"
}
fail "Replicas and master offsets were unable to match *exactly*."
}
$R($master_id) config set repl-ping-replica-period 10
# Limit anyway the maximum number of cycles. This is useful when the
# test is skipped via --only option of the test suite. In that case
# we don't want to see many seconds of this test being just skipped.
if {$cycle > 50} break
}
test "PSYNC2: Bring the master back again for next test" {
$R($master_id) slaveof no one
set master_host $R_host($master_id)
set master_port $R_port($master_id)
for {set j 0} {$j < 5} {incr j} {
if {$j == $master_id} continue
$R($j) slaveof $master_host $master_port
}
# Wait for replicas to sync. it is not enough to just wait for connected_slaves==4
# since we might do the check before the master realized that they're disconnected
wait_for_condition 50 1000 {
[status $R($master_id) connected_slaves] == 4 &&
[status $R([expr {($master_id+1)%5}]) master_link_status] == "up" &&
[status $R([expr {($master_id+2)%5}]) master_link_status] == "up" &&
[status $R([expr {($master_id+3)%5}]) master_link_status] == "up" &&
[status $R([expr {($master_id+4)%5}]) master_link_status] == "up"
} else {
fail "Replica not reconnecting"
}
}
test "PSYNC2: Partial resync after restart using RDB aux fields" {
# Pick a random slave
set slave_id [expr {($master_id+1)%5}]
set sync_count [status $R($master_id) sync_full]
set sync_partial [status $R($master_id) sync_partial_ok]
set sync_partial_err [status $R($master_id) sync_partial_err]
catch {
$R($slave_id) config rewrite
$R($slave_id) debug restart
}
# note: just waiting for connected_slaves==4 has a race condition since
# we might do the check before the master realized that the slave disconnected
wait_for_condition 50 1000 {
[status $R($master_id) sync_partial_ok] == $sync_partial + 1
} else {
puts "prev sync_full: $sync_count"
puts "prev sync_partial_ok: $sync_partial"
puts "prev sync_partial_err: $sync_partial_err"
puts [$R($master_id) info stats]
fail "Replica didn't partial sync"
}
set new_sync_count [status $R($master_id) sync_full]
assert {$sync_count == $new_sync_count}
}
test "PSYNC2: Replica RDB restart with EVALSHA in backlog issue #4483" {
# Pick a random slave
set slave_id [expr {($master_id+1)%5}]
set sync_count [status $R($master_id) sync_full]
# Make sure to replicate the first EVAL while the salve is online
# so that it's part of the scripts the master believes it's safe
# to propagate as EVALSHA.
$R($master_id) EVAL {return redis.call("incr","__mycounter")} 0
$R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
# Wait for the two to sync
wait_for_condition 50 1000 {
[$R($master_id) debug digest] == [$R($slave_id) debug digest]
} else {
fail "Replica not reconnecting"
}
# Prevent the slave from receiving master updates, and at
# the same time send a new script several times to the
# master, so that we'll end with EVALSHA into the backlog.
$R($slave_id) slaveof 127.0.0.1 0
$R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
$R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
$R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
catch {
$R($slave_id) config rewrite
$R($slave_id) debug restart
}
# Reconfigure the slave correctly again, when it's back online.
set retry 50
while {$retry} {
if {[catch {
$R($slave_id) slaveof $master_host $master_port
}]} {
after 1000
} else {
break
}
incr retry -1
}
# The master should be back at 4 slaves eventually
wait_for_condition 50 1000 {
[status $R($master_id) connected_slaves] == 4
} else {
fail "Replica not reconnecting"
}
set new_sync_count [status $R($master_id) sync_full]
assert {$sync_count == $new_sync_count}
# However if the slave started with the full state of the
# scripting engine, we should now have the same digest.
wait_for_condition 50 1000 {
[$R($master_id) debug digest] == [$R($slave_id) debug digest]
} else {
fail "Debug digest mismatch between master and replica in post-restart handshake"
}
}
if {$no_exit} {
while 1 { puts -nonewline .; flush stdout; after 1000}
}
}}}}}
start_server {tags {"psync2"}} {
start_server {} {
start_server {} {
start_server {} {
start_server {} {
test {pings at the end of replication stream are ignored for psync} {
set master [srv -4 client]
set master_host [srv -4 host]
set master_port [srv -4 port]
set replica1 [srv -3 client]
set replica2 [srv -2 client]
set replica3 [srv -1 client]
set replica4 [srv -0 client]
$replica1 replicaof $master_host $master_port
$replica2 replicaof $master_host $master_port
$replica3 replicaof $master_host $master_port
$replica4 replicaof $master_host $master_port
wait_for_condition 50 1000 {
[status $master connected_slaves] == 4
} else {
fail "replicas didn't connect"
}
$master incr x
wait_for_condition 50 1000 {
[$replica1 get x] == 1 && [$replica2 get x] == 1 &&
[$replica3 get x] == 1 && [$replica4 get x] == 1
} else {
fail "replicas didn't get incr"
}
# disconnect replica1 and replica2
# and wait for the master to send a ping to replica3 and replica4
$replica1 replicaof no one
$replica2 replicaof 127.0.0.1 1 ;# we can't promote it to master since that will cycle the replication id
$master config set repl-ping-replica-period 1
after 1500
# make everyone sync from the replica1 that didn't get the last ping from the old master
# replica4 will keep syncing from the old master which now syncs from replica1
# and replica2 will re-connect to the old master (which went back in time)
set new_master_host [srv -3 host]
set new_master_port [srv -3 port]
$replica3 replicaof $new_master_host $new_master_port
$master replicaof $new_master_host $new_master_port
$replica2 replicaof $master_host $master_port
wait_for_condition 50 1000 {
[status $replica2 master_link_status] == "up" &&
[status $replica3 master_link_status] == "up" &&
[status $replica4 master_link_status] == "up" &&
[status $master master_link_status] == "up"
} else {
fail "replicas didn't connect"
}
# make sure replication is still alive and kicking
$replica1 incr x
wait_for_condition 50 1000 {
[$replica2 get x] == 2 &&
[$replica3 get x] == 2 &&
[$replica4 get x] == 2 &&
[$master get x] == 2
} else {
fail "replicas didn't get incr"
}
# make sure there are full syncs other than the initial ones
assert_equal [status $master sync_full] 4
assert_equal [status $replica1 sync_full] 0
assert_equal [status $replica2 sync_full] 0
assert_equal [status $replica3 sync_full] 0
assert_equal [status $replica4 sync_full] 0
# force psync
$master client kill type master
$replica2 client kill type master
$replica3 client kill type master
$replica4 client kill type master
# make sure replication is still alive and kicking
$replica1 incr x
wait_for_condition 50 1000 {
[$replica2 get x] == 3 &&
[$replica3 get x] == 3 &&
[$replica4 get x] == 3 &&
[$master get x] == 3
} else {
fail "replicas didn't get incr"
}
# make sure there are full syncs other than the initial ones
assert_equal [status $master sync_full] 4
assert_equal [status $replica1 sync_full] 0
assert_equal [status $replica2 sync_full] 0
assert_equal [status $replica3 sync_full] 0
assert_equal [status $replica4 sync_full] 0
}
}}}}}