From b53c7f2c0bb8692fd30c59348190c2bd897958a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Sat, 2 Apr 2022 23:58:07 +0200 Subject: [PATCH] Turn into replica on SETSLOT (#10489) * Fix race condition where node loses its last slot and turns into replica When a node has lost its last slot and finds out from the SETSLOT command before the cluster bus PONG from the new owner arrives. In this case, the node didn't turn itself into a replica of the new slot owner. This commit adds the same logic to the SETSLOT command as already exists for the cluster bus PONG processing. * Revert "Fix new / failing cluster slot migration test (#10482)" This reverts commit 0b21ef8d49c47a5dd47a0bcc0cf1b2c235f24fd0. In this test, the old slot owner finds out that it has lost its last slot in a nondeterministic way. Either the cluster bus PONG from the new slot owner and sometimes in a SETSLOT command from redis-cli. In both cases, the result should be the same and the old owner should turn itself into a replica of the new slot owner. --- src/cluster.c | 28 +++++++++++++++++++ .../cluster/tests/12-replica-migration-2.tcl | 7 +++-- tests/unit/cluster.tcl | 8 ++++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 0a2e46623..6b5fb8d1a 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1899,6 +1899,17 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| CLUSTER_TODO_UPDATE_STATE| CLUSTER_TODO_FSYNC_CONFIG); + } else if (myself->slaveof && myself->slaveof->slaveof) { + /* Safeguard against sub-replicas. A replica's master can turn itself + * into a replica if its last slot is removed. If no other node takes + * over the slot, there is nothing else to trigger replica migration. */ + serverLog(LL_WARNING, + "I'm a sub-replica! Reconfiguring myself as a replica of grandmaster %.40s", + myself->slaveof->slaveof->name); + clusterSetMaster(myself->slaveof->slaveof); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); } else if (dirty_slots_count) { /* If we are here, we received an update message which removed * ownership for certain slots we still have keys about, but still @@ -5403,9 +5414,26 @@ NULL server.cluster->migrating_slots_to[slot]) server.cluster->migrating_slots_to[slot] = NULL; + int slot_was_mine = server.cluster->slots[slot] == myself; clusterDelSlot(slot); clusterAddSlot(n,slot); + /* If we are a master left without slots, we should turn into a + * replica of the new master. */ + if (slot_was_mine && + n != myself && + myself->numslots == 0 && + server.cluster_allow_replica_migration) + { + serverLog(LL_WARNING, + "Configuration change detected. Reconfiguring myself " + "as a replica of %.40s", n->name); + clusterSetMaster(n); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | + CLUSTER_TODO_UPDATE_STATE | + CLUSTER_TODO_FSYNC_CONFIG); + } + /* If this node was importing this slot, assigning the slot to * itself also clears the importing status. */ if (n == myself && diff --git a/tests/cluster/tests/12-replica-migration-2.tcl b/tests/cluster/tests/12-replica-migration-2.tcl index f0493e57e..ed680061c 100644 --- a/tests/cluster/tests/12-replica-migration-2.tcl +++ b/tests/cluster/tests/12-replica-migration-2.tcl @@ -45,11 +45,12 @@ test "Resharding all the master #0 slots away from it" { } -test "Master #0 should lose its replicas" { +test "Master #0 who lost all slots should turn into a replica without replicas" { wait_for_condition 1000 50 { - [llength [lindex [R 0 role] 2]] == 0 + [RI 0 role] == "slave" && [RI 0 connected_slaves] == 0 } else { - fail "Master #0 still has replicas" + puts [R 0 info replication] + fail "Master #0 didn't turn itself into a replica" } } diff --git a/tests/unit/cluster.tcl b/tests/unit/cluster.tcl index 7a9402efd..48ff92536 100644 --- a/tests/unit/cluster.tcl +++ b/tests/unit/cluster.tcl @@ -300,6 +300,14 @@ test {Migrate the last slot away from a node using redis-cli} { # Check that the key foo has been migrated back to the original owner. catch { $newnode_r get foo } e assert_equal "MOVED $slot $owner_host:$owner_port" $e + + # Check that the empty node has turned itself into a replica of the new + # owner and that the new owner knows that. + wait_for_condition 1000 50 { + [string match "*slave*" [$owner_r CLUSTER REPLICAS $owner_id]] + } else { + fail "Empty node didn't turn itself into a replica." + } } }