From a2c938c83450c8be4e4c39874fd6f467b86db273 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 20 Dec 2013 09:56:18 +0100 Subject: [PATCH] Redis Cluster: delay state change when in the majority again. As specified in the Redis Cluster specification, when a node can reach the majority again after a period in which it was partitioend away with the minorty of masters, wait some time before accepting queries, to provide a reasonable amount of time for other nodes to upgrade its configuration. This lowers the probabilities of both a client and a master with not updated configuration to rejoin the cluster at the same time, with a stale master accepting writes. --- src/cluster.c | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 75a4e59d2..090dbf985 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2205,20 +2205,24 @@ int clusterDelNodeSlots(clusterNode *node) { /* ----------------------------------------------------------------------------- * Cluster state evaluation function * -------------------------------------------------------------------------- */ + +#define REDIS_CLUSTER_MAX_REJOIN_DELAY 5000 + void clusterUpdateState(void) { - int j, initial_state = server.cluster->state; + int j, new_state; int unreachable_masters = 0; + static mstime_t among_minority_time; /* Start assuming the state is OK. We'll turn it into FAIL if there * are the right conditions. */ - server.cluster->state = REDIS_CLUSTER_OK; + new_state = REDIS_CLUSTER_OK; /* Check if all the slots are covered. */ for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (REDIS_NODE_FAIL)) { - server.cluster->state = REDIS_CLUSTER_FAIL; + new_state = REDIS_CLUSTER_FAIL; break; } } @@ -2248,24 +2252,39 @@ void clusterUpdateState(void) { /* If we can't reach at least half the masters, change the cluster state * to FAIL, as we are not even able to mark nodes as FAIL in this side - * of the netsplit because of lack of majority. - * - * TODO: when this condition is entered, we should not undo it for some - * (small) time after the majority is reachable again, to make sure that - * other nodes have enough time to inform this node of a configuration change. - * Otherwise a client with an old routing table may write to this node - * and later it may turn into a slave losing the write. */ + * of the netsplit because of lack of majority. */ { int needed_quorum = (server.cluster->size / 2) + 1; - if (unreachable_masters >= needed_quorum) - server.cluster->state = REDIS_CLUSTER_FAIL; + if (unreachable_masters >= needed_quorum) { + new_state = REDIS_CLUSTER_FAIL; + among_minority_time = mstime(); + } } /* Log a state change */ - if (initial_state != server.cluster->state) + if (new_state != server.cluster->state) { + mstime_t rejoin_delay = server.cluster_node_timeout; + + /* If the instance is a master and was partitioned away with the + * minority, don't let it accept queries for some time after the + * partition heals, to make sure there is enough time to receive + * a configuration update. */ + if (rejoin_delay > REDIS_CLUSTER_MAX_REJOIN_DELAY) + rejoin_delay = REDIS_CLUSTER_MAX_REJOIN_DELAY; + + if (new_state == REDIS_CLUSTER_OK && + server.cluster->myself->flags & REDIS_NODE_MASTER && + mstime() - among_minority_time < rejoin_delay) + { + return; + } + + /* Change the state and log the event. */ redisLog(REDIS_WARNING,"Cluster state changed: %s", - server.cluster->state == REDIS_CLUSTER_OK ? "ok" : "fail"); + new_state == REDIS_CLUSTER_OK ? "ok" : "fail"); + server.cluster->state = new_state; + } } /* This function is called after the node startup in order to verify that data