From 592419b4ca08fbf125b2ff2afd3289ef3d7b6a68 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 26 Jan 2016 12:32:53 +0100 Subject: [PATCH] Better address udpate strategy when processing gossip sections. The change covers the case where: 1. There is a node we can't reach (in fail or pfail state). 2. We see a different address for this node, in the gossip section sent to us by a node that, instead, is able to talk with the node we cannot talk to. In this case it's a good bet to switch to the address reported by this node, since there was an address switch and it is able to talk with the node and we are not. However previosuly this was done in a dangerous way, by initiating an handshake. The handshake, using the MEET packet, forces the receiver to join our cluster, and this is not a good idea. If the node in question really just switched address, but is the same node, it already knows about us, so we just need to perform an address update and a reconnection. So with this commit instead we just update the address of the node, release the node link if any, and attempt to reconnect in the next clusterCron() cycle. The commit also improves debugging messages printed by Cluster during address or ID switches. --- src/cluster.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 80c81cf9f..70de1c518 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -531,6 +531,7 @@ void clusterReset(int hard) { sdsfree(oldname); getRandomHexChars(myself->name, CLUSTER_NAMELEN); clusterAddNode(myself); + serverLog(LL_NOTICE,"Node hard reset, now I'm %.40s", myself->name); } /* Make sure to persist the new config and update the state. */ @@ -1330,14 +1331,19 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { } /* If we already know this node, but it is not reachable, and - * we see a different address in the gossip section, start an - * handshake with the (possibly) new address: this will result - * into a node address update if the handshake will be - * successful. */ + * we see a different address in the gossip section of a node that + * can talk with this other node, update the address, disconnect + * the old link if any, so that we'll attempt to connect with the + * new address. */ if (node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL) && + !(flags & CLUSTER_NODE_NOADDR) && + !(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) && (strcasecmp(node->ip,g->ip) || node->port != ntohs(g->port))) { - clusterStartHandshake(g->ip,ntohs(g->port)); + if (node->link) freeClusterLink(node->link); + memcpy(node->ip,g->ip,NET_IP_STR_LEN); + node->port = g->port; + node->flags &= ~CLUSTER_NODE_NOADDR; } } else { /* If it's not in NOADDR state and we don't have it, we @@ -1710,7 +1716,10 @@ int clusterProcessPacket(clusterLink *link) { /* If the reply has a non matching node ID we * disconnect this node and set it as not having an associated * address. */ - serverLog(LL_DEBUG,"PONG contains mismatching sender ID"); + serverLog(LL_WARNING,"PONG contains mismatching sender ID. About node %.40s added %d ms ago, having flags %d", + link->node->name, + (int)(mstime()-(link->node->ctime)), + link->node->flags); link->node->flags |= CLUSTER_NODE_NOADDR; link->node->ip[0] = '\0'; link->node->port = 0;