From 592419b4ca08fbf125b2ff2afd3289ef3d7b6a68 Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Tue, 26 Jan 2016 12:32:53 +0100
Subject: [PATCH] Better address udpate strategy when processing gossip
 sections.

The change covers the case where:

1. There is a node we can't reach (in fail or pfail state).
2. We see a different address for this node, in the gossip section sent
to us by a node that, instead, is able to talk with the node we cannot
talk to.

In this case it's a good bet to switch to the address reported by this
node, since there was an address switch and it is able to talk with the
node and we are not.

However previosuly this was done in a dangerous way, by initiating an
handshake. The handshake, using the MEET packet, forces the receiver to
join our cluster, and this is not a good idea. If the node in question
really just switched address, but is the same node, it already knows about
us, so we just need to perform an address update and a reconnection.

So with this commit instead we just update the address of the node,
release the node link if any, and attempt to reconnect in the next
clusterCron() cycle.

The commit also improves debugging messages printed by Cluster during
address or ID switches.
---
 src/cluster.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index 80c81cf9f..70de1c518 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -531,6 +531,7 @@ void clusterReset(int hard) {
         sdsfree(oldname);
         getRandomHexChars(myself->name, CLUSTER_NAMELEN);
         clusterAddNode(myself);
+        serverLog(LL_NOTICE,"Node hard reset, now I'm %.40s", myself->name);
     }
 
     /* Make sure to persist the new config and update the state. */
@@ -1330,14 +1331,19 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
             }
 
             /* If we already know this node, but it is not reachable, and
-             * we see a different address in the gossip section, start an
-             * handshake with the (possibly) new address: this will result
-             * into a node address update if the handshake will be
-             * successful. */
+             * we see a different address in the gossip section of a node that
+             * can talk with this other node, update the address, disconnect
+             * the old link if any, so that we'll attempt to connect with the
+             * new address. */
             if (node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL) &&
+                !(flags & CLUSTER_NODE_NOADDR) &&
+                !(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) &&
                 (strcasecmp(node->ip,g->ip) || node->port != ntohs(g->port)))
             {
-                clusterStartHandshake(g->ip,ntohs(g->port));
+                if (node->link) freeClusterLink(node->link);
+                memcpy(node->ip,g->ip,NET_IP_STR_LEN);
+                node->port = g->port;
+                node->flags &= ~CLUSTER_NODE_NOADDR;
             }
         } else {
             /* If it's not in NOADDR state and we don't have it, we
@@ -1710,7 +1716,10 @@ int clusterProcessPacket(clusterLink *link) {
                 /* If the reply has a non matching node ID we
                  * disconnect this node and set it as not having an associated
                  * address. */
-                serverLog(LL_DEBUG,"PONG contains mismatching sender ID");
+                serverLog(LL_WARNING,"PONG contains mismatching sender ID. About node %.40s added %d ms ago, having flags %d",
+                    link->node->name,
+                    (int)(mstime()-(link->node->ctime)),
+                    link->node->flags);
                 link->node->flags |= CLUSTER_NODE_NOADDR;
                 link->node->ip[0] = '\0';
                 link->node->port = 0;