Cluster: link reconnection on delayed PONG reply.

When the PONG delay exceeds half the cluster node timeout, the link gets
disconnected (and later automatically reconnected) in order to ensure
that it's not just a dead connection issue.

However, this operation is only performed if the link is old enough, in
order to avoid disconnecting the same link again and again (and, among
other problems, never receiving the PONG because of that).

Note: when the link is reconnected, the 'ping_sent' field is not updated
even if a new ping is sent using the new connection, so we can still
reliably detect a node ping timeout.
This commit is contained in:
antirez 2013-05-03 12:37:45 +02:00
parent 1315b9f246
commit 5c9f6d4f55
2 changed files with 4 additions and 3 deletions

View File

@ -268,6 +268,7 @@ void clusterInit(void) {
clusterLink *createClusterLink(clusterNode *node) { clusterLink *createClusterLink(clusterNode *node) {
clusterLink *link = zmalloc(sizeof(*link)); clusterLink *link = zmalloc(sizeof(*link));
link->ctime = time(NULL);
link->sndbuf = sdsempty(); link->sndbuf = sdsempty();
link->rcvbuf = sdsempty(); link->rcvbuf = sdsempty();
link->node = node; link->node = node;
@ -1596,21 +1597,20 @@ void clusterCron(void) {
(REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE)) (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE))
continue; continue;
#if 0
/* If we are waiting for the PONG more than half the cluster /* If we are waiting for the PONG more than half the cluster
* timeout, reconnect the link: maybe there is a connection * timeout, reconnect the link: maybe there is a connection
* issue even if the node is alive. */ * issue even if the node is alive. */
if (node->link && /* is connected */ if (node->link && /* is connected */
time(NULL) - node->link->ctime >
server.cluster_node_timeout && /* was not already reconnected */
node->ping_sent && /* we already sent a ping */ node->ping_sent && /* we already sent a ping */
node->pong_received < node->ping_sent && /* still waiting pong */ node->pong_received < node->ping_sent && /* still waiting pong */
/* and we are waiting for the pong more than timeout/2 */ /* and we are waiting for the pong more than timeout/2 */
now - node->ping_sent > server.cluster_node_timeout/2) now - node->ping_sent > server.cluster_node_timeout/2)
{ {
/* Disconnect the link, it will be reconnected automatically. */ /* Disconnect the link, it will be reconnected automatically. */
printf("DISCONNECT!\n");
freeClusterLink(node->link); freeClusterLink(node->link);
} }
#endif
/* If we have currently no active ping in this instance, and the /* If we have currently no active ping in this instance, and the
* received PONG is older than half the cluster timeout, send * received PONG is older than half the cluster timeout, send

View File

@ -538,6 +538,7 @@ struct clusterNode;
/* clusterLink encapsulates everything needed to talk with a remote node. */ /* clusterLink encapsulates everything needed to talk with a remote node. */
typedef struct clusterLink { typedef struct clusterLink {
time_t ctime; /* Link creation time */
int fd; /* TCP socket file descriptor */ int fd; /* TCP socket file descriptor */
sds sndbuf; /* Packet send buffer */ sds sndbuf; /* Packet send buffer */
sds rcvbuf; /* Packet reception buffer */ sds rcvbuf; /* Packet reception buffer */