2012-11-08 18:25:23 +01:00
|
|
|
/* Redis Cluster implementation.
|
|
|
|
*
|
|
|
|
* Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
*
|
|
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
|
|
* this list of conditions and the following disclaimer.
|
|
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* * Neither the name of Redis nor the names of its contributors may be used
|
|
|
|
* to endorse or promote products derived from this software without
|
|
|
|
* specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
#include "redis.h"
|
2013-10-09 15:37:20 +02:00
|
|
|
#include "cluster.h"
|
2012-04-02 13:10:39 +02:00
|
|
|
#include "endianconv.h"
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2011-09-20 00:00:14 +01:00
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/socket.h>
|
2011-03-29 17:51:15 +02:00
|
|
|
#include <arpa/inet.h>
|
2011-03-30 14:58:19 +02:00
|
|
|
#include <fcntl.h>
|
|
|
|
#include <unistd.h>
|
2012-04-24 11:11:55 +02:00
|
|
|
#include <sys/socket.h>
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2013-10-09 15:37:20 +02:00
|
|
|
clusterNode *createClusterNode(char *nodename, int flags);
|
|
|
|
int clusterAddNode(clusterNode *node);
|
2011-03-29 17:51:15 +02:00
|
|
|
void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
|
|
|
|
void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask);
|
|
|
|
void clusterSendPing(clusterLink *link, int type);
|
|
|
|
void clusterSendFail(char *nodename);
|
2013-09-20 09:22:21 +02:00
|
|
|
void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request);
|
2011-03-29 17:51:15 +02:00
|
|
|
void clusterUpdateState(void);
|
|
|
|
int clusterNodeGetSlotBit(clusterNode *n, int slot);
|
2013-09-04 10:25:26 +02:00
|
|
|
sds clusterGenNodesDescription(int filter);
|
2011-04-07 17:46:28 +02:00
|
|
|
clusterNode *clusterLookupNode(char *name);
|
|
|
|
int clusterNodeAddSlave(clusterNode *master, clusterNode *slave);
|
2011-04-07 21:34:41 +02:00
|
|
|
int clusterAddSlot(clusterNode *n, int slot);
|
2013-02-21 11:44:58 +01:00
|
|
|
int clusterDelSlot(int slot);
|
2013-03-15 16:35:16 +01:00
|
|
|
int clusterDelNodeSlots(clusterNode *node);
|
2013-02-22 17:45:49 +01:00
|
|
|
int clusterNodeSetSlotBit(clusterNode *n, int slot);
|
2013-03-20 00:30:47 +01:00
|
|
|
void clusterSetMaster(clusterNode *n);
|
2013-09-26 16:54:43 +02:00
|
|
|
void clusterHandleSlaveFailover(void);
|
2013-02-28 15:41:54 +01:00
|
|
|
int bitmapTestBit(unsigned char *bitmap, int pos);
|
2013-10-03 09:55:20 +02:00
|
|
|
void clusterDoBeforeSleep(int flags);
|
2013-11-08 16:26:50 +01:00
|
|
|
void clusterSendUpdate(clusterLink *link, clusterNode *node);
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* Initialization
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
2013-09-25 11:47:13 +02:00
|
|
|
/* This function is called at startup in order to set the currentEpoch
|
|
|
|
* (which is not saved on permanent storage) to the greatest configEpoch found
|
|
|
|
* in the loaded nodes (configEpoch is stored on permanent storage as soon as
|
|
|
|
* it changes for some node). */
|
|
|
|
void clusterSetStartupEpoch() {
|
|
|
|
dictIterator *di;
|
|
|
|
dictEntry *de;
|
|
|
|
|
|
|
|
di = dictGetSafeIterator(server.cluster->nodes);
|
|
|
|
while((de = dictNext(di)) != NULL) {
|
|
|
|
clusterNode *node = dictGetVal(de);
|
|
|
|
if (node->configEpoch > server.cluster->currentEpoch)
|
|
|
|
server.cluster->currentEpoch = node->configEpoch;
|
|
|
|
}
|
|
|
|
dictReleaseIterator(di);
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
int clusterLoadConfig(char *filename) {
|
|
|
|
FILE *fp = fopen(filename,"r");
|
2011-04-07 12:55:02 +02:00
|
|
|
char *line;
|
2011-04-07 17:46:28 +02:00
|
|
|
int maxline, j;
|
2011-03-30 14:58:19 +02:00
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
if (fp == NULL) return REDIS_ERR;
|
2011-04-07 12:55:02 +02:00
|
|
|
|
|
|
|
/* Parse the file. Note that single liens of the cluster config file can
|
|
|
|
* be really long as they include all the hash slots of the node.
|
2013-03-04 19:45:36 +01:00
|
|
|
* This means in the worst possible case, half of the Redis slots will be
|
|
|
|
* present in a single line, possibly in importing or migrating state, so
|
|
|
|
* together with the node ID of the sender/receiver.
|
|
|
|
*
|
|
|
|
* To simplify we allocate 1024+REDIS_CLUSTER_SLOTS*128 bytes per line. */
|
|
|
|
maxline = 1024+REDIS_CLUSTER_SLOTS*128;
|
2011-04-07 12:55:02 +02:00
|
|
|
line = zmalloc(maxline);
|
|
|
|
while(fgets(line,maxline,fp) != NULL) {
|
|
|
|
int argc;
|
|
|
|
sds *argv = sdssplitargs(line,&argc);
|
2013-02-21 01:22:24 -08:00
|
|
|
if (argv == NULL) goto fmterr;
|
|
|
|
|
2011-04-07 17:46:28 +02:00
|
|
|
clusterNode *n, *master;
|
|
|
|
char *p, *s;
|
|
|
|
|
|
|
|
/* Create this node if it does not exist */
|
|
|
|
n = clusterLookupNode(argv[0]);
|
|
|
|
if (!n) {
|
|
|
|
n = createClusterNode(argv[0],0);
|
|
|
|
clusterAddNode(n);
|
|
|
|
}
|
|
|
|
/* Address and port */
|
|
|
|
if ((p = strchr(argv[1],':')) == NULL) goto fmterr;
|
|
|
|
*p = '\0';
|
|
|
|
memcpy(n->ip,argv[1],strlen(argv[1])+1);
|
|
|
|
n->port = atoi(p+1);
|
|
|
|
|
|
|
|
/* Parse flags */
|
|
|
|
p = s = argv[2];
|
|
|
|
while(p) {
|
|
|
|
p = strchr(s,',');
|
|
|
|
if (p) *p = '\0';
|
|
|
|
if (!strcasecmp(s,"myself")) {
|
2013-02-14 13:20:56 +01:00
|
|
|
redisAssert(server.cluster->myself == NULL);
|
|
|
|
server.cluster->myself = n;
|
2011-04-07 17:46:28 +02:00
|
|
|
n->flags |= REDIS_NODE_MYSELF;
|
|
|
|
} else if (!strcasecmp(s,"master")) {
|
|
|
|
n->flags |= REDIS_NODE_MASTER;
|
|
|
|
} else if (!strcasecmp(s,"slave")) {
|
|
|
|
n->flags |= REDIS_NODE_SLAVE;
|
|
|
|
} else if (!strcasecmp(s,"fail?")) {
|
|
|
|
n->flags |= REDIS_NODE_PFAIL;
|
|
|
|
} else if (!strcasecmp(s,"fail")) {
|
|
|
|
n->flags |= REDIS_NODE_FAIL;
|
2013-10-09 16:18:33 +02:00
|
|
|
n->fail_time = mstime();
|
2011-04-07 17:46:28 +02:00
|
|
|
} else if (!strcasecmp(s,"handshake")) {
|
|
|
|
n->flags |= REDIS_NODE_HANDSHAKE;
|
|
|
|
} else if (!strcasecmp(s,"noaddr")) {
|
|
|
|
n->flags |= REDIS_NODE_NOADDR;
|
2011-04-07 19:04:16 +02:00
|
|
|
} else if (!strcasecmp(s,"noflags")) {
|
|
|
|
/* nothing to do */
|
2011-04-07 17:46:28 +02:00
|
|
|
} else {
|
|
|
|
redisPanic("Unknown flag in redis cluster config file");
|
|
|
|
}
|
|
|
|
if (p) s = p+1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Get master if any. Set the master and populate master's
|
|
|
|
* slave list. */
|
|
|
|
if (argv[3][0] != '-') {
|
|
|
|
master = clusterLookupNode(argv[3]);
|
|
|
|
if (!master) {
|
|
|
|
master = createClusterNode(argv[3],0);
|
|
|
|
clusterAddNode(master);
|
|
|
|
}
|
|
|
|
n->slaveof = master;
|
|
|
|
clusterNodeAddSlave(master,n);
|
|
|
|
}
|
|
|
|
|
2011-04-07 23:06:01 +02:00
|
|
|
/* Set ping sent / pong received timestamps */
|
2013-10-09 16:18:33 +02:00
|
|
|
if (atoi(argv[4])) n->ping_sent = mstime();
|
|
|
|
if (atoi(argv[5])) n->pong_received = mstime();
|
2011-04-07 23:06:01 +02:00
|
|
|
|
2013-09-25 11:47:13 +02:00
|
|
|
/* Set configEpoch for this node. */
|
|
|
|
n->configEpoch = strtoull(argv[6],NULL,10);
|
|
|
|
|
2011-04-07 17:46:28 +02:00
|
|
|
/* Populate hash slots served by this instance. */
|
2013-09-25 11:47:13 +02:00
|
|
|
for (j = 8; j < argc; j++) {
|
2011-04-07 17:46:28 +02:00
|
|
|
int start, stop;
|
|
|
|
|
2011-05-04 09:31:37 +02:00
|
|
|
if (argv[j][0] == '[') {
|
|
|
|
/* Here we handle migrating / importing slots */
|
|
|
|
int slot;
|
|
|
|
char direction;
|
|
|
|
clusterNode *cn;
|
|
|
|
|
|
|
|
p = strchr(argv[j],'-');
|
|
|
|
redisAssert(p != NULL);
|
|
|
|
*p = '\0';
|
|
|
|
direction = p[1]; /* Either '>' or '<' */
|
|
|
|
slot = atoi(argv[j]+1);
|
|
|
|
p += 3;
|
|
|
|
cn = clusterLookupNode(p);
|
|
|
|
if (!cn) {
|
|
|
|
cn = createClusterNode(p,0);
|
|
|
|
clusterAddNode(cn);
|
|
|
|
}
|
|
|
|
if (direction == '>') {
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->migrating_slots_to[slot] = cn;
|
2011-05-04 09:31:37 +02:00
|
|
|
} else {
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->importing_slots_from[slot] = cn;
|
2011-05-04 09:31:37 +02:00
|
|
|
}
|
|
|
|
continue;
|
|
|
|
} else if ((p = strchr(argv[j],'-')) != NULL) {
|
2011-04-07 17:46:28 +02:00
|
|
|
*p = '\0';
|
|
|
|
start = atoi(argv[j]);
|
|
|
|
stop = atoi(p+1);
|
|
|
|
} else {
|
|
|
|
start = stop = atoi(argv[j]);
|
|
|
|
}
|
|
|
|
while(start <= stop) clusterAddSlot(n, start++);
|
|
|
|
}
|
2011-04-07 12:55:02 +02:00
|
|
|
|
2013-03-06 12:36:07 +01:00
|
|
|
sdsfreesplitres(argv,argc);
|
2011-04-07 12:55:02 +02:00
|
|
|
}
|
|
|
|
zfree(line);
|
2011-03-29 17:51:15 +02:00
|
|
|
fclose(fp);
|
|
|
|
|
2011-04-07 12:55:02 +02:00
|
|
|
/* Config sanity check */
|
2013-02-14 13:20:56 +01:00
|
|
|
redisAssert(server.cluster->myself != NULL);
|
2011-03-29 17:51:15 +02:00
|
|
|
redisLog(REDIS_NOTICE,"Node configuration loaded, I'm %.40s",
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->myself->name);
|
2013-09-25 11:47:13 +02:00
|
|
|
clusterSetStartupEpoch();
|
2011-04-07 23:23:27 +02:00
|
|
|
clusterUpdateState();
|
2011-03-29 17:51:15 +02:00
|
|
|
return REDIS_OK;
|
|
|
|
|
|
|
|
fmterr:
|
2013-01-17 01:00:20 +08:00
|
|
|
redisLog(REDIS_WARNING,"Unrecoverable error: corrupted cluster config file.");
|
2011-03-29 17:51:15 +02:00
|
|
|
fclose(fp);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
2011-03-30 14:58:19 +02:00
|
|
|
/* Cluster node configuration is exactly the same as CLUSTER NODES output.
|
|
|
|
*
|
|
|
|
* This function writes the node config and returns 0, on error -1
|
|
|
|
* is returned. */
|
2013-10-03 09:55:20 +02:00
|
|
|
int clusterSaveConfig(int do_fsync) {
|
2013-09-04 10:25:26 +02:00
|
|
|
sds ci = clusterGenNodesDescription(REDIS_NODE_HANDSHAKE);
|
2011-03-30 14:58:19 +02:00
|
|
|
int fd;
|
|
|
|
|
2013-02-14 13:20:56 +01:00
|
|
|
if ((fd = open(server.cluster_configfile,O_WRONLY|O_CREAT|O_TRUNC,0644))
|
2011-04-07 12:55:02 +02:00
|
|
|
== -1) goto err;
|
2011-03-30 14:58:19 +02:00
|
|
|
if (write(fd,ci,sdslen(ci)) != (ssize_t)sdslen(ci)) goto err;
|
2013-10-03 09:55:20 +02:00
|
|
|
if (do_fsync) fsync(fd);
|
2011-03-30 14:58:19 +02:00
|
|
|
close(fd);
|
|
|
|
sdsfree(ci);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
err:
|
|
|
|
sdsfree(ci);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2013-10-03 09:55:20 +02:00
|
|
|
void clusterSaveConfigOrDie(int do_fsync) {
|
|
|
|
if (clusterSaveConfig(do_fsync) == -1) {
|
2011-03-30 17:41:13 +02:00
|
|
|
redisLog(REDIS_WARNING,"Fatal: can't update cluster config file.");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
void clusterInit(void) {
|
2013-08-22 14:05:07 +02:00
|
|
|
int saveconf = 0;
|
2011-03-30 16:51:28 +02:00
|
|
|
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster = zmalloc(sizeof(clusterState));
|
|
|
|
server.cluster->myself = NULL;
|
2013-09-25 11:47:13 +02:00
|
|
|
server.cluster->currentEpoch = 0;
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->state = REDIS_CLUSTER_FAIL;
|
2013-02-22 19:18:30 +01:00
|
|
|
server.cluster->size = 1;
|
2013-12-17 12:22:02 +01:00
|
|
|
server.cluster->todo_before_sleep = 0;
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->nodes = dictCreate(&clusterNodesDictType,NULL);
|
2013-11-29 17:37:06 +01:00
|
|
|
server.cluster->nodes_black_list =
|
|
|
|
dictCreate(&clusterNodesBlackListDictType,NULL);
|
2013-03-13 13:10:49 +01:00
|
|
|
server.cluster->failover_auth_time = 0;
|
|
|
|
server.cluster->failover_auth_count = 0;
|
2013-09-26 11:13:17 +02:00
|
|
|
server.cluster->failover_auth_epoch = 0;
|
2013-09-26 13:00:41 +02:00
|
|
|
server.cluster->last_vote_epoch = 0;
|
2013-10-02 10:10:08 +02:00
|
|
|
server.cluster->stats_bus_messages_sent = 0;
|
|
|
|
server.cluster->stats_bus_messages_received = 0;
|
2013-02-14 13:20:56 +01:00
|
|
|
memset(server.cluster->migrating_slots_to,0,
|
|
|
|
sizeof(server.cluster->migrating_slots_to));
|
|
|
|
memset(server.cluster->importing_slots_from,0,
|
|
|
|
sizeof(server.cluster->importing_slots_from));
|
|
|
|
memset(server.cluster->slots,0,
|
|
|
|
sizeof(server.cluster->slots));
|
|
|
|
if (clusterLoadConfig(server.cluster_configfile) == REDIS_ERR) {
|
2011-03-29 17:51:15 +02:00
|
|
|
/* No configuration found. We will just use the random name provided
|
|
|
|
* by the createClusterNode() function. */
|
2013-02-22 19:24:01 +01:00
|
|
|
server.cluster->myself =
|
|
|
|
createClusterNode(NULL,REDIS_NODE_MYSELF|REDIS_NODE_MASTER);
|
2011-03-29 17:51:15 +02:00
|
|
|
redisLog(REDIS_NOTICE,"No cluster configuration found, I'm %.40s",
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->myself->name);
|
|
|
|
clusterAddNode(server.cluster->myself);
|
2011-03-30 16:51:28 +02:00
|
|
|
saveconf = 1;
|
|
|
|
}
|
2013-10-03 09:55:20 +02:00
|
|
|
if (saveconf) clusterSaveConfigOrDie(1);
|
2013-08-22 14:05:07 +02:00
|
|
|
|
|
|
|
/* We need a listening TCP port for our cluster messaging needs. */
|
2013-07-05 11:47:20 +02:00
|
|
|
server.cfd_count = 0;
|
2013-08-22 14:05:07 +02:00
|
|
|
if (listenToPort(server.port+REDIS_CLUSTER_PORT_INCR,
|
|
|
|
server.cfd,&server.cfd_count) == REDIS_ERR)
|
|
|
|
{
|
|
|
|
exit(1);
|
2013-08-22 14:53:53 +02:00
|
|
|
} else {
|
|
|
|
int j;
|
|
|
|
|
|
|
|
for (j = 0; j < server.cfd_count; j++) {
|
|
|
|
if (aeCreateFileEvent(server.el, server.cfd[j], AE_READABLE,
|
|
|
|
clusterAcceptHandler, NULL) == AE_ERR)
|
|
|
|
redisPanic("Unrecoverable error creating Redis Cluster "
|
|
|
|
"file event.");
|
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
2013-08-22 14:05:07 +02:00
|
|
|
|
|
|
|
/* The slots -> keys map is a sorted set. Init it. */
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->slots_to_keys = zslCreate();
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* CLUSTER communication link
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
|
|
|
/* Allocate a new link associated with 'node' (which may be NULL). The
 * link starts with no socket (fd is -1), empty send and receive buffers,
 * and the creation time set to the current time in milliseconds. */
clusterLink *createClusterLink(clusterNode *node) {
    clusterLink *lnk = zmalloc(sizeof(*lnk));

    lnk->fd = -1;
    lnk->node = node;
    lnk->ctime = mstime();
    lnk->sndbuf = sdsempty();
    lnk->rcvbuf = sdsempty();
    return lnk;
}
|
|
|
|
|
|
|
|
/* Free a cluster link, but does not free the associated node of course.
|
2013-04-11 17:02:39 +02:00
|
|
|
* This function will just make sure that the original node associated
|
2011-03-29 17:51:15 +02:00
|
|
|
* with this link will have the 'link' field set to NULL. */
|
|
|
|
void freeClusterLink(clusterLink *link) {
|
|
|
|
if (link->fd != -1) {
|
|
|
|
aeDeleteFileEvent(server.el, link->fd, AE_WRITABLE);
|
|
|
|
aeDeleteFileEvent(server.el, link->fd, AE_READABLE);
|
|
|
|
}
|
|
|
|
sdsfree(link->sndbuf);
|
|
|
|
sdsfree(link->rcvbuf);
|
|
|
|
if (link->node)
|
|
|
|
link->node->link = NULL;
|
|
|
|
close(link->fd);
|
|
|
|
zfree(link);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Event loop handler called when a listening cluster socket is readable:
 * accept the new connection and start reading from it.
 *
 * 'fd' is the listening socket. 'el', 'privdata' and 'mask' are unused.
 *
 * The accepted socket is wrapped into a link with no associated node, and
 * clusterReadHandler() is registered for it. If the connection can't be
 * accepted, or the read handler can't be registered, the error is logged
 * at verbose level and nothing is left allocated. */
void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
    int cport, cfd;
    char cip[REDIS_IP_STR_LEN];
    clusterLink *link;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);
    REDIS_NOTUSED(privdata);

    cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport);
    if (cfd == AE_ERR) {
        redisLog(REDIS_VERBOSE,"Accepting cluster node: %s", server.neterr);
        return;
    }

    /* Use non-blocking I/O for cluster messages. */
    anetNonBlock(NULL,cfd);
    anetEnableTcpNoDelay(NULL,cfd);

    /* IPV6: might want to wrap a v6 address in [] */
    redisLog(REDIS_VERBOSE,"Accepted cluster node %s:%d", cip, cport);
    /* We need to create a temporary node in order to read the incoming
     * packet in a valid context. This node will be released once we
     * read the packet and reply. */
    link = createClusterLink(NULL);
    link->fd = cfd;
    if (aeCreateFileEvent(server.el,cfd,AE_READABLE,clusterReadHandler,link)
        == AE_ERR)
    {
        /* Without a read handler nobody would ever use or release this
         * link: free it now, which also closes the accepted socket. */
        redisLog(REDIS_VERBOSE,
            "Error registering read handler for cluster node %s:%d",
            cip, cport);
        freeClusterLink(link);
    }
}
|
|
|
|
|
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* Key space handling
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
2013-02-14 12:49:16 +01:00
|
|
|
/* Map a key to its hash slot. There are 16384 hash slots: the slot of a
 * key is given by the 14 least significant bits of the crc16 of the
 * 'keylen' bytes pointed to by 'key'. */
unsigned int keyHashSlot(char *key, int keylen) {
    unsigned int checksum = crc16(key,keylen);

    return checksum & 0x3FFF;
}
|
|
|
|
|
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* CLUSTER node API
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
|
|
|
/* Create a new cluster node, with the specified flags.
|
|
|
|
* If "nodename" is NULL this is considered a first handshake and a random
|
|
|
|
* node name is assigned to this node (it will be fixed later when we'll
|
|
|
|
* receive the first pong).
|
|
|
|
*
|
|
|
|
* The node is created and returned to the user, but it is not automatically
|
|
|
|
* added to the nodes hash table. */
|
|
|
|
clusterNode *createClusterNode(char *nodename, int flags) {
|
|
|
|
clusterNode *node = zmalloc(sizeof(*node));
|
|
|
|
|
|
|
|
if (nodename)
|
|
|
|
memcpy(node->name, nodename, REDIS_CLUSTER_NAMELEN);
|
|
|
|
else
|
2012-03-08 10:08:44 +01:00
|
|
|
getRandomHexChars(node->name, REDIS_CLUSTER_NAMELEN);
|
2013-10-09 16:18:33 +02:00
|
|
|
node->ctime = mstime();
|
2013-09-25 11:47:13 +02:00
|
|
|
node->configEpoch = 0;
|
2011-03-29 17:51:15 +02:00
|
|
|
node->flags = flags;
|
|
|
|
memset(node->slots,0,sizeof(node->slots));
|
2013-02-28 15:11:05 +01:00
|
|
|
node->numslots = 0;
|
2011-03-29 17:51:15 +02:00
|
|
|
node->numslaves = 0;
|
|
|
|
node->slaves = NULL;
|
|
|
|
node->slaveof = NULL;
|
|
|
|
node->ping_sent = node->pong_received = 0;
|
2013-03-05 13:15:05 +01:00
|
|
|
node->fail_time = 0;
|
2011-03-29 17:51:15 +02:00
|
|
|
node->link = NULL;
|
2013-02-15 12:58:35 +01:00
|
|
|
memset(node->ip,0,sizeof(node->ip));
|
2013-02-14 13:01:28 +01:00
|
|
|
node->port = 0;
|
2013-02-22 17:43:35 +01:00
|
|
|
node->fail_reports = listCreate();
|
2013-09-26 13:00:41 +02:00
|
|
|
node->voted_time = 0;
|
2013-02-22 17:43:35 +01:00
|
|
|
listSetFreeMethod(node->fail_reports,zfree);
|
2011-03-29 17:51:15 +02:00
|
|
|
return node;
|
|
|
|
}
|
|
|
|
|
2013-02-22 17:43:35 +01:00
|
|
|
/* This function is called every time we get a failure report from a node.
|
|
|
|
* The side effect is to populate the fail_reports list (or to update
|
|
|
|
* the timestamp of an existing report).
|
|
|
|
*
|
|
|
|
* 'failing' is the node that is in failure state according to the
|
2013-02-26 14:58:39 +01:00
|
|
|
* 'sender' node.
|
|
|
|
*
|
|
|
|
* The function returns 0 if it just updates a timestamp of an existing
|
|
|
|
* failure report from the same sender. 1 is returned if a new failure
|
|
|
|
* report is created. */
|
|
|
|
int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) {
|
2013-02-22 17:43:35 +01:00
|
|
|
list *l = failing->fail_reports;
|
|
|
|
listNode *ln;
|
|
|
|
listIter li;
|
|
|
|
clusterNodeFailReport *fr;
|
|
|
|
|
|
|
|
/* If a failure report from the same sender already exists, just update
|
|
|
|
* the timestamp. */
|
|
|
|
listRewind(l,&li);
|
|
|
|
while ((ln = listNext(&li)) != NULL) {
|
|
|
|
fr = ln->value;
|
|
|
|
if (fr->node == sender) {
|
2013-10-09 16:18:33 +02:00
|
|
|
fr->time = mstime();
|
2013-02-26 14:58:39 +01:00
|
|
|
return 0;
|
2013-02-22 17:43:35 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Otherwise create a new report. */
|
|
|
|
fr = zmalloc(sizeof(*fr));
|
|
|
|
fr->node = sender;
|
2013-10-09 16:18:33 +02:00
|
|
|
fr->time = mstime();
|
2013-02-22 17:43:35 +01:00
|
|
|
listAddNodeTail(l,fr);
|
2013-02-26 14:58:39 +01:00
|
|
|
return 1;
|
2013-02-22 17:43:35 +01:00
|
|
|
}
|
|
|
|
|
2013-02-26 11:19:48 +01:00
|
|
|
/* Remove failure reports that are too old, where too old means reasonably
|
|
|
|
* older than the global node timeout. Note that anyway for a node to be
|
|
|
|
* flagged as FAIL we need to have a local PFAIL state that is at least
|
|
|
|
* older than the global node timeout, so we don't just trust the number
|
|
|
|
* of failure reports from other nodes. */
|
|
|
|
void clusterNodeCleanupFailureReports(clusterNode *node) {
|
|
|
|
list *l = node->fail_reports;
|
|
|
|
listNode *ln;
|
|
|
|
listIter li;
|
|
|
|
clusterNodeFailReport *fr;
|
2013-10-09 16:18:33 +02:00
|
|
|
mstime_t maxtime = server.cluster_node_timeout *
|
2013-04-04 12:02:48 +02:00
|
|
|
REDIS_CLUSTER_FAIL_REPORT_VALIDITY_MULT;
|
2013-10-09 16:18:33 +02:00
|
|
|
mstime_t now = mstime();
|
2013-02-26 11:19:48 +01:00
|
|
|
|
|
|
|
listRewind(l,&li);
|
|
|
|
while ((ln = listNext(&li)) != NULL) {
|
|
|
|
fr = ln->value;
|
|
|
|
if (now - fr->time > maxtime) listDelNode(l,ln);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-02-25 19:13:22 +01:00
|
|
|
/* Remove the failing report for 'node' if it was previously considered
|
|
|
|
* failing by 'sender'. This function is called when a node informs us via
|
|
|
|
* gossip that a node is OK from its point of view (no FAIL or PFAIL flags).
|
|
|
|
*
|
|
|
|
* Note that this function is called relatively often as it gets called even
|
|
|
|
* when there are no nodes failing, and is O(N), however when the cluster is
|
|
|
|
* fine the failure reports list is empty so the function runs in constant
|
2013-02-26 14:58:39 +01:00
|
|
|
* time.
|
|
|
|
*
|
|
|
|
* The function returns 1 if the failure report was found and removed.
|
|
|
|
* Otherwise 0 is returned. */
|
|
|
|
int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) {
|
2013-02-25 19:13:22 +01:00
|
|
|
list *l = node->fail_reports;
|
|
|
|
listNode *ln;
|
|
|
|
listIter li;
|
|
|
|
clusterNodeFailReport *fr;
|
|
|
|
|
|
|
|
/* Search for a failure report from this sender. */
|
|
|
|
listRewind(l,&li);
|
|
|
|
while ((ln = listNext(&li)) != NULL) {
|
|
|
|
fr = ln->value;
|
|
|
|
if (fr->node == sender) break;
|
|
|
|
}
|
2013-02-26 14:58:39 +01:00
|
|
|
if (!ln) return 0; /* No failure report from this sender. */
|
2013-02-25 19:13:22 +01:00
|
|
|
|
|
|
|
/* Remove the failure report. */
|
|
|
|
listDelNode(l,ln);
|
2013-02-26 11:08:03 +01:00
|
|
|
clusterNodeCleanupFailureReports(node);
|
2013-02-26 14:58:39 +01:00
|
|
|
return 1;
|
2013-02-25 19:13:22 +01:00
|
|
|
}
|
|
|
|
|
2013-02-22 17:43:35 +01:00
|
|
|
/* Return the number of external nodes that believe 'node' is failing,
|
|
|
|
* not including this node, that may have a PFAIL or FAIL state for this
|
|
|
|
* node as well. */
|
|
|
|
int clusterNodeFailureReportsCount(clusterNode *node) {
|
|
|
|
clusterNodeCleanupFailureReports(node);
|
|
|
|
return listLength(node->fail_reports);
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* Remove 'slave' from the array of slaves of 'master'.
 *
 * The slaves that follow the removed entry are shifted back by one
 * position so that the array stays contiguous. The array itself is not
 * shrunk, only the slaves count is decremented.
 *
 * Returns REDIS_OK if the slave was found and removed, REDIS_ERR if
 * 'slave' is not in the list of slaves of 'master'. */
int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) {
    int j;

    for (j = 0; j < master->numslaves; j++) {
        if (master->slaves[j] == slave) {
            /* Number of entries after the removed one. memmove() takes a
             * size in bytes, so the count must be scaled by the size of
             * an array element. */
            int remaining = (master->numslaves-1)-j;

            if (remaining > 0) {
                memmove(master->slaves+j,master->slaves+(j+1),
                        sizeof(*master->slaves)*remaining);
            }
            master->numslaves--;
            return REDIS_OK;
        }
    }
    return REDIS_ERR;
}
|
|
|
|
|
|
|
|
/* Append 'slave' to the array of slaves of 'master', growing the array
 * by one element.
 *
 * Returns REDIS_OK if the slave was added, REDIS_ERR if it was already
 * in the list (in which case nothing is modified). */
int clusterNodeAddSlave(clusterNode *master, clusterNode *slave) {
    int count = master->numslaves;
    int i;

    /* Nothing to do if it is already one of our slaves. */
    for (i = 0; i < count; i++) {
        if (master->slaves[i] == slave) return REDIS_ERR;
    }

    master->slaves = zrealloc(master->slaves,sizeof(clusterNode*)*(count+1));
    master->slaves[count] = slave;
    master->numslaves = count+1;
    return REDIS_OK;
}
|
|
|
|
|
|
|
|
/* Forget all the slaves of node 'n': the slaves array is released and the
 * node is left with a NULL array and a slaves count of zero. The slave
 * nodes themselves are not touched. */
void clusterNodeResetSlaves(clusterNode *n) {
    zfree(n->slaves);
    n->slaves = NULL;
    n->numslaves = 0;
}
|
|
|
|
|
|
|
|
/* Release node 'n' and unlink it from the cluster data structures:
 *
 * 1) The node is removed from the nodes hash table. It must be there,
 *    otherwise an assertion fails.
 * 2) If the node has a master, it is removed from the master's slaves.
 * 3) The slaves still pointing to this node as their master get their
 *    'slaveof' field set to NULL, so that no pointer to the freed node
 *    is left around.
 * 4) The link (if any), the failure reports list and the slaves array
 *    are released together with the node itself. */
void freeClusterNode(clusterNode *n) {
    sds nodename;
    int j;

    nodename = sdsnewlen(n->name, REDIS_CLUSTER_NAMELEN);
    redisAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK);
    sdsfree(nodename);
    if (n->slaveof) clusterNodeRemoveSlave(n->slaveof, n);

    /* Our slaves no longer have a known master. */
    for (j = 0; j < n->numslaves; j++) {
        if (n->slaves[j]->slaveof == n) n->slaves[j]->slaveof = NULL;
    }

    if (n->link) freeClusterLink(n->link);
    listRelease(n->fail_reports);
    zfree(n->slaves);
    zfree(n);
}
|
|
|
|
|
|
|
|
/* Add a node to the nodes hash table */
|
|
|
|
int clusterAddNode(clusterNode *node) {
|
|
|
|
int retval;
|
|
|
|
|
2013-02-14 13:20:56 +01:00
|
|
|
retval = dictAdd(server.cluster->nodes,
|
2011-03-29 17:51:15 +02:00
|
|
|
sdsnewlen(node->name,REDIS_CLUSTER_NAMELEN), node);
|
|
|
|
return (retval == DICT_OK) ? REDIS_OK : REDIS_ERR;
|
|
|
|
}
|
|
|
|
|
2013-02-27 17:55:59 +01:00
|
|
|
/* Remove a node from the cluster:
|
|
|
|
* 1) Mark all the nodes handled by it as unassigned.
|
|
|
|
* 2) Remove all the failure reports sent by this node.
|
|
|
|
* 3) Free the node, that will in turn remove it from the hash table
|
|
|
|
* and from the list of slaves of its master, if it is a slave node.
|
|
|
|
*/
|
|
|
|
void clusterDelNode(clusterNode *delnode) {
|
|
|
|
int j;
|
|
|
|
dictIterator *di;
|
|
|
|
dictEntry *de;
|
|
|
|
|
|
|
|
/* 1) Mark slots as unassigned. */
|
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
|
|
|
if (server.cluster->importing_slots_from[j] == delnode)
|
|
|
|
server.cluster->importing_slots_from[j] = NULL;
|
|
|
|
if (server.cluster->migrating_slots_to[j] == delnode)
|
|
|
|
server.cluster->migrating_slots_to[j] = NULL;
|
|
|
|
if (server.cluster->slots[j] == delnode)
|
|
|
|
clusterDelSlot(j);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* 2) Remove failure reports. */
|
2013-09-04 10:07:50 +02:00
|
|
|
di = dictGetSafeIterator(server.cluster->nodes);
|
2013-02-27 17:55:59 +01:00
|
|
|
while((de = dictNext(di)) != NULL) {
|
|
|
|
clusterNode *node = dictGetVal(de);
|
|
|
|
|
|
|
|
if (node == delnode) continue;
|
|
|
|
clusterNodeDelFailureReport(node,delnode);
|
|
|
|
}
|
|
|
|
dictReleaseIterator(di);
|
|
|
|
|
|
|
|
/* 3) Free the node, unlinking it from the cluster. */
|
|
|
|
freeClusterNode(delnode);
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* Node lookup by name */
|
|
|
|
clusterNode *clusterLookupNode(char *name) {
|
|
|
|
sds s = sdsnewlen(name, REDIS_CLUSTER_NAMELEN);
|
|
|
|
struct dictEntry *de;
|
|
|
|
|
2013-02-14 13:20:56 +01:00
|
|
|
de = dictFind(server.cluster->nodes,s);
|
2011-03-29 17:51:15 +02:00
|
|
|
sdsfree(s);
|
|
|
|
if (de == NULL) return NULL;
|
2011-11-08 17:07:55 +01:00
|
|
|
return dictGetVal(de);
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* This is only used after the handshake. When we connect a given IP/PORT
|
|
|
|
* as a result of CLUSTER MEET we don't have the node name yet, so we
|
|
|
|
* pick a random one, and will fix it when we receive the PONG request using
|
|
|
|
* this function. */
|
|
|
|
void clusterRenameNode(clusterNode *node, char *newname) {
|
|
|
|
int retval;
|
|
|
|
sds s = sdsnewlen(node->name, REDIS_CLUSTER_NAMELEN);
|
|
|
|
|
|
|
|
redisLog(REDIS_DEBUG,"Renaming node %.40s into %.40s",
|
|
|
|
node->name, newname);
|
2013-02-14 13:20:56 +01:00
|
|
|
retval = dictDelete(server.cluster->nodes, s);
|
2011-03-29 17:51:15 +02:00
|
|
|
sdsfree(s);
|
|
|
|
redisAssert(retval == DICT_OK);
|
|
|
|
memcpy(node->name, newname, REDIS_CLUSTER_NAMELEN);
|
|
|
|
clusterAddNode(node);
|
|
|
|
}
|
|
|
|
|
2013-12-02 11:12:23 +01:00
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* CLUSTER nodes blacklist
|
|
|
|
*
|
|
|
|
* The nodes blacklist is just a way to ensure that a given node with a given
|
|
|
|
* Node ID is not readded before some time elapsed (this time is specified
|
|
|
|
* in seconds in REDIS_CLUSTER_BLACKLIST_TTL).
|
|
|
|
*
|
|
|
|
* This is useful when we want to remove a node from the cluster completely:
|
|
|
|
* when CLUSTER FORGET is called, it also puts the node into the blacklist so
|
|
|
|
* that even if we receive gossip messages from other nodes that still remember
|
|
|
|
* about the node we want to remove, we don't re-add it before some time.
|
|
|
|
*
|
|
|
|
* Currently the REDIS_CLUSTER_BLACKLIST_TTL is set to 1 minute, this means
|
|
|
|
* that redis-trib has 60 seconds to send CLUSTER FORGET messages to nodes
|
|
|
|
 * in the cluster without dealing with the problem of other nodes re-adding
|
|
|
|
* back the node to nodes we already sent the FORGET command to.
|
|
|
|
*
|
2013-12-05 16:35:32 +01:00
|
|
|
* The data structure used is a hash table with an sds string representing
|
2013-12-02 11:12:23 +01:00
|
|
|
* the node ID as key, and the time when it is ok to re-add the node as
|
|
|
|
* value.
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
|
|
|
#define REDIS_CLUSTER_BLACKLIST_TTL 60 /* 1 minute. */
|
|
|
|
|
|
|
|
|
|
|
|
/* Before of the addNode() or Exists() operations we always remove expired
|
|
|
|
* entries from the black list. This is an O(N) operation but it is not a
|
|
|
|
* problem since add / exists operations are called very infrequently and
|
|
|
|
* the hash table is supposed to contain very little elements at max.
|
|
|
|
* However without the cleanup during long uptimes and with some automated
|
|
|
|
* node add/removal procedures, entries could accumulate. */
|
|
|
|
void clusterBlacklistCleanup(void) {
|
|
|
|
dictIterator *di;
|
|
|
|
dictEntry *de;
|
|
|
|
|
|
|
|
di = dictGetSafeIterator(server.cluster->nodes_black_list);
|
|
|
|
while((de = dictNext(di)) != NULL) {
|
|
|
|
int64_t expire = dictGetUnsignedIntegerVal(de);
|
|
|
|
|
|
|
|
if (expire < server.unixtime)
|
|
|
|
dictDelete(server.cluster->nodes_black_list,dictGetKey(de));
|
|
|
|
}
|
|
|
|
dictReleaseIterator(di);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Cleanup the blacklist and add a new node ID to the black list. */
|
|
|
|
void clusterBlacklistAddNode(clusterNode *node) {
|
|
|
|
dictEntry *de;
|
|
|
|
sds id = sdsnewlen(node->name,REDIS_CLUSTER_NAMELEN);
|
|
|
|
|
|
|
|
clusterBlacklistCleanup();
|
|
|
|
if (dictAdd(server.cluster->nodes_black_list,id,NULL) == DICT_ERR)
|
|
|
|
sdsfree(id); /* Key was already there. */
|
|
|
|
de = dictFind(server.cluster->nodes_black_list,node->name);
|
|
|
|
dictSetUnsignedIntegerVal(de,time(NULL));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return non-zero if the specified node ID exists in the blacklist.
|
|
|
|
* You don't need to pass an sds string here, any pointer to 40 bytes
|
|
|
|
* will work. */
|
|
|
|
int clusterBlacklistExists(char *nodeid) {
|
|
|
|
sds id = sdsnewlen(nodeid,REDIS_CLUSTER_NAMELEN);
|
|
|
|
int retval;
|
|
|
|
|
|
|
|
retval = dictFind(server.cluster->nodes_black_list,id) != NULL;
|
|
|
|
sdsfree(id);
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* CLUSTER messages exchange - PING/PONG and gossip
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
2013-02-26 14:58:39 +01:00
|
|
|
/* This function checks if a given node should be marked as FAIL.
|
|
|
|
* It happens if the following conditions are met:
|
|
|
|
*
|
2013-09-20 11:26:44 +02:00
|
|
|
* 1) We received enough failure reports from other master nodes via gossip.
|
|
|
|
* Enough means that the majority of the masters signaled the node is
|
|
|
|
* down recently.
|
|
|
|
* 2) We believe this node is in PFAIL state.
|
2013-02-26 14:58:39 +01:00
|
|
|
*
|
|
|
|
* If a failure is detected we also inform the whole cluster about this
|
|
|
|
* event trying to force every other node to set the FAIL flag for the node.
|
2013-09-20 11:26:44 +02:00
|
|
|
*
|
|
|
|
* Note that the form of agreement used here is weak, as we collect the majority
|
|
|
|
* of masters state during some time, and even if we force agreement by
|
|
|
|
* propagating the FAIL message, because of partitions we may not reach every
|
|
|
|
* node. However:
|
|
|
|
*
|
|
|
|
* 1) Either we reach the majority and eventually the FAIL state will propagate
|
|
|
|
* to all the cluster.
|
|
|
|
* 2) Or there is no majority so no slave promotion will be authorized and the
|
|
|
|
* FAIL flag will be cleared after some time.
|
2013-02-26 14:58:39 +01:00
|
|
|
*/
|
|
|
|
void markNodeAsFailingIfNeeded(clusterNode *node) {
|
|
|
|
int failures;
|
|
|
|
int needed_quorum = (server.cluster->size / 2) + 1;
|
|
|
|
|
|
|
|
if (!(node->flags & REDIS_NODE_PFAIL)) return; /* We can reach it. */
|
|
|
|
if (node->flags & REDIS_NODE_FAIL) return; /* Already FAILing. */
|
|
|
|
|
2013-09-20 11:26:44 +02:00
|
|
|
failures = clusterNodeFailureReportsCount(node);
|
|
|
|
/* Also count myself as a voter if I'm a master. */
|
|
|
|
if (server.cluster->myself->flags & REDIS_NODE_MASTER)
|
|
|
|
failures += 1;
|
|
|
|
if (failures < needed_quorum) return; /* No weak agreement from masters. */
|
2013-02-26 14:58:39 +01:00
|
|
|
|
|
|
|
redisLog(REDIS_NOTICE,
|
|
|
|
"Marking node %.40s as failing (quorum reached).", node->name);
|
|
|
|
|
|
|
|
/* Mark the node as failing. */
|
|
|
|
node->flags &= ~REDIS_NODE_PFAIL;
|
|
|
|
node->flags |= REDIS_NODE_FAIL;
|
2013-10-09 16:18:33 +02:00
|
|
|
node->fail_time = mstime();
|
2013-02-26 14:58:39 +01:00
|
|
|
|
2013-09-20 11:26:44 +02:00
|
|
|
/* Broadcast the failing node name to everybody, forcing all the other
|
|
|
|
* reachable nodes to flag the node as FAIL. */
|
|
|
|
if (server.cluster->myself->flags & REDIS_NODE_MASTER)
|
|
|
|
clusterSendFail(node->name);
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
|
2013-02-26 14:58:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/* This function is called only if a node is marked as FAIL, but we are able
|
|
|
|
* to reach it again. It checks if there are the conditions to undo the FAIL
|
2013-03-21 10:47:10 +01:00
|
|
|
* state. */
|
2013-02-26 14:58:39 +01:00
|
|
|
void clearNodeFailureIfNeeded(clusterNode *node) {
|
2013-12-17 09:45:42 +01:00
|
|
|
mstime_t now = mstime();
|
2013-03-05 15:05:32 +01:00
|
|
|
|
|
|
|
redisAssert(node->flags & REDIS_NODE_FAIL);
|
|
|
|
|
|
|
|
/* For slaves we always clear the FAIL flag if we can contact the
|
|
|
|
* node again. */
|
|
|
|
if (node->flags & REDIS_NODE_SLAVE) {
|
|
|
|
redisLog(REDIS_NOTICE,
|
2013-10-07 15:44:58 +02:00
|
|
|
"Clear FAIL state for node %.40s: slave is reachable again.",
|
2013-03-05 15:05:32 +01:00
|
|
|
node->name);
|
|
|
|
node->flags &= ~REDIS_NODE_FAIL;
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
|
2013-03-05 15:05:32 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/* If it is a master and...
|
2013-10-09 16:18:33 +02:00
|
|
|
* 1) The FAIL state is old enough.
|
2013-03-05 15:05:32 +01:00
|
|
|
* 2) It is yet serving slots from our point of view (not failed over).
|
|
|
|
* Apparently no one is going to fix these slots, clear the FAIL flag. */
|
|
|
|
if (node->flags & REDIS_NODE_MASTER &&
|
|
|
|
node->numslots > 0 &&
|
2013-04-04 12:02:48 +02:00
|
|
|
(now - node->fail_time) >
|
2013-10-09 16:18:33 +02:00
|
|
|
(server.cluster_node_timeout * REDIS_CLUSTER_FAIL_UNDO_TIME_MULT))
|
2013-03-05 15:05:32 +01:00
|
|
|
{
|
2013-02-26 15:03:27 +01:00
|
|
|
redisLog(REDIS_NOTICE,
|
2013-03-05 15:05:32 +01:00
|
|
|
"Clear FAIL state for node %.40s: is reachable again and nobody is serving its slots after some time.",
|
2013-02-26 15:03:27 +01:00
|
|
|
node->name);
|
2013-02-26 15:15:44 +01:00
|
|
|
node->flags &= ~REDIS_NODE_FAIL;
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
|
2013-02-26 14:58:39 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-09-04 15:52:16 +02:00
|
|
|
/* Return true if we already have a node in HANDSHAKE state matching the
|
|
|
|
* specified ip address and port number. This function is used in order to
|
|
|
|
* avoid adding a new handshake node for the same address multiple times. */
|
|
|
|
int clusterHandshakeInProgress(char *ip, int port) {
|
|
|
|
dictIterator *di;
|
|
|
|
dictEntry *de;
|
|
|
|
|
|
|
|
di = dictGetSafeIterator(server.cluster->nodes);
|
|
|
|
while((de = dictNext(di)) != NULL) {
|
|
|
|
clusterNode *node = dictGetVal(de);
|
|
|
|
|
|
|
|
if (!(node->flags & REDIS_NODE_HANDSHAKE)) continue;
|
|
|
|
if (!strcasecmp(node->ip,ip) && node->port == port) break;
|
|
|
|
}
|
|
|
|
dictReleaseIterator(di);
|
|
|
|
return de != NULL;
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* Process the gossip section of PING or PONG packets.
|
|
|
|
* Note that this function assumes that the packet is already sanity-checked
|
|
|
|
* by the caller, not in the content of the gossip section, but in the
|
|
|
|
* length. */
|
|
|
|
void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
|
|
|
|
uint16_t count = ntohs(hdr->count);
|
|
|
|
clusterMsgDataGossip *g = (clusterMsgDataGossip*) hdr->data.ping.gossip;
|
|
|
|
clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender);
|
|
|
|
|
|
|
|
while(count--) {
|
|
|
|
sds ci = sdsempty();
|
|
|
|
uint16_t flags = ntohs(g->flags);
|
|
|
|
clusterNode *node;
|
|
|
|
|
|
|
|
if (flags == 0) ci = sdscat(ci,"noflags,");
|
|
|
|
if (flags & REDIS_NODE_MYSELF) ci = sdscat(ci,"myself,");
|
|
|
|
if (flags & REDIS_NODE_MASTER) ci = sdscat(ci,"master,");
|
|
|
|
if (flags & REDIS_NODE_SLAVE) ci = sdscat(ci,"slave,");
|
|
|
|
if (flags & REDIS_NODE_PFAIL) ci = sdscat(ci,"fail?,");
|
|
|
|
if (flags & REDIS_NODE_FAIL) ci = sdscat(ci,"fail,");
|
|
|
|
if (flags & REDIS_NODE_HANDSHAKE) ci = sdscat(ci,"handshake,");
|
|
|
|
if (flags & REDIS_NODE_NOADDR) ci = sdscat(ci,"noaddr,");
|
|
|
|
if (ci[sdslen(ci)-1] == ',') ci[sdslen(ci)-1] = ' ';
|
|
|
|
|
|
|
|
redisLog(REDIS_DEBUG,"GOSSIP %.40s %s:%d %s",
|
|
|
|
g->nodename,
|
|
|
|
g->ip,
|
|
|
|
ntohs(g->port),
|
|
|
|
ci);
|
|
|
|
sdsfree(ci);
|
|
|
|
|
|
|
|
/* Update our state accordingly to the gossip sections */
|
|
|
|
node = clusterLookupNode(g->nodename);
|
|
|
|
if (node != NULL) {
|
2013-08-26 16:16:25 +02:00
|
|
|
/* We already know this node.
|
|
|
|
Handle failure reports, only when the sender is a master. */
|
2013-02-26 15:15:44 +01:00
|
|
|
if (sender && sender->flags & REDIS_NODE_MASTER &&
|
|
|
|
node != server.cluster->myself)
|
|
|
|
{
|
2013-02-26 14:58:39 +01:00
|
|
|
if (flags & (REDIS_NODE_FAIL|REDIS_NODE_PFAIL)) {
|
|
|
|
if (clusterNodeAddFailureReport(node,sender)) {
|
|
|
|
redisLog(REDIS_NOTICE,
|
|
|
|
"Node %.40s reported node %.40s as not reachable.",
|
|
|
|
sender->name, node->name);
|
|
|
|
}
|
|
|
|
markNodeAsFailingIfNeeded(node);
|
|
|
|
} else {
|
|
|
|
if (clusterNodeDelFailureReport(node,sender)) {
|
|
|
|
redisLog(REDIS_NOTICE,
|
|
|
|
"Node %.40s reported node %.40s is back online.",
|
|
|
|
sender->name, node->name);
|
|
|
|
}
|
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* If it's not in NOADDR state and we don't have it, we
|
2013-12-05 16:35:32 +01:00
|
|
|
* start a handshake process against this IP/PORT pairs.
|
2011-03-29 17:51:15 +02:00
|
|
|
*
|
|
|
|
* Note that we require that the sender of this gossip message
|
|
|
|
* is a well known node in our cluster, otherwise we risk
|
|
|
|
* joining another cluster. */
|
2013-09-04 15:52:16 +02:00
|
|
|
if (sender && !(flags & REDIS_NODE_NOADDR) &&
|
|
|
|
!clusterHandshakeInProgress(g->ip,ntohs(g->port)))
|
|
|
|
{
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterNode *newnode;
|
|
|
|
|
|
|
|
redisLog(REDIS_DEBUG,"Adding the new node");
|
|
|
|
newnode = createClusterNode(NULL,REDIS_NODE_HANDSHAKE);
|
|
|
|
memcpy(newnode->ip,g->ip,sizeof(g->ip));
|
|
|
|
newnode->port = ntohs(g->port);
|
|
|
|
clusterAddNode(newnode);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Next node */
|
|
|
|
g++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-06-18 19:19:10 +01:00
|
|
|
/* IP -> string conversion. 'buf' is supposed to at least be 46 bytes. */
|
2011-03-29 17:51:15 +02:00
|
|
|
void nodeIp2String(char *buf, clusterLink *link) {
|
2011-06-18 19:19:10 +01:00
|
|
|
struct sockaddr_storage sa;
|
2011-03-29 17:51:15 +02:00
|
|
|
socklen_t salen = sizeof(sa);
|
|
|
|
|
|
|
|
if (getpeername(link->fd, (struct sockaddr*) &sa, &salen) == -1)
|
|
|
|
redisPanic("getpeername() failed.");
|
2011-06-18 19:19:10 +01:00
|
|
|
|
|
|
|
if (sa.ss_family == AF_INET) {
|
|
|
|
struct sockaddr_in *s = (struct sockaddr_in *)&sa;
|
|
|
|
inet_ntop(AF_INET,(void*)&(s->sin_addr),buf,REDIS_CLUSTER_IPLEN);
|
|
|
|
} else {
|
|
|
|
struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa;
|
|
|
|
inet_ntop(AF_INET6,(void*)&(s->sin6_addr),buf,REDIS_CLUSTER_IPLEN);
|
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Update the node address to the IP address that can be extracted
|
2013-06-12 10:50:07 -07:00
|
|
|
* from link->fd, and at the specified port.
|
|
|
|
* Also disconnect the node link so that we'll connect again to the new
|
|
|
|
* address.
|
|
|
|
*
|
|
|
|
* If the ip/port pair are already correct no operation is performed at
|
|
|
|
* all.
|
|
|
|
*
|
|
|
|
* The function returns 0 if the node address is still the same,
|
|
|
|
* otherwise 1 is returned. */
|
|
|
|
int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, int port) {
|
|
|
|
char ip[REDIS_IP_STR_LEN];
|
|
|
|
|
|
|
|
/* We don't proceed if the link is the same as the sender link, as this
|
|
|
|
* function is designed to see if the node link is consistent with the
|
|
|
|
* symmetric link that is used to receive PINGs from the node.
|
|
|
|
*
|
|
|
|
* As a side effect this function never frees the passed 'link', so
|
|
|
|
* it is safe to call during packet processing. */
|
|
|
|
if (link == node->link) return 0;
|
|
|
|
|
|
|
|
nodeIp2String(ip,link);
|
|
|
|
if (node->port == port && strcmp(ip,node->ip) == 0) return 0;
|
|
|
|
|
|
|
|
/* IP / port is different, update it. */
|
|
|
|
memcpy(node->ip,ip,sizeof(ip));
|
|
|
|
node->port = port;
|
|
|
|
if (node->link) freeClusterLink(node->link);
|
|
|
|
redisLog(REDIS_WARNING,"Address updated for node %.40s, now %s:%d",
|
|
|
|
node->name, node->ip, node->port);
|
|
|
|
return 1;
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
2013-11-08 17:02:10 +01:00
|
|
|
/* Reconfigure the specified node 'n' as a master. This function is called when
|
|
|
|
* a node that we believed to be a slave is now acting as master in order to
|
|
|
|
* update the state of the node. */
|
|
|
|
void clusterSetNodeAsMaster(clusterNode *n) {
|
|
|
|
if (n->flags & REDIS_NODE_MASTER) return;
|
|
|
|
|
|
|
|
if (n->slaveof) clusterNodeRemoveSlave(n->slaveof,n);
|
|
|
|
n->flags &= ~REDIS_NODE_SLAVE;
|
|
|
|
n->flags |= REDIS_NODE_MASTER;
|
|
|
|
n->slaveof = NULL;
|
|
|
|
|
|
|
|
/* Update config and state. */
|
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
|
|
|
|
CLUSTER_TODO_UPDATE_STATE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This function is called when we receive a master configuration via a
|
|
|
|
* PING, PONG or UPDATE packet. What we receive is a node, a configEpoch of the
|
|
|
|
* node, and the set of slots claimed under this configEpoch.
|
|
|
|
*
|
|
|
|
* What we do is to rebind the slots with newer configuration compared to our
|
|
|
|
* local configuration, and if needed, we turn ourself into a replica of the
|
|
|
|
* node (see the function comments for more info).
|
|
|
|
*
|
|
|
|
* The 'sender' is the node for which we received a configuration update.
|
|
|
|
* Sometimes it is not actaully the "Sender" of the information, like in the case
|
|
|
|
* we receive the info via an UPDATE packet. */
|
|
|
|
void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch,
|
|
|
|
unsigned char *slots)
|
|
|
|
{
|
|
|
|
int j;
|
|
|
|
clusterNode *curmaster, *newmaster = NULL;
|
|
|
|
|
|
|
|
/* Here we set curmaster to this node or the node this node
|
|
|
|
* replicates to if it's a slave. In the for loop we are
|
|
|
|
* interested to check if slots are taken away from curmaster. */
|
|
|
|
if (server.cluster->myself->flags & REDIS_NODE_MASTER)
|
|
|
|
curmaster = server.cluster->myself;
|
|
|
|
else
|
|
|
|
curmaster = server.cluster->myself->slaveof;
|
|
|
|
|
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
|
|
|
if (bitmapTestBit(slots,j)) {
|
|
|
|
/* We rebind the slot to the new node claiming it if:
|
|
|
|
* 1) The slot was unassigned.
|
|
|
|
* 2) The new node claims it with a greater configEpoch. */
|
|
|
|
if (server.cluster->slots[j] == sender) continue;
|
|
|
|
if (server.cluster->slots[j] == NULL ||
|
|
|
|
server.cluster->slots[j]->configEpoch <
|
|
|
|
senderConfigEpoch)
|
|
|
|
{
|
|
|
|
if (server.cluster->slots[j] == curmaster)
|
|
|
|
newmaster = sender;
|
|
|
|
clusterDelSlot(j);
|
|
|
|
clusterAddSlot(sender,j);
|
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
|
|
|
|
CLUSTER_TODO_UPDATE_STATE|
|
|
|
|
CLUSTER_TODO_FSYNC_CONFIG);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If at least one slot was reassigned from a node to another node
|
|
|
|
* with a greater configEpoch, it is possible that:
|
|
|
|
* 1) We are a master left without slots. This means that we were
|
|
|
|
* failed over and we should turn into a replica of the new
|
|
|
|
* master.
|
|
|
|
* 2) We are a slave and our master is left without slots. We need
|
|
|
|
* to replicate to the new slots owner. */
|
|
|
|
if (newmaster && curmaster->numslots == 0) {
|
|
|
|
redisLog(REDIS_WARNING,"Configuration change detected. Reconfiguring myself as a replica of %.40s", sender->name);
|
|
|
|
clusterSetMaster(sender);
|
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
|
|
|
|
CLUSTER_TODO_UPDATE_STATE|
|
|
|
|
CLUSTER_TODO_FSYNC_CONFIG);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* When this function is called, there is a packet to process starting
|
|
|
|
* at node->rcvbuf. Releasing the buffer is up to the caller, so this
|
|
|
|
* function should just handle the higher level stuff of processing the
|
|
|
|
* packet, modifying the cluster state if needed.
|
|
|
|
*
|
|
|
|
* The function returns 1 if the link is still valid after the packet
|
|
|
|
* was processed, otherwise 0 if the link was freed since the packet
|
|
|
|
* processing led to some inconsistency error (for instance a PONG
|
|
|
|
* received from the wrong sender ID). */
|
|
|
|
int clusterProcessPacket(clusterLink *link) {
|
|
|
|
clusterMsg *hdr = (clusterMsg*) link->rcvbuf;
|
|
|
|
uint32_t totlen = ntohl(hdr->totlen);
|
|
|
|
uint16_t type = ntohs(hdr->type);
|
2013-03-20 10:48:42 +01:00
|
|
|
uint16_t flags = ntohs(hdr->flags);
|
2013-11-05 12:01:07 +01:00
|
|
|
uint64_t senderCurrentEpoch = 0, senderConfigEpoch = 0;
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterNode *sender;
|
|
|
|
|
2013-10-02 10:10:08 +02:00
|
|
|
server.cluster->stats_bus_messages_received++;
|
2011-10-05 16:02:45 +02:00
|
|
|
redisLog(REDIS_DEBUG,"--- Processing packet of type %d, %lu bytes",
|
|
|
|
type, (unsigned long) totlen);
|
2011-10-07 16:34:16 +02:00
|
|
|
|
|
|
|
/* Perform sanity checks */
|
2011-03-29 17:51:15 +02:00
|
|
|
if (totlen < 8) return 1;
|
|
|
|
if (totlen > sdslen(link->rcvbuf)) return 1;
|
|
|
|
if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG ||
|
|
|
|
type == CLUSTERMSG_TYPE_MEET)
|
|
|
|
{
|
|
|
|
uint16_t count = ntohs(hdr->count);
|
|
|
|
uint32_t explen; /* expected length of this packet */
|
|
|
|
|
|
|
|
explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
|
|
|
|
explen += (sizeof(clusterMsgDataGossip)*count);
|
|
|
|
if (totlen != explen) return 1;
|
2013-03-13 17:27:06 +01:00
|
|
|
} else if (type == CLUSTERMSG_TYPE_FAIL) {
|
2011-03-29 17:51:15 +02:00
|
|
|
uint32_t explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
|
|
|
|
|
|
|
|
explen += sizeof(clusterMsgDataFail);
|
|
|
|
if (totlen != explen) return 1;
|
2013-03-13 17:27:06 +01:00
|
|
|
} else if (type == CLUSTERMSG_TYPE_PUBLISH) {
|
2011-10-07 16:34:16 +02:00
|
|
|
uint32_t explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
|
|
|
|
|
|
|
|
explen += sizeof(clusterMsgDataPublish) +
|
|
|
|
ntohl(hdr->data.publish.msg.channel_len) +
|
|
|
|
ntohl(hdr->data.publish.msg.message_len);
|
|
|
|
if (totlen != explen) return 1;
|
2013-03-14 16:42:56 +01:00
|
|
|
} else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST ||
|
|
|
|
type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) {
|
2013-03-13 17:31:19 +01:00
|
|
|
uint32_t explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
|
|
|
|
|
2013-11-08 17:02:10 +01:00
|
|
|
if (totlen != explen) return 1;
|
|
|
|
} else if (type == CLUSTERMSG_TYPE_UPDATE) {
|
|
|
|
uint32_t explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
|
|
|
|
|
|
|
|
explen += sizeof(clusterMsgDataUpdate);
|
2013-03-13 17:31:19 +01:00
|
|
|
if (totlen != explen) return 1;
|
2011-10-07 16:34:16 +02:00
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2013-09-27 09:55:41 +02:00
|
|
|
/* Check if the sender is a known node. */
|
2011-03-29 17:51:15 +02:00
|
|
|
sender = clusterLookupNode(hdr->sender);
|
2013-09-25 12:36:29 +02:00
|
|
|
if (sender && !(sender->flags & REDIS_NODE_HANDSHAKE)) {
|
2013-09-27 09:55:41 +02:00
|
|
|
/* Update our curretEpoch if we see a newer epoch in the cluster. */
|
2013-09-25 12:36:29 +02:00
|
|
|
senderCurrentEpoch = ntohu64(hdr->currentEpoch);
|
|
|
|
senderConfigEpoch = ntohu64(hdr->configEpoch);
|
|
|
|
if (senderCurrentEpoch > server.cluster->currentEpoch)
|
|
|
|
server.cluster->currentEpoch = senderCurrentEpoch;
|
2013-09-27 09:55:41 +02:00
|
|
|
/* Update the sender configEpoch if it is publishing a newer one. */
|
2013-09-30 10:13:33 +02:00
|
|
|
if (senderConfigEpoch > sender->configEpoch) {
|
2013-09-27 09:55:41 +02:00
|
|
|
sender->configEpoch = senderConfigEpoch;
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG);
|
2013-09-30 10:13:33 +02:00
|
|
|
}
|
2013-09-25 12:36:29 +02:00
|
|
|
}
|
2013-04-11 18:19:48 +02:00
|
|
|
|
2013-09-25 12:36:29 +02:00
|
|
|
/* Process packets by type. */
|
2011-03-29 17:51:15 +02:00
|
|
|
if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) {
|
2013-02-27 12:27:15 +01:00
|
|
|
redisLog(REDIS_DEBUG,"Ping packet received: %p", (void*)link->node);
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
/* Add this node if it is new for us and the msg type is MEET.
|
|
|
|
* In this stage we don't try to add the node with the right
|
|
|
|
* flags, slaveof pointer, and so forth, as this details will be
|
2013-06-11 21:33:00 +02:00
|
|
|
* resolved when we'll receive PONGs from the node. */
|
2011-03-29 17:51:15 +02:00
|
|
|
if (!sender && type == CLUSTERMSG_TYPE_MEET) {
|
|
|
|
clusterNode *node;
|
|
|
|
|
|
|
|
node = createClusterNode(NULL,REDIS_NODE_HANDSHAKE);
|
|
|
|
nodeIp2String(node->ip,link);
|
|
|
|
node->port = ntohs(hdr->port);
|
|
|
|
clusterAddNode(node);
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Get info from the gossip section */
|
|
|
|
clusterProcessGossipSection(hdr,link);
|
|
|
|
|
|
|
|
/* Anyway reply with a PONG */
|
|
|
|
clusterSendPing(link,CLUSTERMSG_TYPE_PONG);
|
2013-04-11 18:19:48 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* PING or PONG: process config information. */
|
2013-08-22 11:53:28 +02:00
|
|
|
if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG ||
|
|
|
|
type == CLUSTERMSG_TYPE_MEET)
|
|
|
|
{
|
2013-04-11 18:19:48 +02:00
|
|
|
redisLog(REDIS_DEBUG,"%s packet received: %p",
|
|
|
|
type == CLUSTERMSG_TYPE_PING ? "ping" : "pong",
|
|
|
|
(void*)link->node);
|
2011-03-29 17:51:15 +02:00
|
|
|
if (link->node) {
|
|
|
|
if (link->node->flags & REDIS_NODE_HANDSHAKE) {
|
|
|
|
/* If we already have this node, try to change the
|
|
|
|
* IP/port of the node with the new one. */
|
|
|
|
if (sender) {
|
|
|
|
redisLog(REDIS_WARNING,
|
|
|
|
"Handshake error: we already know node %.40s, updating the address if needed.", sender->name);
|
2013-06-12 10:50:07 -07:00
|
|
|
if (nodeUpdateAddressIfNeeded(sender,link,ntohs(hdr->port)))
|
|
|
|
{
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
|
|
|
|
CLUSTER_TODO_UPDATE_STATE);
|
2013-06-12 10:50:07 -07:00
|
|
|
}
|
|
|
|
/* Free this node as we alrady have it. This will
|
|
|
|
* cause the link to be freed as well. */
|
|
|
|
freeClusterNode(link->node);
|
2011-03-29 17:51:15 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* First thing to do is replacing the random name with the
|
2013-12-05 16:35:32 +01:00
|
|
|
* right node name if this was a handshake stage. */
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterRenameNode(link->node, hdr->sender);
|
|
|
|
redisLog(REDIS_DEBUG,"Handshake with node %.40s completed.",
|
|
|
|
link->node->name);
|
|
|
|
link->node->flags &= ~REDIS_NODE_HANDSHAKE;
|
2013-03-25 13:03:01 +01:00
|
|
|
link->node->flags |= flags&(REDIS_NODE_MASTER|REDIS_NODE_SLAVE);
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
|
2011-03-29 17:51:15 +02:00
|
|
|
} else if (memcmp(link->node->name,hdr->sender,
|
|
|
|
REDIS_CLUSTER_NAMELEN) != 0)
|
|
|
|
{
|
|
|
|
/* If the reply has a non matching node ID we
|
|
|
|
* disconnect this node and set it as not having an associated
|
|
|
|
* address. */
|
|
|
|
redisLog(REDIS_DEBUG,"PONG contains mismatching sender ID");
|
|
|
|
link->node->flags |= REDIS_NODE_NOADDR;
|
2013-02-27 17:09:33 +01:00
|
|
|
link->node->ip[0] = '\0';
|
|
|
|
link->node->port = 0;
|
2011-03-29 17:51:15 +02:00
|
|
|
freeClusterLink(link);
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
|
2011-03-29 17:51:15 +02:00
|
|
|
/* FIXME: remove this node if we already have it.
|
|
|
|
*
|
|
|
|
* If we already have it but the IP is different, use
|
|
|
|
* the new one if the old node is in FAIL, PFAIL, or NOADDR
|
|
|
|
* status... */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
2013-03-13 18:38:08 +01:00
|
|
|
|
2013-06-12 10:50:07 -07:00
|
|
|
/* Update the node address if it changed. */
|
|
|
|
if (sender && type == CLUSTERMSG_TYPE_PING &&
|
|
|
|
!(sender->flags & REDIS_NODE_HANDSHAKE) &&
|
|
|
|
nodeUpdateAddressIfNeeded(sender,link,ntohs(hdr->port)))
|
|
|
|
{
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE);
|
2013-06-12 10:50:07 -07:00
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* Update our info about the node */
|
2013-04-11 18:55:58 +02:00
|
|
|
if (link->node && type == CLUSTERMSG_TYPE_PONG) {
|
2013-10-09 16:18:33 +02:00
|
|
|
link->node->pong_received = mstime();
|
2013-04-11 18:55:58 +02:00
|
|
|
link->node->ping_sent = 0;
|
|
|
|
|
|
|
|
/* The PFAIL condition can be reversed without external
|
2013-06-11 21:33:00 +02:00
|
|
|
* help if it is momentary (that is, if it does not
|
2013-04-11 18:55:58 +02:00
|
|
|
* turn into a FAIL state).
|
|
|
|
*
|
|
|
|
* The FAIL condition is also reversible under specific
|
|
|
|
* conditions detected by clearNodeFailureIfNeeded(). */
|
|
|
|
if (link->node->flags & REDIS_NODE_PFAIL) {
|
|
|
|
link->node->flags &= ~REDIS_NODE_PFAIL;
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
|
|
|
|
CLUSTER_TODO_UPDATE_STATE);
|
2013-04-11 18:55:58 +02:00
|
|
|
} else if (link->node->flags & REDIS_NODE_FAIL) {
|
|
|
|
clearNodeFailureIfNeeded(link->node);
|
|
|
|
}
|
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2013-11-08 10:32:16 +01:00
|
|
|
/* Check for role switch: slave -> master or master -> slave. */
|
2011-03-29 17:51:15 +02:00
|
|
|
if (sender) {
|
|
|
|
if (!memcmp(hdr->slaveof,REDIS_NODE_NULL_NAME,
|
|
|
|
sizeof(hdr->slaveof)))
|
|
|
|
{
|
2013-03-15 16:35:16 +01:00
|
|
|
/* Node is a master. */
|
2013-11-08 17:02:10 +01:00
|
|
|
clusterSetNodeAsMaster(sender);
|
2011-03-29 17:51:15 +02:00
|
|
|
} else {
|
2013-03-15 16:35:16 +01:00
|
|
|
/* Node is a slave. */
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterNode *master = clusterLookupNode(hdr->slaveof);
|
|
|
|
|
2013-03-15 16:35:16 +01:00
|
|
|
if (sender->flags & REDIS_NODE_MASTER) {
|
2013-03-20 00:30:47 +01:00
|
|
|
/* Master turned into a slave! Reconfigure the node. */
|
2013-03-15 16:35:16 +01:00
|
|
|
clusterDelNodeSlots(sender);
|
2013-03-19 16:01:30 +01:00
|
|
|
sender->flags &= ~REDIS_NODE_MASTER;
|
|
|
|
sender->flags |= REDIS_NODE_SLAVE;
|
2013-03-20 00:30:47 +01:00
|
|
|
|
2013-03-19 16:01:30 +01:00
|
|
|
/* Remove the list of slaves from the node. */
|
|
|
|
if (sender->numslaves) clusterNodeResetSlaves(sender);
|
2013-03-20 00:30:47 +01:00
|
|
|
|
|
|
|
/* Update config and state. */
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
|
|
|
|
CLUSTER_TODO_UPDATE_STATE);
|
2013-03-15 16:35:16 +01:00
|
|
|
}
|
|
|
|
|
2013-03-19 16:01:30 +01:00
|
|
|
/* Master node changed for this slave? */
|
|
|
|
if (sender->slaveof != master) {
|
2013-03-25 15:01:25 +01:00
|
|
|
if (sender->slaveof)
|
|
|
|
clusterNodeRemoveSlave(sender->slaveof,sender);
|
2013-03-05 11:50:11 +01:00
|
|
|
clusterNodeAddSlave(master,sender);
|
|
|
|
sender->slaveof = master;
|
2013-10-02 12:27:12 +02:00
|
|
|
|
|
|
|
/* Update config. */
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
|
2013-03-05 11:50:11 +01:00
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-03-15 16:15:40 +01:00
|
|
|
/* Update our info about served slots.
|
2013-11-08 10:32:16 +01:00
|
|
|
*
|
2013-03-15 16:15:40 +01:00
|
|
|
* Note: this MUST happen after we update the master/slave state
|
|
|
|
* so that REDIS_NODE_MASTER flag will be set. */
|
2013-11-08 10:32:16 +01:00
|
|
|
|
|
|
|
/* Many checks are only needed if the set of served slots this
|
|
|
|
* instance claims is different compared to the set of slots we have for
|
|
|
|
* it. Check this ASAP to avoid other computational expansive checks later. */
|
|
|
|
clusterNode *sender_master = NULL; /* Sender or its master if it is a slave. */
|
|
|
|
int dirty_slots = 0; /* Sender claimed slots don't match my view? */
|
|
|
|
|
|
|
|
if (sender) {
|
|
|
|
sender_master = (sender->flags & REDIS_NODE_MASTER) ? sender :
|
|
|
|
sender->slaveof;
|
|
|
|
if (sender_master) {
|
|
|
|
dirty_slots = memcmp(sender_master->slots,
|
|
|
|
hdr->myslots,sizeof(hdr->myslots)) != 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-11-08 17:02:10 +01:00
|
|
|
/* 1) If the sender of the message is a master, and we detected that the
|
|
|
|
* set of slots it claims changed, scan the slots to see if we need
|
|
|
|
* to update our configuration. */
|
2013-11-08 10:32:16 +01:00
|
|
|
if (sender && sender->flags & REDIS_NODE_MASTER && dirty_slots) {
|
2013-11-08 17:02:10 +01:00
|
|
|
clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots);
|
2013-11-08 10:32:16 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/* 2) We also check for the reverse condition, that is, the sender claims
|
|
|
|
* to serve slots we know are served by a master with a greater
|
|
|
|
* configEpoch. If this happens we inform the sender.
|
|
|
|
*
|
|
|
|
* This is useful because sometimes after a partition heals, a reappearing
|
|
|
|
* master may be the last one to claim a given set of hash slots, but with
|
|
|
|
* a configuration that other instances know to be deprecated. Example:
|
|
|
|
*
|
|
|
|
* A and B are master and slave for slots 1,2,3.
|
|
|
|
* A is partitioned away, B gets promoted.
|
|
|
|
* B is partitioned away, and A returns available.
|
|
|
|
*
|
|
|
|
* Usually B would PING A publishing its set of served slots and its
|
|
|
|
* configEpoch, but because of the partition B can't inform A of the new
|
|
|
|
* configuration, so other nodes that have an updated table must do it.
|
|
|
|
* In this way A will stop to act as a master (or can try to failover if
|
|
|
|
* there are the conditions to win the election). */
|
|
|
|
if (sender && dirty_slots) {
|
|
|
|
int j;
|
|
|
|
|
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
|
|
|
if (bitmapTestBit(hdr->myslots,j)) {
|
|
|
|
if (server.cluster->slots[j] == sender ||
|
|
|
|
server.cluster->slots[j] == NULL) continue;
|
|
|
|
if (server.cluster->slots[j]->configEpoch >
|
|
|
|
senderConfigEpoch)
|
|
|
|
{
|
2013-11-08 16:26:50 +01:00
|
|
|
redisLog(REDIS_WARNING,
|
|
|
|
"Node %.40s has old slots configuration, sending "
|
2013-11-08 17:27:59 +01:00
|
|
|
"an UPDATE message about %.40s",
|
2013-11-08 16:26:50 +01:00
|
|
|
sender->name, server.cluster->slots[j]->name);
|
|
|
|
clusterSendUpdate(sender->link,server.cluster->slots[j]);
|
2013-11-08 17:25:49 +01:00
|
|
|
|
|
|
|
/* TODO: instead of exiting the loop send every other
|
|
|
|
* UPDATE packet for other nodes that are the new owner
|
|
|
|
* of sender's slots. */
|
|
|
|
break;
|
2013-11-08 10:32:16 +01:00
|
|
|
}
|
2013-09-30 11:44:23 +02:00
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Get info from the gossip section */
|
|
|
|
clusterProcessGossipSection(hdr,link);
|
2013-10-02 09:42:35 +02:00
|
|
|
} else if (type == CLUSTERMSG_TYPE_FAIL) {
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterNode *failing;
|
|
|
|
|
2013-10-02 09:42:35 +02:00
|
|
|
if (sender) {
|
|
|
|
failing = clusterLookupNode(hdr->data.fail.about.nodename);
|
|
|
|
if (failing && !(failing->flags & (REDIS_NODE_FAIL|REDIS_NODE_MYSELF)))
|
|
|
|
{
|
|
|
|
redisLog(REDIS_NOTICE,
|
|
|
|
"FAIL message received from %.40s about %.40s",
|
|
|
|
hdr->sender, hdr->data.fail.about.nodename);
|
|
|
|
failing->flags |= REDIS_NODE_FAIL;
|
2013-10-09 16:18:33 +02:00
|
|
|
failing->fail_time = mstime();
|
2013-10-02 09:42:35 +02:00
|
|
|
failing->flags &= ~REDIS_NODE_PFAIL;
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE);
|
2013-10-02 09:42:35 +02:00
|
|
|
}
|
|
|
|
} else {
|
2011-03-29 17:51:15 +02:00
|
|
|
redisLog(REDIS_NOTICE,
|
2013-10-02 09:42:35 +02:00
|
|
|
"Ignoring FAIL message from unknonw node %.40s about %.40s",
|
2011-03-29 17:51:15 +02:00
|
|
|
hdr->sender, hdr->data.fail.about.nodename);
|
|
|
|
}
|
2011-10-07 16:34:16 +02:00
|
|
|
} else if (type == CLUSTERMSG_TYPE_PUBLISH) {
|
|
|
|
robj *channel, *message;
|
|
|
|
uint32_t channel_len, message_len;
|
|
|
|
|
2013-10-03 09:55:20 +02:00
|
|
|
/* Don't bother creating useless objects if there are no
|
|
|
|
* Pub/Sub subscribers. */
|
2011-10-07 16:34:16 +02:00
|
|
|
if (dictSize(server.pubsub_channels) || listLength(server.pubsub_patterns)) {
|
|
|
|
channel_len = ntohl(hdr->data.publish.msg.channel_len);
|
|
|
|
message_len = ntohl(hdr->data.publish.msg.message_len);
|
|
|
|
channel = createStringObject(
|
|
|
|
(char*)hdr->data.publish.msg.bulk_data,channel_len);
|
|
|
|
message = createStringObject(
|
|
|
|
(char*)hdr->data.publish.msg.bulk_data+channel_len, message_len);
|
|
|
|
pubsubPublishMessage(channel,message);
|
|
|
|
decrRefCount(channel);
|
|
|
|
decrRefCount(message);
|
|
|
|
}
|
2013-03-14 21:21:58 +01:00
|
|
|
} else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) {
|
2013-06-12 10:50:07 -07:00
|
|
|
if (!sender) return 1; /* We don't know that node. */
|
2013-09-20 09:22:21 +02:00
|
|
|
clusterSendFailoverAuthIfNeeded(sender,hdr);
|
2013-03-14 21:21:58 +01:00
|
|
|
} else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) {
|
2013-06-12 10:50:07 -07:00
|
|
|
if (!sender) return 1; /* We don't know that node. */
|
2013-10-01 15:40:20 +02:00
|
|
|
/* We consider this vote only if the sender is a master serving
|
2013-10-01 17:21:28 +02:00
|
|
|
* a non zero number of slots, and its currentEpoch is greater or
|
|
|
|
* equal to epoch where this node started the election. */
|
2013-09-26 13:00:41 +02:00
|
|
|
if (sender->flags & REDIS_NODE_MASTER &&
|
|
|
|
sender->numslots > 0 &&
|
2013-10-01 17:21:28 +02:00
|
|
|
senderCurrentEpoch >= server.cluster->failover_auth_epoch)
|
2013-09-26 13:00:41 +02:00
|
|
|
{
|
2013-03-14 21:21:58 +01:00
|
|
|
server.cluster->failover_auth_count++;
|
2013-09-26 16:54:43 +02:00
|
|
|
/* Maybe we reached a quorum here, set a flag to make sure
|
|
|
|
* we check ASAP. */
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
|
2013-09-26 13:00:41 +02:00
|
|
|
}
|
2013-11-08 17:02:10 +01:00
|
|
|
} else if (type == CLUSTERMSG_TYPE_UPDATE) {
|
|
|
|
clusterNode *n; /* The node the update is about. */
|
|
|
|
uint64_t reportedConfigEpoch = ntohu64(hdr->data.update.nodecfg.configEpoch);
|
|
|
|
|
|
|
|
if (!sender) return 1; /* We don't know the sender. */
|
|
|
|
n = clusterLookupNode(hdr->data.update.nodecfg.nodename);
|
|
|
|
if (!n) return 1; /* We don't know the reported node. */
|
|
|
|
if (n->configEpoch >= reportedConfigEpoch) return 1; /* Nothing new. */
|
|
|
|
|
|
|
|
/* If in our current config the node is a slave, set it as a master. */
|
|
|
|
if (n->flags & REDIS_NODE_SLAVE) clusterSetNodeAsMaster(n);
|
|
|
|
|
|
|
|
/* Check the bitmap of served slots and update our config accordingly. */
|
|
|
|
clusterUpdateSlotsConfigWith(n,reportedConfigEpoch,
|
|
|
|
hdr->data.update.nodecfg.slots);
|
2011-03-29 17:51:15 +02:00
|
|
|
} else {
|
2011-10-07 15:37:34 +02:00
|
|
|
redisLog(REDIS_WARNING,"Received unknown packet type: %d", type);
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This function is called when we detect the link with this node is lost.
|
|
|
|
We set the node as no longer connected. The Cluster Cron will detect
|
|
|
|
this connection and will try to get it connected again.
|
|
|
|
|
|
|
|
Instead if the node is a temporary node used to accept a query, we
|
|
|
|
completely free the node on error. */
|
|
|
|
void handleLinkIOError(clusterLink *link) {
|
|
|
|
freeClusterLink(link);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Send data. This is handled using a trivial send buffer that gets
|
|
|
|
* consumed by write(). We don't try to optimize this for speed too much
|
|
|
|
* as this is a very low traffic channel. */
|
|
|
|
void clusterWriteHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
|
|
|
|
clusterLink *link = (clusterLink*) privdata;
|
|
|
|
ssize_t nwritten;
|
|
|
|
REDIS_NOTUSED(el);
|
|
|
|
REDIS_NOTUSED(mask);
|
|
|
|
|
|
|
|
nwritten = write(fd, link->sndbuf, sdslen(link->sndbuf));
|
|
|
|
if (nwritten <= 0) {
|
2013-02-20 13:18:51 +01:00
|
|
|
redisLog(REDIS_DEBUG,"I/O error writing to node link: %s",
|
2011-03-29 17:51:15 +02:00
|
|
|
strerror(errno));
|
|
|
|
handleLinkIOError(link);
|
|
|
|
return;
|
|
|
|
}
|
2013-07-24 11:21:39 +02:00
|
|
|
sdsrange(link->sndbuf,nwritten,-1);
|
2011-03-29 17:51:15 +02:00
|
|
|
if (sdslen(link->sndbuf) == 0)
|
|
|
|
aeDeleteFileEvent(server.el, link->fd, AE_WRITABLE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Read data. Try to read the first field of the header first to check the
|
|
|
|
* full length of the packet. When a whole packet is in memory this function
|
|
|
|
* will call the function to process the packet. And so forth. */
|
|
|
|
void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
|
2013-11-08 17:19:19 +01:00
|
|
|
char buf[sizeof(clusterMsg)];
|
2011-03-29 17:51:15 +02:00
|
|
|
ssize_t nread;
|
|
|
|
clusterMsg *hdr;
|
|
|
|
clusterLink *link = (clusterLink*) privdata;
|
2013-02-15 16:44:39 +01:00
|
|
|
int readlen, rcvbuflen;
|
2011-03-29 17:51:15 +02:00
|
|
|
REDIS_NOTUSED(el);
|
|
|
|
REDIS_NOTUSED(mask);
|
|
|
|
|
2013-09-03 11:43:07 +02:00
|
|
|
while(1) { /* Read as long as there is data to read. */
|
|
|
|
rcvbuflen = sdslen(link->rcvbuf);
|
|
|
|
if (rcvbuflen < 4) {
|
|
|
|
/* First, obtain the first four bytes to get the full message
|
|
|
|
* length. */
|
|
|
|
readlen = 4 - rcvbuflen;
|
|
|
|
} else {
|
|
|
|
/* Finally read the full message. */
|
|
|
|
hdr = (clusterMsg*) link->rcvbuf;
|
|
|
|
if (rcvbuflen == 4) {
|
|
|
|
/* Perform some sanity check on the message length. */
|
|
|
|
if (ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN) {
|
|
|
|
redisLog(REDIS_WARNING,
|
|
|
|
"Bad message length received from Cluster bus.");
|
|
|
|
handleLinkIOError(link);
|
|
|
|
return;
|
|
|
|
}
|
2013-02-15 16:44:39 +01:00
|
|
|
}
|
2013-09-03 11:43:07 +02:00
|
|
|
readlen = ntohl(hdr->totlen) - rcvbuflen;
|
|
|
|
if (readlen > sizeof(buf)) readlen = sizeof(buf);
|
2013-02-15 16:44:39 +01:00
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2013-09-03 11:43:07 +02:00
|
|
|
nread = read(fd,buf,readlen);
|
|
|
|
if (nread == -1 && errno == EAGAIN) return; /* No more data ready. */
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2013-09-03 11:43:07 +02:00
|
|
|
if (nread <= 0) {
|
|
|
|
/* I/O error... */
|
|
|
|
redisLog(REDIS_DEBUG,"I/O error reading from node link: %s",
|
|
|
|
(nread == 0) ? "connection closed" : strerror(errno));
|
|
|
|
handleLinkIOError(link);
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
/* Read data and recast the pointer to the new buffer. */
|
|
|
|
link->rcvbuf = sdscatlen(link->rcvbuf,buf,nread);
|
|
|
|
hdr = (clusterMsg*) link->rcvbuf;
|
|
|
|
rcvbuflen += nread;
|
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2013-09-03 11:43:07 +02:00
|
|
|
/* Total length obtained? Process this packet. */
|
|
|
|
if (rcvbuflen >= 4 && rcvbuflen == ntohl(hdr->totlen)) {
|
|
|
|
if (clusterProcessPacket(link)) {
|
|
|
|
sdsfree(link->rcvbuf);
|
|
|
|
link->rcvbuf = sdsempty();
|
|
|
|
} else {
|
|
|
|
return; /* Link no longer valid. */
|
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-09-26 16:54:43 +02:00
|
|
|
/* Put stuff into the send buffer.
|
|
|
|
*
|
|
|
|
* It is guaranteed that this function will never have as a side effect
|
|
|
|
* the link to be invalidated, so it is safe to call this function
|
|
|
|
* from event handlers that will do stuff with the same link later. */
|
2011-03-29 17:51:15 +02:00
|
|
|
void clusterSendMessage(clusterLink *link, unsigned char *msg, size_t msglen) {
|
|
|
|
if (sdslen(link->sndbuf) == 0 && msglen != 0)
|
|
|
|
aeCreateFileEvent(server.el,link->fd,AE_WRITABLE,
|
|
|
|
clusterWriteHandler,link);
|
|
|
|
|
|
|
|
link->sndbuf = sdscatlen(link->sndbuf, msg, msglen);
|
2013-10-02 10:10:08 +02:00
|
|
|
server.cluster->stats_bus_messages_sent++;
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
2013-03-15 15:36:36 +01:00
|
|
|
/* Send a message to all the nodes that are part of the cluster having
|
2013-09-26 16:54:43 +02:00
|
|
|
* a connected link.
|
|
|
|
*
|
|
|
|
* It is guaranteed that this function will never have as a side effect
|
|
|
|
* some node->link to be invalidated, so it is safe to call this function
|
|
|
|
* from event handlers that will do stuff with node links later. */
|
2011-10-07 15:37:34 +02:00
|
|
|
void clusterBroadcastMessage(void *buf, size_t len) {
|
|
|
|
dictIterator *di;
|
|
|
|
dictEntry *de;
|
|
|
|
|
2013-09-04 10:07:50 +02:00
|
|
|
di = dictGetSafeIterator(server.cluster->nodes);
|
2011-10-07 15:37:34 +02:00
|
|
|
while((de = dictNext(di)) != NULL) {
|
2011-11-08 17:07:55 +01:00
|
|
|
clusterNode *node = dictGetVal(de);
|
2011-10-07 15:37:34 +02:00
|
|
|
|
|
|
|
if (!node->link) continue;
|
2013-03-15 15:36:36 +01:00
|
|
|
if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE))
|
|
|
|
continue;
|
2011-10-07 15:37:34 +02:00
|
|
|
clusterSendMessage(node->link,buf,len);
|
|
|
|
}
|
|
|
|
dictReleaseIterator(di);
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* Build the message header */
|
|
|
|
void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
|
2012-01-25 16:46:19 +01:00
|
|
|
int totlen = 0;
|
2013-10-07 11:30:58 +02:00
|
|
|
clusterNode *master;
|
|
|
|
|
|
|
|
/* If this node is a master, we send its slots bitmap and configEpoch.
|
|
|
|
* If this node is a slave we send the master's information instead (the
|
|
|
|
* node is flagged as slave so the receiver knows that it is NOT really
|
|
|
|
* in charge for this slots. */
|
|
|
|
master = (server.cluster->myself->flags & REDIS_NODE_SLAVE &&
|
|
|
|
server.cluster->myself->slaveof) ?
|
|
|
|
server.cluster->myself->slaveof : server.cluster->myself;
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
memset(hdr,0,sizeof(*hdr));
|
|
|
|
hdr->type = htons(type);
|
2013-02-14 13:20:56 +01:00
|
|
|
memcpy(hdr->sender,server.cluster->myself->name,REDIS_CLUSTER_NAMELEN);
|
2013-10-07 11:30:58 +02:00
|
|
|
|
|
|
|
memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots));
|
2011-03-29 17:51:15 +02:00
|
|
|
memset(hdr->slaveof,0,REDIS_CLUSTER_NAMELEN);
|
2013-02-14 13:20:56 +01:00
|
|
|
if (server.cluster->myself->slaveof != NULL) {
|
|
|
|
memcpy(hdr->slaveof,server.cluster->myself->slaveof->name,
|
2011-03-29 17:51:15 +02:00
|
|
|
REDIS_CLUSTER_NAMELEN);
|
|
|
|
}
|
|
|
|
hdr->port = htons(server.port);
|
2013-03-20 10:32:00 +01:00
|
|
|
hdr->flags = htons(server.cluster->myself->flags);
|
2013-02-14 13:20:56 +01:00
|
|
|
hdr->state = server.cluster->state;
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2013-10-07 11:30:58 +02:00
|
|
|
/* Set the currentEpoch and configEpochs. */
|
2013-09-25 11:53:35 +02:00
|
|
|
hdr->currentEpoch = htonu64(server.cluster->currentEpoch);
|
2013-10-07 11:30:58 +02:00
|
|
|
hdr->configEpoch = htonu64(master->configEpoch);
|
2013-09-25 11:53:35 +02:00
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
if (type == CLUSTERMSG_TYPE_FAIL) {
|
|
|
|
totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
|
|
|
|
totlen += sizeof(clusterMsgDataFail);
|
2013-11-08 16:26:50 +01:00
|
|
|
} else if (type == CLUSTERMSG_TYPE_UPDATE) {
|
|
|
|
totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
|
|
|
|
totlen += sizeof(clusterMsgDataUpdate);
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
hdr->totlen = htonl(totlen);
|
2013-09-25 11:53:35 +02:00
|
|
|
/* For PING, PONG, and MEET, fixing the totlen field is up to the caller. */
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Send a PING or PONG packet to the specified node, making sure to add enough
|
|
|
|
* gossip informations. */
|
|
|
|
void clusterSendPing(clusterLink *link, int type) {
|
2013-11-08 17:19:19 +01:00
|
|
|
unsigned char buf[sizeof(clusterMsg)];
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterMsg *hdr = (clusterMsg*) buf;
|
|
|
|
int gossipcount = 0, totlen;
|
|
|
|
/* freshnodes is the number of nodes we can still use to populate the
|
|
|
|
* gossip section of the ping packet. Basically we start with the nodes
|
|
|
|
* we have in memory minus two (ourself and the node we are sending the
|
|
|
|
* message to). Every time we add a node we decrement the counter, so when
|
|
|
|
* it will drop to <= zero we know there is no more gossip info we can
|
|
|
|
* send. */
|
2013-02-14 13:20:56 +01:00
|
|
|
int freshnodes = dictSize(server.cluster->nodes)-2;
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
if (link->node && type == CLUSTERMSG_TYPE_PING)
|
2013-10-09 16:18:33 +02:00
|
|
|
link->node->ping_sent = mstime();
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterBuildMessageHdr(hdr,type);
|
|
|
|
|
|
|
|
/* Populate the gossip fields */
|
|
|
|
while(freshnodes > 0 && gossipcount < 3) {
|
2013-02-14 13:20:56 +01:00
|
|
|
struct dictEntry *de = dictGetRandomKey(server.cluster->nodes);
|
2011-11-08 17:07:55 +01:00
|
|
|
clusterNode *this = dictGetVal(de);
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterMsgDataGossip *gossip;
|
|
|
|
int j;
|
|
|
|
|
2013-02-28 15:00:09 +01:00
|
|
|
/* In the gossip section don't include:
|
|
|
|
* 1) Myself.
|
|
|
|
* 2) Nodes in HANDSHAKE state.
|
|
|
|
* 3) Nodes with the NOADDR flag set.
|
|
|
|
* 4) Disconnected nodes if they don't have configured slots.
|
|
|
|
*/
|
2013-02-14 13:20:56 +01:00
|
|
|
if (this == server.cluster->myself ||
|
2013-02-28 15:00:09 +01:00
|
|
|
this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) ||
|
2013-02-28 15:13:32 +01:00
|
|
|
(this->link == NULL && this->numslots == 0))
|
2013-02-28 15:00:09 +01:00
|
|
|
{
|
2011-03-29 17:51:15 +02:00
|
|
|
freshnodes--; /* otherwise we may loop forever. */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Check if we already added this node */
|
|
|
|
for (j = 0; j < gossipcount; j++) {
|
|
|
|
if (memcmp(hdr->data.ping.gossip[j].nodename,this->name,
|
|
|
|
REDIS_CLUSTER_NAMELEN) == 0) break;
|
|
|
|
}
|
|
|
|
if (j != gossipcount) continue;
|
|
|
|
|
|
|
|
/* Add it */
|
|
|
|
freshnodes--;
|
|
|
|
gossip = &(hdr->data.ping.gossip[gossipcount]);
|
|
|
|
memcpy(gossip->nodename,this->name,REDIS_CLUSTER_NAMELEN);
|
|
|
|
gossip->ping_sent = htonl(this->ping_sent);
|
|
|
|
gossip->pong_received = htonl(this->pong_received);
|
|
|
|
memcpy(gossip->ip,this->ip,sizeof(this->ip));
|
|
|
|
gossip->port = htons(this->port);
|
|
|
|
gossip->flags = htons(this->flags);
|
|
|
|
gossipcount++;
|
|
|
|
}
|
|
|
|
totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
|
|
|
|
totlen += (sizeof(clusterMsgDataGossip)*gossipcount);
|
|
|
|
hdr->count = htons(gossipcount);
|
|
|
|
hdr->totlen = htonl(totlen);
|
|
|
|
clusterSendMessage(link,buf,totlen);
|
|
|
|
}
|
|
|
|
|
2013-09-26 16:54:43 +02:00
|
|
|
/* Send a PONG packet to every connected node that's not in handshake state
|
|
|
|
* and for which we have a valid link.
|
2013-03-15 15:43:53 +01:00
|
|
|
*
|
2013-09-26 16:54:43 +02:00
|
|
|
* In Redis Cluster pongs are not used just for failure detection, but also
|
|
|
|
* to carry important configuration information. So broadcasting a pong is
|
2013-03-15 15:43:53 +01:00
|
|
|
* useful when something changes in the configuration and we want to make
|
|
|
|
* the cluster aware ASAP (for instance after a slave promotion). */
|
2013-04-11 18:19:48 +02:00
|
|
|
void clusterBroadcastPong(void) {
|
2013-03-15 15:43:53 +01:00
|
|
|
dictIterator *di;
|
|
|
|
dictEntry *de;
|
|
|
|
|
2013-09-04 10:07:50 +02:00
|
|
|
di = dictGetSafeIterator(server.cluster->nodes);
|
2013-03-15 15:43:53 +01:00
|
|
|
while((de = dictNext(di)) != NULL) {
|
|
|
|
clusterNode *node = dictGetVal(de);
|
|
|
|
|
2013-09-26 16:54:43 +02:00
|
|
|
if (!node->link) continue;
|
2013-03-15 15:43:53 +01:00
|
|
|
if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE)) continue;
|
|
|
|
clusterSendPing(node->link,CLUSTERMSG_TYPE_PONG);
|
|
|
|
}
|
|
|
|
dictReleaseIterator(di);
|
|
|
|
}
|
|
|
|
|
2011-10-07 15:37:34 +02:00
|
|
|
/* Send a PUBLISH message.
|
|
|
|
*
|
|
|
|
* If link is NULL, then the message is broadcasted to the whole cluster. */
|
|
|
|
void clusterSendPublish(clusterLink *link, robj *channel, robj *message) {
|
2013-11-08 17:19:19 +01:00
|
|
|
unsigned char buf[sizeof(clusterMsg)], *payload;
|
2011-10-07 15:37:34 +02:00
|
|
|
clusterMsg *hdr = (clusterMsg*) buf;
|
|
|
|
uint32_t totlen;
|
|
|
|
uint32_t channel_len, message_len;
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2011-10-07 15:37:34 +02:00
|
|
|
channel = getDecodedObject(channel);
|
|
|
|
message = getDecodedObject(message);
|
|
|
|
channel_len = sdslen(channel->ptr);
|
|
|
|
message_len = sdslen(message->ptr);
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2011-10-07 15:37:34 +02:00
|
|
|
clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_PUBLISH);
|
|
|
|
totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
|
|
|
|
totlen += sizeof(clusterMsgDataPublish) + channel_len + message_len;
|
|
|
|
|
|
|
|
hdr->data.publish.msg.channel_len = htonl(channel_len);
|
|
|
|
hdr->data.publish.msg.message_len = htonl(message_len);
|
|
|
|
hdr->totlen = htonl(totlen);
|
|
|
|
|
|
|
|
/* Try to use the local buffer if possible */
|
|
|
|
if (totlen < sizeof(buf)) {
|
|
|
|
payload = buf;
|
|
|
|
} else {
|
|
|
|
payload = zmalloc(totlen);
|
2012-04-24 11:28:10 +02:00
|
|
|
memcpy(payload,hdr,sizeof(*hdr));
|
2013-09-03 11:27:01 +02:00
|
|
|
hdr = (clusterMsg*) payload;
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
2011-10-07 15:37:34 +02:00
|
|
|
memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr));
|
|
|
|
memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr),
|
|
|
|
message->ptr,sdslen(message->ptr));
|
|
|
|
|
|
|
|
if (link)
|
|
|
|
clusterSendMessage(link,payload,totlen);
|
|
|
|
else
|
|
|
|
clusterBroadcastMessage(payload,totlen);
|
|
|
|
|
|
|
|
decrRefCount(channel);
|
|
|
|
decrRefCount(message);
|
|
|
|
if (payload != buf) zfree(payload);
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Send a FAIL message to all the nodes we are able to contact.
|
|
|
|
* The FAIL message is sent when we detect that a node is failing
|
|
|
|
* (REDIS_NODE_PFAIL) and we also receive a gossip confirmation of this:
|
|
|
|
* we switch the node state to REDIS_NODE_FAIL and ask all the other
|
|
|
|
* nodes to do the same ASAP. */
|
|
|
|
void clusterSendFail(char *nodename) {
|
2013-11-08 17:19:19 +01:00
|
|
|
unsigned char buf[sizeof(clusterMsg)];
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterMsg *hdr = (clusterMsg*) buf;
|
|
|
|
|
|
|
|
clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAIL);
|
|
|
|
memcpy(hdr->data.fail.about.nodename,nodename,REDIS_CLUSTER_NAMELEN);
|
|
|
|
clusterBroadcastMessage(buf,ntohl(hdr->totlen));
|
|
|
|
}
|
|
|
|
|
2013-11-08 16:26:50 +01:00
|
|
|
/* Send an UPDATE message to the specified link carrying the specified 'node'
|
|
|
|
* slots configuration. The node name, slots bitmap, and configEpoch info
|
|
|
|
* are included. */
|
|
|
|
void clusterSendUpdate(clusterLink *link, clusterNode *node) {
|
2013-11-08 17:19:19 +01:00
|
|
|
unsigned char buf[sizeof(clusterMsg)];
|
2013-11-08 16:26:50 +01:00
|
|
|
clusterMsg *hdr = (clusterMsg*) buf;
|
|
|
|
|
2013-12-17 12:28:37 +01:00
|
|
|
if (link == NULL) return;
|
2013-11-08 16:26:50 +01:00
|
|
|
clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_UPDATE);
|
|
|
|
memcpy(hdr->data.update.nodecfg.nodename,node->name,REDIS_CLUSTER_NAMELEN);
|
|
|
|
hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch);
|
|
|
|
memcpy(hdr->data.update.nodecfg.slots,node->slots,sizeof(node->slots));
|
|
|
|
clusterSendMessage(link,buf,ntohl(hdr->totlen));
|
|
|
|
}
|
|
|
|
|
2011-10-07 15:37:34 +02:00
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* CLUSTER Pub/Sub support
|
|
|
|
*
|
|
|
|
* For now we do very little, just propagating PUBLISH messages across the whole
|
|
|
|
* cluster. In the future we'll try to get smarter and avoid propagating those
|
|
|
|
* messages to hosts without receivers for a given channel.
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
/* Propagate a PUBLISH of 'message' on 'channel' to the whole cluster: the
 * NULL link asks clusterSendPublish() to broadcast the message. */
void clusterPropagatePublish(robj *channel, robj *message) {
    clusterSendPublish(NULL, channel, message);
}
|
|
|
|
|
2013-03-13 12:44:02 +01:00
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* SLAVE node specific functions
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
2013-03-13 17:21:20 +01:00
|
|
|
/* This function sends a FAILOVE_AUTH_REQUEST message to every node in order to
|
|
|
|
* see if there is the quorum for this slave instance to failover its failing
|
|
|
|
* master.
|
|
|
|
*
|
|
|
|
* Note that we send the failover request to everybody, master and slave nodes,
|
|
|
|
* but only the masters are supposed to reply to our query. */
|
|
|
|
void clusterRequestFailoverAuth(void) {
|
2013-11-08 17:19:19 +01:00
|
|
|
unsigned char buf[sizeof(clusterMsg)];
|
2013-03-13 17:21:20 +01:00
|
|
|
clusterMsg *hdr = (clusterMsg*) buf;
|
|
|
|
uint32_t totlen;
|
|
|
|
|
|
|
|
clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST);
|
|
|
|
totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
|
|
|
|
hdr->totlen = htonl(totlen);
|
2013-03-14 21:27:12 +01:00
|
|
|
clusterBroadcastMessage(buf,totlen);
|
2013-03-13 17:21:20 +01:00
|
|
|
}
|
|
|
|
|
2013-09-30 16:19:44 +02:00
|
|
|
/* Send a FAILOVER_AUTH_ACK message to the specified node. */
|
|
|
|
void clusterSendFailoverAuth(clusterNode *node) {
|
2013-11-08 17:19:19 +01:00
|
|
|
unsigned char buf[sizeof(clusterMsg)];
|
2013-03-14 16:31:57 +01:00
|
|
|
clusterMsg *hdr = (clusterMsg*) buf;
|
|
|
|
uint32_t totlen;
|
|
|
|
|
|
|
|
if (!node->link) return;
|
|
|
|
clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK);
|
|
|
|
totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
|
|
|
|
hdr->totlen = htonl(totlen);
|
2013-03-14 21:27:12 +01:00
|
|
|
clusterSendMessage(node->link,buf,totlen);
|
2013-03-14 16:31:57 +01:00
|
|
|
}
|
|
|
|
|
2013-09-26 13:00:41 +02:00
|
|
|
/* Vote for the node asking for our vote if there are the conditions. */
|
2013-09-20 09:22:21 +02:00
|
|
|
void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
|
2013-03-13 19:08:03 +01:00
|
|
|
clusterNode *master = node->slaveof;
|
2013-10-08 12:45:35 +02:00
|
|
|
uint64_t requestCurrentEpoch = ntohu64(request->currentEpoch);
|
|
|
|
uint64_t requestConfigEpoch = ntohu64(request->configEpoch);
|
|
|
|
unsigned char *claimed_slots = request->myslots;
|
|
|
|
int j;
|
2013-09-26 13:00:41 +02:00
|
|
|
|
|
|
|
/* IF we are not a master serving at least 1 slot, we don't have the
|
|
|
|
* right to vote, as the cluster size in Redis Cluster is the number
|
2013-11-29 16:17:05 +01:00
|
|
|
* of masters serving at least one slot, and quorum is the cluster
|
|
|
|
* size + 1 */
|
2013-09-26 13:00:41 +02:00
|
|
|
if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return;
|
|
|
|
if (server.cluster->myself->numslots == 0) return;
|
|
|
|
|
|
|
|
/* Request epoch must be >= our currentEpoch. */
|
2013-10-08 12:45:35 +02:00
|
|
|
if (requestCurrentEpoch < server.cluster->currentEpoch) return;
|
2013-03-13 19:08:03 +01:00
|
|
|
|
2013-09-26 13:00:41 +02:00
|
|
|
/* I already voted for this epoch? Return ASAP. */
|
|
|
|
if (server.cluster->last_vote_epoch == server.cluster->currentEpoch) return;
|
|
|
|
|
|
|
|
/* Node must be a slave and its master down. */
|
2013-03-13 19:08:03 +01:00
|
|
|
if (!(node->flags & REDIS_NODE_SLAVE) ||
|
|
|
|
master == NULL ||
|
|
|
|
!(master->flags & REDIS_NODE_FAIL)) return;
|
|
|
|
|
2013-09-26 13:00:41 +02:00
|
|
|
/* We did not voted for a slave about this master for two
|
|
|
|
* times the node timeout. This is not strictly needed for correctness
|
|
|
|
* of the algorithm but makes the base case more linear. */
|
2013-10-09 16:18:33 +02:00
|
|
|
if (mstime() - node->slaveof->voted_time < server.cluster_node_timeout * 2)
|
|
|
|
return;
|
2013-03-13 19:08:03 +01:00
|
|
|
|
2013-11-29 16:17:05 +01:00
|
|
|
/* The slave requesting the vote must have a configEpoch for the claimed
|
|
|
|
* slots that is >= the one of the masters currently serving the same
|
|
|
|
* slots in the current configuration. */
|
2013-10-08 12:45:35 +02:00
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
|
|
|
if (bitmapTestBit(claimed_slots, j) == 0) continue;
|
|
|
|
if (server.cluster->slots[j] == NULL ||
|
|
|
|
server.cluster->slots[j]->configEpoch <= requestConfigEpoch) continue;
|
|
|
|
/* If we reached this point we found a slot that in our current slots
|
|
|
|
* is served by a master with a greater configEpoch than the one claimed
|
|
|
|
* by the slave requesting our vote. Refuse to vote for this slave. */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2013-09-26 13:00:41 +02:00
|
|
|
/* We can vote for this slave. */
|
2013-09-30 16:19:44 +02:00
|
|
|
clusterSendFailoverAuth(node);
|
2013-09-26 13:00:41 +02:00
|
|
|
server.cluster->last_vote_epoch = server.cluster->currentEpoch;
|
2013-10-09 16:18:33 +02:00
|
|
|
node->slaveof->voted_time = mstime();
|
2013-03-13 18:38:08 +01:00
|
|
|
}
|
|
|
|
|
2013-03-13 12:44:02 +01:00
|
|
|
/* This function is called if we are a slave node and our master serving
|
2013-03-14 16:39:02 +01:00
|
|
|
* a non-zero amount of hash slots is in FAIL state.
|
2013-03-13 12:44:02 +01:00
|
|
|
*
|
|
|
|
* The gaol of this function is:
|
|
|
|
* 1) To check if we are able to perform a failover, is our data updated?
|
2013-09-26 11:13:17 +02:00
|
|
|
* 2) Try to get elected by masters.
|
2013-09-26 13:00:41 +02:00
|
|
|
* 3) Perform the failover informing all the other nodes.
|
2013-03-13 12:44:02 +01:00
|
|
|
*/
|
|
|
|
void clusterHandleSlaveFailover(void) {
|
2013-10-09 16:18:33 +02:00
|
|
|
mstime_t data_age;
|
2013-09-26 11:13:17 +02:00
|
|
|
mstime_t auth_age = mstime() - server.cluster->failover_auth_time;
|
2013-03-13 13:10:49 +01:00
|
|
|
int needed_quorum = (server.cluster->size / 2) + 1;
|
2013-03-15 16:53:41 +01:00
|
|
|
int j;
|
2013-03-13 13:10:49 +01:00
|
|
|
|
2013-11-29 16:17:05 +01:00
|
|
|
/* Set data_age to the number of seconds we are disconnected from
|
|
|
|
* the master. */
|
2013-10-07 16:07:13 +02:00
|
|
|
if (server.repl_state == REDIS_REPL_CONNECTED) {
|
2013-10-09 16:36:00 +02:00
|
|
|
data_age = (server.unixtime - server.master->lastinteraction) * 1000;
|
2013-10-07 16:07:13 +02:00
|
|
|
} else {
|
2013-10-09 16:36:00 +02:00
|
|
|
data_age = (server.unixtime - server.repl_down_since) * 1000;
|
2013-10-07 16:07:13 +02:00
|
|
|
}
|
|
|
|
|
2013-09-26 16:54:43 +02:00
|
|
|
/* Pre conditions to run the function:
|
|
|
|
* 1) We are a slave.
|
|
|
|
* 2) Our master is flagged as FAIL.
|
|
|
|
* 3) It is serving slots. */
|
|
|
|
if (!(server.cluster->myself->flags & REDIS_NODE_SLAVE) ||
|
|
|
|
server.cluster->myself->slaveof == NULL ||
|
|
|
|
!(server.cluster->myself->slaveof->flags & REDIS_NODE_FAIL) ||
|
|
|
|
server.cluster->myself->slaveof->numslots == 0) return;
|
|
|
|
|
2013-09-26 11:13:17 +02:00
|
|
|
/* Remove the node timeout from the data age as it is fine that we are
|
|
|
|
* disconnected from our master at least for the time it was down to be
|
|
|
|
* flagged as FAIL, that's the baseline. */
|
|
|
|
if (data_age > server.cluster_node_timeout)
|
|
|
|
data_age -= server.cluster_node_timeout;
|
|
|
|
|
2013-03-13 13:10:49 +01:00
|
|
|
/* Check if our data is recent enough. For now we just use a fixed
|
|
|
|
* constant of ten times the node timeout since the cluster should
|
|
|
|
* react much faster to a master down. */
|
2013-04-04 12:02:48 +02:00
|
|
|
if (data_age >
|
2013-04-09 11:24:18 +02:00
|
|
|
server.cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT)
|
2013-04-04 12:02:48 +02:00
|
|
|
return;
|
2013-03-13 13:10:49 +01:00
|
|
|
|
2013-09-26 11:13:17 +02:00
|
|
|
/* Compute the time at which we can start an election. */
|
2013-11-29 16:17:05 +01:00
|
|
|
if (auth_age >
|
2013-10-09 16:18:33 +02:00
|
|
|
server.cluster_node_timeout * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT)
|
2013-03-15 13:20:23 +01:00
|
|
|
{
|
2013-09-26 11:13:17 +02:00
|
|
|
server.cluster->failover_auth_time = mstime() +
|
|
|
|
500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
|
2013-10-09 16:18:33 +02:00
|
|
|
data_age / 10 + /* Add 100 milliseconds for every second of age. */
|
2013-09-26 11:13:17 +02:00
|
|
|
random() % 500; /* Random delay between 0 and 500 milliseconds. */
|
2013-03-13 13:10:49 +01:00
|
|
|
server.cluster->failover_auth_count = 0;
|
2013-09-26 11:13:17 +02:00
|
|
|
server.cluster->failover_auth_sent = 0;
|
2013-11-29 16:17:05 +01:00
|
|
|
redisLog(REDIS_WARNING,
|
|
|
|
"Start of election delayed for %lld milliseconds.",
|
2013-09-26 11:13:17 +02:00
|
|
|
server.cluster->failover_auth_time - mstime());
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return ASAP if we can't still start the election. */
|
|
|
|
if (mstime() < server.cluster->failover_auth_time) return;
|
|
|
|
|
|
|
|
/* Return ASAP if the election is too old to be valid. */
|
2013-11-29 16:17:05 +01:00
|
|
|
if (auth_age > server.cluster_node_timeout) return;
|
2013-09-26 11:13:17 +02:00
|
|
|
|
|
|
|
/* Ask for votes if needed. */
|
|
|
|
if (server.cluster->failover_auth_sent == 0) {
|
|
|
|
server.cluster->currentEpoch++;
|
|
|
|
server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
|
|
|
|
redisLog(REDIS_WARNING,"Starting a failover election for epoch %llu.",
|
2013-11-05 12:01:07 +01:00
|
|
|
(unsigned long long) server.cluster->currentEpoch);
|
2013-03-14 16:39:02 +01:00
|
|
|
clusterRequestFailoverAuth();
|
2013-09-26 11:13:17 +02:00
|
|
|
server.cluster->failover_auth_sent = 1;
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
|
|
|
|
CLUSTER_TODO_UPDATE_STATE|
|
|
|
|
CLUSTER_TODO_FSYNC_CONFIG);
|
2013-03-13 13:10:49 +01:00
|
|
|
return; /* Wait for replies. */
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Check if we reached the quorum. */
|
2013-03-15 13:20:23 +01:00
|
|
|
if (server.cluster->failover_auth_count >= needed_quorum) {
|
2013-03-15 16:53:41 +01:00
|
|
|
clusterNode *oldmaster = server.cluster->myself->slaveof;
|
|
|
|
|
2013-03-14 16:39:02 +01:00
|
|
|
redisLog(REDIS_WARNING,
|
2013-09-30 11:51:58 +02:00
|
|
|
"Failover election won: I'm the new master.");
|
2013-03-15 16:11:34 +01:00
|
|
|
/* We have the quorum, perform all the steps to correctly promote
|
|
|
|
* this slave to a master.
|
|
|
|
*
|
|
|
|
* 1) Turn this node into a master. */
|
|
|
|
clusterNodeRemoveSlave(server.cluster->myself->slaveof,
|
|
|
|
server.cluster->myself);
|
|
|
|
server.cluster->myself->flags &= ~REDIS_NODE_SLAVE;
|
|
|
|
server.cluster->myself->flags |= REDIS_NODE_MASTER;
|
|
|
|
server.cluster->myself->slaveof = NULL;
|
|
|
|
replicationUnsetMaster();
|
|
|
|
|
2013-03-15 16:53:41 +01:00
|
|
|
/* 2) Claim all the slots assigned to our master. */
|
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
|
|
|
if (clusterNodeGetSlotBit(oldmaster,j)) {
|
|
|
|
clusterDelSlot(j);
|
|
|
|
clusterAddSlot(server.cluster->myself,j);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-09-30 10:13:58 +02:00
|
|
|
/* 3) Update my configEpoch to the epoch of the election. */
|
2013-11-29 16:17:05 +01:00
|
|
|
server.cluster->myself->configEpoch =
|
|
|
|
server.cluster->failover_auth_epoch;
|
2013-09-26 11:13:17 +02:00
|
|
|
|
2013-09-30 10:13:58 +02:00
|
|
|
/* 4) Update state and save config. */
|
2013-03-15 16:53:41 +01:00
|
|
|
clusterUpdateState();
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterSaveConfigOrDie(1);
|
2013-09-30 10:13:58 +02:00
|
|
|
|
|
|
|
/* 5) Pong all the other nodes so that they can update the state
|
|
|
|
* accordingly and detect that we switched to master role. */
|
|
|
|
clusterBroadcastPong();
|
2013-03-13 13:10:49 +01:00
|
|
|
}
|
2013-03-13 12:44:02 +01:00
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* CLUSTER cron job
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
2013-10-09 16:18:33 +02:00
|
|
|
/* This is executed 10 times every second */
|
2011-03-29 17:51:15 +02:00
|
|
|
void clusterCron(void) {
|
|
|
|
dictIterator *di;
|
|
|
|
dictEntry *de;
|
2013-03-07 15:40:53 +01:00
|
|
|
int j, update_state = 0;
|
2013-10-09 16:18:33 +02:00
|
|
|
mstime_t min_pong = 0, now = mstime();
|
2013-04-11 18:55:58 +02:00
|
|
|
clusterNode *min_pong_node = NULL;
|
2013-10-09 16:29:14 +02:00
|
|
|
static unsigned long long iteration = 0;
|
2013-10-11 10:34:32 +02:00
|
|
|
mstime_t handshake_timeout;
|
2013-10-09 16:29:14 +02:00
|
|
|
|
|
|
|
iteration++; /* Number of times this function was called so far. */
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2013-12-05 16:35:32 +01:00
|
|
|
/* The handshake timeout is the time after which a handshake node that was
|
2013-10-11 10:34:32 +02:00
|
|
|
* not turned into a normal node is removed from the nodes. Usually it is
|
|
|
|
* just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use
|
|
|
|
* the value of 1 second. */
|
|
|
|
handshake_timeout = server.cluster_node_timeout;
|
|
|
|
if (handshake_timeout < 1000) handshake_timeout = 1000;
|
|
|
|
|
2013-01-17 01:00:20 +08:00
|
|
|
/* Check if we have disconnected nodes and re-establish the connection. */
|
2013-08-21 15:51:10 +02:00
|
|
|
di = dictGetSafeIterator(server.cluster->nodes);
|
2011-03-29 17:51:15 +02:00
|
|
|
while((de = dictNext(di)) != NULL) {
|
2011-11-08 17:07:55 +01:00
|
|
|
clusterNode *node = dictGetVal(de);
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR)) continue;
|
2013-09-04 12:41:21 +02:00
|
|
|
|
|
|
|
/* A Node in HANDSHAKE state has a limited lifespan equal to the
|
|
|
|
* configured node timeout. */
|
|
|
|
if (node->flags & REDIS_NODE_HANDSHAKE &&
|
2013-10-11 10:34:32 +02:00
|
|
|
now - node->ctime > handshake_timeout)
|
2013-09-04 12:41:21 +02:00
|
|
|
{
|
|
|
|
freeClusterNode(node);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
if (node->link == NULL) {
|
|
|
|
int fd;
|
2013-10-09 16:18:33 +02:00
|
|
|
mstime_t old_ping_sent;
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterLink *link;
|
|
|
|
|
|
|
|
fd = anetTcpNonBlockConnect(server.neterr, node->ip,
|
|
|
|
node->port+REDIS_CLUSTER_PORT_INCR);
|
|
|
|
if (fd == -1) continue;
|
|
|
|
link = createClusterLink(node);
|
|
|
|
link->fd = fd;
|
|
|
|
node->link = link;
|
|
|
|
aeCreateFileEvent(server.el,link->fd,AE_READABLE,clusterReadHandler,link);
|
2013-04-11 18:55:58 +02:00
|
|
|
/* Queue a PING in the new connection ASAP: this is crucial
|
|
|
|
* to avoid false positives in failure detection.
|
|
|
|
*
|
|
|
|
* If the node is flagged as MEET, we send a MEET message instead
|
2011-03-29 17:51:15 +02:00
|
|
|
* of a PING one, to force the receiver to add us in its node
|
|
|
|
* table. */
|
2013-04-11 19:12:29 +02:00
|
|
|
old_ping_sent = node->ping_sent;
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterSendPing(link, node->flags & REDIS_NODE_MEET ?
|
|
|
|
CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING);
|
2013-04-11 19:12:29 +02:00
|
|
|
if (old_ping_sent) {
|
|
|
|
/* If there was an active ping before the link was
|
|
|
|
* disconnected, we want to restore the ping time, otherwise
|
|
|
|
* replaced by the clusterSendPing() call. */
|
|
|
|
node->ping_sent = old_ping_sent;
|
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
/* We can clear the flag after the first packet is sent.
|
|
|
|
* If we'll never receive a PONG, we'll never send new packets
|
|
|
|
* to this node. Instead after the PONG is received and we
|
|
|
|
* are no longer in meet/handshake status, we want to send
|
|
|
|
* normal PING packets. */
|
|
|
|
node->flags &= ~REDIS_NODE_MEET;
|
|
|
|
|
2013-02-20 13:18:51 +01:00
|
|
|
redisLog(REDIS_DEBUG,"Connecting with Node %.40s at %s:%d", node->name, node->ip, node->port+REDIS_CLUSTER_PORT_INCR);
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
dictReleaseIterator(di);
|
|
|
|
|
2013-10-09 16:29:14 +02:00
|
|
|
/* Ping some random node 1 time every 10 iterations, so that we usually ping
|
|
|
|
* one random node every second. */
|
|
|
|
if (!(iteration % 10)) {
|
|
|
|
/* Check a few random nodes and ping the one with the oldest
|
|
|
|
* pong_received time. */
|
|
|
|
for (j = 0; j < 5; j++) {
|
|
|
|
de = dictGetRandomKey(server.cluster->nodes);
|
|
|
|
clusterNode *this = dictGetVal(de);
|
|
|
|
|
|
|
|
/* Don't ping nodes disconnected or with a ping currently active. */
|
|
|
|
if (this->link == NULL || this->ping_sent != 0) continue;
|
|
|
|
if (this->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE)) continue;
|
|
|
|
if (min_pong_node == NULL || min_pong > this->pong_received) {
|
|
|
|
min_pong_node = this;
|
|
|
|
min_pong = this->pong_received;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (min_pong_node) {
|
|
|
|
redisLog(REDIS_DEBUG,"Pinging node %.40s", min_pong_node->name);
|
|
|
|
clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Iterate nodes to check if we need to flag something as failing */
|
2013-09-04 10:07:50 +02:00
|
|
|
di = dictGetSafeIterator(server.cluster->nodes);
|
2011-03-29 17:51:15 +02:00
|
|
|
while((de = dictNext(di)) != NULL) {
|
2011-11-08 17:07:55 +01:00
|
|
|
clusterNode *node = dictGetVal(de);
|
2013-10-09 16:18:33 +02:00
|
|
|
now = mstime(); /* Use an updated time at every iteration. */
|
2013-12-17 10:27:12 +01:00
|
|
|
mstime_t delay;
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
if (node->flags &
|
2011-04-07 23:10:32 +02:00
|
|
|
(REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE))
|
|
|
|
continue;
|
2013-03-05 12:13:39 +01:00
|
|
|
|
2013-04-11 18:55:58 +02:00
|
|
|
/* If we are waiting for the PONG more than half the cluster
|
|
|
|
* timeout, reconnect the link: maybe there is a connection
|
|
|
|
* issue even if the node is alive. */
|
|
|
|
if (node->link && /* is connected */
|
2013-10-09 16:18:33 +02:00
|
|
|
now - node->link->ctime >
|
2013-05-03 12:37:45 +02:00
|
|
|
server.cluster_node_timeout && /* was not already reconnected */
|
2013-04-11 18:55:58 +02:00
|
|
|
node->ping_sent && /* we already sent a ping */
|
|
|
|
node->pong_received < node->ping_sent && /* still waiting pong */
|
|
|
|
/* and we are waiting for the pong more than timeout/2 */
|
|
|
|
now - node->ping_sent > server.cluster_node_timeout/2)
|
|
|
|
{
|
|
|
|
/* Disconnect the link, it will be reconnected automatically. */
|
|
|
|
freeClusterLink(node->link);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If we have currently no active ping in this instance, and the
|
|
|
|
* received PONG is older than half the cluster timeout, send
|
|
|
|
* a new ping now, to ensure all the nodes are pinged without
|
|
|
|
* a too big delay. */
|
2013-03-05 12:13:39 +01:00
|
|
|
if (node->link &&
|
2013-04-11 18:55:58 +02:00
|
|
|
node->ping_sent == 0 &&
|
|
|
|
(now - node->pong_received) > server.cluster_node_timeout/2)
|
2013-03-05 12:13:39 +01:00
|
|
|
{
|
|
|
|
clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2013-04-11 18:55:58 +02:00
|
|
|
/* Check only if we have an active ping for this instance. */
|
|
|
|
if (node->ping_sent == 0) continue;
|
2013-04-08 19:40:20 +02:00
|
|
|
|
2013-04-11 18:55:58 +02:00
|
|
|
/* Compute the delay of the PONG. Note that if we already received
|
|
|
|
* the PONG, then node->ping_sent is zero, so can't reach this
|
|
|
|
* code at all. */
|
|
|
|
delay = now - node->ping_sent;
|
2013-02-26 15:15:44 +01:00
|
|
|
|
2013-04-11 18:55:58 +02:00
|
|
|
if (delay > server.cluster_node_timeout) {
|
2013-01-17 01:00:20 +08:00
|
|
|
/* Timeout reached. Set the node as possibly failing if it is
|
2011-04-07 23:06:01 +02:00
|
|
|
* not already in this state. */
|
2011-04-07 23:10:32 +02:00
|
|
|
if (!(node->flags & (REDIS_NODE_PFAIL|REDIS_NODE_FAIL))) {
|
2011-03-29 17:51:15 +02:00
|
|
|
redisLog(REDIS_DEBUG,"*** NODE %.40s possibly failing",
|
|
|
|
node->name);
|
|
|
|
node->flags |= REDIS_NODE_PFAIL;
|
2013-03-07 15:40:53 +01:00
|
|
|
update_state = 1;
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
dictReleaseIterator(di);
|
2013-03-05 16:12:08 +01:00
|
|
|
|
|
|
|
/* If we are a slave node but the replication is still turned off,
|
|
|
|
* enable it if we know the address of our master and it appears to
|
|
|
|
* be up. */
|
|
|
|
if (server.cluster->myself->flags & REDIS_NODE_SLAVE &&
|
|
|
|
server.masterhost == NULL &&
|
|
|
|
server.cluster->myself->slaveof &&
|
|
|
|
!(server.cluster->myself->slaveof->flags & REDIS_NODE_NOADDR))
|
|
|
|
{
|
|
|
|
replicationSetMaster(server.cluster->myself->slaveof->ip,
|
|
|
|
server.cluster->myself->slaveof->port);
|
|
|
|
}
|
2013-03-07 15:40:53 +01:00
|
|
|
|
2013-09-26 16:54:43 +02:00
|
|
|
clusterHandleSlaveFailover();
|
|
|
|
if (update_state) clusterUpdateState();
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This function is called before the event handler returns to sleep for
|
|
|
|
* events. It is useful to perform operations that must be done ASAP in
|
|
|
|
* reaction to events fired but that are not safe to perform inside event
|
2013-10-03 09:55:20 +02:00
|
|
|
* handlers, or to perform potentially expansive tasks that we need to do
|
|
|
|
* a single time before replying to clients. */
|
2013-09-26 16:54:43 +02:00
|
|
|
void clusterBeforeSleep(void) {
|
2013-10-03 09:55:20 +02:00
|
|
|
/* Handle failover, this is needed when it is likely that there is already
|
|
|
|
* the quorum from masters in order to react fast. */
|
|
|
|
if (server.cluster->todo_before_sleep & CLUSTER_TODO_HANDLE_FAILOVER)
|
2013-03-13 12:44:02 +01:00
|
|
|
clusterHandleSlaveFailover();
|
2013-10-03 09:55:20 +02:00
|
|
|
|
|
|
|
/* Update the cluster state. */
|
|
|
|
if (server.cluster->todo_before_sleep & CLUSTER_TODO_UPDATE_STATE)
|
|
|
|
clusterUpdateState();
|
|
|
|
|
|
|
|
/* Save the config, possibly using fsync. */
|
|
|
|
if (server.cluster->todo_before_sleep & CLUSTER_TODO_SAVE_CONFIG) {
|
|
|
|
int fsync = server.cluster->todo_before_sleep & CLUSTER_TODO_FSYNC_CONFIG;
|
|
|
|
clusterSaveConfigOrDie(fsync);
|
2013-03-13 12:44:02 +01:00
|
|
|
}
|
2013-10-03 09:55:20 +02:00
|
|
|
|
|
|
|
/* Reset our flags. */
|
|
|
|
server.cluster->todo_before_sleep = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void clusterDoBeforeSleep(int flags) {
|
|
|
|
server.cluster->todo_before_sleep |= flags;
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* Slots management
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
2013-10-08 12:45:35 +02:00
|
|
|
/* Return 1 if the bit at position 'pos' of the generic bitmap 'bitmap' is
 * set, otherwise return 0. Bit 0 is the least significant bit of the
 * first byte. */
int bitmapTestBit(unsigned char *bitmap, int pos) {
    off_t index = pos / 8;
    int mask = 1 << (pos & 7);

    return (bitmap[index] & mask) ? 1 : 0;
}
|
|
|
|
|
|
|
|
/* Turn on the bit at position 'pos' of the bitmap. All the other bits are
 * left untouched. */
void bitmapSetBit(unsigned char *bitmap, int pos) {
    off_t index = pos / 8;
    int mask = 1 << (pos & 7);

    bitmap[index] = bitmap[index] | mask;
}
|
|
|
|
|
|
|
|
/* Turn off the bit at position 'pos' of the bitmap. All the other bits
 * are left untouched. */
void bitmapClearBit(unsigned char *bitmap, int pos) {
    off_t index = pos / 8;
    int mask = 1 << (pos & 7);

    bitmap[index] = bitmap[index] & ~mask;
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* Set the slot bit and return the old value. */
|
|
|
|
int clusterNodeSetSlotBit(clusterNode *n, int slot) {
|
2013-02-28 15:23:09 +01:00
|
|
|
int old = bitmapTestBit(n->slots,slot);
|
|
|
|
bitmapSetBit(n->slots,slot);
|
2013-02-28 15:11:05 +01:00
|
|
|
if (!old) n->numslots++;
|
2011-03-29 17:51:15 +02:00
|
|
|
return old;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Clear the slot bit and return the old value. */
|
|
|
|
int clusterNodeClearSlotBit(clusterNode *n, int slot) {
|
2013-02-28 15:23:09 +01:00
|
|
|
int old = bitmapTestBit(n->slots,slot);
|
|
|
|
bitmapClearBit(n->slots,slot);
|
2013-02-28 15:11:05 +01:00
|
|
|
if (old) n->numslots--;
|
2011-03-29 17:51:15 +02:00
|
|
|
return old;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return the slot bit from the cluster node structure. */
|
|
|
|
int clusterNodeGetSlotBit(clusterNode *n, int slot) {
|
2013-02-28 15:23:09 +01:00
|
|
|
return bitmapTestBit(n->slots,slot);
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Add the specified slot to the list of slots that node 'n' will
|
|
|
|
* serve. Return REDIS_OK if the operation ended with success.
|
|
|
|
* If the slot is already assigned to another instance this is considered
|
|
|
|
* an error and REDIS_ERR is returned. */
|
|
|
|
int clusterAddSlot(clusterNode *n, int slot) {
|
2013-02-21 11:51:17 +01:00
|
|
|
if (server.cluster->slots[slot]) return REDIS_ERR;
|
|
|
|
clusterNodeSetSlotBit(n,slot);
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->slots[slot] = n;
|
2011-03-29 17:51:15 +02:00
|
|
|
return REDIS_OK;
|
|
|
|
}
|
|
|
|
|
2011-05-06 13:38:27 +02:00
|
|
|
/* Delete the specified slot marking it as unassigned.
|
|
|
|
* Returns REDIS_OK if the slot was assigned, otherwise if the slot was
|
|
|
|
* already unassigned REDIS_ERR is returned. */
|
|
|
|
int clusterDelSlot(int slot) {
|
2013-02-14 13:20:56 +01:00
|
|
|
clusterNode *n = server.cluster->slots[slot];
|
2011-05-06 13:38:27 +02:00
|
|
|
|
|
|
|
if (!n) return REDIS_ERR;
|
|
|
|
redisAssert(clusterNodeClearSlotBit(n,slot) == 1);
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->slots[slot] = NULL;
|
2011-05-06 13:38:27 +02:00
|
|
|
return REDIS_OK;
|
|
|
|
}
|
|
|
|
|
2013-03-15 16:35:16 +01:00
|
|
|
/* Delete all the slots associated with the specified node.
|
|
|
|
* The number of deleted slots is returned. */
|
|
|
|
int clusterDelNodeSlots(clusterNode *node) {
|
|
|
|
int deleted = 0, j;
|
|
|
|
|
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
|
|
|
if (clusterNodeGetSlotBit(node,j)) clusterDelSlot(j);
|
|
|
|
deleted++;
|
|
|
|
}
|
|
|
|
return deleted;
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* Cluster state evaluation function
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
void clusterUpdateState(void) {
|
2013-03-07 15:22:32 +01:00
|
|
|
int j, initial_state = server.cluster->state;
|
2013-03-07 15:36:59 +01:00
|
|
|
int unreachable_masters = 0;
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2013-03-06 18:24:43 +01:00
|
|
|
/* Start assuming the state is OK. We'll turn it into FAIL if there
|
|
|
|
* are the right conditions. */
|
|
|
|
server.cluster->state = REDIS_CLUSTER_OK;
|
|
|
|
|
2013-02-22 17:43:35 +01:00
|
|
|
/* Check if all the slots are covered. */
|
2011-03-29 17:51:15 +02:00
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
2013-02-14 13:20:56 +01:00
|
|
|
if (server.cluster->slots[j] == NULL ||
|
|
|
|
server.cluster->slots[j]->flags & (REDIS_NODE_FAIL))
|
2011-03-29 17:51:15 +02:00
|
|
|
{
|
2013-03-06 18:24:43 +01:00
|
|
|
server.cluster->state = REDIS_CLUSTER_FAIL;
|
2011-03-29 17:51:15 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2013-02-22 17:43:35 +01:00
|
|
|
|
2013-02-22 19:18:30 +01:00
|
|
|
/* Compute the cluster size, that is the number of master nodes
|
2013-03-07 15:36:59 +01:00
|
|
|
* serving at least a single slot.
|
|
|
|
*
|
|
|
|
* At the same time count the number of unreachable masters with
|
|
|
|
* at least one node. */
|
2013-02-22 19:18:30 +01:00
|
|
|
{
|
|
|
|
dictIterator *di;
|
|
|
|
dictEntry *de;
|
|
|
|
|
|
|
|
server.cluster->size = 0;
|
2013-09-04 10:07:50 +02:00
|
|
|
di = dictGetSafeIterator(server.cluster->nodes);
|
2013-02-22 19:18:30 +01:00
|
|
|
while((de = dictNext(di)) != NULL) {
|
|
|
|
clusterNode *node = dictGetVal(de);
|
|
|
|
|
2013-03-07 15:36:59 +01:00
|
|
|
if (node->flags & REDIS_NODE_MASTER && node->numslots) {
|
2013-02-22 19:18:30 +01:00
|
|
|
server.cluster->size++;
|
2013-03-07 15:36:59 +01:00
|
|
|
if (node->flags & (REDIS_NODE_FAIL|REDIS_NODE_PFAIL))
|
|
|
|
unreachable_masters++;
|
|
|
|
}
|
2013-02-22 19:18:30 +01:00
|
|
|
}
|
|
|
|
dictReleaseIterator(di);
|
|
|
|
}
|
2013-03-07 15:22:32 +01:00
|
|
|
|
2013-03-07 15:36:59 +01:00
|
|
|
/* If we can't reach at least half the masters, change the cluster state
|
2013-09-27 09:55:41 +02:00
|
|
|
* to FAIL, as we are not even able to mark nodes as FAIL in this side
|
|
|
|
* of the netsplit because of lack of majority.
|
|
|
|
*
|
|
|
|
* TODO: when this condition is entered, we should not undo it for some
|
|
|
|
* (small) time after the majority is reachable again, to make sure that
|
|
|
|
* other nodes have enough time to inform this node of a configuration change.
|
|
|
|
* Otherwise a client with an old routing table may write to this node
|
|
|
|
* and later it may turn into a slave losing the write. */
|
2013-03-07 15:36:59 +01:00
|
|
|
{
|
|
|
|
int needed_quorum = (server.cluster->size / 2) + 1;
|
|
|
|
|
|
|
|
if (unreachable_masters >= needed_quorum)
|
|
|
|
server.cluster->state = REDIS_CLUSTER_FAIL;
|
|
|
|
}
|
|
|
|
|
2013-03-07 15:22:32 +01:00
|
|
|
/* Log a state change */
|
|
|
|
if (initial_state != server.cluster->state)
|
|
|
|
redisLog(REDIS_WARNING,"Cluster state changed: %s",
|
|
|
|
server.cluster->state == REDIS_CLUSTER_OK ? "ok" : "fail");
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
2013-02-25 11:20:17 +01:00
|
|
|
/* This function is called after the node startup in order to verify that data
|
|
|
|
* loaded from disk is in agreement with the cluster configuration:
|
|
|
|
*
|
|
|
|
* 1) If we find keys about hash slots we have no responsibility for, the
|
|
|
|
* following happens:
|
|
|
|
* A) If no other node is in charge according to the current cluster
|
|
|
|
* configuration, we add these slots to our node.
|
|
|
|
* B) If according to our config other nodes are already in charge for
|
|
|
|
* this lots, we set the slots as IMPORTING from our point of view
|
|
|
|
* in order to justify we have those slots, and in order to make
|
|
|
|
* redis-trib aware of the issue, so that it can try to fix it.
|
|
|
|
* 2) If we find data in a DB different than DB0 we return REDIS_ERR to
|
|
|
|
* signal the caller it should quit the server with an error message
|
|
|
|
* or take other actions.
|
|
|
|
*
|
|
|
|
* The function always returns REDIS_OK even if it will try to correct
|
|
|
|
* the error described in "1". However if data is found in DB different
|
|
|
|
* from DB0, REDIS_ERR is returned.
|
|
|
|
*
|
|
|
|
* The function also uses the logging facility in order to warn the user
|
|
|
|
* about desynchronizations between the data we have in memory and the
|
|
|
|
* cluster configuration. */
|
|
|
|
int verifyClusterConfigWithData(void) {
|
2013-02-25 11:43:49 +01:00
|
|
|
int j;
|
|
|
|
int update_config = 0;
|
|
|
|
|
2013-03-04 19:47:00 +01:00
|
|
|
/* If this node is a slave, don't perform the check at all as we
|
|
|
|
* completely depend on the replication stream. */
|
|
|
|
if (server.cluster->myself->flags & REDIS_NODE_SLAVE) return REDIS_OK;
|
|
|
|
|
2013-02-25 11:43:49 +01:00
|
|
|
/* Make sure we only have keys in DB0. */
|
|
|
|
for (j = 1; j < server.dbnum; j++) {
|
|
|
|
if (dictSize(server.db[j].dict)) return REDIS_ERR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Check that all the slots we see populated memory have a corresponding
|
|
|
|
* entry in the cluster table. Otherwise fix the table. */
|
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
|
|
|
if (!countKeysInSlot(j)) continue; /* No keys in this slot. */
|
|
|
|
/* Check if we are assigned to this slot or if we are importing it.
|
|
|
|
* In both cases check the next slot as the configuration makes
|
|
|
|
* sense. */
|
|
|
|
if (server.cluster->slots[j] == server.cluster->myself ||
|
|
|
|
server.cluster->importing_slots_from[j] != NULL) continue;
|
|
|
|
|
|
|
|
/* If we are here data and cluster config don't agree, and we have
|
|
|
|
* slot 'j' populated even if we are not importing it, nor we are
|
|
|
|
* assigned to this slot. Fix this condition. */
|
|
|
|
|
|
|
|
update_config++;
|
|
|
|
/* Case A: slot is unassigned. Take responsability for it. */
|
|
|
|
if (server.cluster->slots[j] == NULL) {
|
|
|
|
redisLog(REDIS_WARNING, "I've keys about slot %d that is "
|
|
|
|
"unassigned. Taking responsability "
|
|
|
|
"for it.",j);
|
|
|
|
clusterAddSlot(server.cluster->myself,j);
|
|
|
|
} else {
|
|
|
|
redisLog(REDIS_WARNING, "I've keys about slot %d that is "
|
|
|
|
"already assigned to a different node. "
|
|
|
|
"Setting it in importing state.",j);
|
|
|
|
server.cluster->importing_slots_from[j] = server.cluster->slots[j];
|
|
|
|
}
|
|
|
|
}
|
2013-10-03 09:55:20 +02:00
|
|
|
if (update_config) clusterSaveConfigOrDie(1);
|
2013-02-25 11:20:17 +01:00
|
|
|
return REDIS_OK;
|
|
|
|
}
|
|
|
|
|
2013-03-04 13:15:09 +01:00
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* SLAVE nodes handling
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
2013-03-20 11:32:35 +01:00
|
|
|
/* Set the specified node 'n' as master. Setup the node as a slave if
|
|
|
|
* needed. */
|
2013-03-04 13:15:09 +01:00
|
|
|
void clusterSetMaster(clusterNode *n) {
|
|
|
|
clusterNode *myself = server.cluster->myself;
|
|
|
|
|
|
|
|
redisAssert(n != myself);
|
2013-03-20 11:32:35 +01:00
|
|
|
redisAssert(myself->numslots == 0);
|
2013-03-04 13:15:09 +01:00
|
|
|
|
|
|
|
if (myself->flags & REDIS_NODE_MASTER) {
|
|
|
|
myself->flags &= ~REDIS_NODE_MASTER;
|
|
|
|
myself->flags |= REDIS_NODE_SLAVE;
|
|
|
|
}
|
|
|
|
myself->slaveof = n;
|
2013-03-04 15:27:58 +01:00
|
|
|
replicationSetMaster(n->ip, n->port);
|
2013-03-04 13:15:09 +01:00
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* CLUSTER command
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
2013-09-04 10:25:26 +02:00
|
|
|
/* Generate a csv-alike representation of the nodes we are aware of,
|
|
|
|
* including the "myself" node, and return an SDS string containing the
|
|
|
|
* representation (it is up to the caller to free it).
|
|
|
|
*
|
|
|
|
* All the nodes matching at least one of the node flags specified in
|
|
|
|
* "filter" are excluded from the output, so using zero as a filter will
|
|
|
|
* include all the known nodes in the representation, including nodes in
|
|
|
|
* the HANDSHAKE state.
|
|
|
|
*
|
|
|
|
* The representation obtained using this function is used for the output
|
|
|
|
* of the CLUSTER NODES function, and as format for the cluster
|
|
|
|
* configuration file (nodes.conf) for a given node. */
|
|
|
|
sds clusterGenNodesDescription(int filter) {
|
2011-03-30 14:58:19 +02:00
|
|
|
sds ci = sdsempty();
|
|
|
|
dictIterator *di;
|
|
|
|
dictEntry *de;
|
2011-03-30 17:41:13 +02:00
|
|
|
int j, start;
|
2011-03-30 14:58:19 +02:00
|
|
|
|
2013-09-04 10:07:50 +02:00
|
|
|
di = dictGetSafeIterator(server.cluster->nodes);
|
2011-03-30 14:58:19 +02:00
|
|
|
while((de = dictNext(di)) != NULL) {
|
2011-11-08 17:07:55 +01:00
|
|
|
clusterNode *node = dictGetVal(de);
|
2011-03-30 14:58:19 +02:00
|
|
|
|
2013-09-04 10:25:26 +02:00
|
|
|
if (node->flags & filter) continue;
|
|
|
|
|
2011-03-30 14:58:19 +02:00
|
|
|
/* Node coordinates */
|
|
|
|
ci = sdscatprintf(ci,"%.40s %s:%d ",
|
|
|
|
node->name,
|
|
|
|
node->ip,
|
|
|
|
node->port);
|
|
|
|
|
|
|
|
/* Flags */
|
|
|
|
if (node->flags == 0) ci = sdscat(ci,"noflags,");
|
|
|
|
if (node->flags & REDIS_NODE_MYSELF) ci = sdscat(ci,"myself,");
|
|
|
|
if (node->flags & REDIS_NODE_MASTER) ci = sdscat(ci,"master,");
|
|
|
|
if (node->flags & REDIS_NODE_SLAVE) ci = sdscat(ci,"slave,");
|
|
|
|
if (node->flags & REDIS_NODE_PFAIL) ci = sdscat(ci,"fail?,");
|
|
|
|
if (node->flags & REDIS_NODE_FAIL) ci = sdscat(ci,"fail,");
|
|
|
|
if (node->flags & REDIS_NODE_HANDSHAKE) ci =sdscat(ci,"handshake,");
|
|
|
|
if (node->flags & REDIS_NODE_NOADDR) ci = sdscat(ci,"noaddr,");
|
|
|
|
if (ci[sdslen(ci)-1] == ',') ci[sdslen(ci)-1] = ' ';
|
|
|
|
|
|
|
|
/* Slave of... or just "-" */
|
|
|
|
if (node->slaveof)
|
|
|
|
ci = sdscatprintf(ci,"%.40s ",node->slaveof->name);
|
|
|
|
else
|
|
|
|
ci = sdscatprintf(ci,"- ");
|
|
|
|
|
|
|
|
/* Latency from the POV of this node, link status */
|
2013-12-17 09:38:11 +01:00
|
|
|
ci = sdscatprintf(ci,"%lld %lld %llu %s",
|
|
|
|
(long long) node->ping_sent,
|
|
|
|
(long long) node->pong_received,
|
2013-09-25 11:47:13 +02:00
|
|
|
(unsigned long long) node->configEpoch,
|
2011-09-28 18:28:36 +02:00
|
|
|
(node->link || node->flags & REDIS_NODE_MYSELF) ?
|
|
|
|
"connected" : "disconnected");
|
2011-03-30 17:41:13 +02:00
|
|
|
|
|
|
|
/* Slots served by this instance */
|
|
|
|
start = -1;
|
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
|
|
|
int bit;
|
|
|
|
|
|
|
|
if ((bit = clusterNodeGetSlotBit(node,j)) != 0) {
|
|
|
|
if (start == -1) start = j;
|
|
|
|
}
|
|
|
|
if (start != -1 && (!bit || j == REDIS_CLUSTER_SLOTS-1)) {
|
|
|
|
if (j == REDIS_CLUSTER_SLOTS-1) j++;
|
|
|
|
|
|
|
|
if (start == j-1) {
|
|
|
|
ci = sdscatprintf(ci," %d",start);
|
|
|
|
} else {
|
|
|
|
ci = sdscatprintf(ci," %d-%d",start,j-1);
|
|
|
|
}
|
|
|
|
start = -1;
|
|
|
|
}
|
|
|
|
}
|
2011-05-02 19:04:33 +02:00
|
|
|
|
|
|
|
/* Just for MYSELF node we also dump info about slots that
|
|
|
|
* we are migrating to other instances or importing from other
|
|
|
|
* instances. */
|
|
|
|
if (node->flags & REDIS_NODE_MYSELF) {
|
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
2013-02-14 13:20:56 +01:00
|
|
|
if (server.cluster->migrating_slots_to[j]) {
|
2011-05-04 09:31:37 +02:00
|
|
|
ci = sdscatprintf(ci," [%d->-%.40s]",j,
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->migrating_slots_to[j]->name);
|
|
|
|
} else if (server.cluster->importing_slots_from[j]) {
|
2011-05-04 09:31:37 +02:00
|
|
|
ci = sdscatprintf(ci," [%d-<-%.40s]",j,
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->importing_slots_from[j]->name);
|
2011-05-02 19:04:33 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2011-04-07 19:04:16 +02:00
|
|
|
ci = sdscatlen(ci,"\n",1);
|
2011-03-30 14:58:19 +02:00
|
|
|
}
|
|
|
|
dictReleaseIterator(di);
|
|
|
|
return ci;
|
|
|
|
}
|
|
|
|
|
2011-05-06 15:44:09 +02:00
|
|
|
/* Parse the object 'o' as a hash slot number.
 *
 * On success the slot, in the range 0 .. REDIS_CLUSTER_SLOTS-1, is
 * returned. If the object is not a valid integer or the value is outside
 * that range, an error is replied to the client 'c' and -1 is returned. */
int getSlotOrReply(redisClient *c, robj *o) {
    long long slot;

    /* REDIS_CLUSTER_SLOTS itself is rejected: callers use the returned
     * value as an index into per-slot tables, where it would be one past
     * the last valid entry. */
    if (getLongLongFromObject(o,&slot) != REDIS_OK ||
        slot < 0 || slot >= REDIS_CLUSTER_SLOTS)
    {
        addReplyError(c,"Invalid or out of range slot");
        return -1;
    }
    return (int) slot;
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
void clusterCommand(redisClient *c) {
|
|
|
|
if (server.cluster_enabled == 0) {
|
|
|
|
addReplyError(c,"This instance has cluster support disabled");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!strcasecmp(c->argv[1]->ptr,"meet") && c->argc == 4) {
|
2013-02-27 16:12:59 +01:00
|
|
|
/* CLUSTER MEET <ip> <port> */
|
2011-03-29 17:51:15 +02:00
|
|
|
clusterNode *n;
|
2011-06-18 19:21:43 +01:00
|
|
|
struct sockaddr_storage sa;
|
2011-03-29 17:51:15 +02:00
|
|
|
long port;
|
|
|
|
|
|
|
|
/* Perform sanity checks on IP/port */
|
2013-08-22 11:54:25 +02:00
|
|
|
if (inet_pton(AF_INET,c->argv[2]->ptr,
|
|
|
|
&(((struct sockaddr_in *)&sa)->sin_addr)))
|
|
|
|
{
|
|
|
|
sa.ss_family = AF_INET;
|
|
|
|
} else if (inet_pton(AF_INET6,c->argv[2]->ptr,
|
|
|
|
&(((struct sockaddr_in6 *)&sa)->sin6_addr)))
|
|
|
|
{
|
|
|
|
sa.ss_family = AF_INET6;
|
|
|
|
} else {
|
2011-03-29 17:51:15 +02:00
|
|
|
addReplyError(c,"Invalid IP address in MEET");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (getLongFromObjectOrReply(c, c->argv[3], &port, NULL) != REDIS_OK ||
|
|
|
|
port < 0 || port > (65535-REDIS_CLUSTER_PORT_INCR))
|
|
|
|
{
|
|
|
|
addReplyError(c,"Invalid TCP port specified");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Finally add the node to the cluster with a random name, this
|
|
|
|
* will get fixed in the first handshake (ping/pong). */
|
|
|
|
n = createClusterNode(NULL,REDIS_NODE_HANDSHAKE|REDIS_NODE_MEET);
|
2013-08-22 11:54:25 +02:00
|
|
|
|
|
|
|
/* Set node->ip as the normalized string representation of the node
|
|
|
|
* IP address. */
|
|
|
|
if (sa.ss_family == AF_INET)
|
|
|
|
inet_ntop(AF_INET,
|
|
|
|
(void*)&(((struct sockaddr_in *)&sa)->sin_addr),
|
|
|
|
n->ip,REDIS_CLUSTER_IPLEN);
|
|
|
|
else
|
|
|
|
inet_ntop(AF_INET6,
|
|
|
|
(void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr),
|
|
|
|
n->ip,REDIS_CLUSTER_IPLEN);
|
2011-03-29 17:51:15 +02:00
|
|
|
n->port = port;
|
|
|
|
clusterAddNode(n);
|
|
|
|
addReply(c,shared.ok);
|
|
|
|
} else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) {
|
2013-02-27 16:12:59 +01:00
|
|
|
/* CLUSTER NODES */
|
2011-03-29 17:51:15 +02:00
|
|
|
robj *o;
|
2013-09-04 10:25:26 +02:00
|
|
|
sds ci = clusterGenNodesDescription(0);
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
o = createObject(REDIS_STRING,ci);
|
|
|
|
addReplyBulk(c,o);
|
|
|
|
decrRefCount(o);
|
2013-03-19 09:58:05 +01:00
|
|
|
} else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) {
|
|
|
|
/* CLUSTER FLUSHSLOTS */
|
|
|
|
if (dictSize(server.db[0].dict) != 0) {
|
|
|
|
addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS.");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
clusterDelNodeSlots(server.cluster->myself);
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
|
2013-03-19 09:58:05 +01:00
|
|
|
addReply(c,shared.ok);
|
2011-05-06 13:38:27 +02:00
|
|
|
} else if ((!strcasecmp(c->argv[1]->ptr,"addslots") ||
|
2011-10-13 14:51:29 +02:00
|
|
|
!strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3)
|
|
|
|
{
|
|
|
|
/* CLUSTER ADDSLOTS <slot> [slot] ... */
|
|
|
|
/* CLUSTER DELSLOTS <slot> [slot] ... */
|
2011-05-06 15:44:09 +02:00
|
|
|
int j, slot;
|
2011-03-29 17:51:15 +02:00
|
|
|
unsigned char *slots = zmalloc(REDIS_CLUSTER_SLOTS);
|
2011-05-06 13:38:27 +02:00
|
|
|
int del = !strcasecmp(c->argv[1]->ptr,"delslots");
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
memset(slots,0,REDIS_CLUSTER_SLOTS);
|
|
|
|
/* Check that all the arguments are parsable and that all the
|
|
|
|
* slots are not already busy. */
|
|
|
|
for (j = 2; j < c->argc; j++) {
|
2011-05-06 15:44:09 +02:00
|
|
|
if ((slot = getSlotOrReply(c,c->argv[j])) == -1) {
|
2011-03-29 17:51:15 +02:00
|
|
|
zfree(slots);
|
|
|
|
return;
|
|
|
|
}
|
2013-02-14 13:20:56 +01:00
|
|
|
if (del && server.cluster->slots[slot] == NULL) {
|
2011-05-06 15:44:09 +02:00
|
|
|
addReplyErrorFormat(c,"Slot %d is already unassigned", slot);
|
2011-05-06 13:38:27 +02:00
|
|
|
zfree(slots);
|
|
|
|
return;
|
2013-02-14 13:20:56 +01:00
|
|
|
} else if (!del && server.cluster->slots[slot]) {
|
2011-05-06 15:44:09 +02:00
|
|
|
addReplyErrorFormat(c,"Slot %d is already busy", slot);
|
2011-03-29 17:51:15 +02:00
|
|
|
zfree(slots);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (slots[slot]++ == 1) {
|
|
|
|
addReplyErrorFormat(c,"Slot %d specified multiple times",
|
|
|
|
(int)slot);
|
|
|
|
zfree(slots);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
|
|
|
if (slots[j]) {
|
2011-05-06 16:08:10 +02:00
|
|
|
int retval;
|
|
|
|
|
|
|
|
/* If this slot was set as importing we can clear this
|
|
|
|
* state as now we are the real owner of the slot. */
|
2013-02-14 13:20:56 +01:00
|
|
|
if (server.cluster->importing_slots_from[j])
|
|
|
|
server.cluster->importing_slots_from[j] = NULL;
|
2011-05-06 16:08:10 +02:00
|
|
|
|
|
|
|
retval = del ? clusterDelSlot(j) :
|
2013-02-14 13:20:56 +01:00
|
|
|
clusterAddSlot(server.cluster->myself,j);
|
2011-10-04 18:43:03 +02:00
|
|
|
redisAssertWithInfo(c,NULL,retval == REDIS_OK);
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
zfree(slots);
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
|
2011-03-29 17:51:15 +02:00
|
|
|
addReply(c,shared.ok);
|
2011-04-29 17:34:03 +02:00
|
|
|
} else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) {
|
2011-09-29 15:26:01 +02:00
|
|
|
/* SETSLOT 10 MIGRATING <node ID> */
|
|
|
|
/* SETSLOT 10 IMPORTING <node ID> */
|
2011-04-29 17:34:03 +02:00
|
|
|
/* SETSLOT 10 STABLE */
|
2011-09-29 15:26:01 +02:00
|
|
|
/* SETSLOT 10 NODE <node ID> */
|
2011-05-06 15:44:09 +02:00
|
|
|
int slot;
|
2011-04-29 17:34:03 +02:00
|
|
|
clusterNode *n;
|
|
|
|
|
2011-05-06 15:44:09 +02:00
|
|
|
if ((slot = getSlotOrReply(c,c->argv[2])) == -1) return;
|
|
|
|
|
2011-04-29 17:34:03 +02:00
|
|
|
if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) {
|
2013-02-14 13:20:56 +01:00
|
|
|
if (server.cluster->slots[slot] != server.cluster->myself) {
|
2011-05-05 18:10:02 +02:00
|
|
|
addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot);
|
|
|
|
return;
|
|
|
|
}
|
2011-04-29 17:34:03 +02:00
|
|
|
if ((n = clusterLookupNode(c->argv[4]->ptr)) == NULL) {
|
|
|
|
addReplyErrorFormat(c,"I don't know about node %s",
|
|
|
|
(char*)c->argv[4]->ptr);
|
|
|
|
return;
|
|
|
|
}
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->migrating_slots_to[slot] = n;
|
2011-04-29 17:34:03 +02:00
|
|
|
} else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) {
|
2013-02-14 13:20:56 +01:00
|
|
|
if (server.cluster->slots[slot] == server.cluster->myself) {
|
2011-05-05 18:10:02 +02:00
|
|
|
addReplyErrorFormat(c,
|
|
|
|
"I'm already the owner of hash slot %u",slot);
|
|
|
|
return;
|
|
|
|
}
|
2011-04-29 17:34:03 +02:00
|
|
|
if ((n = clusterLookupNode(c->argv[4]->ptr)) == NULL) {
|
|
|
|
addReplyErrorFormat(c,"I don't know about node %s",
|
|
|
|
(char*)c->argv[3]->ptr);
|
|
|
|
return;
|
|
|
|
}
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->importing_slots_from[slot] = n;
|
2011-04-29 17:34:03 +02:00
|
|
|
} else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) {
|
2011-05-06 15:44:09 +02:00
|
|
|
/* CLUSTER SETSLOT <SLOT> STABLE */
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->importing_slots_from[slot] = NULL;
|
|
|
|
server.cluster->migrating_slots_to[slot] = NULL;
|
2011-09-30 22:17:24 +02:00
|
|
|
} else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) {
|
2011-05-06 15:44:09 +02:00
|
|
|
/* CLUSTER SETSLOT <SLOT> NODE <NODE ID> */
|
|
|
|
clusterNode *n = clusterLookupNode(c->argv[4]->ptr);
|
|
|
|
|
2013-02-27 17:53:48 +01:00
|
|
|
if (!n) {
|
|
|
|
addReplyErrorFormat(c,"Unknown node %s",
|
|
|
|
(char*)c->argv[4]->ptr);
|
|
|
|
return;
|
|
|
|
}
|
2011-05-06 15:44:09 +02:00
|
|
|
/* If this hash slot was served by 'myself' before to switch
|
|
|
|
* make sure there are no longer local keys for this hash slot. */
|
2013-02-14 13:20:56 +01:00
|
|
|
if (server.cluster->slots[slot] == server.cluster->myself &&
|
|
|
|
n != server.cluster->myself)
|
2011-05-06 15:44:09 +02:00
|
|
|
{
|
2013-02-25 11:24:42 +01:00
|
|
|
if (countKeysInSlot(slot) != 0) {
|
2011-05-06 15:44:09 +02:00
|
|
|
addReplyErrorFormat(c, "Can't assign hashslot %d to a different node while I still hold keys for this hash slot.", slot);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2011-05-06 16:08:10 +02:00
|
|
|
/* If this node was the slot owner and the slot was marked as
|
|
|
|
* migrating, assigning the slot to another node will clear
|
|
|
|
* the migrating status. */
|
2013-02-14 13:20:56 +01:00
|
|
|
if (server.cluster->slots[slot] == server.cluster->myself &&
|
|
|
|
server.cluster->migrating_slots_to[slot])
|
|
|
|
server.cluster->migrating_slots_to[slot] = NULL;
|
2011-05-06 16:08:10 +02:00
|
|
|
|
2011-09-30 19:20:56 +02:00
|
|
|
/* If this node was importing this slot, assigning the slot to
|
|
|
|
* itself also clears the importing status. */
|
2013-02-21 16:24:48 +01:00
|
|
|
if (n == server.cluster->myself &&
|
|
|
|
server.cluster->importing_slots_from[slot])
|
2013-02-14 13:20:56 +01:00
|
|
|
server.cluster->importing_slots_from[slot] = NULL;
|
2011-05-06 15:44:09 +02:00
|
|
|
clusterDelSlot(slot);
|
|
|
|
clusterAddSlot(n,slot);
|
2011-04-29 17:34:03 +02:00
|
|
|
} else {
|
|
|
|
addReplyError(c,"Invalid CLUSTER SETSLOT action or number of arguments");
|
2011-05-05 17:52:19 +02:00
|
|
|
return;
|
2011-04-29 17:34:03 +02:00
|
|
|
}
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
|
2011-05-02 19:04:33 +02:00
|
|
|
addReply(c,shared.ok);
|
2011-03-29 17:51:15 +02:00
|
|
|
} else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) {
|
2013-02-27 16:12:59 +01:00
|
|
|
/* CLUSTER INFO */
|
2011-03-29 17:51:15 +02:00
|
|
|
char *statestr[] = {"ok","fail","needhelp"};
|
|
|
|
int slots_assigned = 0, slots_ok = 0, slots_pfail = 0, slots_fail = 0;
|
|
|
|
int j;
|
|
|
|
|
|
|
|
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
|
2013-02-14 13:20:56 +01:00
|
|
|
clusterNode *n = server.cluster->slots[j];
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
if (n == NULL) continue;
|
|
|
|
slots_assigned++;
|
|
|
|
if (n->flags & REDIS_NODE_FAIL) {
|
|
|
|
slots_fail++;
|
|
|
|
} else if (n->flags & REDIS_NODE_PFAIL) {
|
|
|
|
slots_pfail++;
|
|
|
|
} else {
|
|
|
|
slots_ok++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
sds info = sdscatprintf(sdsempty(),
|
|
|
|
"cluster_state:%s\r\n"
|
|
|
|
"cluster_slots_assigned:%d\r\n"
|
|
|
|
"cluster_slots_ok:%d\r\n"
|
|
|
|
"cluster_slots_pfail:%d\r\n"
|
|
|
|
"cluster_slots_fail:%d\r\n"
|
2011-04-11 17:40:35 +02:00
|
|
|
"cluster_known_nodes:%lu\r\n"
|
2013-02-22 19:20:38 +01:00
|
|
|
"cluster_size:%d\r\n"
|
2013-09-25 12:38:36 +02:00
|
|
|
"cluster_current_epoch:%llu\r\n"
|
2013-10-02 10:10:08 +02:00
|
|
|
"cluster_stats_messages_sent:%lld\r\n"
|
|
|
|
"cluster_stats_messages_received:%lld\r\n"
|
2013-02-14 13:20:56 +01:00
|
|
|
, statestr[server.cluster->state],
|
2011-03-29 17:51:15 +02:00
|
|
|
slots_assigned,
|
|
|
|
slots_ok,
|
|
|
|
slots_pfail,
|
2011-04-11 17:40:35 +02:00
|
|
|
slots_fail,
|
2013-02-22 19:20:38 +01:00
|
|
|
dictSize(server.cluster->nodes),
|
2013-09-25 12:38:36 +02:00
|
|
|
server.cluster->size,
|
2013-10-02 10:10:08 +02:00
|
|
|
(unsigned long long) server.cluster->currentEpoch,
|
|
|
|
server.cluster->stats_bus_messages_sent,
|
|
|
|
server.cluster->stats_bus_messages_received
|
2011-03-29 17:51:15 +02:00
|
|
|
);
|
|
|
|
addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
|
|
|
|
(unsigned long)sdslen(info)));
|
|
|
|
addReplySds(c,info);
|
|
|
|
addReply(c,shared.crlf);
|
2013-09-04 10:32:09 +02:00
|
|
|
} else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) {
|
2013-10-03 09:55:20 +02:00
|
|
|
int retval = clusterSaveConfig(1);
|
2013-09-04 10:32:09 +02:00
|
|
|
|
|
|
|
if (retval == 0)
|
|
|
|
addReply(c,shared.ok);
|
|
|
|
else
|
|
|
|
addReplyErrorFormat(c,"error saving the cluster node config: %s",
|
|
|
|
strerror(errno));
|
2011-04-29 14:31:18 +02:00
|
|
|
} else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) {
|
2013-02-27 16:12:59 +01:00
|
|
|
/* CLUSTER KEYSLOT <key> */
|
2011-04-29 14:31:18 +02:00
|
|
|
sds key = c->argv[2]->ptr;
|
|
|
|
|
|
|
|
addReplyLongLong(c,keyHashSlot(key,sdslen(key)));
|
2013-02-25 12:04:31 +01:00
|
|
|
} else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) {
|
2013-02-27 16:12:59 +01:00
|
|
|
/* CLUSTER COUNTKEYSINSLOT <slot> */
|
2013-02-25 12:04:31 +01:00
|
|
|
long long slot;
|
|
|
|
|
|
|
|
if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != REDIS_OK)
|
|
|
|
return;
|
2013-02-25 12:40:32 +01:00
|
|
|
if (slot < 0 || slot >= REDIS_CLUSTER_SLOTS) {
|
|
|
|
addReplyError(c,"Invalid slot");
|
|
|
|
return;
|
|
|
|
}
|
2013-02-25 12:04:31 +01:00
|
|
|
addReplyLongLong(c,countKeysInSlot(slot));
|
2011-04-29 16:17:58 +02:00
|
|
|
} else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) {
|
2013-02-27 16:12:59 +01:00
|
|
|
/* CLUSTER GETKEYSINSLOT <slot> <count> */
|
2011-04-29 16:17:58 +02:00
|
|
|
long long maxkeys, slot;
|
2011-04-29 17:34:03 +02:00
|
|
|
unsigned int numkeys, j;
|
2011-04-29 16:17:58 +02:00
|
|
|
robj **keys;
|
|
|
|
|
|
|
|
if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != REDIS_OK)
|
|
|
|
return;
|
|
|
|
if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) != REDIS_OK)
|
|
|
|
return;
|
2013-02-25 12:41:13 +01:00
|
|
|
if (slot < 0 || slot >= REDIS_CLUSTER_SLOTS || maxkeys < 0) {
|
2011-04-29 16:17:58 +02:00
|
|
|
addReplyError(c,"Invalid slot or number of keys");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
keys = zmalloc(sizeof(robj*)*maxkeys);
|
2013-02-25 11:24:42 +01:00
|
|
|
numkeys = getKeysInSlot(slot, keys, maxkeys);
|
2011-04-29 16:17:58 +02:00
|
|
|
addReplyMultiBulkLen(c,numkeys);
|
|
|
|
for (j = 0; j < numkeys; j++) addReplyBulk(c,keys[j]);
|
|
|
|
zfree(keys);
|
2013-02-27 17:55:59 +01:00
|
|
|
} else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) {
|
|
|
|
/* CLUSTER FORGET <NODE ID> */
|
|
|
|
clusterNode *n = clusterLookupNode(c->argv[2]->ptr);
|
|
|
|
|
|
|
|
if (!n) {
|
|
|
|
addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
clusterDelNode(n);
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
|
2013-02-27 17:55:59 +01:00
|
|
|
addReply(c,shared.ok);
|
2013-03-04 13:15:09 +01:00
|
|
|
} else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) {
|
|
|
|
/* CLUSTER REPLICATE <NODE ID> */
|
|
|
|
clusterNode *n = clusterLookupNode(c->argv[2]->ptr);
|
|
|
|
|
|
|
|
/* Lookup the specified node in our table. */
|
|
|
|
if (!n) {
|
|
|
|
addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* I can't replicate myself. */
|
|
|
|
if (n == server.cluster->myself) {
|
|
|
|
addReplyError(c,"Can't replicate myself");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Can't replicate a slave. */
|
|
|
|
if (n->slaveof != NULL) {
|
|
|
|
addReplyError(c,"I can only replicate a master, not a slave.");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We should have no assigned slots to accept to replicate some
|
|
|
|
* other node. */
|
|
|
|
if (server.cluster->myself->numslots != 0 ||
|
|
|
|
dictSize(server.db[0].dict) != 0)
|
|
|
|
{
|
|
|
|
addReplyError(c,"To set a master the node must be empty and without assigned slots.");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Set the master. */
|
|
|
|
clusterSetMaster(n);
|
2013-10-03 09:55:20 +02:00
|
|
|
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
|
2013-03-04 13:15:09 +01:00
|
|
|
addReply(c,shared.ok);
|
2011-03-29 17:51:15 +02:00
|
|
|
} else {
|
|
|
|
addReplyError(c,"Wrong CLUSTER subcommand or number of arguments");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* -----------------------------------------------------------------------------
|
2012-04-01 12:51:40 +02:00
|
|
|
* DUMP, RESTORE and MIGRATE commands
|
2011-03-29 17:51:15 +02:00
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
2012-04-01 12:51:40 +02:00
|
|
|
/* Generates a DUMP-format representation of the object 'o', adding it to the
|
|
|
|
* io stream pointed by 'rio'. This function can't fail. */
|
|
|
|
void createDumpPayload(rio *payload, robj *o) {
|
2012-04-02 13:10:39 +02:00
|
|
|
unsigned char buf[2];
|
|
|
|
uint64_t crc;
|
2012-04-01 12:51:40 +02:00
|
|
|
|
|
|
|
/* Serialize the object in a RDB-like format. It consist of an object type
|
|
|
|
* byte followed by the serialized object. This is understood by RESTORE. */
|
|
|
|
rioInitWithBuffer(payload,sdsempty());
|
|
|
|
redisAssert(rdbSaveObjectType(payload,o));
|
|
|
|
redisAssert(rdbSaveObject(payload,o));
|
|
|
|
|
|
|
|
/* Write the footer, this is how it looks like:
|
2012-04-02 13:10:39 +02:00
|
|
|
* ----------------+---------------------+---------------+
|
|
|
|
* ... RDB payload | 2 bytes RDB version | 8 bytes CRC64 |
|
|
|
|
* ----------------+---------------------+---------------+
|
|
|
|
* RDB version and CRC are both in little endian.
|
|
|
|
*/
|
2012-04-02 10:52:39 +02:00
|
|
|
|
|
|
|
/* RDB version */
|
2012-04-02 10:46:24 +02:00
|
|
|
buf[0] = REDIS_RDB_VERSION & 0xff;
|
|
|
|
buf[1] = (REDIS_RDB_VERSION >> 8) & 0xff;
|
2012-04-01 12:51:40 +02:00
|
|
|
payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,buf,2);
|
|
|
|
|
2012-04-02 13:10:39 +02:00
|
|
|
/* CRC64 */
|
2012-04-09 12:33:57 +02:00
|
|
|
crc = crc64(0,(unsigned char*)payload->io.buffer.ptr,
|
2012-04-02 13:10:39 +02:00
|
|
|
sdslen(payload->io.buffer.ptr));
|
|
|
|
memrev64ifbe(&crc);
|
|
|
|
payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,&crc,8);
|
2012-04-01 12:51:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Verify that the RDB version of the dump payload matches the one of this Redis
|
2012-04-02 13:10:39 +02:00
|
|
|
* instance and that the checksum is ok.
|
2012-04-01 12:51:40 +02:00
|
|
|
* If the DUMP payload looks valid REDIS_OK is returned, otherwise REDIS_ERR
|
|
|
|
* is returned. */
|
|
|
|
int verifyDumpPayload(unsigned char *p, size_t len) {
|
2012-04-02 13:10:39 +02:00
|
|
|
unsigned char *footer;
|
2012-04-01 12:51:40 +02:00
|
|
|
uint16_t rdbver;
|
2012-04-02 13:10:39 +02:00
|
|
|
uint64_t crc;
|
2012-04-01 12:51:40 +02:00
|
|
|
|
2012-04-02 13:10:39 +02:00
|
|
|
/* At least 2 bytes of RDB version and 8 of CRC64 should be present. */
|
2012-04-01 12:51:40 +02:00
|
|
|
if (len < 10) return REDIS_ERR;
|
|
|
|
footer = p+(len-10);
|
2012-04-02 10:52:39 +02:00
|
|
|
|
|
|
|
/* Verify RDB version */
|
2012-04-02 10:46:24 +02:00
|
|
|
rdbver = (footer[1] << 8) | footer[0];
|
2012-04-01 12:51:40 +02:00
|
|
|
if (rdbver != REDIS_RDB_VERSION) return REDIS_ERR;
|
2012-04-02 10:52:39 +02:00
|
|
|
|
2012-04-02 13:10:39 +02:00
|
|
|
/* Verify CRC64 */
|
2012-04-09 12:33:57 +02:00
|
|
|
crc = crc64(0,p,len-8);
|
2012-04-02 13:10:39 +02:00
|
|
|
memrev64ifbe(&crc);
|
|
|
|
return (memcmp(&crc,footer+2,8) == 0) ? REDIS_OK : REDIS_ERR;
|
2012-04-01 12:51:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* DUMP keyname
|
|
|
|
* DUMP is actually not used by Redis Cluster but it is the obvious
|
|
|
|
* complement of RESTORE and can be useful for different applications. */
|
|
|
|
void dumpCommand(redisClient *c) {
|
|
|
|
robj *o, *dumpobj;
|
|
|
|
rio payload;
|
|
|
|
|
|
|
|
/* Check if the key is here. */
|
|
|
|
if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) {
|
|
|
|
addReply(c,shared.nullbulk);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Create the DUMP encoded representation. */
|
|
|
|
createDumpPayload(&payload,o);
|
|
|
|
|
|
|
|
/* Transfer to the client */
|
|
|
|
dumpobj = createObject(REDIS_STRING,payload.io.buffer.ptr);
|
|
|
|
addReplyBulk(c,dumpobj);
|
|
|
|
decrRefCount(dumpobj);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2012-11-07 10:57:23 +01:00
|
|
|
/* RESTORE key ttl serialized-value [REPLACE] */
|
2011-03-29 17:51:15 +02:00
|
|
|
void restoreCommand(redisClient *c) {
|
|
|
|
long ttl;
|
2011-05-13 17:31:00 +02:00
|
|
|
rio payload;
|
2012-11-07 10:57:23 +01:00
|
|
|
int j, type, replace = 0;
|
2011-05-13 22:14:39 +02:00
|
|
|
robj *obj;
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2012-11-07 10:57:23 +01:00
|
|
|
/* Parse additional options */
|
|
|
|
for (j = 4; j < c->argc; j++) {
|
|
|
|
if (!strcasecmp(c->argv[j]->ptr,"replace")) {
|
|
|
|
replace = 1;
|
|
|
|
} else {
|
|
|
|
addReply(c,shared.syntaxerr);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* Make sure this key does not already exist here... */
|
2012-11-07 10:57:23 +01:00
|
|
|
if (!replace && lookupKeyWrite(c->db,c->argv[1]) != NULL) {
|
2011-03-29 17:51:15 +02:00
|
|
|
addReplyError(c,"Target key name is busy.");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Check if the TTL value makes sense */
|
|
|
|
if (getLongFromObjectOrReply(c,c->argv[2],&ttl,NULL) != REDIS_OK) {
|
|
|
|
return;
|
|
|
|
} else if (ttl < 0) {
|
|
|
|
addReplyError(c,"Invalid TTL value, must be >= 0");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2012-04-02 13:10:39 +02:00
|
|
|
/* Verify RDB version and data checksum. */
|
2012-04-01 12:51:40 +02:00
|
|
|
if (verifyDumpPayload(c->argv[3]->ptr,sdslen(c->argv[3]->ptr)) == REDIS_ERR) {
|
|
|
|
addReplyError(c,"DUMP payload version or checksum are wrong");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2011-09-22 16:00:40 +02:00
|
|
|
rioInitWithBuffer(&payload,c->argv[3]->ptr);
|
2011-05-13 22:14:39 +02:00
|
|
|
if (((type = rdbLoadObjectType(&payload)) == -1) ||
|
|
|
|
((obj = rdbLoadObject(type,&payload)) == NULL))
|
2011-04-05 13:57:28 +02:00
|
|
|
{
|
2011-05-13 22:14:39 +02:00
|
|
|
addReplyError(c,"Bad data format");
|
2011-03-29 17:51:15 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2012-11-07 10:57:23 +01:00
|
|
|
/* Remove the old key if needed. */
|
|
|
|
if (replace) dbDelete(c->db,c->argv[1]);
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* Create the key and set the TTL if any */
|
2011-05-13 22:14:39 +02:00
|
|
|
dbAdd(c->db,c->argv[1],obj);
|
2012-04-02 11:14:47 +02:00
|
|
|
if (ttl) setExpire(c->db,c->argv[1],mstime()+ttl);
|
2011-10-20 11:17:30 +02:00
|
|
|
signalModifiedKey(c->db,c->argv[1]);
|
2011-03-29 17:51:15 +02:00
|
|
|
addReply(c,shared.ok);
|
2011-10-03 15:45:14 +02:00
|
|
|
server.dirty++;
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
|
2012-11-12 00:45:10 +01:00
|
|
|
/* MIGRATE socket cache implementation.
 *
 * We keep a map between "host:port" and the TCP socket that we recently
 * used to connect to that instance.
 * These sockets are closed when the maximum number of cached items is
 * reached, and also in serverCron() when they have been around for more
 * than a few seconds. */
#define MIGRATE_SOCKET_CACHE_ITEMS 64 /* max num of items in the cache. */
#define MIGRATE_SOCKET_CACHE_TTL 10 /* close cached sockets after 10 sec. */

/* Value stored in the cache for every "host:port" key. */
typedef struct migrateCachedSocket {
    int fd;               /* Connected socket descriptor. */
    time_t last_use_time; /* Unix time of the last use of this socket. */
} migrateCachedSocket;
|
|
|
|
|
|
|
|
/* Return a TCP scoket connected with the target instance, possibly returning
|
|
|
|
* a cached one.
|
|
|
|
*
|
|
|
|
* This function is responsible of sending errors to the client if a
|
|
|
|
* connection can't be established. In this case -1 is returned.
|
|
|
|
* Otherwise on success the socket is returned, and the caller should not
|
|
|
|
* attempt to free it after usage.
|
|
|
|
*
|
|
|
|
* If the caller detects an error while using the socket, migrateCloseSocket()
|
|
|
|
* should be called so that the connection will be craeted from scratch
|
|
|
|
* the next time. */
|
|
|
|
int migrateGetSocket(redisClient *c, robj *host, robj *port, long timeout) {
|
|
|
|
int fd;
|
|
|
|
sds name = sdsempty();
|
|
|
|
migrateCachedSocket *cs;
|
|
|
|
|
|
|
|
/* Check if we have an already cached socket for this ip:port pair. */
|
|
|
|
name = sdscatlen(name,host->ptr,sdslen(host->ptr));
|
|
|
|
name = sdscatlen(name,":",1);
|
|
|
|
name = sdscatlen(name,port->ptr,sdslen(port->ptr));
|
|
|
|
cs = dictFetchValue(server.migrate_cached_sockets,name);
|
|
|
|
if (cs) {
|
|
|
|
sdsfree(name);
|
|
|
|
cs->last_use_time = server.unixtime;
|
|
|
|
return cs->fd;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* No cached socket, create one. */
|
|
|
|
if (dictSize(server.migrate_cached_sockets) == MIGRATE_SOCKET_CACHE_ITEMS) {
|
|
|
|
/* Too many items, drop one at random. */
|
|
|
|
dictEntry *de = dictGetRandomKey(server.migrate_cached_sockets);
|
|
|
|
cs = dictGetVal(de);
|
|
|
|
close(cs->fd);
|
|
|
|
zfree(cs);
|
|
|
|
dictDelete(server.migrate_cached_sockets,dictGetKey(de));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Create the socket */
|
|
|
|
fd = anetTcpNonBlockConnect(server.neterr,c->argv[1]->ptr,
|
|
|
|
atoi(c->argv[2]->ptr));
|
|
|
|
if (fd == -1) {
|
|
|
|
sdsfree(name);
|
|
|
|
addReplyErrorFormat(c,"Can't connect to target node: %s",
|
|
|
|
server.neterr);
|
|
|
|
return -1;
|
|
|
|
}
|
2013-01-31 11:14:15 +01:00
|
|
|
anetEnableTcpNoDelay(server.neterr,fd);
|
2012-11-12 00:45:10 +01:00
|
|
|
|
|
|
|
/* Check if it connects within the specified timeout. */
|
2012-11-12 14:00:59 +01:00
|
|
|
if ((aeWait(fd,AE_WRITABLE,timeout) & AE_WRITABLE) == 0) {
|
2012-11-12 00:45:10 +01:00
|
|
|
sdsfree(name);
|
|
|
|
addReplySds(c,sdsnew("-IOERR error or timeout connecting to the client\r\n"));
|
|
|
|
close(fd);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Add to the cache and return it to the caller. */
|
|
|
|
cs = zmalloc(sizeof(*cs));
|
|
|
|
cs->fd = fd;
|
|
|
|
cs->last_use_time = server.unixtime;
|
|
|
|
dictAdd(server.migrate_cached_sockets,name,cs);
|
|
|
|
return fd;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Free a migrate cached connection. */
|
|
|
|
void migrateCloseSocket(robj *host, robj *port) {
|
|
|
|
sds name = sdsempty();
|
|
|
|
migrateCachedSocket *cs;
|
|
|
|
|
|
|
|
name = sdscatlen(name,host->ptr,sdslen(host->ptr));
|
|
|
|
name = sdscatlen(name,":",1);
|
|
|
|
name = sdscatlen(name,port->ptr,sdslen(port->ptr));
|
|
|
|
cs = dictFetchValue(server.migrate_cached_sockets,name);
|
|
|
|
if (!cs) {
|
|
|
|
sdsfree(name);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
close(cs->fd);
|
|
|
|
zfree(cs);
|
|
|
|
dictDelete(server.migrate_cached_sockets,name);
|
|
|
|
sdsfree(name);
|
|
|
|
}
|
|
|
|
|
|
|
|
void migrateCloseTimedoutSockets(void) {
|
|
|
|
dictIterator *di = dictGetSafeIterator(server.migrate_cached_sockets);
|
|
|
|
dictEntry *de;
|
|
|
|
|
|
|
|
while((de = dictNext(di)) != NULL) {
|
|
|
|
migrateCachedSocket *cs = dictGetVal(de);
|
|
|
|
|
|
|
|
if ((server.unixtime - cs->last_use_time) > MIGRATE_SOCKET_CACHE_TTL) {
|
|
|
|
close(cs->fd);
|
|
|
|
zfree(cs);
|
|
|
|
dictDelete(server.migrate_cached_sockets,dictGetKey(de));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
dictReleaseIterator(di);
|
|
|
|
}
|
|
|
|
|
2012-11-07 15:32:27 +01:00
|
|
|
/* MIGRATE host port key dbid timeout [COPY | REPLACE] */
|
2011-03-29 17:51:15 +02:00
|
|
|
void migrateCommand(redisClient *c) {
|
2012-11-14 11:30:24 +01:00
|
|
|
int fd, copy, replace, j;
|
2011-03-29 17:51:15 +02:00
|
|
|
long timeout;
|
|
|
|
long dbid;
|
2012-11-14 11:30:24 +01:00
|
|
|
long long ttl, expireat;
|
2011-03-29 17:51:15 +02:00
|
|
|
robj *o;
|
2011-05-13 17:31:00 +02:00
|
|
|
rio cmd, payload;
|
2012-11-14 11:30:24 +01:00
|
|
|
int retry_num = 0;
|
|
|
|
|
|
|
|
try_again:
|
|
|
|
/* Initialization */
|
|
|
|
copy = 0;
|
|
|
|
replace = 0;
|
|
|
|
ttl = 0;
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2012-11-07 15:32:27 +01:00
|
|
|
/* Parse additional options */
|
|
|
|
for (j = 6; j < c->argc; j++) {
|
|
|
|
if (!strcasecmp(c->argv[j]->ptr,"copy")) {
|
|
|
|
copy = 1;
|
|
|
|
} else if (!strcasecmp(c->argv[j]->ptr,"replace")) {
|
|
|
|
replace = 1;
|
|
|
|
} else {
|
|
|
|
addReply(c,shared.syntaxerr);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* Sanity check */
|
|
|
|
if (getLongFromObjectOrReply(c,c->argv[5],&timeout,NULL) != REDIS_OK)
|
|
|
|
return;
|
|
|
|
if (getLongFromObjectOrReply(c,c->argv[4],&dbid,NULL) != REDIS_OK)
|
|
|
|
return;
|
2012-11-12 15:04:54 +01:00
|
|
|
if (timeout <= 0) timeout = 1000;
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
/* Check if the key is here. If not we reply with success as there is
|
|
|
|
* nothing to migrate (for instance the key expired in the meantime), but
|
|
|
|
* we include such information in the reply string. */
|
|
|
|
if ((o = lookupKeyRead(c->db,c->argv[3])) == NULL) {
|
2011-10-17 16:44:08 +02:00
|
|
|
addReplySds(c,sdsnew("+NOKEY\r\n"));
|
2011-03-29 17:51:15 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Connect */
|
2012-11-12 00:45:10 +01:00
|
|
|
fd = migrateGetSocket(c,c->argv[1],c->argv[2],timeout);
|
|
|
|
if (fd == -1) return; /* error sent to the client by migrateGetSocket() */
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2012-04-01 12:51:40 +02:00
|
|
|
/* Create RESTORE payload and generate the protocol to call the command. */
|
2011-09-22 16:00:40 +02:00
|
|
|
rioInitWithBuffer(&cmd,sdsempty());
|
2011-10-04 18:43:03 +02:00
|
|
|
redisAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',2));
|
|
|
|
redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"SELECT",6));
|
|
|
|
redisAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,dbid));
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2012-04-03 15:10:42 +02:00
|
|
|
expireat = getExpire(c->db,c->argv[3]);
|
|
|
|
if (expireat != -1) {
|
|
|
|
ttl = expireat-mstime();
|
|
|
|
if (ttl < 1) ttl = 1;
|
|
|
|
}
|
2012-11-07 15:32:27 +01:00
|
|
|
redisAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',replace ? 5 : 4));
|
2013-02-20 17:36:54 +01:00
|
|
|
if (server.cluster_enabled)
|
|
|
|
redisAssertWithInfo(c,NULL,
|
|
|
|
rioWriteBulkString(&cmd,"RESTORE-ASKING",14));
|
|
|
|
else
|
|
|
|
redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7));
|
2012-06-05 21:50:10 +02:00
|
|
|
redisAssertWithInfo(c,NULL,sdsEncodedObject(c->argv[3]));
|
2011-10-04 18:43:03 +02:00
|
|
|
redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,c->argv[3]->ptr,sdslen(c->argv[3]->ptr)));
|
2012-04-10 16:46:29 +02:00
|
|
|
redisAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,ttl));
|
2011-03-29 17:51:15 +02:00
|
|
|
|
2013-01-17 01:00:20 +08:00
|
|
|
/* Emit the payload argument, that is the serialized object using
|
2012-11-07 15:32:27 +01:00
|
|
|
* the DUMP format. */
|
2012-04-01 12:51:40 +02:00
|
|
|
createDumpPayload(&payload,o);
|
|
|
|
redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,payload.io.buffer.ptr,
|
|
|
|
sdslen(payload.io.buffer.ptr)));
|
2011-05-13 17:31:00 +02:00
|
|
|
sdsfree(payload.io.buffer.ptr);
|
|
|
|
|
2012-11-07 15:32:27 +01:00
|
|
|
/* Add the REPLACE option to the RESTORE command if it was specified
|
|
|
|
* as a MIGRATE option. */
|
|
|
|
if (replace)
|
|
|
|
redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"REPLACE",7));
|
|
|
|
|
2013-01-17 01:00:20 +08:00
|
|
|
/* Transfer the query to the other node in 64K chunks. */
|
2012-11-14 11:30:24 +01:00
|
|
|
errno = 0;
|
2011-03-29 17:51:15 +02:00
|
|
|
{
|
2011-05-13 17:31:00 +02:00
|
|
|
sds buf = cmd.io.buffer.ptr;
|
|
|
|
size_t pos = 0, towrite;
|
|
|
|
int nwritten = 0;
|
|
|
|
|
|
|
|
while ((towrite = sdslen(buf)-pos) > 0) {
|
|
|
|
towrite = (towrite > (64*1024) ? (64*1024) : towrite);
|
2012-04-03 12:17:40 +02:00
|
|
|
nwritten = syncWrite(fd,buf+pos,towrite,timeout);
|
2011-05-13 17:31:00 +02:00
|
|
|
if (nwritten != (signed)towrite) goto socket_wr_err;
|
|
|
|
pos += nwritten;
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-05-13 17:31:00 +02:00
|
|
|
/* Read back the reply. */
|
2011-03-29 17:51:15 +02:00
|
|
|
{
|
|
|
|
char buf1[1024];
|
|
|
|
char buf2[1024];
|
|
|
|
|
|
|
|
/* Read the two replies */
|
|
|
|
if (syncReadLine(fd, buf1, sizeof(buf1), timeout) <= 0)
|
|
|
|
goto socket_rd_err;
|
|
|
|
if (syncReadLine(fd, buf2, sizeof(buf2), timeout) <= 0)
|
2011-05-13 17:31:00 +02:00
|
|
|
goto socket_rd_err;
|
2011-03-29 17:51:15 +02:00
|
|
|
if (buf1[0] == '-' || buf2[0] == '-') {
|
|
|
|
addReplyErrorFormat(c,"Target instance replied with error: %s",
|
|
|
|
(buf1[0] == '-') ? buf1+1 : buf2+1);
|
|
|
|
} else {
|
2011-10-03 15:51:10 +02:00
|
|
|
robj *aux;
|
|
|
|
|
2012-11-07 15:32:27 +01:00
|
|
|
if (!copy) {
|
|
|
|
/* No COPY option: remove the local key, signal the change. */
|
|
|
|
dbDelete(c->db,c->argv[3]);
|
|
|
|
signalModifiedKey(c->db,c->argv[3]);
|
|
|
|
}
|
2011-03-29 17:51:15 +02:00
|
|
|
addReply(c,shared.ok);
|
2011-10-03 15:51:10 +02:00
|
|
|
server.dirty++;
|
|
|
|
|
|
|
|
/* Translate MIGRATE as DEL for replication/AOF. */
|
2011-10-17 16:39:05 +02:00
|
|
|
aux = createStringObject("DEL",3);
|
2011-10-03 15:51:10 +02:00
|
|
|
rewriteClientCommandVector(c,2,aux,c->argv[3]);
|
|
|
|
decrRefCount(aux);
|
2011-03-29 17:51:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-05-13 17:31:00 +02:00
|
|
|
sdsfree(cmd.io.buffer.ptr);
|
2011-04-01 18:59:28 +02:00
|
|
|
return;
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
socket_wr_err:
|
2011-05-13 17:31:00 +02:00
|
|
|
sdsfree(cmd.io.buffer.ptr);
|
2012-11-12 00:45:10 +01:00
|
|
|
migrateCloseSocket(c->argv[1],c->argv[2]);
|
2012-11-14 11:30:24 +01:00
|
|
|
if (errno != ETIMEDOUT && retry_num++ == 0) goto try_again;
|
|
|
|
addReplySds(c,
|
|
|
|
sdsnew("-IOERR error or timeout writing to target instance\r\n"));
|
2011-04-01 18:59:28 +02:00
|
|
|
return;
|
2011-03-29 17:51:15 +02:00
|
|
|
|
|
|
|
socket_rd_err:
|
2011-05-13 17:31:00 +02:00
|
|
|
sdsfree(cmd.io.buffer.ptr);
|
2012-11-12 00:45:10 +01:00
|
|
|
migrateCloseSocket(c->argv[1],c->argv[2]);
|
2012-11-14 11:30:24 +01:00
|
|
|
if (errno != ETIMEDOUT && retry_num++ == 0) goto try_again;
|
|
|
|
addReplySds(c,
|
|
|
|
sdsnew("-IOERR error or timeout reading from target node\r\n"));
|
2011-04-01 18:59:28 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2011-10-17 17:35:23 +02:00
|
|
|
/* The ASKING command is required after a -ASK redirection.
|
2013-01-17 01:00:20 +08:00
|
|
|
* The client should issue ASKING before to actually send the command to
|
2011-10-17 17:35:23 +02:00
|
|
|
* the target instance. See the Redis Cluster specification for more
|
|
|
|
* information. */
|
|
|
|
void askingCommand(redisClient *c) {
|
|
|
|
if (server.cluster_enabled == 0) {
|
|
|
|
addReplyError(c,"This instance has cluster support disabled");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
c->flags |= REDIS_ASKING;
|
|
|
|
addReply(c,shared.ok);
|
|
|
|
}
|
|
|
|
|
2011-03-29 17:51:15 +02:00
|
|
|
/* -----------------------------------------------------------------------------
|
|
|
|
* Cluster functions related to serving / redirecting clients
|
|
|
|
* -------------------------------------------------------------------------- */
|
|
|
|
|
2013-02-15 11:50:54 +01:00
|
|
|
/* Return the pointer to the cluster node that is able to serve the command.
|
|
|
|
* For the function to succeed the command should only target a single
|
|
|
|
* key (or the same key multiple times).
|
2011-03-29 17:51:15 +02:00
|
|
|
*
|
2011-05-05 11:13:21 +02:00
|
|
|
* If the returned node should be used only for this request, the *ask
|
|
|
|
* integer is set to '1', otherwise to '0'. This is used in order to
|
|
|
|
* let the caller know if we should reply with -MOVED or with -ASK.
|
|
|
|
*
|
2013-02-15 11:50:54 +01:00
|
|
|
* If the command contains multiple keys, and as a consequence it is not
|
|
|
|
* possible to handle the request in Redis Cluster, NULL is returned. */
|
2011-05-05 11:13:21 +02:00
|
|
|
clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask) {
    /* Out parameters (either may be NULL, in which case it is ignored):
     *
     * 'ask'      is always written. It is cleared before anything else so
     *            that it holds a defined value on every return path, and is
     *            set to 1 only when the returned node must be used for this
     *            single request.
     * 'hashslot' is written only when the query contains at least one key
     *            and all the keys are the same; otherwise it is left
     *            untouched.
     *
     * Return value: the node that should serve the query, or NULL when the
     * query references two keys that are not equal. */
    clusterNode *n = NULL;
    robj *firstkey = NULL;
    multiState *ms, _ms;
    multiCmd mc;
    int i, slot = 0;

    /* This is the default. Set to 1 if needed later. Doing it here, rather
     * than after the key scan, guarantees the early returns below never
     * leave the caller's variable uninitialized. */
    if (ask) *ask = 0;

    /* We handle all the cases as if they were EXEC commands, so we have
     * a common code path for everything */
    if (cmd->proc == execCommand) {
        /* If REDIS_MULTI flag is not set EXEC is just going to return an
         * error. */
        if (!(c->flags & REDIS_MULTI)) return server.cluster->myself;
        ms = &c->mstate;
    } else {
        /* In order to have a single codepath create a fake Multi State
         * structure if the client is not in MULTI/EXEC state, this way
         * we have a single codepath below. */
        ms = &_ms;
        _ms.commands = &mc;
        _ms.count = 1;
        mc.argv = argv;
        mc.argc = argc;
        mc.cmd = cmd;
    }

    /* Check that all the keys are the same key, and get the slot and
     * node for this key. */
    for (i = 0; i < ms->count; i++) {
        struct redisCommand *mcmd;
        robj **margv;
        int margc, *keyindex, numkeys, j;

        mcmd = ms->commands[i].cmd;
        margc = ms->commands[i].argc;
        margv = ms->commands[i].argv;

        keyindex = getKeysFromCommand(mcmd,margv,margc,&numkeys,
                                      REDIS_GETKEYS_ALL);
        for (j = 0; j < numkeys; j++) {
            if (firstkey == NULL) {
                /* This is the first key we see. Check what is the slot
                 * and node. */
                firstkey = margv[keyindex[j]];

                slot = keyHashSlot((char*)firstkey->ptr, sdslen(firstkey->ptr));
                n = server.cluster->slots[slot];
                /* The slot table entry for this key must not be NULL. */
                redisAssertWithInfo(c,firstkey,n != NULL);
            } else {
                /* If it is not the first key, make sure it is exactly
                 * the same key as the first we saw. */
                if (!equalStringObjects(firstkey,margv[keyindex[j]])) {
                    /* Release the key index array before bailing out. */
                    getKeysFreeResult(keyindex);
                    return NULL;
                }
            }
        }
        getKeysFreeResult(keyindex);
    }

    /* No key at all in command? then we can serve the request
     * without redirections. */
    if (n == NULL) return server.cluster->myself;
    if (hashslot) *hashslot = slot;

    /* This request is about a slot we are migrating into another instance?
     * Then we need to check if we have the key. If we have it we can reply.
     * If instead is a new key, we pass the request to the node that is
     * receiving the slot. */
    if (n == server.cluster->myself &&
        server.cluster->migrating_slots_to[slot] != NULL)
    {
        /* The key is looked up in server.db[0]. */
        if (lookupKeyRead(&server.db[0],firstkey) == NULL) {
            if (ask) *ask = 1;
            return server.cluster->migrating_slots_to[slot];
        }
    }

    /* Handle the case in which we are receiving this hash slot from
     * another instance, so we'll accept the query even if in the table
     * it is assigned to a different node, but only if the client
     * issued an ASKING command before. */
    if (server.cluster->importing_slots_from[slot] != NULL &&
        (c->flags & REDIS_ASKING || cmd->flags & REDIS_CMD_ASKING)) {
        return server.cluster->myself;
    }

    /* It's not a -ASK case. Base case: just return the right node. */
    return n;
}
|