mirror of
https://codeberg.org/redict/redict.git
synced 2025-01-23 16:48:27 -05:00
5629dbe715
The cluster bus is established over TLS or non-TLS depending on the configuration tls-cluster. The client ports distributed in the cluster and sent to clients are assumed to be TLS or non-TLS also depending on tls-cluster. The cluster bus is now extended to also contain the non-TLS port of clients in a TLS cluster, when available. The non-TLS port of a cluster node, when available, is sent to clients connected without TLS in responses to CLUSTER SLOTS, CLUSTER NODES, CLUSTER SLAVES and MOVED and ASK redirects, instead of the TLS port. The user was able to override the client port by defining cluster-announce-port. Now cluster-announce-tls-port is added, so the user can define an alternative announce port for both TLS and non-TLS clients. Fixes #8134
297 lines
14 KiB
C
297 lines
14 KiB
C
#ifndef __CLUSTER_H
|
|
#define __CLUSTER_H
|
|
|
|
/*-----------------------------------------------------------------------------
 * Redis cluster data structures, defines, exported API.
 *----------------------------------------------------------------------------*/

/* Basic cluster constants. CLUSTER_SLOTS dimensions the per-slot arrays and
 * the slot bitmaps (CLUSTER_SLOTS/8 bytes) used by the structures in this
 * header. CLUSTER_OK and CLUSTER_FAIL are the values stored in
 * clusterState.state. */
#define CLUSTER_SLOTS 16384
#define CLUSTER_OK 0            /* Everything looks ok */
#define CLUSTER_FAIL 1          /* The cluster can't work */
#define CLUSTER_NAMELEN 40      /* sha1 hex length */
#define CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */
|
|
|
|
/* The following defines are amounts of time, sometimes expressed as
 * multipliers of the node timeout value (when ending with MULT). */
#define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */
#define CLUSTER_FAIL_UNDO_TIME_MULT 2       /* Undo fail if master is back. */
#define CLUSTER_FAIL_UNDO_TIME_ADD 10       /* Some additional time. */
#define CLUSTER_FAILOVER_DELAY 5            /* Seconds */
#define CLUSTER_MF_TIMEOUT 5000             /* Milliseconds to do a manual failover. */
#define CLUSTER_MF_PAUSE_MULT 2             /* Master pause manual failover mult. */
#define CLUSTER_SLAVE_MIGRATION_DELAY 5000  /* Delay for slave migration. */
|
|
|
|
/* Redirection errors returned by getNodeByQuery(). The codes are consecutive
 * integers starting at 0 (no redirection needed). */
#define CLUSTER_REDIR_NONE 0          /* Node can serve the request. */
#define CLUSTER_REDIR_CROSS_SLOT 1    /* -CROSSSLOT request. */
#define CLUSTER_REDIR_UNSTABLE 2      /* -TRYAGAIN redirection required */
#define CLUSTER_REDIR_ASK 3           /* -ASK redirection required. */
#define CLUSTER_REDIR_MOVED 4         /* -MOVED redirection required. */
#define CLUSTER_REDIR_DOWN_STATE 5    /* -CLUSTERDOWN, global state. */
#define CLUSTER_REDIR_DOWN_UNBOUND 6  /* -CLUSTERDOWN, unbound slot. */
#define CLUSTER_REDIR_DOWN_RO_STATE 7 /* -CLUSTERDOWN, allow reads. */
|
|
|
|
struct clusterNode;
|
|
|
|
/* clusterLink encapsulates everything needed to talk with a remote node. */
|
|
typedef struct clusterLink {
|
|
mstime_t ctime; /* Link creation time */
|
|
connection *conn; /* Connection to remote node */
|
|
sds sndbuf; /* Packet send buffer */
|
|
char *rcvbuf; /* Packet reception buffer */
|
|
size_t rcvbuf_len; /* Used size of rcvbuf */
|
|
size_t rcvbuf_alloc; /* Allocated size of rcvbuf */
|
|
struct clusterNode *node; /* Node related to this link if any, or NULL */
|
|
} clusterLink;
|
|
|
|
/* Cluster node flags and macros.
 *
 * Each CLUSTER_NODE_* value is a distinct single bit, so several of them can
 * be OR-ed together in the 'flags' field of a node. */
#define CLUSTER_NODE_MASTER 1       /* The node is a master */
#define CLUSTER_NODE_SLAVE 2        /* The node is a slave */
#define CLUSTER_NODE_PFAIL 4        /* Failure? Need acknowledge */
#define CLUSTER_NODE_FAIL 8         /* The node is believed to be malfunctioning */
#define CLUSTER_NODE_MYSELF 16      /* This node is myself */
#define CLUSTER_NODE_HANDSHAKE 32   /* We have still to exchange the first ping */
#define CLUSTER_NODE_NOADDR 64      /* We don't know the address of this node */
#define CLUSTER_NODE_MEET 128       /* Send a MEET message to this node */
#define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */
#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. */
/* A node name made entirely of zero bytes (one "\000" escape per name byte). */
#define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"

/* Flag predicates. 'n' must be a pointer to an object with a 'flags' member.
 * Every macro except nodeHasAddr() evaluates to the masked flag bit itself
 * (zero or the flag value, not necessarily 1); nodeHasAddr() is the logical
 * negation of the NOADDR bit and therefore evaluates to 0 or 1. */
#define nodeIsMaster(n) ((n)->flags & CLUSTER_NODE_MASTER)
#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE)
#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE)
#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR))
#define nodeWithoutAddr(n) ((n)->flags & CLUSTER_NODE_NOADDR)
#define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL)
#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL)
#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER)
|
|
|
|
/* Reasons why a slave is not able to failover. These are the values stored
 * in clusterState.cant_failover_reason. */
#define CLUSTER_CANT_FAILOVER_NONE 0
#define CLUSTER_CANT_FAILOVER_DATA_AGE 1
#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2
#define CLUSTER_CANT_FAILOVER_EXPIRED 3
#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4
/* Not a reason code: a period expressed in seconds. */
#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (60*5) /* seconds. */
|
|
|
|
/* clusterState todo_before_sleep flags. Each value is a distinct bit so that
 * several pending actions can be OR-ed into the same field. */
#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0)
#define CLUSTER_TODO_UPDATE_STATE (1<<1)
#define CLUSTER_TODO_SAVE_CONFIG (1<<2)
#define CLUSTER_TODO_FSYNC_CONFIG (1<<3)
#define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4)
|
|
|
|
/* Message types.
 *
 * Note that the PING, PONG and MEET messages are actually the same exact
 * kind of packet. PONG is the reply to ping, in the exact format as a PING,
 * while MEET is a special PING that forces the receiver to add the sender
 * as a node (if it is not already in the list).
 *
 * The type values are consecutive starting at 0, and CLUSTERMSG_TYPE_COUNT
 * is used in this header to size the per-type statistics arrays of
 * clusterState. */
#define CLUSTERMSG_TYPE_PING 0                  /* Ping */
#define CLUSTERMSG_TYPE_PONG 1                  /* Pong (reply to Ping) */
#define CLUSTERMSG_TYPE_MEET 2                  /* Meet "let's join" message */
#define CLUSTERMSG_TYPE_FAIL 3                  /* Mark node xxx as failing */
#define CLUSTERMSG_TYPE_PUBLISH 4               /* Pub/Sub Publish propagation */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6     /* Yes, you have my vote */
#define CLUSTERMSG_TYPE_UPDATE 7                /* Another node slots configuration */
#define CLUSTERMSG_TYPE_MFSTART 8               /* Pause clients for manual failover */
#define CLUSTERMSG_TYPE_MODULE 9                /* Module cluster API message. */
#define CLUSTERMSG_TYPE_COUNT 10                /* Total number of message types. */
|
|
|
|
/* Flags that a module can set in order to prevent certain Redis Cluster
 * features to be enabled. Useful when implementing a different distributed
 * system on top of Redis Cluster message bus, using modules.
 *
 * Note that the flag bits start at bit 1: bit 0 is not assigned by any of
 * the defines below. The values are kept exactly as they are. */
#define CLUSTER_MODULE_FLAG_NONE 0
#define CLUSTER_MODULE_FLAG_NO_FAILOVER (1<<1)
#define CLUSTER_MODULE_FLAG_NO_REDIRECTION (1<<2)
|
|
|
|
/* This structure represent elements of node->fail_reports. */
|
|
typedef struct clusterNodeFailReport {
|
|
struct clusterNode *node; /* Node reporting the failure condition. */
|
|
mstime_t time; /* Time of the last report from this node. */
|
|
} clusterNodeFailReport;
|
|
|
|
typedef struct clusterNode {
|
|
mstime_t ctime; /* Node object creation time. */
|
|
char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */
|
|
int flags; /* CLUSTER_NODE_... */
|
|
uint64_t configEpoch; /* Last configEpoch observed for this node */
|
|
unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */
|
|
sds slots_info; /* Slots info represented by string. */
|
|
int numslots; /* Number of slots handled by this node */
|
|
int numslaves; /* Number of slave nodes, if this is a master */
|
|
struct clusterNode **slaves; /* pointers to slave nodes */
|
|
struct clusterNode *slaveof; /* pointer to the master node. Note that it
|
|
may be NULL even if the node is a slave
|
|
if we don't have the master node in our
|
|
tables. */
|
|
mstime_t ping_sent; /* Unix time we sent latest ping */
|
|
mstime_t pong_received; /* Unix time we received the pong */
|
|
mstime_t data_received; /* Unix time we received any data */
|
|
mstime_t fail_time; /* Unix time when FAIL flag was set */
|
|
mstime_t voted_time; /* Last time we voted for a slave of this master */
|
|
mstime_t repl_offset_time; /* Unix time we received offset for this node */
|
|
mstime_t orphaned_time; /* Starting time of orphaned master condition */
|
|
long long repl_offset; /* Last known repl offset for this node. */
|
|
char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */
|
|
int port; /* Latest known clients port (TLS or plain). */
|
|
int pport; /* Latest known clients plaintext port. Only used
|
|
if the main clients port is for TLS. */
|
|
int cport; /* Latest known cluster port of this node. */
|
|
clusterLink *link; /* TCP/IP link with this node */
|
|
list *fail_reports; /* List of nodes signaling this as failing */
|
|
} clusterNode;
|
|
|
|
typedef struct clusterState {
|
|
clusterNode *myself; /* This node */
|
|
uint64_t currentEpoch;
|
|
int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */
|
|
int size; /* Num of master nodes with at least one slot */
|
|
dict *nodes; /* Hash table of name -> clusterNode structures */
|
|
dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */
|
|
clusterNode *migrating_slots_to[CLUSTER_SLOTS];
|
|
clusterNode *importing_slots_from[CLUSTER_SLOTS];
|
|
clusterNode *slots[CLUSTER_SLOTS];
|
|
uint64_t slots_keys_count[CLUSTER_SLOTS];
|
|
rax *slots_to_keys;
|
|
/* The following fields are used to take the slave state on elections. */
|
|
mstime_t failover_auth_time; /* Time of previous or next election. */
|
|
int failover_auth_count; /* Number of votes received so far. */
|
|
int failover_auth_sent; /* True if we already asked for votes. */
|
|
int failover_auth_rank; /* This slave rank for current auth request. */
|
|
uint64_t failover_auth_epoch; /* Epoch of the current election. */
|
|
int cant_failover_reason; /* Why a slave is currently not able to
|
|
failover. See the CANT_FAILOVER_* macros. */
|
|
/* Manual failover state in common. */
|
|
mstime_t mf_end; /* Manual failover time limit (ms unixtime).
|
|
It is zero if there is no MF in progress. */
|
|
/* Manual failover state of master. */
|
|
clusterNode *mf_slave; /* Slave performing the manual failover. */
|
|
/* Manual failover state of slave. */
|
|
long long mf_master_offset; /* Master offset the slave needs to start MF
|
|
or -1 if still not received. */
|
|
int mf_can_start; /* If non-zero signal that the manual failover
|
|
can start requesting masters vote. */
|
|
/* The following fields are used by masters to take state on elections. */
|
|
uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */
|
|
int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */
|
|
/* Messages received and sent by type. */
|
|
long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT];
|
|
long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT];
|
|
long long stats_pfail_nodes; /* Number of nodes in PFAIL status,
|
|
excluding nodes without address. */
|
|
} clusterState;
|
|
|
|
/* Redis cluster messages header */
|
|
|
|
/* Initially we don't know our "name", but we'll find it once we connect
|
|
* to the first node, using the getsockname() function. Then we'll use this
|
|
* address for all the next messages. */
|
|
typedef struct {
|
|
char nodename[CLUSTER_NAMELEN];
|
|
uint32_t ping_sent;
|
|
uint32_t pong_received;
|
|
char ip[NET_IP_STR_LEN]; /* IP address last time it was seen */
|
|
uint16_t port; /* base port last time it was seen */
|
|
uint16_t cport; /* cluster port last time it was seen */
|
|
uint16_t flags; /* node->flags copy */
|
|
uint16_t pport; /* plaintext-port, when base port is TLS */
|
|
uint16_t notused1;
|
|
} clusterMsgDataGossip;
|
|
|
|
typedef struct {
|
|
char nodename[CLUSTER_NAMELEN];
|
|
} clusterMsgDataFail;
|
|
|
|
/* Payload of a PUBLISH message (see the 'publish' member of union
 * clusterMsgData): two 32 bit lengths followed by the data area.
 * NOTE(review): presumably the channel and the message bytes are stored back
 * to back starting at bulk_data, which is why the array is only a
 * placeholder -- confirm against the code that builds the message. */
typedef struct {
    uint32_t channel_len;
    uint32_t message_len;
    unsigned char bulk_data[8]; /* 8 bytes just as placeholder. */
} clusterMsgDataPublish;
|
|
|
|
typedef struct {
|
|
uint64_t configEpoch; /* Config epoch of the specified instance. */
|
|
char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */
|
|
unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap. */
|
|
} clusterMsgDataUpdate;
|
|
|
|
/* Payload of a MODULE message (see the 'module' member of union
 * clusterMsgData). */
typedef struct {
    uint64_t module_id;     /* ID of the sender module. */
    uint32_t len;           /* Payload length. NOTE(review): presumably the
                               number of bytes carried in bulk_data; confirm
                               against the code that fills this message. */
    uint8_t type;           /* Type from 0 to 255. */
    unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */
} clusterMsgModule;
|
|
|
|
union clusterMsgData {
|
|
/* PING, MEET and PONG */
|
|
struct {
|
|
/* Array of N clusterMsgDataGossip structures */
|
|
clusterMsgDataGossip gossip[1];
|
|
} ping;
|
|
|
|
/* FAIL */
|
|
struct {
|
|
clusterMsgDataFail about;
|
|
} fail;
|
|
|
|
/* PUBLISH */
|
|
struct {
|
|
clusterMsgDataPublish msg;
|
|
} publish;
|
|
|
|
/* UPDATE */
|
|
struct {
|
|
clusterMsgDataUpdate nodecfg;
|
|
} update;
|
|
|
|
/* MODULE */
|
|
struct {
|
|
clusterMsgModule msg;
|
|
} module;
|
|
};
|
|
|
|
#define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */
|
|
|
|
typedef struct {
|
|
char sig[4]; /* Signature "RCmb" (Redis Cluster message bus). */
|
|
uint32_t totlen; /* Total length of this message */
|
|
uint16_t ver; /* Protocol version, currently set to 1. */
|
|
uint16_t port; /* TCP base port number. */
|
|
uint16_t type; /* Message type */
|
|
uint16_t count; /* Only used for some kind of messages. */
|
|
uint64_t currentEpoch; /* The epoch accordingly to the sending node. */
|
|
uint64_t configEpoch; /* The config epoch if it's a master, or the last
|
|
epoch advertised by its master if it is a
|
|
slave. */
|
|
uint64_t offset; /* Master replication offset if node is a master or
|
|
processed replication offset if node is a slave. */
|
|
char sender[CLUSTER_NAMELEN]; /* Name of the sender node */
|
|
unsigned char myslots[CLUSTER_SLOTS/8];
|
|
char slaveof[CLUSTER_NAMELEN];
|
|
char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */
|
|
char notused1[32]; /* 32 bytes reserved for future usage. */
|
|
uint16_t pport; /* Sender TCP plaintext port, if base port is TLS */
|
|
uint16_t cport; /* Sender TCP cluster bus port */
|
|
uint16_t flags; /* Sender node flags */
|
|
unsigned char state; /* Cluster state from the POV of the sender */
|
|
unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */
|
|
union clusterMsgData data;
|
|
} clusterMsg;
|
|
|
|
/* Size of a clusterMsg without its type-specific payload, i.e. the size of
 * everything that precedes the 'data' union. */
#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData))

/* Message flags better specify the packet content or are used to
 * provide some information about the node state. */
#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */
#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if
                                            master is up. */
|
|
|
|
/* ---------------------- API exported outside cluster.c -------------------- */
|
|
clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask);
|
|
int clusterRedirectBlockedClientIfNeeded(client *c);
|
|
void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code);
|
|
unsigned long getClusterConnectionsCount(void);
|
|
|
|
#endif /* __CLUSTER_H */
|