Sentinel: more robust failover detection as observer.

Sentinel observers detect a failover by checking whether a slave attached to
the monitored master changes its replication role from slave to master.
However, while this change may in theory only happen after a SLAVEOF NO
ONE command, in practice it is very easy to reboot a slave instance with
a wrong configuration that turns it into a master, especially if it was
a past master before a successful failover.

This commit changes the detection policy so that if an instance goes
from slave to master, but at the same time the runid has changed, we
sense a reboot, and in that case we don't detect a failover at all.

This commit also introduces the "reboot" sentinel event, that is logged
at "warning" level (so this will trigger an admin notification).

The commit also fixes a problem in the disconnect handler, which assumed
that the instance object always existed; that is not the case. Now we
no longer assume that redisAsyncFree() will call the disconnection
handler before returning.
This commit is contained in:
antirez 2012-07-24 12:42:40 +02:00
parent 1bbdf1709f
commit d876d6feac

View File

@ -598,10 +598,12 @@ void releaseSentinelRedisInstance(sentinelRedisInstance *ri) {
/* Release hiredis connections. Note that redisAsyncFree() will call
* the disconnection callback. */
if (ri->cc) {
ri->cc->data = NULL;
redisAsyncFree(ri->cc);
ri->cc = NULL;
}
if (ri->pc) {
ri->pc->data = NULL;
redisAsyncFree(ri->pc);
ri->pc = NULL;
}
@ -865,8 +867,11 @@ char *sentinelHandleConfiguration(char **argv, int argc) {
* for async conenctions. */
void sentinelDisconnectInstanceFromContext(const redisAsyncContext *c) {
sentinelRedisInstance *ri = c->data;
int pubsub = (ri->pc == c);
int pubsub;
if (ri == NULL) return; /* The instance no longer exists. */
pubsub = (ri->pc == c);
sentinelEvent(REDIS_DEBUG, pubsub ? "-pubsub-link" : "-cmd-link", ri,
"%@ #%s", c->errstr);
if (pubsub)
@ -961,7 +966,8 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
sds *lines;
int numlines, j;
int role = 0;
int runid_changed = 0; /* true if runid changed. */
int first_runid = 0; /* true if this is the first runid we receive. */
/* The following fields must be reset to a given value in the case they
* are not found at all in the INFO output. */
@ -977,10 +983,14 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
if (sdslen(l) >= 47 && !memcmp(l,"run_id:",7)) {
if (ri->runid == NULL) {
ri->runid = sdsnewlen(l+7,40);
first_runid = 1;
} else {
/* TODO: check if run_id has changed. This means the
* instance has been restarted, we want to set a flag
* and notify this event. */
if (strncmp(ri->runid,l+7,40) != 0) {
runid_changed = 1;
sentinelEvent(REDIS_NOTICE,"+reboot",ri,"%@");
sdsfree(ri->runid);
ri->runid = sdsnewlen(l+7,40);
}
}
}
@ -1048,7 +1058,22 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
/* Act if a slave turned into a master. */
if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) {
if (ri->flags & SRI_PROMOTED) {
if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
(runid_changed || first_runid))
{
int retval;
/* If a slave turned into a master, but at the same time the
* runid has changed, or it is simply the first time we see and
* INFO output from this instance, this is a reboot with a wrong
* configuration.
*
* Log the event and remove the slave. */
sentinelEvent(REDIS_WARNING,"-slave-restart-as-master",ri,"%@ #removing it from the attached slaves");
retval = dictDelete(ri->master->slaves,ri->name);
redisAssert(retval == REDIS_OK);
return;
} else if (ri->flags & SRI_PROMOTED) {
/* If this is a promoted slave we can change state to the
* failover state machine. */
if (ri->master &&