/* The latency monitor allows one to easily observe the sources of latency
 * in a Redis instance using the LATENCY command. Different latency
 * sources are monitored, like disk I/O, execution of commands, fork
 * system call, and so forth.
 *
 * ----------------------------------------------------------------------------
 *
 * Copyright (c) 2014, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "redis.h"

/* Dictionary type for latency events. */
int dictStringKeyCompare(void *privdata, const void *key1, const void *key2) {
    return strcmp(key1,key2) == 0;
}

unsigned int dictStringHash(const void *key) {
    return dictGenHashFunction(key, strlen(key));
}

void dictVanillaFree(void *privdata, void *val);

dictType latencyTimeSeriesDictType = {
    dictStringHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictStringKeyCompare,       /* key compare */
    dictVanillaFree,            /* key destructor */
    dictVanillaFree             /* val destructor */
};

/* ---------------------------- Latency API --------------------------------- */

/* Latency monitor initialization. We just need to create the dictionary
 * of time series, each time series is created on demand in order to avoid
 * having a fixed list to maintain. */
void latencyMonitorInit(void) {
    server.latency_events = dictCreate(&latencyTimeSeriesDictType,NULL);
}
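
/* A rough sketch of the time series layout this file manipulates, shown here
 * for orientation only. Field types are approximate; the authoritative
 * definitions, including the LATENCY_TS_LEN constant, live in latency.h:
 *
 *   struct latencySample {
 *       int32_t time;         - Unix time at which the sample was taken.
 *       uint32_t latency;     - Observed latency, in milliseconds.
 *   };
 *
 *   struct latencyTimeSeries {
 *       int idx;              - Index of the next sample slot to write.
 *       uint32_t max;         - All-time maximum latency for this event.
 *       struct latencySample samples[LATENCY_TS_LEN];  - Circular buffer.
 *   };
 */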

/* Add the specified sample to the specified time series "event".
 * This function is usually called via latencyAddSampleIfNeeded(), a macro
 * that only adds the sample if the latency is higher than
 * server.latency_monitor_threshold. */
void latencyAddSample(char *event, mstime_t latency) {
    struct latencyTimeSeries *ts = dictFetchValue(server.latency_events,event);
    time_t now = time(NULL);
    int prev;

    /* Create the time series if it does not exist. */
    if (ts == NULL) {
        ts = zmalloc(sizeof(*ts));
        ts->idx = 0;
        ts->max = 0;
        memset(ts->samples,0,sizeof(ts->samples));
        dictAdd(server.latency_events,zstrdup(event),ts);
    }

    /* If the previous sample is in the same second, we update our old sample
     * if this latency is greater than the old one, or just return. */
    prev = (ts->idx + LATENCY_TS_LEN - 1) % LATENCY_TS_LEN;
    if (ts->samples[prev].time == now) {
        if (latency > ts->samples[prev].latency)
            ts->samples[prev].latency = latency;
        return;
    }

    ts->samples[ts->idx].time = now; /* Use the same 'now' checked above. */
    ts->samples[ts->idx].latency = latency;
    if (latency > ts->max) ts->max = latency;

    ts->idx++;
    if (ts->idx == LATENCY_TS_LEN) ts->idx = 0;
}
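
/* A typical call site, sketched here for illustration only: callers time a
 * potentially slow operation and hand the result to latencyAddSampleIfNeeded(),
 * which filters by server.latency_monitor_threshold before invoking
 * latencyAddSample(). latencyStartMonitor()/latencyEndMonitor() are assumed
 * to be the companion timing macros declared in latency.h:
 *
 *   mstime_t latency;
 *   latencyStartMonitor(latency);
 *   ... some potentially slow operation ...
 *   latencyEndMonitor(latency);
 *   latencyAddSampleIfNeeded("some-event",latency);
 */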

/* Reset data for the specified event, or all the events data if 'event' is
 * NULL.
 *
 * Note: this is O(N) even when event_to_reset is not NULL because it makes
 * the code simpler and we have a small fixed max number of events. */
int latencyResetEvent(char *event_to_reset) {
    dictIterator *di;
    dictEntry *de;
    int resets = 0;

    di = dictGetSafeIterator(server.latency_events);
    while((de = dictNext(di)) != NULL) {
        char *event = dictGetKey(de);

        if (event_to_reset == NULL || strcasecmp(event,event_to_reset) == 0) {
            dictDelete(server.latency_events, event);
            resets++;
        }
    }
    dictReleaseIterator(di);
    return resets;
}

/* ------------------------ Latency reporting (doctor) ---------------------- */

/* Analyze the samples available for a given event and return a structure
 * populated with different metrics: average, MAD, min, max, and so forth.
 * Check the latency.h definition of struct latencyStats for more info.
 * If the specified event has no elements the structure is populated with
 * zero values. */
void analyzeLatencyForEvent(char *event, struct latencyStats *ls) {
    struct latencyTimeSeries *ts = dictFetchValue(server.latency_events,event);
    int j;
    uint64_t sum;

    ls->all_time_high = ts ? ts->max : 0;
    ls->avg = 0;
    ls->min = 0;
    ls->max = 0;
    ls->mad = 0;
    ls->samples = 0;
    if (!ts) return;

    /* First pass, populate everything but the MAD. */
    sum = 0;
    for (j = 0; j < LATENCY_TS_LEN; j++) {
        if (ts->samples[j].time == 0) continue;
        ls->samples++;
        if (ls->samples == 1) {
            ls->min = ls->max = ts->samples[j].latency;
        } else {
            if (ls->min > ts->samples[j].latency)
                ls->min = ts->samples[j].latency;
            if (ls->max < ts->samples[j].latency)
                ls->max = ts->samples[j].latency;
        }
        sum += ts->samples[j].latency;
    }
    if (ls->samples) ls->avg = sum / ls->samples;

    /* Second pass, compute MAD. */
    sum = 0;
    for (j = 0; j < LATENCY_TS_LEN; j++) {
        int64_t delta;

        if (ts->samples[j].time == 0) continue;
        delta = ls->avg - ts->samples[j].latency;
        if (delta < 0) delta = -delta;
        sum += delta;
    }
    if (ls->samples) ls->mad = sum / ls->samples;
}
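
/* A small worked example of the statistics computed above, with hypothetical
 * samples of 10, 20 and 60 milliseconds: the average is (10+20+60)/3 = 30 ms,
 * and the MAD (mean absolute deviation from the average) is
 * (|10-30|+|20-30|+|60-30|)/3 = (20+10+30)/3 = 20 ms. Both divisions use
 * integer arithmetic, so results are truncated. */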

/* ---------------------- Latency command implementation -------------------- */

/* latencyCommand() helper to produce a time-delay reply for all the samples
 * in memory for the specified time series. */
void latencyCommandReplyWithSamples(redisClient *c, struct latencyTimeSeries *ts) {
    void *replylen = addDeferredMultiBulkLength(c);
    int samples = 0, j;

    for (j = 0; j < LATENCY_TS_LEN; j++) {
        int i = (ts->idx + j) % LATENCY_TS_LEN;

        if (ts->samples[i].time == 0) continue;
        addReplyMultiBulkLen(c,2);
        addReplyLongLong(c,ts->samples[i].time);
        addReplyLongLong(c,ts->samples[i].latency);
        samples++;
    }
    setDeferredMultiBulkLength(c,replylen,samples);
}
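
/* The reply built above is an array of [timestamp, latency] pairs, from the
 * oldest sample still in memory to the newest. A hypothetical redis-cli
 * session (event name and values invented for illustration):
 *
 *   127.0.0.1:6379> LATENCY HISTORY some-event
 *   1) 1) (integer) 1404598830
 *      2) (integer) 12
 *   2) 1) (integer) 1404598845
 *      2) (integer) 301
 */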

/* latencyCommand() helper to produce the reply for the LATEST subcommand,
 * listing the last latency sample for every event type registered so far. */
void latencyCommandReplyWithLatestEvents(redisClient *c) {
    dictIterator *di;
    dictEntry *de;

    addReplyMultiBulkLen(c,dictSize(server.latency_events));
    di = dictGetIterator(server.latency_events);
    while((de = dictNext(di)) != NULL) {
        char *event = dictGetKey(de);
        struct latencyTimeSeries *ts = dictGetVal(de);
        int last = (ts->idx + LATENCY_TS_LEN - 1) % LATENCY_TS_LEN;

        addReplyMultiBulkLen(c,4);
        addReplyBulkCString(c,event);
        addReplyLongLong(c,ts->samples[last].time);
        addReplyLongLong(c,ts->samples[last].latency);
        addReplyLongLong(c,ts->max);
    }
    dictReleaseIterator(di);
}
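
/* Each element of the LATEST reply carries, in order: the event name, the
 * Unix time and latency (in milliseconds) of the newest sample, and the
 * all-time maximum latency recorded for that event. A hypothetical reply
 * (values invented for illustration):
 *
 *   127.0.0.1:6379> LATENCY LATEST
 *   1) 1) "some-event"
 *      2) (integer) 1404598830
 *      3) (integer) 25
 *      4) (integer) 63
 */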

#define LATENCY_GRAPH_COLS 80
sds latencyCommandGenSparkeline(char *event, struct latencyTimeSeries *ts) {
    int j;
    struct sequence *seq = createSparklineSequence();
    sds graph = sdsempty();
    uint32_t min = 0, max = 0;

    for (j = 0; j < LATENCY_TS_LEN; j++) {
        int i = (ts->idx + j) % LATENCY_TS_LEN;
        int elapsed;
        char *label;
        char buf[64];

        if (ts->samples[i].time == 0) continue;
        /* Update min and max. */
        if (seq->length == 0) {
            min = max = ts->samples[i].latency;
        } else {
            if (ts->samples[i].latency > max) max = ts->samples[i].latency;
            if (ts->samples[i].latency < min) min = ts->samples[i].latency;
        }
        /* Use as label the number of seconds / minutes / hours / days
         * ago the event happened. */
        elapsed = time(NULL) - ts->samples[i].time;
        if (elapsed < 60)
            snprintf(buf,sizeof(buf),"%ds",elapsed);
        else if (elapsed < 3600)
            snprintf(buf,sizeof(buf),"%dm",elapsed/60);
        else if (elapsed < 3600*24)
            snprintf(buf,sizeof(buf),"%dh",elapsed/3600);
        else
            snprintf(buf,sizeof(buf),"%dd",elapsed/(3600*24));
        label = zstrdup(buf);
        sparklineSequenceAddSample(seq,ts->samples[i].latency,label);
    }

    graph = sdscatprintf(graph,
        "%s - high %lu ms, low %lu ms (all time high %lu ms)\n", event,
        (unsigned long) max, (unsigned long) min, (unsigned long) ts->max);
    for (j = 0; j < LATENCY_GRAPH_COLS; j++)
        graph = sdscatlen(graph,"-",1);
    graph = sdscatlen(graph,"\n",1);
    graph = sparklineRender(graph,seq,LATENCY_GRAPH_COLS,4,SPARKLINE_FILL);
    freeSparklineSequence(seq);
    return graph;
}
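
/* The returned graph starts with a one-line summary built from the format
 * string above, followed by a row of dashes and the 4-row sparkline rendered
 * by sparklineRender(). For example (figures invented for illustration):
 *
 *   some-event - high 500 ms, low 12 ms (all time high 1200 ms)
 */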

/* LATENCY command implementations.
 *
 * LATENCY HISTORY: return time-latency samples for the specified event.
 * LATENCY LATEST: return the latest latency for all the events classes.
 * LATENCY RESET: reset the data of the specified events, or of all of them.
 * LATENCY DOCTOR: returns a human readable analysis of instance latency.
 * LATENCY GRAPH: provide an ASCII graph of the latency of the specified event.
 */
void latencyCommand(redisClient *c) {
    struct latencyTimeSeries *ts;

    if (!strcasecmp(c->argv[1]->ptr,"history") && c->argc == 3) {
        /* LATENCY HISTORY <event> */
        ts = dictFetchValue(server.latency_events,c->argv[2]->ptr);
        if (ts == NULL) goto nodataerr;
        latencyCommandReplyWithSamples(c,ts);
    } else if (!strcasecmp(c->argv[1]->ptr,"graph") && c->argc == 3) {
        /* LATENCY GRAPH <event> */
        sds graph;
        dictEntry *de;
        char *event;

        de = dictFind(server.latency_events,c->argv[2]->ptr);
        if (de == NULL) goto nodataerr;
        ts = dictGetVal(de);
        event = dictGetKey(de);

        graph = latencyCommandGenSparkeline(event,ts);
        addReplyBulkCString(c,graph);
        sdsfree(graph);
    } else if (!strcasecmp(c->argv[1]->ptr,"latest") && c->argc == 2) {
        /* LATENCY LATEST */
        latencyCommandReplyWithLatestEvents(c);
    } else if (!strcasecmp(c->argv[1]->ptr,"reset") && c->argc >= 2) {
        /* LATENCY RESET */
        if (c->argc == 2) {
            addReplyLongLong(c,latencyResetEvent(NULL));
        } else {
            int j, resets = 0;

            for (j = 2; j < c->argc; j++)
                resets += latencyResetEvent(c->argv[j]->ptr);
            addReplyLongLong(c,resets);
        }
    } else {
        addReply(c,shared.syntaxerr);
    }
    return;

nodataerr:
    /* Common error when the user asks for an event we have no latency
     * information about. */
    addReplyErrorFormat(c,
        "No samples available for event '%s'", (char*) c->argv[2]->ptr);
}