Add oom-score-adj configuration option to control Linux OOM killer. (#1690)

Add Linux kernel OOM killer control option.

This adds the ability to control the Linux OOM killer oom_score_adj
parameter for all Redis processes, depending on the process role (i.e.
master, replica, background child).

A oom-score-adj global boolean flag control this feature. In addition,
specific values can be configured using oom-score-adj-values if
additional tuning is required.
This commit is contained in:
Yossi Gottlieb 2020-08-12 17:58:56 +03:00 committed by GitHub
parent f3df3ec134
commit 2530dc0ebd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 306 additions and 0 deletions

View File

@ -1049,6 +1049,32 @@ lazyfree-lazy-user-del no
# --threads option to match the number of Redis theads, otherwise you'll not
# be able to notice the improvements.
############################ KERNEL OOM CONTROL ##############################
# On Linux, it is possible to hint the kernel OOM killer on what processes
# should be killed first when out of memory.
#
# Enabling this feature makes Redis actively control the oom_score_adj value
# for all its processes, depending on their role. The default scores will
# attempt to have background child processes killed before all others, and
# replicas killed before masters.
oom-score-adj no
# When oom-score-adj is used, this directive controls the specific values used
# for master, replica and background child processes. Values range -1000 to
# 1000 (higher means more likely to be killed).
#
# Unprivileged processes (not root, and without CAP_SYS_RESOURCE capabilities)
# can freely increase their value, but not decrease it below its initial
# settings.
#
# Values are used relative to the initial value of oom_score_adj when the server
# starts. Because typically the initial value is 0, they will often match the
# absolute values.
oom-score-adj-values 0 200 800
############################## APPEND ONLY MODE ###############################
# By default Redis asynchronously dumps the dataset on disk. This mode is

View File

@ -111,6 +111,9 @@ clientBufferLimitsConfig clientBufferLimitsDefaults[CLIENT_TYPE_OBUF_COUNT] = {
{1024*1024*32, 1024*1024*8, 60} /* pubsub */
};
/* OOM Score defaults */
int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT] = { 0, 200, 800 };
/* Generic config infrastructure function pointers
* int is_valid_fn(val, err)
* Return 1 when val is valid, and 0 when invalid.
@ -286,6 +289,59 @@ void queueLoadModule(sds path, sds *argv, int argc) {
listAddNodeTail(server.loadmodule_queue,loadmod);
}
/* Parse an array of CONFIG_OOM_COUNT sds strings, validate and populate
* server.oom_score_adj_values if valid.
*/
static int updateOOMScoreAdjValues(sds *args, char **err) {
int i;
int values[CONFIG_OOM_COUNT];
for (i = 0; i < CONFIG_OOM_COUNT; i++) {
char *eptr;
long long val = strtoll(args[i], &eptr, 10);
if (*eptr != '\0' || val < -1000 || val > 1000) {
if (err) *err = "Invalid oom-score-adj-values, elements must be between -1000 and 1000.";
return C_ERR;
}
values[i] = val;
}
/* Verify that the values make sense. If they don't omit a warning but
* keep the configuration, which may still be valid for privileged processes.
*/
if (values[CONFIG_OOM_REPLICA] < values[CONFIG_OOM_MASTER] ||
values[CONFIG_OOM_BGCHILD] < values[CONFIG_OOM_REPLICA]) {
serverLog(LOG_WARNING,
"The oom-score-adj-values configuration may not work for non-privileged processes! "
"Please consult the documentation.");
}
/* Store values, retain previous config for rollback in case we fail. */
int old_values[CONFIG_OOM_COUNT];
for (i = 0; i < CONFIG_OOM_COUNT; i++) {
old_values[i] = server.oom_score_adj_values[i];
server.oom_score_adj_values[i] = values[i];
}
/* Update */
if (setOOMScoreAdj(-1) == C_ERR) {
/* Roll back */
for (i = 0; i < CONFIG_OOM_COUNT; i++)
server.oom_score_adj_values[i] = old_values[i];
if (err)
*err = "Failed to apply oom-score-adj-values configuration, check server logs.";
return C_ERR;
}
return C_OK;
}
void initConfigValues() {
for (standardConfig *config = configs; config->name != NULL; config++) {
config->interface.init(config->data);
@ -479,6 +535,8 @@ void loadServerConfigFromString(char *config) {
server.client_obuf_limits[class].hard_limit_bytes = hard;
server.client_obuf_limits[class].soft_limit_bytes = soft;
server.client_obuf_limits[class].soft_limit_seconds = soft_seconds;
} else if (!strcasecmp(argv[0],"oom-score-adj-values") && argc == 1 + CONFIG_OOM_COUNT) {
if (updateOOMScoreAdjValues(&argv[1], &err) == C_ERR) goto loaderr;
} else if (!strcasecmp(argv[0],"notify-keyspace-events") && argc == 2) {
int flags = keyspaceEventsStringToFlags(argv[1]);
@ -728,6 +786,17 @@ void configSetCommand(client *c) {
server.client_obuf_limits[class].soft_limit_seconds = soft_seconds;
}
sdsfreesplitres(v,vlen);
} config_set_special_field("oom-score-adj-values") {
int vlen;
int success = 1;
sds *v = sdssplitlen(o->ptr, sdslen(o->ptr), " ", 1, &vlen);
if (vlen != CONFIG_OOM_COUNT || updateOOMScoreAdjValues(v, &errstr) == C_ERR)
success = 0;
sdsfreesplitres(v, vlen);
if (!success)
goto badfmt;
} config_set_special_field("notify-keyspace-events") {
int flags = keyspaceEventsStringToFlags(o->ptr);
@ -923,6 +992,22 @@ void configGetCommand(client *c) {
matches++;
}
if (stringmatch(pattern,"oom-score-adj-values",0)) {
sds buf = sdsempty();
int j;
for (j = 0; j < CONFIG_OOM_COUNT; j++) {
buf = sdscatprintf(buf,"%d", server.oom_score_adj_values[j]);
if (j != CONFIG_OOM_COUNT-1)
buf = sdscatlen(buf," ",1);
}
addReplyBulkCString(c,"oom-score-adj-values");
addReplyBulkCString(c,buf);
sdsfree(buf);
matches++;
}
setDeferredMapLen(c,replylen,matches);
}
@ -1330,6 +1415,25 @@ void rewriteConfigClientoutputbufferlimitOption(struct rewriteConfigState *state
}
}
/* Rewrite the oom-score-adj-values option. */
void rewriteConfigOOMScoreAdjValuesOption(struct rewriteConfigState *state) {
int force = 0;
int j;
char *option = "oom-score-adj-values";
sds line;
line = sdsempty();
for (j = 0; j < CONFIG_OOM_COUNT; j++) {
if (server.oom_score_adj_values[j] != configOOMScoreAdjValuesDefaults[j])
force = 1;
line = sdscatprintf(line, "%d", server.oom_score_adj_values[j]);
if (j+1 != CONFIG_OOM_COUNT)
line = sdscatlen(line, " ", 1);
}
rewriteConfigRewriteLine(state,option,line,force);
}
/* Rewrite the bind option. */
void rewriteConfigBindOption(struct rewriteConfigState *state) {
int force = 1;
@ -1528,6 +1632,7 @@ int rewriteConfig(char *path) {
rewriteConfigStringOption(state,"cluster-config-file",server.cluster_configfile,CONFIG_DEFAULT_CLUSTER_CONFIG_FILE);
rewriteConfigNotifykeyspaceeventsOption(state);
rewriteConfigClientoutputbufferlimitOption(state);
rewriteConfigOOMScoreAdjValuesOption(state);
/* Rewrite Sentinel config if in Sentinel mode. */
if (server.sentinel_mode) rewriteConfigSentinelOption(state);
@ -2082,6 +2187,19 @@ static int updateMaxclients(long long val, long long prev, char **err) {
return 1;
}
static int updateOOMScoreAdj(int val, int prev, char **err) {
UNUSED(prev);
if (val) {
if (setOOMScoreAdj(-1) == C_ERR) {
*err = "Failed to set current oom_score_adj. Check server logs.";
return 0;
}
}
return 1;
}
#ifdef USE_OPENSSL
static int updateTlsCfg(char *val, char *prev, char **err) {
UNUSED(val);
@ -2146,6 +2264,7 @@ standardConfig configs[] = {
createBoolConfig("crash-log-enabled", NULL, MODIFIABLE_CONFIG, server.crashlog_enabled, 1, NULL, updateSighandlerEnabled),
createBoolConfig("crash-memcheck-enabled", NULL, MODIFIABLE_CONFIG, server.memcheck_enabled, 1, NULL, NULL),
createBoolConfig("use-exit-on-panic", NULL, MODIFIABLE_CONFIG, server.use_exit_on_panic, 0, NULL, NULL),
createBoolConfig("oom-score-adj", NULL, MODIFIABLE_CONFIG, server.oom_score_adj, 0, NULL, updateOOMScoreAdj),
/* String Configs */
createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL),

View File

@ -54,6 +54,7 @@
#define HAVE_PROC_MAPS 1
#define HAVE_PROC_SMAPS 1
#define HAVE_PROC_SOMAXCONN 1
#define HAVE_PROC_OOM_SCORE_ADJ 1
#endif
/* Test for task_info() */

View File

@ -2497,6 +2497,9 @@ void replicationSetMaster(char *ip, int port) {
server.masterhost = sdsnew(ip);
server.masterport = port;
/* Update oom_score_adj */
setOOMScoreAdj(-1);
/* Force our slaves to resync with us as well. They may hopefully be able
* to partially resync with us, but we can notify the replid change. */
disconnectSlaves();
@ -2564,6 +2567,9 @@ void replicationUnsetMaster(void) {
* master switch. */
server.slaveseldb = -1;
/* Update oom_score_adj */
setOOMScoreAdj(-1);
/* Once we turn from slave to master, we consider the starting time without
* slaves (that is used to count the replication backlog time to live) as
* starting from now. Otherwise the backlog will be freed after a

View File

@ -2422,6 +2422,10 @@ void initServerConfig(void) {
for (j = 0; j < CLIENT_TYPE_OBUF_COUNT; j++)
server.client_obuf_limits[j] = clientBufferLimitsDefaults[j];
/* Linux OOM Score config */
for (j = 0; j < CONFIG_OOM_COUNT; j++)
server.oom_score_adj_values[j] = configOOMScoreAdjValuesDefaults[j];
/* Double constants initialization */
R_Zero = 0.0;
R_PosInf = 1.0/R_Zero;
@ -2527,6 +2531,58 @@ int restartServer(int flags, mstime_t delay) {
return C_ERR; /* Never reached. */
}
static void readOOMScoreAdj(void) {
#ifdef HAVE_PROC_OOM_SCORE_ADJ
char buf[64];
int fd = open("/proc/self/oom_score_adj", O_RDONLY);
if (fd < 0) return;
if (read(fd, buf, sizeof(buf)) > 0)
server.oom_score_adj_base = atoi(buf);
close(fd);
#endif
}
/* This function will configure the current process's oom_score_adj according
* to user specified configuration. This is currently implemented on Linux
* only.
*
* A process_class value of -1 implies OOM_CONFIG_MASTER or OOM_CONFIG_REPLICA,
* depending on current role.
*/
int setOOMScoreAdj(int process_class) {
int fd;
int val;
char buf[64];
if (!server.oom_score_adj) return C_OK;
if (process_class == -1)
process_class = (server.masterhost ? CONFIG_OOM_REPLICA : CONFIG_OOM_MASTER);
serverAssert(process_class >= 0 && process_class < CONFIG_OOM_COUNT);
#ifdef HAVE_PROC_OOM_SCORE_ADJ
val = server.oom_score_adj_base + server.oom_score_adj_values[process_class];
if (val > 1000) val = 1000;
if (val < -1000) val = -1000;
snprintf(buf, sizeof(buf) - 1, "%d\n", val);
fd = open("/proc/self/oom_score_adj", O_WRONLY);
if (fd < 0 || write(fd, buf, strlen(buf)) < 0) {
serverLog(LOG_WARNING, "Unable to write oom_score_adj: %s", strerror(errno));
if (fd != -1) close(fd);
return C_ERR;
}
close(fd);
return C_OK;
#else
/* Unsupported */
return C_ERR;
#endif
}
/* This function will try to raise the max number of open files accordingly to
* the configured max number of clients. It also reserves a number of file
* descriptors (CONFIG_MIN_RESERVED_FDS) for extra operations of
@ -4866,6 +4922,7 @@ int redisFork() {
long long start = ustime();
if ((childpid = fork()) == 0) {
/* Child */
setOOMScoreAdj(CONFIG_OOM_BGCHILD);
closeListeningSockets(0);
setupChildSignalHandlers();
} else {
@ -5197,6 +5254,7 @@ int main(int argc, char **argv) {
server.supervised = redisIsSupervised(server.supervised_mode);
int background = server.daemonize && !server.supervised;
if (background) daemonize();
readOOMScoreAdj();
initServer();
if (background || server.pidfile) createPidFile();
@ -5250,6 +5308,8 @@ int main(int argc, char **argv) {
}
redisSetCpuAffinity(server.server_cpulist);
setOOMScoreAdj(-1);
aeMain(server.el);
aeDeleteEventLoop(server.el);
return 0;

View File

@ -150,6 +150,14 @@ typedef long long ustime_t; /* microsecond time type. */
* in order to make sure of not over provisioning more than 128 fds. */
#define CONFIG_FDSET_INCR (CONFIG_MIN_RESERVED_FDS+96)
/* OOM Score Adjustment classes. */
#define CONFIG_OOM_MASTER 0
#define CONFIG_OOM_REPLICA 1
#define CONFIG_OOM_BGCHILD 2
#define CONFIG_OOM_COUNT 3
extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT];
/* Hash table parameters */
#define HASHTABLE_MIN_FILL 10 /* Minimal hash table fill 10% */
@ -1350,6 +1358,9 @@ struct redisServer {
int lfu_log_factor; /* LFU logarithmic counter factor. */
int lfu_decay_time; /* LFU counter decay factor. */
long long proto_max_bulk_len; /* Protocol bulk length maximum size. */
int oom_score_adj_base; /* Base oom_score_adj value, as observed on startup */
int oom_score_adj_values[CONFIG_OOM_COUNT]; /* Linux oom_score_adj configuration */
int oom_score_adj; /* If true, oom_score_adj is managed */
/* Blocked clients */
unsigned int blocked_clients; /* # of clients executing a blocking cmd.*/
unsigned int blocked_clients_by_type[BLOCKED_NUM];
@ -2016,6 +2027,7 @@ const char *evictPolicyToString(void);
struct redisMemOverhead *getMemoryOverheadData(void);
void freeMemoryOverheadData(struct redisMemOverhead *mh);
void checkChildrenDone(void);
int setOOMScoreAdj(int process_class);
#define RESTART_SERVER_NONE 0
#define RESTART_SERVER_GRACEFULLY (1<<0) /* Do proper shutdown. */

View File

@ -68,6 +68,7 @@ set ::all_tests {
unit/pendingquerybuf
unit/tls
unit/tracking
unit/oom-score-adj
}
# Index to the next test to run in the ::all_tests list.
set ::next_test 0

View File

@ -0,0 +1,81 @@
set system_name [string tolower [exec uname -s]]
set user_id [exec id -u]
if {$system_name eq {linux}} {
start_server {tags {"oom-score-adj"}} {
proc get_oom_score_adj {{pid ""}} {
if {$pid == ""} {
set pid [srv 0 pid]
}
set fd [open "/proc/$pid/oom_score_adj" "r"]
set val [gets $fd]
close $fd
return $val
}
proc get_child_pid {} {
set pid [srv 0 pid]
set fd [open "|ps --ppid $pid -o pid -h" "r"]
set child_pid [string trim [read $fd]]
close $fd
return $child_pid
}
test {CONFIG SET oom-score-adj works as expected} {
set base [get_oom_score_adj]
# Enable oom-score-adj, check defaults
r config set oom-score-adj-values "10 20 30"
r config set oom-score-adj yes
assert {[get_oom_score_adj] == [expr $base + 10]}
# Modify current class
r config set oom-score-adj-values "15 20 30"
assert {[get_oom_score_adj] == [expr $base + 15]}
# Check replica class
r replicaof localhost 1
assert {[get_oom_score_adj] == [expr $base + 20]}
r replicaof no one
assert {[get_oom_score_adj] == [expr $base + 15]}
# Check child process
r set key-a value-a
r config set rdb-key-save-delay 100000
r bgsave
set child_pid [get_child_pid]
assert {[get_oom_score_adj $child_pid] == [expr $base + 30]}
}
# Failed oom-score-adj tests can only run unprivileged
if {$user_id != 0} {
test {CONFIG SET oom-score-adj handles configuration failures} {
# Bad config
r config set oom-score-adj no
r config set oom-score-adj-values "-1000 -1000 -1000"
# Make sure it fails
catch {r config set oom-score-adj yes} e
assert_match {*Failed to set*} $e
# Make sure it remains off
assert {[r config get oom-score-adj] == "oom-score-adj no"}
# Fix config
r config set oom-score-adj-values "0 100 100"
r config set oom-score-adj yes
# Make sure it fails
catch {r config set oom-score-adj-values "-1000 -1000 -1000"} e
assert_match {*Failed*} $e
# Make sure previous values remain
assert {[r config get oom-score-adj-values] == {oom-score-adj-values {0 100 100}}}
}
}
}
}