/* Maxmemory directive handling (LRU eviction and other policies). * * ---------------------------------------------------------------------------- * * Copyright (c) 2009-2016, Salvatore Sanfilippo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Redis nor the names of its contributors may be used * to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "server.h" #include "bio.h" /* Return the LRU clock, based on the clock resolution. This is a time * in a reduced-bits format that can be used to set and check the * object->lru field of redisObject structures. */ unsigned int getLRUClock(void) { return (mstime()/LRU_CLOCK_RESOLUTION) & LRU_CLOCK_MAX; } /* Given an object returns the min number of milliseconds the object was never * requested, using an approximated LRU algorithm. */ unsigned long long estimateObjectIdleTime(robj *o) { unsigned long long lruclock = LRU_CLOCK(); if (lruclock >= o->lru) { return (lruclock - o->lru) * LRU_CLOCK_RESOLUTION; } else { return (lruclock + (LRU_CLOCK_MAX - o->lru)) * LRU_CLOCK_RESOLUTION; } } /* freeMemoryIfNeeded() gets called when 'maxmemory' is set on the config * file to limit the max memory used by the server, before processing a * command. * * The goal of the function is to free enough memory to keep Redis under the * configured memory limit. * * The function starts calculating how many bytes should be freed to keep * Redis under the limit, and enters a loop selecting the best keys to * evict accordingly to the configured policy. * * If all the bytes needed to return back under the limit were freed the * function returns C_OK, otherwise C_ERR is returned, and the caller * should block the execution of commands that will result in more memory * used by the server. * * ------------------------------------------------------------------------ * * LRU approximation algorithm * * Redis uses an approximation of the LRU algorithm that runs in constant * memory. Every time there is a key to expire, we sample N keys (with * N very small, usually in around 5) to populate a pool of best keys to * evict of M keys (the pool size is defined by MAXMEMORY_EVICTION_POOL_SIZE). * * The N keys sampled are added in the pool of good keys to expire (the one * with an old access time) if they are better than one of the current keys * in the pool. * * After the pool is populated, the best key we have in the pool is expired. * However note that we don't remove keys from the pool when they are deleted * so the pool may contain keys that no longer exist. * * When we try to evict a key, and all the entries in the pool don't exist * we populate it again. This time we'll be sure that the pool has at least * one key that can be evicted, if there is at least one key that can be * evicted in the whole database. */ /* Create a new eviction pool. */ struct evictionPoolEntry *evictionPoolAlloc(void) { struct evictionPoolEntry *ep; int j; ep = zmalloc(sizeof(*ep)*MAXMEMORY_EVICTION_POOL_SIZE); for (j = 0; j < MAXMEMORY_EVICTION_POOL_SIZE; j++) { ep[j].idle = 0; ep[j].key = NULL; } return ep; } /* This is an helper function for freeMemoryIfNeeded(), it is used in order * to populate the evictionPool with a few entries every time we want to * expire a key. Keys with idle time smaller than one of the current * keys are added. Keys are always added if there are free entries. * * We insert keys on place in ascending order, so keys with the smaller * idle time are on the left, and keys with the higher idle time on the * right. */ #define EVICTION_SAMPLES_ARRAY_SIZE 16 void evictionPoolPopulate(dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) { int j, k, count; dictEntry *_samples[EVICTION_SAMPLES_ARRAY_SIZE]; dictEntry **samples; /* Try to use a static buffer: this function is a big hit... * Note: it was actually measured that this helps. */ if (server.maxmemory_samples <= EVICTION_SAMPLES_ARRAY_SIZE) { samples = _samples; } else { samples = zmalloc(sizeof(samples[0])*server.maxmemory_samples); } count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples); for (j = 0; j < count; j++) { unsigned long long idle; sds key; robj *o; dictEntry *de; de = samples[j]; key = dictGetKey(de); /* If the dictionary we are sampling from is not the main * dictionary (but the expires one) we need to lookup the key * again in the key dictionary to obtain the value object. */ if (sampledict != keydict) de = dictFind(keydict, key); o = dictGetVal(de); idle = estimateObjectIdleTime(o); /* Insert the element inside the pool. * First, find the first empty bucket or the first populated * bucket that has an idle time smaller than our idle time. */ k = 0; while (k < MAXMEMORY_EVICTION_POOL_SIZE && pool[k].key && pool[k].idle < idle) k++; if (k == 0 && pool[MAXMEMORY_EVICTION_POOL_SIZE-1].key != NULL) { /* Can't insert if the element is < the worst element we have * and there are no empty buckets. */ continue; } else if (k < MAXMEMORY_EVICTION_POOL_SIZE && pool[k].key == NULL) { /* Inserting into empty position. No setup needed before insert. */ } else { /* Inserting in the middle. Now k points to the first element * greater than the element to insert. */ if (pool[MAXMEMORY_EVICTION_POOL_SIZE-1].key == NULL) { /* Free space on the right? Insert at k shifting * all the elements from k to end to the right. */ memmove(pool+k+1,pool+k, sizeof(pool[0])*(MAXMEMORY_EVICTION_POOL_SIZE-k-1)); } else { /* No free space on right? Insert at k-1 */ k--; /* Shift all elements on the left of k (included) to the * left, so we discard the element with smaller idle time. */ sdsfree(pool[0].key); memmove(pool,pool+1,sizeof(pool[0])*k); } } pool[k].key = sdsdup(key); pool[k].idle = idle; } if (samples != _samples) zfree(samples); } int freeMemoryIfNeeded(void) { size_t mem_reported, mem_used, mem_tofree, mem_freed; int slaves = listLength(server.slaves); mstime_t latency, eviction_latency; long long delta; /* Check if we are over the memory usage limit. If we are not, no need * to subtract the slaves output buffers. We can just return ASAP. */ mem_reported = zmalloc_used_memory(); if (mem_reported <= server.maxmemory) return C_OK; /* Remove the size of slaves output buffers and AOF buffer from the * count of used memory. */ mem_used = mem_reported; if (slaves) { listIter li; listNode *ln; listRewind(server.slaves,&li); while((ln = listNext(&li))) { client *slave = listNodeValue(ln); unsigned long obuf_bytes = getClientOutputBufferMemoryUsage(slave); if (obuf_bytes > mem_used) mem_used = 0; else mem_used -= obuf_bytes; } } if (server.aof_state != AOF_OFF) { mem_used -= sdslen(server.aof_buf); mem_used -= aofRewriteBufferSize(); } /* Check if we are still over the memory limit. */ if (mem_used <= server.maxmemory) return C_OK; /* Compute how much memory we need to free. */ mem_tofree = mem_used - server.maxmemory; mem_freed = 0; if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION) goto cant_free; /* We need to free memory, but policy forbids. */ latencyStartMonitor(latency); while (mem_freed < mem_tofree) { int j, k, keys_freed = 0; for (j = 0; j < server.dbnum; j++) { long bestval = 0; /* just to prevent warning */ sds bestkey = NULL; dictEntry *de; redisDb *db = server.db+j; dict *dict; if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_LRU || server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) { dict = server.db[j].dict; } else { dict = server.db[j].expires; } if (dictSize(dict) == 0) continue; /* volatile-random and allkeys-random policy */ if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM || server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM) { de = dictGetRandomKey(dict); bestkey = dictGetKey(de); } /* volatile-lru and allkeys-lru policy */ else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_LRU || server.maxmemory_policy == MAXMEMORY_VOLATILE_LRU) { struct evictionPoolEntry *pool = db->eviction_pool; while(bestkey == NULL) { evictionPoolPopulate(dict, db->dict, db->eviction_pool); /* Go backward from best to worst element to evict. */ for (k = MAXMEMORY_EVICTION_POOL_SIZE-1; k >= 0; k--) { if (pool[k].key == NULL) continue; de = dictFind(dict,pool[k].key); /* Remove the entry from the pool. */ sdsfree(pool[k].key); pool[k].key = NULL; pool[k].idle = 0; /* If the key exists, is our pick. Otherwise it is * a ghost and we need to try the next element. */ if (de) { bestkey = dictGetKey(de); break; } else { /* Ghost... */ continue; } } } } /* volatile-ttl */ else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { for (k = 0; k < server.maxmemory_samples; k++) { sds thiskey; long thisval; de = dictGetRandomKey(dict); thiskey = dictGetKey(de); thisval = (long) dictGetVal(de); /* Expire sooner (minor expire unix timestamp) is better * candidate for deletion */ if (bestkey == NULL || thisval < bestval) { bestkey = thiskey; bestval = thisval; } } } /* Finally remove the selected key. */ if (bestkey) { robj *keyobj = createStringObject(bestkey,sdslen(bestkey)); propagateExpire(db,keyobj,server.lazyfree_lazy_eviction); /* We compute the amount of memory freed by db*Delete() alone. * It is possible that actually the memory needed to propagate * the DEL in AOF and replication link is greater than the one * we are freeing removing the key, but we can't account for * that otherwise we would never exit the loop. * * AOF and Output buffer memory will be freed eventually so * we only care about memory used by the key space. */ delta = (long long) zmalloc_used_memory(); latencyStartMonitor(eviction_latency); if (server.lazyfree_lazy_eviction) dbAsyncDelete(db,keyobj); else dbSyncDelete(db,keyobj); latencyEndMonitor(eviction_latency); latencyAddSampleIfNeeded("eviction-del",eviction_latency); latencyRemoveNestedEvent(latency,eviction_latency); delta -= (long long) zmalloc_used_memory(); mem_freed += delta; server.stat_evictedkeys++; notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted", keyobj, db->id); decrRefCount(keyobj); keys_freed++; /* When the memory to free starts to be big enough, we may * start spending so much time here that is impossible to * deliver data to the slaves fast enough, so we force the * transmission here inside the loop. */ if (slaves) flushSlavesOutputBuffers(); } } if (!keys_freed) { latencyEndMonitor(latency); latencyAddSampleIfNeeded("eviction-cycle",latency); goto cant_free; /* nothing to free... */ } } latencyEndMonitor(latency); latencyAddSampleIfNeeded("eviction-cycle",latency); return C_OK; cant_free: /* We are here if we are not able to reclaim memory. There is only one * last thing we can try: check if the lazyfree thread has jobs in queue * and wait... */ while(bioPendingJobsOfType(BIO_LAZY_FREE)) { if (((mem_reported - zmalloc_used_memory()) + mem_freed) >= mem_tofree) break; usleep(1000); } return C_ERR; }