negative caching implemented

antirez 2010-12-31 17:32:59 +01:00
parent c4b64a1395
commit d934e1e85b
4 changed files with 102 additions and 28 deletions

View File

@ -38,6 +38,8 @@ robj *lookupKey(redisDb *db, robj *key) {
* async loading of this key, the old value may end up being loaded
* back into memory even if the key gets deleted in the meantime. */
if (server.ds_enabled && cacheKeyMayExist(db,key)) {
redisLog(REDIS_DEBUG,"Force loading key %s via lookup",
key->ptr);
val = dsGet(db,key,&expire);
if (val) {
int retval = dbAdd(db,key,val);
@ -142,14 +144,13 @@ robj *dbRandomKey(redisDb *db) {
/* Delete a key, value, and associated expiration entry if any, from the DB */
int dbDelete(redisDb *db, robj *key) {
/* If VM is enabled make sure to awake waiting clients for this key:
* deleting the key will kill the I/O thread bringing the key from swap
* to memory, so the client will never be notified and unblocked if we
* don't do it now. */
/* If diskstore is enabled make sure to wake up clients waiting for this key,
* as it is not really useful to wait for an already deleted key to be
* loaded from disk. */
if (server.ds_enabled) handleClientsBlockedOnSwappedKey(db,key);
/* FIXME: we need to delete the IO Job loading the key, or simply we can
* wait for it to finish. */
/* Mark this key as non-existing on disk as well */
cacheSetKeyDoesNotExistRemember(db,key);
/* Deleting an entry from the expires dict will not free the sds of
* the key, because it is shared with the main dictionary. */
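
To make the ordering that dbDelete() now relies on concrete, here is a minimal
self-contained sketch. It is a toy single-key model, not Redis code: toy_set(),
toy_del() and the three flags are invented for illustration. It shows why the
"does not exist on disk" mark has to be taken at delete time, before the
asynchronous removal from the disk store has completed.

#include <stdio.h>
#include <stdbool.h>

/* Toy model: one key, an in-memory slot, an on-disk slot, and a
 * "known not to be on disk" flag standing in for io_negcache. */
static bool in_memory = false;
static bool on_disk = false;               /* disk removal is asynchronous */
static bool negcache_not_on_disk = false;

static void toy_set(void)   { in_memory = true; negcache_not_on_disk = false; }
static void toy_flush(void) { if (in_memory) on_disk = true; }  /* background save */
static void toy_del(void) {
    in_memory = false;
    /* Mark the key as not existing on disk *now*; the real disk
     * deletion job will only run later. */
    negcache_not_on_disk = true;
}
static void toy_disk_del(void) { on_disk = false; }             /* async, later */

static const char *toy_get(void) {
    if (in_memory) return "value from memory";
    /* Without the negative cache we would read the stale on-disk copy here. */
    if (!negcache_not_on_disk && on_disk) return "STALE value from disk";
    return "(nil)";
}

int main(void) {
    toy_set();       /* SET foo bar */
    toy_flush();     /* background flush to disk */
    toy_del();       /* DEL foo (disk removal still pending) */
    printf("GET foo -> %s\n", toy_get());  /* (nil), thanks to the mark */
    toy_disk_del();  /* the asynchronous removal finally happens */
    printf("GET foo -> %s\n", toy_get());  /* still (nil) */
    return 0;
}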

View File

@ -105,6 +105,10 @@
* value so it will be evicted later.
*
* Are there other patterns like this where we load stale data?
*
* Also, make sure that key preloading is ONLY done for keys that are
* not marked as cacheKeyDoesNotExist(), otherwise, again, we can load
* data from disk that should instead be deleted.
*/
/* Virtual Memory is composed mainly of two subsystems:
@ -259,7 +263,72 @@ int dsCanTouchDiskStore(void) {
return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
}
/* =================== Virtual Memory - Threaded I/O ======================= */
/* ==================== Disk store negative caching ========================
*
* When disk store is enabled, we need negative caching, that is, to remember
* keys that are for sure *not* on the disk key-value store.
*
* This is useful for two reasons:
*
* 1) Without negative caching, cache misses will cost us a disk lookup even
* if the same non-existing key is accessed again and again. With negative
* caching we remember that the key is not on disk, so if it's not in memory
* and we have a negative cache entry, we don't try a disk access at all.
*
* 2) Negative caching is the way to fix a specific race condition. For
* instance, consider the following sequence of commands:
*
* SET foo bar
* DEL foo
* GET foo
*
* After the SET, we mark the value as dirty, so it will be flushed
* to disk at some point. Later the key is deleted, so it is removed
* from memory. Another job is created to remove the key from the disk
* store, but the removal is not synchronous, so it may happen later in time.
*
* Finally we have a GET foo operation. This operation may end up
* reading back a stale value from disk, as the delete operation
* against the disk KV store may still not be completed, so we
* would read old data.
*
* Remembering that the given key was deleted is important. We can discard this
* information once the key has really been removed from the disk.
*
* So actually there are two kinds of negative cache entries: entries that
* can be evicted when we need to reclaim memory, and entries that will
* not be evicted for as long as we need this information to be available.
*
* The API allows creating both kinds of negative cache entries. */
int cacheKeyMayExist(redisDb *db, robj *key) {
return dictFind(db->io_negcache,key) == NULL;
}
void cacheSetKeyMayExist(redisDb *db, robj *key) {
dictDelete(db->io_negcache,key);
}
void cacheSetKeyDoesNotExist(redisDb *db, robj *key) {
struct dictEntry *de;
/* Don't overwrite negative cache entries whose val is set to NULL, as
* these entries were created with cacheSetKeyDoesNotExistRemember(). */
de = dictFind(db->io_negcache,key);
if (de != NULL && dictGetEntryVal(de) == NULL) return;
if (dictReplace(db->io_negcache,key,(void*)time(NULL))) {
incrRefCount(key);
}
}
void cacheSetKeyDoesNotExistRemember(redisDb *db, robj *key) {
if (dictReplace(db->io_negcache,key,NULL)) {
incrRefCount(key);
}
}
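
To make the difference between the two kinds of entries concrete, here is a
self-contained sketch of the same semantics. The fixed-size table and the
helper names (neg_find, neg_slot, and so on) are invented for illustration;
the real code keeps this state in db->io_negcache using the dict API. The key
point mirrored here is the value encoding: NULL (0) marks a non-evictable
"remember" entry, anything else is an evictable entry stamped with its
creation time, and a plain miss never downgrades a "remember" entry.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define NEG_SLOTS 16

struct negentry {
    char *key;      /* NULL means the slot is free */
    time_t stamp;   /* 0 == "remember" entry, otherwise creation time */
};
static struct negentry negcache[NEG_SLOTS];

static struct negentry *neg_find(const char *key) {
    for (int i = 0; i < NEG_SLOTS; i++)
        if (negcache[i].key && strcmp(negcache[i].key,key) == 0)
            return &negcache[i];
    return NULL;
}

static struct negentry *neg_slot(const char *key) {
    struct negentry *e = neg_find(key);
    if (e) return e;
    for (int i = 0; i < NEG_SLOTS; i++) {
        if (negcache[i].key == NULL) {
            negcache[i].key = strdup(key);
            return &negcache[i];
        }
    }
    return NULL;    /* table full; a real cache would evict something */
}

/* cacheKeyMayExist(): no entry means the key may still be on disk. */
static int key_may_exist(const char *key) { return neg_find(key) == NULL; }

/* cacheSetKeyMayExist(): drop whatever negative entry we had. */
static void set_key_may_exist(const char *key) {
    struct negentry *e = neg_find(key);
    if (e) { free(e->key); e->key = NULL; }
}

/* cacheSetKeyDoesNotExist(): evictable entry, but never downgrade a
 * "remember" entry created by the delete path. */
static void set_key_does_not_exist(const char *key) {
    struct negentry *e = neg_find(key);
    if (e && e->stamp == 0) return;       /* keep the stronger entry */
    e = neg_slot(key);
    if (e) e->stamp = time(NULL);
}

/* cacheSetKeyDoesNotExistRemember(): non-evictable entry. */
static void set_key_does_not_exist_remember(const char *key) {
    struct negentry *e = neg_slot(key);
    if (e) e->stamp = 0;
}

int main(void) {
    set_key_does_not_exist_remember("foo");  /* DEL foo: must not be forgotten */
    set_key_does_not_exist("foo");           /* a later miss must not weaken it */
    printf("foo may exist on disk? %d\n", key_may_exist("foo"));  /* 0 */
    set_key_may_exist("foo");                /* disk removal confirmed */
    printf("foo may exist on disk? %d\n", key_may_exist("foo"));  /* 1 */
    return 0;
}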
/* ================== Disk store cache - Threaded I/O ====================== */
void freeIOJob(iojob *j) {
decrRefCount(j->key);
@ -310,15 +379,20 @@ void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
if (j->val != NULL) {
/* Note: the key may already be here if between the time
* this key loading was scheduled and now there was the
* need to blocking load the key for a key lookup. */
if (dbAdd(j->db,j->key,j->val) == REDIS_OK) {
* need to blocking load the key for a key lookup.
*
* Also we don't add a key that was deleted in the
* meantime and should not be on disk either. */
if (cacheKeyMayExist(j->db,j->key) &&
dbAdd(j->db,j->key,j->val) == REDIS_OK)
{
incrRefCount(j->val);
if (j->expire != -1) setExpire(j->db,j->key,j->expire);
}
} else {
/* The key does not exist. Create a negative cache entry
* for this key. */
/* FIXME: add this entry into the negative cache */
cacheSetKeyDoesNotExist(j->db,j->key);
}
/* Handle clients waiting for this key to be loaded. */
handleClientsBlockedOnSwappedKey(j->db,j->key);
@ -327,6 +401,12 @@ void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
if (j->val) {
redisAssert(j->val->storage == REDIS_DS_SAVING);
j->val->storage = REDIS_DS_MEMORY;
cacheSetKeyMayExist(j->db,j->key);
} else {
/* Key deleted. Probably we have this key marked as
* non existing, and impossible to evict, in our negative
* cache entry. Add it as a normal negative cache entry. */
cacheSetKeyMayExist(j->db,j->key);
}
freeIOJob(j);
}
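
As a condensed view of the completion logic above, here is a standalone sketch
of just the branch decisions. The jobresult struct and on_job_completed() are
invented for illustration; only the branching mirrors the handler, the real
iojob machinery is not modelled at all.

#include <stdio.h>
#include <stdbool.h>

enum jobtype { JOB_LOAD, JOB_SAVE };

struct jobresult {
    enum jobtype type;
    bool has_val;    /* j->val != NULL when the job completed */
    bool may_exist;  /* cacheKeyMayExist(j->db,j->key) at completion time */
};

static void on_job_completed(struct jobresult j) {
    if (j.type == JOB_LOAD) {
        if (j.has_val) {
            /* Only install the loaded value if the key was not deleted
             * while the load job was in flight. */
            if (j.may_exist)
                printf("LOAD: add the value to the in-memory db\n");
            else
                printf("LOAD: discard the value, key deleted meanwhile\n");
        } else {
            /* Nothing on disk: remember the miss with an evictable entry. */
            printf("LOAD: create a negative cache entry\n");
        }
    } else {
        if (j.has_val)
            printf("SAVE: value is on disk now, drop any negative entry\n");
        else
            printf("SAVE: on-disk delete done, the pinned entry can go\n");
    }
}

int main(void) {
    on_job_completed((struct jobresult){JOB_LOAD, true, false});
    on_job_completed((struct jobresult){JOB_LOAD, false, true});
    on_job_completed((struct jobresult){JOB_SAVE, false, false});
    return 0;
}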
@ -585,15 +665,6 @@ void cacheCron(void) {
}
}
/* ============ Negative caching for diskstore objects ====================== */
/* Since accesses to keys that don't exist with disk store cost us a disk
* access, we need to cache names of keys that do not exist but are frequently
* accessed. */
int cacheKeyMayExist(redisDb *db, robj *key) {
/* FIXME: for now we just always return true. */
return 1;
}
/* ============ Virtual Memory - Blocking clients on missing keys =========== */
/* This function makes the client 'c' wait for the key 'key' to be loaded.
@ -624,6 +695,9 @@ int waitForSwappedKey(redisClient *c, robj *key) {
de = dictFind(c->db->dict,key->ptr);
if (de != NULL) return 0;
/* Don't wait for keys we are sure are not on disk either */
if (!cacheKeyMayExist(c->db,key)) return 0;
/* Add the key to the list of keys this client is waiting for.
* This maps clients to keys they are waiting for. */
listAddNodeTail(c->io_keys,key);
@ -645,13 +719,6 @@ int waitForSwappedKey(redisClient *c, robj *key) {
listAddNodeTail(l,c);
/* Are we already loading the key from disk? If not create a job */
/* FIXME: if a given client was blocked for this key (so a job was already
* created) but the client was freed, there may be a job loading this
* key even if de == NULL. Does this create some race condition?
*
* Example: after the first load the key gets a DEL that will schedule
* a write. But the write will happen later, the duplicated load will
* fire and we'll get the key back in memory again. */
if (de == NULL)
dsCreateIOJob(REDIS_IOJOB_LOAD,c->db,key,NULL);
return 1;
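
The effect of the new check in waitForSwappedKey() can be summarized as a tiny
standalone sketch; should_block_client() and its two boolean inputs are
invented stand-ins for the dict lookups in the real function.

#include <stdio.h>
#include <stdbool.h>

static bool should_block_client(bool key_in_memory, bool key_may_exist_on_disk) {
    if (key_in_memory) return false;           /* nothing to wait for */
    if (!key_may_exist_on_disk) return false;  /* negative cache: not on disk either */
    return true;                               /* block and possibly create a load job */
}

int main(void) {
    printf("%d\n", should_block_client(false, true));   /* 1: wait for the disk load */
    printf("%d\n", should_block_client(false, false));  /* 0: key known not to exist */
    printf("%d\n", should_block_client(true,  false));  /* 0: already in memory */
    return 0;
}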

View File

@ -345,7 +345,7 @@ unsigned int dictEncObjHash(const void *key) {
}
}
/* Sets type */
/* Sets type and diskstore negative caching hash table */
dictType setDictType = {
dictEncObjHash, /* hash function */
NULL, /* key dup */
@ -854,8 +854,10 @@ void initServer() {
server.db[j].expires = dictCreate(&keyptrDictType,NULL);
server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
if (server.ds_enabled)
if (server.ds_enabled) {
server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
server.db[j].io_negcache = dictCreate(&setDictType,NULL);
}
server.db[j].id = j;
}
server.pubsub_channels = dictCreate(&keylistDictType,NULL);
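
Reusing setDictType for io_negcache is what makes the (void*)time(NULL) trick
in cacheSetKeyDoesNotExist() safe: assuming the type keeps its usual NULL value
destructor and NULL value dup, values are never copied or freed, so the value
slot can carry either NULL or a raw timestamp. A tiny illustration of that
encoding (no dict involved, the variable names are invented):

#include <stdio.h>
#include <time.h>

int main(void) {
    void *remember  = NULL;                 /* non-evictable "remember" entry */
    void *evictable = (void *)time(NULL);   /* evictable entry, creation time */

    printf("remember entry? %s\n", remember == NULL ? "yes" : "no");
    printf("evictable entry created at %ld\n", (long)(time_t)evictable);
    return 0;
}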

View File

@ -269,6 +269,7 @@ typedef struct redisDb {
dict *expires; /* Timeout of keys with a timeout set */
dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
dict *io_keys; /* Keys with clients waiting for VM I/O */
dict *io_negcache; /* Negative caching for disk store */
dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
int id;
} redisDb;
@ -809,6 +810,9 @@ int cacheFreeOneEntry(void);
void cacheScheduleForFlush(redisDb *db, robj *key);
void cacheCron(void);
int cacheKeyMayExist(redisDb *db, robj *key);
void cacheSetKeyMayExist(redisDb *db, robj *key);
void cacheSetKeyDoesNotExist(redisDb *db, robj *key);
void cacheSetKeyDoesNotExistRemember(redisDb *db, robj *key);
/* Set data type */
robj *setTypeCreate(robj *value);