mirror of
https://codeberg.org/redict/redict.git
synced 2025-01-22 08:08:53 -05:00
Optimize resizing hash table to resize not only non-empty dicts. (#12819)
The function `tryResizeHashTables` only attempts to shrink the dicts that have keys (change from #11695), this was a serious problem until the change in #12850 since it meant if all keys are deleted, we won't shrink the dict. But still, both dictShrink and dictExpand may be blocked by a fork child process, therefore, the cron job needs to perform both dictShrink and dictExpand, for not just non-empty dicts, but all dicts in DBs. What this PR does: 1. Try to resize all dicts in DBs (not just non-empty ones, as it was since #12850) 2. handle both shrink and expand (not just shrink, as it was since forever) 3. Refactor some APIs about dict resizing (get rid of `htNeedsShrink` `dictShrinkToFit`, and expose `dictShrinkIfNeeded` `dictExpandIfNeeded` which already contains all the code of those functions we get rid of, to make APIs more neat) 4. In the `Don't rehash if redis has child process` test, now that cron would do resizing, we no longer need to write to DB after the child process got killed, and can wait for the cron to expand the hash table.
This commit is contained in:
parent
c5273cae18
commit
af7ceeb765
2
src/db.c
2
src/db.c
@ -692,7 +692,7 @@ long long emptyDbStructure(redisDb *dbarray, int dbnum, int async,
|
|||||||
for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) {
|
for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) {
|
||||||
dbarray[j].sub_dict[subdict].non_empty_slots = 0;
|
dbarray[j].sub_dict[subdict].non_empty_slots = 0;
|
||||||
dbarray[j].sub_dict[subdict].key_count = 0;
|
dbarray[j].sub_dict[subdict].key_count = 0;
|
||||||
dbarray[j].sub_dict[subdict].resize_cursor = -1;
|
dbarray[j].sub_dict[subdict].resize_cursor = 0;
|
||||||
if (server.cluster_enabled) {
|
if (server.cluster_enabled) {
|
||||||
dbarray[j].sub_dict[subdict].bucket_count = 0;
|
dbarray[j].sub_dict[subdict].bucket_count = 0;
|
||||||
unsigned long long *slot_size_index = dbarray[j].sub_dict[subdict].slot_size_index;
|
unsigned long long *slot_size_index = dbarray[j].sub_dict[subdict].slot_size_index;
|
||||||
|
64
src/dict.c
64
src/dict.c
@ -216,19 +216,6 @@ int _dictInit(dict *d, dictType *type)
|
|||||||
return DICT_OK;
|
return DICT_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Resize the table to the minimal size that contains all the elements,
|
|
||||||
* but with the invariant of a USED/BUCKETS ratio near to <= 1 */
|
|
||||||
int dictShrinkToFit(dict *d)
|
|
||||||
{
|
|
||||||
unsigned long minimal;
|
|
||||||
|
|
||||||
if (dict_can_resize != DICT_RESIZE_ENABLE || dictIsRehashing(d)) return DICT_ERR;
|
|
||||||
minimal = d->ht_used[0];
|
|
||||||
if (minimal < DICT_HT_INITIAL_SIZE)
|
|
||||||
minimal = DICT_HT_INITIAL_SIZE;
|
|
||||||
return dictShrink(d, minimal);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Resize or create the hash table,
|
/* Resize or create the hash table,
|
||||||
* when malloc_failed is non-NULL, it'll avoid panic if malloc fails (in which case it'll be set to 1).
|
* when malloc_failed is non-NULL, it'll avoid panic if malloc fails (in which case it'll be set to 1).
|
||||||
* Returns DICT_OK if resize was performed, and DICT_ERR if skipped. */
|
* Returns DICT_OK if resize was performed, and DICT_ERR if skipped. */
|
||||||
@ -1484,19 +1471,17 @@ static int dictTypeResizeAllowed(dict *d, size_t size) {
|
|||||||
(double)d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]));
|
(double)d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Expand the hash table if needed */
|
/* Returning DICT_OK indicates a successful expand or the dictionary is undergoing rehashing,
|
||||||
static void _dictExpandIfNeeded(dict *d)
|
* and there is nothing else we need to do about this dictionary currently. While DICT_ERR indicates
|
||||||
{
|
* that expand has not been triggered (may be try shrinking?)*/
|
||||||
/* Automatic resizing is disallowed. Return */
|
int dictExpandIfNeeded(dict *d) {
|
||||||
if (d->pauseAutoResize > 0) return;
|
|
||||||
|
|
||||||
/* Incremental rehashing already in progress. Return. */
|
/* Incremental rehashing already in progress. Return. */
|
||||||
if (dictIsRehashing(d)) return;
|
if (dictIsRehashing(d)) return DICT_OK;
|
||||||
|
|
||||||
/* If the hash table is empty expand it to the initial size. */
|
/* If the hash table is empty expand it to the initial size. */
|
||||||
if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) {
|
if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) {
|
||||||
dictExpand(d, DICT_HT_INITIAL_SIZE);
|
dictExpand(d, DICT_HT_INITIAL_SIZE);
|
||||||
return;
|
return DICT_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If we reached the 1:1 ratio, and we are allowed to resize the hash
|
/* If we reached the 1:1 ratio, and we are allowed to resize the hash
|
||||||
@ -1508,22 +1493,30 @@ static void _dictExpandIfNeeded(dict *d)
|
|||||||
(dict_can_resize != DICT_RESIZE_FORBID &&
|
(dict_can_resize != DICT_RESIZE_FORBID &&
|
||||||
d->ht_used[0] >= dict_force_resize_ratio * DICTHT_SIZE(d->ht_size_exp[0])))
|
d->ht_used[0] >= dict_force_resize_ratio * DICTHT_SIZE(d->ht_size_exp[0])))
|
||||||
{
|
{
|
||||||
if (!dictTypeResizeAllowed(d, d->ht_used[0] + 1))
|
if (dictTypeResizeAllowed(d, d->ht_used[0] + 1))
|
||||||
return;
|
dictExpand(d, d->ht_used[0] + 1);
|
||||||
dictExpand(d, d->ht_used[0] + 1);
|
return DICT_OK;
|
||||||
}
|
}
|
||||||
|
return DICT_ERR;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void _dictShrinkIfNeeded(dict *d)
|
/* Expand the hash table if needed */
|
||||||
{
|
static void _dictExpandIfNeeded(dict *d) {
|
||||||
/* Automatic resizing is disallowed. Return */
|
/* Automatic resizing is disallowed. Return */
|
||||||
if (d->pauseAutoResize > 0) return;
|
if (d->pauseAutoResize > 0) return;
|
||||||
|
|
||||||
|
dictExpandIfNeeded(d);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Returning DICT_OK indicates a successful shrinking or the dictionary is undergoing rehashing,
|
||||||
|
* and there is nothing else we need to do about this dictionary currently. While DICT_ERR indicates
|
||||||
|
* that shrinking has not been triggered (may be try expanding?)*/
|
||||||
|
int dictShrinkIfNeeded(dict *d) {
|
||||||
/* Incremental rehashing already in progress. Return. */
|
/* Incremental rehashing already in progress. Return. */
|
||||||
if (dictIsRehashing(d)) return;
|
if (dictIsRehashing(d)) return DICT_OK;
|
||||||
|
|
||||||
/* If the size of hash table is DICT_HT_INITIAL_SIZE, don't shrink it. */
|
/* If the size of hash table is DICT_HT_INITIAL_SIZE, don't shrink it. */
|
||||||
if (DICTHT_SIZE(d->ht_size_exp[0]) == DICT_HT_INITIAL_SIZE) return;
|
if (DICTHT_SIZE(d->ht_size_exp[0]) <= DICT_HT_INITIAL_SIZE) return DICT_OK;
|
||||||
|
|
||||||
/* If we reached below 1:8 elements/buckets ratio, and we are allowed to resize
|
/* If we reached below 1:8 elements/buckets ratio, and we are allowed to resize
|
||||||
* the hash table (global setting) or we should avoid it but the ratio is below 1:32,
|
* the hash table (global setting) or we should avoid it but the ratio is below 1:32,
|
||||||
@ -1533,10 +1526,19 @@ static void _dictShrinkIfNeeded(dict *d)
|
|||||||
(dict_can_resize != DICT_RESIZE_FORBID &&
|
(dict_can_resize != DICT_RESIZE_FORBID &&
|
||||||
d->ht_used[0] * HASHTABLE_MIN_FILL * dict_force_resize_ratio <= DICTHT_SIZE(d->ht_size_exp[0])))
|
d->ht_used[0] * HASHTABLE_MIN_FILL * dict_force_resize_ratio <= DICTHT_SIZE(d->ht_size_exp[0])))
|
||||||
{
|
{
|
||||||
if (!dictTypeResizeAllowed(d, d->ht_used[0]))
|
if (dictTypeResizeAllowed(d, d->ht_used[0]))
|
||||||
return;
|
dictShrink(d, d->ht_used[0]);
|
||||||
dictShrink(d, d->ht_used[0]);
|
return DICT_OK;
|
||||||
}
|
}
|
||||||
|
return DICT_ERR;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void _dictShrinkIfNeeded(dict *d)
|
||||||
|
{
|
||||||
|
/* Automatic resizing is disallowed. Return */
|
||||||
|
if (d->pauseAutoResize > 0) return;
|
||||||
|
|
||||||
|
dictShrinkIfNeeded(d);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Our hash table capability is a power of two */
|
/* Our hash table capability is a power of two */
|
||||||
|
@ -195,7 +195,8 @@ void dictTwoPhaseUnlinkFree(dict *d, dictEntry *he, dictEntry **plink, int table
|
|||||||
void dictRelease(dict *d);
|
void dictRelease(dict *d);
|
||||||
dictEntry * dictFind(dict *d, const void *key);
|
dictEntry * dictFind(dict *d, const void *key);
|
||||||
void *dictFetchValue(dict *d, const void *key);
|
void *dictFetchValue(dict *d, const void *key);
|
||||||
int dictShrinkToFit(dict *d);
|
int dictShrinkIfNeeded(dict *d);
|
||||||
|
int dictExpandIfNeeded(dict *d);
|
||||||
void dictSetKey(dict *d, dictEntry* de, void *key);
|
void dictSetKey(dict *d, dictEntry* de, void *key);
|
||||||
void dictSetVal(dict *d, dictEntry *de, void *val);
|
void dictSetVal(dict *d, dictEntry *de, void *val);
|
||||||
void dictSetSignedIntegerVal(dictEntry *de, int64_t val);
|
void dictSetSignedIntegerVal(dictEntry *de, int64_t val);
|
||||||
|
29
src/server.c
29
src/server.c
@ -693,34 +693,23 @@ dictType clientDictType = {
|
|||||||
.no_value = 1 /* no values in this dict */
|
.no_value = 1 /* no values in this dict */
|
||||||
};
|
};
|
||||||
|
|
||||||
int htNeedsShrink(dict *dict) {
|
|
||||||
long long size, used;
|
|
||||||
|
|
||||||
size = dictBuckets(dict);
|
|
||||||
used = dictSize(dict);
|
|
||||||
return (size > DICT_HT_INITIAL_SIZE &&
|
|
||||||
(used * HASHTABLE_MIN_FILL <= size));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* In cluster-enabled setup, this method traverses through all main/expires dictionaries (CLUSTER_SLOTS)
|
/* In cluster-enabled setup, this method traverses through all main/expires dictionaries (CLUSTER_SLOTS)
|
||||||
* and triggers a resize if the percentage of used buckets in the HT reaches (100 / HASHTABLE_MIN_FILL)
|
* and triggers a resize if the percentage of used buckets in the HT reaches (100 / HASHTABLE_MIN_FILL)
|
||||||
* we resize the hash table to save memory.
|
* we shrink the hash table to save memory, or expand the hash when the percentage of used buckets reached
|
||||||
|
* 100.
|
||||||
*
|
*
|
||||||
* In non cluster-enabled setup, it resize main/expires dictionary based on the same condition described above. */
|
* In non cluster-enabled setup, it resize main/expires dictionary based on the same condition described above. */
|
||||||
void tryResizeHashTables(int dbid) {
|
void tryResizeHashTables(int dbid) {
|
||||||
redisDb *db = &server.db[dbid];
|
redisDb *db = &server.db[dbid];
|
||||||
|
int dicts_per_call = min(CRON_DICTS_PER_DB, db->dict_count);
|
||||||
for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) {
|
for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) {
|
||||||
if (dbSize(db, subdict) == 0) continue;
|
for (int i = 0; i < dicts_per_call; i++) {
|
||||||
|
|
||||||
if (db->sub_dict[subdict].resize_cursor == -1)
|
|
||||||
db->sub_dict[subdict].resize_cursor = findSlotByKeyIndex(db, 1, subdict);
|
|
||||||
|
|
||||||
for (int i = 0; i < CRON_DBS_PER_CALL && db->sub_dict[subdict].resize_cursor != -1; i++) {
|
|
||||||
int slot = db->sub_dict[subdict].resize_cursor;
|
int slot = db->sub_dict[subdict].resize_cursor;
|
||||||
dict *d = (subdict == DB_MAIN ? db->dict[slot] : db->expires[slot]);
|
dict *d = (subdict == DB_MAIN ? db->dict[slot] : db->expires[slot]);
|
||||||
if (htNeedsShrink(d))
|
if (dictShrinkIfNeeded(d) == DICT_ERR) {
|
||||||
dictShrinkToFit(d);
|
dictExpandIfNeeded(d);
|
||||||
db->sub_dict[subdict].resize_cursor = dbGetNextNonEmptySlot(db, slot, subdict);
|
}
|
||||||
|
db->sub_dict[subdict].resize_cursor = (slot + 1) % db->dict_count;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2685,7 +2674,7 @@ void initDbState(redisDb *db){
|
|||||||
for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) {
|
for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) {
|
||||||
db->sub_dict[subdict].non_empty_slots = 0;
|
db->sub_dict[subdict].non_empty_slots = 0;
|
||||||
db->sub_dict[subdict].key_count = 0;
|
db->sub_dict[subdict].key_count = 0;
|
||||||
db->sub_dict[subdict].resize_cursor = -1;
|
db->sub_dict[subdict].resize_cursor = 0;
|
||||||
db->sub_dict[subdict].slot_size_index = server.cluster_enabled ? zcalloc(sizeof(unsigned long long) * (CLUSTER_SLOTS + 1)) : NULL;
|
db->sub_dict[subdict].slot_size_index = server.cluster_enabled ? zcalloc(sizeof(unsigned long long) * (CLUSTER_SLOTS + 1)) : NULL;
|
||||||
db->sub_dict[subdict].bucket_count = 0;
|
db->sub_dict[subdict].bucket_count = 0;
|
||||||
}
|
}
|
||||||
|
@ -115,6 +115,7 @@ struct hdr_histogram;
|
|||||||
#define CONFIG_MAX_HZ 500
|
#define CONFIG_MAX_HZ 500
|
||||||
#define MAX_CLIENTS_PER_CLOCK_TICK 200 /* HZ is adapted based on that. */
|
#define MAX_CLIENTS_PER_CLOCK_TICK 200 /* HZ is adapted based on that. */
|
||||||
#define CRON_DBS_PER_CALL 16
|
#define CRON_DBS_PER_CALL 16
|
||||||
|
#define CRON_DICTS_PER_DB 16
|
||||||
#define NET_MAX_WRITES_PER_EVENT (1024*64)
|
#define NET_MAX_WRITES_PER_EVENT (1024*64)
|
||||||
#define PROTO_SHARED_SELECT_CMDS 10
|
#define PROTO_SHARED_SELECT_CMDS 10
|
||||||
#define OBJ_SHARED_INTEGERS 10000
|
#define OBJ_SHARED_INTEGERS 10000
|
||||||
@ -970,7 +971,7 @@ typedef struct replBufBlock {
|
|||||||
|
|
||||||
/* When adding fields, please check the swap db related logic. */
|
/* When adding fields, please check the swap db related logic. */
|
||||||
typedef struct dbDictState {
|
typedef struct dbDictState {
|
||||||
int resize_cursor; /* Cron job uses this cursor to gradually resize dictionaries (only used for cluster-enabled). */
|
int resize_cursor; /* Cron job uses this cursor to gradually resize all dictionaries. */
|
||||||
int non_empty_slots; /* The number of non-empty slots. */
|
int non_empty_slots; /* The number of non-empty slots. */
|
||||||
unsigned long long key_count; /* Total number of keys in this DB. */
|
unsigned long long key_count; /* Total number of keys in this DB. */
|
||||||
unsigned long long bucket_count; /* Total number of buckets in this DB across dictionaries (only used for cluster-enabled). */
|
unsigned long long bucket_count; /* Total number of buckets in this DB across dictionaries (only used for cluster-enabled). */
|
||||||
@ -3111,7 +3112,6 @@ void serverLogRaw(int level, const char *msg);
|
|||||||
void serverLogRawFromHandler(int level, const char *msg);
|
void serverLogRawFromHandler(int level, const char *msg);
|
||||||
void usage(void);
|
void usage(void);
|
||||||
void updateDictResizePolicy(void);
|
void updateDictResizePolicy(void);
|
||||||
int htNeedsShrink(dict *dict);
|
|
||||||
void populateCommandTable(void);
|
void populateCommandTable(void);
|
||||||
void resetCommandTableStats(dict* commands);
|
void resetCommandTableStats(dict* commands);
|
||||||
void resetErrorTableStats(void);
|
void resetErrorTableStats(void);
|
||||||
|
@ -2024,7 +2024,7 @@ void zremrangeGenericCommand(client *c, zrange_type rangetype) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
dictResumeAutoResize(zs->dict);
|
dictResumeAutoResize(zs->dict);
|
||||||
if (htNeedsShrink(zs->dict)) dictShrinkToFit(zs->dict);
|
dictShrinkIfNeeded(zs->dict);
|
||||||
if (dictSize(zs->dict) == 0) {
|
if (dictSize(zs->dict) == 0) {
|
||||||
dbDelete(c->db,key);
|
dbDelete(c->db,key);
|
||||||
keyremoved = 1;
|
keyremoved = 1;
|
||||||
@ -2554,7 +2554,7 @@ static void zdiffAlgorithm2(zsetopsrc *src, long setnum, zset *dstzset, size_t *
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Resize dict if needed after removing multiple elements */
|
/* Resize dict if needed after removing multiple elements */
|
||||||
if (htNeedsShrink(dstzset->dict)) dictShrinkToFit(dstzset->dict);
|
dictShrinkIfNeeded(dstzset->dict);
|
||||||
|
|
||||||
/* Using this algorithm, we can't calculate the max element as we go,
|
/* Using this algorithm, we can't calculate the max element as we go,
|
||||||
* we have to iterate through all elements to find the max one after. */
|
* we have to iterate through all elements to find the max one after. */
|
||||||
|
@ -373,12 +373,14 @@ start_server {tags {"other external:skip"}} {
|
|||||||
assert_no_match "*table size: 8192*" [r debug HTSTATS 9]
|
assert_no_match "*table size: 8192*" [r debug HTSTATS 9]
|
||||||
exec kill -9 [get_child_pid 0]
|
exec kill -9 [get_child_pid 0]
|
||||||
waitForBgsave r
|
waitForBgsave r
|
||||||
after 200 ;# waiting for serverCron
|
|
||||||
|
|
||||||
# Hash table should rehash since there is no child process,
|
# Hash table should rehash since there is no child process,
|
||||||
# size is power of two and over 4098, so it is 8192
|
# size is power of two and over 4098, so it is 8192
|
||||||
r set k3 v3
|
wait_for_condition 50 100 {
|
||||||
assert_match "*table size: 8192*" [r debug HTSTATS 9]
|
[string match "*table size: 8192*" [r debug HTSTATS 9]]
|
||||||
|
} else {
|
||||||
|
fail "hash table did not rehash after child process killed"
|
||||||
|
}
|
||||||
} {} {needs:debug needs:local-process}
|
} {} {needs:debug needs:local-process}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -487,3 +489,38 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} {
|
|||||||
assert_match "*table size: 16*" [r debug HTSTATS 0]
|
assert_match "*table size: 16*" [r debug HTSTATS 0]
|
||||||
} {} {needs:debug}
|
} {} {needs:debug}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
proc get_overhead_hashtable_main {} {
|
||||||
|
set main 0
|
||||||
|
set stats [r memory stats]
|
||||||
|
set list_stats [split $stats " "]
|
||||||
|
for {set j 0} {$j < [llength $list_stats]} {incr j} {
|
||||||
|
if {[string equal -nocase "\{overhead.hashtable.main" [lindex $list_stats $j]]} {
|
||||||
|
set main [lindex $list_stats [expr $j+1]]
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return $main
|
||||||
|
}
|
||||||
|
|
||||||
|
start_server {tags {"other external:skip"}} {
|
||||||
|
test "Redis can resize empty dict" {
|
||||||
|
# Write and then delete 128 keys, creating an empty dict
|
||||||
|
r flushall
|
||||||
|
for {set j 1} {$j <= 128} {incr j} {
|
||||||
|
r set $j{b} a
|
||||||
|
}
|
||||||
|
for {set j 1} {$j <= 128} {incr j} {
|
||||||
|
r del $j{b}
|
||||||
|
}
|
||||||
|
# Set a key to enable overhead display of db 0
|
||||||
|
r set a b
|
||||||
|
# The dict containing 128 keys must have expanded,
|
||||||
|
# its hash table itself takes a lot more than 200 bytes
|
||||||
|
wait_for_condition 100 50 {
|
||||||
|
[get_overhead_hashtable_main] < 200
|
||||||
|
} else {
|
||||||
|
fail "dict did not resize in time"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user